├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── docs ├── data-analysis.md ├── getting-started.md ├── query-language.md ├── string-operations.md └── text-processing.md ├── examples ├── README.md ├── scripts │ ├── README.md │ ├── cleanup.sh │ ├── download_datasets.sh │ └── generate_large.sh └── small │ ├── application.log │ ├── customers.json │ ├── ec2_instances.json │ ├── employees.json │ ├── error_messages.txt │ ├── nginx.conf │ ├── nginx_access.log │ ├── orders.csv │ ├── products.yaml │ ├── urls.txt │ └── user_behavior.json └── src ├── arg.rs ├── error.rs ├── executor.rs ├── filter.rs ├── lib.rs ├── main.rs ├── output.rs ├── parser.rs ├── setup.rs ├── stats_opts.rs ├── string_ops.rs └── utils.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | dist 3 | *.rs.bk 4 | 5 | # Generated large datasets (user can create these with scripts) 6 | examples/large/ 7 | examples/external/ 8 | examples/generated/ 9 | 10 | # Temporary files 11 | *.tmp 12 | *.temp 13 | *.swp 14 | *.bak 15 | 16 | # OS generated files 17 | .DS_Store 18 | .DS_Store? 19 | ._* 20 | .Spotlight-V100 21 | .Trashes 22 | ehthumbs.db 23 | Thumbs.db 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [0.2.2] - 2025-07-18 9 | 10 | ### 🎉 New Features 11 | 12 | #### Logical Operations 13 | 14 | - **NOT operator support**: Added `not` operator for negating conditions in `select()` statements 15 | 16 | - Syntax: `select(not (.condition))` - parentheses are required for clarity 17 | - Works with all comparison operators and string operations 18 | - Examples: `select(not (.age > 65))`, `select(not (.email | contains("spam")))` 19 | 20 | - **OR operator support**: Added pipe-delimited pattern matching for OR conditions 21 | - Syntax: `select(.field | contains("pattern1|pattern2|pattern3"))` 22 | - Multi-pattern matching in a single operation 23 | - Examples: `select(.level | contains("ERROR|FATAL"))`, `select(.role | contains("admin|manager|supervisor"))` 24 | 25 | #### Array Slicing Operations 26 | 27 | - **Python-style slicing**: Complete slice notation support for arrays and string split results 28 | 29 | - Basic slicing: `.[start:end]`, `.[start:]`, `.[:end]`, `.[:]` 30 | - Negative index support: `.[-5:]`, `.[:-3]`, `.[-10:-5]` 31 | - Field-specific slicing: `.users[0:10]`, `.logs[-100:]` 32 | 33 | - **String split slicing**: Advanced slicing support for string split operations 34 | 35 | - Direct slicing after split: `split(",")[1:3]`, `split("/")[-1]` 36 | - Complex path processing: `split("/")[1:-1]` for middle path components 37 | - CSV column extraction: `split(",")[2:5]` for specific column ranges 38 | 39 | - **Negative indexing**: Support for negative array indices 40 | - Last element access: `.array[-1]`, `.users[-1]` 41 | - Reverse indexing: `.array[-3]` for third from last 42 | - Compatible with all array operations 43 | 44 | #### Enhanced Filtering Capabilities 45 | 46 | - **Complex logical combinations**: Combine NOT and OR operations for sophisticated filtering 47 | 48 | - Example: `select(not 
(.status | contains("deleted|suspended|inactive")))` 49 | - Multi-condition filtering: `select(not (.type | contains("debug|trace|verbose")))` 50 | 51 | - **Pattern-based exclusion**: Exclude multiple patterns efficiently 52 | - File filtering: `select(not (.filename | contains(".tmp|.bak|.swp")))` 53 | - Content filtering: `select(not (. | contains("DEBUG|INFO|TRACE")))` 54 | 55 | ### 🔧 Improvements 56 | 57 | #### Enhanced String Operations 58 | 59 | - **Improved split operations**: Better integration with slicing for complex text processing 60 | - **Optimized pattern matching**: More efficient OR pattern processing with pipe-delimited syntax 61 | - **Better error handling**: Clearer error messages for logical operation syntax errors 62 | 63 | #### Performance Optimizations 64 | 65 | - **Slice operation efficiency**: Optimized memory usage for large array slicing operations 66 | - **Pattern matching performance**: Improved performance for multi-pattern OR operations 67 | - **Logical operation caching**: Better performance for complex NOT/OR combinations 68 | 69 | #### Documentation and Examples 70 | 71 | - **Comprehensive logical operation examples**: Real-world use cases for NOT and OR operations 72 | - **Slicing operation guide**: Complete reference for all slicing capabilities 73 | - **Advanced filtering patterns**: Examples combining multiple logical operations 74 | 75 | ### 📊 New Use Cases Enabled 76 | 77 | #### Advanced Log Analysis 78 | 79 | ```bash 80 | # Exclude multiple log levels efficiently 81 | hawk '. | select(not (.level | contains("DEBUG|INFO|TRACE")))' app.log 82 | 83 | # Get recent critical errors only 84 | hawk '.logs[-1000:] | select(.level | contains("ERROR|FATAL|CRITICAL"))' system.log 85 | 86 | # Extract specific time ranges with slicing 87 | hawk '.logs[] | .timestamp | split("T")[0] | split("-")[1:3]' logs.json 88 | ``` 89 | 90 | #### Sophisticated Data Filtering 91 | 92 | ```bash 93 | # Filter active users excluding test accounts 94 | hawk '.users[] | select(not (.email | contains("test|demo|temp")))' users.json 95 | 96 | # Find high-priority items excluding archived 97 | hawk '.items[] | select(.priority | contains("high|urgent")) | select(not (.status | contains("archived|deleted")))' items.json 98 | 99 | # Process middle sections of arrays 100 | hawk '.data[100:200] | select(not (.type | contains("noise|test")))' dataset.json 101 | ``` 102 | 103 | #### Complex Text Processing 104 | 105 | ```bash 106 | # Extract domain names efficiently 107 | hawk '.urls[] | split("://")[1] | split("/")[0]' urls.txt 108 | 109 | # Get file paths without extension 110 | hawk '.files[] | .path | split(".")[:-1] | join(".")' filelist.json 111 | 112 | # Process CSV columns with exclusions 113 | hawk -t '. 
| split(",")[2:8] | select(not (.[0] | contains("null|empty|N/A")))' data.csv 114 | ``` 115 | 116 | #### Advanced Data Analysis Workflows 117 | 118 | ```bash 119 | # Multi-step filtering with slicing 120 | hawk '.events[0:5000] | select(not (.type | contains("debug|trace"))) | group_by(.service) | count' events.json 121 | 122 | # Recent data analysis with pattern exclusion 123 | hawk '.metrics[-500:] | select(not (.source | contains("test|staging"))) | avg(.value)' metrics.json 124 | 125 | # Complex field extraction with logical operations 126 | hawk '.users[] | select(.role | contains("admin|manager")) | select(not (.status | contains("inactive|suspended"))) | count' users.json 127 | ``` 128 | 129 | ### 🛠️ Technical Improvements 130 | 131 | #### Filter Module Enhancements 132 | 133 | - **New function `parse_not_condition_with_parentheses`**: Robust NOT operator parsing with mandatory parentheses 134 | - **Enhanced `apply_filter_with_string_operations`**: Support for NOT operator in string operation pipelines 135 | - **Improved error handling**: Better error messages for missing parentheses and invalid syntax 136 | 137 | #### Slicing Infrastructure 138 | 139 | - **Universal slicing support**: `apply_universal_slice_operation` handles all data types 140 | - **Negative index processing**: `parse_index_with_negative` for Python-style negative indexing 141 | - **Data structure detection**: `detect_data_structure` for intelligent slicing behavior 142 | 143 | #### Pattern Matching Optimization 144 | 145 | - **OR pattern preprocessing**: Efficient pipe-delimited pattern parsing 146 | - **Multi-pattern contains operations**: Optimized string matching for multiple patterns 147 | - **Regex-free implementation**: Fast pattern matching without regex overhead 148 | 149 | ### 🔄 Breaking Changes 150 | 151 | None. This release is fully backward compatible with v0.2.x. 152 | 153 | ### 🐛 Bug Fixes 154 | 155 | - Fixed slice boundary checking for out-of-range indices 156 | - Improved pattern matching edge cases with empty patterns 157 | - Enhanced error reporting for malformed logical operations 158 | - Fixed memory usage issues with large slice operations 159 | 160 | ### 📖 Documentation Updates 161 | 162 | - **Complete logical operations reference**: Documentation for NOT and OR operators 163 | - **Comprehensive slicing guide**: All slicing capabilities with examples 164 | - **Advanced filtering patterns**: Real-world use case examples 165 | - **Performance best practices**: Guidelines for efficient query construction 166 | 167 | ### 🚀 Migration Guide 168 | 169 | #### For users upgrading from v0.2.1: 170 | 171 | All existing queries continue to work without changes. 
New features are additive:
172 | 
173 | **New NOT operator usage:**
174 | 
175 | ```bash
176 | # Old approach (still works)
177 | hawk '.users[] | select(.age <= 65)' users.json
178 | 
179 | # New approach with NOT operator
180 | hawk '.users[] | select(not (.age > 65))' users.json
181 | ```
182 | 
183 | **New OR operator usage:**
184 | 
185 | ```bash
186 | # Old approach with multiple queries
187 | hawk '.logs[] | select(.level == "ERROR")' logs.json
188 | hawk '.logs[] | select(.level == "FATAL")' logs.json
189 | 
190 | # New approach with OR operator
191 | hawk '.logs[] | select(.level | contains("ERROR|FATAL"))' logs.json
192 | ```
193 | 
194 | **New slicing capabilities:**
195 | 
196 | ```bash
197 | # Get last 10 users (new)
198 | hawk '.users[-10:]' users.json
199 | 
200 | # Get middle section of data (new)
201 | hawk '.data[100:200]' data.json
202 | 
203 | # Extract filename from path (new)
204 | hawk '.files[] | .path | split("/")[-1]' files.json
205 | ```
206 | 
207 | ## [0.2.1] - 2025-07-16
208 | 
209 | ### 🐛 Bug Fixes
210 | 
211 | - Fixed single object field access (e.g., `.Parameters` in CloudFormation templates)
212 | - Corrected info display for single objects ("Single Object" vs "Object Array")
213 | - Enhanced support for YAML/JSON single object files
214 | 
215 | ### 🔧 Improvements
216 | 
217 | - Better error messages for field access
218 | - Improved CloudFormation, Docker Compose, Kubernetes manifest support
219 | 
220 | ## [0.2.0] - 2025-07-16
221 | 
222 | ### 🎉 Major Features Added
223 | 
224 | #### Plain Text Support
225 | 
226 | - **Universal file format support**: Now processes plain text files, log files, configuration files, and any text-based data
227 | - **Automatic format detection**: Intelligently detects JSON, YAML, CSV, and plain text files
228 | - **Unified query syntax**: Same query language works across all supported formats
229 | - **Text-as-array processing**: Each line becomes a string element in an array for consistent processing
230 | 
231 | #### String Operations
232 | 
233 | - **Complete string manipulation suite**: `upper`, `lower`, `trim`, `trim_start`, `trim_end`
234 | - **String analysis functions**: `length`, `reverse`
235 | - **Pattern matching**: `contains(pattern)`, `starts_with(pattern)`, `ends_with(pattern)`
236 | - **Text transformation**: `replace(old, new)`, `substring(start, length)`
237 | - **String parsing**: `split(delimiter)` to convert strings to arrays
238 | - **Array joining**: `join(delimiter)` to convert arrays back to strings
239 | 
240 | #### Enhanced map() Function
241 | 
242 | - **Data transformation pipeline**: Transform data elements with chained string operations
243 | - **Type-safe operations**: Proper error handling for incompatible data types
244 | - **Complex pipelines**: Support for multi-step transformations like `map(. 
| trim | upper | replace("old", "new"))` 245 | 246 | #### Statistical Functions 247 | 248 | - **Descriptive statistics**: `median`, `stddev` (standard deviation) 249 | - **Data manipulation**: `unique` (remove duplicates), `sort` (sort values) 250 | - **Array operations**: `length` for counting elements 251 | - **Field-specific operations**: All statistical functions support field specification (e.g., `median(.price)`) 252 | 253 | #### Colored Output 254 | 255 | - **Automatic TTY detection**: Colors in terminal, plain text when piped or redirected 256 | - **Beautiful syntax highlighting**: 257 | - Table headers in blue with bold formatting 258 | - Numbers in green 259 | - Boolean values in yellow 260 | - Null values in gray 261 | - JSON syntax highlighting with colored keys and values 262 | - **Environment variable support**: Respects `NO_COLOR` environment variable 263 | - **Multiple output formats**: Enhanced table, JSON, and list outputs with appropriate coloring 264 | 265 | ### 🔧 Improvements 266 | 267 | #### Enhanced Error Handling 268 | 269 | - **Detailed error messages**: Context-aware error reporting with specific field and operation information 270 | - **Type-safe operations**: Better validation of operations on different data types 271 | - **Pipeline debugging**: Improved error location reporting in complex query pipelines 272 | 273 | #### Better File Format Detection 274 | 275 | - **Robust detection algorithms**: Improved heuristics for distinguishing between formats 276 | - **Edge case handling**: Better support for malformed or ambiguous files 277 | - **Fallback mechanisms**: Graceful degradation to text processing when format detection fails 278 | 279 | #### Performance Optimizations 280 | 281 | - **Memory-efficient processing**: Optimized data structures for large datasets 282 | - **Faster pipeline execution**: Improved query parsing and execution engine 283 | - **Reduced startup time**: Optimized initialization and dependency loading 284 | 285 | #### Pipeline Processing Improvements 286 | 287 | - **Parentheses-aware parsing**: Proper handling of nested operations like `map(. | contains("text") | not)` 288 | - **Complex query support**: Better support for multi-level operations and transformations 289 | - **Operation chaining**: Improved reliability of long pipeline chains 290 | 291 | ### 📊 New Use Cases Enabled 292 | 293 | #### Log File Analysis 294 | 295 | ```bash 296 | # Extract error logs with timestamps 297 | hawk '. | select(. | contains("ERROR")) | map(. | substring(0, 19))' app.log 298 | 299 | # Count log levels 300 | hawk '. | map(. | split(" ")[2]) | group_by(.) | count' application.log 301 | 302 | # Find unique IP addresses 303 | hawk '. | map(. | split(" ")[0]) | unique | sort' access.log 304 | ``` 305 | 306 | #### Text Data Processing 307 | 308 | ```bash 309 | # Clean and normalize text 310 | hawk '. | map(. | trim | lower)' names.txt 311 | 312 | # Extract file extensions 313 | hawk '. | map(. | split(".") | last)' filelist.txt 314 | 315 | # Statistical text analysis 316 | hawk '. | map(. 
| split(" ") | length) | median' documents.txt 317 | ``` 318 | 319 | #### Data Cleaning and Normalization 320 | 321 | ```bash 322 | # Email normalization 323 | hawk '.users[] | map(.email | lower | trim)' users.csv 324 | 325 | # Complex string transformations 326 | hawk '.products[] | map(.name | replace("_", " ") | upper)' inventory.json 327 | 328 | # Data validation and cleaning 329 | hawk '.records[] | select(.id | length == 8) | map(.status | upper)' data.csv 330 | ``` 331 | 332 | #### Advanced Analytics 333 | 334 | ```bash 335 | # Statistical analysis 336 | hawk '.measurements[] | group_by(.sensor) | stddev(.temperature)' sensor_data.json 337 | 338 | # Median calculations 339 | hawk '.sales[] | group_by(.region) | median(.amount)' sales_data.csv 340 | 341 | # Unique value analysis 342 | hawk '.users[] | unique(.department) | sort' employee_data.json 343 | ``` 344 | 345 | ### 🛠️ Technical Improvements 346 | 347 | #### New Dependencies 348 | 349 | - `termcolor ^1.4`: For colored output support 350 | - `is-terminal ^0.4`: For TTY detection 351 | 352 | #### Code Architecture 353 | 354 | - **New module `string_ops`**: Centralized string operation handling 355 | - **New module `stats_ops`**: Statistical function implementations 356 | - **Enhanced `filter.rs`**: Improved pipeline operation handling 357 | - **Updated `output.rs`**: Comprehensive colored output support 358 | - **Improved `setup.rs`**: Better file format detection and text processing 359 | 360 | #### Testing 361 | 362 | - **Comprehensive test suite**: Added tests for all new string operations 363 | - **Statistical function testing**: Validation of median, stddev, and other statistical operations 364 | - **Integration testing**: End-to-end testing of complex pipeline operations 365 | - **Edge case coverage**: Testing of malformed inputs and error conditions 366 | 367 | ### 🔄 Breaking Changes 368 | 369 | None. This release is fully backward compatible with v0.1.x. 370 | 371 | ### 📦 Migration Guide 372 | 373 | No migration required. All existing queries and workflows continue to work unchanged. 
374 | 
375 | ### 🐛 Bug Fixes
376 | 
377 | - Fixed pipeline parsing issues with complex nested operations
378 | - Improved CSV type inference accuracy
379 | - Enhanced error reporting for malformed queries
380 | - Fixed memory usage issues with large datasets
381 | 
382 | ### 📖 Documentation Updates
383 | 
384 | - **Comprehensive README update**: Added extensive documentation for new features
385 | - **String operations guide**: Complete reference for all string manipulation functions
386 | - **Statistical functions documentation**: Usage examples and parameter descriptions
387 | - **Text processing examples**: Real-world use cases for log analysis and text processing
388 | - **Enhanced query syntax reference**: Updated with all new operations and examples
389 | 
390 | ### 🙏 Acknowledgments
391 | 
392 | - Community feedback on string processing needs
393 | - Performance suggestions from early adopters
394 | - Documentation improvements from user contributions
395 | 
396 | ## [0.1.0] - 2025-07-12
397 | 
398 | ### 🎉 Initial Release
399 | 
400 | #### Core Features
401 | 
402 | - **Multi-format support**: JSON, YAML, CSV parsing and processing
403 | - **Pandas-like query language**: Intuitive syntax for data analysis
404 | - **Field access and navigation**: Deep nested field access with array expansion
405 | - **Filtering operations**: `select()` with comparison operators
406 | - **Aggregation functions**: `count`, `sum`, `avg`, `min`, `max`
407 | - **Grouping operations**: `group_by()` with aggregation support
408 | - **Multiple output formats**: Table, JSON, list with automatic format detection
409 | 
410 | #### Technical Foundation
411 | 
412 | - **Rust implementation**: Fast, memory-safe data processing
413 | - **serde_json integration**: Robust JSON parsing and manipulation
414 | - **Type-aware processing**: Intelligent handling of numbers, strings, booleans
415 | - **Error handling**: Comprehensive error reporting with thiserror
416 | - **CLI interface**: User-friendly command-line interface with clap
417 | 
418 | #### Supported Operations
419 | 
420 | - Field access: `.field`, `.array[0]`, `.array[]`, `.nested.field`
421 | - Filtering: `select(.field > value)`, `select(.field == "value")`
422 | - Aggregation: `sum(.field)`, `avg(.field)`, `min(.field)`, `max(.field)`, `count`
423 | - Grouping: `group_by(.field)` with aggregation support
424 | - Info: `. | info` for data structure exploration
425 | 
426 | #### Output Formats
427 | 
428 | - **Table format**: Structured table output for object arrays
429 | - **JSON format**: Pretty-printed JSON output
430 | - **List format**: Simple list output for array data
431 | - **Auto format**: Intelligent format selection based on data structure
432 | 
433 | ---
434 | 
435 | For more details about any release, please see the [GitHub releases page](https://github.com/kyotalab/hawk/releases).
436 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to hawk 🦅
2 | 
3 | Thank you for your interest in contributing to hawk! We welcome contributions from everyone, whether you're fixing a bug, adding a feature, improving documentation, or suggesting enhancements.
4 | 
5 | ## 🤝 Ways to Contribute
6 | 
7 | - 🐛 **Bug Reports**: Found an issue? Let us know!
8 | - ✨ **Feature Requests**: Have an idea for improvement? 
9 | - 🔧 **Code Contributions**: Bug fixes, new features, optimizations
10 | - 📚 **Documentation**: Improve README, add examples, write tutorials
11 | - 🧪 **Testing**: Add test cases, improve test coverage
12 | - 💡 **Examples**: Real-world use cases and sample datasets
13 | 
14 | ## 🚀 Getting Started
15 | 
16 | ### Development Setup
17 | 
18 | 1. **Fork the repository**
19 | ```bash
20 | git clone https://github.com/kyotalab/hawk.git
21 | cd hawk
22 | ```
23 | 
24 | 2. **Install Rust** (if not already installed)
25 | ```bash
26 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
27 | source ~/.cargo/env
28 | ```
29 | 
30 | 3. **Build the project**
31 | ```bash
32 | cargo build
33 | ```
34 | 
35 | 4. **Run tests**
36 | ```bash
37 | cargo test
38 | ```
39 | 
40 | 5. **Try it out**
41 | ```bash
42 | cargo run -- '. | info' examples/small/employees.json
43 | ```
44 | 
45 | ### Development Workflow
46 | 
47 | 1. **Create a branch** for your changes
48 | ```bash
49 | git checkout -b feature/amazing-feature
50 | ```
51 | 
52 | 2. **Make your changes** with clear, focused commits
53 | 
54 | 3. **Test thoroughly**
55 | ```bash
56 | cargo test
57 | cargo clippy # Lint check
58 | cargo fmt # Format code
59 | ```
60 | 
61 | 4. **Submit a Pull Request** with a clear description
62 | 
63 | ## 🐛 Reporting Bugs
64 | 
65 | When reporting bugs, please include:
66 | 
67 | ### Bug Report Template
68 | ```
69 | **Describe the bug**
70 | A clear description of what the bug is.
71 | 
72 | **To Reproduce**
73 | Steps to reproduce the behavior:
74 | 1. Run command '...'
75 | 2. With data file '...'
76 | 3. See error
77 | 
78 | **Expected behavior**
79 | What you expected to happen.
80 | 
81 | **Actual behavior**
82 | What actually happened.
83 | 
84 | **Environment**
85 | - OS: [e.g., Ubuntu 22.04, macOS 13.0, Windows 11]
86 | - Rust version: [e.g., 1.70.0]
87 | - hawk version: [e.g., 0.1.0]
88 | 
89 | **Sample data (if applicable)**
90 | Minimal example that reproduces the issue.
91 | 
92 | **Additional context**
93 | Any other relevant information.
94 | ```
95 | 
96 | ## ✨ Feature Requests
97 | 
98 | We love new ideas! When suggesting features:
99 | 
100 | ### Feature Request Template
101 | ```
102 | **Feature Summary**
103 | Brief description of the feature.
104 | 
105 | **Motivation**
106 | Why would this feature be useful? What problem does it solve?
107 | 
108 | **Proposed Solution**
109 | How should this feature work?
110 | 
111 | **Example Usage**
112 | Show how users would interact with this feature:
113 | hawk '.data | new_feature(.field)' data.json
114 | 
115 | **Alternatives Considered**
116 | Are there other ways to solve this problem?
117 | 
118 | **Additional Context**
119 | Any other relevant information, mockups, or examples. 
120 | ```
121 | 
122 | ## 💻 Code Contributions
123 | 
124 | ### Coding Standards
125 | 
126 | - **Follow Rust conventions**: Use `cargo fmt` and `cargo clippy`
127 | - **Write tests**: All new features should include tests
128 | - **Document public APIs**: Add doc comments for public functions
129 | - **Keep it simple**: Prefer readable code over clever code
130 | - **Follow existing patterns**: Match the existing codebase style
131 | 
132 | ### Code Organization
133 | 
134 | ```
135 | src/
136 | ├── main.rs # Entry point
137 | ├── lib.rs # Library root
138 | ├── arg.rs # Command line interface
139 | ├── error.rs # Error types
140 | ├── setup.rs # File reading & format detection
141 | ├── parser.rs # Query parsing
142 | ├── executor.rs # Query execution
143 | ├── filter.rs # Filtering & aggregation
144 | ├── output.rs # Output formatting
145 | └── utils.rs # Utility functions
146 | ```
147 | 
148 | ### Adding New Features
149 | 
150 | 1. **Start with tests**: Write tests for your feature first
151 | 2. **Implement incrementally**: Break large features into smaller chunks
152 | 3. **Update documentation**: Add examples and update README if needed
153 | 4. **Consider backwards compatibility**: Don't break existing queries
154 | 
155 | ### Example: Adding a New Aggregation Function
156 | 
157 | ```rust
158 | // 1. Add to apply_pipeline_operation in filter.rs
159 | } else if operation.starts_with("median(") && operation.ends_with(")") {
160 |     let field = &operation[7..operation.len()-1];
161 |     let field_name = field.trim_start_matches('.');
162 | 
163 |     if is_grouped_data(&data) {
164 |         apply_aggregation_to_groups(data, "median", field_name)
165 |     } else {
166 |         calculate_median_simple(data, field_name)
167 |     }
168 | }
169 | // 2. Implement the calculation function
170 | fn calculate_median_simple(data: Vec<Value>, field_name: &str) -> Result<Vec<Value>, Error> {
171 |     // Implementation here
172 | }
173 | 
174 | // 3. Add group support in apply_aggregation_to_groups
175 | "median" => calculate_median(items, field_name)?,
176 | 
177 | // 4. Write tests
178 | #[test]
179 | fn test_median_calculation() {
180 |     // Test cases here
181 | }
182 | ```
183 | 
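Once wired in, the new aggregation works from the CLI like any built-in (the `median` calls below mirror examples already used elsewhere in these docs):

```bash
# Simple median over a numeric field
hawk '.[] | median(.price)' products.json

# Median per group
hawk '.sales[] | group_by(.region) | median(.amount)' sales_data.csv
```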
184 | ## 🧪 Testing Guidelines
185 | 
186 | ### Running Tests
187 | ```bash
188 | cargo test # All tests
189 | cargo test test_name # Specific test
190 | cargo test --test integration # Integration tests only
191 | ```
192 | 
193 | ### Test Categories
194 | 
195 | 1. **Unit Tests**: Test individual functions
196 | ```rust
197 | #[cfg(test)]
198 | mod tests {
199 |     use super::*;
200 | 
201 |     #[test]
202 |     fn test_parse_simple_query() {
203 |         // Test implementation
204 |     }
205 | }
206 | ```
207 | 
208 | 2. **Integration Tests**: Test complete workflows
209 | ```rust
210 | // tests/integration_test.rs
211 | #[test]
212 | fn test_csv_groupby_workflow() {
213 |     // End-to-end test
214 | }
215 | ```
216 | 
217 | 3. **Example Tests**: Verify README examples work
218 | ```rust
219 | #[test]
220 | fn test_readme_examples() {
221 |     // Test examples from documentation
222 | }
223 | ```
224 | 
225 | ### Adding Test Data
226 | 
227 | Place test files in `tests/data/`:
228 | ```
229 | tests/
230 | ├── data/
231 | │   ├── users.json
232 | │   ├── sales.csv
233 | │   └── config.yaml
234 | └── integration_test.rs
235 | ```
236 | 
237 | ## 📚 Documentation Guidelines
238 | 
239 | ### Code Documentation
240 | ```rust
241 | /// Calculates the median value for a numeric field
242 | ///
243 | /// # Arguments
244 | /// * `data` - Vector of JSON values to process
245 | /// * `field_name` - Name of the field to calculate median for
246 | ///
247 | /// # Examples
248 | /// ```
249 | /// let result = calculate_median(data, "price")?;
250 | /// ```
251 | pub fn calculate_median(data: Vec<Value>, field_name: &str) -> Result<f64, Error> {
252 |     // Implementation
253 | }
254 | ```
255 | 
256 | ### README Updates
257 | - Add new features to the feature list
258 | - Include usage examples
259 | - Update the comparison table if needed
260 | - Add real-world use cases
261 | 
262 | ## 🎯 Priority Areas
263 | 
264 | We're especially interested in contributions in these areas:
265 | 
266 | ### High Priority
267 | - 🐛 **Bug fixes**: Any correctness issues
268 | - 🚀 **Performance improvements**: Memory usage, speed optimizations
269 | - 📊 **New aggregation functions**: `percentile` and beyond (`median` and `stddev` shipped in v0.2.0)
270 | - 🔧 **CSV improvements**: Better type detection, delimiter handling
271 | 
272 | ### Medium Priority
273 | - 🌐 **Output formats**: XML, TSV support
274 | - 🔍 **Query enhancements**: Regular expressions, additional string functions
275 | - 📈 **Visualization**: ASCII charts, histograms
276 | - 🔄 **Streaming**: Large file support
277 | 
278 | ### Lower Priority
279 | - 🎨 **UI improvements**: Colors, better formatting
280 | - 📦 **Packaging**: Homebrew, APT packages
281 | - 🔌 **Plugins**: Extensibility system
282 | 
283 | ## 📋 Pull Request Guidelines
284 | 
285 | ### Before Submitting
286 | - [ ] All tests pass (`cargo test`)
287 | - [ ] Code is formatted (`cargo fmt`)
288 | - [ ] No clippy warnings (`cargo clippy`)
289 | - [ ] Documentation updated if needed
290 | - [ ] Examples work as expected
291 | 
292 | ### PR Description Template
293 | ```
294 | ## Summary
295 | Brief description of changes
296 | 
297 | ## Motivation
298 | Why is this change needed?
299 | 
300 | ## Changes
301 | - [ ] Feature A added
302 | - [ ] Bug B fixed
303 | - [ ] Tests updated
304 | 
305 | ## Testing
306 | How was this tested?
307 | 
308 | ## Breaking Changes
309 | Any backwards incompatible changes?
310 | 
311 | ## Related Issues
312 | Fixes #123
313 | ```
314 | 
315 | ## 🌟 Recognition
316 | 
317 | Contributors will be recognized in:
318 | - README acknowledgments
319 | - Release notes
320 | - GitHub contributors page
321 | 
322 | ## 📞 Getting Help
323 | 
324 | - 💬 **Discussions**: Use GitHub Discussions for questions
325 | - 🐛 **Issues**: Use GitHub Issues for bugs and feature requests
326 | - 📧 **Email**: Contact maintainers for sensitive issues
327 | 
328 | ## 📜 Code of Conduct
329 | 
330 | We follow the [Rust Code of Conduct](https://www.rust-lang.org/policies/code-of-conduct). Please be respectful and inclusive in all interactions.
331 | 
332 | ## 🙏 Thank You!
333 | 
334 | Every contribution helps make hawk better for everyone. Whether it's a typo fix or a major feature, we appreciate your effort!
335 | 
336 | ---
337 | 
338 | Happy contributing! 
🦅 -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anstream" 16 | version = "0.6.19" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" 19 | dependencies = [ 20 | "anstyle", 21 | "anstyle-parse", 22 | "anstyle-query", 23 | "anstyle-wincon", 24 | "colorchoice", 25 | "is_terminal_polyfill", 26 | "utf8parse", 27 | ] 28 | 29 | [[package]] 30 | name = "anstyle" 31 | version = "1.0.11" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" 34 | 35 | [[package]] 36 | name = "anstyle-parse" 37 | version = "0.2.7" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" 40 | dependencies = [ 41 | "utf8parse", 42 | ] 43 | 44 | [[package]] 45 | name = "anstyle-query" 46 | version = "1.1.3" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" 49 | dependencies = [ 50 | "windows-sys", 51 | ] 52 | 53 | [[package]] 54 | name = "anstyle-wincon" 55 | version = "3.0.9" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" 58 | dependencies = [ 59 | "anstyle", 60 | "once_cell_polyfill", 61 | "windows-sys", 62 | ] 63 | 64 | [[package]] 65 | name = "anyhow" 66 | version = "1.0.98" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" 69 | 70 | [[package]] 71 | name = "clap" 72 | version = "4.5.40" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" 75 | dependencies = [ 76 | "clap_builder", 77 | "clap_derive", 78 | ] 79 | 80 | [[package]] 81 | name = "clap_builder" 82 | version = "4.5.40" 83 | source = "registry+https://github.com/rust-lang/crates.io-index" 84 | checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" 85 | dependencies = [ 86 | "anstream", 87 | "anstyle", 88 | "clap_lex", 89 | "strsim", 90 | ] 91 | 92 | [[package]] 93 | name = "clap_derive" 94 | version = "4.5.40" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" 97 | dependencies = [ 98 | "heck", 99 | "proc-macro2", 100 | "quote", 101 | "syn", 102 | ] 103 | 104 | [[package]] 105 | name = "clap_lex" 106 | version = "0.7.5" 107 | source = "registry+https://github.com/rust-lang/crates.io-index" 108 | checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" 109 | 110 | [[package]] 111 | name = "colorchoice" 112 | version = "1.0.4" 113 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" 115 | 116 | [[package]] 117 | name = "csv" 118 | version = "1.3.1" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" 121 | dependencies = [ 122 | "csv-core", 123 | "itoa", 124 | "ryu", 125 | "serde", 126 | ] 127 | 128 | [[package]] 129 | name = "csv-core" 130 | version = "0.1.12" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" 133 | dependencies = [ 134 | "memchr", 135 | ] 136 | 137 | [[package]] 138 | name = "equivalent" 139 | version = "1.0.2" 140 | source = "registry+https://github.com/rust-lang/crates.io-index" 141 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 142 | 143 | [[package]] 144 | name = "hashbrown" 145 | version = "0.15.4" 146 | source = "registry+https://github.com/rust-lang/crates.io-index" 147 | checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" 148 | 149 | [[package]] 150 | name = "hawk-data" 151 | version = "0.2.3" 152 | dependencies = [ 153 | "anyhow", 154 | "clap", 155 | "csv", 156 | "indexmap", 157 | "is-terminal", 158 | "regex", 159 | "serde", 160 | "serde_json", 161 | "serde_yaml", 162 | "termcolor", 163 | "thiserror", 164 | ] 165 | 166 | [[package]] 167 | name = "heck" 168 | version = "0.5.0" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 171 | 172 | [[package]] 173 | name = "hermit-abi" 174 | version = "0.5.2" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" 177 | 178 | [[package]] 179 | name = "indexmap" 180 | version = "2.10.0" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" 183 | dependencies = [ 184 | "equivalent", 185 | "hashbrown", 186 | "serde", 187 | ] 188 | 189 | [[package]] 190 | name = "is-terminal" 191 | version = "0.4.16" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" 194 | dependencies = [ 195 | "hermit-abi", 196 | "libc", 197 | "windows-sys", 198 | ] 199 | 200 | [[package]] 201 | name = "is_terminal_polyfill" 202 | version = "1.70.1" 203 | source = "registry+https://github.com/rust-lang/crates.io-index" 204 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 205 | 206 | [[package]] 207 | name = "itoa" 208 | version = "1.0.15" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 211 | 212 | [[package]] 213 | name = "libc" 214 | version = "0.2.174" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" 217 | 218 | [[package]] 219 | name = "memchr" 220 | version = "2.7.5" 221 | source = "registry+https://github.com/rust-lang/crates.io-index" 222 | checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" 223 | 224 | [[package]] 225 | name = "once_cell_polyfill" 226 | version = 
"1.70.1" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 229 | 230 | [[package]] 231 | name = "proc-macro2" 232 | version = "1.0.95" 233 | source = "registry+https://github.com/rust-lang/crates.io-index" 234 | checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" 235 | dependencies = [ 236 | "unicode-ident", 237 | ] 238 | 239 | [[package]] 240 | name = "quote" 241 | version = "1.0.40" 242 | source = "registry+https://github.com/rust-lang/crates.io-index" 243 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 244 | dependencies = [ 245 | "proc-macro2", 246 | ] 247 | 248 | [[package]] 249 | name = "regex" 250 | version = "1.11.1" 251 | source = "registry+https://github.com/rust-lang/crates.io-index" 252 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 253 | dependencies = [ 254 | "aho-corasick", 255 | "memchr", 256 | "regex-automata", 257 | "regex-syntax", 258 | ] 259 | 260 | [[package]] 261 | name = "regex-automata" 262 | version = "0.4.9" 263 | source = "registry+https://github.com/rust-lang/crates.io-index" 264 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 265 | dependencies = [ 266 | "aho-corasick", 267 | "memchr", 268 | "regex-syntax", 269 | ] 270 | 271 | [[package]] 272 | name = "regex-syntax" 273 | version = "0.8.5" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 276 | 277 | [[package]] 278 | name = "ryu" 279 | version = "1.0.20" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 282 | 283 | [[package]] 284 | name = "serde" 285 | version = "1.0.219" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 288 | dependencies = [ 289 | "serde_derive", 290 | ] 291 | 292 | [[package]] 293 | name = "serde_derive" 294 | version = "1.0.219" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 297 | dependencies = [ 298 | "proc-macro2", 299 | "quote", 300 | "syn", 301 | ] 302 | 303 | [[package]] 304 | name = "serde_json" 305 | version = "1.0.140" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 308 | dependencies = [ 309 | "indexmap", 310 | "itoa", 311 | "memchr", 312 | "ryu", 313 | "serde", 314 | ] 315 | 316 | [[package]] 317 | name = "serde_yaml" 318 | version = "0.9.34+deprecated" 319 | source = "registry+https://github.com/rust-lang/crates.io-index" 320 | checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" 321 | dependencies = [ 322 | "indexmap", 323 | "itoa", 324 | "ryu", 325 | "serde", 326 | "unsafe-libyaml", 327 | ] 328 | 329 | [[package]] 330 | name = "strsim" 331 | version = "0.11.1" 332 | source = "registry+https://github.com/rust-lang/crates.io-index" 333 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 334 | 335 | [[package]] 336 | name = "syn" 337 | version = "2.0.104" 338 | source = "registry+https://github.com/rust-lang/crates.io-index" 339 | checksum = 
"17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" 340 | dependencies = [ 341 | "proc-macro2", 342 | "quote", 343 | "unicode-ident", 344 | ] 345 | 346 | [[package]] 347 | name = "termcolor" 348 | version = "1.4.1" 349 | source = "registry+https://github.com/rust-lang/crates.io-index" 350 | checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" 351 | dependencies = [ 352 | "winapi-util", 353 | ] 354 | 355 | [[package]] 356 | name = "thiserror" 357 | version = "2.0.12" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" 360 | dependencies = [ 361 | "thiserror-impl", 362 | ] 363 | 364 | [[package]] 365 | name = "thiserror-impl" 366 | version = "2.0.12" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" 369 | dependencies = [ 370 | "proc-macro2", 371 | "quote", 372 | "syn", 373 | ] 374 | 375 | [[package]] 376 | name = "unicode-ident" 377 | version = "1.0.18" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 380 | 381 | [[package]] 382 | name = "unsafe-libyaml" 383 | version = "0.2.11" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" 386 | 387 | [[package]] 388 | name = "utf8parse" 389 | version = "0.2.2" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 392 | 393 | [[package]] 394 | name = "winapi-util" 395 | version = "0.1.9" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" 398 | dependencies = [ 399 | "windows-sys", 400 | ] 401 | 402 | [[package]] 403 | name = "windows-sys" 404 | version = "0.59.0" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 407 | dependencies = [ 408 | "windows-targets", 409 | ] 410 | 411 | [[package]] 412 | name = "windows-targets" 413 | version = "0.52.6" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 416 | dependencies = [ 417 | "windows_aarch64_gnullvm", 418 | "windows_aarch64_msvc", 419 | "windows_i686_gnu", 420 | "windows_i686_gnullvm", 421 | "windows_i686_msvc", 422 | "windows_x86_64_gnu", 423 | "windows_x86_64_gnullvm", 424 | "windows_x86_64_msvc", 425 | ] 426 | 427 | [[package]] 428 | name = "windows_aarch64_gnullvm" 429 | version = "0.52.6" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 432 | 433 | [[package]] 434 | name = "windows_aarch64_msvc" 435 | version = "0.52.6" 436 | source = "registry+https://github.com/rust-lang/crates.io-index" 437 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 438 | 439 | [[package]] 440 | name = "windows_i686_gnu" 441 | version = "0.52.6" 442 | source = "registry+https://github.com/rust-lang/crates.io-index" 443 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 444 | 445 | [[package]] 446 | name 
= "windows_i686_gnullvm" 447 | version = "0.52.6" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 450 | 451 | [[package]] 452 | name = "windows_i686_msvc" 453 | version = "0.52.6" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 456 | 457 | [[package]] 458 | name = "windows_x86_64_gnu" 459 | version = "0.52.6" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 462 | 463 | [[package]] 464 | name = "windows_x86_64_gnullvm" 465 | version = "0.52.6" 466 | source = "registry+https://github.com/rust-lang/crates.io-index" 467 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 468 | 469 | [[package]] 470 | name = "windows_x86_64_msvc" 471 | version = "0.52.6" 472 | source = "registry+https://github.com/rust-lang/crates.io-index" 473 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 474 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hawk-data" 3 | version = "0.2.3" 4 | edition = "2024" 5 | authors = ["Kyota "] 6 | license = "MIT" 7 | description = "Modern data analysis tool for structured data (JSON, YAML, CSV)" 8 | readme = "README.md" 9 | homepage = "https://github.com/kyotalab/hawk" 10 | repository = "https://github.com/kyotalab/hawk" 11 | keywords = ["awk", "cli", "jq", "analysis"] 12 | categories = ["command-line-utilities"] 13 | 14 | [[bin]] 15 | name = "hawk" 16 | path = "src/main.rs" 17 | 18 | [dependencies] 19 | anyhow = "1.0.98" 20 | clap = { version = "4.5.40", features = ["derive"] } 21 | csv = "1.3.1" 22 | indexmap = { version = "2.10.0", features = ["serde"] } 23 | is-terminal = "0.4.16" 24 | regex = "1.11.1" 25 | serde = { version = "1.0.219", features = ["derive"] } 26 | serde_json = { version = "1.0.140", features = ["preserve_order"] } 27 | serde_yaml = "0.9.34" 28 | termcolor = "1.4.1" 29 | thiserror = "2.0.12" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 kyotalab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hawk 🦅
2 | 
3 | **Modern data analysis tool for JSON, YAML, CSV, and text files**
4 | 
5 | [![Rust](https://img.shields.io/badge/rust-1.85%2B-orange.svg)](https://www.rust-lang.org/)
6 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
7 | [![Crates.io](https://img.shields.io/crates/v/hawk-data.svg)](https://crates.io/crates/hawk-data)
8 | [![Crates.io](https://img.shields.io/crates/d/hawk-data.svg)](https://crates.io/crates/hawk-data)
9 | [![GitHub Stars](https://img.shields.io/github/stars/kyotalab/hawk.svg)](https://github.com/kyotalab/hawk/stargazers)
10 | [![GitHub Release](https://img.shields.io/github/release/kyotalab/hawk.svg)](https://github.com/kyotalab/hawk/releases)
11 | 
12 | hawk combines the simplicity of `awk` with the power of `pandas`, bringing unified data processing to your command line. Process any data format with the same intuitive syntax.
13 | 
14 | ## ⚡ Quick Start
15 | 
16 | ### Installation
17 | 
18 | ```bash
19 | # Homebrew (macOS/Linux)
20 | brew install kyotalab/tools/hawk
21 | 
22 | # Cargo (Rust)
23 | cargo install hawk-data
24 | 
25 | # Verify installation
26 | hawk --version
27 | ```
28 | 
29 | ### 30-Second Demo
30 | 
31 | ```bash
32 | # JSON/CSV analysis - same syntax!
33 | hawk '.users[] | select(.age > 30) | count' users.json
34 | hawk '.[] | group_by(.department) | avg(.salary)' employees.csv
35 | 
36 | # Text/log processing with slicing (NEW!)
37 | hawk -t '. | select(. | contains("ERROR|WARN")) | .[-100:]' app.log
38 | hawk -t '. | map(. | split(" ")[0:3]) | unique' access.log
39 | 
40 | # Advanced string operations with multiple fields
41 | hawk '.posts[] | map(.title, .content | trim | lower)' blog.json
42 | hawk '.[] | group_by(.category) | .[0:10] | avg(.price)' products.json
43 | ```
44 | 
45 | ## 🚀 Why hawk?
46 | 
47 | | Feature | hawk | jq | awk | pandas |
48 | | ------------------------ | -------------------------- | ---------------- | ------------- | ------------------ |
49 | | **Multi-format** | ✅ JSON, YAML, CSV, Text | ❌ JSON only | ❌ Text only | ❌ Python required |
50 | | **Unified syntax** | ✅ Same queries everywhere | ❌ JSON-specific | ❌ Line-based | ❌ Complex setup |
51 | | **String operations** | ✅ 14 built-in + slicing | ⚠️ Limited | ⚠️ Basic | ✅ Extensive |
52 | | **Statistical analysis** | ✅ Built-in median, stddev | ❌ None | ❌ None | ✅ Full suite |
53 | | **Learning curve** | 🟢 Familiar pandas-like | 🟡 Steep | 🟢 Simple | 🔴 High |
54 | 
55 | ## 🎯 Key Features
56 | 
57 | ### **Universal Data Processing**
58 | 
59 | Process any format with identical syntax:
60 | 
61 | ```bash
62 | hawk '.items[] | select(.price > 100)' data.json # JSON
63 | hawk '.items[] | select(.price > 100)' data.csv # CSV
64 | hawk '.items[] | select(.price > 100)' data.yaml # YAML
65 | hawk -t '. | select(. | contains("$"))' data.txt # Text
66 | ```
67 | 
68 | ### **Advanced Text Processing (NEW in v0.2.3!)**
69 | 
70 | ```bash
71 | # Split with slicing - extract exactly what you need
72 | echo "2024-01-15 10:30:45 INFO message" | hawk -t '. | map(. 
| split(" ")[0:2])' 73 | # → ["2024-01-15", "10:30:45"] 74 | 75 | # OR conditions for flexible filtering 76 | hawk -t '. | select(. | contains("GET|POST|PUT"))' access.log 77 | 78 | # Powerful slicing for any operation result 79 | hawk '.[] | sort(.revenue) | .[-10:]' companies.json # Top 10 80 | hawk '.[] | group_by(.category) | .[0:5]' products.json # 5 from each group 81 | ``` 82 | 83 | ### **Statistical Analysis Made Simple** 84 | 85 | ```bash 86 | # Instant insights from your data 87 | hawk '.sales[] | group_by(.region) | median(.amount)' sales.json 88 | hawk '.users[] | select(.active) | stddev(.session_time)' analytics.json 89 | hawk '.metrics[] | unique(.user_id) | count' engagement.json 90 | ``` 91 | 92 | ## 📚 Documentation 93 | 94 | ### **Get Started in 5 Minutes** 95 | 96 | - 🚀 [**Quick Start Guide**](docs/getting-started.md) - Essential basics 97 | - 📖 [**Query Language Reference**](docs/query-language.md) - Complete syntax 98 | - 🧵 [**String Operations**](docs/string-operations.md) - Text processing guide 99 | 100 | ### **Master Advanced Features** 101 | 102 | - 📊 [**Data Analysis**](docs/data-analysis.md) - Statistical workflows 103 | - 📄 [**Text Processing**](docs/text-processing.md) - Log analysis and text manipulation 104 | - 💼 [**Real-world Examples**](docs/examples/) - Industry-specific use cases 105 | 106 | ### **Use Case Guides(In progress)** 107 | 108 | - 🔍 [**Log Analysis**](docs/examples/log-analysis.md) - Docker, nginx, application logs 109 | - ⚙️ [**DevOps Workflows**](docs/examples/devops-workflows.md) - Kubernetes, CI/CD, monitoring 110 | - 📈 [**Data Science**](docs/examples/data-science.md) - CSV analysis, statistics, ML prep 111 | 112 | ## 🌟 Popular Workflows 113 | 114 | ### **Log Analysis** 115 | 116 | ```bash 117 | # Find error patterns in application logs 118 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[0:2]) | unique' app.log 119 | 120 | # Analyze Docker container performance 121 | hawk -t '. | group_by(. | split(" ")[1]) | count' docker.log 122 | ``` 123 | 124 | ### **Data Exploration** 125 | 126 | ```bash 127 | # Quick dataset overview 128 | hawk '. | info' unknown-data.json 129 | 130 | # Statistical analysis 131 | hawk '.users[] | group_by(.department) | median(.salary)' employees.csv 132 | ``` 133 | 134 | ### **DevOps Automation** 135 | 136 | ```bash 137 | # Kubernetes resource analysis 138 | hawk '.items[] | select(.status.phase == "Running") | count' pods.json 139 | 140 | # Performance monitoring 141 | hawk '.metrics[] | group_by(.service) | avg(.response_time)' monitoring.json 142 | ``` 143 | 144 | ## ⭐ What's New in v0.2.3 145 | 146 | - **🎯 Advanced Slicing**: `.[0:10]`, `.[-5:]`, `group_by(.field) | .[0:3]` 147 | - **✂️ Split with Slicing**: `split(" ")[0:3]`, `split(",")[-2:]` 148 | - **🔍 OR Conditions**: `contains("GET|POST")`, `starts_with("ERROR|WARN")` 149 | - **📊 Stratified Sampling**: Sample from each group for unbiased analysis 150 | - **⚡ Performance**: Optimized for large datasets with efficient memory usage 151 | 152 | ## 🤝 Contributing 153 | 154 | We welcome contributions! See our [Contributing Guide](CONTRIBUTING.md). 155 | 156 | ```bash 157 | git clone https://github.com/kyotalab/hawk.git 158 | cd hawk 159 | cargo build --release 160 | cargo test 161 | ``` 162 | 163 | ## 📄 License 164 | 165 | MIT License - see [LICENSE](LICENSE) for details. 
166 | 167 | --- 168 | 169 | **Ready to transform your data workflows?** Start with our [5-minute tutorial](docs/getting-started.md) 🚀 170 | -------------------------------------------------------------------------------- /docs/data-analysis.md: -------------------------------------------------------------------------------- 1 | # Data Analysis Guide 2 | 3 | Comprehensive guide to data analysis workflows with hawk. 4 | 5 | ## 📖 Table of Contents 6 | 7 | - [Data Analysis Fundamentals](#data-analysis-fundamentals) 8 | - [Exploratory Data Analysis](#exploratory-data-analysis) 9 | - [Statistical Operations](#statistical-operations) 10 | - [Data Filtering and Selection](#data-filtering-and-selection) 11 | - [Grouping and Aggregation](#grouping-and-aggregation) 12 | - [Data Transformation](#data-transformation) 13 | - [Time Series Analysis](#time-series-analysis) 14 | - [Performance Analytics](#performance-analytics) 15 | - [Business Intelligence](#business-intelligence) 16 | - [Advanced Analytics Patterns](#advanced-analytics-patterns) 17 | 18 | ## Data Analysis Fundamentals 19 | 20 | ### The hawk Analytics Workflow 21 | 22 | ```bash 23 | 1. Data Exploration → hawk '. | info' data.json 24 | 2. Data Cleaning → hawk '.[] | select(.field) | map(.field | trim)' 25 | 3. Data Filtering → hawk '.[] | select(.condition)' 26 | 4. Data Aggregation → hawk '.[] | group_by(.field) | agg_function' 27 | 5. Results Export → hawk '.results[]' --format csv > output.csv 28 | ``` 29 | 30 | ### Understanding Your Data Structure 31 | 32 | Before analysis, always understand your data: 33 | 34 | ```bash 35 | # Get basic information 36 | hawk '. | info' dataset.json 37 | 38 | # Count total records 39 | hawk '. | count' data.csv 40 | 41 | # Sample the first few records 42 | hawk '.[0:5]' data.json 43 | 44 | # Check for missing values 45 | hawk '.[] | select(.field == null) | count' data.json 46 | ``` 47 | 48 | ## Exploratory Data Analysis 49 | 50 | ### Data Overview and Profiling 51 | 52 | ```bash 53 | # Dataset summary 54 | hawk '. 
| info' sales_data.json
55 | 
56 | # Record count by category
57 | hawk '.[] | group_by(.category) | count' products.csv
58 | 
59 | # Unique values in a field
60 | hawk '.[] | .department | unique' employees.json
61 | 
62 | # Data quality check
63 | hawk '.[] | select(.email | contains("@")) | count' users.csv
64 | ```
65 | 
66 | ### Sample Data Analysis Workflow
67 | 
68 | Let's work with a sample sales dataset:
69 | 
70 | ```json
71 | {
72 |   "sales": [
73 |     {
74 |       "date": "2024-01-15",
75 |       "product": "Laptop",
76 |       "category": "Electronics",
77 |       "amount": 1200,
78 |       "quantity": 1,
79 |       "region": "North",
80 |       "salesperson": "Alice"
81 |     },
82 |     {
83 |       "date": "2024-01-15",
84 |       "product": "Mouse",
85 |       "category": "Electronics",
86 |       "amount": 25,
87 |       "quantity": 3,
88 |       "region": "South",
89 |       "salesperson": "Bob"
90 |     },
91 |     {
92 |       "date": "2024-01-16",
93 |       "product": "Desk",
94 |       "category": "Furniture",
95 |       "amount": 300,
96 |       "quantity": 2,
97 |       "region": "North",
98 |       "salesperson": "Alice"
99 |     },
100 |     {
101 |       "date": "2024-01-16",
102 |       "product": "Chair",
103 |       "category": "Furniture",
104 |       "amount": 150,
105 |       "quantity": 4,
106 |       "region": "South",
107 |       "salesperson": "Carol"
108 |     }
109 |   ]
110 | }
111 | ```
112 | 
113 | **Basic Analysis:**
114 | 
115 | ```bash
116 | # Total sales count
117 | hawk '.sales[] | count' sales_data.json
118 | 
119 | # Total revenue
120 | hawk '.sales[] | sum(.amount)' sales_data.json
121 | 
122 | # Average sale amount
123 | hawk '.sales[] | avg(.amount)' sales_data.json
124 | 
125 | # Sales by category
126 | hawk '.sales[] | group_by(.category) | sum(.amount)' sales_data.json
127 | 
128 | # Top performing regions
129 | hawk '.sales[] | group_by(.region) | sum(.amount)' sales_data.json
130 | ```
131 | 
132 | ## Statistical Operations
133 | 
134 | ### Descriptive Statistics
135 | 
136 | ```bash
137 | # Central tendency
138 | hawk '.[] | avg(.field)' data.json # Mean
139 | hawk '.[] | median(.field)' data.json # Median
140 | 
141 | # Variability
142 | hawk '.[] | min(.field)' data.json # Minimum
143 | hawk '.[] | max(.field)' data.json # Maximum
144 | hawk '.[] | stddev(.field)' data.json # Standard deviation
145 | 
146 | # Distribution
147 | hawk '.[] | .field | unique | sort' data.json # Unique values
148 | hawk '.[] | .field | sort' data.json # All values sorted
149 | ```
150 | 
151 | ### Advanced Statistical Analysis
152 | 
153 | ```bash
154 | # Quartile analysis (using slicing): worked example below
155 | hawk '.[] | sort(.price) | length' products.json # Get total count
156 | 
157 | # Range analysis
158 | hawk '.[] | select(.price >= 100) | select(.price <= 500) | count' products.json
159 | 
160 | # Frequency analysis
161 | hawk '.[] | group_by(.grade) | count' grades.json
162 | ```
163 | 
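Putting the slicing pieces together, an approximate quartile workflow looks like this (the 250-record cutoffs assume the count step reported 1,000 records; adjust them to your data):

```bash
# Step 1: count the records (suppose this prints 1000)
hawk '.[] | sort(.price) | length' products.json

# Step 2: the max of the first quarter of sorted records approximates Q1
hawk '.[] | sort(.price) | .[0:250] | max(.price)' products.json

# Step 3: the min of the last quarter approximates Q3
hawk '.[] | sort(.price) | .[-250:] | min(.price)' products.json
```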
products.json # Price range
187 | 
188 | # String conditions
189 | hawk '.[] | select(.status == "active")' accounts.json # Active accounts
190 | hawk '.[] | select(.email | ends_with(".com"))' users.json # .com emails
191 | 
192 | # Date filtering (string-based)
193 | hawk '.[] | select(.date >= "2024-01-01")' transactions.json
194 | hawk '.[] | select(.date | starts_with("2024-01"))' logs.json
195 | ```
196 | 
197 | ### Complex Multi-condition Filtering
198 | 
199 | ```bash
200 | # Multiple AND conditions
201 | hawk '.[] | select(.age >= 18) | select(.status == "active") | select(.region == "North")' users.json
202 | 
203 | # Range filtering
204 | hawk '.[] | select(.score >= 80) | select(.score <= 100)' grades.json
205 | 
206 | # Category filtering
207 | hawk '.[] | select(.category == "Electronics") | select(.price < 1000)' products.json
208 | 
209 | # Data quality filtering
210 | hawk '.[] | select(.email | contains("@")) | select(.phone | length == 10)' contacts.json
211 | ```
212 | 
213 | ### Sampling and Data Selection
214 | 
215 | ```bash
216 | # Sampling (using slicing; slices are deterministic, not random)
217 | hawk '.[0:100]' large_dataset.json # First 100 records
218 | hawk '.[1000:1100]' large_dataset.json # Records 1000-1100
219 | 
220 | # Stratified sampling
221 | hawk '.[] | group_by(.category) | .[0:10]' products.json # 10 from each category
222 | 
223 | # Top/Bottom N
224 | hawk '.[] | sort(.revenue) | .[-10:]' companies.json # Top 10 by revenue
225 | hawk '.[] | sort(.score) | .[0:5]' results.json # Bottom 5 by score
226 | ```
227 | 
228 | ## Grouping and Aggregation
229 | 
230 | ### Basic Grouping Operations
231 | 
232 | ```bash
233 | # Group by single field
234 | hawk '.[] | group_by(.department) | count' employees.json
235 | hawk '.[] | group_by(.region) | sum(.sales)' sales.json
236 | hawk '.[] | group_by(.category) | avg(.price)' products.json
237 | 
238 | # Group by multiple criteria (sequential)
239 | hawk '.[] | group_by(.region) | group_by(.category) | sum(.amount)' sales.json
240 | ```
241 | 
242 | ### Advanced Aggregation Patterns
243 | 
244 | ```bash
245 | # Multiple aggregations per group
246 | hawk '.[] | group_by(.department)' employees.json # Then analyze each group
247 | hawk '.[] | group_by(.department) | count' employees.json # Count per group
248 | hawk '.[] | group_by(.department) | avg(.salary)' employees.json # Average per group
249 | hawk '.[] | group_by(.department) | sum(.salary)' employees.json # Total per group
250 | 
251 | # Performance analytics
252 | hawk '.[] | group_by(.server) | avg(.response_time)' performance.json
253 | hawk '.[] | group_by(.server) | max(.memory_usage)' performance.json
254 | hawk '.[] | group_by(.server) | min(.cpu_usage)' performance.json
255 | ```
256 | 
257 | ### Business Intelligence Aggregations
258 | 
259 | ```bash
260 | # Sales analysis
261 | hawk '.[] | group_by(.salesperson) | sum(.amount)' sales.json
262 | hawk '.[] | group_by(.product) | avg(.rating)' reviews.json
263 | hawk '.[] | group_by(.region) | count' customers.json
264 | 
265 | # Financial analysis
266 | hawk '.[] | group_by(.quarter) | sum(.revenue)' financial.json
267 | hawk '.[] | group_by(.cost_center) | sum(.expenses)' budget.json
268 | 
269 | # User behavior analysis
270 | hawk '.[] | group_by(.user_type) | avg(.session_duration)' analytics.json
271 | hawk '.[] | group_by(.device_type) | count' user_sessions.json
272 | ```
273 | 
274 | ## Data Transformation
275 | 
276 | ### Data Cleaning and Normalization
277 | 
278 | ```bash
279 | # Clean text data
280 | hawk '.[] | map(.name | 
trim | upper)' contacts.json
281 | hawk '.[] | map(.email | lower)' users.json
282 | 
283 | # Normalize numeric data
284 | hawk '.[] | map(.amount | * 1.0)' transactions.json # Ensure float
285 | 
286 | # Handle missing data
287 | hawk '.[] | select(.field)' data.json # Remove nulls
288 | hawk '.[] | map(.field // "default_value")' data.json # Replace nulls
289 | ```
290 | 
291 | ### Feature Engineering
292 | 
293 | ```bash
294 | # Extract date components from a "YYYY-MM-DD" field
295 | hawk '.[] | map(.date | split("-")[0])' events.json # Year
296 | hawk '.[] | map(.date | split("-")[1])' events.json # Month
297 | 
298 | # Categorize numeric data
299 | hawk '.[] | select(.age >= 18) | select(.age < 65) | map(.age_group = "adult")' users.json
300 | ```
301 | 
302 | ### Data Reshaping
303 | 
304 | ```bash
305 | # Extract specific fields
306 | hawk '.[] | select_fields(id,name,email)' users.json
307 | ```
308 | 
309 | ## Time Series Analysis
310 | 
311 | ### Date-based Analysis
312 | 
313 | ```bash
314 | # Group by time periods
315 | hawk '.[] | group_by(.date | split("-")[0])' time_series.json # By year
316 | hawk '.[] | group_by(.date | split("-")[1])' time_series.json # By month
317 | hawk '.[] | group_by(.date | substring(0, 7))' time_series.json # By year-month
318 | 
319 | # Trend analysis
320 | hawk '.[] | sort(.date) | .[0:10]' events.json # First 10 chronologically
321 | hawk '.[] | sort(.date) | .[-10:]' events.json # Last 10 chronologically
322 | ```
323 | 
324 | ### Sales Trend Analysis
325 | 
326 | ```bash
327 | # Monthly sales trends
328 | hawk '.[] | group_by(.date | substring(0, 7)) | sum(.amount)' sales.json
329 | 
330 | # Daily transaction counts
331 | hawk '.[] | group_by(.date) | count' transactions.json
332 | 
333 | # Seasonal analysis
334 | hawk '.[] | group_by(.date | split("-")[1]) | avg(.temperature)' weather.json
335 | 
336 | # Growth analysis
337 | hawk '.[] | sort(.date) | .[0:100]' historical_data.json # Historical baseline
338 | hawk '.[] | sort(.date) | .[-100:]' historical_data.json # Recent data
339 | ```
340 | 
341 | ### Performance Over Time
342 | 
343 | ```bash
344 | # System performance trends
345 | hawk '.[] | group_by(.hour) | avg(.response_time)' performance_logs.json
346 | 
347 | # User engagement trends
348 | hawk '.[] | group_by(.week) | sum(.active_users)' analytics.json
349 | 
350 | # Error rate analysis
351 | hawk '.[] | group_by(.date) | select(.level == "ERROR") | count' error_logs.json
352 | ```
353 | 
354 | ## Performance Analytics
355 | 
356 | ### Application Performance Analysis
357 | 
358 | ```bash
359 | # Response time analysis
360 | hawk '.[] | group_by(.endpoint) | avg(.response_time)' api_logs.json
361 | hawk '.[] | group_by(.endpoint) | max(.response_time)' api_logs.json
362 | hawk '.[] | group_by(.endpoint) | min(.response_time)' api_logs.json
363 | 
364 | # Error rate calculation
365 | hawk '.[] | group_by(.service) | select(.status >= 400) | count' api_logs.json
366 | 
367 | # Throughput analysis
368 | hawk '.[] | group_by(.hour) | count' requests.json
369 | ```
370 | 
371 | ### System Resource Analysis
372 | 
373 | ```bash
374 | # Memory usage analysis
375 | hawk '.[] | group_by(.server) | avg(.memory_usage)' system_metrics.json
376 | hawk '.[] | group_by(.server) | max(.memory_usage)' system_metrics.json
377 | 
378 | # CPU utilization
379 | hawk '.[] | group_by(.process) | avg(.cpu_percent)' process_metrics.json
380 | 
381 | # Disk usage trends
382 | hawk '.[] | group_by(.mount_point) | max(.disk_usage)' disk_metrics.json
383 | ```
384 | 
385 | ### User Performance Analysis
386 | 
387 | ```bash
388 
| # Page load times 389 | hawk '.[] | group_by(.page) | avg(.load_time)' user_metrics.json 390 | 391 | # User session analysis 392 | hawk '.[] | group_by(.user_id) | avg(.session_duration)' sessions.json 393 | 394 | # Conversion rate analysis 395 | hawk '.[] | group_by(.campaign) | select(.converted == true) | count' marketing.json 396 | ``` 397 | 398 | ## Business Intelligence 399 | 400 | ### Sales Analytics 401 | 402 | ```bash 403 | # Revenue analysis 404 | hawk '.[] | group_by(.quarter) | sum(.revenue)' quarterly_sales.json 405 | hawk '.[] | group_by(.product_line) | sum(.revenue)' product_sales.json 406 | hawk '.[] | group_by(.region) | sum(.revenue)' regional_sales.json 407 | 408 | # Profitability analysis 409 | hawk '.[] | group_by(.product) | sum(.profit)' product_profitability.json 410 | hawk '.[] | group_by(.customer_segment) | avg(.margin)' customer_analysis.json 411 | 412 | # Sales performance 413 | hawk '.[] | group_by(.salesperson) | sum(.deals_closed)' sales_performance.json 414 | hawk '.[] | group_by(.salesperson) | avg(.deal_size)' sales_performance.json 415 | ``` 416 | 417 | ### Customer Analytics 418 | 419 | ```bash 420 | # Customer segmentation 421 | hawk '.[] | group_by(.customer_type) | avg(.lifetime_value)' customers.json 422 | hawk '.[] | group_by(.acquisition_channel) | count' customers.json 423 | 424 | # Customer behavior 425 | hawk '.[] | group_by(.customer_id) | sum(.total_spent)' transactions.json 426 | hawk '.[] | group_by(.customer_id) | count' purchases.json 427 | 428 | # Retention analysis 429 | hawk '.[] | group_by(.cohort) | avg(.retention_rate)' retention.json 430 | ``` 431 | 432 | ### Marketing Analytics 433 | 434 | ```bash 435 | # Campaign performance 436 | hawk '.[] | group_by(.campaign) | sum(.impressions)' marketing.json 437 | hawk '.[] | group_by(.campaign) | avg(.click_through_rate)' marketing.json 438 | 439 | # Channel effectiveness 440 | hawk '.[] | group_by(.channel) | sum(.conversions)' marketing.json 441 | hawk '.[] | group_by(.channel) | avg(.cost_per_acquisition)' marketing.json 442 | 443 | # ROI analysis 444 | hawk '.[] | group_by(.campaign) | sum(.revenue - .spend)' marketing.json 445 | ``` 446 | 447 | ## Advanced Analytics Patterns 448 | 449 | ### Cohort Analysis 450 | 451 | ```bash 452 | # User cohorts by signup month 453 | hawk '.[] | group_by(.signup_month) | count' users.json 454 | hawk '.[] | group_by(.signup_month) | avg(.lifetime_value)' users.json 455 | 456 | # Retention by cohort 457 | hawk '.[] | group_by(.cohort) | select(.active == true) | count' user_activity.json 458 | ``` 459 | 460 | ### Funnel Analysis 461 | 462 | ```bash 463 | # Conversion funnel 464 | hawk '.[] | select(.stage == "awareness") | count' funnel.json 465 | hawk '.[] | select(.stage == "consideration") | count' funnel.json 466 | hawk '.[] | select(.stage == "purchase") | count' funnel.json 467 | 468 | # Drop-off analysis 469 | hawk '.[] | group_by(.exit_page) | count' user_sessions.json 470 | ``` 471 | 472 | ### A/B Testing Analysis 473 | 474 | ```bash 475 | # Test group comparison 476 | hawk '.[] | group_by(.test_group) | avg(.conversion_rate)' ab_test.json 477 | hawk '.[] | group_by(.test_group) | count' ab_test.json 478 | 479 | # Statistical significance (basic) 480 | hawk '.[] | group_by(.variant) | stddev(.metric)' ab_test.json 481 | ``` 482 | 483 | ### Anomaly Detection (Basic) 484 | 485 | ```bash 486 | # Outlier detection using statistical methods 487 | hawk '.[] | sort(.value) | .[0:5]' data.json # Bottom 5 (potential outliers) 488 | hawk '.[] | 
sort(.value) | .[-5:]' data.json # Top 5 (potential outliers) 489 | 490 | # Threshold-based anomalies 491 | hawk '.[] | avg(.response_time)' baseline.json # Calculate baseline 492 | hawk '.[] | select(.response_time > baseline * 2)' current.json # 2x baseline 493 | ``` 494 | 495 | ## Export and Reporting 496 | 497 | ### Data Export Formats 498 | 499 | ```bash 500 | # Export to JSON 501 | hawk '.summary' --format json > summary_report.json 502 | 503 | # Export specific fields 504 | hawk '.[] | select_fields(id,name,value)' --format table > report.txt 505 | ``` 506 | 507 | ### Report Generation 508 | 509 | ```bash 510 | # Summary statistics report 511 | echo "=== Sales Summary ===" > report.txt 512 | hawk '.[] | sum(.amount)' sales.json >> report.txt 513 | hawk '.[] | avg(.amount)' sales.json >> report.txt 514 | hawk '.[] | count' sales.json >> report.txt 515 | ``` 516 | 517 | ## Best Practices 518 | 519 | ### Data Analysis Workflow 520 | 521 | 1. **Start with exploration**: Always use `hawk '. | info'` first 522 | 2. **Sample your data**: Use slicing `.[0:100]` for large datasets 523 | 3. **Check data quality**: Filter out invalid records early 524 | 4. **Build incrementally**: Add complexity step by step 525 | 5. **Validate results**: Cross-check with known values 526 | 527 | ### Performance Optimization 528 | 529 | ```bash 530 | # ✅ Filter early in pipeline 531 | hawk '.[] | select(.active == true) | group_by(.region) | count' 532 | 533 | # ❌ Filter late in pipeline 534 | hawk '.[] | group_by(.region) | select(.active == true) | count' 535 | 536 | # ✅ Use appropriate data types 537 | hawk '.[] | select(.amount > 100.0)' numeric_data.json 538 | 539 | # ✅ Sample large datasets 540 | hawk '.[0:1000] | group_by(.category) | avg(.price)' large_data.json 541 | ``` 542 | 543 | ### Common Pitfalls 544 | 545 | ```bash 546 | # ❌ Ignoring missing data 547 | hawk '.[] | avg(.field)' # May include nulls 548 | 549 | # ✅ Handle missing data 550 | hawk '.[] | select(.field) | avg(.field)' 551 | 552 | # ❌ Not validating data types 553 | hawk '.[] | sum(.text_field)' # Error if not numeric 554 | 555 | # ✅ Validate data types 556 | hawk '.[] | select(.numeric_field > 0) | sum(.numeric_field)' 557 | ``` 558 | 559 | --- 560 | 561 | **Related Documentation:** 562 | 563 | - [Getting Started](getting-started.md) - Basic introduction 564 | - [Query Language Reference](query-language.md) - Complete syntax 565 | - [String Operations](string-operations.md) - Text processing 566 | - [Examples](examples/) - Real-world use cases 567 | -------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started with hawk 🦅 2 | 3 | **5-minute introduction to hawk's data processing capabilities** 4 | 5 | hawk is a command-line tool that lets you explore and analyze data using a simple, unified query language. Whether you're working with JSON APIs, CSV files, YAML configs, or log files, hawk uses the same intuitive syntax. 6 | 7 | ## 📦 Installation 8 | 9 | Choose your preferred installation method: 10 | 11 | ### Homebrew (Recommended) 12 | 13 | ```bash 14 | brew install kyotalab/tools/hawk 15 | ``` 16 | 17 | ### Cargo (Rust) 18 | 19 | ```bash 20 | cargo install hawk-data 21 | ``` 22 | 23 | ### Verify Installation 24 | 25 | ```bash 26 | hawk --version 27 | # Output: hawk 0.2.2 28 | ``` 29 | 30 | ## 🎯 Your First hawk Command 31 | 32 | Let's start with a simple example. 
Create a test file: 33 | 34 | ```bash 35 | cat << 'EOF' > users.json 36 | { 37 | "users": [ 38 | {"name": "Alice", "age": 30}, 39 | {"name": "Bob", "age": 25} 40 | ] 41 | } 42 | EOF 43 | ``` 44 | 45 | Now run your first hawk command: 46 | 47 | ```bash 48 | hawk '.users[0].name' users.json 49 | ``` 50 | 51 | **Output:** `Alice` 52 | 53 | **What happened?** 54 | 55 | - `.users` → access the "users" field 56 | - `[0]` → get the first element of the array 57 | - `.name` → get the "name" field from that element 58 | 59 | ## 🏗️ Basic Building Blocks 60 | 61 | ### 1. Field Access 62 | 63 | ```bash 64 | # Access a field 65 | hawk '.name' data.json 66 | 67 | # Access nested fields 68 | hawk '.user.profile.email' data.json 69 | 70 | # Access array elements 71 | hawk '.items[0]' data.json 72 | ``` 73 | 74 | ### 2. Array Operations 75 | 76 | ```bash 77 | # Get all array elements 78 | hawk '.users[]' users.json 79 | 80 | # Access specific fields from all elements 81 | hawk '.users[].name' users.json 82 | ``` 83 | 84 | ### 3. Filtering with select() 85 | 86 | ```bash 87 | # Find users older than 25 88 | hawk '.users[] | select(.age > 25)' users.json 89 | 90 | # Find users named "Alice" 91 | hawk '.users[] | select(.name == "Alice")' users.json 92 | ``` 93 | 94 | ### 4. Counting and Aggregation 95 | 96 | ```bash 97 | # Count total users 98 | hawk '.users | count' users.json 99 | 100 | # Average age 101 | hawk '.users[] | avg(.age)' users.json 102 | ``` 103 | 104 | ## 🧪 Hands-on Examples 105 | 106 | Let's work through progressively complex examples with sample data. 107 | 108 | ### Example 1: JSON Data Analysis 109 | 110 | Create a sample dataset: 111 | 112 | ```bash 113 | cat > sales.json << 'EOF' 114 | { 115 | "sales": [ 116 | {"product": "Laptop", "price": 1200, "quantity": 3, "region": "North"}, 117 | {"product": "Mouse", "price": 25, "quantity": 50, "region": "South"}, 118 | {"product": "Keyboard", "price": 80, "quantity": 20, "region": "North"}, 119 | {"product": "Monitor", "price": 300, "quantity": 10, "region": "South"} 120 | ] 121 | } 122 | EOF 123 | ``` 124 | 125 | **Basic Operations:** 126 | 127 | ```bash 128 | # See all products 129 | hawk '.sales[].product' sales.json 130 | 131 | # Find expensive items (>$100) 132 | hawk '.sales[] | select(.price > 100)' sales.json 133 | 134 | # Count items by region 135 | hawk '.sales[] | group_by(.region) | count' sales.json 136 | 137 | # Average price by region 138 | hawk '.sales[] | group_by(.region) | avg(.price)' sales.json 139 | ``` 140 | 141 | ### Example 2: CSV Data Processing 142 | 143 | Create a CSV file: 144 | 145 | ```bash 146 | cat > employees.csv << 'EOF' 147 | name,age,department,salary 148 | Alice,30,Engineering,95000 149 | Bob,25,Marketing,75000 150 | Carol,35,Engineering,105000 151 | David,28,Sales,80000 152 | EOF 153 | ``` 154 | 155 | **CSV Operations:** 156 | 157 | ```bash 158 | # See all names 159 | hawk '.[].name' employees.csv 160 | 161 | # Find engineers 162 | hawk '.[] | select(.department == "Engineering")' employees.csv 163 | 164 | # Average salary by department 165 | hawk '.[] | group_by(.department) | avg(.salary)' employees.csv 166 | 167 | # Count employees by department 168 | hawk '.[] | group_by(.department) | count' employees.csv 169 | ``` 170 | 171 | ### Example 3: Text/Log Processing 172 | 173 | Create a sample log file: 174 | 175 | ```bash 176 | cat > app.log << 'EOF' 177 | 2024-01-15 09:00:01 INFO Application started 178 | 2024-01-15 09:00:15 ERROR Database connection failed 179 | 2024-01-15 09:00:16 INFO Retrying 
connection 180 | 2024-01-15 09:01:20 WARN High memory usage: 85% 181 | 2024-01-15 09:01:45 ERROR Timeout occurred 182 | EOF 183 | ``` 184 | 185 | **Text Processing Operations:** 186 | 187 | ```bash 188 | # Process as text (use -t flag for logs) 189 | # Find all ERROR lines 190 | hawk -t '. | select(. | contains("ERROR"))' app.log 191 | 192 | # Extract timestamps 193 | hawk -t '. | map(. | split(" ")[0])' app.log 194 | 195 | # Extract log levels 196 | hawk -t '. | map(. | split(" ")[2])' app.log 197 | ``` 198 | 199 | ## 🔧 String Operations 200 | 201 | hawk includes powerful string manipulation: 202 | 203 | ```bash 204 | # Text transformation 205 | echo '" Hello World "' | hawk '. | map(. | trim | upper)' 206 | 207 | # String splitting with index access (NEW!) 208 | echo '"apple banana cherry"' | hawk '. | map(. | split(" ")[1])' 209 | 210 | # Multiple field processing 211 | cat << 'EOF' | hawk '. | map(.first, .last | upper)' 212 | { 213 | "first": "john", 214 | "last": "doe" 215 | } 216 | EOF 217 | 218 | ``` 219 | 220 | ## 📊 Understanding Output Formats 221 | 222 | hawk automatically chooses the best output format: 223 | 224 | ```bash 225 | # Single value → simple output 226 | hawk '.users[0].name' users.json 227 | # Output: Alice 228 | 229 | # Array of objects → table format 230 | hawk '.users[]' users.json 231 | # Output: Formatted table with columns 232 | ``` 233 | 234 | You can force specific formats: 235 | 236 | ```bash 237 | hawk '.users[]' --format json users.json # Force JSON 238 | hawk '.users[]' --format table users.json # Force table 239 | hawk '.users[].name' --format list users.json # Force list 240 | ``` 241 | 242 | ## 🎯 Common Patterns 243 | 244 | ### Data Exploration 245 | 246 | ```bash 247 | # Understand data structure 248 | hawk '. | info' unknown-data.json 249 | 250 | # Count total records 251 | hawk '. | count' data.json 252 | 253 | # See unique values 254 | hawk '.field_name[] | unique' data.json 255 | ``` 256 | 257 | ### Filtering and Aggregation 258 | 259 | ```bash 260 | # Filter → count pattern 261 | hawk '.items[] | select(.price > 100) | count' data.json 262 | 263 | # Group → aggregate pattern 264 | hawk '.sales[] | group_by(.category) | sum(.amount)' data.json 265 | 266 | # Filter → group → aggregate pattern 267 | hawk '.orders[] | select(.status == "completed") | group_by(.region) | avg(.total)' data.json 268 | ``` 269 | 270 | ### Text Processing 271 | 272 | ```bash 273 | # Extract → unique pattern 274 | hawk -t '. | map(. | split(" ")[0]) | unique' logs.txt 275 | 276 | # Filter → extract pattern 277 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[1])' logs.txt 278 | 279 | # Clean → transform pattern 280 | hawk '.users[] | map(.email | lower | trim)' users.json 281 | ``` 282 | 283 | ## 🚨 When to Use --text Flag 284 | 285 | Use the `--text` flag when processing files that might be misdetected: 286 | 287 | ```bash 288 | # For log files that look like YAML 289 | hawk --text '. | select(. | contains("ERROR"))' structured.log 290 | 291 | # For any text file you want to process line-by-line 292 | hawk -t '. | map(. | length) | avg' text-file.txt 293 | ``` 294 | 295 | ## 🎓 Next Steps 296 | 297 | Now that you know the basics, explore these guides: 298 | 299 | ### **Immediate Next Steps** 300 | 301 | 1. **[String Operations Guide](string-operations.md)** - Master text processing 302 | 2. **[Query Language Reference](query-language.md)** - Complete syntax guide 303 | 3. 
**[Log Analysis Examples](examples/log-analysis.md)** - Real-world log processing
304 | 
305 | ### **By Use Case**
306 | 
307 | - **Data Analysis**: [Data Analysis Guide](data-analysis.md)
308 | - **DevOps**: [DevOps Workflows](examples/devops-workflows.md)
309 | - **API Work**: [API Exploration](examples/api-exploration.md)
310 | 
311 | ### **Advanced Topics**
312 | 
313 | - **Performance**: [Optimization Tips](advanced/performance.md)
314 | - **Complex Workflows**: [Custom Workflows](advanced/custom-workflows.md)
315 | 
316 | ## 🔗 Quick Reference Card
317 | 
318 | ### Essential Commands
319 | 
320 | ```bash
321 | # Field access
322 | hawk '.field' data.json
323 | hawk '.array[0]' data.json
324 | hawk '.array[]' data.json
325 | 
326 | # Filtering
327 | hawk '.array[] | select(.field > value)' data.json
328 | 
329 | # Aggregation
330 | hawk '.array[] | count/sum/avg/min/max(.field)' data.json
331 | 
332 | # Grouping
333 | hawk '.array[] | group_by(.field) | count' data.json
334 | 
335 | # Text processing
336 | hawk -t '. | select(. | contains("pattern"))' file.txt
337 | hawk -t '. | map(. | split(" ")[0])' file.txt
338 | 
339 | # String operations
340 | hawk '.field | upper/lower/trim/length' data.json
341 | hawk '.field | split(",")[0]' data.json
342 | ```
343 | 
344 | ### Data Types
345 | 
346 | - **JSON**: `data.json` → auto-detected
347 | - **YAML**: `config.yaml` → auto-detected
348 | - **CSV**: `data.csv` → auto-detected
349 | - **Text**: `file.txt` → use `-t` flag for line processing
350 | 
351 | ## 💡 Pro Tips
352 | 
353 | 1. **Start Simple**: Begin with basic field access, then add complexity
354 | 2. **Use `info`**: Always start data exploration with `hawk '. | info' file`
355 | 3. **Test in Steps**: Build complex queries incrementally
356 | 4. **Use `--text`**: When in doubt with text files, use the `-t` flag
357 | 5. **Read Error Messages**: hawk provides helpful error context
358 | 
359 | ## 🎉 You're Ready!
360 | 
361 | You now know enough hawk to be productive! The key is to start with simple operations and gradually build more complex queries as you become comfortable with the syntax.
362 | 
363 | **Remember**: hawk uses the same syntax across all data formats, so skills learned with JSON work with CSV, YAML, and text files.
364 | 
365 | Happy data exploring! 🦅
366 | 
367 | ---
368 | 
369 | **Quick Links:**
370 | 
371 | - [String Operations](string-operations.md) - Text processing guide
372 | - [Examples](../examples/README.md) - Real-world use cases
373 | 
--------------------------------------------------------------------------------
/docs/string-operations.md:
--------------------------------------------------------------------------------
1 | # String Operations Guide
2 | 
3 | Comprehensive guide to hawk's text processing capabilities.
4 | 
5 | ## 📖 Table of Contents
6 | 
7 | - [Basic Operations](#basic-operations)
8 | - [Advanced Operations](#advanced-operations)
9 | - [Array Operations](#array-operations)
10 | - [Multi-field Operations](#multi-field-operations)
11 | - [Practical Examples](#practical-examples)
12 | - [Performance Tips](#performance-tips)
13 | 
14 | ## Basic Operations
15 | 
16 | ### Case Conversion
17 | 
18 | ```bash
19 | # Convert to uppercase
20 | hawk '. | map(. | upper)' names.txt
21 | 
22 | # Convert to lowercase
23 | hawk '.users[] | map(.email | lower)' users.json
24 | 
25 | # Example
26 | "Hello World" | upper → "HELLO WORLD"
27 | "Hello World" | lower → "hello world"
28 | ```
29 | 
30 | ### Whitespace Management
31 | 
32 | ```bash
33 | # Remove leading and trailing whitespace
34 | hawk '. 
| map(. | trim)' messy-data.txt 35 | 36 | # Remove leading whitespace 37 | hawk '. | map(. | trim_start)' indented.txt 38 | 39 | # Remove trailing whitespace 40 | hawk '. | map(. | trim_end)' data.txt 41 | 42 | # Examples 43 | " hello " | trim → "hello" 44 | " hello " | trim_start → "hello " 45 | " hello " | trim_end → " hello" 46 | ``` 47 | 48 | ### String Analysis 49 | 50 | ```bash 51 | # Get string length 52 | hawk '. | map(. | length)' text.txt 53 | 54 | # Reverse strings 55 | hawk '. | map(. | reverse)' data.txt 56 | 57 | # Examples 58 | "hello" | length → 5 59 | "hello" | reverse → "olleh" 60 | ``` 61 | 62 | ## Advanced Operations 63 | 64 | ### Pattern Matching 65 | 66 | ```bash 67 | # Check if string contains pattern 68 | hawk '. | select(. | contains("ERROR"))' logs.txt 69 | 70 | # Check string start/end 71 | hawk '. | select(. | starts_with("2024"))' timestamps.txt 72 | hawk '. | select(. | ends_with(".log"))' filenames.txt 73 | 74 | # Examples 75 | "Hello World" | contains("World") → true 76 | "Hello World" | starts_with("Hello") → true 77 | "Hello World" | ends_with("World") → true 78 | ``` 79 | 80 | ### Text Transformation 81 | 82 | ```bash 83 | # Replace text 84 | hawk '. | map(. | replace("old", "new"))' text.txt 85 | 86 | # Extract substrings 87 | hawk '. | map(. | substring(0, 10))' long-text.txt 88 | hawk '. | map(. | substring(5))' text.txt # from index 5 to end 89 | 90 | # Examples 91 | "Hello World" | replace("World", "Rust") → "Hello Rust" 92 | "Hello World" | substring(0, 5) → "Hello" 93 | "Hello World" | substring(6) → "World" 94 | ``` 95 | 96 | ## Array Operations 97 | 98 | ### String Splitting 99 | 100 | ```bash 101 | # Split into array 102 | hawk '. | map(. | split(","))' csv-lines.txt 103 | hawk '. | map(. | split(" "))' sentences.txt 104 | 105 | # Split with index access (NEW in v0.2.2!) 106 | hawk '. | map(. | split(" ")[0])' space-separated.txt 107 | hawk '. | map(. 
| split(",")[2])' csv-data.txt 108 | 109 | # Examples 110 | "apple,banana,cherry" | split(",") → ["apple", "banana", "cherry"] 111 | "apple,banana,cherry" | split(",")[0] → "apple" 112 | "apple,banana,cherry" | split(",")[1] → "banana" 113 | ``` 114 | 115 | ### Array Joining 116 | 117 | ```bash 118 | # Join array elements 119 | hawk '.tags[] | join(",")' data.json 120 | hawk '.words[] | join(" ")' word-lists.json 121 | 122 | # Examples 123 | ["apple", "banana"] | join(",") → "apple,banana" 124 | ["hello", "world"] | join(" ") → "hello world" 125 | ``` 126 | 127 | ## Multi-field Operations 128 | 129 | Process multiple fields with the same operation (NEW in v0.2.2!): 130 | 131 | ```bash 132 | # Apply join to multiple array fields 133 | hawk '.users[] | map(.skills, .projects | join(","))' users.json 134 | 135 | # Convert multiple fields to uppercase 136 | hawk '.users[] | map(.first_name, .last_name | upper)' users.json 137 | 138 | # Get length of multiple string fields 139 | hawk '.posts[] | map(.title, .content | length)' posts.json 140 | ``` 141 | 142 | ### Example: User Data Processing 143 | 144 | ```json 145 | { 146 | "users": [ 147 | { 148 | "name": "alice", 149 | "skills": ["python", "rust"], 150 | "projects": ["web-app", "cli-tool"], 151 | "department": "engineering" 152 | } 153 | ] 154 | } 155 | ``` 156 | 157 | ```bash 158 | # Process multiple fields simultaneously 159 | hawk --format json '.users[] | map(.name, .department | upper)' users.json 160 | 161 | # Result 162 | { 163 | "users": [ 164 | { 165 | "name": "ALICE", // ← converted 166 | "skills": ["python", "rust"], 167 | "projects": ["web-app", "cli-tool"], 168 | "department": "ENGINEERING" // ← converted 169 | } 170 | ] 171 | } 172 | ``` 173 | 174 | ## Practical Examples 175 | 176 | ### Log File Processing 177 | 178 | ```bash 179 | # Extract timestamps from logs 180 | hawk -t '. | map(. | split(" ")[0])' app.log 181 | 182 | # Find unique IP addresses 183 | hawk -t '. | map(. | split(" ")[0]) | unique' access.log 184 | 185 | # Extract error messages 186 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(": ")[1])' error.log 187 | ``` 188 | 189 | ### Data Cleaning 190 | 191 | ```bash 192 | # Normalize email addresses 193 | hawk '.users[] | map(.email | lower | trim)' users.csv 194 | 195 | # Clean phone numbers 196 | hawk '.contacts[] | map(.phone | replace("-", "") | replace("(", "") | replace(")", ""))' contacts.json 197 | 198 | # Standardize names 199 | hawk '.people[] | map(.name | trim | upper)' people.csv 200 | ``` 201 | 202 | ### CSV Processing 203 | 204 | ```bash 205 | # Extract specific columns from CSV-like text 206 | hawk -t '. | map(. | split(",")[1])' data.txt 207 | 208 | # Process headers and data separately 209 | hawk -t '.[0] | split(",")' data.txt # headers 210 | hawk -t '.[1:] | map(. | split(",")[2])' data.txt # data column 211 | ``` 212 | 213 | ### Docker/Container Logs 214 | 215 | ```bash 216 | # Extract container names 217 | hawk -t '. | map(. | split(" ")[1]) | unique' docker.log 218 | 219 | # Get timestamps and services 220 | hawk -t '. | map(. replace("T", " ")) | map(. | split(" ")[0:2] | map(. | join("-"))' docker.log 221 | 222 | # Filter by service and extract messages 223 | hawk -t '. | select(. | contains("web_server")) | map(. | split(" ")[3:] | join(" "))' docker.log 224 | ``` 225 | 226 | ## Performance Tips 227 | 228 | ### Efficient Patterns 229 | 230 | ```bash 231 | # ✅ Good: Filter first, then transform 232 | hawk '. | select(. | contains("ERROR")) | map(. 
| upper)' logs.txt 233 | 234 | # ❌ Avoid: Transform everything, then filter 235 | hawk '. | map(. | upper) | select(. | contains("ERROR"))' logs.txt 236 | ``` 237 | 238 | ### Memory Considerations 239 | 240 | ```bash 241 | # ✅ Process in chunks for large files 242 | hawk '. | select(. | length > 100) | map(. | substring(0, 50))' large.txt 243 | 244 | # ✅ Use specific operations instead of general ones 245 | hawk '. | map(. | split(" ")[0])' data.txt # Better than complex regex 246 | ``` 247 | 248 | ### Text Format Detection 249 | 250 | ```bash 251 | # ✅ Use --text flag for ambiguous files 252 | hawk -t '. | map(. | split(" ")[0])' structured.log 253 | 254 | # ✅ Especially important for logs that might be detected as YAML 255 | hawk --text '. | select(. | contains("GC"))' gc.log 256 | ``` 257 | 258 | ## Error Handling 259 | 260 | ### Common Issues 261 | 262 | ```bash 263 | # Array index out of bounds → returns empty string 264 | "a,b" | split(",")[5] → "" 265 | 266 | # Missing fields → error (use select to filter first) 267 | hawk '.users[] | select(.email) | map(.email | lower)' users.json 268 | ``` 269 | 270 | ### Debugging Tips 271 | 272 | ```bash 273 | # Check data structure first 274 | hawk '. | info' unknown-data.json 275 | 276 | # Test operations step by step 277 | hawk '. | map(. | split(" "))' data.txt # Step 1: split 278 | hawk '. | map(. | split(" ")[0])' data.txt # Step 2: index access 279 | ``` 280 | 281 | ## Chaining Operations 282 | 283 | ### Pipeline Examples 284 | 285 | ```bash 286 | # Complex text processing pipeline 287 | hawk -t '. | select(. | length > 10) | map(. | trim | upper | substring(0, 20))' text.txt 288 | 289 | # Multi-step data cleaning 290 | hawk '.users[] | map(.email | lower | trim) | select(. | ends_with(".com"))' users.json 291 | 292 | # Log analysis workflow 293 | hawk -t '. | select(. | contains("ERROR")) | map(. | split("][")[1] | split(" ")[0]) | unique | sort' app.log 294 | ``` 295 | 296 | --- 297 | 298 | **Next Steps:** 299 | 300 | - [Data Analysis Guide](data-analysis.md) - Statistical operations and aggregation 301 | - [Log Analysis Examples](examples/log-analysis.md) - Real-world log processing 302 | - [Query Language Reference](query-language.md) - Complete syntax guide 303 | -------------------------------------------------------------------------------- /docs/text-processing.md: -------------------------------------------------------------------------------- 1 | # Text Processing Guide 2 | 3 | Comprehensive guide to text and log processing with hawk. 4 | 5 | ## 📖 Table of Contents 6 | 7 | - [Text Processing Fundamentals](#text-processing-fundamentals) 8 | - [Log File Analysis](#log-file-analysis) 9 | - [String Operations](#string-operations) 10 | - [Pattern Matching and Filtering](#pattern-matching-and-filtering) 11 | - [Text Transformation](#text-transformation) 12 | - [Data Extraction](#data-extraction) 13 | - [Advanced Text Patterns](#advanced-text-patterns) 14 | - [Performance Optimization](#performance-optimization) 15 | - [Real-world Examples](#real-world-examples) 16 | 17 | ## Text Processing Fundamentals 18 | 19 | ### Understanding Text Mode 20 | 21 | hawk processes text files line-by-line when using the `--text` flag, treating each line as a string element in an array. 22 | 23 | ```bash 24 | # Force text processing mode 25 | hawk --text 'query' file.txt 26 | hawk -t 'query' file.txt 27 | 28 | # When to use --text flag 29 | hawk -t '. | select(. 
| contains("ERROR"))' app.log 30 | ``` 31 | 32 | ### Text vs Structured Data 33 | 34 | | Mode | Use Case | Example | 35 | | ------------------ | --------------------- | ------------------------------------------ | 36 | | **Auto-detect** | JSON, YAML, CSV files | `hawk '.field' data.json` | 37 | | **Text mode (-t)** | Log files, plain text | `hawk -t '. \| contains("ERROR")' app.log` | 38 | | **Force text** | Ambiguous files | `hawk -t 'query' structured.log` | 39 | 40 | ### Basic Text Processing Workflow 41 | 42 | ```bash 43 | 1. Read text file → hawk -t '. | length' file.txt 44 | 2. Filter lines → hawk -t '. | select(condition)' file.txt 45 | 3. Transform text → hawk -t '. | map(operation)' file.txt 46 | 4. Extract data → hawk -t '. | map(. | split(" ")[0])' file.txt 47 | 5. Analyze results → hawk -t '. | unique | count' file.txt 48 | ``` 49 | 50 | ## Log File Analysis 51 | 52 | ### Common Log Formats 53 | 54 | #### Application Logs 55 | 56 | ``` 57 | 2024-01-15 09:00:01 INFO Application started successfully 58 | 2024-01-15 09:00:02 DEBUG Loading configuration from /etc/app/config.json 59 | 2024-01-15 09:01:23 ERROR Failed to process user request: connection timeout 60 | 2024-01-15 09:01:24 INFO Retrying connection... 61 | 2024-01-15 09:02:45 WARN High memory usage detected: 85% 62 | ``` 63 | 64 | **Analysis Examples:** 65 | 66 | ```bash 67 | # Find all error messages 68 | hawk -t '. | select(. | contains("ERROR"))' app.log 69 | 70 | # Extract timestamps 71 | hawk -t '. | map(. | split(" ")[0])' app.log 72 | 73 | # Count log levels 74 | hawk -t '. | map(. | split(" ")[2]) | unique | count' app.log 75 | 76 | # Get unique dates 77 | hawk -t '. | map(. | substring(0, 10)) | unique | sort' app.log 78 | ``` 79 | 80 | #### Docker Container Logs 81 | 82 | ``` 83 | 2024-01-15T10:30:45Z web_server GET /api/users 200 0.045s 84 | 2024-01-15T10:30:46Z database_service Connected to MySQL 85 | 2024-01-15T10:30:47Z web_server POST /api/auth 401 0.012s 86 | 2024-01-15T10:30:48Z cache_service Redis cache miss for key:user:123 87 | ``` 88 | 89 | **Analysis Examples:** 90 | 91 | ```bash 92 | # Extract service names 93 | hawk -t '. | map(. | split(" ")[1]) | unique' docker.log 94 | 95 | # HTTP status code analysis 96 | hawk -t '. | select(. | contains("GET|POST")) | map(. | split(" ")[4]) | group_by(.) | count' docker.log 97 | 98 | # Service activity timeline 99 | hawk -t '. | map(. replace("T", " ")) | map(. | split(" ")[0:2] | map(. | join("-"))' docker.log 100 | ``` 101 | 102 | #### Nginx/Apache Access Logs 103 | 104 | ``` 105 | 192.168.1.100 - - [15/Jan/2024:10:30:45 +0000] "GET /api/users HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0" 106 | 192.168.1.101 - - [15/Jan/2024:10:30:46 +0000] "POST /api/auth HTTP/1.1" 401 567 "-" "curl/7.68.0" 107 | 192.168.1.102 - - [15/Jan/2024:10:30:47 +0000] "GET /favicon.ico HTTP/1.1" 404 0 "https://example.com" "Mozilla/5.0" 108 | ``` 109 | 110 | **Analysis Examples:** 111 | 112 | ```bash 113 | # Extract IP addresses 114 | hawk -t '. | map(. | split(" ")[0]) | unique | sort' access.log 115 | 116 | # Status code distribution 117 | hawk -t '. | map(. | split("\"")[2] | split(" ")[1]) | group_by(.) | count' access.log 118 | 119 | # Find 4xx and 5xx errors 120 | hawk -t '. | select(. | contains("\" 4") | . | contains("\" 5"))' access.log 121 | 122 | # Top user agents 123 | hawk -t '. | map(. | split("\"")[5]) | group_by(.) | count | sort' access.log 124 | 125 | # Requests per hour 126 | hawk -t '. | map(. | split("[")[1] | split(":")[1]) | group_by(.) 
| count' access.log
127 | ```
128 | 
129 | #### System Logs (syslog format)
130 | 
131 | ```
132 | Jan 15 10:30:45 server01 kernel: [12345.678] TCP: Peer 192.168.1.100:443 unexpectedly shrunk window
133 | Jan 15 10:30:46 server01 sshd[1234]: Accepted password for user from 192.168.1.200 port 22 ssh2
134 | Jan 15 10:30:47 server01 systemd[1]: Started User Manager for UID 1000.
135 | ```
136 | 
137 | **Analysis Examples:**
138 | 
139 | ```bash
140 | # Extract service names (token 5: "Jan 15 10:30:45 host service[pid]:")
141 | hawk -t '. | map(. | split(" ")[4] | split("[")[0]) | unique' syslog
142 | 
143 | # SSH connection analysis
144 | hawk -t '. | select(. | contains("sshd")) | map(. | split(" from ")[1] | split(" ")[0]) | unique' syslog
145 | 
146 | # System service events
147 | hawk -t '. | select(. | contains("systemd")) | map(. | split(": ")[1])' syslog
148 | 
149 | # Error pattern analysis
150 | hawk -t '. | select(. | contains("error|Error|ERROR")) | map(. | split(" ")[4])' syslog
151 | ```
152 | 
153 | ## String Operations
154 | 
155 | ### Basic String Transformations
156 | 
157 | ```bash
158 | # Case conversion
159 | hawk -t '. | map(. | upper)' text.txt # Convert to uppercase
160 | hawk -t '. | map(. | lower)' text.txt # Convert to lowercase
161 | 
162 | # Whitespace management
163 | hawk -t '. | map(. | trim)' text.txt # Remove leading/trailing spaces
164 | hawk -t '. | map(. | trim_start)' text.txt # Remove leading spaces only
165 | hawk -t '. | map(. | trim_end)' text.txt # Remove trailing spaces only
166 | 
167 | # String analysis
168 | hawk -t '. | map(. | length)' text.txt # Get line lengths
169 | hawk -t '. | map(. | reverse)' text.txt # Reverse each line
170 | ```
171 | 
172 | ### Advanced String Operations
173 | 
174 | ```bash
175 | # Text replacement
176 | hawk -t '. | map(. | replace("old", "new"))' text.txt
177 | 
178 | # Substring extraction
179 | hawk -t '. | map(. | substring(0, 10))' text.txt # First 10 characters
180 | hawk -t '. | map(. | substring(5))' text.txt # From 5th character to end
181 | 
182 | # String splitting with array access (NEW!)
183 | hawk -t '. | map(. | split(" ")[0])' text.txt # First word
184 | hawk -t '. | map(. | split(",")[2])' csv_like.txt # Third CSV column
185 | hawk -t '. | map(. | split(":")[1] | trim)' key_value.txt # Extract values
186 | ```
187 | 
188 | ### Multiple Field String Operations (NEW!)
189 | 
190 | ```bash
191 | # Apply same operation to multiple fields in structured data
192 | hawk '.users[] | map(.first_name, .last_name | upper)' users.json
193 | hawk '.posts[] | map(.title, .content | trim)' posts.json
194 | hawk '.logs[] | map(.message, .details | lower)' structured_logs.json
195 | ```
196 | 
197 | ## Pattern Matching and Filtering
198 | 
199 | ### Basic Pattern Matching
200 | 
201 | ```bash
202 | # Contains pattern
203 | hawk -t '. | select(. | contains("ERROR"))' logs.txt
204 | 
205 | # Case-insensitive search
206 | hawk -t '. | select(. | upper | contains("ERROR"))' logs.txt
207 | 
208 | # Multiple patterns (OR logic, pipe-delimited)
209 | hawk -t '. | select(. | contains("ERROR|WARN"))' logs.txt
210 | 
211 | # Exclude patterns
212 | hawk -t '. | select(not (. | contains("INFO")))' logs.txt
213 | ```
214 | 
215 | ### Advanced Pattern Matching
216 | 
217 | ```bash
218 | # String starts/ends with pattern
219 | hawk -t '. | select(. | starts_with("[INFO]"))' logs.txt
220 | hawk -t '. | select(. | ends_with(".log"))' filenames.txt
221 | 
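# Drop empty lines before further processing — a small sketch using the same
# length comparisons shown below
hawk -t '. | select(. | length > 0)' logs.txt

222 | # Length-based filtering
223 | hawk -t '. | select(. | length > 100)' long_lines.txt
224 | hawk -t '. | select(. 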
| length < 20)' short_lines.txt
225 | 
226 | # Complex conditions
227 | hawk -t '. | select(. | contains("HTTP") && . | contains("200"))' access.log
228 | hawk -t '. | select(. | starts_with("2024") && . | contains("ERROR"))' timestamped.log
229 | ```
230 | 
231 | ### Log Level Filtering
232 | 
233 | ```bash
234 | # Standard log levels
235 | hawk -t '. | select(. | contains("DEBUG"))' app.log
236 | hawk -t '. | select(. | contains("INFO"))' app.log
237 | hawk -t '. | select(. | contains("WARN"))' app.log
238 | hawk -t '. | select(. | contains("ERROR"))' app.log
239 | hawk -t '. | select(. | contains("FATAL"))' app.log
240 | 
241 | # Severity filtering (ERROR and above)
242 | hawk -t '. | select(. | contains("ERROR|FATAL"))' app.log
243 | 
244 | # Time-based filtering
245 | hawk -t '. | select(. | starts_with("2024-01-15"))' dated_logs.txt
246 | hawk -t '. | select(. | substring(11, 13) == "09")' hourly_filter.log # 9 AM only
247 | ```
248 | 
249 | ## Text Transformation
250 | 
251 | ### Data Extraction
252 | 
253 | ```bash
254 | # Extract timestamps from logs
255 | hawk -t '. | map(. | split(" ")[0])' timestamped.log
256 | 
257 | # Extract IP addresses from access logs
258 | hawk -t '. | map(. | split(" ")[0])' access.log
259 | 
260 | # Extract HTTP methods
261 | hawk -t '. | map(. | split("\"")[1] | split(" ")[0])' access.log
262 | 
263 | # Extract file paths
264 | hawk -t '. | map(. | split("/")[-1])' file_paths.txt
265 | 
266 | # Extract domains from URLs
267 | hawk -t '. | map(. | split("://")[1] | split("/")[0])' urls.txt
268 | ```
269 | 
270 | ### CSV-like Text Processing
271 | 
272 | ```bash
273 | # Process comma-separated values
274 | hawk -t '. | map(. | split(",")[0])' csv_data.txt # First column
275 | hawk -t '. | map(. | split(",")[1] | trim)' csv_data.txt # Second column, trimmed
276 | 
277 | # Process tab-separated values
278 | hawk -t '. | map(. | split("\t")[2])' tsv_data.txt
279 | 
280 | # Process pipe-separated values
281 | hawk -t '. | map(. | split("|")[1])' pipe_data.txt
282 | 
283 | # Join processed data back
284 | hawk -t '. | map(. | split(",") | join(" | "))' csv_data.txt
285 | ```
286 | 
287 | ### Key-Value Extraction
288 | 
289 | ```bash
290 | # Extract values from key=value format
291 | hawk -t '. | select(. | contains("=")) | map(. | split("=")[1])' config.txt
292 | 
293 | # Extract specific keys
294 | hawk -t '. | select(. | starts_with("user=")) | map(. | split("=")[1])' key_value.txt
295 | 
296 | # Process JSON-like logs
297 | hawk -t '. | select(. | contains("\"level\"")) | map(. | split("\"level\":\"")[1] | split("\"")[0])' json_logs.txt
298 | ```
299 | 
300 | ## Data Extraction
301 | 
302 | ### Email and URL Extraction
303 | 
304 | ```bash
305 | # Extract email addresses (basic pattern)
306 | hawk -t '. | select(. | contains("@")) | map(. | split(" ") | select(. | contains("@")))' text.txt
307 | 
308 | # Extract domains from emails
309 | hawk -t '. | select(. | contains("@")) | map(. | split("@")[1])' emails.txt
310 | 
311 | # Extract URLs (basic pattern)
312 | hawk -t '. | select(. | contains("http")) | map(. | split(" ") | select(. | starts_with("http")))' text.txt
313 | ```
314 | 
315 | ### Numeric Data Extraction
316 | 
317 | ```bash
318 | # Extract numbers from text
319 | hawk -t '. | map(. | split(" ") | select(. | length > 0) | select(. | replace("[^0-9.]", "") | length > 0))' mixed.txt
320 | 
321 | # Extract percentages
322 | hawk -t '. | select(. | contains("%")) | map(. 
| split("%")[0] | split(" ") | last)' percentages.txt 323 | 324 | # Extract timestamps (ISO format) 325 | hawk -t '. | map(. | substring(0, 19))' iso_timestamps.txt 326 | 327 | # Extract version numbers 328 | hawk -t '. | select(. | contains("v")) | map(. | split("v")[1] | split(" ")[0])' versions.txt 329 | ``` 330 | 331 | ### Error Code and Status Extraction 332 | 333 | ```bash 334 | # HTTP status codes 335 | hawk -t '. | map(. | split(" ")[8])' access.log # Standard access log format 336 | hawk -t '. | select(. | split(" ")[8] >= "400")' access.log # 4xx and 5xx errors 337 | 338 | # Exit codes from logs 339 | hawk -t '. | select(. | contains("exit code")) | map(. | split("exit code ")[1] | split(" ")[0])' process.log 340 | 341 | # Error numbers 342 | hawk -t '. | select(. | contains("errno")) | map(. | split("errno=")[1] | split(" ")[0])' system.log 343 | ``` 344 | 345 | ## Advanced Text Patterns 346 | 347 | ### Multi-line Log Processing 348 | 349 | ```bash 350 | # Process stack traces (keep related lines together) 351 | hawk -t '. | select(. | contains("Exception") | . | starts_with("\t"))' java.log 352 | 353 | # Group by session ID 354 | hawk -t '. | select(. | contains("session=")) | map(. | split("session=")[1] | split(" ")[0])' session.log 355 | 356 | # Process multiline JSON logs (single line JSON per log entry) 357 | hawk -t '. | select(. | starts_with("{") && . | ends_with("}"))' json.log 358 | ``` 359 | 360 | ### Performance Log Analysis 361 | 362 | ```bash 363 | # Response time analysis 364 | hawk -t '. | select(. | contains("ms")) | map(. | split(" ") | select(. | ends_with("ms")) | replace("ms", ""))' perf.log 365 | 366 | # Memory usage tracking 367 | hawk -t '. | select(. | contains("memory")) | map(. | split("memory: ")[1] | split(" ")[0])' memory.log 368 | 369 | # CPU usage extraction 370 | hawk -t '. | select(. | contains("cpu")) | map(. | split("cpu: ")[1] | split("%")[0])' cpu.log 371 | ``` 372 | 373 | ### Security Log Analysis 374 | 375 | ```bash 376 | # Failed login attempts 377 | hawk -t '. | select(. | contains("failed login")) | map(. | split("from ")[1] | split(" ")[0])' auth.log 378 | 379 | # Suspicious activity patterns 380 | hawk -t '. | select(. | contains("SUSPICIOUS") | . | contains("ANOMALY"))' security.log 381 | 382 | # IP-based analysis 383 | hawk -t '. | map(. | split(" ")[0]) | group_by(.) | count | sort' network.log 384 | ``` 385 | 386 | ## Performance Optimization 387 | 388 | ### Efficient Text Processing 389 | 390 | ```bash 391 | # ✅ Filter early in pipeline 392 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[0])' large.log 393 | 394 | # ❌ Process everything then filter 395 | hawk -t '. | map(. | split(" ")[0]) | select(. | contains("ERROR"))' large.log 396 | 397 | # ✅ Use specific operations 398 | hawk -t '. | map(. | split(" ")[0])' log.txt 399 | 400 | # ❌ Use complex operations when simple ones suffice 401 | hawk -t '. | map(. | replace(...) | substring(...) | split(...))' log.txt 402 | ``` 403 | 404 | ### Memory Management 405 | 406 | ```bash 407 | # ✅ Process in chunks for large files 408 | hawk -t '.[0:10000] | select(. | contains("ERROR"))' huge.log 409 | 410 | # ✅ Sample large datasets 411 | hawk -t '.[::100] | map(. | split(" ")[0])' massive.log # Every 100th line 412 | ``` 413 | 414 | ### Slicing for Performance (NEW!) 415 | 416 | ```bash 417 | # ✅ Process recent logs only 418 | hawk -t '.[-1000:] | select(. 
| contains("ERROR"))' app.log # Last 1000 lines 419 | 420 | # ✅ Sample from different time periods 421 | hawk -t '.[0:100] | .[500:600] | .[1000:1100]' distributed_sample.log 422 | 423 | # ✅ Top/bottom analysis 424 | hawk -t '. | sort | .[0:10]' values.txt # Bottom 10 425 | hawk -t '. | sort | .[-10:]' values.txt # Top 10 426 | ``` 427 | 428 | ## Real-world Examples 429 | 430 | ### Complete Log Analysis Workflows 431 | 432 | #### Web Server Log Analysis 433 | 434 | ```bash 435 | # 1. Overview of traffic 436 | hawk -t '. | count' access.log # Total requests 437 | hawk -t '. | map(. | split(" ")[0]) | unique | count' access.log # Unique IPs 438 | 439 | # 2. Error analysis 440 | hawk -t '. | select(. | contains("\" 4") | . | contains("\" 5")) | count' access.log 441 | 442 | # 3. Top pages 443 | hawk -t '. | map(. | split("\"")[1] | split(" ")[1]) | group_by(.) | count | sort' access.log 444 | 445 | # 4. Traffic patterns by hour 446 | hawk -t '. | map(. | split("[")[1] | split(":")[1]) | group_by(.) | count' access.log 447 | 448 | # 5. User agent analysis 449 | hawk -t '. | map(. | split("\"")[5]) | group_by(.) | count | .[-10:]' access.log 450 | ``` 451 | 452 | #### Application Error Investigation 453 | 454 | ```bash 455 | # 1. Error trend analysis 456 | hawk -t '. | select(. | contains("ERROR")) | map(. | substring(0, 13)) | group_by(.) | count' app.log 457 | 458 | # 2. Error types 459 | hawk -t '. | select(. | contains("ERROR")) | map(. | split("ERROR ")[1] | split(":")[0]) | count' app.log 460 | 461 | # 3. Related warnings 462 | hawk -t '. | select(. | contains("WARN")) | select(. | contains("connection\|timeout\|retry"))' app.log 463 | ``` 464 | 465 | #### System Performance Monitoring 466 | 467 | ```bash 468 | # 1. Memory usage trends 469 | hawk -t '. | select(. | contains("memory")) | map(. | split("memory: ")[1] | split(" ")[0])' system.log 470 | 471 | # 2. Disk space monitoring 472 | hawk -t '. | select(. | contains("disk")) | map(. | split("usage: ")[1] | split("%")[0])' disk.log 473 | 474 | # 3. Network activity 475 | hawk -t '. | select(. | contains("bytes")) | map(. | split("bytes: ")[1] | split(" ")[0])' network.log 476 | 477 | # 4. Process analysis 478 | hawk -t '. | select(. | contains("process")) | map(. | split(" ")[3]) | group_by(.) | count' process.log 479 | ``` 480 | 481 | #### Security Log Analysis 482 | 483 | ```bash 484 | # 1. Authentication failures 485 | hawk -t '. | select(. | contains("authentication failed")) | map(. | split("from ")[1] | split(" ")[0]) | group_by(.) | count' security.log 486 | 487 | # 2. Unusual access patterns 488 | hawk -t '. | select(. | contains("GET") && . | contains("admin")) | map(. | split(" ")[0])' access.log 489 | 490 | # 3. Brute force detection 491 | hawk -t '. | select(. | contains("failed password")) | map(. | split(" ")[0]) | group_by(.) | count | select(. > 10)' auth.log 492 | 493 | # 4. Geographic analysis (if GeoIP data available) 494 | hawk -t '. | map(. | split(" ")[0]) | unique' access.log # Extract IPs for GeoIP lookup 495 | ``` 496 | 497 | #### DevOps Pipeline Logs 498 | 499 | ```bash 500 | # 1. Build success/failure rates 501 | hawk -t '. | select(. | contains("BUILD")) | map(. | split("BUILD ")[1] | split(" ")[0]) | group_by(.) | count' ci.log 502 | 503 | # 2. Deployment timing 504 | hawk -t '. | select(. | contains("DEPLOY")) | map(. | split(" ")[0])' deploy.log 505 | 506 | # 3. Test results analysis 507 | hawk -t '. | select(. | contains("TEST")) | map(. | split("TEST ")[1]) | group_by(.) 
| count' test.log
508 | 
509 | # 4. Resource usage during builds
510 | hawk -t '. | select(. | contains("CPU|MEMORY")) | map(. | split(": ")[1])' resource.log
511 | ```
512 | 
513 | ## Best Practices
514 | 
515 | ### Text Processing Guidelines
516 | 
517 | 1. **Always use --text flag for log files**: Prevents YAML/JSON misdetection
518 | 2. **Filter early**: Apply `select()` before expensive operations
519 | 3. **Use specific extractors**: Prefer `split()[index]` over complex regex alternatives
520 | 4. **Handle edge cases**: Check for empty results and missing fields
521 | 5. **Sample large files**: Use slicing for performance with huge datasets
522 | 
523 | ### Common Patterns
524 | 
525 | ```bash
526 | # ✅ Good: Extract then analyze
527 | hawk -t '. | map(. | split(" ")[0]) | unique | count' log.txt
528 | 
529 | # ✅ Good: Filter then transform
530 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[1])' log.txt
531 | 
532 | # ✅ Good: Use appropriate data types
533 | hawk -t '. | select(. | length > 0) | map(. | trim)' text.txt
534 | 
535 | # ✅ Good: Handle missing data
536 | hawk -t '. | select(. | contains(" ")) | map(. | split(" ")[1])' structured.txt
537 | ```
538 | 
539 | ### Debugging Text Processing
540 | 
541 | ```bash
542 | # Check data structure
543 | hawk -t '. | .[0:5]' file.txt # Sample first 5 lines
544 | 
545 | # Validate operations step by step
546 | hawk -t '. | map(. | split(" "))' file.txt # Step 1: split
547 | hawk -t '. | map(. | split(" ")[0])' file.txt # Step 2: index access
548 | 
549 | # Check for empty or problematic lines
550 | hawk -t '. | select(. | length == 0)' file.txt # Find empty lines
551 | hawk -t '. | select(. | contains("\t"))' file.txt # Find tab characters
552 | ```
553 | 
554 | ---
555 | 
556 | **Related Documentation:**
557 | 
558 | - [Getting Started](getting-started.md) - Basic hawk introduction
559 | - [String Operations](string-operations.md) - Detailed string processing reference
560 | - [Query Language](query-language.md) - Complete syntax guide
561 | - [Log Analysis Examples](examples/log-analysis.md) - Real-world log processing cases
562 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Hawk Examples
2 | 
3 | This directory contains sample data and query examples to learn and explore all hawk features.
4 | 
5 | ## 🚀 Quick Start
6 | 
7 | Lightweight sample data (~200KB total) ready to use immediately after git clone:
8 | 
9 | ```bash
10 | cd examples/small
11 | 
12 | # Explore data structure
13 | hawk '. | info' customers.json
14 | 
15 | # Basic filtering
16 | hawk '.[] | select(.status == "active")' customers.json
17 | 
18 | # New feature: NOT operator
19 | hawk '.[] | select(not (.segment == "enterprise"))' customers.json
20 | 
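# New feature: negative indexing (v0.2.2) — a quick sketch: grab the last record
hawk '.[-1]' customers.json

21 | # New feature: OR operator
22 | hawk -t '. | select(. 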
| contains("ERROR|WARN"))' application.log 23 | 24 | # New feature: Array slicing 25 | hawk '.[]' customers.json | hawk '.[0:3]' 26 | ``` 27 | 28 | ## 📁 Dataset Overview 29 | 30 | ### small/ - Lightweight Learning Data 31 | 32 | | File | Size | Records | Format | Use Case | 33 | | ---------------------- | ---- | -------- | ------ | ---------------------------------------- | 34 | | `customers.json` | ~2KB | 10 | JSON | Customer management, basic queries | 35 | | `orders.csv` | ~1KB | 25 | CSV | Sales analysis, JOIN operations | 36 | | `products.yaml` | ~1KB | 8 | YAML | Product catalog, price analysis | 37 | | `employees.json` | ~2KB | 15 | JSON | HR data, grouping operations | 38 | | `ec2_instances.json` | ~2KB | 5 | JSON | AWS resources, infrastructure monitoring | 39 | | `user_behavior.json` | ~3KB | 20 | JSON | Analytics, statistical processing | 40 | | `survey_responses.csv` | ~2KB | 30 | CSV | Survey analysis, aggregation | 41 | | `application.log` | ~3KB | 50 lines | TEXT | Log analysis, error extraction | 42 | | `nginx_access.log` | ~2KB | 30 lines | TEXT | Web server logs, IP analysis | 43 | | `urls.txt` | ~1KB | 20 lines | TEXT | URL processing, domain extraction | 44 | | `error_messages.txt` | ~1KB | 15 lines | TEXT | Error categorization, pattern extraction | 45 | | `nginx.conf` | ~2KB | - | TEXT | Configuration file analysis | 46 | 47 | ## 🎯 Learning Path 48 | 49 | ### Level 1: Basic Operations 50 | 51 | ```bash 52 | # Understanding data structure 53 | hawk '. | info' customers.json 54 | hawk '.[] | count' customers.json 55 | 56 | # Simple filtering 57 | hawk '.[] | select(.country == "USA")' customers.json 58 | hawk '.products[] | select(.price > 100)' products.yaml 59 | ``` 60 | 61 | ### Level 2: Aggregation and Grouping 62 | 63 | ```bash 64 | # Aggregation functions 65 | hawk '.[] | sum(.lifetime_value)' customers.json 66 | hawk '.products[] | avg(.price)' products.yaml 67 | 68 | # Grouping 69 | hawk '.[] | group_by(.country) | count' customers.json 70 | hawk '.[] | group_by(.department) | avg(.salary)' employees.json 71 | ``` 72 | 73 | ### Level 3: New Features (Logical Operations & Slicing) 74 | 75 | ```bash 76 | # NOT operator 77 | hawk '.[] | select(not (.status == "inactive"))' customers.json 78 | hawk -t '. | select(not (. | contains("DEBUG")))' application.log 79 | 80 | # OR operator 81 | hawk '.[] | select(.segment | contains("enterprise|business"))' customers.json 82 | hawk -t '. | select(. | contains("ERROR|FATAL|CRITICAL"))' application.log 83 | 84 | # Array slicing 85 | hawk '.[0:5]' customers.json # First 5 records 86 | ``` 87 | 88 | ### Level 4: Complex Text Processing 89 | 90 | ```bash 91 | # Log analysis 92 | hawk -t '. | map(. | split(" ")[0:3] | join(" "))' application.log 93 | 94 | # URL processing 95 | hawk -t '. | map(. | split("://")[1] | split("/")[0])' urls.txt 96 | hawk -t '. | select(not (. | starts_with("https://")))' urls.txt 97 | 98 | # Configuration file analysis 99 | hawk -t '. | select(not (. | starts_with("#"))) | select(. | contains("="))' nginx.conf 100 | ``` 101 | 102 | ### Level 5: Advanced Queries 103 | 104 | ```bash 105 | # Multiple condition combinations 106 | hawk '.[] | select(.status == "active") | select(not (.segment == "test")) | group_by(.country) | count' customers.json 107 | 108 | # String operations with complex logic 109 | hawk -t '. | select(. | contains("ERROR|WARN")) | map(. 
| split(" ")[0:2] | join(" ")) | unique' application.log 110 | 111 | # Slicing with aggregation 112 | hawk '.[0:10] | avg(.duration_seconds)' user_behavior.json 113 | ``` 114 | 115 | ## 🛠️ Larger Datasets 116 | 117 | After mastering the basics with small sample data, practice with larger datasets: 118 | 119 | ```bash 120 | # Generate large sample datasets (1000-10000 records) 121 | ./scripts/generate_large.sh 122 | 123 | # Download real-world open datasets 124 | ./scripts/download_datasets.sh 125 | 126 | # Practice with generated data 127 | hawk '.[] | group_by(.country) | count' large/customers_large.json 128 | ``` 129 | 130 | ## 📊 Practical Use Cases 131 | 132 | ### Business Analytics 133 | 134 | ```bash 135 | # Sales trends (by month) 136 | hawk '.[] | map(.order_date | split("-")[0:2] | join("-")) | group_by(.) | sum(.price)' orders.csv 137 | ``` 138 | 139 | ### Infrastructure Monitoring 140 | 141 | ```bash 142 | # Identify high-load instances 143 | hawk '.[] | select(.cpu_utilization > 80)' ec2_instances.json 144 | 145 | # Time-series error log analysis 146 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[0:2] | join(" ")) | group_by(.) | count' application.log 147 | ``` 148 | 149 | ### Data Cleaning 150 | 151 | ```bash 152 | # Filter out invalid data 153 | hawk '.[] | select(not (.email | contains("test|demo|temp"))) | select(.lifetime_value > 0)' customers.json 154 | ``` 155 | 156 | ## 🔧 Scripts 157 | 158 | ### scripts/generate_large.sh 159 | 160 | Generate larger sample datasets: 161 | 162 | - `--size N`: Specify number of records to generate 163 | - `--type TYPE`: Specify data type to generate 164 | - `--format FORMAT`: Specify output format 165 | 166 | ### scripts/download_datasets.sh 167 | 168 | Download real-world open datasets: 169 | 170 | - GitHub API responses 171 | - Public API sample data 172 | - Real log file examples 173 | 174 | See [scripts/README.md](scripts/README.md) for detailed documentation. 175 | 176 | ## 🎓 Next Steps 177 | 178 | 1. **Master the basics**: Try all features with small/ data 179 | 2. **Practice with real data**: Use scripts/ to generate larger datasets 180 | 3. **Apply to your projects**: Use hawk with your own data files 181 | 182 | ## 💡 Tips 183 | 184 | - **Performance**: Apply filters early for large datasets 185 | - **Debugging**: Use `| info` to inspect data structure 186 | - **Incremental building**: Build complex queries step by step 187 | - **Formatting**: Use `--format table` for readable output 188 | 189 | ## 🤝 Contributing 190 | 191 | New sample data and query examples are welcome! 192 | 193 | --- 194 | 195 | **Related Documentation:** 196 | 197 | - [Query Language Reference](../docs/query-language.md) 198 | - [Getting Started Guide](../docs/getting-started.md) 199 | - [String Operations](../docs/string-operations.md) 200 | -------------------------------------------------------------------------------- /examples/scripts/README.md: -------------------------------------------------------------------------------- 1 | # Sample Data Generation Scripts 2 | 3 | This directory contains scripts to generate larger sample datasets for hawk learning and testing. 4 | 5 | ## 📋 Scripts Overview 6 | 7 | ### generate_large.sh 8 | 9 | Generate larger sample datasets (1,000-10,000 records) for performance testing and advanced learning. 
188 | 
189 | ## 🤝 Contributing
190 | 
191 | New sample data and query examples are welcome!
192 | 
193 | ---
194 | 
195 | **Related Documentation:**
196 | 
197 | - [Query Language Reference](../docs/query-language.md)
198 | - [Getting Started Guide](../docs/getting-started.md)
199 | - [String Operations](../docs/string-operations.md)
200 | 
--------------------------------------------------------------------------------
/examples/scripts/README.md:
--------------------------------------------------------------------------------
1 | # Sample Data Generation Scripts
2 | 
3 | This directory contains scripts to generate larger sample datasets for hawk learning and testing.
4 | 
5 | ## 📋 Scripts Overview
6 | 
7 | ### generate_large.sh
8 | 
9 | Generate larger sample datasets (1,000-10,000 records) for performance testing and advanced learning.
10 | 
11 | **Usage:**
12 | 
13 | ```bash
14 | # Generate all datasets with default settings
15 | ./scripts/generate_large.sh
16 | 
17 | # Specify custom size
18 | ./scripts/generate_large.sh --size 5000
19 | 
20 | # Generate specific data types only
21 | ./scripts/generate_large.sh --type customers
22 | ./scripts/generate_large.sh --type logs
23 | ./scripts/generate_large.sh --type metrics
24 | 
25 | # Specify output directory
26 | ./scripts/generate_large.sh --output examples/large
27 | ```
28 | 
29 | **Options:**
30 | 
31 | - `--size N`: Number of records to generate (default: 1000)
32 | - `--type TYPE`: Data type to generate (customers, orders, employees, logs, metrics, all)
33 | - `--output DIR`: Output directory (default: examples/large)
34 | - `--format FORMAT`: Output format (json, csv, yaml)
35 | - `--parallel [N]`: Enable parallel generation for faster processing (optionally with N workers, as in `--parallel 4`)
36 | - `--help`: Show help message
37 | 
38 | ### download_datasets.sh
39 | 
40 | Download real-world open datasets for practicing with actual data patterns.
41 | 
42 | **Usage:**
43 | 
44 | ```bash
45 | # Download all available datasets
46 | ./scripts/download_datasets.sh
47 | 
48 | # Download specific datasets only
49 | ./scripts/download_datasets.sh --dataset github
50 | ./scripts/download_datasets.sh --dataset logs
51 | ```
52 | 
53 | **Available Datasets:**
54 | 
55 | - `github`: GitHub API responses (repositories, users, issues)
56 | - `apis`: Public API samples (REST, GraphQL responses)
57 | - `logs`: Real application logs from open source projects
58 | - `configs`: Configuration file samples (nginx, docker, k8s)
59 | 
60 | ### cleanup.sh
61 | 
62 | Clean up generated data files and temporary files.
63 | 
64 | ```bash
65 | # Clean all generated files with confirmation
66 | ./scripts/cleanup.sh
67 | 
68 | # Clean specific targets
69 | ./scripts/cleanup.sh --target large
70 | ./scripts/cleanup.sh --target external
71 | 
72 | # Preview what would be deleted (dry run)
73 | ./scripts/cleanup.sh --dry-run
74 | ```
75 | 
76 | ## 🎯 Generated Data
77 | 
78 | ### Large Dataset (examples/large/)
79 | 
80 | | File                       | Records              | Size (approx) | Use Case                              |
81 | | -------------------------- | -------------------- | ------------- | ------------------------------------- |
82 | | `customers_large.json`     | 1,000-10,000         | 200KB-2MB     | Customer analysis, segmentation       |
83 | | `orders_large.csv`         | 5,000-50,000         | 500KB-5MB     | Sales analysis, trend analysis        |
84 | | `employees_large.json`     | 500-5,000            | 100KB-1MB     | HR analysis, organizational analysis  |
85 | | `logs_large.log`           | 10,000-100,000 lines | 1MB-10MB      | Log analysis, error analysis          |
86 | | `metrics_large.csv`        | 1,440-14,400         | 100KB-1MB     | Time series analysis, monitoring data |
87 | | `user_behavior_large.json` | 10,000-100,000       | 2MB-20MB      | Behavior analysis, A/B testing        |
88 | 
89 | ### External Dataset (examples/external/)
90 | 
91 | | File                    | Source                | Size   | Use Case                            |
92 | | ----------------------- | --------------------- | ------ | ----------------------------------- |
93 | | `github_repos.json`     | GitHub API            | ~50KB  | Real API response processing        |
94 | | `public_apis.json`      | Public APIs Directory | ~100KB | API data analysis                   |
95 | | `real_logs.log`         | Open source projects  | ~500KB | Real log analysis                   |
96 | | `config_samples.tar.gz` | Configuration samples | ~200KB | Config analysis, pattern extraction |
97 | 
98 | ## 🚀 Usage Examples
99 | 
100 | ### Performance Testing with Large Data
101 | 
102 | ```bash
103 | # Generate large customer dataset for aggregation testing
104 | ./scripts/generate_large.sh --type customers --size 10000
105 | hawk '.[] | group_by(.country) | count' large/customers_large.json
106 | 
107 | # Test filtering performance with large logs
108 | ./scripts/generate_large.sh --type logs --size 100000
109 | hawk -t '. | select(. | contains("ERROR|CRITICAL")) | count' large/logs_large.log
110 | 
111 | # Time series analysis with metrics data
112 | ./scripts/generate_large.sh --type metrics --size 14400
113 | hawk '.[] | group_by(.hour) | avg(.cpu_usage)' large/metrics_large.csv
114 | ```
115 | 
116 | ### Practicing with Real Data
117 | 
118 | ```bash
119 | # Practice with GitHub data
120 | ./scripts/download_datasets.sh --dataset github
121 | hawk '.items[] | select(.language == "Rust") | group_by(.owner.login) | count' external/github_repos.json
122 | 
123 | # Real log error analysis
124 | ./scripts/download_datasets.sh --dataset logs
125 | hawk -t '. | select(. | contains("ERROR|FATAL")) | map(. | split(" ")[0:3] | join(" ")) | unique' external/real_logs.log
126 | ```
127 | 
128 | ## ⚙️ Script Details
129 | 
130 | ### Data Generation Algorithms
131 | 
132 | **customers_large.json:**
133 | 
134 | - Random but realistic names, emails, countries
135 | - Realistic company names and segment distribution
136 | - Regional purchasing power reflected in lifetime_value
137 | - Distribution adjusted by country population
138 | 
139 | **logs_large.log:**
140 | 
141 | - Log lines generated in natural chronological order
142 | - Realistic ERROR/WARN/INFO ratios (1:5:20; see the check below)
143 | - Correlated IP addresses, URLs, response codes
144 | - Mimics real application patterns
145 | 
146 | **metrics_large.csv:**
147 | 
148 | - 24 hours × days of time series data
149 | - Correlated CPU, memory, and network metrics
150 | - Load variation patterns by time of day
151 | - Weekend/weekday differences reflected
152 | 
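The generated level ratios can be sanity-checked directly with hawk (a quick check, assuming the generated lines share the `DATE TIME LEVEL [module] message` layout of the bundled small `application.log`):

```bash
# Tally lines per log level; field index 2 is the level in this layout
hawk -t '. | map(. | split(" ")[2]) | group_by(.) | count' large/logs_large.log
```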
153 | ### Performance Considerations
154 | 
155 | - **Parallel Generation**: Simultaneous generation of multiple files for speed
156 | - **Memory Efficiency**: Streaming generation keeps memory usage flat even for very large outputs
157 | - **Progress Display**: Real-time progress indication
158 | - **Error Handling**: Proper handling of generation failures
159 | 
160 | ## 🛠️ Customization
161 | 
162 | ### Custom Data Generation
163 | 
164 | Create your own data patterns based on the scripts:
165 | 
166 | ```bash
167 | # Copy template
168 | cp scripts/generate_large.sh scripts/generate_custom.sh
169 | 
170 | # Implement custom data patterns
171 | # Add generate_custom_dataset() function
172 | ```
173 | 
174 | ### Configuration File
175 | 
176 | Customize generation parameters with `scripts/config.yaml`:
177 | 
178 | ```yaml
179 | generation:
180 |   default_size: 1000
181 |   output_dir: "examples/large"
182 | 
183 | datasets:
184 |   customers:
185 |     countries: ["USA", "Canada", "UK", "Germany", "Japan"]
186 |     segments: ["enterprise", "business", "small"]
187 | 
188 |   logs:
189 |     log_levels: ["ERROR", "WARN", "INFO", "DEBUG"]
190 |     level_ratios: [1, 5, 20, 50]
191 | ```
192 | 
193 | ## 🧹 Cleanup
194 | 
195 | ### Safe Deletion
196 | 
197 | ```bash
198 | # Deletion with confirmation
199 | ./scripts/cleanup.sh --interactive
200 | 
201 | # Specific files only
202 | ./scripts/cleanup.sh --pattern "*.log"
203 | 
204 | # Size-limited deletion
205 | ./scripts/cleanup.sh --size-limit 10MB
206 | ```
207 | 
208 | ### Automated Cleanup
209 | 
210 | ```bash
211 | # Periodic cleanup of old files (cron example)
212 | 0 2 * * * /path/to/scripts/cleanup.sh --older-than 7days
213 | ```
214 | 
215 | ## 💡 Tips
216 | 
217 | 1. **Progressive Learning**: Learn in order: small → large → external
218 | 2. **Memory Monitoring**: Watch memory usage when generating large datasets
219 | 3. **Disk Space**: 10,000 records require approximately 10-50MB
220 | 4. **Parallel Processing**: Test performance with multiple concurrent queries (see the sketch below)
221 | 
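Tip 4 can be exercised with plain shell job control (a minimal sketch; the input files are those produced by the generation commands above, and the output paths are illustrative):

```bash
# Run independent hawk queries concurrently, then wait for all of them
hawk '.[] | group_by(.country) | count' large/customers_large.json > /tmp/by_country.txt &
hawk -t '. | select(. | contains("ERROR|FATAL")) | count' large/logs_large.log > /tmp/error_count.txt &
hawk '.[] | group_by(.hour) | avg(.cpu_usage)' large/metrics_large.csv > /tmp/cpu_by_hour.txt &
wait
```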
222 | ## 🐛 Troubleshooting
223 | 
224 | ### Common Issues
225 | 
226 | **Slow generation:**
227 | 
228 | ```bash
229 | # Enable parallel generation
230 | ./scripts/generate_large.sh --parallel 4
231 | 
232 | # Adjust size
233 | ./scripts/generate_large.sh --size 1000
234 | ```
235 | 
236 | **Out of memory errors:**
237 | 
238 | ```bash
239 | # Use streaming mode
240 | ./scripts/generate_large.sh --streaming
241 | 
242 | # Adjust batch size
243 | ./scripts/generate_large.sh --batch-size 100
244 | ```
245 | 
246 | **Download failures:**
247 | 
248 | ```bash
249 | # Enable retry
250 | ./scripts/download_datasets.sh --retry 3
251 | 
252 | # Set proxy
253 | export https_proxy=http://proxy.company.com:8080
254 | ./scripts/download_datasets.sh
255 | ```
256 | 
257 | ---
258 | 
259 | **Related Documentation:**
260 | 
261 | - [Main Examples README](../README.md)
262 | - [Query Language Reference](../../docs/query-language.md)
263 | 
--------------------------------------------------------------------------------
/examples/scripts/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # cleanup.sh - Clean up generated datasets and temporary files
4 | # Usage: ./cleanup.sh [options]
5 | 
6 | set -euo pipefail
7 | 
8 | # Default configuration
9 | DEFAULT_TARGET="all"
10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11 | BASE_DIR="$(dirname "$SCRIPT_DIR")"
12 | 
13 | # Colors for output
14 | RED='\033[0;31m'
15 | GREEN='\033[0;32m'
16 | YELLOW='\033[1;33m'
17 | BLUE='\033[0;34m'
18 | NC='\033[0m' # No Color
19 | 
20 | # Configuration variables
21 | TARGET="$DEFAULT_TARGET"
22 | INTERACTIVE=false
23 | DRY_RUN=false
24 | FORCE=false
25 | 
26 | # Helper functions
27 | log_info() {
28 |     echo -e "${BLUE}[INFO]${NC} $1"
29 | }
30 | 
31 | log_success() {
32 |     echo -e "${GREEN}[SUCCESS]${NC} $1"
33 | }
34 | 
35 | log_warning() {
36 |     echo -e "${YELLOW}[WARNING]${NC} $1"
37 | }
38 | 
39 | log_error() {
40 |     echo -e "${RED}[ERROR]${NC} $1"
41 | }
42 | 
43 | show_help() {
44 |     cat </dev/null || true)
140 |             ;;
141 |         "all")
142 |             if [[ -d "$BASE_DIR/large" ]]; then
143 |                 targets+=("$BASE_DIR/large")
144 |             fi
145 |             if [[ -d "$BASE_DIR/external" ]]; then
146 |                 targets+=("$BASE_DIR/external")
147 |             fi
148 |             # Add temp files
149 |             while IFS= read -r -d '' file; do
150 |                 targets+=("$file")
151 |             done < <(find "$BASE_DIR" -type f \( -name "*.tmp" -o -name "*.temp" -o -name "*.swp" -o -name "*.bak" \) -print0 2>/dev/null || true)
152 |             ;;
153 |         *)
154 |             log_error "Unknown target: $target"
155 |             exit 1
156 |             ;;
157 |     esac
158 | 
159 |     if [[ ${#targets[@]} -gt 0 ]]; then printf '%s\n' "${targets[@]}"; fi # guard: expanding an empty array trips set -u on older bash, and a blank line would give mapfile a phantom entry
160 | }
161 | 
162 | # Calculate total size of targets
163 | calculate_total_size() {
164 |     local targets=("$@")
165 |     local total_size=0
166 | 
167 |     for target in "${targets[@]}"; do
168 |         if [[ -e "$target" ]]; then
169 |             if [[ -d "$target" ]]; then
170 |                 local size=$(du -sb "$target" 2>/dev/null | cut -f1 || echo 0)
171 |             else
172 |                 local size=$(stat -c%s "$target" 2>/dev/null || echo 0)
173 |             fi
174 |             total_size=$((total_size + size))
175 |         fi
176 |     done
177 | 
178 |     # Convert bytes to human readable
179 |     if [[ $total_size -eq 0 ]]; then
180 |         echo "0B"
181 |     elif [[ $total_size -lt 1024 ]]; then
182 |         echo "${total_size}B"
183 |     elif [[ $total_size -lt 1048576 ]]; then
184 |         echo "$((total_size / 1024))KB"
185 |     elif [[ $total_size -lt 1073741824 ]]; then
186 |         echo "$((total_size / 1048576))MB"
187 |     else
188 |         echo 
"$((total_size / 1073741824))GB" 189 | fi 190 | } 191 | 192 | # Show what will be deleted 193 | show_cleanup_preview() { 194 | local targets=("$@") 195 | 196 | if [[ ${#targets[@]} -eq 0 ]]; then 197 | log_info "No files to clean up" 198 | return 0 199 | fi 200 | 201 | log_info "Files and directories to be deleted:" 202 | 203 | for target in "${targets[@]}"; do 204 | if [[ -e "$target" ]]; then 205 | local relative_path="${target#$BASE_DIR/}" 206 | if [[ -d "$target" ]]; then 207 | local size=$(du -sh "$target" 2>/dev/null | cut -f1 || echo "unknown") 208 | local count=$(find "$target" -type f | wc -l) 209 | echo " 📁 $relative_path/ ($size, $count files)" 210 | else 211 | local size=$(du -h "$target" 2>/dev/null | cut -f1 || echo "unknown") 212 | echo " 📄 $relative_path ($size)" 213 | fi 214 | fi 215 | done 216 | 217 | local total_size=$(calculate_total_size "${targets[@]}") 218 | echo 219 | log_info "Total size to be freed: $total_size" 220 | } 221 | 222 | # Confirm deletion 223 | confirm_deletion() { 224 | if [[ "$FORCE" == "true" ]]; then 225 | return 0 226 | fi 227 | 228 | echo 229 | if [[ "$DRY_RUN" == "true" ]]; then 230 | log_info "This is a dry run - no files will actually be deleted" 231 | return 0 232 | fi 233 | 234 | read -p "$(echo -e "${YELLOW}Do you want to proceed with deletion? (y/N): ${NC}")" -n 1 -r 235 | echo 236 | if [[ $REPLY =~ ^[Yy]$ ]]; then 237 | return 0 238 | else 239 | log_info "Cleanup cancelled by user" 240 | return 1 241 | fi 242 | } 243 | 244 | # Interactive confirmation for each item 245 | confirm_item() { 246 | local item="$1" 247 | local relative_path="${item#$BASE_DIR/}" 248 | 249 | read -p "$(echo -e "${YELLOW}Delete $relative_path? (y/N/q): ${NC}")" -n 1 -r 250 | echo 251 | case $REPLY in 252 | [Yy]) 253 | return 0 254 | ;; 255 | [Qq]) 256 | log_info "Cleanup cancelled by user" 257 | exit 0 258 | ;; 259 | *) 260 | return 1 261 | ;; 262 | esac 263 | } 264 | 265 | # Perform cleanup 266 | perform_cleanup() { 267 | local targets=("$@") 268 | local deleted_count=0 269 | local total_freed=0 270 | 271 | for target in "${targets[@]}"; do 272 | if [[ ! -e "$target" ]]; then 273 | continue 274 | fi 275 | 276 | if [[ "$INTERACTIVE" == "true" && "$DRY_RUN" != "true" ]]; then 277 | if ! 
confirm_item "$target"; then 278 | continue 279 | fi 280 | fi 281 | 282 | local relative_path="${target#$BASE_DIR/}" 283 | 284 | if [[ "$DRY_RUN" == "true" ]]; then 285 | log_info "[DRY RUN] Would delete: $relative_path" 286 | ((deleted_count++)) 287 | else 288 | # Calculate size before deletion 289 | local size=0 290 | if [[ -d "$target" ]]; then 291 | size=$(du -sb "$target" 2>/dev/null | cut -f1 || echo 0) 292 | else 293 | size=$(stat -c%s "$target" 2>/dev/null || echo 0) 294 | fi 295 | 296 | # Perform deletion 297 | if rm -rf "$target" 2>/dev/null; then 298 | log_success "Deleted: $relative_path" 299 | ((deleted_count++)) 300 | total_freed=$((total_freed + size)) 301 | else 302 | log_error "Failed to delete: $relative_path" 303 | fi 304 | fi 305 | done 306 | 307 | if [[ "$DRY_RUN" == "true" ]]; then 308 | log_info "Dry run completed: $deleted_count items would be deleted" 309 | else 310 | local freed_readable=$(echo $total_freed | awk '{ 311 | if ($1 >= 1073741824) printf "%.1fGB", $1/1073741824 312 | else if ($1 >= 1048576) printf "%.1fMB", $1/1048576 313 | else if ($1 >= 1024) printf "%.1fKB", $1/1024 314 | else printf "%dB", $1 315 | }') 316 | log_success "Cleanup completed: $deleted_count items deleted, $freed_readable freed" 317 | fi 318 | } 319 | 320 | # Validate target 321 | validate_target() { 322 | case "$TARGET" in 323 | "large" | "external" | "generated" | "temp" | "all") ;; 324 | *) 325 | log_error "Invalid target: $TARGET" 326 | log_error "Valid targets: large, external, generated, temp, all" 327 | exit 1 328 | ;; 329 | esac 330 | } 331 | 332 | # Safety check to prevent accidental deletion of important files 333 | safety_check() { 334 | local targets=("$@") 335 | 336 | for target in "${targets[@]}"; do 337 | # Ensure we never delete the small samples directory 338 | if [[ "$target" == *"/small"* ]] || [[ "$target" == *"/scripts"* ]]; then 339 | log_error "Safety check failed: attempting to delete protected directory: $target" 340 | log_error "This script will never delete small sample data or scripts" 341 | exit 1 342 | fi 343 | 344 | # Ensure we're only deleting within the examples directory 345 | if [[ "$target" != "$BASE_DIR"* ]]; then 346 | log_error "Safety check failed: attempting to delete outside examples directory: $target" 347 | exit 1 348 | fi 349 | done 350 | } 351 | 352 | # Main function 353 | main() { 354 | log_info "Starting cleanup process..." 
355 | log_info "Target: $TARGET" 356 | 357 | # Get targets to clean 358 | mapfile -t targets < <(get_cleanup_targets "$TARGET") 359 | 360 | if [[ ${#targets[@]} -eq 0 ]]; then 361 | log_info "Nothing to clean up for target: $TARGET" 362 | return 0 363 | fi 364 | 365 | # Safety checks 366 | safety_check "${targets[@]}" 367 | 368 | # Show preview 369 | show_cleanup_preview "${targets[@]}" 370 | 371 | # Confirm and perform cleanup 372 | if confirm_deletion; then 373 | perform_cleanup "${targets[@]}" 374 | fi 375 | } 376 | 377 | # Display banner 378 | show_banner() { 379 | echo "🧹 Hawk Examples Cleanup Tool" 380 | echo "==============================" 381 | echo 382 | } 383 | 384 | # Parse arguments and run 385 | parse_args "$@" 386 | validate_target 387 | 388 | # Show banner unless in quiet mode 389 | show_banner 390 | 391 | # Conflict checking 392 | if [[ "$INTERACTIVE" == "true" && "$FORCE" == "true" ]]; then 393 | log_error "Cannot use --interactive and --force together" 394 | exit 1 395 | fi 396 | 397 | if [[ "$DRY_RUN" == "true" && "$FORCE" == "true" ]]; then 398 | log_warning "--force has no effect in dry-run mode" 399 | fi 400 | 401 | # Run main function 402 | main 403 | 404 | log_info "Cleanup process completed" 405 | -------------------------------------------------------------------------------- /examples/small/application.log: -------------------------------------------------------------------------------- 1 | 2024-07-18 09:15:23 INFO [main] Application started successfully on port 8080 2 | 2024-07-18 09:15:24 DEBUG [worker-1] Loading configuration from /etc/app/config.yaml 3 | 2024-07-18 09:15:25 INFO [database] Connected to PostgreSQL database: app_production 4 | 2024-07-18 09:15:26 DEBUG [cache] Redis connection established: localhost:6379 5 | 2024-07-18 09:15:27 INFO [auth] JWT authentication module initialized 6 | 2024-07-18 09:15:28 DEBUG [worker-2] Processing request GET /api/v1/health 7 | 2024-07-18 09:15:29 INFO [metrics] Prometheus metrics endpoint available at /metrics 8 | 2024-07-18 09:15:30 DEBUG [worker-1] Request completed: GET /api/v1/health - 200 OK (15ms) 9 | 2024-07-18 09:16:15 INFO [worker-3] User login successful: user_id=12345, email=alice@company.com 10 | 2024-07-18 09:16:45 DEBUG [worker-2] Processing request POST /api/v1/orders 11 | 2024-07-18 09:16:46 INFO [validation] Order validation passed: order_id=ORD001 12 | 2024-07-18 09:16:47 DEBUG [database] Executing SQL: INSERT INTO orders (id, customer_id, total) VALUES ($1, $2, $3) 13 | 2024-07-18 09:16:48 INFO [worker-2] Order created successfully: order_id=ORD001, total=$299.99 14 | 2024-07-18 09:17:12 WARN [worker-1] Slow query detected: SELECT * FROM products WHERE category='electronics' (2.5s) 15 | 2024-07-18 09:17:30 ERROR [worker-4] Database connection failed: connection timeout after 30s 16 | 2024-07-18 09:17:31 ERROR [retry] Retrying database connection (attempt 1/3) 17 | 2024-07-18 09:17:32 INFO [worker-4] Database connection restored successfully 18 | 2024-07-18 09:18:05 DEBUG [cache] Cache hit for key: user_profile_12345 19 | 2024-07-18 09:18:22 WARN [queue] Queue size approaching limit: 950/1000 messages 20 | 2024-07-18 09:18:45 INFO [worker-5] User logout: user_id=12345, session_duration=2m30s 21 | 2024-07-18 09:19:10 DEBUG [worker-1] Processing request GET /api/v1/products?category=electronics 22 | 2024-07-18 09:19:11 INFO [search] Search query executed: 'electronics', results=25, duration=45ms 23 | 2024-07-18 09:19:33 ERROR [payment] Payment processing failed: card_declined, order_id=ORD002 24 
| 2024-07-18 09:19:34 WARN [notification] Failed to send payment failure email to customer@example.com 25 | 2024-07-18 09:20:01 INFO [scheduler] Starting scheduled job: daily_report_generation 26 | 2024-07-18 09:20:15 DEBUG [worker-3] Processing request PUT /api/v1/users/12345/profile 27 | 2024-07-18 09:20:16 INFO [validation] Profile update validation passed for user_id=12345 28 | 2024-07-18 09:20:17 DEBUG [database] Executing SQL: UPDATE users SET profile_data=$1 WHERE id=$2 29 | 2024-07-18 09:20:45 CRITICAL [security] Multiple failed login attempts detected: IP=192.168.1.100, attempts=5 30 | 2024-07-18 09:20:46 INFO [security] IP address blocked for 1 hour: 192.168.1.100 31 | 2024-07-18 09:21:12 ERROR [external_api] Third-party API call failed: timeout to payments.example.com 32 | 2024-07-18 09:21:13 WARN [circuit_breaker] Circuit breaker opened for payments service 33 | 2024-07-18 09:21:30 DEBUG [worker-2] Processing request DELETE /api/v1/cart/items/123 34 | 2024-07-18 09:21:31 INFO [cart] Item removed from cart: user_id=67890, item_id=123 35 | 2024-07-18 09:22:05 INFO [health_check] All services healthy: database=OK, cache=OK, queue=OK 36 | 2024-07-18 09:22:30 DEBUG [worker-4] Processing request GET /api/v1/analytics/dashboard 37 | 2024-07-18 09:22:31 WARN [performance] High memory usage detected: 85% of available memory 38 | 2024-07-18 09:22:55 ERROR [file_system] Failed to write log file: disk space low (5% remaining) 39 | 2024-07-18 09:23:10 FATAL [storage] Critical disk space shortage: less than 1GB remaining 40 | 2024-07-18 09:23:11 ERROR [alert] Failed to send critical alert: notification service unavailable 41 | 2024-07-18 09:23:30 INFO [maintenance] Starting automated cleanup: removing old log files 42 | 2024-07-18 09:23:45 DEBUG [cleanup] Removed 150 old log files, freed 2.5GB disk space 43 | 2024-07-18 09:24:01 INFO [recovery] System recovered from disk space issue 44 | 2024-07-18 09:24:15 DEBUG [worker-1] Processing request GET /api/v1/reports/sales 45 | 2024-07-18 09:24:16 INFO [reports] Sales report generated: period=2024-07, total_orders=1250 46 | 2024-07-18 09:24:45 WARN [rate_limit] Rate limit exceeded for API key: api_key_abc123 (100 req/min) 47 | 2024-07-18 09:25:10 DEBUG [worker-3] Processing request POST /api/v1/feedback 48 | 2024-07-18 09:25:11 INFO [feedback] Customer feedback submitted: rating=5, order_id=ORD003 49 | 2024-07-18 09:25:30 ERROR [email] SMTP server connection failed: authentication error 50 | 2024-07-18 09:25:45 INFO [backup] Database backup completed successfully: backup_20240718_092545.sql 51 | 2024-07-18 09:26:00 DEBUG [worker-2] Processing request GET /api/v1/inventory/status 52 | 2024-07-18 09:26:15 WARN [inventory] Low stock alert: product_id=PROD001, quantity=5 remaining 53 | 2024-07-18 09:26:30 INFO [scheduler] Scheduled job completed: daily_report_generation (duration=6m30s) 54 | -------------------------------------------------------------------------------- /examples/small/customers.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "CUST001", 4 | "name": "Alice Johnson", 5 | "email": "alice.johnson@techcorp.com", 6 | "company": "TechCorp Inc", 7 | "country": "USA", 8 | "status": "active", 9 | "lifetime_value": 15240.5, 10 | "segment": "enterprise", 11 | "registration_date": "2023-01-15" 12 | }, 13 | { 14 | "id": "CUST002", 15 | "name": "Bob Smith", 16 | "email": "bob.smith@startup.io", 17 | "company": "StartupXYZ", 18 | "country": "Canada", 19 | "status": "active", 20 | 
"lifetime_value": 8500.25, 21 | "segment": "business", 22 | "registration_date": "2023-03-22" 23 | }, 24 | { 25 | "id": "CUST003", 26 | "name": "Carol Wang", 27 | "email": "carol.wang@example.com", 28 | "company": "Global Solutions", 29 | "country": "USA", 30 | "status": "inactive", 31 | "lifetime_value": 12750.75, 32 | "segment": "enterprise", 33 | "registration_date": "2022-11-08" 34 | }, 35 | { 36 | "id": "CUST004", 37 | "name": "David Brown", 38 | "email": "david.brown@smallbiz.com", 39 | "company": "Small Business Co", 40 | "country": "UK", 41 | "status": "active", 42 | "lifetime_value": 3200.0, 43 | "segment": "small", 44 | "registration_date": "2024-01-10" 45 | }, 46 | { 47 | "id": "CUST005", 48 | "name": "Elena Rodriguez", 49 | "email": "elena.rodriguez@innovate.es", 50 | "company": "Innovate España", 51 | "country": "Spain", 52 | "status": "active", 53 | "lifetime_value": 9875.5, 54 | "segment": "business", 55 | "registration_date": "2023-06-14" 56 | }, 57 | { 58 | "id": "CUST006", 59 | "name": "Frank Chen", 60 | "email": "frank.chen@testcompany.com", 61 | "company": "Test Company", 62 | "country": "USA", 63 | "status": "suspended", 64 | "lifetime_value": 0.0, 65 | "segment": "test", 66 | "registration_date": "2024-02-01" 67 | }, 68 | { 69 | "id": "CUST007", 70 | "name": "Grace Kim", 71 | "email": "grace.kim@fintech.kr", 72 | "company": "FinTech Korea", 73 | "country": "South Korea", 74 | "status": "active", 75 | "lifetime_value": 18920.25, 76 | "segment": "enterprise", 77 | "registration_date": "2022-09-30" 78 | }, 79 | { 80 | "id": "CUST008", 81 | "name": "Henry Taylor", 82 | "email": "henry.taylor@demo.org", 83 | "company": "Demo Organization", 84 | "country": "Australia", 85 | "status": "inactive", 86 | "lifetime_value": 1500.0, 87 | "segment": "demo", 88 | "registration_date": "2024-03-15" 89 | }, 90 | { 91 | "id": "CUST009", 92 | "name": "Irene Foster", 93 | "email": "irene.foster@enterprise.de", 94 | "company": "Enterprise Germany", 95 | "country": "Germany", 96 | "status": "active", 97 | "lifetime_value": 22450.75, 98 | "segment": "enterprise", 99 | "registration_date": "2023-04-20" 100 | }, 101 | { 102 | "id": "CUST010", 103 | "name": "Jack Wilson", 104 | "email": "jack.wilson@temp.example", 105 | "company": "Temporary Inc", 106 | "country": "Canada", 107 | "status": "deleted", 108 | "lifetime_value": 500.0, 109 | "segment": "temp", 110 | "registration_date": "2024-01-01" 111 | } 112 | ] 113 | -------------------------------------------------------------------------------- /examples/small/ec2_instances.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "instance_id": "i-0123456789abcdef0", 4 | "instance_type": "t3.medium", 5 | "state": "running", 6 | "availability_zone": "us-west-2a", 7 | "private_ip": "10.0.1.15", 8 | "public_ip": "54.123.45.67", 9 | "security_groups": ["sg-web", "sg-ssh"], 10 | "tags": { 11 | "Name": "web-server-01", 12 | "Environment": "production", 13 | "Team": "backend", 14 | "Application": "web-app" 15 | }, 16 | "launch_time": "2024-01-10T09:30:00Z", 17 | "cpu_utilization": 45.2, 18 | "memory_utilization": 62.8, 19 | "network_in_mb": 125.4, 20 | "network_out_mb": 89.2 21 | }, 22 | { 23 | "instance_id": "i-0fedcba987654321a", 24 | "instance_type": "t3.large", 25 | "state": "running", 26 | "availability_zone": "us-west-2b", 27 | "private_ip": "10.0.2.22", 28 | "public_ip": "54.234.56.78", 29 | "security_groups": ["sg-database", "sg-internal"], 30 | "tags": { 31 | "Name": "database-server-01", 32 | 
"Environment": "production", 33 | "Team": "database", 34 | "Application": "postgres" 35 | }, 36 | "launch_time": "2024-01-05T14:15:00Z", 37 | "cpu_utilization": 78.5, 38 | "memory_utilization": 85.3, 39 | "network_in_mb": 45.7, 40 | "network_out_mb": 67.1 41 | }, 42 | { 43 | "instance_id": "i-0abcdef123456789b", 44 | "instance_type": "t3.small", 45 | "state": "stopped", 46 | "availability_zone": "us-west-2a", 47 | "private_ip": "10.0.1.33", 48 | "public_ip": null, 49 | "security_groups": ["sg-staging"], 50 | "tags": { 51 | "Name": "staging-server-01", 52 | "Environment": "staging", 53 | "Team": "qa", 54 | "Application": "test-app" 55 | }, 56 | "launch_time": "2024-01-15T11:20:00Z", 57 | "cpu_utilization": 0.0, 58 | "memory_utilization": 0.0, 59 | "network_in_mb": 0.0, 60 | "network_out_mb": 0.0 61 | }, 62 | { 63 | "instance_id": "i-0987654321fedcbac", 64 | "instance_type": "t3.xlarge", 65 | "state": "running", 66 | "availability_zone": "us-west-2c", 67 | "private_ip": "10.0.3.44", 68 | "public_ip": "54.345.67.89", 69 | "security_groups": ["sg-processing", "sg-ssh"], 70 | "tags": { 71 | "Name": "batch-processor-01", 72 | "Environment": "production", 73 | "Team": "data", 74 | "Application": "etl" 75 | }, 76 | "launch_time": "2024-01-08T16:45:00Z", 77 | "cpu_utilization": 92.7, 78 | "memory_utilization": 76.4, 79 | "network_in_mb": 234.8, 80 | "network_out_mb": 456.2 81 | }, 82 | { 83 | "instance_id": "i-0456789abcdef012d", 84 | "instance_type": "t3.micro", 85 | "state": "running", 86 | "availability_zone": "us-west-2a", 87 | "private_ip": "10.0.1.55", 88 | "public_ip": "54.456.78.90", 89 | "security_groups": ["sg-monitoring", "sg-ssh"], 90 | "tags": { 91 | "Name": "monitoring-server-01", 92 | "Environment": "production", 93 | "Team": "devops", 94 | "Application": "prometheus" 95 | }, 96 | "launch_time": "2024-01-12T08:00:00Z", 97 | "cpu_utilization": 25.3, 98 | "memory_utilization": 48.9, 99 | "network_in_mb": 15.6, 100 | "network_out_mb": 22.1 101 | } 102 | ] 103 | -------------------------------------------------------------------------------- /examples/small/employees.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "EMP001", 4 | "name": "Sarah Connor", 5 | "email": "sarah.connor@company.com", 6 | "department": "Engineering", 7 | "role": "Senior Developer", 8 | "salary": 95000, 9 | "hire_date": "2022-03-15", 10 | "status": "active", 11 | "location": "San Francisco", 12 | "skills": ["Python", "JavaScript", "AWS", "Docker"], 13 | "manager_id": "EMP003" 14 | }, 15 | { 16 | "id": "EMP002", 17 | "name": "John Doe", 18 | "email": "john.doe@company.com", 19 | "department": "Marketing", 20 | "role": "Marketing Manager", 21 | "salary": 75000, 22 | "hire_date": "2021-08-20", 23 | "status": "active", 24 | "location": "New York", 25 | "skills": ["SEO", "Analytics", "Content Marketing", "Social Media"], 26 | "manager_id": "EMP015" 27 | }, 28 | { 29 | "id": "EMP003", 30 | "name": "Alice Kim", 31 | "email": "alice.kim@company.com", 32 | "department": "Engineering", 33 | "role": "Engineering Manager", 34 | "salary": 120000, 35 | "hire_date": "2020-01-10", 36 | "status": "active", 37 | "location": "San Francisco", 38 | "skills": ["Leadership", "Architecture", "Python", "Kubernetes"], 39 | "manager_id": null 40 | }, 41 | { 42 | "id": "EMP004", 43 | "name": "Bob Johnson", 44 | "email": "bob.johnson@company.com", 45 | "department": "Sales", 46 | "role": "Sales Representative", 47 | "salary": 65000, 48 | "hire_date": "2023-05-12", 49 | "status": 
"active", 50 | "location": "Chicago", 51 | "skills": ["CRM", "Negotiation", "Presentations", "Lead Generation"], 52 | "manager_id": "EMP008" 53 | }, 54 | { 55 | "id": "EMP005", 56 | "name": "Emily Davis", 57 | "email": "emily.davis@company.com", 58 | "department": "Engineering", 59 | "role": "Junior Developer", 60 | "salary": 70000, 61 | "hire_date": "2023-09-01", 62 | "status": "active", 63 | "location": "Remote", 64 | "skills": ["JavaScript", "React", "Node.js", "Git"], 65 | "manager_id": "EMP003" 66 | }, 67 | { 68 | "id": "EMP006", 69 | "name": "Michael Chen", 70 | "email": "michael.chen@company.com", 71 | "department": "DevOps", 72 | "role": "DevOps Engineer", 73 | "salary": 105000, 74 | "hire_date": "2021-11-30", 75 | "status": "active", 76 | "location": "Seattle", 77 | "skills": ["AWS", "Terraform", "Jenkins", "Monitoring"], 78 | "manager_id": "EMP003" 79 | }, 80 | { 81 | "id": "EMP007", 82 | "name": "Lisa Wang", 83 | "email": "lisa.wang@company.com", 84 | "department": "Design", 85 | "role": "UX Designer", 86 | "salary": 80000, 87 | "hire_date": "2022-07-18", 88 | "status": "active", 89 | "location": "Los Angeles", 90 | "skills": ["Figma", "User Research", "Prototyping", "Usability Testing"], 91 | "manager_id": "EMP012" 92 | }, 93 | { 94 | "id": "EMP008", 95 | "name": "David Miller", 96 | "email": "david.miller@company.com", 97 | "department": "Sales", 98 | "role": "Sales Manager", 99 | "salary": 90000, 100 | "hire_date": "2020-06-05", 101 | "status": "active", 102 | "location": "Chicago", 103 | "skills": ["Team Leadership", "Strategy", "CRM", "Analytics"], 104 | "manager_id": null 105 | }, 106 | { 107 | "id": "EMP009", 108 | "name": "Jennifer Garcia", 109 | "email": "jennifer.garcia@company.com", 110 | "department": "HR", 111 | "role": "HR Specialist", 112 | "salary": 60000, 113 | "hire_date": "2023-02-14", 114 | "status": "active", 115 | "location": "Austin", 116 | "skills": ["Recruiting", "Employee Relations", "Benefits", "Compliance"], 117 | "manager_id": "EMP013" 118 | }, 119 | { 120 | "id": "EMP010", 121 | "name": "Robert Taylor", 122 | "email": "robert.taylor@company.com", 123 | "department": "Engineering", 124 | "role": "Data Engineer", 125 | "salary": 98000, 126 | "hire_date": "2022-04-25", 127 | "status": "on_leave", 128 | "location": "Boston", 129 | "skills": ["SQL", "Spark", "Airflow", "Data Modeling"], 130 | "manager_id": "EMP003" 131 | }, 132 | { 133 | "id": "EMP011", 134 | "name": "Amanda Wilson", 135 | "email": "amanda.wilson@company.com", 136 | "department": "Finance", 137 | "role": "Financial Analyst", 138 | "salary": 72000, 139 | "hire_date": "2021-12-10", 140 | "status": "active", 141 | "location": "New York", 142 | "skills": ["Excel", "Financial Modeling", "Reporting", "Analysis"], 143 | "manager_id": "EMP014" 144 | }, 145 | { 146 | "id": "EMP012", 147 | "name": "Kevin Brown", 148 | "email": "kevin.brown@company.com", 149 | "department": "Design", 150 | "role": "Design Manager", 151 | "salary": 95000, 152 | "hire_date": "2020-09-15", 153 | "status": "active", 154 | "location": "Los Angeles", 155 | "skills": ["Design Leadership", "Brand Strategy", "Creative Direction"], 156 | "manager_id": null 157 | }, 158 | { 159 | "id": "EMP013", 160 | "name": "Nancy Rodriguez", 161 | "email": "nancy.rodriguez@company.com", 162 | "department": "HR", 163 | "role": "HR Manager", 164 | "salary": 85000, 165 | "hire_date": "2019-11-20", 166 | "status": "active", 167 | "location": "Austin", 168 | "skills": ["HR Strategy", "Leadership", "Policy Development", "Training"], 169 | 
"manager_id": null 170 | }, 171 | { 172 | "id": "EMP014", 173 | "name": "Steve Anderson", 174 | "email": "steve.anderson@company.com", 175 | "department": "Finance", 176 | "role": "Finance Manager", 177 | "salary": 100000, 178 | "hire_date": "2020-03-08", 179 | "status": "active", 180 | "location": "New York", 181 | "skills": [ 182 | "Financial Planning", 183 | "Budgeting", 184 | "Team Leadership", 185 | "Strategy" 186 | ], 187 | "manager_id": null 188 | }, 189 | { 190 | "id": "EMP015", 191 | "name": "Michelle Lee", 192 | "email": "michelle.lee@company.com", 193 | "department": "Marketing", 194 | "role": "Marketing Director", 195 | "salary": 110000, 196 | "hire_date": "2019-05-30", 197 | "status": "terminated", 198 | "location": "New York", 199 | "skills": [ 200 | "Marketing Strategy", 201 | "Brand Management", 202 | "Digital Marketing", 203 | "Leadership" 204 | ], 205 | "manager_id": null 206 | } 207 | ] 208 | -------------------------------------------------------------------------------- /examples/small/error_messages.txt: -------------------------------------------------------------------------------- 1 | ERROR: Database connection timeout after 30 seconds 2 | WARNING: Memory usage exceeded 80% threshold (current: 85%) 3 | CRITICAL: Disk space below 5% on /var/log partition 4 | INFO: User session expired, redirecting to login page 5 | DEBUG: Cache miss for key user_profile_12345 6 | FATAL: Unable to start application server on port 8080 7 | ERROR: Payment processing failed - card declined 8 | WARNING: Rate limit exceeded for API key abc123 9 | CRITICAL: Security breach detected - multiple failed login attempts 10 | INFO: Scheduled maintenance window starting in 10 minutes 11 | ERROR: Failed to send notification email to user@example.com 12 | WARNING: SSL certificate expires in 7 days 13 | DEBUG: SQL query execution time: 2.5 seconds (slow query) 14 | FATAL: Out of memory - cannot allocate additional heap space 15 | ERROR: External API call failed - timeout to payments.example.com 16 | -------------------------------------------------------------------------------- /examples/small/nginx.conf: -------------------------------------------------------------------------------- 1 | # Main nginx configuration file 2 | user nginx; 3 | worker_processes auto; 4 | error_log /var/log/nginx/error.log warn; 5 | pid /var/run/nginx.pid; 6 | 7 | events { 8 | worker_connections 1024; 9 | use epoll; 10 | multi_accept on; 11 | } 12 | 13 | http { 14 | # Basic settings 15 | include /etc/nginx/mime.types; 16 | default_type application/octet-stream; 17 | 18 | # Logging format 19 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 20 | '$status $body_bytes_sent "$http_referer" ' 21 | '"$http_user_agent" "$http_x_forwarded_for"'; 22 | 23 | access_log /var/log/nginx/access.log main; 24 | 25 | # Performance settings 26 | sendfile on; 27 | tcp_nopush on; 28 | tcp_nodelay on; 29 | keepalive_timeout 65; 30 | types_hash_max_size 2048; 31 | 32 | # Gzip compression 33 | gzip on; 34 | gzip_vary on; 35 | gzip_min_length 1024; 36 | gzip_types text/plain text/css application/json application/javascript text/xml; 37 | 38 | # Rate limiting 39 | limit_req_zone $binary_remote_addr zone=api:10m rate=100r/m; 40 | limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m; 41 | 42 | # SSL settings 43 | ssl_protocols TLSv1.2 TLSv1.3; 44 | ssl_prefer_server_ciphers off; 45 | ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256; 46 | 47 | # Upstream backend servers 48 | upstream app_backend { 
49 | server 127.0.0.1:8080 max_fails=3 fail_timeout=30s; 50 | server 127.0.0.1:8081 max_fails=3 fail_timeout=30s backup; 51 | } 52 | 53 | upstream api_backend { 54 | least_conn; 55 | server 10.0.1.10:3000 weight=3; 56 | server 10.0.1.11:3000 weight=2; 57 | server 10.0.1.12:3000 weight=1; 58 | } 59 | 60 | # Main server block 61 | server { 62 | listen 80; 63 | listen [::]:80; 64 | server_name example.com www.example.com; 65 | 66 | # Redirect HTTP to HTTPS 67 | return 301 https://$server_name$request_uri; 68 | } 69 | 70 | # HTTPS server block 71 | server { 72 | listen 443 ssl http2; 73 | listen [::]:443 ssl http2; 74 | server_name example.com www.example.com; 75 | 76 | # SSL configuration 77 | ssl_certificate /etc/ssl/certs/example.com.crt; 78 | ssl_certificate_key /etc/ssl/private/example.com.key; 79 | ssl_session_timeout 1d; 80 | ssl_session_cache shared:MozTLS:10m; 81 | ssl_session_tickets off; 82 | 83 | # Security headers 84 | add_header Strict-Transport-Security "max-age=63072000" always; 85 | add_header X-Content-Type-Options "nosniff" always; 86 | add_header X-Frame-Options "DENY" always; 87 | add_header X-XSS-Protection "1; mode=block" always; 88 | 89 | # API endpoints 90 | location /api/ { 91 | limit_req zone=api burst=20 nodelay; 92 | proxy_pass http://api_backend; 93 | proxy_set_header Host $host; 94 | proxy_set_header X-Real-IP $remote_addr; 95 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 96 | proxy_set_header X-Forwarded-Proto $scheme; 97 | proxy_connect_timeout 30s; 98 | proxy_send_timeout 30s; 99 | proxy_read_timeout 30s; 100 | } 101 | 102 | # Authentication endpoints (stricter rate limiting) 103 | location /api/auth/ { 104 | limit_req zone=login burst=5 nodelay; 105 | proxy_pass http://api_backend; 106 | proxy_set_header Host $host; 107 | proxy_set_header X-Real-IP $remote_addr; 108 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 109 | proxy_set_header X-Forwarded-Proto $scheme; 110 | } 111 | 112 | # Static files 113 | location /static/ { 114 | alias /var/www/static/; 115 | expires 1M; 116 | add_header Cache-Control "public, immutable"; 117 | } 118 | 119 | # Images with optimization 120 | location /images/ { 121 | alias /var/www/images/; 122 | expires 7d; 123 | add_header Cache-Control "public"; 124 | } 125 | 126 | # Health check endpoint 127 | location /health { 128 | access_log off; 129 | return 200 "healthy\n"; 130 | add_header Content-Type text/plain; 131 | } 132 | 133 | # Admin interface (restricted access) 134 | location /admin/ { 135 | allow 192.168.1.0/24; 136 | allow 10.0.0.0/8; 137 | deny all; 138 | 139 | proxy_pass http://app_backend; 140 | proxy_set_header Host $host; 141 | proxy_set_header X-Real-IP $remote_addr; 142 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 143 | } 144 | 145 | # Default location 146 | location / { 147 | proxy_pass http://app_backend; 148 | proxy_set_header Host $host; 149 | proxy_set_header X-Real-IP $remote_addr; 150 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 151 | proxy_set_header X-Forwarded-Proto $scheme; 152 | } 153 | 154 | # Custom error pages 155 | error_page 404 /404.html; 156 | error_page 500 502 503 504 /50x.html; 157 | 158 | location = /404.html { 159 | internal; 160 | root /var/www/error; 161 | } 162 | 163 | location = /50x.html { 164 | internal; 165 | root /var/www/error; 166 | } 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /examples/small/nginx_access.log: 
-------------------------------------------------------------------------------- 1 | 192.168.1.100 - - [18/Jul/2024:09:15:30 +0000] "GET /api/v1/health HTTP/1.1" 200 15 "-" "curl/7.68.0" 2 | 203.0.113.45 - - [18/Jul/2024:09:16:45 +0000] "POST /api/v1/orders HTTP/1.1" 201 342 "https://shop.example.com/cart" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 3 | 198.51.100.23 - - [18/Jul/2024:09:17:12 +0000] "GET /api/v1/products?category=electronics HTTP/1.1" 200 2500 "https://shop.example.com/search" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 4 | 192.168.1.150 - - [18/Jul/2024:09:17:30 +0000] "GET /api/v1/health HTTP/1.1" 503 0 "-" "HealthCheck/1.0" 5 | 203.0.113.67 - - [18/Jul/2024:09:18:05 +0000] "GET /api/v1/users/12345/profile HTTP/1.1" 200 890 "https://shop.example.com/account" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15" 6 | 198.51.100.89 - - [18/Jul/2024:09:18:45 +0000] "PUT /api/v1/users/12345/profile HTTP/1.1" 200 156 "https://shop.example.com/account/edit" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0" 7 | 192.168.1.200 - - [18/Jul/2024:09:19:10 +0000] "GET /api/v1/products?page=2&limit=20 HTTP/1.1" 200 1800 "https://shop.example.com/products" "Mozilla/5.0 (Linux; Android 11; SM-G991B) AppleWebKit/537.36" 8 | 203.0.113.12 - - [18/Jul/2024:09:19:33 +0000] "POST /api/v1/payments HTTP/1.1" 400 89 "https://shop.example.com/checkout" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 9 | 198.51.100.156 - - [18/Jul/2024:09:20:01 +0000] "GET /api/v1/analytics/dashboard HTTP/1.1" 200 5600 "https://admin.example.com/dashboard" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 10 | 192.168.1.100 - - [18/Jul/2024:09:20:15 +0000] "DELETE /api/v1/cart/items/123 HTTP/1.1" 204 0 "https://shop.example.com/cart" "Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15" 11 | 203.0.113.45 - - [18/Jul/2024:09:20:45 +0000] "POST /api/v1/auth/login HTTP/1.1" 401 67 "https://shop.example.com/login" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 12 | 203.0.113.45 - - [18/Jul/2024:09:20:46 +0000] "POST /api/v1/auth/login HTTP/1.1" 401 67 "https://shop.example.com/login" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 13 | 203.0.113.45 - - [18/Jul/2024:09:20:47 +0000] "POST /api/v1/auth/login HTTP/1.1" 401 67 "https://shop.example.com/login" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 14 | 198.51.100.78 - - [18/Jul/2024:09:21:12 +0000] "GET /api/v1/orders/recent HTTP/1.1" 200 1200 "https://shop.example.com/account/orders" "Mozilla/5.0 (Linux; Android 10; SM-A505F) AppleWebKit/537.36" 15 | 192.168.1.175 - - [18/Jul/2024:09:21:30 +0000] "GET /metrics HTTP/1.1" 200 3400 "-" "Prometheus/2.30.0" 16 | 203.0.113.89 - - [18/Jul/2024:09:22:05 +0000] "GET /api/v1/products/PROD001 HTTP/1.1" 200 678 "https://shop.example.com/products/electronics" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_16_0) AppleWebKit/537.36" 17 | 198.51.100.234 - - [18/Jul/2024:09:22:30 +0000] "POST /api/v1/reviews HTTP/1.1" 201 245 "https://shop.example.com/products/PROD001" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0" 18 | 192.168.1.100 - - [18/Jul/2024:09:22:55 +0000] "GET /api/v1/search?q=wireless+headphones HTTP/1.1" 200 2100 "https://shop.example.com/search" "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15" 19 | 203.0.113.123 - - [18/Jul/2024:09:23:10 +0000] "GET 
/api/v1/categories HTTP/1.1" 200 890 "https://shop.example.com/" "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36" 20 | 198.51.100.67 - - [18/Jul/2024:09:23:30 +0000] "PUT /api/v1/cart/items/456 HTTP/1.1" 200 123 "https://shop.example.com/cart" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 21 | 192.168.1.200 - - [18/Jul/2024:09:24:01 +0000] "GET /api/v1/wishlist HTTP/1.1" 200 567 "https://shop.example.com/account/wishlist" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15" 22 | 203.0.113.198 - - [18/Jul/2024:09:24:15 +0000] "POST /api/v1/feedback HTTP/1.1" 201 89 "https://shop.example.com/support/contact" "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15" 23 | 198.51.100.111 - - [18/Jul/2024:09:24:45 +0000] "GET /api/v1/products?category=office&sort=price HTTP/1.1" 200 1600 "https://shop.example.com/categories/office" "Mozilla/5.0 (Linux; Android 11; OnePlus 9) AppleWebKit/537.36" 24 | 192.168.1.150 - - [18/Jul/2024:09:25:10 +0000] "GET /api/v1/health HTTP/1.1" 200 15 "-" "HealthCheck/1.0" 25 | 203.0.113.45 - - [18/Jul/2024:09:25:30 +0000] "GET /api/v1/orders/ORD001/status HTTP/1.1" 200 234 "https://shop.example.com/orders/track" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 26 | 198.51.100.89 - - [18/Jul/2024:09:26:00 +0000] "POST /api/v1/auth/logout HTTP/1.1" 200 45 "https://shop.example.com/account" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 27 | 192.168.1.175 - - [18/Jul/2024:09:26:15 +0000] "GET /metrics HTTP/1.1" 200 3500 "-" "Prometheus/2.30.0" 28 | 203.0.113.67 - - [18/Jul/2024:09:26:30 +0000] "GET /api/v1/notifications HTTP/1.1" 200 450 "https://shop.example.com/account/notifications" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15" 29 | 198.51.100.234 - - [18/Jul/2024:09:26:45 +0000] "GET /api/v1/inventory/status HTTP/1.1" 200 1100 "https://admin.example.com/inventory" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0" 30 | 192.168.1.100 - - [18/Jul/2024:09:27:00 +0000] "GET /api/v1/reports/sales?period=monthly HTTP/1.1" 200 2800 "https://admin.example.com/reports" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 31 | -------------------------------------------------------------------------------- /examples/small/orders.csv: -------------------------------------------------------------------------------- 1 | order_id,customer_id,product_id,quantity,price,order_date,status,payment_method 2 | ORD001,CUST001,PROD001,2,299.99,2024-01-15,completed,credit_card 3 | ORD002,CUST002,PROD003,1,599.99,2024-01-15,processing,paypal 4 | ORD003,CUST001,PROD002,3,149.97,2024-01-16,completed,credit_card 5 | ORD004,CUST003,PROD001,1,299.99,2024-01-17,cancelled,bank_transfer 6 | ORD005,CUST004,PROD004,5,249.95,2024-01-18,completed,credit_card 7 | ORD006,CUST005,PROD002,2,99.98,2024-01-19,shipped,paypal 8 | ORD007,CUST001,PROD005,1,899.99,2024-01-20,processing,credit_card 9 | ORD008,CUST007,PROD001,4,1199.96,2024-01-21,completed,bank_transfer 10 | ORD009,CUST002,PROD004,2,99.98,2024-01-22,shipped,paypal 11 | ORD010,CUST009,PROD003,1,599.99,2024-01-23,completed,credit_card 12 | ORD011,CUST004,PROD002,1,49.99,2024-01-24,processing,credit_card 13 | ORD012,CUST005,PROD005,1,899.99,2024-01-25,cancelled,paypal 14 | ORD013,CUST007,PROD004,3,149.97,2024-01-26,completed,bank_transfer 15 | ORD014,CUST001,PROD001,1,299.99,2024-01-27,shipped,credit_card 16 | ORD015,CUST003,PROD002,2,99.98,2024-01-28,processing,credit_card 17 | 
ORD016,CUST009,PROD001,2,599.98,2024-01-29,completed,credit_card 18 | ORD017,CUST002,PROD005,1,899.99,2024-01-30,shipped,paypal 19 | ORD018,CUST004,PROD003,1,599.99,2024-02-01,completed,credit_card 20 | ORD019,CUST007,PROD002,4,199.96,2024-02-02,processing,bank_transfer 21 | ORD020,CUST005,PROD001,1,299.99,2024-02-03,cancelled,paypal 22 | ORD021,CUST001,PROD004,2,99.98,2024-02-04,completed,credit_card 23 | ORD022,CUST009,PROD005,1,899.99,2024-02-05,shipped,credit_card 24 | ORD023,CUST002,PROD002,3,149.97,2024-02-06,processing,paypal 25 | ORD024,CUST003,PROD004,1,49.99,2024-02-07,completed,credit_card 26 | ORD025,CUST007,PROD003,1,599.99,2024-02-08,shipped,bank_transfer 27 | -------------------------------------------------------------------------------- /examples/small/products.yaml: -------------------------------------------------------------------------------- 1 | products: 2 | - id: PROD001 3 | name: "Wireless Headphones" 4 | category: "Electronics" 5 | price: 299.99 6 | in_stock: true 7 | supplier: "AudioTech" 8 | specifications: 9 | color: ["black", "white", "blue"] 10 | warranty_months: 24 11 | weight_grams: 250 12 | tags: ["bluetooth", "noise-canceling", "premium"] 13 | 14 | - id: PROD002 15 | name: "USB-C Cable" 16 | category: "Accessories" 17 | price: 49.99 18 | in_stock: true 19 | supplier: "CableCorp" 20 | specifications: 21 | length_meters: 2 22 | data_speed: "USB 3.0" 23 | warranty_months: 12 24 | tags: ["usb-c", "fast-charging", "durable"] 25 | 26 | - id: PROD003 27 | name: "Laptop Stand" 28 | category: "Office" 29 | price: 599.99 30 | in_stock: false 31 | supplier: "OfficeGear" 32 | specifications: 33 | material: "aluminum" 34 | adjustable: true 35 | max_weight_kg: 10 36 | warranty_months: 36 37 | tags: ["ergonomic", "adjustable", "premium"] 38 | 39 | - id: PROD004 40 | name: "Mouse Pad" 41 | category: "Accessories" 42 | price: 49.99 43 | in_stock: true 44 | supplier: "DeskMate" 45 | specifications: 46 | size: "large" 47 | material: "fabric" 48 | non_slip: true 49 | warranty_months: 6 50 | tags: ["gaming", "large", "non-slip"] 51 | 52 | - id: PROD005 53 | name: "4K Monitor" 54 | category: "Electronics" 55 | price: 899.99 56 | in_stock: true 57 | supplier: "DisplayTech" 58 | specifications: 59 | size_inches: 27 60 | resolution: "3840x2160" 61 | refresh_rate_hz: 60 62 | warranty_months: 24 63 | tags: ["4k", "monitor", "professional"] 64 | 65 | - id: PROD006 66 | name: "Mechanical Keyboard" 67 | category: "Electronics" 68 | price: 199.99 69 | in_stock: false 70 | supplier: "KeyCraft" 71 | specifications: 72 | switch_type: "blue" 73 | backlit: true 74 | wireless: false 75 | warranty_months: 12 76 | tags: ["mechanical", "tactile", "gaming"] 77 | 78 | - id: PROD007 79 | name: "Webcam HD" 80 | category: "Electronics" 81 | price: 149.99 82 | in_stock: true 83 | supplier: "VideoTech" 84 | specifications: 85 | resolution: "1080p" 86 | frame_rate: 30 87 | auto_focus: true 88 | warranty_months: 18 89 | tags: ["hd", "streaming", "auto-focus"] 90 | 91 | - id: PROD008 92 | name: "Desk Organizer" 93 | category: "Office" 94 | price: 79.99 95 | in_stock: true 96 | supplier: "OfficeGear" 97 | specifications: 98 | material: "bamboo" 99 | compartments: 6 100 | eco_friendly: true 101 | warranty_months: 12 102 | tags: ["bamboo", "eco-friendly", "organizer"] 103 | -------------------------------------------------------------------------------- /examples/small/urls.txt: -------------------------------------------------------------------------------- 1 | https://api.example.com/v1/users 2 | 
https://cdn.example.com/images/logo.png 3 | http://legacy.example.com/old-api 4 | https://docs.example.com/guide/getting-started 5 | ftp://files.example.com/downloads/ 6 | https://shop.example.com/products/electronics 7 | http://internal.example.com/health-check 8 | https://payments.stripe.com/api/v1/charges 9 | https://storage.googleapis.com/bucket/file.pdf 10 | http://monitoring.example.com/metrics 11 | https://auth.example.com/oauth/token 12 | https://api.github.com/repos/user/project 13 | http://database.internal/phpmyadmin 14 | https://mail.google.com/mail/ 15 | https://aws.amazon.com/s3/ 16 | http://192.168.1.100:8080/admin 17 | https://kubernetes.io/docs/ 18 | https://registry.hub.docker.com/ 19 | http://jenkins.company.com:8080/job/deploy 20 | https://grafana.monitoring.local/dashboard 21 | -------------------------------------------------------------------------------- /examples/small/user_behavior.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "session_id": "sess_123456789", 4 | "user_id": "user_001", 5 | "timestamp": "2024-07-18T14:30:15Z", 6 | "page": "/products/electronics", 7 | "action": "page_view", 8 | "duration_seconds": 45, 9 | "device": "desktop", 10 | "browser": "Chrome", 11 | "location": { 12 | "country": "USA", 13 | "state": "California", 14 | "city": "San Francisco" 15 | }, 16 | "referrer": "https://google.com/search?q=wireless+headphones" 17 | }, 18 | { 19 | "session_id": "sess_123456789", 20 | "user_id": "user_001", 21 | "timestamp": "2024-07-18T14:31:00Z", 22 | "page": "/products/headphones/wireless", 23 | "action": "click", 24 | "duration_seconds": 120, 25 | "device": "desktop", 26 | "browser": "Chrome", 27 | "location": { 28 | "country": "USA", 29 | "state": "California", 30 | "city": "San Francisco" 31 | }, 32 | "referrer": "/products/electronics" 33 | }, 34 | { 35 | "session_id": "sess_987654321", 36 | "user_id": "user_002", 37 | "timestamp": "2024-07-18T14:32:30Z", 38 | "page": "/cart", 39 | "action": "add_to_cart", 40 | "duration_seconds": 15, 41 | "device": "mobile", 42 | "browser": "Safari", 43 | "location": { 44 | "country": "Canada", 45 | "state": "Ontario", 46 | "city": "Toronto" 47 | }, 48 | "referrer": "/products/accessories/cables" 49 | }, 50 | { 51 | "session_id": "sess_456789123", 52 | "user_id": "user_003", 53 | "timestamp": "2024-07-18T14:33:45Z", 54 | "page": "/checkout", 55 | "action": "purchase", 56 | "duration_seconds": 180, 57 | "device": "desktop", 58 | "browser": "Firefox", 59 | "location": { 60 | "country": "UK", 61 | "state": "England", 62 | "city": "London" 63 | }, 64 | "referrer": "/cart" 65 | }, 66 | { 67 | "session_id": "sess_789123456", 68 | "user_id": "user_004", 69 | "timestamp": "2024-07-18T14:35:00Z", 70 | "page": "/support/contact", 71 | "action": "page_view", 72 | "duration_seconds": 90, 73 | "device": "tablet", 74 | "browser": "Chrome", 75 | "location": { 76 | "country": "Germany", 77 | "state": "Bavaria", 78 | "city": "Munich" 79 | }, 80 | "referrer": "/products/monitors" 81 | }, 82 | { 83 | "session_id": "sess_321654987", 84 | "user_id": "user_005", 85 | "timestamp": "2024-07-18T14:36:15Z", 86 | "page": "/login", 87 | "action": "login_attempt", 88 | "duration_seconds": 30, 89 | "device": "mobile", 90 | "browser": "Safari", 91 | "location": { 92 | "country": "Australia", 93 | "state": "New South Wales", 94 | "city": "Sydney" 95 | }, 96 | "referrer": "/account/dashboard" 97 | }, 98 | { 99 | "session_id": "sess_654987321", 100 | "user_id": "user_006", 101 | "timestamp": 
"2024-07-18T14:37:30Z", 102 | "page": "/search", 103 | "action": "search", 104 | "duration_seconds": 25, 105 | "device": "desktop", 106 | "browser": "Edge", 107 | "location": { 108 | "country": "USA", 109 | "state": "New York", 110 | "city": "New York" 111 | }, 112 | "referrer": "https://bing.com/search?q=laptop+stand" 113 | }, 114 | { 115 | "session_id": "sess_147258369", 116 | "user_id": "user_007", 117 | "timestamp": "2024-07-18T14:38:45Z", 118 | "page": "/products/office/stands", 119 | "action": "page_view", 120 | "duration_seconds": 75, 121 | "device": "desktop", 122 | "browser": "Chrome", 123 | "location": { 124 | "country": "Japan", 125 | "state": "Tokyo", 126 | "city": "Tokyo" 127 | }, 128 | "referrer": "/search" 129 | }, 130 | { 131 | "session_id": "sess_369258147", 132 | "user_id": "user_008", 133 | "timestamp": "2024-07-18T14:40:00Z", 134 | "page": "/wishlist", 135 | "action": "add_to_wishlist", 136 | "duration_seconds": 10, 137 | "device": "mobile", 138 | "browser": "Chrome", 139 | "location": { 140 | "country": "France", 141 | "state": "Île-de-France", 142 | "city": "Paris" 143 | }, 144 | "referrer": "/products/keyboards/mechanical" 145 | }, 146 | { 147 | "session_id": "sess_258147369", 148 | "user_id": "user_009", 149 | "timestamp": "2024-07-18T14:41:15Z", 150 | "page": "/products/webcams", 151 | "action": "page_view", 152 | "duration_seconds": 60, 153 | "device": "desktop", 154 | "browser": "Safari", 155 | "location": { 156 | "country": "Brazil", 157 | "state": "São Paulo", 158 | "city": "São Paulo" 159 | }, 160 | "referrer": "/categories/electronics" 161 | }, 162 | { 163 | "session_id": "sess_741852963", 164 | "user_id": "user_010", 165 | "timestamp": "2024-07-18T14:42:30Z", 166 | "page": "/reviews", 167 | "action": "write_review", 168 | "duration_seconds": 300, 169 | "device": "desktop", 170 | "browser": "Firefox", 171 | "location": { 172 | "country": "India", 173 | "state": "Maharashtra", 174 | "city": "Mumbai" 175 | }, 176 | "referrer": "/products/monitors/4k" 177 | }, 178 | { 179 | "session_id": "sess_852963741", 180 | "user_id": "user_011", 181 | "timestamp": "2024-07-18T14:45:00Z", 182 | "page": "/account/orders", 183 | "action": "page_view", 184 | "duration_seconds": 40, 185 | "device": "mobile", 186 | "browser": "Safari", 187 | "location": { 188 | "country": "Mexico", 189 | "state": "Mexico City", 190 | "city": "Mexico City" 191 | }, 192 | "referrer": "/login" 193 | }, 194 | { 195 | "session_id": "sess_963741852", 196 | "user_id": "user_012", 197 | "timestamp": "2024-07-18T14:46:15Z", 198 | "page": "/help/faq", 199 | "action": "page_view", 200 | "duration_seconds": 150, 201 | "device": "tablet", 202 | "browser": "Chrome", 203 | "location": { 204 | "country": "South Korea", 205 | "state": "Seoul", 206 | "city": "Seoul" 207 | }, 208 | "referrer": "/support/contact" 209 | }, 210 | { 211 | "session_id": "sess_159357426", 212 | "user_id": "user_013", 213 | "timestamp": "2024-07-18T14:48:30Z", 214 | "page": "/logout", 215 | "action": "logout", 216 | "duration_seconds": 5, 217 | "device": "desktop", 218 | "browser": "Chrome", 219 | "location": { 220 | "country": "Netherlands", 221 | "state": "North Holland", 222 | "city": "Amsterdam" 223 | }, 224 | "referrer": "/account/settings" 225 | }, 226 | { 227 | "session_id": "sess_357426159", 228 | "user_id": "user_014", 229 | "timestamp": "2024-07-18T14:50:00Z", 230 | "page": "/newsletter/signup", 231 | "action": "newsletter_signup", 232 | "duration_seconds": 45, 233 | "device": "mobile", 234 | "browser": "Firefox", 235 | 
"location": { 236 | "country": "Italy", 237 | "state": "Lazio", 238 | "city": "Rome" 239 | }, 240 | "referrer": "/home" 241 | }, 242 | { 243 | "session_id": "sess_426159357", 244 | "user_id": "user_015", 245 | "timestamp": "2024-07-18T14:51:30Z", 246 | "page": "/categories/office", 247 | "action": "page_view", 248 | "duration_seconds": 35, 249 | "device": "desktop", 250 | "browser": "Edge", 251 | "location": { 252 | "country": "Russia", 253 | "state": "Moscow", 254 | "city": "Moscow" 255 | }, 256 | "referrer": "/home" 257 | }, 258 | { 259 | "session_id": "sess_591837264", 260 | "user_id": "user_016", 261 | "timestamp": "2024-07-18T14:53:00Z", 262 | "page": "/products/organizers", 263 | "action": "page_view", 264 | "duration_seconds": 80, 265 | "device": "desktop", 266 | "browser": "Chrome", 267 | "location": { 268 | "country": "Spain", 269 | "state": "Madrid", 270 | "city": "Madrid" 271 | }, 272 | "referrer": "/categories/office" 273 | }, 274 | { 275 | "session_id": "sess_837264591", 276 | "user_id": "user_017", 277 | "timestamp": "2024-07-18T14:54:45Z", 278 | "page": "/compare", 279 | "action": "product_compare", 280 | "duration_seconds": 120, 281 | "device": "tablet", 282 | "browser": "Safari", 283 | "location": { 284 | "country": "Argentina", 285 | "state": "Buenos Aires", 286 | "city": "Buenos Aires" 287 | }, 288 | "referrer": "/products/headphones" 289 | }, 290 | { 291 | "session_id": "sess_264591837", 292 | "user_id": "user_018", 293 | "timestamp": "2024-07-18T14:56:30Z", 294 | "page": "/blog/tech-news", 295 | "action": "page_view", 296 | "duration_seconds": 200, 297 | "device": "desktop", 298 | "browser": "Firefox", 299 | "location": { 300 | "country": "Sweden", 301 | "state": "Stockholm", 302 | "city": "Stockholm" 303 | }, 304 | "referrer": "https://twitter.com/company" 305 | }, 306 | { 307 | "session_id": "sess_975318642", 308 | "user_id": "user_019", 309 | "timestamp": "2024-07-18T14:58:15Z", 310 | "page": "/sitemap", 311 | "action": "page_view", 312 | "duration_seconds": 20, 313 | "device": "mobile", 314 | "browser": "Chrome", 315 | "location": { 316 | "country": "Turkey", 317 | "state": "Istanbul", 318 | "city": "Istanbul" 319 | }, 320 | "referrer": "/help/faq" 321 | }, 322 | { 323 | "session_id": "sess_318642975", 324 | "user_id": "user_020", 325 | "timestamp": "2024-07-18T15:00:00Z", 326 | "page": "/contact/sales", 327 | "action": "contact_form", 328 | "duration_seconds": 240, 329 | "device": "desktop", 330 | "browser": "Safari", 331 | "location": { 332 | "country": "South Africa", 333 | "state": "Western Cape", 334 | "city": "Cape Town" 335 | }, 336 | "referrer": "/products/enterprise" 337 | } 338 | ] 339 | -------------------------------------------------------------------------------- /src/arg.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::Parser; 4 | 5 | use crate::Error; 6 | 7 | /// hawk - Modern data analysis tool for structured data (JSON, YAML, CSV) 8 | /// 9 | /// hawk combines the simplicity of awk with the power of pandas for data exploration. 10 | /// Perfect for analyzing JSON APIs, YAML configs, and CSV datasets. 11 | #[derive(Debug, Parser)] 12 | #[command(name = "hawk")] 13 | #[command(version = "0.2.2")] 14 | #[command(about = "Modern data analysis tool for structured data and text files")] 15 | #[command(long_about = " 16 | hawk is a command-line data analysis tool that brings pandas-like functionality 17 | to your terminal. 
It supports JSON, YAML, CSV, and plain text formats with automatic 18 | detection, powerful filtering, grouping, aggregation, and string manipulation capabilities. 19 | 20 | EXAMPLES: 21 | # Basic field access 22 | hawk '.users[0].name' data.json 23 | hawk '.users.name' data.csv 24 | 25 | 26 | # Text processing (NEW in v0.2.0!) 27 | hawk '. | select(. | contains(\"ERROR\"))' app.log 28 | hawk '. | map(. | trim | upper)' data.txt 29 | hawk '. | map(. | substring(0, 19))' access.log 30 | 31 | # String operations 32 | hawk '. | map(. | replace(\"old\", \"new\"))' text.txt 33 | hawk '. | map(. | split(\",\") | join(\" | \"))' csv_lines.txt 34 | 35 | # Filtering and aggregation 36 | hawk '.users[] | select(.age > 30)' data.yaml 37 | hawk '.sales | group_by(.region) | avg(.amount)' sales.csv 38 | 39 | # Statistical analysis (NEW!) 40 | hawk '. | unique | sort' numbers.txt 41 | hawk '.scores[] | median(.value)' scores.json 42 | hawk '.data[] | stddev(.measurement)' sensor_data.csv 43 | 44 | # Complex pipelines 45 | hawk '. | select(. | contains(\"WARN\")) | map(. | substring(11, 8)) | unique' app.log 46 | hawk '.users[] | map(.email | lower | trim) | select(. | ends_with(\".com\"))' users.csv 47 | 48 | # Data exploration 49 | hawk '. | info' data.json 50 | hawk '.users | count' data.csv 51 | hawk '. | length' any_file.txt 52 | 53 | 54 | SUPPORTED FORMATS: 55 | JSON, YAML, CSV, Plain Text (automatically detected) 56 | 57 | QUERY SYNTAX: 58 | # Field Access 59 | .field - Access field 60 | .array[0] - Access array element 61 | .array[] - Access all array elements 62 | 63 | 64 | # Text Processing (NEW!) 65 | . | map(. | upper) - Convert to uppercase 66 | . | map(. | lower) - Convert to lowercase 67 | . | map(. | trim) - Remove whitespace 68 | . | map(. | length) - Get string length 69 | . | map(. | reverse) - Reverse string 70 | 71 | # String Manipulation 72 | . | map(. | replace(\"a\", \"b\")) - Replace text 73 | . | map(. | substring(0, 5)) - Extract substring 74 | . | map(. | split(\",\")) - Split by delimiter 75 | .array[] | join(\", \") - Join array elements 76 | 77 | # String Filtering 78 | . | select(. | contains(\"text\")) - Contains pattern 79 | . | select(. | starts_with(\"pre\")) - Starts with pattern 80 | . | select(. | ends_with(\"suf\")) - Ends with pattern 81 | 82 | # Statistical Functions (NEW!) 83 | . | unique - Remove duplicates 84 | . | sort - Sort values 85 | . | median - Calculate median 86 | . | stddev - Calculate standard deviation 87 | . | length - Get array/text length 88 | 89 | # Filtering & Aggregation 90 | . | select(.field > 10) - Filter data 91 | . | group_by(.category) - Group data 92 | . | count/sum/avg/min/max - Aggregate functions 93 | 94 | # Data Transformation 95 | . | map(.field | operation) - Transform data with string operations 96 | 97 | 98 | OUTPUT FORMATS: 99 | --format table - Colored table output (default for structured data) 100 | --format json - JSON output with syntax highlighting 101 | --format list - Simple list output 102 | --format auto - Smart format detection (default) 103 | 104 | COLORED OUTPUT: 105 | Automatic color detection (TTY), respects NO_COLOR environment variable 106 | ")] 107 | 108 | pub struct Args { 109 | /// JSONPath-style query to execute 110 | /// 111 | /// Examples: 112 | /// 113 | /// .users[0].name - Get first user's name 114 | /// 115 | /// .users | select(.age > 30) - Filter users by age 116 | /// 117 | /// . 
| group_by(.department) - Group by department 118 | pub query: String, 119 | 120 | /// Input file path (JSON, YAML, CSV, or plain text) 121 | /// 122 | /// If not provided, reads from stdin. 123 | /// File format is automatically detected. 124 | pub path: Option<PathBuf>, 125 | 126 | /// Output format 127 | /// 128 | /// auto: Smart detection (table for arrays, list for values, json for complex) 129 | /// 130 | /// table: Force tabular output 131 | /// 132 | /// json: Force JSON output 133 | /// 134 | /// list: Force list output 135 | /// 136 | /// csv: Force CSV output 137 | #[arg(long, default_value = "auto")] 138 | #[arg(value_parser = ["auto", "table", "json", "list", "csv"])] 139 | pub format: String, 140 | 141 | #[arg(long, short)] 142 | #[arg(help = "Force text format (skip auto-detection)")] 143 | pub text: bool, 144 | } 145 | 146 | #[derive(Debug, Clone)] 147 | pub enum OutputFormat { 148 | Auto, 149 | Json, 150 | Table, 151 | List, 152 | Csv, 153 | } 154 | 155 | impl std::str::FromStr for OutputFormat { 156 | type Err = Error; 157 | 158 | fn from_str(s: &str) -> Result<Self, Self::Err> { 159 | match s.to_lowercase().as_str() { 160 | "auto" => Ok(OutputFormat::Auto), 161 | "json" => Ok(OutputFormat::Json), 162 | "table" => Ok(OutputFormat::Table), 163 | "list" => Ok(OutputFormat::List), 164 | "csv" => Ok(OutputFormat::Csv), 165 | _ => Err(Error::InvalidFormat(format!( 166 | "Invalid format: {}. Valid options: auto, json, table, list, csv", 167 | s 168 | ))), 169 | } 170 | } 171 | }
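 // Illustrative sketch (not in the original file; the module and test names are assumptions): the FromStr impl above lowercases its input, so parsing is case-insensitive. #[cfg(test)] mod format_tests { use super::OutputFormat; #[test] fn parses_formats_case_insensitively() { assert!(matches!("TABLE".parse::<OutputFormat>(), Ok(OutputFormat::Table))); assert!(matches!("csv".parse::<OutputFormat>(), Ok(OutputFormat::Csv))); assert!("xml".parse::<OutputFormat>().is_err()); } }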
172 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Debug, Error)] 4 | pub enum Error { 5 | #[error("Invalid output format: {0}")] 6 | InvalidFormat(String), 7 | 8 | #[error("File not found: {0}")] 9 | FileNotFound(#[from] std::io::Error), 10 | 11 | #[error("JSON deserialization error: {0}")] 12 | Json(#[from] serde_json::Error), 13 | 14 | #[error("YAML deserialization error: {0}")] 15 | Yaml(#[from] serde_yaml::Error), 16 | 17 | #[error("CSV parsing error: {0}")] 18 | Csv(#[from] csv::Error), 19 | 20 | #[error("str parse int error: {0}")] 21 | StrToInt(#[from] std::num::ParseIntError), 22 | 23 | #[error("Invalid query format: {0}")] 24 | InvalidQuery(String), 25 | 26 | #[error("Array index out of bounds: {0}")] 27 | IndexOutOfBounds(usize), 28 | 29 | #[error("Text processing error: {0}")] 30 | TextProcessing(String), 31 | 32 | #[error("String operation error: {0}")] 33 | StringOperation(String), 34 | } 35 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod arg; 2 | pub mod error; 3 | pub mod executor; 4 | pub mod filter; 5 | pub mod output; 6 | pub mod parser; 7 | pub mod setup; 8 | pub mod stats_opts; 9 | pub mod string_ops; 10 | pub mod utils; 11 | 12 | pub use arg::*; 13 | pub use error::*; 14 | pub use executor::*; 15 | pub use filter::*; 16 | pub use output::*; 17 | pub use parser::*; 18 | use serde_json::Value; 19 | pub use setup::*; 20 | pub use stats_opts::*; 21 | pub use string_ops::*; 22 | pub use utils::*; 23 | 24 | pub fn debug_json_order(json: &Value) { 25 | println!("=== Original JSON field order ==="); 26 | 27 | // Root level 28 | if let Value::Object(obj) = json { 29 | println!("Root fields:"); 30 | for key in obj.keys() { 31 | println!(" {}", key); 32 | } 33 | 34 | // Field order of the first element in the users array 35 | if let Some(Value::Array(users)) = obj.get("users") { 36 | if let Some(Value::Object(first_user)) = users.first() { 37 | println!("First user fields:"); 38 | for key in first_user.keys() { 39 | println!(" {}", key); 40 | } 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use hawk_data::{Error, execute_query, setup}; 3 | 4 | fn main() -> Result<(), Error> { 5 | let result = run(); 6 | 7 | if let Err(ref e) = result { 8 | eprintln!("Error: {}", e); 9 | 10 | if let Error::InvalidQuery(_) = e { 11 | eprintln!("\nTry 'hawk --help' for usage examples."); 12 | } 13 | std::process::exit(1); 14 | } 15 | 16 | result 17 | } 18 | 19 | fn run() -> Result<(), Error> { 20 | let (json, query, format) = setup()?; 21 | execute_query(&json, &query, format)?; 22 | Ok(()) 23 | } 24 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use crate::Error; 2 | 3 | pub fn parse_query_segments(query: &str) -> Result<(&str, Vec<&str>), Error> { 4 | // println!("=== parse_query_segments Debug ==="); 5 | // println!("Input query: '{}'", query); 6 | 7 | if query == "." { 8 | return Ok(("", vec![])); 9 | } 10 | 11 | // If the query contains a pipeline, parse only the base query part 12 | let base_query = if query.contains('|') { 13 | query.split('|').next().unwrap().trim() 14 | } else { 15 | query 16 | }; 17 | 18 | // println!("Base query: '{}'", base_query); 19 | 20 | // Special handling for queries like ".[0]" (root array access) 21 | if base_query.starts_with(".[") { 22 | let remaining = &base_query[1..]; 23 | let mut segments = remaining.split('.'); 24 | let first_segment = segments.next().unwrap(); 25 | let rest: Vec<&str> = segments.collect(); 26 | let result = Ok(("", [vec![first_segment], rest].concat())); 27 | // println!("Root array access result: {:?}", result); 28 | return result; 29 | } 30 | 31 | let mut segments = base_query.split('.').skip(1); 32 | let segment = segments 33 | .next() 34 | .ok_or(Error::InvalidQuery("Missing field segment in query".into()))?; 35 | let fields: Vec<&str> = segments.collect(); 36 | 37 | // println!("Normal parse result: {:?}", result); 38 | Ok((segment, fields)) 39 | } 40 | 41 | pub fn parse_array_segment(segment: &str) -> Result<(usize, usize), Error> { 42 | let idx = segment 43 | .find('[') 44 | .ok_or(Error::InvalidQuery("Missing '[' in segment".into()))?; 45 | let ridx = segment 46 | .find(']') 47 | .ok_or(Error::InvalidQuery("Missing ']' in segment".into()))?; 48 | 49 | if idx >= ridx { 50 | return Err(Error::InvalidQuery("Invalid bracket order".into())); 51 | } 52 | 53 | Ok((idx, ridx)) 54 | } 55 | 56 | #[cfg(test)] 57 | mod tests { 58 | use super::*; 59 | use crate::Error; 60 | 61 | #[test] 62 | fn test_parse_query_segments_normal_case() { 63 | // Normal case: basic query 64 | let result = parse_query_segments(".users.name"); 65 | assert!(result.is_ok()); 66 | let (segment, field) = result.unwrap(); 67 | assert_eq!(segment, "users"); 68 | assert_eq!(field, vec!["name"]); 69 | } 70 | 71 | #[test] 72 | fn test_parse_query_segments_with_array_index() { 73 | // Normal case: with array index 74 | let result = parse_query_segments(".users[0].name"); 75 | assert!(result.is_ok()); 76 | let (segment, field) = result.unwrap(); 77 | assert_eq!(segment, "users[0]"); 78 | assert_eq!(field, vec!["name"]); 79 | } 80 | 
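 // Illustrative sketch (not in the original suite; the test name is an assumption): queries are truncated at the first '|' before segment parsing, so pipeline stages never reach the segment splitter. #[test] fn test_parse_query_segments_ignores_pipeline() { let result = parse_query_segments(".users[] | select(.age > 30)"); assert!(result.is_ok()); let (segment, fields) = result.unwrap(); assert_eq!(segment, "users[]"); assert!(fields.is_empty()); }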
81 | #[test] 82 | fn test_parse_query_segments_different_fields() { 83 | // Normal case: different field names 84 | let result = parse_query_segments(".products.price"); 85 | assert!(result.is_ok()); 86 | let (segment, field) = result.unwrap(); 87 | assert_eq!(segment, "products"); 88 | assert_eq!(field, vec!["price"]); 89 | } 90 | 91 | #[test] 92 | fn test_parse_query_segments_complex_index() { 93 | // Normal case: large index 94 | let result = parse_query_segments(".items[123].description"); 95 | assert!(result.is_ok()); 96 | let (segment, field) = result.unwrap(); 97 | assert_eq!(segment, "items[123]"); 98 | assert_eq!(field, vec!["description"]); 99 | } 100 | 101 | #[test] 102 | fn test_parse_query_segments_truly_missing_field() { 103 | // Error case: field segment genuinely missing 104 | let result = parse_query_segments(""); 105 | assert!(result.is_err()); 106 | match result.unwrap_err() { 107 | Error::InvalidQuery(msg) => { 108 | assert!(msg.contains("Missing field segment")); 109 | } 110 | _ => panic!("Expected InvalidQuery error"), 111 | } 112 | } 113 | 114 | #[test] 115 | fn test_parse_query_segments_empty_query() { 116 | // Error case: empty query 117 | let result = parse_query_segments(""); 118 | assert!(result.is_err()); 119 | match result.unwrap_err() { 120 | Error::InvalidQuery(msg) => { 121 | assert!(msg.contains("Missing field segment")); 122 | } 123 | _ => panic!("Expected InvalidQuery error"), 124 | } 125 | } 126 | 127 | #[test] 128 | fn test_parse_array_segment_normal_case() { 129 | // Normal case: basic array index 130 | let result = parse_array_segment("users[0]"); 131 | assert!(result.is_ok()); 132 | let (idx, ridx) = result.unwrap(); 133 | assert_eq!(idx, 5); // position of '[' 134 | assert_eq!(ridx, 7); // position of ']' 135 | } 136 | 137 | #[test] 138 | fn test_parse_array_segment_large_index() { 139 | // Normal case: large index 140 | let result = parse_array_segment("items[123]"); 141 | assert!(result.is_ok()); 142 | let (idx, ridx) = result.unwrap(); 143 | assert_eq!(idx, 5); // position of '[' 144 | assert_eq!(ridx, 9); // position of ']' 145 | } 146 | 147 | #[test] 148 | fn test_parse_array_segment_short_name() { 149 | // Normal case: short field name 150 | let result = parse_array_segment("a[5]"); 151 | assert!(result.is_ok()); 152 | let (idx, ridx) = result.unwrap(); 153 | assert_eq!(idx, 1); // position of '[' 154 | assert_eq!(ridx, 3); // position of ']' 155 | } 156 | 157 | #[test] 158 | fn test_parse_array_segment_missing_open_bracket() { 159 | // Error case: missing '[' 160 | let result = parse_array_segment("users0]"); 161 | assert!(result.is_err()); 162 | match result.unwrap_err() { 163 | Error::InvalidQuery(msg) => { 164 | assert!(msg.contains("Missing '[' in segment")); 165 | } 166 | _ => panic!("Expected InvalidQuery error"), 167 | } 168 | } 169 | 170 | #[test] 171 | fn test_parse_array_segment_missing_close_bracket() { 172 | // Error case: missing ']' 173 | let result = parse_array_segment("users[0"); 174 | assert!(result.is_err()); 175 | match result.unwrap_err() { 176 | Error::InvalidQuery(msg) => { 177 | assert!(msg.contains("Missing ']' in segment")); 178 | } 179 | _ => panic!("Expected InvalidQuery error"), 180 | } 181 | } 182 | 183 | #[test] 184 | fn test_parse_array_segment_invalid_bracket_order() { 185 | // Error case: brackets in reverse order 186 | let result = parse_array_segment("users]0["); 187 | assert!(result.is_err()); 188 | match result.unwrap_err() { 189 | Error::InvalidQuery(msg) => { 190 | assert!(msg.contains("Invalid bracket order")); 191 | } 192 | _ => panic!("Expected InvalidQuery error"), 193 | } 194 | } 195 | 196 | #[test] 197 | fn test_parse_array_segment_empty_brackets() { 198 | // Edge case:
empty brackets 199 | let result = parse_array_segment("users[]"); 200 | assert!(result.is_ok()); // the parse itself succeeds 201 | let (idx, ridx) = result.unwrap(); 202 | assert_eq!(idx, 5); // position of '[' 203 | assert_eq!(ridx, 6); // position of ']' 204 | } 205 | 206 | #[test] 207 | fn test_parse_array_segment_no_brackets() { 208 | // Error case: no brackets at all 209 | let result = parse_array_segment("users"); 210 | assert!(result.is_err()); 211 | match result.unwrap_err() { 212 | Error::InvalidQuery(msg) => { 213 | assert!(msg.contains("Missing '[' in segment")); 214 | } 215 | _ => panic!("Expected InvalidQuery error"), 216 | } 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/setup.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read}; 2 | 3 | use clap::Parser; 4 | use serde_json::Value; 5 | 6 | use crate::{Args, Error, OutputFormat}; 7 | 8 | pub fn setup() -> Result<(Value, String, OutputFormat), Error> { 9 | let args = Args::parse(); 10 | 11 | let content = if let Some(path) = args.path { 12 | std::fs::read_to_string(path)? 13 | } else { 14 | let mut buffer = String::new(); 15 | io::stdin().read_to_string(&mut buffer)?; 16 | buffer 17 | }; 18 | 19 | let input_format = if args.text { 20 | InputFormat::Text 21 | } else { 22 | detect_input_format(&content) 23 | }; 24 | 25 | let data = parse_content(&content, input_format)?; 26 | let query = args.query; 27 | 28 | let format = args 29 | .format 30 | .parse::<OutputFormat>() 31 | .map_err(|e| Error::InvalidFormat(e.to_string()))?; 32 | 33 | // debug 34 | // debug_json_order(&json); 35 | Ok((data, query, format)) 36 | } 37 | 38 | #[derive(Debug)] 39 | enum InputFormat { 40 | Json, 41 | Yaml, 42 | Csv, 43 | Text, 44 | } 45 | 46 | fn detect_input_format(content: &str) -> InputFormat { 47 | let trimmed = content.trim(); 48 | 49 | // Check for CSV first (the simplest format) 50 | if is_likely_csv(trimmed) { 51 | return InputFormat::Csv; 52 | } 53 | 54 | // JSON detection (strict check) - decided before YAML 55 | if (trimmed.starts_with('{') && trimmed.ends_with('}')) 56 | || (trimmed.starts_with('[') && trimmed.ends_with(']')) 57 | { 58 | // Additionally, verify the whole content parses as valid JSON 59 | if serde_json::from_str::<Value>(trimmed).is_ok() { 60 | return InputFormat::Json; 61 | } 62 | } 63 | 64 | // YAML detection - using stricter criteria 65 | if is_structured_yaml(trimmed) { 66 | return InputFormat::Yaml; 67 | } 68 | 69 | // Fall back to Text when none of the above match 70 | InputFormat::Text 71 | } 72 | 73 | // Strictly determine whether the content is structured YAML 74 | fn is_structured_yaml(content: &str) -> bool { 75 | let lines: Vec<&str> = content.lines().collect(); 76 | 77 | if lines.is_empty() { 78 | return false; 79 | } 80 | 81 | // Unambiguous YAML markers such as Kubernetes/Docker Compose files 82 | if content.contains("apiVersion:") 83 | || content.contains("kind:") 84 | || content.contains("version:") 85 | || content.contains("services:") 86 | { 87 | return true; 88 | } 89 | 90 | let mut yaml_indicators = 0; 91 | let mut total_meaningful_lines = 0; 92 | 93 | for line in lines { 94 | let trimmed = line.trim(); 95 | 96 | // Skip blank lines and comments 97 | if trimmed.is_empty() || trimmed.starts_with('#') { 98 | continue; 99 | } 100 | 101 | total_meaningful_lines += 1; 102 | 103 | // Detect YAML structural features 104 | if is_valid_yaml_line(trimmed) { 105 | yaml_indicators += 1; 106 | } 107 | } 108 | 109 | // Too few meaningful lines cannot be YAML 110 | if total_meaningful_lines < 3 { 111 | return false; 112 | } 113 | 114 | // Classify as YAML when more than 80% of lines have YAML structure 115 | (yaml_indicators as f64 / total_meaningful_lines as f64) > 0.8 116 | } 117 | 
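 // Worked example (illustrative): for "key: value\nother: data\nnested:\n  sub: item", all four meaningful lines satisfy is_valid_yaml_line, so the ratio 4/4 exceeds 0.8 and the content is classified as YAML; a single log line such as "2024-01-01 10:00:00 INFO Starting" is rejected earlier because it has fewer than three meaningful lines.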
118 | // Determine whether a single line is valid YAML 119 | fn is_valid_yaml_line(line: &str) -> bool { 120 | // List form (- item) 121 | if line.starts_with("- ") { 122 | return true; 123 | } 124 | 125 | // key: value form 126 | if let Some(colon_pos) = line.find(':') { 127 | let key_part = line[..colon_pos].trim(); 128 | let value_part = line[colon_pos + 1..].trim(); 129 | 130 | // Validate the key part 131 | if key_part.is_empty() { 132 | return false; 133 | } 134 | 135 | // Keys must not contain invalid characters (unquoted spaces) 136 | if key_part.contains(' ') && !key_part.starts_with('"') && !key_part.starts_with('\'') { 137 | return false; 138 | } 139 | 140 | // Indented nested structure 141 | if line.starts_with(" ") || line.starts_with("\t") { 142 | return true; 143 | } 144 | 145 | // Value looks clearly YAML-like 146 | if value_part.is_empty() 147 | || value_part.starts_with('[') 148 | || value_part.starts_with('{') 149 | || value_part == "true" 150 | || value_part == "false" 151 | || value_part.parse::<f64>().is_ok() 152 | { 153 | return true; 154 | } 155 | 156 | // Values containing paths, URLs, timestamps, etc. are likely not YAML 157 | if value_part.contains('/') && value_part.len() > 10 { 158 | return false; 159 | } 160 | 161 | return true; 162 | } 163 | 164 | false 165 | } 166 | 167 | fn parse_content(content: &str, format: InputFormat) -> Result<Value, Error> { 168 | match format { 169 | InputFormat::Json => serde_json::from_str(content).map_err(Error::Json), 170 | InputFormat::Yaml => { 171 | // Handle multi-document YAML 172 | if content.contains("---") { 173 | parse_multi_document_yaml(content) 174 | } else { 175 | serde_yaml::from_str(content).map_err(Error::Yaml) 176 | } 177 | } 178 | InputFormat::Csv => parse_csv_to_json(content), 179 | InputFormat::Text => parse_text_to_json(content), 180 | } 181 | } 182 | 183 | fn parse_text_to_json(content: &str) -> Result<Value, Error> { 184 | // Split the text into lines and treat it as an array 185 | let lines: Vec<Value> = content 186 | .lines() 187 | .map(|line| Value::String(line.to_string())) 188 | .collect(); 189 | 190 | // Return an array even for an empty file 191 | Ok(Value::Array(lines)) 192 | } 193 | 194 | fn parse_multi_document_yaml(content: &str) -> Result<Value, Error> { 195 | let documents: Vec<&str> = content 196 | .split("---") 197 | .map(|doc| doc.trim()) 198 | .filter(|doc| !doc.is_empty()) 199 | .collect(); 200 | 201 | let mut parsed_docs = Vec::new(); 202 | 203 | for doc in documents { 204 | let parsed: Value = serde_yaml::from_str(doc).map_err(Error::Yaml)?; 205 | parsed_docs.push(parsed); 206 | } 207 | 208 | // Return the multiple documents as an array 209 | Ok(Value::Array(parsed_docs)) 210 | } 211 | 212 | fn is_likely_csv(content: &str) -> bool { 213 | let lines: Vec<&str> = content.lines().take(5).collect(); 214 | 215 | if lines.is_empty() { 216 | return false; 217 | } 218 | 219 | // Assume the first line is the header 220 | let first_line = lines[0]; 221 | let comma_count = first_line.matches(',').count(); 222 | 223 | // At least one comma, and the other lines share a similar structure 224 | if comma_count > 0 { 225 | // Check that the other lines have a similar comma count 226 | lines.iter().skip(1).all(|line| { 227 | let line_comma_count = line.matches(',').count(); 228 | (line_comma_count as i32 - comma_count as i32).abs() <= 1 229 | }) 230 | } else { 231 | false 232 | } 233 | } 234 | 
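 // Worked example (illustrative): given "a,b,c\n1,2,3\n4,5", the header has two commas, "1,2,3" matches exactly, and "4,5" is within the +/-1 tolerance, so the content is treated as CSV; a line with no commas at all (a difference of 2) would reject the candidate.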
235 | fn parse_csv_to_json(content: &str) -> Result<Value, Error> { 236 | let mut reader = csv::Reader::from_reader(content.as_bytes()); 237 | 238 | // Read the header row 239 | let headers: Vec<String> = reader 240 | .headers() 241 | .map_err(Error::Csv)? 242 | .iter() 243 | .map(|h| h.trim().to_string()) 244 | .collect(); 245 | 246 | let mut records = Vec::new(); 247 | 248 | for result in reader.records() { 249 | let record = result.map_err(Error::Csv)?; 250 | let mut object = serde_json::Map::new(); 251 | 252 | for (i, field) in record.iter().enumerate() { 253 | if let Some(header) = headers.get(i) { 254 | let value = infer_value_type(field.trim()); 255 | object.insert(header.clone(), value); 256 | } 257 | } 258 | 259 | records.push(Value::Object(object)); 260 | } 261 | 262 | // Return the array directly (do not double-nest it) 263 | Ok(Value::Array(records)) 264 | } 265 | 266 | fn infer_value_type(field: &str) -> Value { 267 | // Empty-string check 268 | if field.is_empty() { 269 | return Value::Null; 270 | } 271 | 272 | // Boolean detection 273 | match field.to_lowercase().as_str() { 274 | "true" => return Value::Bool(true), 275 | "false" => return Value::Bool(false), 276 | _ => {} 277 | } 278 | 279 | // Integer detection 280 | if let Ok(int_val) = field.parse::<i64>() { 281 | return Value::Number(serde_json::Number::from(int_val)); 282 | } 283 | 284 | // Floating-point detection 285 | if let Ok(float_val) = field.parse::<f64>() { 286 | if let Some(num) = serde_json::Number::from_f64(float_val) { 287 | return Value::Number(num); 288 | } 289 | } 290 | 291 | // Default to a string 292 | Value::String(field.to_string()) 293 | } 294 | 295 | // Helper function for text processing 296 | pub fn text_to_json_values(content: &str) -> Result<Vec<Value>, Error> { 297 | let lines: Vec<Value> = content 298 | .lines() 299 | .map(|line| Value::String(line.to_string())) 300 | .collect(); 301 | Ok(lines) 302 | } 303 | 304 | #[cfg(test)] 305 | mod tests { 306 | use super::*; 307 | 308 | #[test] 309 | fn test_text_parsing() { 310 | let content = "line1\nline2\nERROR: something happened"; 311 | let result = parse_text_to_json(content).unwrap(); 312 | 313 | if let Value::Array(lines) = result { 314 | assert_eq!(lines.len(), 3); 315 | assert_eq!(lines[0], Value::String("line1".to_string())); 316 | assert_eq!(lines[1], Value::String("line2".to_string())); 317 | assert_eq!( 318 | lines[2], 319 | Value::String("ERROR: something happened".to_string()) 320 | ); 321 | } else { 322 | panic!("Expected array result"); 323 | } 324 | } 325 | 326 | #[test] 327 | fn test_yaml_detection() { 328 | use super::{is_structured_yaml, is_valid_yaml_line}; 329 | 330 | // Should clearly be recognized as YAML 331 | assert!(is_structured_yaml("apiVersion: v1\nkind: Pod")); 332 | assert!(is_structured_yaml( 333 | "key: value\nother: data\nnested:\n sub: item" 334 | )); 335 | 336 | // Should not be recognized as YAML 337 | assert!(!is_structured_yaml("2024-01-01 10:00:00 INFO Starting")); 338 | assert!(!is_structured_yaml("plain text\nwith some: colons")); 339 | assert!(!is_structured_yaml( 340 | "ServerName: localhost\nServerPort: 8080" 341 | )); // config-file-like, but not YAML 342 | 343 | // Tests for individual lines 344 | assert!(is_valid_yaml_line("key: value")); 345 | assert!(is_valid_yaml_line(" nested: item")); 346 | assert!(is_valid_yaml_line("- list_item")); 347 | assert!(!is_valid_yaml_line("2024-01-01 10:00:00 INFO message")); 348 | assert!(!is_valid_yaml_line("random text line")); 349 | }
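 // Illustrative sketch (not in the original suite; the test name is an assumption): format detection is ordered CSV -> JSON -> YAML -> Text, as the comments in detect_input_format describe. #[test] fn test_detect_input_format_order() { assert!(matches!(detect_input_format("a,b\n1,2"), InputFormat::Csv)); assert!(matches!(detect_input_format("{\"a\": 1}"), InputFormat::Json)); assert!(matches!(detect_input_format("apiVersion: v1\nkind: Pod"), InputFormat::Yaml)); assert!(matches!(detect_input_format("plain text line"), InputFormat::Text)); }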
"median" => apply_median(data, field), 14 | "stddev" => apply_stddev(data, field), 15 | "length" => Ok(Value::Number(serde_json::Number::from(data.len()))), 16 | _ => Err(Error::StringOperation(format!( 17 | "Unknown stats operation: {}", 18 | operation 19 | ))), 20 | } 21 | } 22 | 23 | /// ユニーク値を取得 24 | fn apply_unique(data: &[Value], field: Option<&str>) -> Result { 25 | use std::collections::HashSet; 26 | 27 | let mut unique_values = HashSet::new(); 28 | let mut result = Vec::new(); 29 | 30 | for item in data { 31 | let value_to_check = if let Some(field_name) = field { 32 | // フィールド指定がある場合 33 | item.get(field_name).unwrap_or(&Value::Null).clone() 34 | } else { 35 | // フィールド指定がない場合は値そのもの 36 | item.clone() 37 | }; 38 | 39 | // JSON値をハッシュ可能な文字列に変換 40 | let key = serde_json::to_string(&value_to_check).unwrap_or_default(); 41 | 42 | if unique_values.insert(key) { 43 | result.push(value_to_check); 44 | } 45 | } 46 | 47 | Ok(Value::Array(result)) 48 | } 49 | 50 | /// ソート 51 | fn apply_sort(data: &[Value], field: Option<&str>) -> Result { 52 | let mut sorted_data = data.to_vec(); 53 | 54 | sorted_data.sort_by(|a, b| { 55 | let val_a = if let Some(field_name) = field { 56 | a.get(field_name).unwrap_or(&Value::Null) 57 | } else { 58 | a 59 | }; 60 | 61 | let val_b = if let Some(field_name) = field { 62 | b.get(field_name).unwrap_or(&Value::Null) 63 | } else { 64 | b 65 | }; 66 | 67 | compare_json_values(val_a, val_b) 68 | }); 69 | 70 | Ok(Value::Array(sorted_data)) 71 | } 72 | 73 | /// 中央値を計算 74 | fn apply_median(data: &[Value], field: Option<&str>) -> Result { 75 | let mut numbers = extract_numbers(data, field)?; 76 | 77 | if numbers.is_empty() { 78 | return Ok(Value::Null); 79 | } 80 | 81 | numbers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); 82 | 83 | let len = numbers.len(); 84 | let median = if len % 2 == 0 { 85 | // 偶数個の場合は中央2つの平均 86 | (numbers[len / 2 - 1] + numbers[len / 2]) / 2.0 87 | } else { 88 | // 奇数個の場合は中央値 89 | numbers[len / 2] 90 | }; 91 | 92 | Ok(Value::Number(serde_json::Number::from_f64(median).unwrap())) 93 | } 94 | 95 | /// 標準偏差を計算 96 | fn apply_stddev(data: &[Value], field: Option<&str>) -> Result { 97 | let numbers = extract_numbers(data, field)?; 98 | 99 | if numbers.len() < 2 { 100 | return Ok(Value::Null); 101 | } 102 | 103 | let mean = numbers.iter().sum::() / numbers.len() as f64; 104 | let variance = 105 | numbers.iter().map(|x| (x - mean).powi(2)).sum::() / (numbers.len() - 1) as f64; // 標本標準偏差 106 | 107 | let stddev = variance.sqrt(); 108 | 109 | Ok(Value::Number(serde_json::Number::from_f64(stddev).unwrap())) 110 | } 111 | 112 | /// 数値を抽出 113 | fn extract_numbers(data: &[Value], field: Option<&str>) -> Result, Error> { 114 | let mut numbers = Vec::new(); 115 | 116 | for item in data { 117 | let value = if let Some(field_name) = field { 118 | item.get(field_name).unwrap_or(&Value::Null) 119 | } else { 120 | item 121 | }; 122 | 123 | if let Some(num) = value.as_f64() { 124 | numbers.push(num); 125 | } 126 | } 127 | 128 | Ok(numbers) 129 | } 130 | 131 | /// JSON値の比較 132 | fn compare_json_values(a: &Value, b: &Value) -> std::cmp::Ordering { 133 | use std::cmp::Ordering; 134 | 135 | match (a, b) { 136 | (Value::Number(n1), Value::Number(n2)) => n1 137 | .as_f64() 138 | .unwrap_or(0.0) 139 | .partial_cmp(&n2.as_f64().unwrap_or(0.0)) 140 | .unwrap_or(Ordering::Equal), 141 | (Value::String(s1), Value::String(s2)) => s1.cmp(s2), 142 | (Value::Bool(b1), Value::Bool(b2)) => b1.cmp(b2), 143 | (Value::Null, Value::Null) => Ordering::Equal, 144 | 
(Value::Null, _) => Ordering::Less, 145 | (_, Value::Null) => Ordering::Greater, 146 | // Different types: compare by type priority 147 | _ => get_type_priority(a).cmp(&get_type_priority(b)), 148 | } 149 | } 150 | 151 | /// Priority ordering of types 152 | fn get_type_priority(value: &Value) -> u8 { 153 | match value { 154 | Value::Null => 0, 155 | Value::Bool(_) => 1, 156 | Value::Number(_) => 2, 157 | Value::String(_) => 3, 158 | Value::Array(_) => 4, 159 | Value::Object(_) => 5, 160 | } 161 | } 162 | 163 | #[cfg(test)] 164 | mod tests { 165 | use super::*; 166 | 167 | #[test] 168 | fn test_unique_operation() { 169 | let data = vec![ 170 | Value::String("apple".to_string()), 171 | Value::String("banana".to_string()), 172 | Value::String("apple".to_string()), 173 | Value::String("cherry".to_string()), 174 | ]; 175 | 176 | let result = apply_unique(&data, None).unwrap(); 177 | if let Value::Array(arr) = result { 178 | assert_eq!(arr.len(), 3); // apple, banana, cherry 179 | } else { 180 | panic!("Expected array result"); 181 | } 182 | } 183 | 184 | #[test] 185 | fn test_sort_numbers() { 186 | let data = vec![ 187 | Value::Number(3.into()), 188 | Value::Number(1.into()), 189 | Value::Number(4.into()), 190 | Value::Number(2.into()), 191 | ]; 192 | 193 | let result = apply_sort(&data, None).unwrap(); 194 | if let Value::Array(arr) = result { 195 | assert_eq!(arr[0], Value::Number(1.into())); 196 | assert_eq!(arr[1], Value::Number(2.into())); 197 | assert_eq!(arr[2], Value::Number(3.into())); 198 | assert_eq!(arr[3], Value::Number(4.into())); 199 | } else { 200 | panic!("Expected array result"); 201 | } 202 | } 203 | 204 | #[test] 205 | fn test_median_even() { 206 | let data = vec![ 207 | Value::Number(1.into()), 208 | Value::Number(2.into()), 209 | Value::Number(4.into()), 210 | Value::Number(5.into()), 211 | ]; 212 | 213 | let result = apply_median(&data, None).unwrap(); 214 | assert_eq!( 215 | result, 216 | Value::Number(serde_json::Number::from_f64(3.0).unwrap()) 217 | ); 218 | } 219 | 220 | #[test] 221 | fn test_stddev() { 222 | let data = vec![ 223 | Value::Number(1.into()), 224 | Value::Number(2.into()), 225 | Value::Number(3.into()), 226 | Value::Number(4.into()), 227 | Value::Number(5.into()), 228 | ]; 229 | 230 | let result = apply_stddev(&data, None).unwrap(); 231 | // sample standard deviation ≈ 1.58 232 | if let Value::Number(n) = result { 233 | let stddev = n.as_f64().unwrap(); 234 | assert!((stddev - 1.58).abs() < 0.1); 235 | } else { 236 | panic!("Expected number result"); 237 | } 238 | } 239 | 240 | #[test] 241 | fn test_unique_with_field() { 242 | let data = vec![ 243 | serde_json::json!({"name": "Alice", "age": 30}), 244 | serde_json::json!({"name": "Bob", "age": 25}), 245 | serde_json::json!({"name": "Alice", "age": 35}), 246 | ]; 247 | 248 | let result = apply_unique(&data, Some("name")).unwrap(); 249 | if let Value::Array(arr) = result { 250 | assert_eq!(arr.len(), 2); // Alice, Bob 251 | } else { 252 | panic!("Expected array result"); 253 | } 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use serde_json::Value; 2 | 3 | pub fn value_to_string(value: &Value) -> String { 4 | match value { 5 | Value::String(s) => s.clone(), 6 | _ => value.to_string().trim_matches('"').to_string(), 7 | } 8 | } 9 | 10 | #[cfg(test)] 11 | mod tests { 12 | use super::*; 13 | 14 | #[test] 15 | fn test_value_to_string_with_string() { 16 | let value = Value::String("Alice".to_string()); 17 | 
assert_eq!(value_to_string(&value), "Alice"); 18 | } 19 | 20 | #[test] 21 | fn test_value_to_string_with_number() { 22 | let value = Value::Number(serde_json::Number::from(42)); 23 | assert_eq!(value_to_string(&value), "42"); 24 | } 25 | 26 | #[test] 27 | fn test_value_to_string_with_boolean() { 28 | let value = Value::Bool(true); 29 | assert_eq!(value_to_string(&value), "true"); 30 | 31 | let value = Value::Bool(false); 32 | assert_eq!(value_to_string(&value), "false"); 33 | } 34 | 35 | #[test] 36 | fn test_value_to_string_with_null() { 37 | let value = Value::Null; 38 | assert_eq!(value_to_string(&value), "null"); 39 | } 40 | } 41 | --------------------------------------------------------------------------------