├── .gitignore ├── CONTRIBUTORS.md ├── EXAMPLES.md ├── LDE.g4 ├── LICENSE ├── Makefile ├── PERFORMANCE.md ├── PERFORMANCE_MANUAL.md ├── RATIONALE.md ├── README.md ├── SAMPLE.md ├── TOOL_RULES.md ├── benchmarker.7z ├── benchmarking ├── easy.go ├── easy.ragel ├── easy_floats.go ├── easy_floats.ragel ├── easy_real_world_test.go ├── harder_test.go ├── main.go ├── performance_test.go ├── rule.lde ├── rule_lde.go ├── template.go └── template.ragel ├── columngen.7z ├── dicts.go ├── go.mod ├── go.sum ├── internal ├── ast │ ├── action.go │ ├── action_anonymous_option.go │ ├── action_at_end.go │ ├── action_check_fixed_without_pass.go │ ├── action_error_on_mismatch.go │ ├── action_may_be_start_char.go │ ├── action_may_be_start_string.go │ ├── action_optional.go │ ├── action_optional_silent.go │ ├── action_pass_after.go │ ├── action_pass_after_or_ignore.go │ ├── action_pass_before.go │ ├── action_pass_before_or_ignore.go │ ├── action_pass_first.go │ ├── action_pass_heading_characters.go │ ├── action_rest_length_check.go │ ├── action_start_char.go │ ├── action_start_char_without_pass.go │ ├── action_start_string.go │ ├── action_start_string_without_pass.go │ ├── action_take.go │ ├── action_take_including.go │ ├── action_take_rest.go │ ├── action_take_until_including_or_rest.go │ ├── action_take_until_or_rest.go │ ├── action_type_registration.go │ ├── action_z_dispatcher.go │ ├── error.go │ ├── field.go │ ├── rule.go │ ├── target-enums.go │ ├── targetenum_string.go │ ├── targets.go │ └── util.go ├── generator │ ├── generator.go │ ├── gogen │ │ ├── decodergen.go │ │ ├── generator.go │ │ ├── heads.go │ │ ├── internal │ │ │ ├── mnemo │ │ │ │ ├── mnemo.go │ │ │ │ └── shortcut.go │ │ │ └── srcobj │ │ │ │ ├── assign.go │ │ │ │ ├── body.go │ │ │ │ ├── call.go │ │ │ │ ├── drawchar.go │ │ │ │ ├── file.go │ │ │ │ ├── flow_for.go │ │ │ │ ├── flow_if.go │ │ │ │ ├── index.go │ │ │ │ ├── lookup_long.go │ │ │ │ ├── lookup_short.go │ │ │ │ ├── main.go │ │ │ │ ├── method.go │ │ │ │ ├── misc.go │ 
│ │ │ ├── operators.go │ │ │ │ ├── prefix.go │ │ │ │ ├── raw.go │ │ │ │ ├── return.go │ │ │ │ ├── slice.go │ │ │ │ ├── struct.go │ │ │ │ └── vars.go │ │ ├── lookup.go │ │ ├── optionals.go │ │ ├── takers.go │ │ └── util.go │ └── platform_type.go ├── ldetesting │ ├── error_1.lde │ ├── error_test.go │ ├── generate.go │ ├── missing_import.lde │ ├── missing_import_lde.go │ ├── missing_import_lde_test.go │ ├── missing_import_unmarshals.go │ ├── parsing.lde │ ├── parsing_lde.go │ ├── parsing_test.go │ ├── parsing_unmarshals.go │ ├── regressions.lde │ ├── regressions_lde.go │ ├── regressions_lde_test.go │ ├── regressions_unmarshals.go │ ├── string.lde │ ├── string_lde.go │ ├── string_test.go │ └── string_unmarshals.go ├── listener │ └── listener.go ├── parser │ ├── LDE.interp │ ├── LDE.tokens │ ├── LDELexer.interp │ ├── LDELexer.tokens │ ├── lde_base_listener.go │ ├── lde_lexer.go │ ├── lde_listener.go │ └── lde_parser.go ├── srcbuilder │ ├── builder.go │ └── dispatching.go └── types │ ├── builtin.go │ ├── field.go │ ├── fields_generator.go │ ├── gen_bool.go │ ├── gen_dec128.go │ ├── gen_dec32.go │ ├── gen_dec64.go │ ├── gen_float32.go │ ├── gen_float64.go │ ├── gen_hex.go │ ├── gen_hex16.go │ ├── gen_hex32.go │ ├── gen_hex64.go │ ├── gen_hex8.go │ ├── gen_int.go │ ├── gen_int16.go │ ├── gen_int32.go │ ├── gen_int64.go │ ├── gen_int8.go │ ├── gen_oct.go │ ├── gen_oct16.go │ ├── gen_oct32.go │ ├── gen_oct64.go │ ├── gen_oct8.go │ ├── gen_str.go │ ├── gen_string.go │ ├── gen_uint.go │ ├── gen_uint16.go │ ├── gen_uint32.go │ ├── gen_uint64.go │ ├── gen_uint8.go │ ├── internal │ └── gen-builtin │ │ └── gen-builtin.go │ ├── type_custom.go │ ├── type_imported.go │ ├── type_local.go │ └── type_registration.go ├── main.go ├── msg_translator.go ├── msg_translator_rules.lde └── msg_translator_rules_lde.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.dll 4 | *.so 5 | *.dylib 6 | 
vendor/* 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 15 | .glide/ 16 | 17 | .idea/ 18 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | | Name | Github profile | Contribution | 2 | |------------------|----------------------------------------------------|-----------------------| 3 | | Denis Cheremisov | [github.com/sirkon](https://github.com/sirkon) | Owner | 4 | | Tim De Waal | [github.com/Caesurus](https://github.com/Caesurus) | Bug reports and fixes | 5 | | Matt Hook | [github.com/hookenz](https://github.com/hookenz) | Bug reports and proposals | 6 | | | [github.com/traetox](https://github.com/traetox) | Bug reports | 7 | | Casey Strouse | [github.com/cstrouse](https://github.com/cstrouse) | Bug reports and fixes | 8 | -------------------------------------------------------------------------------- /EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | 3 | 4 | # What if some parts may be missing 5 | 6 | Example: 7 | 8 | ``` 9 | 1905-04-15 name[Sergey Mihoparov] died[2010-07-12] education[school4] jailed[1951-11-02_1959-11-01] moderate-negative single experience[35] 10 | 1989-03-29 name[Nikita Zverev] education[school10+university6] marred1- experience[6] 11 | 2017-09-14 name[Aleksey Bezborodko] single experience[] 12 | ``` 13 | 14 | Look at the `moderate-negative` characteristic – imagine we don't care about it. 
In this case a rule that will consume both these lines may look like 15 | 16 | ```perl 17 | Rule = 18 | Time(string) ' ' 19 | ^"name[" Name(string) ']' ^' ' 20 | ??Death( 21 | ^"died[" Date(string) ']' ^' ' 22 | ) 23 | ^"education[" Education(string) ']' ^' ' 24 | ?Jailed( 25 | ^"jailed[" Intervals(string) ']' ^' ' 26 | ) 27 | ?( ^"moderate-negative ") 28 | Status(string) ' ' 29 | ^"experience[" 30 | ??Experience( 31 | Years(int) ']' 32 | ); 33 | ``` 34 | 35 | Please take a look at the optional areas. The area `?Jailed` will create a substructure with a `Valid bool` field, which is set to true if the area was matched successfully and to 36 | false otherwise. Remember though: if an area contains a numeric taker and the conversion fails, the whole `Rule` fails. Use `??` to ignore conversion errors as well. 37 | 38 | And the so-called ___anonymous area___ `?( ^"moderate-negative ")`: it checks if the rest starts with `moderate-negative ` and passes it if so, or goes on to the next action otherwise. 39 | 40 | 41 | # Case when we need all data up to some character, or all the data if the character was not found 42 | 43 | Lines: 44 | 45 | ``` 46 | Vladimir 37 186 91 15 47 | Mikhail 35 184 76 48 | ``` 49 | 50 | And we don't need the last number `Vladimir` has. So, use this rule 51 | 52 | ```perl 53 | Rule = 54 | Name(string) ' ' 55 | Age(int) ' ' 56 | Height(int) ' ' 57 | Weight(int) ?' ' 58 | ; 59 | ``` 60 | 61 | See the `?` before the last `' '` – it tells the tool to take everything as `Weight` up to the first `' '`, or up to the end if there is none 62 | 63 | # Case when bounds are needed too 64 | 65 | Suppose we have something like 66 | 67 | ``` 68 | 2019-09-08T23:47:11.671194+03:00 id=some_id version=1.1.1 coords={"lat":11.11, "lon":22.22} locale=en-EN 69 | ``` 70 | 71 | and `' '` won't work as a bound for coords as we have one in the JSON itself. 
The solution is to use the 72 | `[]` taker, which stores the bound itself as well: 73 | 74 | ```perl 75 | Rule = 76 | Time(string) " id=" 77 | ID(string) " version=" 78 | Version(string) " coords=" 79 | Coords[string] '}' ^" locale=" 80 | Locale(string) 81 | ; 82 | ``` 83 | 84 | # Case of extended data format, where lines in B look like lines from A with some more data appended 85 | 86 | Suppose we have two files __a.info__ and __b.info__: 87 | 88 | ##### `a.info` 89 | ```a.info a.info 90 | 1 2 3 4 91 | 12 13 14 15 92 | 1 1 1 1 93 | ``` 94 | 95 | ##### `b.info` 96 | 97 | ``` 98 | 1 2 3 4 5 99 | 5 4 3 2 1 100 | ``` 101 | 102 | You see, the format in `b.info` clearly extends the one in `a.info` and we would 103 | love to process both in a similar way (with the last column defaulted to some 104 | value in case of `a.info`, most likely to `0`) 105 | 106 | How can we achieve this? Tricky, but simple: 107 | 108 | ### Rules 109 | 110 | 111 | ```perl 112 | A = First(int) ' ' Second(int) ' ' Third(int) ' ' Fourth(int); 113 | B = First(int) ' ' Second(int) ' ' Third(int) ' ' Fourth(int) ' ' Fifth(int); 114 | ``` 115 | 116 | and compile it 117 | 118 | ``` 119 | ldetool --package main rule.lde 120 | ``` 121 | 122 | ### Usage 123 | 124 | `b.info` parses in a simple manner 125 | 126 | ```go 127 | for bInfo.Next() { 128 | var b B 129 | if ok, err := b.Extract(bInfo.Bytes()); !ok { 130 | if err != nil { 131 | return … 132 | } 133 | return … 134 | } 135 | processInfo(b) 136 | } 137 | ``` 138 | 139 | parsing `a.info`: 140 | 141 | ```go 142 | for aInfo.Next() { 143 | var b B 144 | if ok, err := (*A)(unsafe.Pointer(&b)).Extract(aInfo.Bytes()); !ok { 145 | if err != nil { 146 | return … 147 | } 148 | return … 149 | } 150 | processInfo(b) 151 | } 152 | ``` 153 | 154 | This will work as the memory layout of A looks exactly like the memory layout of the first four fields 155 | of B -------------------------------------------------------------------------------- /LDE.g4: 
-------------------------------------------------------------------------------- 1 | grammar LDE; 2 | 3 | @lexer::declarations { 4 | comments map[int]map[int]string 5 | } 6 | 7 | @lexer::construction { 8 | l.comments = map[int]map[int]string{} 9 | } 10 | 11 | @lexer::members { 12 | func (l *LDELexer) Comments() map[int]map[int]string { 13 | return l.comments 14 | } 15 | } 16 | 17 | rules 18 | : typeDeclaration* atomicRule* EOF 19 | ; 20 | 21 | typeDeclaration 22 | : 'type' TypeName 'from' StringLit ';' 23 | | 'type' Identifier ';' 24 | ; 25 | 26 | atomicRule 27 | : Identifier '=' baseAction ';'; 28 | 29 | baseAction 30 | : Stress baseAction 31 | | '(' baseAction ')' baseAction 32 | | '(' baseAction ')' 33 | | atomicAction baseAction 34 | | atomicAction 35 | ; 36 | 37 | atomicAction 38 | : passTargetPrefix 39 | | checkTargetPrefix 40 | | passHeadingCharacters 41 | | mayBePassTargetPrefix 42 | | passChars 43 | | passUntil 44 | | mayPassUntil 45 | | goUntil 46 | | mayGoUntil 47 | | takeUntil 48 | | takeUntilIncluding 49 | | takeUntilOrRest 50 | | takeUntilIncludingOrRest 51 | | takeUntilRest 52 | | optionalNamedArea 53 | | optionalNamedSilentArea 54 | | optionalArea 55 | | restCheck 56 | | atEnd; 57 | 58 | passHeadingCharacters 59 | : '*' CharLit; 60 | 61 | passTargetPrefix 62 | : '^' targetLit '[' IntLit ']' 63 | | '^' targetLit 64 | ; 65 | 66 | checkTargetPrefix 67 | : '@' targetLit '[' IntLit ']' 68 | | '@' targetLit 69 | ; 70 | 71 | mayBePassTargetPrefix 72 | : '?' '^' targetLit '[' IntLit ']' 73 | | '?' '^' targetLit 74 | ; 75 | 76 | passChars 77 | : '_' '[' IntLit ':' ']'; 78 | 79 | goUntil 80 | : '..' target; 81 | 82 | mayGoUntil 83 | : '?' '..' target; 84 | 85 | passUntil 86 | : '_' target; 87 | 88 | mayPassUntil 89 | : '?' '_' target; 90 | 91 | takeUntil 92 | : Identifier '(' fieldType ')' target; 93 | 94 | takeUntilIncluding 95 | : Identifier '[' fieldType ']' target; 96 | 97 | takeUntilOrRest 98 | : Identifier '(' fieldType ')' '?' 
target; 99 | 100 | takeUntilIncludingOrRest 101 | : Identifier '[' fieldType ']' '?' target; 102 | 103 | takeUntilRest 104 | : Identifier '(' fieldType ')'; 105 | 106 | optionalNamedArea 107 | : '?' Identifier '(' baseAction ')'; 108 | 109 | optionalNamedSilentArea 110 | : '??' Identifier '(' baseAction ')'; 111 | 112 | optionalArea 113 | : '?' '(' baseAction ')'; 114 | 115 | restCheck 116 | : '%' IntLit 117 | | '%' ComparisonOperator IntLit; 118 | 119 | atEnd 120 | : '$'; 121 | 122 | 123 | target 124 | : targetLit bound 125 | | targetLit limit 126 | | targetLit exact 127 | | targetLit jump 128 | | targetLit 129 | | '~' target; 130 | 131 | targetLit 132 | : CharLit 133 | | StringLit; 134 | 135 | bound 136 | : '[' IntLit ':' IntLit ']' 137 | ; 138 | 139 | limit 140 | : '[' ':' IntLit ']' 141 | ; 142 | 143 | jump 144 | : '[' IntLit ':' ']' 145 | ; 146 | 147 | exact 148 | : '[' IntLit ']' 149 | ; 150 | 151 | fieldType 152 | : IdentifierWithFraction 153 | | Identifier 154 | | DollarIdentifier 155 | | TypeName 156 | ; 157 | 158 | ComparisonOperator 159 | : [<>] 160 | ; 161 | 162 | DollarIdentifier 163 | : '$' [a-zA-Z_] ([a-zA-Z0-9_]*) 164 | ; 165 | 166 | Identifier 167 | : [a-zA-Z_] ([a-zA-Z0-9_]*) 168 | ; 169 | 170 | TypeName 171 | : '*'* [a-zA-Z_] ([a-zA-Z0-9_]*) '.' [a-zA-Z_] ([a-zA-Z0-9_]*) 172 | ; 173 | 174 | IdentifierMayStar 175 | : '*'* [a-zA-Z_] ([a-zA-Z0-9_]*) 176 | ; 177 | 178 | IdentifierWithFraction 179 | : [a-zA-Z_] ([a-zA-Z0-9_]*) '.' [0-9]+ 180 | ; 181 | 182 | IntLit 183 | : [0-9]+ 184 | ; 185 | 186 | fragment EscapedQuote : '\\"'; 187 | StringLit : '"' ( EscapedQuote | ~('\n'|'\r'|'\t') ) ( EscapedQuote | ~('\n'|'\r'|'\t') )*? '"' 188 | ; // remember this is actually not empty string, as empty strings has no sense in this task 189 | 190 | fragment EscapedApo : '\\\''; 191 | 192 | CharLit 193 | : '\'' ( EscapedApo | ~('\n'|'\r'|'\t') )*? 
'\'' 194 | ; 195 | 196 | WS 197 | : [ \n\t\r] -> skip 198 | ; 199 | 200 | LineComment 201 | : '#' ~[\r\n]* { 202 | v, ok := l.comments[l.GetLine()] 203 | if !ok { 204 | v = map[int]string{} 205 | } 206 | v[l.GetCharPositionInLine()-len([]rune(l.GetText()))] = l.GetText()[1:] 207 | l.comments[l.GetLine()] = v 208 | l.Skip() 209 | }; 210 | 211 | Stress 212 | : '!' 213 | ; 214 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Denis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | PATH=${GOPATH}/bin:${PATH} 3 | go install 4 | go generate ./internal/ldetesting 5 | which ldetool 6 | go test -test.v ./internal/ldetesting 7 | 8 | grammar: 9 | antlr4 -no-visitor -listener -o internal/parser -Dlanguage=Go LDE.g4 10 | -------------------------------------------------------------------------------- /RATIONALE.md: -------------------------------------------------------------------------------- 1 | # Rationale 2 | 3 | There's a traditional solution for this kind of task: regular expressions with capture groups. But it has numerous generic and Go-specific disadvantages: 4 | 5 | 1. Regexes are hard to read and debug. 6 | 2. Regexes don't help with string → numeric conversions 7 | 3. You can't generally say what caused an error with regexes, so +1 to debug complexity. 8 | 4. Speed. While simple non-capturing regular expressions can be speedy, they quickly become slow as the complexity of the regular expression grows 9 | 5. They are overpowered for simple log parsing. In our experience with log processing we are not looking for patterns within the line. Usually our data is well structured and it is easier to think (and compute!) in terms of bounds and separators. And if the data is not well structured then it is a good idea to make it so, just for the sake of readability. 10 | 6. Go regular expressions are slow. Go regular expressions with group capture are even slower. 11 | 7. There is no cheap way in Go regexes to conveniently access a group's value: we must use arrays instead of accessing the captured value by group name, which hurts readability and comprehension. 12 | 13 | There is another traditional approach: manual data extraction. 
We manually command to find a symbol or substring and pass 14 | it, or take everything before it and put it into a variable. It also has its share of generic disadvantages: 15 | 16 | 1. It is annoying as hell to write it 17 | 2. It can be hard to read 18 | 19 | Still, the major advantage is: 20 | 1. It can be fast 21 | 22 | In my previous job we had a lack of funding for hardware while the data was constantly growing. We moved from Python to Go to deal with it. And a prototype of this tool was created to simplify our tasks, as we were writing line data decomposers manually. In the end our parsing scripts became even faster than they were before, as the code generator is not afraid of making even the most boring yet speedy things again and again. 23 | -------------------------------------------------------------------------------- /SAMPLE.md: -------------------------------------------------------------------------------- 1 | ```go 2 | /* This file was autogenerated via 3 | ---------------------------------------- 4 | ldetool generate --package main line.lde 5 | ---------------------------------------- 6 | do not touch it with bare hands! 
7 | */ 8 | 9 | package main 10 | 11 | import ( 12 | "bytes" 13 | "fmt" 14 | "strconv" 15 | "unsafe" 16 | ) 17 | 18 | var countryLsbrck = []byte("country[") 19 | var firstLsbrck = []byte("first[") 20 | var spaceFETCHSpace = []byte(" FETCH ") 21 | var spaceFormatLsbrck = []byte(" format[") 22 | var spaceHiddenLsbrck = []byte(" hidden[") 23 | var spaceUserUscoreAgentLsbrck = []byte(" user_agent[") 24 | 25 | // Line autogenerated parser 26 | type Line struct { 27 | rest []byte 28 | Time []byte 29 | First uint8 30 | Format []byte 31 | Hidden struct { 32 | Valid bool 33 | Value uint8 34 | } 35 | UserAgent []byte 36 | Country []byte 37 | } 38 | 39 | // Extract autogenerated method of Line 40 | func (p *Line) Extract(line []byte) (bool, error) { 41 | var err error 42 | var hiddenRest []byte 43 | var pos int 44 | var tmp []byte 45 | var tmpUint uint64 46 | p.rest = line 47 | 48 | // Checks if the rest starts with '[' symbol and pass it 49 | if len(p.rest) > 0 && p.rest[0] == '[' { 50 | p.rest = p.rest[1:] 51 | } else { 52 | return false, nil 53 | } 54 | 55 | // Put data before ']' into Time 56 | if pos = bytes.IndexByte(p.rest, ']'); pos >= 0 { 57 | p.Time = p.rest[:pos] 58 | 59 | p.rest = p.rest[pos+1:] 60 | } else { 61 | return false, nil 62 | } 63 | 64 | // Checks if the rest starts with " FETCH " and pass it 65 | if bytes.HasPrefix(p.rest, spaceFETCHSpace) { 66 | p.rest = p.rest[len(spaceFETCHSpace):] 67 | } else { 68 | return false, nil 69 | } 70 | 71 | // Checks if the rest starts with "first[" and pass it 72 | if bytes.HasPrefix(p.rest, firstLsbrck) { 73 | p.rest = p.rest[len(firstLsbrck):] 74 | } else { 75 | return false, nil 76 | } 77 | 78 | // Put data before ']' into First 79 | if pos = bytes.IndexByte(p.rest, ']'); pos >= 0 { 80 | tmp = p.rest[:pos] 81 | if tmpUint, err = strconv.ParseUint(*(*string)(unsafe.Pointer(&tmp)), 10, 8); err != nil { 82 | return false, fmt.Errorf("Error parsing \033[1m%s\033[0m value as uint8 for field \033[1mFirst\033[0m: %s", 
string(tmp), err) 83 | } 84 | p.First = uint8(tmpUint) 85 | 86 | p.rest = p.rest[pos+1:] 87 | } else { 88 | return false, nil 89 | } 90 | 91 | // Checks if the rest starts with " format[" and pass it 92 | if bytes.HasPrefix(p.rest, spaceFormatLsbrck) { 93 | p.rest = p.rest[len(spaceFormatLsbrck):] 94 | } else { 95 | return false, nil 96 | } 97 | 98 | // Put data before ']' into Format 99 | if pos = bytes.IndexByte(p.rest, ']'); pos >= 0 { 100 | p.Format = p.rest[:pos] 101 | 102 | p.rest = p.rest[pos+1:] 103 | } else { 104 | return false, nil 105 | } 106 | hiddenRest = p.rest 107 | 108 | // Checks if the rest starts with " hidden[" and pass it 109 | if bytes.HasPrefix(hiddenRest, spaceHiddenLsbrck) { 110 | hiddenRest = hiddenRest[len(spaceHiddenLsbrck):] 111 | } else { 112 | p.Hidden.Valid = false 113 | goto hiddenLabel 114 | } 115 | 116 | // Put data before ']' into Hidden.Value 117 | if pos = bytes.IndexByte(hiddenRest, ']'); pos >= 0 { 118 | tmp = hiddenRest[:pos] 119 | if tmpUint, err = strconv.ParseUint(*(*string)(unsafe.Pointer(&tmp)), 10, 8); err != nil { 120 | return false, fmt.Errorf("Error parsing \033[1m%s\033[0m value as uint8 for field \033[1mHidden.Value\033[0m: %s", string(tmp), err) 121 | } 122 | p.Hidden.Value = uint8(tmpUint) 123 | 124 | hiddenRest = hiddenRest[pos+1:] 125 | } else { 126 | p.Hidden.Valid = false 127 | goto hiddenLabel 128 | } 129 | p.Hidden.Valid = true 130 | p.rest = hiddenRest 131 | hiddenLabel: 132 | 133 | // Checks if the rest starts with " user_agent[" and pass it 134 | if bytes.HasPrefix(p.rest, spaceUserUscoreAgentLsbrck) { 135 | p.rest = p.rest[len(spaceUserUscoreAgentLsbrck):] 136 | } else { 137 | return false, nil 138 | } 139 | 140 | // Put data before ']' into UserAgent 141 | if pos = bytes.IndexByte(p.rest, ']'); pos >= 0 { 142 | p.UserAgent = p.rest[:pos] 143 | 144 | p.rest = p.rest[pos+1:] 145 | } else { 146 | return false, nil 147 | } 148 | 149 | // Looking for "country[" and then pass it 150 | pos = 
bytes.Index(p.rest, countryLsbrck) 151 | if pos >= 0 { 152 | p.rest = p.rest[pos+len(countryLsbrck):] 153 | } else { 154 | return false, nil 155 | } 156 | 157 | // Put data before ']' into Country 158 | if pos = bytes.IndexByte(p.rest, ']'); pos >= 0 { 159 | p.Country = p.rest[:pos] 160 | 161 | p.rest = p.rest[pos+1:] 162 | } else { 163 | return false, nil 164 | } 165 | 166 | return true, nil 167 | } 168 | 169 | // GetHiddenValue retrieves optional value for HiddenValue.Name 170 | func (p *Line) GetHiddenValue() (res uint8) { 171 | if !p.Hidden.Valid { 172 | return 173 | } 174 | return p.Hidden.Value 175 | } 176 | ``` 177 | -------------------------------------------------------------------------------- /benchmarker.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sirkon/ldetool/27cd835afb0cad80a34924abc91c52575783c813/benchmarker.7z -------------------------------------------------------------------------------- /benchmarking/easy.ragel: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // Easy based parsing 4 | type Easy struct { 5 | Time []byte 6 | UID []byte 7 | UA []byte 8 | Geo struct { 9 | Valid bool 10 | Lat []byte 11 | Lon []byte 12 | } 13 | Activity []byte 14 | } 15 | 16 | %% machine easy; 17 | %% write data; 18 | 19 | // Extract extracts field from 20 | func (r *Easy) Extract(data []byte) (ok bool, error error) { 21 | cs, p, pe := 0, 0, len(data) 22 | var pos = 0 23 | r.Geo.Valid = false 24 | 25 | %%{ 26 | action shot { pos = p + 1 } 27 | action take_time { r.Time = data[pos:p+1] } 28 | action take_uid { r.UID = data[pos:p+1] } 29 | action take_ua { r.UA = data[pos:p+1] } 30 | action take_lat { r.Geo.Lat = data[pos:p+1] } 31 | action take_lon { r.Geo.Lon = data[pos:p+1] } 32 | action take_act { r.Activity = data[pos:p+1] } 33 | action set_geo { r.Geo.Valid = true } 34 | 35 | ns = (any -- " ")*; 36 | main := 37 | ns " "@shot ((any 
-- "]")*)@take_time "] PRESENCE uid="@shot 38 | ns@take_uid " ua='"@shot ((any -- "'")*)@take_ua "' "@shot 39 | ( 40 | "Geo={Lat: "@set_geo@shot ((any -- ",")*)@take_lat ", Lon: "@shot ((any -- "}")*)@take_lon "} "@shot 41 | )? 42 | "Activity="@shot (any*)@take_act 43 | ; 44 | write init; 45 | write exec; 46 | }%% 47 | return true, nil 48 | } -------------------------------------------------------------------------------- /benchmarking/easy_floats.ragel: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | 4 | import ( 5 | "unsafe" 6 | "strconv" 7 | ) 8 | 9 | // EasyFloat based parsing 10 | type EasyFloat struct { 11 | Time []byte 12 | UID []byte 13 | UA []byte 14 | Geo struct { 15 | Valid bool 16 | Lat float64 17 | Lon float64 18 | } 19 | Activity uint8 20 | } 21 | 22 | %% machine easyfloats; 23 | %% write data; 24 | 25 | // Extract extracts field from 26 | func (r *EasyFloat) Extract(data []byte) (ok bool, err error) { 27 | cs, p, pe := 0, 0, len(data) 28 | var pos = 0 29 | r.Geo.Valid = false 30 | var tmpFloat float64 31 | var tmpUint uint64 32 | var tmp []byte 33 | 34 | %%{ 35 | action shot { pos = p + 1 } 36 | action take_time { r.Time = data[pos:p+1] } 37 | action take_uid { r.UID = data[pos:p+1] } 38 | action take_ua { r.UA = data[pos:p+1] } 39 | action tmp_float { 40 | tmp = data[pos:p+1] 41 | if tmpFloat, err = strconv.ParseFloat(*(*string)(unsafe.Pointer(&tmp)), 64); err != nil { 42 | return false, err 43 | } 44 | } 45 | action take_lat { r.Geo.Lat = tmpFloat } 46 | action take_lon { r.Geo.Lon = tmpFloat } 47 | action take_act { 48 | tmp = data[pos:p+1] 49 | if tmpUint, err = strconv.ParseUint(*(*string)(unsafe.Pointer(&tmp)), 10, 8); err != nil { 50 | return false, err 51 | } 52 | r.Activity = uint8(tmpUint) 53 | } 54 | action set_geo { r.Geo.Valid = true } 55 | 56 | ns = (any -- " ")*; 57 | main := 58 | ns " "@shot ((any -- "]")*)@take_time "] PRESENCE uid="@shot 59 | ns@take_uid " ua='"@shot ((any -- 
"'")*)@take_ua "' "@shot 60 | ( 61 | "Geo={Lat: "@set_geo@shot ((any -- ",")*)@tmp_float@take_lat ", Lon: "@shot ((any -- "}")*)@tmp_float@take_lon "} "@shot 62 | )? 63 | "Activity="@shot (any*)@take_act 64 | ; 65 | write init; 66 | write exec; 67 | }%% 68 | return true, nil 69 | } 70 | -------------------------------------------------------------------------------- /benchmarking/harder_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | "bytes" 7 | 8 | "regexp" 9 | 10 | "strconv" 11 | 12 | "github.com/stretchr/testify/require" 13 | ) 14 | 15 | var sampleLines = []string{ 16 | "[22653 22.09.2016 23:53:51] I MCHAT-ST txn_finalize(): TXN 0x357f4a000[674459048.ModAnketa() " + 17 | "reqid '1474577512273-1474577630:51#modChatAlpha.674459048@chat.agent' from " + 18 | "3:666366008(666366008@uin.icq) orgn 51:1474577512273-1474577630:51#modChatAlpha.674459048@chat.agent:]: " + 19 | "DONE (200[]) FLAGS[set:.......J.., unset:..........] " + 20 | "FIELDS[changed:.............] ank_ver[{6333262700747542897=22.09.16-23:53:51-189387121}] " + 21 | "list_ver[{6333246817957146840=22.09.16-22:52:13-188051672}] " + 22 | "name[🔥👑الصياد 👑خط احمر🚫❌🔫💣🔥] stamp[AoLBzdsoJtJ1LI6-RgY] regions[KW] " + 23 | "flags[0x10b(PLC....J..)] created[1474573933=22.09.16-22:52:13] creator[3:666366008] " + 24 | "avatars_lastcheck[1474577584] cavatar_lastmod[1474575351] " + 25 | "origin[0::] abuse_drugs[0] abuse_spam[0] abuse_porno[0] abuse_unknown[0] abuse_unknown[0]", 26 | "[22653 22.09.2016 23:53:51] I MCHAT-ST txn_finalize(): TXN 0x357f4a000[674459048.ModAnketa() " + 27 | "reqid '1474577512273-1474577630:51#modChatAlpha.674459048@chat.agent' from " + 28 | "3:666366008(666366008@uin.icq) orgn 51:1474577512273-1474577630:51#modChatAlpha.674459048@chat.agent:]: " + 29 | "DONE (200[]) FLAGS[set:.......J.., unset:..........] " + 30 | "FIELDS[changed:.............] 
ank_ver[{6333262700747542897=22.09.16-23:53:51-189387121}] " + 31 | "list_ver[{6333246817957146840=22.09.16-22:52:13-188051672}] " + 32 | "name[🔥👑الصياد 👑خط احمر🚫❌🔫💣🔥] stamp[AoLBzdsoJtJ1LI6-RgY] " + 33 | "flags[0x10b(PLC....J..)] created[1474573933=22.09.16-22:52:13] creator[3:666366008] " + 34 | "avatars_lastcheck[1474577584] cavatar_lastmod[1474575351] " + 35 | "origin[0::] abuse_drugs[0] abuse_spam[0] abuse_porno[0] abuse_unknown[0] abuse_unknown[0]", 36 | "[22653 22.09.2016 14:30:16] I MCHAT-ST txn_finalize(): TXN 0x1d2c42000[674451730.ModAnketa() " + 37 | "reqid '80405607-1474543815:1#modChatAlpha.674451730@chat.agent' " + 38 | "from 3:700337623(700337623@uin.icq) orgn 1:80405607-1474543815:1#modChatAlpha.674451730@chat.agent:]: " + 39 | "DONE (200[]) FLAGS[set:.........., unset:..........] FIELDS[changed:N............] " + 40 | "ank_ver[{6333117466407183233=22.09.16-14:30:16-168141697}] " + 41 | "list_ver[{6333107832793820511=22.09.16-13:52:53-166423903}] name[Việt Nam Zalo] " + 42 | "about[người Việt Nam] rules[ nói chuyện và chat video] " + 43 | "stamp[AoLBzaISFnNsHgQiXzQ] regions[US] flags[0x20b(PLC.....R.)] " + 44 | "created[1474541573=22.09.16-13:52:53] creator[3:700337623] " + 45 | "avatars_lastcheck[1474541573] origin[0::] " + 46 | "abuse_drugs[0] abuse_spam[0] abuse_porno[0] abuse_unknown[0] abuse_unknown[0]", 47 | `[3988519 30.05.2017 16:41:31] W MCHAT-ST txn_finalize(): TXN 0x1234d000[2000023207.ModAnketa() reqid ` + 48 | `'2837303925-1496151691:13#modChatAlpha.2000023207@chat.agent' from 3:1999585731(1999585731@uin.icq) ` + 49 | `orgn 13:2837303925-1496151691:13#modChatAlpha.2000023207@chat.agent:]: DONE (200[]) ` + 50 | `FLAGS[set:..........., unset:...........] FIELDS[changed:..............] 
` + 51 | `ank_ver[{6425922582700258264=30.05.17-16:41:31-160728}] name[Cccc] about[Jdjdjdjdjjdr] nick[wwww] ` + 52 | `stamp[Aoe5190nWD89Ef-RCGQ] regions[RU] flags[0xb(PLC.......O)] ` + 53 | `created[1496151654=30.05.17-16:40:54] creator[3:1999585731] avatars_lastcheck[1496151655] ` + 54 | `cavatar_lastmod[1496151655] origin[0::] abuse drugs[0] abuse spam[0] abuse porno[0]` + 55 | `abuse violation[0] abuse other[0]`, 56 | } 57 | var lines [][]byte 58 | var names = [][]byte{ 59 | []byte("🔥👑الصياد 👑خط احمر🚫❌🔫💣🔥"), 60 | []byte("🔥👑الصياد 👑خط احمر🚫❌🔫💣🔥"), 61 | []byte("Việt Nam Zalo"), 62 | []byte("Cccc"), 63 | } 64 | var avatarsLastcheck = []int64{ 65 | 1474577584, 66 | 1474577584, 67 | 1474541573, 68 | 1496151655, 69 | } 70 | var avatarsLastcheckLit = [][]byte{} 71 | 72 | func init() { 73 | totalLength := 0 74 | for _, line := range sampleLines { 75 | totalLength += len(line) 76 | } 77 | buf := make([]byte, totalLength) 78 | offset := 0 79 | for _, line := range sampleLines { 80 | copy(buf[offset:], line) 81 | next := offset + len(line) 82 | lines = append(lines, buf[offset:next]) 83 | offset = next 84 | } 85 | for _, num := range avatarsLastcheck { 86 | data := strconv.FormatInt(num, 10) 87 | avatarsLastcheckLit = append(avatarsLastcheckLit, []byte(data)) 88 | } 89 | } 90 | 91 | var ldeParser = &CRMod{} 92 | 93 | func BenchmarkLDEComplex(b *testing.B) { 94 | for i := 0; i < b.N; i++ { 95 | for j, line := range lines { 96 | if ok, err := ldeParser.Extract(line); !ok { 97 | if err != nil { 98 | b.Fatalf("%s on parsing >>\033[1m%s\033[0m", err, string(line)) 99 | } 100 | require.NotNil(b, err) 101 | } 102 | if !bytes.Equal(names[j], ldeParser.Name) { 103 | require.Equal(b, string(names[j]), string(ldeParser.Name)) 104 | } 105 | if avatarsLastcheck[j] != ldeParser.GetAvatarLastCheckValue() { 106 | require.Equal(b, avatarsLastcheck[j], ldeParser.AvatarLastCheck.Value) 107 | } 108 | } 109 | } 110 | } 111 | 112 | var regexComplex = regexp.MustCompile( 113 | `` + 114 | `\[\S* 
(?P