├── .gitignore ├── go.mod ├── go.sum ├── example └── main.go ├── LICENSE ├── README.md └── drain.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/faceair/drain 2 | 3 | go 1.17 4 | 5 | require github.com/hashicorp/golang-lru v0.5.4 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= 2 | github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= 3 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/faceair/drain" 7 | ) 8 | 9 | func main() { 10 | logger := drain.New(drain.DefaultConfig()) 11 | 12 | for _, line := range []string{ 13 | "connected to 10.0.0.1", 14 | "connected to 10.0.0.2", 15 | "connected to 10.0.0.3", 16 | "Hex number 0xDEADBEAF", 17 | "Hex number 0x10000", 18 | "user davidoh logged in", 19 | "user eranr logged in", 20 | } { 21 | logger.Train(line) 22 | } 23 | 24 | for _, cluster := range logger.Clusters() { 25 | println(cluster.String()) 26 | } 27 | 28 | cluster := logger.Match("user faceair logged in") 29 | if cluster == nil { 30 | println("no match") 31 | } else { 32 | fmt.Printf("cluster matched: %s", cluster.String()) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 faceair 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Drain 2 | 3 | > This project is a Go port of the original [Drain3](https://github.com/IBM/Drain3) project. 4 | 5 | Drain is an online log template miner that can extract templates (clusters) from a stream of log messages in a timely manner. It employs a parse tree with fixed depth to guide the log group search process, which effectively avoids constructing a very deep and unbalanced tree. 
6 | 7 | ## Example 8 | 9 | ```go 10 | package main 11 | 12 | import ( 13 | "fmt" 14 | 15 | "github.com/faceair/drain" 16 | ) 17 | 18 | func main() { 19 | logger := drain.New(drain.DefaultConfig()) 20 | 21 | for _, line := range []string{ 22 | "connected to 10.0.0.1", 23 | "connected to 10.0.0.2", 24 | "connected to 10.0.0.3", 25 | "Hex number 0xDEADBEAF", 26 | "Hex number 0x10000", 27 | "user davidoh logged in", 28 | "user eranr logged in", 29 | } { 30 | logger.Train(line) 31 | } 32 | 33 | for _, cluster := range logger.Clusters() { 34 | println(cluster.String()) 35 | } 36 | 37 | cluster := logger.Match("user faceair logged in") 38 | if cluster == nil { 39 | println("no match") 40 | } else { 41 | fmt.Printf("cluster matched: %s", cluster.String()) 42 | } 43 | } 44 | ``` 45 | 46 | Output: 47 | ``` 48 | id={1} : size={3} : connected to <*> 49 | id={2} : size={2} : Hex number <*> 50 | id={3} : size={2} : user <*> logged in 51 | cluster matched: id={3} : size={2} : user <*> logged in 52 | ``` 53 | 54 | ## LICENSE 55 | 56 | [MIT](LICENSE) -------------------------------------------------------------------------------- /drain.go: -------------------------------------------------------------------------------- 1 | package drain 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | "strings" 8 | "unicode" 9 | 10 | "github.com/hashicorp/golang-lru/simplelru" 11 | ) 12 | 13 | type Config struct { 14 | maxNodeDepth int 15 | LogClusterDepth int 16 | SimTh float64 17 | MaxChildren int 18 | ExtraDelimiters []string 19 | MaxClusters int 20 | ParamString string 21 | } 22 | 23 | type LogCluster struct { 24 | logTemplateTokens []string 25 | id int 26 | size int 27 | } 28 | 29 | func (c *LogCluster) getTemplate() string { 30 | return strings.Join(c.logTemplateTokens, " ") 31 | } 32 | func (c *LogCluster) String() string { 33 | return fmt.Sprintf("id={%d} : size={%d} : %s", c.id, c.size, c.getTemplate()) 34 | } 35 | 36 | func createLogClusterCache(maxSize int) *LogClusterCache { 37 
| if maxSize == 0 { 38 | maxSize = math.MaxInt 39 | } 40 | cache, _ := simplelru.NewLRU(maxSize, nil) 41 | return &LogClusterCache{ 42 | cache: cache, 43 | } 44 | } 45 | 46 | type LogClusterCache struct { 47 | cache simplelru.LRUCache 48 | } 49 | 50 | func (c *LogClusterCache) Values() []*LogCluster { 51 | values := make([]*LogCluster, 0) 52 | for _, key := range c.cache.Keys() { 53 | if value, ok := c.cache.Peek(key); ok { 54 | values = append(values, value.(*LogCluster)) 55 | } 56 | } 57 | return values 58 | } 59 | 60 | func (c *LogClusterCache) Set(key int, cluster *LogCluster) { 61 | c.cache.Add(key, cluster) 62 | } 63 | 64 | func (c *LogClusterCache) Get(key int) *LogCluster { 65 | cluster, ok := c.cache.Get(key) 66 | if !ok { 67 | return nil 68 | } 69 | return cluster.(*LogCluster) 70 | } 71 | 72 | func createNode() *Node { 73 | return &Node{ 74 | keyToChildNode: make(map[string]*Node), 75 | clusterIDs: make([]int, 0), 76 | } 77 | } 78 | 79 | type Node struct { 80 | keyToChildNode map[string]*Node 81 | clusterIDs []int 82 | } 83 | 84 | func DefaultConfig() *Config { 85 | return &Config{ 86 | LogClusterDepth: 4, 87 | SimTh: 0.4, 88 | MaxChildren: 100, 89 | ParamString: "<*>", 90 | } 91 | } 92 | 93 | func New(config *Config) *Drain { 94 | if config.LogClusterDepth < 3 { 95 | panic("depth argument must be at least 3") 96 | } 97 | config.maxNodeDepth = config.LogClusterDepth - 2 98 | 99 | d := &Drain{ 100 | config: config, 101 | rootNode: createNode(), 102 | idToCluster: createLogClusterCache(config.MaxClusters), 103 | } 104 | return d 105 | } 106 | 107 | type Drain struct { 108 | config *Config 109 | rootNode *Node 110 | idToCluster *LogClusterCache 111 | clustersCounter int 112 | } 113 | 114 | func (d *Drain) Clusters() []*LogCluster { 115 | return d.idToCluster.Values() 116 | } 117 | 118 | func (d *Drain) Train(content string) *LogCluster { 119 | contentTokens := d.getContentAsTokens(content) 120 | 121 | matchCluster := d.treeSearch(d.rootNode, contentTokens, 
d.config.SimTh, false) 122 | // Match no existing log cluster 123 | if matchCluster == nil { 124 | d.clustersCounter++ 125 | clusterID := d.clustersCounter 126 | matchCluster = &LogCluster{ 127 | logTemplateTokens: contentTokens, 128 | id: clusterID, 129 | size: 1, 130 | } 131 | d.idToCluster.Set(clusterID, matchCluster) 132 | d.addSeqToPrefixTree(d.rootNode, matchCluster) 133 | } else { 134 | newTemplateTokens := d.createTemplate(contentTokens, matchCluster.logTemplateTokens) 135 | matchCluster.logTemplateTokens = newTemplateTokens 136 | matchCluster.size++ 137 | // Touch cluster to update its state in the cache. 138 | d.idToCluster.Get(matchCluster.id) 139 | } 140 | return matchCluster 141 | } 142 | 143 | // Match against an already existing cluster. Match shall be perfect (sim_th=1.0). New cluster will not be created as a result of this call, nor any cluster modifications. 144 | func (d *Drain) Match(content string) *LogCluster { 145 | contentTokens := d.getContentAsTokens(content) 146 | matchCluster := d.treeSearch(d.rootNode, contentTokens, 1.0, true) 147 | return matchCluster 148 | } 149 | 150 | func (d *Drain) getContentAsTokens(content string) []string { 151 | content = strings.TrimSpace(content) 152 | for _, extraDelimiter := range d.config.ExtraDelimiters { 153 | content = strings.Replace(content, extraDelimiter, " ", -1) 154 | } 155 | return strings.Split(content, " ") 156 | } 157 | 158 | func (d *Drain) treeSearch(rootNode *Node, tokens []string, simTh float64, includeParams bool) *LogCluster { 159 | tokenCount := len(tokens) 160 | 161 | // at first level, children are grouped by token (word) count 162 | curNode, ok := rootNode.keyToChildNode[strconv.Itoa(tokenCount)] 163 | 164 | // no template with same token count yet 165 | if !ok { 166 | return nil 167 | } 168 | 169 | // handle case of empty log string - return the single cluster in that group 170 | if tokenCount == 0 { 171 | return d.idToCluster.Get(curNode.clusterIDs[0]) 172 | } 173 | 174 | // find 
the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth) 175 | curNodeDepth := 1 176 | for _, token := range tokens { 177 | // at max depth 178 | if curNodeDepth >= d.config.maxNodeDepth { 179 | break 180 | } 181 | 182 | // this is last token 183 | if curNodeDepth == tokenCount { 184 | break 185 | } 186 | 187 | keyToChildNode := curNode.keyToChildNode 188 | curNode, ok = keyToChildNode[token] 189 | if !ok { // no exact next token exist, try wildcard node 190 | curNode, ok = keyToChildNode[d.config.ParamString] 191 | } 192 | if !ok { // no wildcard node exist 193 | return nil 194 | } 195 | curNodeDepth++ 196 | } 197 | 198 | // get best match among all clusters with same prefix, or None if no match is above sim_th 199 | cluster := d.fastMatch(curNode.clusterIDs, tokens, simTh, includeParams) 200 | return cluster 201 | } 202 | 203 | // fastMatch Find the best match for a log message (represented as tokens) versus a list of clusters 204 | func (d *Drain) fastMatch(clusterIDs []int, tokens []string, simTh float64, includeParams bool) *LogCluster { 205 | var matchCluster, maxCluster *LogCluster 206 | 207 | maxSim := -1.0 208 | maxParamCount := -1 209 | for _, clusterID := range clusterIDs { 210 | // Try to retrieve cluster from cache with bypassing eviction 211 | // algorithm as we are only testing candidates for a match. 
212 | cluster := d.idToCluster.Get(clusterID) 213 | if cluster == nil { 214 | continue 215 | } 216 | curSim, paramCount := d.getSeqDistance(cluster.logTemplateTokens, tokens, includeParams) 217 | if curSim > maxSim || (curSim == maxSim && paramCount > maxParamCount) { 218 | maxSim = curSim 219 | maxParamCount = paramCount 220 | maxCluster = cluster 221 | } 222 | } 223 | if maxSim >= simTh { 224 | matchCluster = maxCluster 225 | } 226 | return matchCluster 227 | } 228 | 229 | func (d *Drain) getSeqDistance(seq1, seq2 []string, includeParams bool) (float64, int) { 230 | if len(seq1) != len(seq2) { 231 | panic("seq1 seq2 be of same length") 232 | } 233 | 234 | simTokens := 0 235 | paramCount := 0 236 | for i := range seq1 { 237 | token1 := seq1[i] 238 | token2 := seq2[i] 239 | if token1 == d.config.ParamString { 240 | paramCount++ 241 | } else if token1 == token2 { 242 | simTokens++ 243 | } 244 | } 245 | if includeParams { 246 | simTokens += paramCount 247 | } 248 | retVal := float64(simTokens) / float64(len(seq1)) 249 | return retVal, paramCount 250 | } 251 | 252 | func (d *Drain) addSeqToPrefixTree(rootNode *Node, cluster *LogCluster) { 253 | tokenCount := len(cluster.logTemplateTokens) 254 | tokenCountStr := strconv.Itoa(tokenCount) 255 | 256 | firstLayerNode, ok := rootNode.keyToChildNode[tokenCountStr] 257 | if !ok { 258 | firstLayerNode = createNode() 259 | rootNode.keyToChildNode[tokenCountStr] = firstLayerNode 260 | } 261 | curNode := firstLayerNode 262 | 263 | // handle case of empty log string 264 | if tokenCount == 0 { 265 | curNode.clusterIDs = append(curNode.clusterIDs, cluster.id) 266 | return 267 | } 268 | 269 | currentDepth := 1 270 | for _, token := range cluster.logTemplateTokens { 271 | // if at max depth or this is last token in template - add current log cluster to the leaf node 272 | if (currentDepth >= d.config.maxNodeDepth) || currentDepth >= tokenCount { 273 | // clean up stale clusters before adding a new one. 
274 | newClusterIDs := make([]int, 0, len(curNode.clusterIDs)) 275 | for _, clusterID := range curNode.clusterIDs { 276 | if d.idToCluster.Get(clusterID) != nil { 277 | newClusterIDs = append(newClusterIDs, clusterID) 278 | } 279 | } 280 | newClusterIDs = append(newClusterIDs, cluster.id) 281 | curNode.clusterIDs = newClusterIDs 282 | break 283 | } 284 | 285 | // if token not matched in this layer of existing tree. 286 | if _, ok = curNode.keyToChildNode[token]; !ok { 287 | // if token not matched in this layer of existing tree. 288 | if !d.hasNumbers(token) { 289 | if _, ok = curNode.keyToChildNode[d.config.ParamString]; ok { 290 | if len(curNode.keyToChildNode) < d.config.MaxChildren { 291 | newNode := createNode() 292 | curNode.keyToChildNode[token] = newNode 293 | curNode = newNode 294 | } else { 295 | curNode = curNode.keyToChildNode[d.config.ParamString] 296 | } 297 | } else { 298 | if len(curNode.keyToChildNode)+1 < d.config.MaxChildren { 299 | newNode := createNode() 300 | curNode.keyToChildNode[token] = newNode 301 | curNode = newNode 302 | } else if len(curNode.keyToChildNode)+1 == d.config.MaxChildren { 303 | newNode := createNode() 304 | curNode.keyToChildNode[d.config.ParamString] = newNode 305 | curNode = newNode 306 | } else { 307 | curNode = curNode.keyToChildNode[d.config.ParamString] 308 | } 309 | } 310 | } else { 311 | if _, ok = curNode.keyToChildNode[d.config.ParamString]; !ok { 312 | newNode := createNode() 313 | curNode.keyToChildNode[d.config.ParamString] = newNode 314 | curNode = newNode 315 | } else { 316 | curNode = curNode.keyToChildNode[d.config.ParamString] 317 | } 318 | } 319 | } else { 320 | // if the token is matched 321 | curNode = curNode.keyToChildNode[token] 322 | } 323 | 324 | currentDepth++ 325 | } 326 | } 327 | 328 | func (d *Drain) hasNumbers(s string) bool { 329 | for _, c := range s { 330 | if unicode.IsNumber(c) { 331 | return true 332 | } 333 | } 334 | return false 335 | } 336 | 337 | func (d *Drain) createTemplate(seq1, 
seq2 []string) []string { 338 | if len(seq1) != len(seq2) { 339 | panic("seq1 seq2 be of same length") 340 | } 341 | retVal := make([]string, len(seq2)) 342 | copy(retVal, seq2) 343 | for i := range seq1 { 344 | if seq1[i] != seq2[i] { 345 | retVal[i] = d.config.ParamString 346 | } 347 | } 348 | return retVal 349 | } 350 | --------------------------------------------------------------------------------