├── .gitignore
├── README.md
├── config_ini
    ├── drain3.ini
    └── drain3_state.bin
├── data
    ├── chinese_english_logs.csv
    ├── chinese_english_logs_parse_by_drain3.csv
    ├── chinese_english_logs_parse_by_statistic.csv
    ├── english_logs.csv
    ├── english_logs_parse_by_drain3.csv
    ├── english_logs_parse_by_statistic.csv
    ├── 解析结果与金标准对比的结果_by_drain3.xlsx
    └── 解析结果与金标准对比的结果_by_statistic.xlsx
├── drain3
    ├── __init__.py
    ├── drain.py
    ├── file_persistence.py
    ├── kafka_persistence.py
    ├── masking.py
    ├── memory_buffer_persistence.py
    ├── persistence_handler.py
    ├── redis_persistence.py
    ├── simple_profiler.py
    ├── template_miner.py
    └── template_miner_config.py
├── requirements.txt
├── src
    ├── common_config.py
    ├── drain3_examples
    │   ├── drain_bigfile_demo.py
    │   └── drain_stdin_demo.py
    ├── log_parser_by_drain3.py
    ├── log_parser_by_statistic.py
    └── tool
    │   ├── read_save_file.py
    │   ├── str_related.py
    │   ├── tokenizer.py
    │   └── tool.py
├── tests
    ├── drain3_test.ini
    ├── test_drain.py
    ├── test_masking.py
    └── test_template_miner.py
└── 日志解析_项目介绍.docx


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | src/__pycache__/*
3 | .idea/
4 | src/drain3_examples/SSH.log
5 | src/drain3_examples/SSH.tar.gz
6 | config_ini/drain3_state.bin


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # log_parser
 2 | ## 功能：
 3 | 这是一个日志解析的项目，对中文、英文和中英文混杂三种类型的日志进行解析，得到每条日志的模板、参数和该模板出现的次数。
 4 | 
 5 | 实现了对日志的流式解析，每秒处理4k+条日志。
 6 | 
 7 | ## 运行环境：
 8 | python3.7
 9 | 
10 | ## 运行方式：
11 | 对于中文、中英文混杂的日志，采用统计的方法进行解析，直接执行src/log_parser_by_statistic.py;
12 | 
13 | 对于中文、英文和中英文混杂三种类型的日志，采用Drain3的方法进行解析，直接执行src/log_parser_by_drain3.py。
14 | 
15 | src/drain3_examples/drain_stdin_demo.py 可以从输入的日志中学习到日志模板，并且通过学习的模板对实时输入的日志进行解析，得到日志中的参数
16 | 
17 | ## 更多关于本项目的说明介绍
18 | 见知乎文章《使用改进后的Drain3进行中英文日志解析》，链接https://zhuanlan.zhihu.com/p/569437314
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/config_ini/drain3.ini:
--------------------------------------------------------------------------------
 1 | [SNAPSHOT]
 2 | snapshot_interval_minutes = 10
 3 | compress_state = True
 4 | 
 5 | [MASKING]
 6 | masking = [
 7 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"},
 8 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"},
 9 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
10 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
11 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"},
12 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"},
13 |           {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"}
14 |           ]
15 | ;mask_prefix = <:
16 | ;mask_suffix = :>
17 | mask_prefix = <<
18 | mask_suffix = >>
19 | 
20 | [DRAIN]
21 | sim_th = 0.4
22 | depth = 4
23 | max_children = 100
24 | max_clusters = 1024
25 | extra_delimiters = ["_"]
26 | 
27 | [PROFILING]
28 | enabled = True
29 | report_sec = 30
30 | 


--------------------------------------------------------------------------------
/config_ini/drain3_state.bin:
--------------------------------------------------------------------------------
1 | eJztXQtz2za2/isa3zuzaSdWAb7Z2W4uRdE2Ez1YknKSbjoeWqJSbWXJI8ttsp3+94sHHwAFkLYkptpt045MkSCJ851zcL5zAFK/nd1//mZ9+690uj37tnM22ySLldolf7p9/Hn2snO2XH+8mS4fH7bp5maW3m9/Qk01tP8u+XSzWs/SYqeCdj4s7m7IF9DN20x/Wixnm3SFdkIA0M7Ner0lZ6I9v9X0YISboPY/p59vtmt6nfI85aCzo0PO/h0dyBFZzB7Qrn/CH/HOv//963/848hXNswffxfsJvdTrINAaKO7yAgMXd5jaJxcj9WXHd1+2TGBvNenBzNyLoh8SbcQ3AbetkhHz9G/Y9sfvfLbxeoqjoPO2/S2E2zWnz53nMft+ry/eJiuf0k3nztRuvllMT3IpwV3N9X87rP1rw8db7NZbzpher/ebBerj23dVCM3vV5sto/JsoNk/PnYd9DJHT48mtP5LfpMZ/MPj7qm4E8lOcjcBHezALmbc3+/XEyT7WK96nif7tPNIl0dHTtL4RQWL+6OfgdqkNF6vv012aTYGrfoukiqI9/IpkqKF9tk1XE+pqttbm+dObLCTMRj39TILEO/TZQPj4aimse+g8VpaLiePS7Th46/etgmy2W6Ofbt7B3j81cIv7ukBZVBQANxgMaH5HaZdvop0Ze3erxLN8kWqa2dIQMCfqBqC00IauiABk4uTukvO5qSj3U6nGOLhgd1UwQKrIndqnpyoKCArWo1HOkwattGj02kRurHl+v1x+XRA64iR+Og4a8VMBDngojmqsi2DS2zbcMEMxK74bGHzzp/PzloEI9WsXWbGSwWTBNKaRBExlxDn/rt0d0fGHKMTi/fgBA5U11Sd4JdxjmSmpu6Pjc+PNr6/CDzExkPyO+g6CqyGtUE2LHU5Oj2Ytegf3qDL0TJnq60jb5lF+hDgH3Wnh8d97qgd3qDGUQDPMSRz6oxF/v0uo0GYA3WjIinV82ANQgfxhbb6a6NKy40O+wl058/btaPqxni/Nt0uVyQRDHeJKuHedpWumHaXzSHg6COYB9EedopQgLEQWoGefX0bEqp8Vj7BBFGBFipITH26dF2Ra3p7ukliUpNsLT10+tuTdHfOr0gqdSkDNYJGkNNgLRO0Bhqxl7r9EYGtSa6maeXD6g1oUI5PRqtFvW/dGqhzySds/US9GkcRklF/Mj6MqmkVaMKeHo8W0V5vPaFsmy7Jn6ppzdkqTUj7On1VqvhMurpRVutxhb003MTrYbLGKeXOmg1XMY8vXig10Rb/QS7WxdtT88Y9HxGolxZcOyYVxaCLfPDo5bigmR7d6sZOpTTy4v1urVP4AStu2bsgCcIr/myY8DWJyLqWN3pJYdGzYAKTy/bMvASQUREDfuLjiM1xarTU6lZ2Hhb88pmneMfRnhbXOlh0onlvjseHts+agbuwwg1AkJNdGzm6kEzu6JOG60PhTUrM5TDxpb2M9CaJAkeptOwHxy5sxDUIH38BVN0fDEPy71++3A2Xa+26Wr74ezoPayBwzjM8FrttszmZLLcb9bzRd2M3cPi7n6Z3uTtuqPH5TLIT8JXSD9tN8nNLF0u7hbo6uTSZzdnP77MHr+gN8W7kY3hhzLuk01yd/OwxffMole+N91uFv9Ob8hqycUUifxzusJnbjePKWqzmBEU6BUboBusP7q0oZtMfyIo3pCtm5tZsqW+/q+H9erbb75pim3lpfBV8PMo2xSBkmzTsofk/O3jPVmb9s+zCLeksv3IQ49xeIkfVPk3bmngg1k3miq9e3TDfzcAUPNxe7TpjkcjvBl630+8KMabF+MQ/4nicOLGk9DDX/rRKOgB5yZynVIKduOtQ8SLJq7rRdHFZNDFX1+PeyNnSK7wLf7ocOc4kd+XHCo2cP88N6ZdqrmWe7F7MApcxVUEYCsl2AoDdhPn2gds78KwdZ/i6YQxj5kfX1VQKo75EUUo8i9HHkEpHuPPSeQRKPo9Jd8ksr4sti7D8STAG/9zGQZR7ArkV8XyNwXq/eTXgOpzop0XaFDByPfYpwB8V+ckmrjfTdRwj34je/8BQJPvON7pj3xeicVGNOlF76PYG/LnODGRyxn1WTG9d35MNUz9i3rPxXgwGL+NpO7i+j57TdQfB4JuhP4Oxk7/Bfr77rofO19xp3k9t++7TzjvHX/eZOS74773hBMnX4n7mQsrPC0eSvvZcJ68nw0non7u2pQutqkmWry3TRmH21S5K3BCNHDEXvhcW+IuMRTfHHXtqu+5QdYedjNYcSAg2GIjfpG3qtgACQO5pTde4d3/vXNc/gp+35nQ4fFJV1D/z4mv+CvgoXM8esYVosuRyEIMsYU0Eb59LQQocgt5nj1g28d/h+P+ZOCxAdSv5SKmWOAmYr6fwO8RAdwR+D0aQ4jdDRELIR1GAPjOwP/BiX2qUyoCCSP+6FIghSWWoinJ2lcKAHkpOFXVdNMWd7OxTLFfP681tYJ2sTEi/CIYR3EwziiH0xtTyhJfhZ7Tz8aVCQplAgILGEGAyYrSApcmoijAF0jgj/rjSe/J3WaIN1Q0le13C+Sb9LtqKn0/CgbO+7LHeCv0ggx8ZiQ/F0nAsFmoaRYrQQuMFkkwUTQqwXnQ05zMZSc/DNywoHMokmNHdUIiVU50NWAbYuPzRnHoe0TGKxq/ep43on6UZxQDcq3exB9QWMIxPwyGGUhZ/hI7PTroRYHjeuLblqw5wA6ajy3SJE1CnBsXiu+JMzR22cLkMvZKnN1xQFCZxP7Aj9/zMu3KS5r1iwbf7TYdjWMaMT13QkYs8RXdgeeMaJrhjofBAMUcEV4MYddYvNph7BMIlV27vOz7UYEXHSIwsxiH/g90lGOwo6NEaUNSvL4WSctQSchK2w6XxNIW4qqFuOFgfFmIizJLTm2BFyLJhpled+0j9MYhfyh33RIJhnjnHkZT037o+MR9gtCJ+r1uEEUOioLdyh1iOiZA/DG+wJ86vdFgQP2eaChGkTPIEkR6VRQRsTN77niEhnQB/BKe1jjdtDf8cBd9ztiQ8AEazbw6f3ORxcXSBgIpGXJmsJWixqVde4gpzAl4GXERQugho+v4bcSdW2wMvWGPKV0IUHnGgFXadjjGgSKzbcapEflCxuRRmOHu5WmryvXzgbW8vESTeUOZIgE/vE5y0anpj3uvPZdWAqiVD/xIbg7sxsChDekVMjnzkkrj2RiTSfQcu2PoNGd27fDp2KqSu35PISjxtB/vCQfDzNtKwl1KJRVIQrwb18/sKZC+w/qikT8YugM6KkbjSUhZChpWrx1/kI+topJoGHoDkgidU9/4TtaQ1E4b2w0mb4UNJM0RQ41YnsRuxO8DSa93KpyVDUGVlskobJbWNq6x+UtFB6nI643e9Hu1quGyJlY17eRMsWpWmDBKmEYFADgqoZEcb+Z85eg6YULFvlqhyd25P7oYF62KMPlt5Sbf7h4tLkS+fc1vdWQHKlMKuEZKujyexISaoWQf/7kaD/psZIqvvKrZ0ws8Gfk/PeDUA0u2I/UmNoOHbAlCaSeBb3An/IX2vTVvOniEq9fxZb/nyNTDa7bUIqZEdP7hW/GdeCr1Jb2sRiF/Vj0cx/GYeo6ls37XTkHnL7/Ddb534aVeq3JFMVQFqs9UeluOR0vJT3U8t9dTAVSUJ+ii58Quf82jh7fiFoLgxvZUqAUNWpphP9vZSO1N4GtMLZAj8+3UAgtXK8oz/3Gutr+HeT1Te5rT/HFudaLx7DRAf3IAY0rObIG98X1NhzuVJnYqyWh5LP3+wWMmnxBUB0wVKEDV/hCP4rnpU70K4HUQQyVwAwAr7UXgBz2HX/1XwT5bJNjF7QaTruspdt80LVXt86eJdBC9j8ZBWOtk0q3n+xORROBOzBQC507tTCEgd9L/3HSwHfpP/aOP+jMYu2+ol+UQ/qfEoZME9slhiZmk4soZ7SwhKvzor7BUp+jj+czzUqI/D4hP9g9mMo3LhVqaTHuuf6go4JV/JGodO4GqBEDjWtbqNWtV0ep17AxhF/eMcgb8X7/f19yihW0rqsVc4DA9s/1+FpvI7n6ApzQCCowjAgpLQFVL0SylaGFBw2ZvczCgRb/3AXSXnEEhOWMmbNnlNY0vzvtCTmPaNvOnpjykAKXHtazVcdaqyWkudN2BnqYXLaCCmLfBXOHwalHecXHyY6qaKq0WZX04gJnV44oT8LfO9yaAltmIrevDXuDQ9QT18CoFvJbrXTjqRdFCBZYNdtW+f/kA9573lp16HFB0HTI3kzpWdnzXsRSRY6nMPDvnWO1Ms4dA1Xem2cOg574l6nbCUbawg64vGrnj4dCPs4Utk5A7s9hwLmKPP+Reee6bYOyP4nKxkoipEA1lE9+Ny2ZKeyLWLFprI2Qx9UuYmpc2Za7x5H7iBY5PXDbl0QWP+cNkTzwrDp1R5LjPBvDtOHxD7Lr2PIqhwFLZpyQ1m7XVdtYdhABUJmxCj6w96na7ZMFXxBlfz3HfIAPui2UPPXd87YX8jDYaKQb+5VVcAEAGlXLFLV0VVizEFzURL9nPWgpAlD392M5cM4JQfwaEF+Pw2QhSz8SjhBzF4pkAOS6Sxd1qO3OBCBdNhsskewA3X6NX4rSLyX7iVw5/cRuUPMnZ+BK+vbFWZVgHoU/XepcBgzSgIZvJNAVSSJ4dbHynxt5SKEIpMgMgq/BFz7iokpXRjT/jtHc/obSf9EEuST8lD56p7ZSNAgCsHSISvItHxLbPace/n/ihV0TI87534WdW3ndiybPwkUdEzR6lKJYKnotblyiVGS8AfRMgdXPHGxKF0hmb6jCC5zmeUGkR6IspY0CNXRXc+L7HvTW2+0g2rzHvXZwxmgsEu8c+zFED/X+wDtismNNB4y9v7aeDgQ6B5AlCBM71wM/i0Mi7HGeeLmy88/gWfYhE0JBqbzDh9mbP0w+zJ7/93jDs90SvDGAfS2RrbVo72c0gX0RcrDtwR2M6X8c9n1BapwpU5g8nrUvye7m45WlhPHKZeT3iDxf93puy5gN61L5RvGPaAYvujTzW8EH2qM0oKt/JACyzjw5AEcbsamAW4nZI+WCHOfX7JDHfCYZigxqM3SIByaYsK34oPu/SG3mh79LDeYUAOJdhwHdmPMyegzofZXO6pY27Qf5UJ/9cCMJffACFTvEBP8iTp93bjAMsn+zZ/+DNJRqaiv1l2pWRSIGCJSmD1k7KMNiha5mCG5/51mRvNmmHw79WzeoTA7SqlBGeytgnmEkoT6QOKIgPCoCaJFiVtCqjq4IxmWO1F+Q1PyOXZFCkVjj0R5PY43OL/NGi8nEa/iqx88YbHUsygRLZp0YhN6C0kx4gNVYfNkI7XUDHhex5SuYhbQHKExTtR9nYE7yHzHmykojPY46fb667pHKES15985axlcZazdU34+c0z2+HOkkXuJCzxxcX+WsoRA/G0TUuKk+ymHNiJ3pTivai4NKi93RoTDqm2hwVaich+8tu/gi7Ufawm94kKp+Nd9z4Wmg/TJoM2dXgWjt58muo2rz5ZHoSyIFyGmHKrDEps62wa5a0dpLm11CppGBllMBL2lBEGhI+lBHZXkTfv1FnAdLoIZBX8lYXrZ2E8zWwxSrClciwn3F5arGYPaHQSpiYfKEJidhPKS5pkudT9XbSutdA3X2nx+u3gC4M3HNWJnuVx9sQt8we4s+zCw7I//opmnK6uHmOZvfJ+afM0eye9cfM0ejsbCL7GiC9nYT79W6ygHaiIVRlg4sTulf+tccY3vV4MJE9kYxfEbbzOsKXLHWVuDHXhlVfQ1NyuUEYjeoaocs0NSl2TUbMbICq2kDSTnYhhE4UMPPxoHIBlEEPLouj7z3RCzl0tixgcS8m0tupDLzeqZAX+kduwKk/r95mtdq2bECoegFUihyqdnLs1ztFehmD5AetKo4C4HhksyUfjWg2Rci88VMRVVlEdS6CtlMM8IFm7wxEvhfSXPoyzN6kmNX3guylAuWEprRKjQLFeJgvBSmuk596LowGfK0oi8PFHFFXBJjs5art5NwIrcq08DigFEFet2dcL3+lV9NbJbizguoLeMtD5QsTAHAVADzRa4N19gVPwDbZFFNvJ8X0RbTMH7zxSsaEV1IRx4hcp+RLmcpVS2P+jF3i2yy42RH28RJJwEc83+8Nu+iPi4ID4gfsDfJOkHftiE5m2gqpUlmnRKp941zW9aTfU4KBbwnOZY5kN0N9HXATNvJrZWdwrwsq39dTdTBcC0f/uFtVp3yq9wp6wYDvXGU2SUZUc5SLGaJKw8pyQBTwTRsaxoXLnFxZmiajjxVEiy5U9svaZzcrRa0cLxe1VQ5oumUB+QVFL1zT2YSd9cV28vXL3dlHWu1FZMjJwuHAu6al9RcQQPAVVTKdsxEcyqdzXowuhsJ3E0smxBt/n24/AXt2NdvFO+GlS6zcvSASQLJK8QJlgMUcQ+994EQRHVau/Sz4h9e+672AQrGYJJ57sY/eThbfM7XdF+UhuXr0TeoTFBTfZdlt5eWTOjCYP9VXUeJiR2/gR1fciJo1Ll7D3ptcXGTrL8djYgCXvcB6I5pI0yVLnBt/4G9PXNSslsMGlx506WxSjQBVYiHgLH3v+0kWCi5Cj3kPp+D9ymzGyMZUo52M8WhiP0tIJhliH4ky2smEeiqwBTJe9mGzbdJV3FLV5r7R96MsfvHruMndSj5FK7V/G43DoTP4mwgZJvfhtN9O4tNTdkbx94hn0JjkDNjJYdr1xkyREtYuLzZdg28CCy8h4V+BKcyFBLiwGQywuAkNo50UxgaKIkkKdzeoRmmEI3kFRk0gBpNXQMvk3rHfTmqhKtXU1umVnfXG2To0YvXhGCVUEfva4F1B87Ibn4w2PPf55Gfhajby30fhz4zdXrPtMJmKoQEW83byFBUou9RBDUP6IvrsjxPHDiWT/FqY7LdOilfqCeSRrGBs/NWlQ156Wq5pFXRIwswafzV8jw6do38ctIKiSTbJnq8VIKcIes0SL7bXLfCur2mk1R1ppI2vvJB5nwFdLBX6KD2gnhpdjcM4ywR3nYMOzkUhgPpzY/AWYMJyLrZebbZAuigo0H4eKCwQFbF3cdkbCJOhYSo7k2i2QMP+F9GLQDX5dzzla2YEfWPYE/cbHC2wp8v1+uMyLVzsw6NhzDX8c4a3gPvtyx0fzJsXTdCnDTT8M6JWKm1uKrcGOimbaPvwqOIlpwIE2CVYfOnJbIEovV2sruI46LxNbzvBZv3pc8d53K7P+4uH6fqXdPO5E6WbXxbTtH25WRZkquyo1fg74vvJPVv/+tDxNpv1phOm9+vNdrH6+OXEZauwbGpstsCVrheb7WOy7CCt/ty+ZOzqHG54aYGRVB216LO8e4asey1QDO7XaEmvtDTV8Se4zb8r6LsNpzP63bo1EMi6piX4e1kdy89Xs4XqqA208Jkgteh3W4c2Pj+di9WlpWpSXk6hy4nlMDG8R+fsswXig390eWqhzySdV8zSMDLx9FvLxkIX36daDoSp5PtUbY4hSNLMwGcaJBBn4BrWFJC9Gfi6ogIxePM5mGZtUtPItmbz/I7qDEM7s4q+JQpWsmpSGTJVzmcp/f1dXnUd+UnkkB9UVG8rXSR2FwLQhapSrzaG+GmcdbfA/HrJ9OePm/XjatbxV9t0uVx8TFfbTrxJVg/zdNMwlpqGpWFLV+ZkD1b/1LzF6rRusdKm8pEoD9SqXTHrJ7g/wwIN7kevWmCB7E+Ks6JmUpS/m7zroDuDrz2r17wFZIK1wOoEP58s1ZWi5T5mgmwLqz7bl5iwUH8CsPpTay6/mKkkqKGZVlwKX5LYzlzXczA1/F3RZlWHtm7nZt4GZAOFeUssUgNk7NUzBVlA16gKdwYFMhhZ1lRvEkOuL/YRZ4WlO1YLXNe5v18upsl2sV51vE/36WaRrr4Ay7EYMmuyw5HVApVlvU3sYeUILh5e6la3M0NPfnH58FN3ITlYLANmeb/VAv/F/bdM3H8MURU481a/xdwC2BlwRRhWAJxeLJapv5qvybfZC6MLef94paATX52/Aib6gFoMzAIF5q2Jit4F+b8fvqoqCDMmbI/2PA/VCcAh1ZxVw7IxTS3ChnLfLIK6XArO34u9Mr0wVJ2t6FttVDXd8ZC1Nj3F4+t0lo9qc3OOOWVOeJD2cKwsRj0basiLzbmRj2vFQFulRuXYZ1Esbm2zMpIP/Ci6Dpne2AAm+G5pg9NLnpa22uH/iF8RZzSq6t+XaiugXjzJZLTVSv5wNC/Nx23NImF0Woj7ZdxXNS3WfWXYSt4naLWQdOQVgHhx9wUCoeRFcFYLtHw37vE5Muq0iktcZd5UDig7Z6Z2uVYDmkpXQeYCugoVl9SnwSe782K1ve/eZd/Iny75r7AWqKjn/9g5Pz/0FQOoroEpuu+tkQ1pTTG9oIBnItgZts/SD7sVsr8DnjoDeLy0swFIQHBH23t3ibjYlrXAW0NltcNxlS+ig+qYiuzHxt+13F6KsLIjsk4SAjUPQLXasYFEO+1kLA1OwaT3c4Ddw57mgWOuT0umh4YxxcCJnJ0PZjVwZO6DtrRkRgoH9eOEDSWQtJATHBOSwwVXJIK3kig0Z68YBmNeCZV/5bZPy21tbkUDS5LsdlIZmTqp4RGOhD4tOFeLQ+NVOlpvU5RpVRRKlZnXpxqUGKWrWWe77gwX0836YT3fdsrrdvqbxS9UjmeouEm9VLW7an2aSvdTJ7uyg/vFb7uFFChCKP6abFI8HbVF11ysV63TM5t7opgVsIWcJV5sk1XHIbXSrETama83nYyQPl1YliqQcjPOkC1tuof8kpzGbienkYeZytjDUNQKDSyL+jKRTIlIrcxflMX8Ij6q2NfyogQtFul6Hhto0q0rsPD//Ajm4uWVmHhEs28yeAikZfILNnGyW8gv8sRpuJ49LtOHjr962CbLZWWka8VNWTrP/fI2aIHQP1fOP2ZCA4nOgML9HjZogUezhWRcXdjcJc8boY83aCH5xC4OQQtsOewHHK3gy2lmOtVomaZo9K6rKBp7ypOnKSlnwUl6weUAOZ6R7epkJa6F4uOwmEhSZtiakln+3cqLY0+fqBThLXk7EAQtkPRgvdkmt8u0009JhPRWj3fpJtmum+YVjzfcILlkArdEYytFMyIJpnhVPSIXEoYLLpAQkzRvdVw51PNyinV7m7IV4pKCWjBNiK0lrNUZtwrMufFTeye+s67lGUlJTI0p1hguL2TfpRP05QjZoDKWq3Iqa4Gq1q0Ie0Y0oAwIz/FasCzGNYUG7LG4LEOzlerkshQfSXEeglaq89SoqPzl8rrMTGgVMiN/T4l0ksXBKCdpL/p/OXaDxHgprIFA0AJn/TKrGVHXS5lMTqZ2Kt+ySsBV0BkkD+nmdbrtDC+CzlBRzPOhosJO4A7Ojc6La+2rTj5JMtchyTXIKMUVRg8p+DylC5xC/mvqQUjZEs+F7ay62Vl0VxCgaR6uaODMbRyLOS9/ayGKBowmqvwIfb8tSetTFq9ZaUIG/fmMJoVYM1NYtR9maCz20QEUiZK3vjVJ/DbzqI2ug69sGFUrKTlCtU0Z45n5YQ3327TnBWSM83x+XP2aLs6R7f6Sbrqffuo+/NSdrnYIxG2qi2+Y34Zie+VABlxdw+DpaWFuzLwHjZTznPRU5hYaSwDIuiRkAB4/MxLdXpafwBbyk98+nE3Xq2262n44wyeReR26+d2f56NaifknVBVNUfH/P+LvL1nnLxqp2gblZLqm4s/5Rtuibba5liaYuRhQLVyLVFITYtX/fEiRojarNfyxGB8APz7MVDwy6DuLS1FLTBINPL7oQJ3yvodbQrIFqCJpd85+Wj/wSr5b32LfhFyr6TJ5eBgldynX9Pzq/DzZTtd33dl8lsyTh3kynXHnbT/fp9WrJ/f3H85+F7oYu8KLs/EWcsK/bBx/vHr1SmjWeH+9Jb+i/xiDRd8+PAJNnb1i/52wubEzS7/j4zduMv0pvbmZPm42+QHSKj9wl3wq5i7Ib5PfDMJJdmy9mSHzy+xsk84ep8TQqNmhjmGjna4R+yfTEA/dMW6fzvoLZM/o5rsRYPW4XOaf/FH+G6SNfty5iCI7oMoOaLIDuuyAITtgyg5YsgO27AAOvZIjUtmhVHgolR5KxYdS+aEUAChFAEohgFIMFCkGilz/UgwUKQaKFANFioEixUCRYqBIMVCkGKhSDFQpBqrcCaQYqFIMVCkGqhQDVYqBKsVAlWKgSTHQpBhoUgw0+UggxUCTYqBJMdCkGGhSDDQpBroUA12KgS7FQJdioMuHQykGuhQDXYqBLsVAl2JgSDEwpBgYUgwMKQaGFANDHhOkGBhSDAwpBoYUA1OKgSnFwJRiYEoxMKUYmFIMTHlglGJgSjEwpRhYUgwsKQaWFANLioElxcCSYmBJMbDk7ECKgSXFwJZiYEsxsKUY2FIMbCkGthQDW4qBLcXAllOkGo4kJ0lAzpKAnCYBOU8CcqIE5EwJyKkSkHMlICdLQI5GHWWUo1FDGkvWiP9jkomHm+n6cbUl1B+1+v3/AS6Y9zA=


--------------------------------------------------------------------------------
/data/解析结果与金标准对比的结果_by_drain3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongdong9/log_parser/7c6bfa9a3d748a7f9eef93078940c7ce2c5446de/data/解析结果与金标准对比的结果_by_drain3.xlsx


--------------------------------------------------------------------------------
/data/解析结果与金标准对比的结果_by_statistic.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongdong9/log_parser/7c6bfa9a3d748a7f9eef93078940c7ce2c5446de/data/解析结果与金标准对比的结果_by_statistic.xlsx


--------------------------------------------------------------------------------
/drain3/__init__.py:
--------------------------------------------------------------------------------
1 | from drain3.template_miner import TemplateMiner
2 | 
3 | 


--------------------------------------------------------------------------------
/drain3/drain.py:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: MIT
  2 | # This file implements the Drain algorithm for log parsing.
  3 | # Based on https://github.com/logpai/logparser/blob/master/logparser/Drain/Drain.py by LogPAI team
  4 | 
  5 | from typing import List, Dict, Sequence
  6 | 
  7 | from cachetools import LRUCache, Cache
  8 | 
  9 | from drain3.simple_profiler import Profiler, NullProfiler
 10 | from src.tool.tokenizer import get_token_list
 11 | from src.common_config import IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, SUBSTR_DETAIL_LIST_KEY, TOKEN_LIST_KEY
 12 | 
 13 | class LogCluster:
 14 |     __slots__ = ["log_template_tokens", "cluster_id", "size"]
 15 | 
 16 |     def __init__(self, log_template_tokens: list, cluster_id: int):
 17 |         """
 18 |         yd。功能：
 19 |         :param log_template_tokens: 即经过分词后的token_list
 20 |         :param cluster_id:
 21 |         """
 22 |         self.log_template_tokens = tuple(log_template_tokens)
 23 |         self.cluster_id = cluster_id
 24 |         self.size = 1 #yd。用于统计当前cluster匹配的日志条数
 25 | 
 26 |     def get_template(self):
 27 |         return ' '.join(self.log_template_tokens)
 28 | 
 29 |     def __str__(self):
 30 |         return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}"
 31 | 
 32 | 
 33 | class LogClusterCache(LRUCache):
 34 |     """
 35 |     Least Recently Used (LRU) cache which allows callers to conditionally skip
 36 |     cache eviction algorithm when accessing elements.
 37 |     """
 38 | 
 39 |     def __missing__(self, key):
 40 |         return None
 41 | 
 42 |     def get(self, key):
 43 |         """
 44 |         Returns the value of the item with the specified key without updating
 45 |         the cache eviction algorithm.
 46 |         """
 47 |         return Cache.__getitem__(self, key)
 48 | 
 49 | 
 50 | class Node:
 51 |     __slots__ = ["key_to_child_node", "cluster_ids"]
 52 | 
 53 |     def __init__(self):
 54 |         # yd。key_to_child_node这个字典在root_node这一层的格式为{str(token_count): Node() }
 55 |         #                              在子节点这一层的格式为{ token/self.param: Node() }
 56 |         self.key_to_child_node: Dict[str, Node] = {}
 57 |         self.cluster_ids: List[int] = []
 58 | 
 59 | 
 60 | class Drain:
 61 |     def __init__(self,
 62 |                  depth=4,
 63 |                  sim_th=0.4,
 64 |                  max_children=100,
 65 |                  max_clusters=None,
 66 |                  extra_delimiters=(),
 67 |                  profiler: Profiler = NullProfiler(),
 68 |                  param_str="<*>",
 69 |                  parametrize_numeric_tokens=True):
 70 |         """
 71 |         Create a new Drain instance.
 72 | 
 73 |         :param depth: max depth levels of log clusters. Minimum is 2.
 74 |             For example, for depth==4, Root is considered depth level 1.
 75 |             Token count is considered depth level 2.
 76 |             First log token is considered depth level 3.
 77 |             Log clusters below first token node are considered depth level 4.
 78 |         :param sim_th: similarity threshold - if percentage of similar tokens for a log message is below this
 79 |             number, a new log cluster will be created.
 80 |         :param max_children: max number of children of an internal node
 81 |         :param max_clusters: max number of tracked clusters (unlimited by default).
 82 |             When this number is reached, model starts replacing old clusters
 83 |             with a new ones according to the LRU policy.
 84 |         :param extra_delimiters: delimiters to apply when splitting log message into words (in addition to whitespace).
 85 |         :param parametrize_numeric_tokens: whether to treat tokens that contains at least one digit
 86 |             as template parameters.
 87 |         """
 88 |         if depth < 3:
 89 |             raise ValueError("depth argument must be at least 3")
 90 | 
 91 |         self.log_cluster_depth = depth
 92 |         self.max_node_depth = depth - 2  # max depth of a prefix tree node, starting from zero
 93 |         self.sim_th = sim_th #yd。similarity threshold
 94 |         self.max_children = max_children
 95 |         self.root_node = Node()
 96 |         self.profiler = profiler
 97 |         self.extra_delimiters = extra_delimiters
 98 |         self.max_clusters = max_clusters
 99 |         self.param_str = param_str
100 |         self.parametrize_numeric_tokens = parametrize_numeric_tokens
101 | 
102 |         # key: int, value: LogCluster
103 |         self.id_to_cluster = {} if max_clusters is None else LogClusterCache(maxsize=max_clusters)
104 |         self.clusters_counter = 0
105 | 
106 |     @property
107 |     def clusters(self):
108 |         return self.id_to_cluster.values()
109 | 
110 |     @staticmethod
111 |     def has_numbers(s):
112 |         """
113 |         yd。功能：判断字符串s是否包含任何数字
114 |         :param s:
115 |         :return:
116 |         """
117 |         return any(char.isdigit() for char in s)
118 | 
119 |     def tree_search(self, root_node: Node, tokens: list, sim_th: float, include_params: bool):
120 |         """
121 |         yd。功能：
122 |         :param root_node:
123 |         :param tokens: 将日志内容进行分词后的token_list
124 |         :param sim_th: 即similarity threshold
125 |         :param include_params:
126 |         :return:
127 |         """
128 |         # at first level, children are grouped by token (word) count
129 |         token_count = len(tokens)
130 |         cur_node = root_node.key_to_child_node.get(str(token_count))
131 | 
132 |         # no template with same token count yet
133 |         if cur_node is None:
134 |             return None
135 | 
136 |         # handle case of empty log string - return the single cluster in that group
137 |         if token_count == 0:
138 |             return self.id_to_cluster.get(cur_node.cluster_ids[0])
139 | 
140 |         # find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth)
141 |         cur_node_depth = 1
142 | 
143 |         for token in tokens:
144 |             # at max depth
145 |             if cur_node_depth >= self.max_node_depth:
146 |                 break
147 | 
148 |             # this is last token
149 |             if cur_node_depth == token_count:
150 |                 break
151 | 
152 |             key_to_child_node = cur_node.key_to_child_node
153 |             cur_node = key_to_child_node.get(token)
154 |             if cur_node is None:  # no exact next token exist, try wildcard node
155 |                 cur_node = key_to_child_node.get(self.param_str)
156 |             if cur_node is None:  # no wildcard node exist
157 |                 return None
158 | 
159 |             cur_node_depth += 1
160 | 
161 |         # get best match among all clusters with same prefix, or None if no match is above sim_th
162 |         cluster = self.fast_match(cur_node.cluster_ids, tokens, sim_th, include_params)
163 |         return cluster
164 | 
165 |     def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster):
166 |         """
167 |         yd。功能：利用新构建的LogCluster来更新prefix_tree
168 |         :param root_node:
169 |         :param cluster: 新构建的LogCluster对象
170 |         :return:
171 |         """
172 |         # 第一步：判断token_count_str是否在root_node.key_to_child_node中，若不在则加入first_layer_node，若在里面，则获取first_layer_node
173 |         token_count = len(cluster.log_template_tokens)  # yd。获取LogCluster对象中token_list的长度，该token_list是由日志内容进分词后得到
174 |         token_count_str = str(token_count)
175 |         if token_count_str not in root_node.key_to_child_node:
176 |             first_layer_node = Node()
177 |             root_node.key_to_child_node[token_count_str] = first_layer_node
178 |         else:
179 |             first_layer_node = root_node.key_to_child_node[token_count_str]
180 | 
181 |         cur_node = first_layer_node
182 | 
183 |         # handle case of empty log string
184 |         if token_count == 0:
185 |             cur_node.cluster_ids = [cluster.cluster_id]
186 |             return
187 |         # 第二步：判断每个token/self.param是否在cur_node.key_to_child_node中，若不在，则加入；若已存在，则取出child_node
188 |         current_depth = 1  # yd。初始值为1，每处理一个token，它的值就加一
189 |         for token in cluster.log_template_tokens:  # yd。log_template_tokens是将分词得到的token_list转换为tuple后的结果
190 |             # if at max depth or this is last token in template - add current log cluster to the leaf node
191 |             if current_depth >= self.max_node_depth or current_depth >= token_count:#yd。如果是token_list中的最后一个token
192 |                 # clean up stale clusters before adding a new one.
193 |                 new_cluster_ids = []
194 |                 for cluster_id in cur_node.cluster_ids:
195 |                     if cluster_id in self.id_to_cluster:
196 |                         new_cluster_ids.append(cluster_id)
197 |                 new_cluster_ids.append(cluster.cluster_id)
198 |                 cur_node.cluster_ids = new_cluster_ids #yd。如果是叶子节点，则需要给cluster_ids赋值，非叶子节点，cluster_ids的值都为空
199 |                 break
200 | 
201 |             # if token not matched in this layer of existing tree.
202 |             if token not in cur_node.key_to_child_node:
203 |                 if self.parametrize_numeric_tokens and self.has_numbers(token):#yd。如果token中含有数字
204 |                     if self.param_str not in cur_node.key_to_child_node:
205 |                         new_node = Node()
206 |                         cur_node.key_to_child_node[self.param_str] = new_node
207 |                         cur_node = new_node
208 |                     else:
209 |                         cur_node = cur_node.key_to_child_node[self.param_str]
210 | 
211 |                 else:
212 |                     if self.param_str in cur_node.key_to_child_node:
213 |                         if len(cur_node.key_to_child_node) < self.max_children:
214 |                             new_node = Node()
215 |                             cur_node.key_to_child_node[token] = new_node
216 |                             cur_node = new_node
217 |                         else:
218 |                             cur_node = cur_node.key_to_child_node[self.param_str]
219 |                     else:
220 |                         if len(cur_node.key_to_child_node) + 1 < self.max_children:
221 |                             new_node = Node()
222 |                             cur_node.key_to_child_node[token] = new_node
223 |                             cur_node = new_node
224 |                         elif len(cur_node.key_to_child_node) + 1 == self.max_children:
225 |                             new_node = Node()
226 |                             cur_node.key_to_child_node[self.param_str] = new_node
227 |                             cur_node = new_node
228 |                         else:
229 |                             cur_node = cur_node.key_to_child_node[self.param_str]
230 | 
231 |             # if the token is matched
232 |             else:
233 |                 cur_node = cur_node.key_to_child_node[token]
234 | 
235 |             current_depth += 1
236 | 
237 |     # seq1 is a template, seq2 is the log to match
238 |     def get_seq_distance(self, seq1, seq2, include_params: bool):
239 |         """
240 |         yd。功能：计算seq1与seq2的相似度，相似度 = 公共元素的个数/ seq1的长度
241 |         :param seq1:
242 |         :param seq2:
243 |         :param include_params:
244 |         :return:
245 |         """
246 |         assert len(seq1) == len(seq2)
247 | 
248 |         # sequences are empty - full match
249 |         if len(seq1) == 0:
250 |             return 1.0, 0
251 | 
252 |         sim_tokens = 0
253 |         param_count = 0
254 | 
255 |         for token1, token2 in zip(seq1, seq2):
256 |             if token1 == self.param_str:
257 |                 param_count += 1
258 |                 continue
259 |             if token1 == token2:
260 |                 sim_tokens += 1
261 | 
262 |         if include_params:
263 |             sim_tokens += param_count
264 | 
265 |         ret_val = float(sim_tokens) / len(seq1)
266 | 
267 |         return ret_val, param_count
268 | 
269 |     def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool):
270 |         """
271 |         yd。功能：从cluster_ids对应的所有cluster中，找出cluster.log_template_tokens与tokens相似度最高的
272 |         Find the best match for a log message (represented as tokens) versus a list of clusters
273 |         :param cluster_ids: List of clusters to match against (represented by their IDs)
274 |         :param tokens: the log message, separated to tokens.
275 |         :param sim_th: minimum required similarity threshold (None will be returned in no clusters reached it)
276 |         :param include_params: consider tokens matched to wildcard parameters in similarity threshold.
277 |         :return: Best match cluster or None
278 |         """
279 |         match_cluster = None
280 | 
281 |         max_sim = -1
282 |         max_param_count = -1
283 |         max_cluster = None
284 | 
285 |         for cluster_id in cluster_ids:
286 |             # Try to retrieve cluster from cache with bypassing eviction
287 |             # algorithm as we are only testing candidates for a match.
288 |             cluster = self.id_to_cluster.get(cluster_id)
289 |             if cluster is None:
290 |                 continue
291 |             cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
292 |             if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count):
293 |                 max_sim = cur_sim
294 |                 max_param_count = param_count
295 |                 max_cluster = cluster
296 | 
297 |         if max_sim >= sim_th:
298 |             match_cluster = max_cluster
299 | 
300 |         return match_cluster
301 | 
302 |     def create_template(self, seq1, seq2):
303 |         assert len(seq1) == len(seq2)
304 |         ret_val = list(seq2)
305 | 
306 |         for i, (token1, token2) in enumerate(zip(seq1, seq2)):
307 |             if token1 != token2:
308 |                 ret_val[i] = self.param_str
309 | 
310 |         return ret_val
311 | 
312 |     def print_tree(self, file=None, max_clusters=5):
313 |         self.print_node("root", self.root_node, 0, file, max_clusters)
314 | 
315 |     def print_node(self, token, node, depth, file, max_clusters):
316 |         out_str = '\t' * depth
317 | 
318 |         if depth == 0:
319 |             out_str += f'<{token}>'
320 |         elif depth == 1:
321 |             out_str += f'<L={token}>'
322 |         else:
323 |             out_str += f'"{token}"'
324 | 
325 |         if len(node.cluster_ids) > 0:
326 |             out_str += f" (cluster_count={len(node.cluster_ids)})"
327 | 
328 |         print(out_str, file=file)
329 | 
330 |         for token, child in node.key_to_child_node.items():
331 |             self.print_node(token, child, depth + 1, file, max_clusters)
332 | 
333 |         for cid in node.cluster_ids[:max_clusters]:
334 |             cluster = self.id_to_cluster[cid]
335 |             out_str = '\t' * (depth + 1) + str(cluster)
336 |             print(out_str, file=file)
337 | 
338 |     def get_content_as_tokens_raw(self, content):
339 |         """
340 |         这是drain3最原始的分词代码，只考虑了英文，没有考虑中文的情况
341 |         :param content:
342 |         :return:
343 |         """
344 |         content = content.strip()
345 |         for delimiter in self.extra_delimiters:
346 |             content = content.replace(delimiter, " ")
347 |         content_tokens = content.split()
348 |         return content_tokens
349 | 
350 | 
351 |     def get_content_as_tokens(self, content):
352 |         """
353 |         考虑中英文混杂，纯英文两种情况
354 |         :param content:
355 |         :return:
356 |         """
357 |         content = content.strip()
358 |         is_contain_chinese, substr_type_pattern, substr_detail_list, token_list = get_token_list(content)
359 |         content_tokens = token_list
360 |         #print(f"content_tokens = {content_tokens}")
361 |         tokenize_result = {IS_CONTAIN_CHINESE_KEY : is_contain_chinese, SUBSTR_TYPE_PATTERN_KEY : substr_type_pattern,
362 |                            SUBSTR_DETAIL_LIST_KEY : substr_detail_list, TOKEN_LIST_KEY : token_list}
363 |         return content_tokens,tokenize_result
364 | 
365 | 
366 |     def add_log_message(self, content: str):
367 |         """
368 |         yd。功能：根据传入的content，获取匹配的logCluster，该LogCluster可能是先前已经存在的，也可能是需要新生成的
369 |         :param content:被正则匹配mask后的日志内容，例如"connected to <:IP:>"
370 |         :return:match_cluster：匹配的logCluster；update_type：表示更新match_cluster的原因
371 |         """
372 |         content_tokens, tokenize_result = self.get_content_as_tokens(content)  # yd。对content进行分词
373 | 
374 |         if self.profiler:
375 |             self.profiler.start_section("tree_search")
376 |         match_cluster = self.tree_search(self.root_node, content_tokens, self.sim_th, False)
377 |         if self.profiler:
378 |             self.profiler.end_section()
379 | 
380 |         # Match no existing log cluster
381 |         # yd。即没有匹配到任何已经存在的log cluster，即没有匹配到任何已经存在的log模板，此时就要新创建一个LogCluster对象
382 |         if match_cluster is None:
383 |             if self.profiler:
384 |                 self.profiler.start_section("create_cluster")
385 |             self.clusters_counter += 1
386 |             cluster_id = self.clusters_counter
387 |             match_cluster = LogCluster(content_tokens, cluster_id) #yd。构造一个新的LogCluster对象
388 |             self.id_to_cluster[cluster_id] = match_cluster
389 |             self.add_seq_to_prefix_tree(self.root_node, match_cluster) #利用新构建的match_cluster来更新prefix_tree
390 |             update_type = "cluster_created"
391 | 
392 |         # Add the new log message to the existing cluster
393 |         else:
394 |             if self.profiler:
395 |                 self.profiler.start_section("cluster_exist")
396 |             new_template_tokens = self.create_template(content_tokens, match_cluster.log_template_tokens)
397 |             if tuple(new_template_tokens) == match_cluster.log_template_tokens: #yd。如果新创建的模板与最匹配的模板相同
398 |                 update_type = "none"
399 |             else:#yd。如果新创建的模板与最新的模板不相同，则用新创建的模板来更新最匹配的模板
400 |                 match_cluster.log_template_tokens = tuple(new_template_tokens)
401 |                 update_type = "cluster_template_changed"
402 |             match_cluster.size += 1
403 |             # Touch cluster to update its state in the cache.
404 |             # noinspection PyStatementEffect
405 |             self.id_to_cluster[match_cluster.cluster_id] #yd。因为使用了LRUCache机制来控制cluster个数，故这里需要访问一下match_cluster对应的id
406 | 
407 |         if self.profiler:
408 |             self.profiler.end_section()
409 | 
410 |         return match_cluster, update_type,tokenize_result
411 | 
412 |     def get_clusters_ids_for_seq_len(self, seq_len: int):
413 |         """
414 |         Return all clusters with the specified count of tokens
415 |         """
416 | 
417 |         def append_clusters_recursive(node: Node, id_list_to_fill: list):
418 |             id_list_to_fill.extend(node.cluster_ids)
419 |             for child_node in node.key_to_child_node.values():
420 |                 append_clusters_recursive(child_node, id_list_to_fill)
421 | 
422 |         cur_node = self.root_node.key_to_child_node.get(str(seq_len))
423 | 
424 |         # no template with same token count
425 |         if cur_node is None:
426 |             return []
427 | 
428 |         target = []
429 |         append_clusters_recursive(cur_node, target)
430 |         return target
431 | 
432 |     def match(self, content: str, full_search_strategy="never"):
433 |         """
434 |         Match log message against an already existing cluster.
435 |         Match shall be perfect (sim_th=1.0).
436 |         New cluster will not be created as a result of this call, nor any cluster modifications.
437 | 
438 |         :param content: log message to match
439 |         :param full_search_strategy: when to perform full cluster search.
440 |             (1) "never" is the fastest, will always perform a tree search [O(log(n)] but might produce
441 |             false negatives (wrong mismatches) on some edge cases;
442 |             (2) "fallback" will perform a linear search [O(n)] among all clusters with the same token count, but only in
443 |             case tree search found no match.
444 |             It should not have false negatives, however tree-search may find a non-optimal match with
445 |             more wildcard parameters than necessary;
446 |             (3) "always" is the slowest. It will select the best match among all known clusters, by always evaluating
447 |             all clusters with the same token count, and selecting the cluster with perfect all token match and least
448 |             count of wildcard matches.
449 |         :return: Matched cluster or None if no match found.
450 |         """
451 | 
452 |         assert full_search_strategy in ["always", "never", "fallback"]
453 | 
454 |         required_sim_th = 1.0
455 |         content_tokens, tokenize_result = self.get_content_as_tokens(content)
456 | 
457 |         # consider for future improvement:
458 |         # It is possible to implement a recursive tree_search (first try exact token match and fallback to
459 |         # wildcard match). This will be both accurate and more efficient than the linear full search
460 |         # also fast match can be optimized when exact match is required by early
461 |         # quitting on less than exact cluster matches.
462 |         def full_search():
463 |             all_ids = self.get_clusters_ids_for_seq_len(len(content_tokens))
464 |             cluster = self.fast_match(all_ids, content_tokens, required_sim_th, include_params=True)
465 |             return cluster, tokenize_result
466 | 
467 |         if full_search_strategy == "always":
468 |             return full_search()
469 | 
470 |         match_cluster = self.tree_search(self.root_node, content_tokens, required_sim_th, include_params=True)
471 |         if match_cluster is not None:
472 |             return match_cluster, tokenize_result
473 | 
474 |         if full_search_strategy == "never":
475 |             return None, tokenize_result
476 | 
477 |         return full_search()
478 | 
479 |     def get_total_cluster_size(self):
480 |         size = 0
481 |         for c in self.id_to_cluster.values():
482 |             size += c.size
483 |         return size
484 | 


--------------------------------------------------------------------------------
/drain3/file_persistence.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import os
 4 | import pathlib
 5 | 
 6 | from drain3.persistence_handler import PersistenceHandler
 7 | 
 8 | 
 9 | class FilePersistence(PersistenceHandler):
10 |     def __init__(self, file_path):
11 |         self.file_path = file_path
12 | 
13 |     def save_state(self, state):
14 |         pathlib.Path(self.file_path).write_bytes(state)
15 | 
16 |     def load_state(self):
17 |         if not os.path.exists(self.file_path):
18 |             return None
19 | 
20 |         return pathlib.Path(self.file_path).read_bytes()
21 | 


--------------------------------------------------------------------------------
/drain3/kafka_persistence.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import kafka
 4 | 
 5 | from drain3.persistence_handler import PersistenceHandler
 6 | 
 7 | 
 8 | class KafkaPersistence(PersistenceHandler):
 9 | 
10 |     def __init__(self, topic, snapshot_poll_timeout_sec=60, **kafka_client_options):
11 |         self.topic = topic
12 |         self.kafka_client_options = kafka_client_options
13 |         self.producer = kafka.KafkaProducer(**self.kafka_client_options)
14 |         self.snapshot_poll_timeout_sec = snapshot_poll_timeout_sec
15 | 
16 |     def save_state(self, state):
17 |         self.producer.send(self.topic, value=state)
18 | 
19 |     def load_state(self):
20 |         consumer = kafka.KafkaConsumer(**self.kafka_client_options)
21 |         partition = kafka.TopicPartition(self.topic, 0)
22 |         consumer.assign([partition])
23 |         end_offsets = consumer.end_offsets([partition])
24 |         end_offset = list(end_offsets.values())[0]
25 |         if end_offset > 0:
26 |             consumer.seek(partition, end_offset - 1)
27 |             snapshot_poll_timeout_ms = self.snapshot_poll_timeout_sec * 1000
28 |             records = consumer.poll(snapshot_poll_timeout_ms)
29 |             if not records:
30 |                 raise RuntimeError(f"No message received from Kafka during restore even though end_offset>0")
31 |             last_msg = records[partition][0]
32 |             state = last_msg.value
33 |         else:
34 |             state = None
35 | 
36 |         consumer.close()
37 |         return state
38 | 


--------------------------------------------------------------------------------
/drain3/masking.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import abc
 4 | import re
 5 | from typing import Collection, Optional
 6 | 
 7 | 
 8 | class AbstractMaskingInstruction(abc.ABC):
 9 | 
10 |     def __init__(self, mask_with: str):
11 |         self.mask_with = mask_with
12 | 
13 |     @abc.abstractmethod
14 |     def mask(self, content: str, mask_prefix: str, mask_suffix: str) -> str:
15 |         """
16 |         Mask content according to this instruction and return the result.
17 | 
18 |         :param content: text to apply masking to
19 |         :param mask_prefix: the prefix of any masks inserted
20 |         :param mask_suffix: the suffix of any masks inserted
21 |         """
22 |         pass
23 | 
24 | 
25 | class MaskingInstruction(AbstractMaskingInstruction):
26 | 
27 |     def __init__(self, pattern: str, mask_with: str):
28 |         super().__init__(mask_with)
29 |         self.regex = re.compile(pattern)
30 | 
31 |     @property
32 |     def pattern(self):
33 |         return self.regex.pattern
34 | 
35 |     def mask(self, content: str, mask_prefix: str, mask_suffix: str) -> str:
36 |         """
37 |         yd。功能：将content中正则匹配的子串，用指定字符串替换，比如将content中的ip替换为<:IP:>
38 |         :param content:
39 |         :param mask_prefix:
40 |         :param mask_suffix:
41 |         :return:
42 |         """
43 |         mask = mask_prefix + self.mask_with + mask_suffix
44 |         return self.regex.sub(mask, content)
45 | 
46 | 
47 | # Alias for `MaskingInstruction`.
48 | RegexMaskingInstruction = MaskingInstruction
49 | 
50 | 
51 | class LogMasker:
52 | 
53 |     def __init__(self, masking_instructions: Collection[AbstractMaskingInstruction],
54 |                  mask_prefix: str, mask_suffix: str):
55 |         self.mask_prefix = mask_prefix
56 |         self.mask_suffix = mask_suffix
57 |         self.masking_instructions = masking_instructions
58 |         self.mask_name_to_instructions = {} #yd。格式为{mask_name: masking_instruction_list}
59 |         for mi in self.masking_instructions:
60 |             self.mask_name_to_instructions.setdefault(mi.mask_with, [])
61 |             self.mask_name_to_instructions[mi.mask_with].append(mi)
62 | 
63 |     def mask(self, content: str) -> str:
64 |         """
65 |         yd。功能：将content字符串中正则匹配的子串，用特定符号替换，比如将content中的ip数字用"<:IP:>"替换
66 |         :param content: 待正则匹配替换的字符串
67 |         :return:
68 |         """
69 |         for mi in self.masking_instructions:
70 |             content = mi.mask(content, self.mask_prefix, self.mask_suffix)
71 |         return content
72 | 
73 |     @property
74 |     def mask_names(self) -> Collection[str]:
75 |         return self.mask_name_to_instructions.keys()
76 | 
77 |     def instructions_by_mask_name(self, mask_name: str) -> Optional[Collection[AbstractMaskingInstruction]]:
78 |         """
79 |         yd。功能：根据mask_name查找到对应的masking_instruction_list
80 |         :param mask_name:
81 |         :return:
82 |         """
83 |         return self.mask_name_to_instructions.get(mask_name, [])
84 | 
85 | # Some masking examples
86 | # ---------------------
87 | #
88 | # masking_instances = [
89 | #    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)', "ID"),
90 | #    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)', "IP"),
91 | #    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)', "SEQ"),
92 | #    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)', "SEQ"),
93 | #
94 | #    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)', "HEX"),
95 | #    MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)', "NUM"),
96 | #    MaskingInstruction(r'(?<=executed cmd )(".+?")', "CMD"),
97 | # ]
98 | 


--------------------------------------------------------------------------------
/drain3/memory_buffer_persistence.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | from drain3.persistence_handler import PersistenceHandler
 4 | 
 5 | 
 6 | class MemoryBufferPersistence(PersistenceHandler):
 7 |     def __init__(self):
 8 |         self.state = None
 9 | 
10 |     def save_state(self, state):
11 |         self.state = state
12 | 
13 |     def load_state(self):
14 |         return self.state


--------------------------------------------------------------------------------
/drain3/persistence_handler.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | 
 5 | 
 6 | class PersistenceHandler(ABC):
 7 | 
 8 |     @abstractmethod
 9 |     def save_state(self, state):
10 |         pass
11 | 
12 |     @abstractmethod
13 |     def load_state(self):
14 |         pass
15 | 


--------------------------------------------------------------------------------
/drain3/redis_persistence.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import redis
 4 | 
 5 | from drain3.persistence_handler import PersistenceHandler
 6 | 
 7 | 
 8 | class RedisPersistence(PersistenceHandler):
 9 |     def __init__(self, redis_host, redis_port, redis_db, redis_pass, is_ssl, redis_key):
10 |         self.redis_host = redis_host
11 |         self.redis_port = redis_port
12 |         self.redis_db = redis_db
13 |         self.redis_pass = redis_pass
14 |         self.is_ssl = is_ssl
15 |         self.redis_key = redis_key
16 |         self.r = redis.Redis(host=self.redis_host,
17 |                              port=self.redis_port,
18 |                              db=self.redis_db,
19 |                              password=self.redis_pass,
20 |                              ssl=self.is_ssl)
21 | 
22 |     def save_state(self, state):
23 |         self.r.set(self.redis_key, state)
24 | 
25 |     def load_state(self):
26 |         return self.r.get(self.redis_key)
27 | 


--------------------------------------------------------------------------------
/drain3/simple_profiler.py:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: Apache-2.0
  2 | # Based on https://github.com/davidohana/SimpleProfiler/blob/main/python/simple_profiler.py
  3 | 
  4 | import os
  5 | import time
  6 | 
  7 | from abc import ABC, abstractmethod
  8 | 
  9 | 
 10 | class Profiler(ABC):
 11 | 
 12 |     @abstractmethod
 13 |     def start_section(self, section_name: str):
 14 |         pass
 15 | 
 16 |     @abstractmethod
 17 |     def end_section(self, section_name=""):
 18 |         pass
 19 | 
 20 |     @abstractmethod
 21 |     def report(self, period_sec=30):
 22 |         pass
 23 | 
 24 | 
 25 | class NullProfiler(Profiler):
 26 |     """A no-op profiler. Use it instead of SimpleProfiler in case you want to disable profiling."""
 27 | 
 28 |     def start_section(self, section_name: str):
 29 |         pass
 30 | 
 31 |     def end_section(self, section_name=""):
 32 |         pass
 33 | 
 34 |     def report(self, period_sec=30):
 35 |         pass
 36 | 
 37 | 
 38 | class SimpleProfiler(Profiler):
 39 |     def __init__(self, reset_after_sample_count=0, enclosing_section_name="total", printer=print, report_sec=30):
 40 |         self.printer = printer
 41 |         self.enclosing_section_name = enclosing_section_name
 42 |         self.reset_after_sample_count = reset_after_sample_count
 43 |         self.report_sec = report_sec
 44 | 
 45 |         self.section_to_stats = {}
 46 |         self.last_report_timestamp_sec = time.time()
 47 |         self.last_started_section_name = ""
 48 | 
 49 |     def start_section(self, section_name: str):
 50 |         """Start measuring a section"""
 51 | 
 52 |         if not section_name:
 53 |             raise ValueError("Section name is empty")
 54 |         self.last_started_section_name = section_name
 55 | 
 56 |         section = self.section_to_stats.get(section_name, None)
 57 |         if section is None:
 58 |             section = ProfiledSectionStats(section_name)
 59 |             self.section_to_stats[section_name] = section
 60 | 
 61 |         if section.start_time_sec != 0:
 62 |             raise ValueError(f"Section {section_name} is already started")
 63 | 
 64 |         section.start_time_sec = time.time()
 65 | 
 66 |     def end_section(self, name=""):
 67 |         """End measuring a section. Leave section name empty to end the last started section."""
 68 | 
 69 |         now = time.time()
 70 | 
 71 |         section_name = name
 72 |         if not name:
 73 |             section_name = self.last_started_section_name
 74 | 
 75 |         if not section_name:
 76 |             raise ValueError("Neither section name is specified nor a section is started")
 77 | 
 78 |         section: ProfiledSectionStats = self.section_to_stats.get(section_name, None)
 79 |         if section is None:
 80 |             raise ValueError(f"Section {section_name} does not exist")
 81 | 
 82 |         if section.start_time_sec == 0:
 83 |             raise ValueError(f"Section {section_name} was not started")
 84 | 
 85 |         took_sec = now - section.start_time_sec
 86 |         if 0 < self.reset_after_sample_count == section.sample_count:
 87 |             section.sample_count_batch = 0
 88 |             section.total_time_sec_batch = 0
 89 | 
 90 |         section.sample_count += 1
 91 |         section.total_time_sec += took_sec
 92 |         section.sample_count_batch += 1
 93 |         section.total_time_sec_batch += took_sec
 94 |         section.start_time_sec = 0
 95 | 
 96 |     def report(self, period_sec=30):
 97 |         """Print results using [printer] function. By default prints to stdout."""
 98 |         if time.time() - self.last_report_timestamp_sec < period_sec:
 99 |             return False
100 | 
101 |         enclosing_time_sec = 0
102 |         if self.enclosing_section_name:
103 |             enclosing_section: ProfiledSectionStats = self.section_to_stats.get(self.enclosing_section_name, None)
104 |             if enclosing_section:
105 |                 enclosing_time_sec = enclosing_section.total_time_sec
106 | 
107 |         include_batch_rates = self.reset_after_sample_count > 0
108 | 
109 |         sections = self.section_to_stats.values()
110 |         sorted_sections = sorted(sections, key=lambda it: it.total_time_sec, reverse=True)
111 |         lines = map(lambda it: it.to_string(enclosing_time_sec, include_batch_rates), sorted_sections)
112 |         text = os.linesep.join(lines)
113 |         self.printer(text)
114 | 
115 |         self.last_report_timestamp_sec = time.time()
116 |         return True
117 | 
118 | 
119 | class ProfiledSectionStats:
120 |     def __init__(self, section_name, start_time_sec=0, sample_count=0, total_time_sec=0,
121 |                  sample_count_batch=0, total_time_sec_batch=0):
122 |         self.section_name = section_name
123 |         self.start_time_sec = start_time_sec
124 |         self.sample_count = sample_count
125 |         self.total_time_sec = total_time_sec
126 |         self.sample_count_batch = sample_count_batch
127 |         self.total_time_sec_batch = total_time_sec_batch
128 | 
129 |     def to_string(self, enclosing_time_sec: int, include_batch_rates: bool):
130 |         took_sec_text = f"{self.total_time_sec:>8.2f} s"
131 |         if enclosing_time_sec > 0:
132 |             took_sec_text += f" ({100 * self.total_time_sec / enclosing_time_sec:>6.2f}%)"
133 | 
134 |         ms_per_k_samples = f"{1000000 * self.total_time_sec / self.sample_count: 7.2f}"
135 | 
136 |         if self.total_time_sec > 0:
137 |             samples_per_sec = f"{self.sample_count / self.total_time_sec: 15,.2f}"
138 |         else:
139 |             samples_per_sec = "N/A"
140 | 
141 |         if include_batch_rates:
142 |             ms_per_k_samples += f" ({1000000 * self.total_time_sec_batch / self.sample_count_batch: 7.2f})"
143 |             if self.total_time_sec_batch > 0:
144 |                 samples_per_sec += f" ({self.sample_count_batch / self.total_time_sec_batch: 15,.2f})"
145 |             else:
146 |                 samples_per_sec += " (N/A)"
147 | 
148 |         return f"{self.section_name: <15}: took {took_sec_text}, " \
149 |                f"{self.sample_count: >10,} samples, " \
150 |                f"{ms_per_k_samples} ms / 1000 samples, " \
151 |                f"{samples_per_sec} hz"
152 | 


--------------------------------------------------------------------------------
/drain3/template_miner.py:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: MIT
  2 | 
  3 | import base64
  4 | import logging
  5 | import re
  6 | import time
  7 | import zlib
  8 | from typing import Optional, List, NamedTuple
  9 | 
 10 | import jsonpickle
 11 | from cachetools import LRUCache, cachedmethod
 12 | 
 13 | from drain3.drain import Drain, LogCluster
 14 | from drain3.masking import LogMasker
 15 | from drain3.persistence_handler import PersistenceHandler
 16 | from drain3.simple_profiler import SimpleProfiler, NullProfiler, Profiler
 17 | from drain3.template_miner_config import TemplateMinerConfig
 18 | from src.common_config import CLUSTER_COUNT_KEY, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
 19 |     TOKEN_LIST_KEY, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,LOG_TEMPLATE_TOKENS_KEY,ENABLE_MASK_CONTENT
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | 
 23 | config_filename = 'drain3.ini'
 24 | 
 25 | ExtractedParameter = NamedTuple("ExtractedParameter", [("value", str), ("mask_name", str)])
 26 | 
 27 | 
 28 | class TemplateMiner:
 29 | 
 30 |     def __init__(self,
 31 |                  persistence_handler: PersistenceHandler = None,
 32 |                  config: TemplateMinerConfig = None):
 33 |         """
 34 |         Wrapper for Drain with persistence and masking support
 35 | 
 36 |         :param persistence_handler: The type of persistence to use. When None, no persistence is applied.
 37 |         :param config: Configuration object. When none, configuration is loaded from default .ini file (if exist)
 38 |         """
 39 |         logger.info("Starting Drain3 template miner")
 40 | 
 41 |         if config is None:
 42 |             logger.info(f"Loading configuration from {config_filename}")
 43 |             config = TemplateMinerConfig()
 44 |             config.load(config_filename)
 45 | 
 46 |         self.config = config
 47 | 
 48 |         self.profiler: Profiler = NullProfiler()
 49 |         if self.config.profiling_enabled:
 50 |             self.profiler = SimpleProfiler()
 51 | 
 52 |         self.persistence_handler = persistence_handler
 53 | 
 54 |         param_str = self.config.mask_prefix + "*" + self.config.mask_suffix #yd。将param_str的值设为<*>
 55 |         self.drain = Drain(
 56 |             sim_th=self.config.drain_sim_th,
 57 |             depth=self.config.drain_depth,
 58 |             max_children=self.config.drain_max_children,
 59 |             max_clusters=self.config.drain_max_clusters,
 60 |             extra_delimiters=self.config.drain_extra_delimiters,
 61 |             profiler=self.profiler,
 62 |             param_str=param_str,
 63 |             parametrize_numeric_tokens=self.config.parametrize_numeric_tokens
 64 |         )
 65 |         self.masker = LogMasker(self.config.masking_instructions, self.config.mask_prefix, self.config.mask_suffix)
 66 |         self.parameter_extraction_cache = LRUCache(self.config.parameter_extraction_cache_capacity)
 67 |         self.last_save_time = time.time() #yd。表示最近一次将self.drain对象进行序列化得到state，并保存state的时间
 68 |         if persistence_handler is not None: #yd。如果持久化handler不为None，则加载state
 69 |             self.load_state()
 70 | 
 71 |     def load_state(self):
 72 |         """
 73 |         yd。加载之前保存的state，然后将state反序列化，用反序列化的结果来更新self.drain对象，
 74 |         :return:
 75 |         """
 76 |         # yd。这里选择不许需要之前的状态
 77 |         return
 78 | 
 79 |         logger.info("Checking for saved state")
 80 | 
 81 |         state = self.persistence_handler.load_state()
 82 |         if state is None:
 83 |             logger.info("Saved state not found")
 84 |             return
 85 | 
 86 |         if self.config.snapshot_compress_state:
 87 |             state = zlib.decompress(base64.b64decode(state))
 88 | 
 89 |         loaded_drain: Drain = jsonpickle.loads(state, keys=True)
 90 | 
 91 |         # json-pickle encoded keys as string by default, so we have to convert those back to int
 92 |         # this is only relevant for backwards compatibility when loading a snapshot of drain <= v0.9.1
 93 |         # which did not use json-pickle's keys=true
 94 |         if len(loaded_drain.id_to_cluster) > 0 and isinstance(next(iter(loaded_drain.id_to_cluster.keys())), str):
 95 |             loaded_drain.id_to_cluster = {int(k): v for k, v in list(loaded_drain.id_to_cluster.items())}
 96 |             if self.config.drain_max_clusters:
 97 |                 cache = LRUCache(maxsize=self.config.drain_max_clusters)
 98 |                 cache.update(loaded_drain.id_to_cluster)
 99 |                 loaded_drain.id_to_cluster = cache
100 | 
101 |         self.drain.id_to_cluster = loaded_drain.id_to_cluster
102 |         self.drain.clusters_counter = loaded_drain.clusters_counter
103 |         self.drain.root_node = loaded_drain.root_node
104 | 
105 |         logger.info("Restored {0} clusters built from {1} messages".format(
106 |             len(loaded_drain.clusters), loaded_drain.get_total_cluster_size()))
107 | 
108 |     def save_state(self, snapshot_reason):
109 |         """
110 |         yd。功能：将self.drain对象序列化后得到state，将state保存到指定文件中
111 |         :param snapshot_reason:
112 |         :return:
113 |         """
114 |         state = jsonpickle.dumps(self.drain, keys=True).encode('utf-8') #yd。将self.drain这个对象序列化
115 |         if self.config.snapshot_compress_state:#yd。如果需要压缩state snapshot，则进行压缩
116 |             state = base64.b64encode(zlib.compress(state))
117 | 
118 |         logger.info(f"Saving state of {len(self.drain.clusters)} clusters "
119 |                     f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, "
120 |                     f"reason: {snapshot_reason}")
121 | 
122 |         self.persistence_handler.save_state(state) #yd。文件持久化，即将state保存到指定路径所在的文件中
123 | 
124 |     def get_snapshot_reason(self, change_type, cluster_id):
125 |         """
126 |         yd。功能：获取保存snapshot的原因，主要原因有两个：
127 |             1、change_type不为none；
128 |             2、距离上次保存snapshot的时间超过配置的间隔时间
129 |         :param change_type:
130 |         :param cluster_id:
131 |         :return:
132 |         """
133 |         if change_type != "none":
134 |             return "{} ({})".format(change_type, cluster_id)
135 | 
136 |         diff_time_sec = time.time() - self.last_save_time
137 |         if diff_time_sec >= self.config.snapshot_interval_minutes * 60:
138 |             return "periodic"
139 | 
140 |         return None
141 | 
142 |     def make_result_dict(self,cluster, tokenize_result):
143 |         result_dict = {            CLUSTER_ID_KEY: cluster.cluster_id,
144 |             CLUSTER_SIZE_KEY: cluster.size, #yd。用于统计当前cluster匹配的日志条数
145 |             LOG_TEMPLATE_TOKENS_KEY: cluster.log_template_tokens,
146 |             TEMPLATE_MINED_KEY: cluster.get_template() # yd。返回挖掘处理的日志模板
147 |         }
148 |         result_dict.update(tokenize_result)
149 |         return result_dict
150 | 
151 |     def add_log_message(self, log_message: str) -> dict:
152 |         """
153 |         yd。功能：根据当前传入的日志内容，获取对应的日志模板的logCluster
154 |         :param log_message: 一条日志的内容
155 |         :return:
156 |         """
157 |         self.profiler.start_section("total")
158 | 
159 |         if ENABLE_MASK_CONTENT:
160 |             self.profiler.start_section("mask")
161 |             # yd。将log_message字符串中正则匹配的子串，用特定符号替换。
162 |             # 比如将"connected to 10.0.0.1"中的ip数字用"<:IP:>"替换，返回"connected to <:IP:>"
163 |             masked_content = self.masker.mask(log_message)
164 |             self.profiler.end_section()
165 |         else:
166 |             masked_content = log_message
167 | 
168 |         self.profiler.start_section("drain")
169 |         # yd。根据传入的masked_content，获取匹配的logCluster
170 |         cluster, change_type, tokenize_result = self.drain.add_log_message(masked_content)
171 |         self.profiler.end_section("drain")
172 | 
173 |         result = {
174 |             "change_type": change_type,
175 |             CLUSTER_COUNT_KEY: len(self.drain.clusters)  # yd。统计当前已经挖掘的模板的 总数
176 |         }
177 |         result_dict = self.make_result_dict(cluster, tokenize_result)
178 |         result.update(result_dict)
179 | 
180 |         #yd。这里是将当前的日志模板信息的快照保存下来
181 |         if self.persistence_handler is not None:
182 |             self.profiler.start_section("save_state")
183 |             snapshot_reason = self.get_snapshot_reason(change_type, cluster.cluster_id)
184 |             if snapshot_reason:
185 |                 self.save_state(snapshot_reason)
186 |                 self.last_save_time = time.time()
187 |             self.profiler.end_section()
188 | 
189 |         self.profiler.end_section("total")
190 |         self.profiler.report(self.config.profiling_report_sec) #yd。这个方法啥事都没有干，可以不管
191 |         return result
192 | 
193 |     def match(self, log_message: str, full_search_strategy="never") -> LogCluster:
194 |         """
195 |         Mask log message and match against an already existing cluster.
196 |         Match shall be perfect (sim_th=1.0).
197 |         New cluster will not be created as a result of this call, nor any cluster modifications.
198 | 
199 |         :param log_message: log message to match
200 |         :param full_search_strategy: when to perform full cluster search.
201 |             (1) "never" is the fastest, will always perform a tree search [O(log(n)] but might produce
202 |             false negatives (wrong mismatches) on some edge cases;
203 |             (2) "fallback" will perform a linear search [O(n)] among all clusters with the same token count, but only in
204 |             case tree search found no match.
205 |             It should not have false negatives, however tree-search may find a non-optimal match with
206 |             more wildcard parameters than necessary;
207 |             (3) "always" is the slowest. It will select the best match among all known clusters, by always evaluating
208 |             all clusters with the same token count, and selecting the cluster with perfect all token match and least
209 |             count of wildcard matches.
210 |         :return: Matched cluster or None if no match found.
211 |         """
212 |         if ENABLE_MASK_CONTENT:
213 |             # yd。将log_message字符串中正则匹配的子串，用特定符号替换。
214 |             # 比如将"connected to 10.0.0.1"中的ip数字用"<:IP:>"替换，返回"connected to <:IP:>"
215 |             masked_content = self.masker.mask(log_message)
216 |         else:
217 |             masked_content = log_message
218 | 
219 |         matched_cluster, tokenize_result = self.drain.match(masked_content, full_search_strategy)
220 |         return matched_cluster, tokenize_result
221 | 
222 |     def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
223 |         """
224 |         Extract parameters from a log message according to a provided template that was generated
225 |         by calling `add_log_message()`.
226 | 
227 |         This function is deprecated. Please use extract_parameters instead.
228 | 
229 |         :param log_template: log template corresponding to the log message
230 |         :param log_message: log message to extract parameters from
231 |         :return: An ordered list of parameter values present in the log message.
232 |         """
233 | 
234 |         extracted_parameters = self.extract_parameters(log_template, log_message, exact_matching=False)
235 |         if not extracted_parameters:
236 |             return []
237 |         return [parameter.value for parameter in extracted_parameters]
238 | 
239 |     def get_parameter(self,result_dict, log_line):
240 |         if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
241 |             # template = result["template_mined"]
242 |             template = result_dict.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE)
243 |             params = self.extract_parameters(template, log_line)
244 |             return params
245 |         content_tokens = result_dict.get(TOKEN_LIST_KEY, [])
246 |         # log_template_tokens = result["log_template_tokens"]
247 |         log_template_tokens = result_dict.get(LOG_TEMPLATE_TOKENS_KEY, [])
248 |         params = self.extract_parameters_by_compare(content_tokens, log_template_tokens)
249 |         return params
250 | 
251 |     def extract_parameters_by_compare(self, content_tokens, log_template_tokens):
252 |         parameter_list = []
253 |         for token1, token2 in zip(content_tokens, log_template_tokens):
254 |             if token1 == token2:
255 |                 continue
256 |             extracted_parameter = ExtractedParameter(token1, mask_name="-")
257 |             parameter_list.append(extracted_parameter)
258 |         return parameter_list
259 | 
260 | 
261 |     def extract_parameters(self,
262 |                            log_template: str,
263 |                            log_message: str,
264 |                            exact_matching: bool = True) -> Optional[List[ExtractedParameter]]:
265 |         """
266 |         Extract parameters from a log message according to a provided template that was generated
267 |         by calling `add_log_message()`.
268 | 
269 |         For most accurate results, it is recommended that
270 |         - Each `MaskingInstruction` has a unique `mask_with` value,
271 |         - No `MaskingInstruction` has a `mask_with` value of `*`,
272 |         - The regex-patterns of `MaskingInstruction` do not use unnamed back-references;
273 |           instead use back-references to named groups e.g. `(?P=some-name)`.
274 | 
275 |         :param log_template: log template corresponding to the log message
276 |         :param log_message: log message to extract parameters from
277 |         :param exact_matching: whether to apply the correct masking-patterns to match parameters, or try to approximate;
278 |             disabling exact_matching may be faster but may lead to situations in which parameters
279 |             are wrongly identified.
280 |         :return: A ordered list of ExtractedParameter for the log message
281 |             or None if log_message does not correspond to log_template.
282 |         """
283 |         #yd。将delimiter用空格替换
284 |         for delimiter in self.config.drain_extra_delimiters:
285 |             log_message = re.sub(delimiter, " ", log_message)
286 | 
287 |         template_regex, param_group_name_to_mask_name = self._get_template_parameter_extraction_regex(
288 |             log_template, exact_matching)
289 | 
290 |         # Parameters are represented by specific named groups inside template_regex.
291 |         parameter_match = re.match(template_regex, log_message)
292 | 
293 |         # log template does not match template
294 |         if not parameter_match:
295 |             return None
296 | 
297 |         # create list of extracted parameters
298 |         extracted_parameters = []
299 |         for group_name, parameter in parameter_match.groupdict().items(): #yd。对正则匹配的结果进行遍历
300 |             if group_name in param_group_name_to_mask_name:
301 |                 mask_name = param_group_name_to_mask_name[group_name]
302 |                 extracted_parameter = ExtractedParameter(parameter, mask_name)
303 |                 extracted_parameters.append(extracted_parameter)
304 | 
305 |         return extracted_parameters
306 | 
307 |     @cachedmethod(lambda self: self.parameter_extraction_cache)
308 |     def _get_template_parameter_extraction_regex(self, log_template: str, exact_matching: bool):
309 |         """
310 |         yd。功能：构建模板参数抽取的正则表达式
311 |         :param log_template:
312 |         :param exact_matching:
313 |         :return: template_regex:
314 |                 param_group_name_to_mask_name，以dict的形式保存着正则表达式的名称和mask_name，例如{'p_0': 'HEX', 'p_1': '*', 'p_2': 'CMD', 'p_3': 'SEQ', 'p_4': 'IP', 'p_5': 'NUM', 'p_6': 'ID'}
315 |         """
316 |         param_group_name_to_mask_name = dict()
317 |         param_name_counter = [0]
318 |         #print(f"  log_template传入的值 = {log_template}")
319 |         def get_next_param_name():
320 |             param_group_name = "p_" + str(param_name_counter[0])
321 |             param_name_counter[0] += 1
322 |             return param_group_name
323 | 
324 |         # Create a named group with the respective patterns for the given mask-name.
325 |         def create_capture_regex(_mask_name):
326 |             allowed_patterns = []
327 |             if exact_matching:
328 |                 # get all possible regex patterns from masking instructions that match this mask name
329 |                 masking_instructions = self.masker.instructions_by_mask_name(_mask_name)
330 |                 for mi in masking_instructions:
331 |                     # MaskingInstruction may already contain named groups.
332 |                     # We replace group names in those named groups, to avoid conflicts due to duplicate names.
333 |                     if hasattr(mi, 'regex'):
334 |                         mi_groups = mi.regex.groupindex.keys()
335 |                         pattern = mi.pattern #yd。取出构造正则表达式时的字符串
336 |                     else:
337 |                         # non regex masking instructions - support only non-exact matching
338 |                         mi_groups = []
339 |                         pattern = ".+?"
340 | 
341 |                     for group_name in mi_groups:
342 |                         param_group_name = get_next_param_name()
343 | 
344 |                         def replace_captured_param_name(param_pattern):
345 |                             _search_str = param_pattern.format(group_name)
346 |                             _replace_str = param_pattern.format(param_group_name)
347 |                             return pattern.replace(_search_str, _replace_str)
348 | 
349 |                         pattern = replace_captured_param_name("(?P={}")
350 |                         pattern = replace_captured_param_name("(?P<{}>")
351 | 
352 |                     # support unnamed back-references in masks (simple cases only)
353 |                     pattern = re.sub(r"\\(?!0)\d{1,2}", r"(?:.+?)", pattern)
354 |                     allowed_patterns.append(pattern)
355 | 
356 |             if not exact_matching or _mask_name == "*":
357 |                 allowed_patterns.append(r".+?")
358 | 
359 |             # Give each capture group a unique name to avoid conflicts.
360 |             param_group_name = get_next_param_name()
361 |             param_group_name_to_mask_name[param_group_name] = _mask_name
362 |             joined_patterns = "|".join(allowed_patterns) #yd。将正则表达式join起来
363 |             capture_regex = "(?P<{}>{})".format(param_group_name, joined_patterns)
364 |             return capture_regex
365 | 
366 |         # For every mask in the template, replace it with a named group of all
367 |         # possible masking-patterns it could represent (in order).
368 |         mask_names = set(self.masker.mask_names)
369 | 
370 |         # the Drain catch-all mask
371 |         mask_names.add("*")
372 | 
373 |         escaped_prefix = re.escape(self.masker.mask_prefix) #yd。将字符串中所有可能被解释为正则运算符的字符进行转义
374 |         escaped_suffix = re.escape(self.masker.mask_suffix)
375 |         template_regex = re.escape(log_template)
376 |         #print(f"template_regex最初的值 = {template_regex}")
377 | 
378 |         # replace each mask name with a proper regex that captures it
379 |         for mask_name in mask_names:
380 |             search_str = escaped_prefix + re.escape(mask_name) + escaped_suffix
381 |             while True:
382 |                 rep_str = create_capture_regex(mask_name)
383 |                 # Replace one-by-one to get a new param group name for each replacement.
384 |                 template_regex_new = template_regex.replace(search_str, rep_str, 1)
385 |                 # Break when all replaces for this mask are done.
386 |                 if template_regex_new == template_regex:
387 |                     break
388 |                 template_regex = template_regex_new
389 | 
390 |         #print(f"template_regex处理的值 = {template_regex}")
391 |         #yd。将正则表达式template_regex进行改造，将其中的空格替换为"\\s+"，并且在template_regex前后分别加上起始符和结束符
392 |         # match also messages with multiple spaces or other whitespace chars between tokens
393 |         template_regex = re.sub(r"\\ ", r"\\s+", template_regex)
394 |         template_regex = "^" + template_regex + "$"
395 |         return template_regex, param_group_name_to_mask_name
396 | 


--------------------------------------------------------------------------------
/drain3/template_miner_config.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import ast
 4 | import configparser
 5 | import json
 6 | import logging
 7 | 
 8 | from drain3.masking import MaskingInstruction
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class TemplateMinerConfig:
14 |     def __init__(self):
15 |         self.profiling_enabled = False
16 |         self.profiling_report_sec = 60
17 |         self.snapshot_interval_minutes = 5
18 |         self.snapshot_compress_state = True
19 |         self.drain_extra_delimiters = []
20 |         self.drain_sim_th = 0.4
21 |         self.drain_depth = 4
22 |         self.drain_max_children = 100
23 |         self.drain_max_clusters = None
24 |         self.masking_instructions = [] #yd。由ini配置文件中"masking"字段中的正则表达式构成
25 |         self.mask_prefix = "<"
26 |         self.mask_suffix = ">"
27 |         self.parameter_extraction_cache_capacity = 3000
28 |         self.parametrize_numeric_tokens = True
29 | 
30 |     def load(self, config_filename: str):
31 |         """
32 |         yd。功能：解析config_filename配置文件中设置的字段
33 |         :param config_filename: 配置文件（比如drain3.ini）的路径
34 |         :return:
35 |         """
36 |         parser = configparser.ConfigParser()
37 |         read_files = parser.read(config_filename)
38 |         if len(read_files) == 0:
39 |             logger.warning(f"config file not found: {config_filename}")
40 | 
41 |         section_profiling = 'PROFILING'
42 |         section_snapshot = 'SNAPSHOT'
43 |         section_drain = 'DRAIN'
44 |         section_masking = 'MASKING'
45 | 
46 |         self.profiling_enabled = parser.getboolean(section_profiling, 'enabled',
47 |                                                    fallback=self.profiling_enabled)
48 |         self.profiling_report_sec = parser.getint(section_profiling, 'report_sec',
49 |                                                   fallback=self.profiling_report_sec)
50 | 
51 |         self.snapshot_interval_minutes = parser.getint(section_snapshot, 'snapshot_interval_minutes',
52 |                                                        fallback=self.snapshot_interval_minutes)
53 |         self.snapshot_compress_state = parser.getboolean(section_snapshot, 'compress_state',
54 |                                                          fallback=self.snapshot_compress_state)
55 | 
56 |         drain_extra_delimiters_str = parser.get(section_drain, 'extra_delimiters',
57 |                                                 fallback=str(self.drain_extra_delimiters))
58 |         self.drain_extra_delimiters = ast.literal_eval(drain_extra_delimiters_str)
59 | 
60 |         self.drain_sim_th = parser.getfloat(section_drain, 'sim_th',
61 |                                             fallback=self.drain_sim_th)
62 |         self.drain_depth = parser.getint(section_drain, 'depth',
63 |                                          fallback=self.drain_depth)
64 |         self.drain_max_children = parser.getint(section_drain, 'max_children',
65 |                                                 fallback=self.drain_max_children)
66 |         self.drain_max_clusters = parser.getint(section_drain, 'max_clusters',
67 |                                                 fallback=self.drain_max_clusters)
68 |         self.parametrize_numeric_tokens = parser.getboolean(section_drain, 'parametrize_numeric_tokens',
69 |                                                             fallback=self.parametrize_numeric_tokens)
70 | 
71 |         masking_instructions_str = parser.get(section_masking, 'masking',
72 |                                               fallback=str(self.masking_instructions))
73 |         self.mask_prefix = parser.get(section_masking, 'mask_prefix', fallback=self.mask_prefix)
74 |         self.mask_suffix = parser.get(section_masking, 'mask_suffix', fallback=self.mask_suffix)
75 |         self.parameter_extraction_cache_capacity = parser.get(section_masking, 'parameter_extraction_cache_capacity',
76 |                                                               fallback=self.parameter_extraction_cache_capacity)
77 |         #yd。下面是将配置文件中的'masking'字段的内容解析出来，用正则表达式来构建MaskingInstruction对象，
78 |         masking_instructions = []
79 |         masking_list = json.loads(masking_instructions_str) #yd。将masking_instructions_str转化为list
80 |         for mi in masking_list:
81 |             instruction = MaskingInstruction(mi['regex_pattern'], mi['mask_with'])
82 |             masking_instructions.append(instruction)
83 |         self.masking_instructions = masking_instructions
84 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | cachetools==4.2.1
 2 | jieba==0.42.1
 3 | jsonpickle==2.0.0
 4 | kafka==1.3.5
 5 | kafka_python==2.0.2
 6 | pandas==1.1.5
 7 | redis==3.5.3
 8 | requests==2.28.1
 9 | tqdm==4.64.0
10 | 


--------------------------------------------------------------------------------
/src/common_config.py:
--------------------------------------------------------------------------------
 1 | import re, os
 2 | from src.tool.tool import get_project_dir_path
 3 | 
 4 | PROJECT_DIR_PATH = get_project_dir_path()
 5 | DATA_DIR_PATH = os.path.join(PROJECT_DIR_PATH, "data")
 6 | CONFIG_DIR_PATH = os.path.join(PROJECT_DIR_PATH, "config_ini")
 7 | 
 8 | STAR_CHAR = "*"
 9 | 
10 | DEFAULT_STR_VALUE = "-"
11 | 
12 | 
13 | USE_OLD_FUNCTION_EXTRACT_PARAMETER = False
14 | ENABLE_MASK_CONTENT = False
15 | 
16 | CHINESE_SUBSTR_TYPE = "中"
17 | SPACE_SUBSTR_TYPE = "空格"
18 | ENGLISH_SUBSTR_TYPE = "英"
19 | PUNCTUATION_MARK_TYPE = "标点"
20 | CONNECTOR_CHAR = "^"
21 | 
22 | CHINESE_SPACE_CHINESE_PATTERN = CONNECTOR_CHAR.join([CHINESE_SUBSTR_TYPE, SPACE_SUBSTR_TYPE,CHINESE_SUBSTR_TYPE])
23 | 
24 | 
25 | #CHINESE_REGEXP = re.compile(u"([\u4e00-\u9fff|\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+)")
26 | CHINESE_REGEXP = re.compile(u"([\u4e00-\u9fff]+)")
27 | PUNCTUATION_MARK_REGEXP = re.compile(u"(。|,|，|:|：|=)")
28 | 
29 | #NONE_CHINESE_REGEXP = re.compile(u"([^\u4e00-\u9fff|\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+)")
30 | NONE_CHINESE_REGEXP = re.compile(u"([^\u4e00-\u9fff|。,，:：=]+)")
31 | 
32 | CLUSTER_ID_KEY = "cluster_id"
33 | CLUSTER_SIZE_KEY = "cluster_size"
34 | TEMPLATE_MINED_KEY = "template_mined"
35 | LOG_TEMPLATE_TOKENS_KEY = "log_template_tokens"
36 | CLUSTER_COUNT_KEY = "cluster_count" #用于统计当前已经有多少个cluster了，一个cluster就是一个log template
37 | 
38 | IS_CONTAIN_CHINESE_KEY = "is_contain_chinese"
39 | SUBSTR_TYPE_PATTERN_KEY = "substr_type_pattern"
40 | SUBSTR_DETAIL_LIST_KEY = "substr_detail_list"
41 | TOKEN_LIST_KEY = "token_list"


--------------------------------------------------------------------------------
/src/drain3_examples/drain_bigfile_demo.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import json
 4 | import logging
 5 | import os
 6 | import subprocess
 7 | import sys
 8 | import time
 9 | from os.path import dirname
10 | 
11 | from drain3 import TemplateMiner
12 | from drain3.template_miner_config import TemplateMinerConfig
13 | from src.common_config import CONFIG_DIR_PATH
14 | 
15 | logger = logging.getLogger(__name__)
16 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')
17 | 
18 | in_gz_file = "SSH.tar.gz"
19 | in_log_file = "SSH.log"
20 | if not os.path.isfile(in_log_file):
21 |     logger.info(f"Downloading file {in_gz_file}")
22 |     p = subprocess.Popen(f"curl https://zenodo.org/record/3227177/files/{in_gz_file} --output {in_gz_file}", shell=True)
23 |     p.wait()
24 |     logger.info(f"Extracting file {in_gz_file}")
25 |     p = subprocess.Popen(f"tar -xvzf {in_gz_file}", shell=True)
26 |     p.wait()
27 | 
28 | 
29 | config = TemplateMinerConfig()
30 | #config.load(dirname(__file__) + "/drain3.ini")
31 | drain3_ini_file_path = os.path.join(CONFIG_DIR_PATH, "drain3.ini")
32 | config.load(drain3_ini_file_path)
33 | config.profiling_enabled = True
34 | template_miner = TemplateMiner(config=config)
35 | 
36 | line_count = 0
37 | 
38 | with open(in_log_file) as f:
39 |     lines = f.readlines()
40 | 
41 | start_time = time.time()
42 | batch_start_time = start_time
43 | batch_size = 10000
44 | 
45 | for line in lines:
46 |     line = line.rstrip()
47 |     line = line.partition(": ")[2]
48 |     result = template_miner.add_log_message(line)
49 |     line_count += 1
50 |     if line_count % batch_size == 0:
51 |         time_took = time.time() - batch_start_time
52 |         rate = batch_size / time_took
53 |         logger.info(f"Processing line: {line_count}, rate {rate:.1f} lines/sec, "
54 |                     f"{len(template_miner.drain.clusters)} clusters so far.")
55 |         batch_start_time = time.time()
56 |     if result["change_type"] != "none":
57 |         result_json = json.dumps(result)
58 |         logger.info(f"Input ({line_count}): " + line)
59 |         logger.info("Result: " + result_json)
60 | 
61 | time_took = time.time() - start_time
62 | rate = line_count / time_took
63 | logger.info(f"--- Done processing file in {time_took:.2f} sec. Total of {line_count} lines, rate {rate:.1f} lines/sec, "
64 |             f"{len(template_miner.drain.clusters)} clusters")
65 | 
66 | sorted_clusters = sorted(template_miner.drain.clusters, key=lambda it: it.size, reverse=True)
67 | for cluster in sorted_clusters:
68 |     logger.info(cluster)
69 | 
70 | print("Prefix Tree:")
71 | template_miner.drain.print_tree()
72 | 
73 | template_miner.profiler.report(0)
74 | 


--------------------------------------------------------------------------------
/src/drain3_examples/drain_stdin_demo.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import json
 4 | import logging
 5 | import sys, os
 6 | from os.path import dirname
 7 | 
 8 | from drain3 import TemplateMiner
 9 | from drain3.template_miner_config import TemplateMinerConfig
10 | from src.common_config import CONFIG_DIR_PATH, USE_OLD_FUNCTION_EXTRACT_PARAMETER, TOKEN_LIST_KEY, \
11 |     TEMPLATE_MINED_KEY, LOG_TEMPLATE_TOKENS_KEY, DEFAULT_STR_VALUE
12 | 
13 | # persistence_type = "NONE"
14 | # persistence_type = "REDIS"
15 | # persistence_type = "KAFKA"
16 | persistence_type = "FILE"
17 | 
18 | logger = logging.getLogger(__name__)
19 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')
20 | 
21 | if persistence_type == "KAFKA":
22 |     from drain3.kafka_persistence import KafkaPersistence
23 | 
24 |     persistence = KafkaPersistence("drain3_state", bootstrap_servers="localhost:9092")
25 | 
26 | elif persistence_type == "FILE":
27 |     from drain3.file_persistence import FilePersistence
28 | 
29 |     # persistence = FilePersistence("drain3_state.bin")
30 |     drain3_state_bin_file_path = os.path.join(CONFIG_DIR_PATH, "drain3_state.bin")
31 |     persistence = FilePersistence(drain3_state_bin_file_path)
32 | 
33 | elif persistence_type == "REDIS":
34 |     from drain3.redis_persistence import RedisPersistence
35 | 
36 |     persistence = RedisPersistence(redis_host='',
37 |                                    redis_port=25061,
38 |                                    redis_db=0,
39 |                                    redis_pass='',
40 |                                    is_ssl=True,
41 |                                    redis_key="drain3_state_key")
42 | else:
43 |     persistence = None
44 | 
45 | config = TemplateMinerConfig()
46 | # config.load(dirname(__file__) + "/drain3.ini")
47 | drain3_ini_file_path = os.path.join(CONFIG_DIR_PATH, "drain3.ini")
48 | config.load(drain3_ini_file_path)
49 | config.profiling_enabled = False
50 | 
51 | template_miner = TemplateMiner(persistence, config)
52 | print(f"Drain3 started with '{persistence_type}' persistence")
53 | print(f"{len(config.masking_instructions)} masking instructions are in use")
54 | print(f"Starting training mode. Reading from std-in ('q' to finish)")  # yd。利用输入的一条条日志，训练得到模板
55 | while True:
56 |     log_line = input("> ")
57 |     if log_line == 'q':
58 |         break
59 |     # is_contain_chinese, substr_type_pattern, substr_detail_list, token_list, token_join_str = get_token_list(log_line)
60 |     # log_line = token_join_str
61 |     result = template_miner.add_log_message(log_line)
62 |     result_json = json.dumps(result, ensure_ascii=False)
63 |     print(result_json)
64 |     params = template_miner.get_parameter(result, log_line)
65 |     print("Parameters: " + str(params))
66 | # yd。训练完毕，打印挖掘的每个cluster
67 | print("Training done. Mined clusters:")
68 | for cluster in template_miner.drain.clusters:
69 |     print(cluster)
70 | 
71 | print(f"Starting inference mode, matching to pre-trained clusters. Input log lines or 'q' to finish")
72 | while True:
73 |     log_line = input("> ")
74 |     if log_line == 'q':
75 |         break
76 |     cluster, tokenize_result = template_miner.match(log_line)
77 |     if cluster is None:
78 |         print(f"No match found")
79 |     else:
80 |         result = template_miner.make_result_dict(cluster, tokenize_result)
81 |         params = template_miner.get_parameter(result, log_line)
82 |         print(f"Parameters: {params}")
83 | 


--------------------------------------------------------------------------------
/src/log_parser_by_drain3.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pandas as pd
  3 | from collections import defaultdict
  4 | from tqdm import tqdm  # 进度条
  5 | from src.tool.read_save_file import open_excel, save_dataframe
  6 | 
  7 | from src.common_config import DATA_DIR_PATH, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER, \
  8 |     CLUSTER_ID_KEY, CLUSTER_SIZE_KEY, TEMPLATE_MINED_KEY, IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, \
  9 |     SUBSTR_DETAIL_LIST_KEY, TOKEN_LIST_KEY, LOG_TEMPLATE_TOKENS_KEY
 10 | 
 11 | from src.tool.str_related import get_tow_set_diff
 12 | import json
 13 | import sys, os
 14 | 
 15 | from drain3 import TemplateMiner
 16 | from drain3.template_miner_config import TemplateMinerConfig
 17 | from drain3.file_persistence import FilePersistence
 18 | 
 19 | from src.common_config import CONFIG_DIR_PATH
 20 | 
 21 | 
 22 | class LogParserByDrain3:
 23 |     def __init__(self):
 24 |         persistence_type = "FILE"
 25 |         drain3_state_bin_file_path = os.path.join(CONFIG_DIR_PATH, "drain3_state.bin")
 26 |         persistence = FilePersistence(drain3_state_bin_file_path)
 27 | 
 28 |         config = TemplateMinerConfig()
 29 |         drain3_ini_file_path = os.path.join(CONFIG_DIR_PATH, "drain3.ini")
 30 |         config.load(drain3_ini_file_path)
 31 |         config.profiling_enabled = False
 32 | 
 33 |         self.template_miner = TemplateMiner(persistence, config)
 34 |         print(f"Drain3 started with '{persistence_type}' persistence")
 35 |         print(f"{len(config.masking_instructions)} masking instructions are in use")
 36 |         print(f"Starting training mode. Reading from std-in ('q' to finish)")  # yd。利用输入的一条条日志，训练得到模板
 37 | 
 38 |     def parse_log_content(self, log_line):
 39 |         result = self.template_miner.add_log_message(log_line)
 40 |         result_json = json.dumps(result, ensure_ascii=False)
 41 |         # print(result_json)
 42 |         if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
 43 |             template = result.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE)
 44 |             params = self.template_miner.extract_parameters(template, log_line)
 45 |         else:
 46 |             content_tokens = result.get(TOKEN_LIST_KEY, [])
 47 |             log_template_tokens = result.get(LOG_TEMPLATE_TOKENS_KEY, [])
 48 |             params = self.template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens)
 49 |         return result, params
 50 | 
 51 |     def parse_log_file(self, raw_log_csv_path, result_file_path):
 52 |         print(f"start to parse log {raw_log_csv_path}")
 53 |         log_item_df = open_excel(raw_log_csv_path)
 54 |         log_csv_header = ["_time", "content"]
 55 |         log_item_df = log_item_df[log_csv_header]
 56 |         analysis_result_list = []
 57 |         log_item_count = len(log_item_df)
 58 |         progress_bar = tqdm(total=log_item_count)
 59 |         for line_index, line_detail in enumerate(log_item_df.values.tolist()):
 60 |             [time_str, content] = line_detail
 61 |             progress_bar.update(1)
 62 |             if content != content:
 63 |                 content = ""
 64 |             if isinstance(content, str) == False:
 65 |                 content = str(content)
 66 |             # content = "终端服务器安全层在协议流中检测到错误，并已取消客户端连接。 客户端 IP: 192.168.100.132。"
 67 |             # content = "DSN3201I -PB4A ABNORMAL EOT IN PROGRESS FOR 825 825 USER=NVTWS CONNECTION-ID=UTILITY CORRELATION-ID=PIMGEKD2 825 JOBNAME=PIMGEKD2 ASID=0102 TCB=0088C840"
 68 |             result_dict, extract_parameter_list = self.parse_log_content(content)
 69 | 
 70 |             parameter_list = []
 71 |             if extract_parameter_list is not None:
 72 |                 for parameter in extract_parameter_list:
 73 |                     parameter_list.append(parameter.value)
 74 | 
 75 |             event_id = result_dict.get(CLUSTER_ID_KEY, 1) - 1
 76 |             event_template = result_dict.get(TEMPLATE_MINED_KEY, 0)
 77 |             Occurrences = result_dict.get(CLUSTER_SIZE_KEY, DEFAULT_STR_VALUE)
 78 |             substr_detail_list = result_dict.get(SUBSTR_DETAIL_LIST_KEY, DEFAULT_STR_VALUE)
 79 |             substr_type_pattern = result_dict.get(SUBSTR_TYPE_PATTERN_KEY, DEFAULT_STR_VALUE)
 80 |             pattern_length = len(substr_detail_list)
 81 |             is_contain_chinese = result_dict.get(IS_CONTAIN_CHINESE_KEY, DEFAULT_STR_VALUE)
 82 |             token_list = result_dict.get(TOKEN_LIST_KEY, DEFAULT_STR_VALUE)
 83 |             token_count = len(token_list)
 84 |             event_key = "-"
 85 |             star_ratio = "-"
 86 |             analysis_result_detail = [
 87 |                 substr_detail_list, substr_type_pattern, pattern_length,
 88 |                 is_contain_chinese,
 89 |                 token_list, token_count, event_key,
 90 |                 event_id, event_template, star_ratio, Occurrences, parameter_list]
 91 | 
 92 |             analysis_result_list.append(line_detail + analysis_result_detail)
 93 |         progress_bar.close()
 94 |         analysis_result_df = pd.DataFrame(analysis_result_list,
 95 |                                           columns=["_time", "content",
 96 |                                                    "子串类型明细", "子串类型模式", "模式长度",
 97 |                                                    "是否包含中文",
 98 |                                                    "切分的结果", "切分后的长度", "event_key",
 99 |                                                    "EventId", "EventTemplate", "star_ratio", "Occurrences",
100 |                                                    "ParameterList"])
101 |         save_dataframe(analysis_result_df, result_file_path)
102 | 
103 |     def compare_predict_with_gold(self, predict_file_path, gold_file_path, compare_result_file_path):
104 |         predict_item_df = open_excel(predict_file_path)
105 |         result_table_header = ["_time", "content", "EventId", "EventTemplate", "Occurrences", "ParameterList"]
106 |         predict_item_df = predict_item_df[result_table_header]
107 |         predict_item_count = len(predict_item_df)
108 |         print(predict_item_count)
109 | 
110 |         gold_item_df = open_excel(gold_file_path)
111 |         gold_item_df = gold_item_df[result_table_header]
112 |         gold_item_count = len(gold_item_df)
113 |         print(gold_item_count)
114 |         if predict_item_count != gold_item_count:
115 |             print(
116 |                 f"---error: predict_item_count != gold_item_count, predict_item_count = {predict_item_count}, gold_item_count = {gold_item_count}")
117 |             return None
118 |         progress_bar = tqdm(total=gold_item_count)
119 |         compare_result_list = []
120 |         for row_index in range(predict_item_count):
121 |             predict_line_detail = predict_item_df.loc[row_index].tolist()
122 |             gold_line_detail = gold_item_df.loc[row_index].tolist()
123 |             progress_bar.update(1)
124 | 
125 |             [time_predict, content_predict, EventId_predict, EventTemplate_predict, Occurrences_predict,
126 |              ParameterList_predict] = predict_line_detail
127 |             [time_gold, content_gold, EventId_gold, EventTemplate_gold, Occurrences_gold,
128 |              ParameterList_gold] = gold_line_detail
129 |             if time_predict != time_gold:
130 |                 print(
131 |                     f"---error: time_predict != time_gold, time_predict = {time_predict}, time_gold = {time_gold}")
132 |                 return None
133 |             if content_predict != content_gold:
134 |                 print(
135 |                     f"---error: content_predict != content_gold, content_predict = {content_predict}, content_gold = {content_gold}")
136 |                 return None
137 | 
138 |             is_template_same = False
139 |             if EventTemplate_predict == EventTemplate_gold:
140 |                 is_template_same = True
141 | 
142 |             ParameterList_predict = eval(ParameterList_predict)
143 |             ParameterList_gold = eval(ParameterList_gold)
144 |             is_parameter_same, intersection_set, only_in_predict_set, only_in_gold_set = get_tow_set_diff(
145 |                 set(ParameterList_predict), set(ParameterList_gold))
146 |             compare_result_detail = [time_gold, content_gold, EventId_gold,
147 |                                      EventTemplate_predict, EventTemplate_gold, is_template_same,
148 |                                      Occurrences_gold,
149 |                                      ParameterList_predict, ParameterList_gold, is_parameter_same, intersection_set,
150 |                                      only_in_predict_set, only_in_gold_set]
151 |             compare_result_list.append(compare_result_detail)
152 |         progress_bar.close()
153 |         compare_result_df = pd.DataFrame(compare_result_list, columns=["_time", "content", "EventId",
154 |                                                                        "EventTemplate_predict", "EventTemplate_gold",
155 |                                                                        "is_template_same",
156 |                                                                        "Occurrences_gold",
157 |                                                                        "ParameterList_predict", "ParameterList_gold",
158 |                                                                        "is_parameter_same", "intersection_set",
159 |                                                                        "only_in_predict_set", "only_in_gold_set"])
160 |         save_dataframe(compare_result_df, compare_result_file_path)
161 | 
162 | 
163 | if __name__ == '__main__':
164 |     is_get_parse_result = True
165 |     is_get_indicator = False
166 |     log_parser = LogParserByDrain3()
167 | 
168 |     if is_get_parse_result:
169 |         raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv")
170 |         result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_drain3.csv")
171 |         log_parser.parse_log_file(raw_log_csv_path, result_file_path)
172 | 
173 |         raw_log_csv_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs.csv")
174 |         result_file_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs_parse_by_drain3.csv")
175 |         log_parser.parse_log_file(raw_log_csv_path, result_file_path)
176 | 
177 |     if is_get_indicator:
178 |         raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv")
179 |         result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_drain3.csv")
180 |         gold_file_path = raw_log_csv_path
181 |         compare_result_file_path = os.path.join(DATA_DIR_PATH, "解析结果与金标准对比的结果_by_drain3.xlsx")
182 |         log_parser.compare_predict_with_gold(result_file_path, gold_file_path, compare_result_file_path)
183 | 


--------------------------------------------------------------------------------
/src/log_parser_by_statistic.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pandas as pd
  3 | from collections import defaultdict
  4 | from tqdm import tqdm  # 进度条
  5 | from src.tool.read_save_file import open_excel, save_dataframe
  6 | 
  7 | from src.common_config import DATA_DIR_PATH, CONNECTOR_CHAR, \
  8 |     STAR_CHAR
  9 | from src.tool.str_related import get_tow_set_diff
 10 | from src.tool.tool import calculate_normalize_ratio
 11 | from src.tool.tokenizer import get_token_list
 12 | 
 13 | 
 14 | class LogParserByStatistics:
 15 | 
 16 |     def get_event_template_and_parameter(self, is_contain_chinese, token_list, token_2_frequency, event_occurrences):
 17 |         """
 18 |         功能：判断event中每个token出现的频次与event_occurrences是否相对，
 19 |              如果相等，则该token就是模板中的词；
 20 |              如果不相等，则该token就是parameter。
 21 |         :param is_contain_chinese:
 22 |         :param token_list:
 23 |         :param token_2_frequency: 记录当前event_id对应的所有token出现的频次
 24 |         :param event_occurrences:当前event出现的次数
 25 |         :return:
 26 |         """
 27 |         template_token_list = []
 28 |         parameter_set = set([])
 29 |         parameter_list = []
 30 |         star_count = 0
 31 |         for token in token_list:
 32 |             frequency = token_2_frequency[token]
 33 |             if frequency == event_occurrences: #如果该词在当前event中出现的频次等于该event出现的频次，则该词就是模板词
 34 |                 template_token_list.append(token)
 35 |                 continue
 36 | 
 37 |             template_token_list.append(STAR_CHAR) #该词是参数，用星号表示
 38 |             star_count += 1
 39 |             if token not in parameter_set: #将参数分别保存在list和set中
 40 |                 parameter_set.add(token)
 41 |                 parameter_list.append(token)
 42 | 
 43 |         connector_char = " "
 44 |         if is_contain_chinese == True:
 45 |             connector_char = ""
 46 |         event_template = connector_char.join(template_token_list)
 47 |         star_ratio = calculate_normalize_ratio(star_count, len(token_list))
 48 |         return event_template, parameter_list, star_ratio
 49 | 
 50 |     def update_token_2_frequency(self, token_2_frequency, token_list):
 51 |         token_set = set(token_list)
 52 |         for token in token_set:
 53 |             if token in token_2_frequency:
 54 |                 token_2_frequency[token] += 1
 55 |             else:
 56 |                 token_2_frequency[token] = 1
 57 |         return token_2_frequency
 58 | 
 59 |     def update_event_key_2_id(self, event_key, event_key_2_id):
 60 |         if event_key not in event_key_2_id:
 61 |             event_id = len(event_key_2_id)
 62 |             event_key_2_id[event_key] = event_id
 63 |         return event_key_2_id
 64 | 
 65 |     def update_event_id_2_occurrences(self, event_id, event_id_2_occurrences):
 66 |         if event_id not in event_id_2_occurrences:
 67 |             event_id_2_occurrences[event_id] = 1
 68 |         else:
 69 |             event_id_2_occurrences[event_id] += 1
 70 |         return event_id_2_occurrences
 71 | 
 72 |     def parse_log_content(self, content, event_key_2_id, event_id_2_occurrences, event_id_2_token_2_frequency):
 73 | 
 74 |         is_contain_chinese, substr_type_pattern, substr_detail_list, token_list = get_token_list(content)
 75 |         pattern_length = len(substr_detail_list)
 76 |         token_count = len(token_list)
 77 | 
 78 |         event_key = substr_type_pattern + CONNECTOR_CHAR + str(token_count)
 79 |         self.update_event_key_2_id(event_key, event_key_2_id)
 80 |         event_id = event_key_2_id[event_key]
 81 | 
 82 |         self.update_event_id_2_occurrences(event_id, event_id_2_occurrences)
 83 |         Occurrences = event_id_2_occurrences[event_id]
 84 | 
 85 |         token_2_frequency = event_id_2_token_2_frequency[event_id]
 86 |         token_2_frequency_new = self.update_token_2_frequency(token_2_frequency, token_list)
 87 |         event_id_2_token_2_frequency[event_id] = token_2_frequency_new
 88 | 
 89 |         event_template, parameter_list, star_ratio = self.get_event_template_and_parameter(is_contain_chinese, token_list,
 90 |                                                                                token_2_frequency, Occurrences)
 91 |         analysis_result_detail = [
 92 |                                   substr_detail_list, substr_type_pattern, pattern_length,
 93 |                                   is_contain_chinese,
 94 |                                   token_list, token_count, event_key,
 95 |                                   event_id, event_template,star_ratio, Occurrences, parameter_list]
 96 |         return analysis_result_detail
 97 | 
 98 | 
 99 |     def parse_log_file(self, raw_log_csv_path, result_file_path):
100 |         log_item_df = open_excel(raw_log_csv_path)
101 |         log_csv_header = ["_time", "content"]
102 |         log_item_df = log_item_df[log_csv_header]
103 |         analysis_result_list = []
104 |         event_key_2_id = {}
105 |         event_id_2_occurrences = {}
106 |         event_id_2_token_2_frequency = defaultdict(dict)
107 |         log_item_count = len(log_item_df)
108 |         progress_bar = tqdm(total=log_item_count)
109 |         for line_index, line_detail in enumerate(log_item_df.values.tolist()):
110 |             [time_str, content] = line_detail
111 |             progress_bar.update(1)
112 |             if content != content:
113 |                 content = ""
114 |             if isinstance(content,str)==False:
115 |                 content = str(content)
116 |             #content = "终端服务器安全层在协议流中检测到错误，并已取消客户端连接。 客户端 IP: 192.168.100.132。"
117 |             #content = "DSN3201I -PB4A ABNORMAL EOT IN PROGRESS FOR 825 825 USER=NVTWS CONNECTION-ID=UTILITY CORRELATION-ID=PIMGEKD2 825 JOBNAME=PIMGEKD2 ASID=0102 TCB=0088C840"
118 |             analysis_result_detail = self.parse_log_content(content, event_key_2_id, event_id_2_occurrences, event_id_2_token_2_frequency)
119 | 
120 |             analysis_result_list.append(line_detail + analysis_result_detail)
121 |         progress_bar.close()
122 |         analysis_result_df = pd.DataFrame(analysis_result_list,
123 |                                           columns=["_time", "content",
124 |                                                    "子串类型明细", "子串类型模式","模式长度",
125 |                                                    "是否包含中文",
126 |                                                    "切分的结果", "切分后的长度","event_key",
127 |                                                    "EventId", "EventTemplate","star_ratio", "Occurrences", "ParameterList"])
128 |         save_dataframe(analysis_result_df, result_file_path)
129 | 
130 |     def compare_predict_with_gold(self, predict_file_path, gold_file_path, compare_result_file_path):
131 |         predict_item_df = open_excel(predict_file_path)
132 |         result_table_header = ["_time", "content","EventId", "EventTemplate", "Occurrences", "ParameterList"]
133 |         predict_item_df = predict_item_df[result_table_header]
134 |         predict_item_count = len(predict_item_df)
135 |         print(predict_item_count)
136 | 
137 |         gold_item_df = open_excel(gold_file_path)
138 |         gold_item_df = gold_item_df[result_table_header]
139 |         gold_item_count = len(gold_item_df)
140 |         print(gold_item_count)
141 |         if predict_item_count != gold_item_count:
142 |             print(f"---error: predict_item_count != gold_item_count, predict_item_count = {predict_item_count}, gold_item_count = {gold_item_count}")
143 |             return None
144 |         progress_bar = tqdm(total=gold_item_count)
145 |         compare_result_list = []
146 |         for row_index in range(predict_item_count):
147 |             predict_line_detail = predict_item_df.loc[row_index].tolist()
148 |             gold_line_detail = gold_item_df.loc[row_index].tolist()
149 |             progress_bar.update(1)
150 | 
151 |             [time_predict, content_predict, EventId_predict, EventTemplate_predict, Occurrences_predict, ParameterList_predict] = predict_line_detail
152 |             [time_gold, content_gold, EventId_gold, EventTemplate_gold, Occurrences_gold, ParameterList_gold] = gold_line_detail
153 |             if time_predict != time_gold:
154 |                 print(
155 |                     f"---error: time_predict != time_gold, time_predict = {time_predict}, time_gold = {time_gold}")
156 |                 return None
157 |             if content_predict != content_gold:
158 |                 print(
159 |                     f"---error: content_predict != content_gold, content_predict = {content_predict}, content_gold = {content_gold}")
160 |                 return None
161 | 
162 |             is_template_same = False
163 |             if EventTemplate_predict == EventTemplate_gold:
164 |                 is_template_same = True
165 | 
166 |             ParameterList_predict = eval(ParameterList_predict)
167 |             ParameterList_gold = eval(ParameterList_gold)
168 |             is_parameter_same, intersection_set, only_in_predict_set, only_in_gold_set = get_tow_set_diff(set(ParameterList_predict), set(ParameterList_gold))
169 |             compare_result_detail = [time_gold, content_gold, EventId_gold,
170 |                                      EventTemplate_predict, EventTemplate_gold, is_template_same,
171 |                                      Occurrences_gold,
172 |                                      ParameterList_predict, ParameterList_gold, is_parameter_same, intersection_set, only_in_predict_set, only_in_gold_set]
173 |             compare_result_list.append(compare_result_detail)
174 |         progress_bar.close()
175 |         compare_result_df = pd.DataFrame(compare_result_list, columns=["_time", "content","EventId",
176 |                                                    "EventTemplate_predict", "EventTemplate_gold", "is_template_same",
177 |                                                     "Occurrences_gold",
178 |                                                    "ParameterList_predict", "ParameterList_gold", "is_parameter_same", "intersection_set", "only_in_predict_set", "only_in_gold_set"])
179 |         save_dataframe(compare_result_df, compare_result_file_path)
180 | 
181 | 
182 | if __name__ == '__main__':
183 |     is_get_parse_result = True
184 |     is_get_indicator = True
185 |     log_parser = LogParserByStatistics()
186 | 
187 |     if is_get_parse_result:
188 |         raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv")
189 |         result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_statistic.csv")
190 |         log_parser.parse_log_file(raw_log_csv_path, result_file_path)
191 | 
192 |         raw_log_csv_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs.csv")
193 |         result_file_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs_parse_by_statistic.csv")
194 |         log_parser.parse_log_file(raw_log_csv_path, result_file_path)
195 | 
196 |     if is_get_indicator:
197 |         raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv")
198 |         result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_statistic.csv")
199 |         gold_file_path = raw_log_csv_path
200 |         compare_result_file_path = os.path.join(DATA_DIR_PATH, "解析结果与金标准对比的结果_by_statistic.xlsx")
201 |         log_parser.compare_predict_with_gold(result_file_path, gold_file_path,compare_result_file_path)


--------------------------------------------------------------------------------
/src/tool/read_save_file.py:
--------------------------------------------------------------------------------
 1 | import traceback
 2 | from src.common_config import get_project_dir_path
 3 | 
 4 | import pandas as pd
 5 | def open_excel(input_file_path, sheet_name = "Sheet1", column_2_str = None):
 6 |     if input_file_path.endswith(".xlsx"):
 7 |         if sheet_name == "":
 8 |             if column_2_str is None:
 9 |                 item_name_df = pd.read_excel(input_file_path)
10 |             else:
11 |                 item_name_df = pd.read_excel(input_file_path, converters=column_2_str)
12 |         else:
13 |             if column_2_str is None:
14 |                 item_name_df = pd.read_excel(input_file_path, sheet_name)
15 |             else:
16 |                 item_name_df = pd.read_excel(input_file_path, sheet_name, converters=column_2_str)
17 |                 #item_name_df = pd.read_excel(input_file_path, sheet_name)
18 |     elif input_file_path.endswith(".csv"):
19 |         try:
20 |             item_name_df = pd.read_csv(input_file_path, encoding='utf-8',engine ='python')
21 |         except:
22 |             try:
23 |                 item_name_df = pd.read_csv(input_file_path, encoding='gb2312', engine='python')
24 |             except:
25 |                 traceback.print_exc()
26 |     else:
27 |         print("input file type is not xlsx or csv")
28 |         item_name_df = None
29 |     return item_name_df
30 | 
31 | 
32 | def save_dataframe(dataset_df, file_path, sheet_name ="Sheet1"):
33 |     if file_path.endswith(".xlsx"):
34 |         dataset_df.to_excel(file_path, index=False, sheet_name = sheet_name)
35 |     elif file_path.endswith(".csv"):
36 |         dataset_df.to_csv(file_path, index=False)
37 |     else:
38 |         print("file path is not end with .xlsx or .csv")
39 |         return None
40 |     cur_dir = get_project_dir_path()
41 |     short_path = file_path.replace(cur_dir, "")
42 |     print("finish to save data in {}".format(short_path))
43 | 
44 | def save_to_multi_sheet(file_path, dataframe_sheet_tuples):
45 |     with pd.ExcelWriter(file_path) as writer:
46 |         for dataset_df, sheet_name in dataframe_sheet_tuples:
47 |             dataset_df.to_excel(writer,index=False,  sheet_name=sheet_name)
48 |     print("finish to save result in {}".format(file_path))
49 | 
50 | 
51 | def save_dataframe_by_csv(dataset_df, file_path):
52 |     dataset_df.to_csv(file_path,index=False)
53 |     cur_dir = get_project_dir_path()
54 |     short_path = file_path.replace(cur_dir, "")
55 |     print("finish to save data in {}".format(short_path))


--------------------------------------------------------------------------------
/src/tool/str_related.py:
--------------------------------------------------------------------------------
  1 | import copy, re
  2 | from collections import defaultdict
  3 | 
  4 | from src.common_config import DEFAULT_STR_VALUE
  5 | 
  6 | 
  7 | def process_none_str(input_str):
  8 |     if input_str != input_str:
  9 |         return DEFAULT_STR_VALUE
 10 |     if input_str is None:
 11 |         return DEFAULT_STR_VALUE
 12 |     if input_str == "":
 13 |         return DEFAULT_STR_VALUE
 14 |     return input_str
 15 | 
 16 | def str_normalize(input_str):
 17 |     input_str = process_none_str(input_str)
 18 |     if isinstance(input_str, str):
 19 |         normalize_str = input_str.strip()
 20 |         normalize_str = normalize_str.replace("（", "(").replace("）",")")
 21 |     else:
 22 |         normalize_str = str(input_str)
 23 |     normalize_str = normalize_str.replace("\r", "").replace("\n", "")
 24 |     normalize_str = string_full_to_half(normalize_str)
 25 |     return normalize_str
 26 | 
 27 | 
 28 | def get_tow_set_diff(set_a, set_b):
 29 |     intersection_set = set_a & set_b
 30 |     only_in_a_set = set_a - intersection_set
 31 |     only_in_b_set = set_b - intersection_set
 32 |     is_tow_set_same = False
 33 |     if len(intersection_set) == len(set_a) and len(intersection_set) == len(set_b):
 34 |         is_tow_set_same = True
 35 |     return is_tow_set_same, intersection_set, only_in_a_set, only_in_b_set
 36 | 
 37 | def get_bracket_index(input_str, is_debug = False):
 38 |     """
 39 |     功能：获取括号的索引，按从左到右的顺序
 40 |     @param input_str 输入的字符串，例如"()))","(()", ")()())", "", "(())", "((()())"
 41 |     @return bracket_index_list，以list的形式，范围最长有效括号组合的索引，格式为[(left_bracket_index, right_bracket_index), (left_bracket_index, right_bracket_index),]
 42 |     """
 43 |     if is_debug == True:
 44 |         print("--------input_str = {}".format(input_str))
 45 |     raw_bracket_list = []
 46 |     bracket_index_list = []
 47 |     for bracket_index, temp_char in enumerate(input_str):
 48 |         if temp_char != "(" and temp_char != ")":
 49 |             continue
 50 |         raw_bracket_list.append((temp_char, bracket_index))
 51 | 
 52 |     left_bracket_index_stack = []
 53 |     bracket_count = len(raw_bracket_list)
 54 |     for i in range(bracket_count):
 55 |         (bracket_symbol, bracket_index) = raw_bracket_list[i]
 56 |         if bracket_symbol == "(":
 57 |             left_bracket_index_stack.append(bracket_index)
 58 |         elif bracket_symbol == ")":
 59 |             if len(left_bracket_index_stack) == 0:#如果没有左括号，则当前的有右括号是无效的
 60 |                 continue
 61 |             left_bracket_index = left_bracket_index_stack.pop(-1)
 62 |             bracket_index_list.append((left_bracket_index, bracket_index))
 63 |     bracket_index_list = merge_interval(bracket_index_list)
 64 | 
 65 |     for target_index_pair in bracket_index_list:
 66 |         (left_bracket_index, right_bracket_index) = target_index_pair
 67 |         target_str = input_str[left_bracket_index: right_bracket_index+1]
 68 |         if is_debug == True:
 69 |             print("input_str = {0}, left_bracket_index = {1}, right_bracket_index = {2}, target_str = {3}".format(input_str, left_bracket_index, right_bracket_index, target_str))
 70 |     return bracket_index_list
 71 | 
 72 | def drop_bracket_content(mj_name):
 73 |     bracket_index_list = get_bracket_index(mj_name)
 74 |     right_index = len(mj_name)
 75 |     new_name = mj_name
 76 |     prefix_end_index = -1
 77 |     for temp in bracket_index_list[::-1]:
 78 | 
 79 |         [start_index, end_index] = temp
 80 |         new_name = new_name[:start_index] + " " + new_name[end_index+1:]
 81 |         # end_right_index = end_index + 1
 82 |         # if end_right_index == right_index:
 83 |         #     right_index = start_index
 84 |         #     suffix_bracket_content = mj_name[start_index: end_right_index]
 85 |         #     suffix_bracket_content_list.insert(0, suffix_bracket_content)
 86 |         #     prefix_end_index = start_index
 87 |         # else:
 88 |         #     break
 89 |     return new_name
 90 | 
 91 | def get_bracket_content_prefix(mj_name):
 92 |     """
 93 |     获取括号内容的前缀，括号内容
 94 |     """
 95 |     bracket_content_list = []
 96 |     bracket_index_list = get_bracket_index(mj_name)
 97 |     right_index = len(mj_name)
 98 |     prefix_end_index = -1
 99 |     for temp in bracket_index_list[::-1]:
100 |         [start_index, end_index] = temp
101 |         end_right_index = end_index + 1
102 |         if end_right_index == right_index:
103 |             right_index = start_index
104 |             bracket_content = mj_name[start_index: end_right_index]
105 |             bracket_content_list.insert(0, bracket_content)
106 |             prefix_end_index = start_index
107 |         else:
108 |             break
109 |     if prefix_end_index != -1:
110 |         prefix_content = mj_name[:prefix_end_index]
111 |     else:
112 |         prefix_content = ""
113 |     suffix_bracket_content_join = "".join(bracket_content_list)
114 |     all_bracket_content = suffix_bracket_content_join.replace("(", "").replace(")", "").strip()
115 |     return prefix_content, bracket_content_list, all_bracket_content
116 | 
117 | def get_regexp_match_results(input_name, to_match_regexps ):
118 |     """
119 |     抽取正则匹配的结果
120 |     """
121 |     result_list = to_match_regexps.finditer(input_name)
122 |     match_detail_list = []
123 |     for result_detail in result_list:
124 |         (left_index, right_index) = result_detail.span()
125 |         match_str = result_detail.group()
126 |         match_detail_list.insert(0, [left_index, right_index, match_str])
127 |     return match_detail_list
128 | 
129 | def char_full_2_half(uchar):
130 |     """单个字符 全角转半角"""
131 |     inside_code = ord(uchar)
132 |     if inside_code == 0x3000:
133 |         inside_code = 0x0020
134 |     else:
135 |         inside_code -= 0xfee0
136 |     if inside_code < 0x0020 or inside_code > 0x7e: #转完之后不是半角字符返回原来的字符
137 |         return uchar
138 |     return chr(inside_code)
139 | 
140 | def string_full_to_half(ustring):
141 |     """把字符串全角转半角"""
142 |     return "".join([char_full_2_half(uchar) for uchar in ustring])
143 | 
144 | def get_regexp_match_result(to_match_regexp, temp_str):
145 |     target_list = []
146 |     result_list = to_match_regexp.finditer(temp_str)
147 |     for result_detail in result_list:
148 |         (left_index, right_index) = result_detail.span()
149 |         match_str = result_detail.group()
150 |         target_list.insert(0, [left_index, right_index, match_str])
151 |     return target_list
152 | 
153 | 
154 | 
155 | 
156 | 
157 | if __name__ == "__main__":
158 |     if 0:
159 |         input_name = "★(甲)速效救心丸(50粒*3瓶)"
160 |         drop_bracket_content(input_name)
161 |     if 0:
162 |         ustring = "维生素ｂ１２注射液"
163 |         new_str = string_full_to_half(ustring)
164 |         print(ustring, new_str)
165 | 


--------------------------------------------------------------------------------
/src/tool/tokenizer.py:
--------------------------------------------------------------------------------
  1 | from src.common_config import DATA_DIR_PATH,CHINESE_REGEXP,CONNECTOR_CHAR,\
  2 |     PUNCTUATION_MARK_REGEXP,NONE_CHINESE_REGEXP, CHINESE_SUBSTR_TYPE,SPACE_SUBSTR_TYPE, ENGLISH_SUBSTR_TYPE,\
  3 |     CHINESE_SPACE_CHINESE_PATTERN,PUNCTUATION_MARK_TYPE
  4 | from src.tool.str_related import str_normalize, get_tow_set_diff
  5 | import jieba
  6 | 
  7 | def get_substr_pattern(content):
  8 |     substr_detail_list = []
  9 |     reg_match_list = CHINESE_REGEXP.finditer(content)
 10 | 
 11 |     for match_item in reg_match_list:
 12 |         match_str = match_item.group()
 13 |         (start_index, end_index) = match_item.span()
 14 |         substr_detail_list.append([start_index, end_index, match_str, CHINESE_SUBSTR_TYPE])  # 不包括end_index
 15 | 
 16 |     reg_match_list = PUNCTUATION_MARK_REGEXP.finditer(content)
 17 |     for match_item in reg_match_list:
 18 |         match_str = match_item.group()
 19 |         (start_index, end_index) = match_item.span()
 20 |         substr_detail_list.append([start_index, end_index, match_str, PUNCTUATION_MARK_TYPE])  # 不包括end_index
 21 | 
 22 |     reg_match_list = NONE_CHINESE_REGEXP.finditer(content) #提取非中文的结果
 23 |     for match_item in reg_match_list:
 24 |         match_str = match_item.group()
 25 |         (start_index, end_index) = match_item.span()
 26 |         match_str_strip = match_str.strip()
 27 | 
 28 |         #获取前缀空格
 29 |         match_index = match_str.find(match_str_strip)
 30 |         prefix_space_start_index = start_index
 31 |         prefix_space_end_index = prefix_space_start_index + match_index
 32 |         if prefix_space_start_index != prefix_space_end_index:
 33 |             prefix_space_str = content[prefix_space_start_index:prefix_space_end_index]
 34 |             substr_detail_list.append([prefix_space_start_index, prefix_space_end_index, prefix_space_str,
 35 |                                          SPACE_SUBSTR_TYPE])  # 不包括end_index
 36 | 
 37 |         #获取中间的英文字符串
 38 |         mid_substr_start_index = prefix_space_end_index
 39 |         mid_str_end_index = mid_substr_start_index + len(match_str_strip)
 40 |         if mid_substr_start_index != mid_str_end_index:
 41 |             mid_substr = content[mid_substr_start_index:mid_str_end_index]
 42 |             substr_detail_list.append( [mid_substr_start_index, mid_str_end_index, mid_substr,ENGLISH_SUBSTR_TYPE])  # 不包括end_index
 43 | 
 44 |         #获取结尾的空格
 45 |         suffix_space_start_index = mid_str_end_index
 46 |         suffix_space_end_index = end_index
 47 |         if suffix_space_start_index != suffix_space_end_index:
 48 |             suffix_space_str = content[suffix_space_start_index:suffix_space_end_index]
 49 |             substr_detail_list.append(
 50 |                 [suffix_space_start_index, suffix_space_end_index, suffix_space_str,
 51 |                  SPACE_SUBSTR_TYPE])  # 不包括end_index
 52 | 
 53 |     substr_detail_list.sort(key=lambda x: x[0], reverse=False)
 54 | 
 55 |     substr_type_pattern = CONNECTOR_CHAR.join([item[3] for item in substr_detail_list])
 56 |     # print(substr_detail_list)
 57 |     # print(substr_type_pattern)
 58 |     return substr_detail_list, substr_type_pattern
 59 | 
 60 | def split_substr(substr_detail_list, need_split_substr_type, is_split_by_space):
 61 |     """
 62 | 
 63 |     :param substr_detail_list:
 64 |     :param need_split_substr_type: 表示哪些类型的子串需要被切分
 65 |     :param is_split_by_space: 表示是否以空格的方式来切，如果该值为False，则表示用结巴来切分
 66 |     :return:
 67 |     """
 68 |     split_list = []
 69 |     #split_substr_count = 0
 70 |     for substr_item_detail in substr_detail_list:
 71 |         [start_index, end_index, match_str, substr_type] = substr_item_detail
 72 |         if substr_type == need_split_substr_type:
 73 |             if is_split_by_space:
 74 |                 temp_token_list = match_str.split()
 75 |             else:
 76 |                 temp_token_list = list(jieba.cut(match_str))
 77 | 
 78 |             split_list.extend(temp_token_list)
 79 |         else:
 80 |             if substr_type == SPACE_SUBSTR_TYPE:
 81 |                 continue
 82 |             split_list.append(match_str)
 83 |     return split_list
 84 | 
 85 | def get_token_list(content):
 86 |     content = content.strip()
 87 |     # content = str_normalize(content)
 88 |     substr_detail_list, substr_type_pattern = get_substr_pattern(content)
 89 |     is_contain_chinese = False
 90 |     if substr_type_pattern.find(CHINESE_SUBSTR_TYPE) != -1:  # 如果模式中包含中文
 91 |         is_contain_chinese = True
 92 |     if is_contain_chinese:  # 如果模式中包含中文
 93 |         if substr_type_pattern.find(CHINESE_SPACE_CHINESE_PATTERN) != -1:  # 如果模式中包含中文空格中文，则将中文按空格切分
 94 |             token_list = split_substr(substr_detail_list, CHINESE_SUBSTR_TYPE, is_split_by_space=True)
 95 |         else:  # 情况2，中文与中文之间，没有空格隔开，则针对中文用jieba分词，英文的保持不变
 96 |             token_list = split_substr(substr_detail_list, CHINESE_SUBSTR_TYPE, is_split_by_space=False)
 97 |     else:  # 即模式中不包含中文，则对英文按空格进行切分
 98 |         token_list = split_substr(substr_detail_list, ENGLISH_SUBSTR_TYPE, is_split_by_space=True)
 99 |     return is_contain_chinese, substr_type_pattern, substr_detail_list, token_list
100 | 
101 | 
102 | if __name__ == '__main__':
103 |     content = "今天  456  名。明天"
104 |     get_substr_pattern(content)
105 |     content = "今天  4 56  名"
106 |     get_substr_pattern(content)
107 |     content = "终端服务器安全层在协议流中检测到错误，并已取消客户端连接。 客户端 IP: 192.168.100.132。"
108 |     get_substr_pattern(content)


--------------------------------------------------------------------------------
/src/tool/tool.py:
--------------------------------------------------------------------------------
 1 | import os, json, random, sys
 2 | import requests
 3 | 
 4 | from collections import Counter
 5 | 
 6 | 
 7 | # 日志耗时装饰器
 8 | import time, datetime
 9 | import functools
10 | 
11 | def get_project_dir_path():
12 | 
13 |     # cur_path = os.getcwd()
14 |     # print("get_project_dir_path, cur_path = {}".format(cur_path))
15 |     # project_dir_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
16 |     cur_file_path = os.path.abspath(__file__)
17 |     #print("cur_file_path = {}".format(cur_file_path))
18 |     # cur_dir_path = os.path.dirname(cur_file_path)
19 |     # print("get_project_dir_path, cur_dir_path = {}".format(cur_dir_path))
20 | 
21 |     project_dir_path = os.path.abspath(os.path.join(cur_file_path, "../../.."))
22 |     print("get_project_dir_path, project_dir_path = {}".format(project_dir_path))
23 |     return project_dir_path
24 | 
25 | def merge_interval(interval_list):
26 |     """
27 |     区间合并，参考https://leetcode-cn.com/problems/merge-intervals/
28 |     """
29 |     interval_count = len(interval_list)
30 |     if interval_count <= 1:
31 |         return interval_list
32 | 
33 |     merge_interval_list = []
34 |     start_acsend_intervals = sorted(interval_list, key=lambda x: x[0], reverse=False)
35 |     [prev_interval_start, prev_interval_end] = start_acsend_intervals[0]
36 |     for i in range(1, interval_count):
37 |         [cur_start, cur_end] = start_acsend_intervals[i]
38 | 
39 |         if prev_interval_end < cur_start:  # [[,4],[8,]]
40 |             merge_interval_list.append([prev_interval_start, prev_interval_end])
41 |             prev_interval_start = cur_start
42 |             prev_interval_end = cur_end
43 |         else:  # prev_internal_end >= cur_start #[[1,4],[3,4]]
44 |             prev_interval_end = max(prev_interval_end, cur_end)
45 |     merge_interval_list.append([prev_interval_start, prev_interval_end])
46 |     return merge_interval_list
47 | 
48 | def calculate_normalize_ratio(frequency, frequency_sum):
49 |     """
50 |     计算归一化的比值
51 |     """
52 |     if frequency_sum > 0:
53 |         ratio = (frequency / frequency_sum)
54 |         ratio = format(ratio, '.2f')   # 保留2位小数
55 |     else:
56 |         ratio = "-"
57 |     return ratio


--------------------------------------------------------------------------------
/tests/drain3_test.ini:
--------------------------------------------------------------------------------
 1 | [SNAPSHOT]
 2 | snapshot_interval_minutes = 10
 3 | compress_state = True
 4 | 
 5 | [MASKING]
 6 | masking = [
 7 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"},
 8 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"},
 9 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
10 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
11 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"},
12 |           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"},
13 |           {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"}
14 |           ]
15 | 
16 | [DRAIN]
17 | sim_th = 0.4
18 | depth = 4
19 | max_children = 100
20 | max_clusters = 1024
21 | extra_delimiters = ["_"]
22 | 
23 | [PROFILING]
24 | enabled = True
25 | report_sec = 30
26 | 


--------------------------------------------------------------------------------
/tests/test_drain.py:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: MIT
  2 | 
  3 | import unittest
  4 | 
  5 | from drain3.drain import Drain, LogCluster
  6 | 
  7 | 
  8 | class DrainTest(unittest.TestCase):
  9 | 
 10 |     def test_add_shorter_than_depth_message(self):
 11 |         model = Drain(depth=4)
 12 |         res = model.add_log_message("word")
 13 |         print(res[1])
 14 |         print(res[0])
 15 |         self.assertEqual(res[1], "cluster_created")
 16 | 
 17 |         res = model.add_log_message("word")
 18 |         print(res[1])
 19 |         print(res[0])
 20 |         self.assertEqual(res[1], "none")
 21 | 
 22 |         res = model.add_log_message("otherword")
 23 |         print(res[1])
 24 |         print(res[0])
 25 |         self.assertEqual(res[1], "cluster_created")
 26 | 
 27 |         self.assertEqual(2, len(model.id_to_cluster))
 28 | 
 29 |     def test_add_log_message(self):
 30 |         model = Drain()
 31 |         entries = str.splitlines(
 32 |             """
 33 |             Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]
 34 |             Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth]
 35 |             Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2
 36 |             Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2
 37 |             Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2
 38 |             Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth]
 39 |             """
 40 |         )
 41 |         expected = str.splitlines(
 42 |             """
 43 |             Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]
 44 |             Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth]
 45 |             Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2
 46 |             Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2
 47 |             Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2
 48 |             Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth]
 49 |             """
 50 |         )
 51 |         actual = []
 52 | 
 53 |         for entry in entries:
 54 |             cluster, change_type = model.add_log_message(entry)
 55 |             actual.append(cluster.get_template())
 56 | 
 57 |         self.assertListEqual(list(map(str.strip, expected)), actual)
 58 |         self.assertEqual(8, model.get_total_cluster_size())
 59 | 
 60 |     def test_add_log_message_sim_75(self):
 61 |         """When `sim_th` is set to 75% then only certain log entries match.
 62 | 
 63 |         In this test similarity threshold is set to 75% which makes the model
 64 |         less aggressive in grouping entries into clusters. In particular, it
 65 |         only finds clusters for "Failed password" entries.
 66 |         """
 67 |         model = Drain(
 68 |             depth=4,
 69 |             sim_th=0.75,
 70 |             max_children=100,
 71 |         )
 72 |         entries = str.splitlines(
 73 |             """
 74 |             Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]
 75 |             Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth]
 76 |             Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2
 77 |             Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2
 78 |             Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2
 79 |             Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth]
 80 |             """
 81 |         )
 82 |         expected = str.splitlines(
 83 |             """
 84 |             Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]
 85 |             Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth]
 86 |             Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2
 87 |             Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2
 88 |             Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2
 89 |             Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth]
 90 |             """
 91 |         )
 92 |         actual = []
 93 | 
 94 |         for entry in entries:
 95 |             cluster, change_type = model.add_log_message(entry)
 96 |             actual.append(cluster.get_template())
 97 | 
 98 |         self.assertListEqual(list(map(str.strip, expected)), actual)
 99 |         self.assertEqual(8, model.get_total_cluster_size())
100 | 
101 |     def test_max_clusters(self):
102 |         """Verify model respects the max_clusters option.
103 |         
104 |         Key difference between this and other tests is that with `max_clusters`
105 |         set to 1 model is capable of keeping track of a single cluster at a
106 |         time. Consequently, when log stream switched form the A format to the B
107 |         and back model doesn't recognize it and returnes a new template with no
108 |         slots.
109 |         """
110 |         model = Drain(max_clusters=1)
111 |         entries = str.splitlines(
112 |             """
113 |             A format 1
114 |             A format 2
115 |             B format 1
116 |             B format 2
117 |             A format 3
118 |             """
119 |         )
120 |         expected = str.splitlines(
121 |             """
122 |             A format 1
123 |             A format <*>
124 |             B format 1
125 |             B format <*>
126 |             A format 3
127 |             """
128 |         )
129 |         actual = []
130 | 
131 |         for entry in entries:
132 |             cluster, change_type = model.add_log_message(entry)
133 |             actual.append(cluster.get_template())
134 | 
135 |         self.assertListEqual(list(map(str.strip, expected)), actual)
136 |         self.assertEqual(1, model.get_total_cluster_size())
137 | 
138 |     def test_max_clusters_lru_multiple_leaf_nodes(self):
139 |         """When all templates end up in different nodes and the max number of
140 |         clusters is reached, then clusters are removed according to the lru
141 |         policy.
142 |         """
143 |         model = Drain(max_clusters=2, depth=4, param_str="*")
144 |         entries = [
145 |             "A A A",
146 |             "A A B",
147 |             "B A A",
148 |             "B A B",
149 |             "C A A",
150 |             "C A B",
151 |             "B A A",
152 |             "A A A",
153 |         ]
154 |         expected = [
155 |             # lru: []
156 |             "A A A",
157 |             # lru: ["A A A"]
158 |             "A A *",
159 |             # lru: ["A A *"]
160 |             "B A A",
161 |             # lru: ["B A A", "A A *"]
162 |             "B A *",
163 |             # lru: ["B A *", "A A *"]
164 |             "C A A",
165 |             # lru: ["C A A", "B A *"]
166 |             "C A *",
167 |             # lru: ["C A *", "B A *"]
168 |             "B A *",
169 |             # Message "B A A" was normalized because the template "B A *" is
170 |             # still present in the cache.
171 |             # lru: ["B A *", "C A *"]
172 |             "A A A",
173 |             # Message "A A A" was not normalized because the template "C A A"
174 |             # pushed out the template "A A *" from the cache.
175 |             # lru: ["A A A", "C A *"]
176 |         ]
177 |         actual = []
178 | 
179 |         for entry in entries:
180 |             cluster, _ = model.add_log_message(entry)
181 |             actual.append(cluster.get_template())
182 | 
183 |         self.assertListEqual(list(map(str.strip, expected)), actual)
184 |         self.assertEqual(4, model.get_total_cluster_size())
185 | 
186 |     def test_max_clusters_lru_single_leaf_node(self):
187 |         """When all templates end up in the same leaf node and the max number of
188 |         clusters is reached, then clusters are removed according to the lru
189 |         policy.
190 |         """
191 |         model = Drain(max_clusters=2, depth=4, param_str="*")
192 |         entries = [
193 |             "A A A",
194 |             "A A B",
195 |             "A B A",
196 |             "A B B",
197 |             "A C A",
198 |             "A C B",
199 |             "A B A",
200 |             "A A A",
201 |         ]
202 |         expected = [
203 |             # lru: []
204 |             "A A A",
205 |             # lru: ["A A A"]
206 |             "A A *",
207 |             # lru: ["A A *"]
208 |             "A B A",
209 |             # lru: ["B A A", "A A *"]
210 |             "A B *",
211 |             # lru: ["B A *", "A A *"]
212 |             "A C A",
213 |             # lru: ["C A A", "B A *"]
214 |             "A C *",
215 |             # lru: ["C A *", "B A *"]
216 |             "A B *",
217 |             # Message "B A A" was normalized because the template "B A *" is
218 |             # still present in the cache.
219 |             # lru: ["B A *", "C A *"]
220 |             "A A A",
221 |             # Message "A A A" was not normalized because the template "C A A"
222 |             # pushed out the template "A A *" from the cache.
223 |             # lru: ["A A A", "C A *"]
224 |         ]
225 |         actual = []
226 | 
227 |         for entry in entries:
228 |             cluster, _ = model.add_log_message(entry)
229 |             actual.append(cluster.get_template())
230 | 
231 |         self.assertListEqual(list(map(str.strip, expected)), actual)
232 |         # self.assertEqual(5, model.get_total_cluster_size())
233 | 
234 |     def test_match_only(self):
235 |         model = Drain()
236 |         res = model.add_log_message("aa aa aa")
237 |         print(res[0])
238 | 
239 |         res = model.add_log_message("aa aa bb")
240 |         print(res[0])
241 | 
242 |         res = model.add_log_message("aa aa cc")
243 |         print(res[0])
244 | 
245 |         res = model.add_log_message("xx yy zz")
246 |         print(res[0])
247 | 
248 |         c: LogCluster = model.match("aa aa tt")
249 |         self.assertEqual(1, c.cluster_id)
250 | 
251 |         c: LogCluster = model.match("xx yy zz")
252 |         self.assertEqual(2, c.cluster_id)
253 | 
254 |         c: LogCluster = model.match("xx yy rr")
255 |         self.assertIsNone(c)
256 | 
257 |         c: LogCluster = model.match("nothing")
258 |         self.assertIsNone(c)
259 | 
260 | 


--------------------------------------------------------------------------------
/tests/test_masking.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | 
 3 | import unittest
 4 | 
 5 | from drain3.masking import MaskingInstruction, LogMasker
 6 | 
 7 | 
 8 | class MaskingTest(unittest.TestCase):
 9 | 
10 |     def test_instructions_by_mask_name(self):
11 |         instructions = []
12 |         a = MaskingInstruction(r"a", "1")
13 |         instructions.append(a)
14 |         b = MaskingInstruction(r"b", "1")
15 |         instructions.append(b)
16 |         c = MaskingInstruction(r"c", "2")
17 |         instructions.append(c)
18 |         d = MaskingInstruction(r"d", "3")
19 |         instructions.append(d)
20 |         x = MaskingInstruction(r"x", "something else")
21 |         instructions.append(x)
22 |         y = MaskingInstruction(r"y", "something else")
23 |         instructions.append(y)
24 |         masker = LogMasker(instructions, "", "")
25 |         self.assertCountEqual(["1", "2", "3", "something else"], masker.mask_names)
26 |         self.assertCountEqual([a, b], masker.instructions_by_mask_name("1"))
27 |         self.assertCountEqual([c], masker.instructions_by_mask_name("2"))
28 |         self.assertCountEqual([d], masker.instructions_by_mask_name("3"))
29 |         self.assertCountEqual([x, y], masker.instructions_by_mask_name("something else"))
30 | 
31 |     def test_mask(self):
32 |         s = "D9 test 999 888 1A ccc 3"
33 |         mi = MaskingInstruction(r"((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)", "NUM")
34 |         masker = LogMasker([mi], "<!", "!>")
35 |         masked = masker.mask(s)
36 |         self.assertEqual("D9 test <!NUM!> <!NUM!> 1A ccc <!NUM!>", masked)
37 | 


--------------------------------------------------------------------------------
/tests/test_template_miner.py:
--------------------------------------------------------------------------------
  1 | # SPDX-License-Identifier: MIT
  2 | 
  3 | import io
  4 | import logging
  5 | import sys
  6 | import unittest
  7 | from os.path import dirname
  8 | 
  9 | from drain3 import TemplateMiner
 10 | from drain3.masking import MaskingInstruction
 11 | from drain3.memory_buffer_persistence import MemoryBufferPersistence
 12 | from drain3.template_miner_config import TemplateMinerConfig
 13 | 
 14 | 
 15 | class TemplateMinerTest(unittest.TestCase):
 16 |     logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')
 17 | 
 18 |     def test_load_config(self):
 19 |         config = TemplateMinerConfig()
 20 |         config.load(dirname(__file__) + "/drain3_test.ini")
 21 |         self.assertEqual(1024, config.drain_max_clusters)
 22 |         self.assertListEqual(["_"], config.drain_extra_delimiters)
 23 |         self.assertEqual(7, len(config.masking_instructions))
 24 | 
 25 |     def test_save_load_snapshot_unlimited_clusters(self):
 26 |         self.save_load_snapshot(None)
 27 | 
 28 |     def test_save_load_snapshot_limited_clusters(self):
 29 |         self.save_load_snapshot(10)
 30 | 
 31 |     def save_load_snapshot(self, max_clusters):
 32 |         persistence = MemoryBufferPersistence()
 33 | 
 34 |         config = TemplateMinerConfig()
 35 |         config.drain_max_clusters = max_clusters
 36 |         template_miner1 = TemplateMiner(persistence, config)
 37 |         print(template_miner1.add_log_message("hello"))
 38 |         print(template_miner1.add_log_message("hello ABC"))
 39 |         print(template_miner1.add_log_message("hello BCD"))
 40 |         print(template_miner1.add_log_message("hello XYZ"))
 41 |         print(template_miner1.add_log_message("goodbye XYZ"))
 42 | 
 43 |         template_miner2 = TemplateMiner(persistence, config)
 44 | 
 45 |         self.assertListEqual(list(template_miner1.drain.id_to_cluster.keys()),
 46 |                              list(template_miner2.drain.id_to_cluster.keys()))
 47 | 
 48 |         self.assertListEqual(list(template_miner1.drain.root_node.key_to_child_node.keys()),
 49 |                              list(template_miner2.drain.root_node.key_to_child_node.keys()))
 50 | 
 51 |         def get_tree_lines(template_miner):
 52 |             sio = io.StringIO()
 53 |             template_miner.drain.print_tree(sio)
 54 |             sio.seek(0)
 55 |             return sio.readlines()
 56 | 
 57 |         self.assertListEqual(get_tree_lines(template_miner1),
 58 |                              get_tree_lines(template_miner2))
 59 | 
 60 |         print(template_miner2.add_log_message("hello yyy"))
 61 |         print(template_miner2.add_log_message("goodbye ABC"))
 62 | 
 63 |     def test_extract_parameters(self):
 64 |         config = TemplateMinerConfig()
 65 |         mi = MaskingInstruction("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
 66 |         config.masking_instructions.append(mi)
 67 |         mi = MaskingInstruction(r"multiple words", "WORDS")
 68 |         config.masking_instructions.append(mi)
 69 |         config.mask_prefix = "[:"
 70 |         config.mask_suffix = ":]"
 71 |         template_miner = TemplateMiner(None, config)
 72 | 
 73 |         def add_and_test(msg, expected_params, exact_matching=False):
 74 |             print(f"msg: {msg}")
 75 |             res = template_miner.add_log_message(msg)
 76 |             print(f"result: {res}")
 77 |             extracted_parameters = template_miner.extract_parameters(
 78 |                 res["template_mined"], msg, exact_matching=exact_matching)
 79 |             self.assertIsNotNone(extracted_parameters)
 80 |             params = [parameter.value for parameter in extracted_parameters]
 81 |             print(f"params: {params}")
 82 |             self.assertListEqual(params, expected_params)
 83 | 
 84 |         add_and_test("hello", [])
 85 |         add_and_test("hello ABC", [])
 86 |         add_and_test("hello BCD", ["BCD"])
 87 |         add_and_test("hello    BCD", ["BCD"])
 88 |         add_and_test("hello\tBCD", ["BCD"])
 89 |         add_and_test("request took 123 ms", ["123"])
 90 |         add_and_test("file saved [test.xml]", [])
 91 |         add_and_test("new order received: [:xyz:]", [])
 92 |         add_and_test("order type: new, order priority:3", ["3"])
 93 |         add_and_test("order type: changed, order priority:5", ["changed,", "5"])
 94 |         add_and_test("sometimes one needs multiple words", ["multiple words"], True)
 95 |         add_and_test("sometimes one needs not", ["not"], True)
 96 |         add_and_test("sometimes one needs multiple words", ["multiple words"], True)
 97 | 
 98 |     def test_extract_parameters_direct(self):
 99 |         config = TemplateMinerConfig()
100 |         mi = MaskingInstruction(r"hdfs://[\w.:@-]*((/[\w.~%+-]+)+/?)?", "hdfs_uri")
101 |         config.masking_instructions.append(mi)
102 |         mi = MaskingInstruction(r"(?P<quote>[\"'`]).*?(?P=quote)", "quoted_string")
103 |         config.masking_instructions.append(mi)
104 |         mi = MaskingInstruction(r"((?P<p_0>[*_])\2{0,2}).*?\1", "markdown_emph")
105 |         config.masking_instructions.append(mi)
106 |         mi = MaskingInstruction(r"multiple \*word\* pattern", "*words*")
107 |         config.masking_instructions.append(mi)
108 |         mi = MaskingInstruction(r"some \S+ \S+ pattern", "*words*")
109 |         config.masking_instructions.append(mi)
110 |         mi = MaskingInstruction(r"(\d{1,3}\.){3}\d{1,3}", "ip")
111 |         config.masking_instructions.append(mi)
112 |         mi = MaskingInstruction(r"(?P<number>\d+)\.\d+", "float")
113 |         config.masking_instructions.append(mi)
114 |         mi = MaskingInstruction(r"0[xX][a-fA-F0-9]+", "integer")
115 |         config.masking_instructions.append(mi)
116 |         mi = MaskingInstruction(r"(?P<number>\d+)", "integer")
117 |         config.masking_instructions.append(mi)
118 |         mi = MaskingInstruction(r"HelloWorld", "*")
119 |         config.masking_instructions.append(mi)
120 |         mi = MaskingInstruction(r"MaskPrefix", "<")
121 |         config.masking_instructions.append(mi)
122 |         template_miner = TemplateMiner(None, config)
123 | 
124 |         test_vectors = [
125 |             (
126 |                 "<hdfs_uri>:<integer>+<integer>",
127 |                 "hdfs://msra-sa-41:9000/pageinput2.txt:671088640+134217728",
128 |                 ["hdfs://msra-sa-41:9000/pageinput2.txt", "671088640", "134217728"],
129 |                 ["hdfs_uri", "integer", "integer"]
130 |             ),
131 |             (
132 |                 "Hello <quoted_string>",
133 |                 "Hello 'World'",
134 |                 ["'World'"],
135 |                 ["quoted_string"]
136 |             ),
137 |             (
138 |                 "<quoted_string><quoted_string>",
139 |                 """'This "should"'`do no breakin'`""",
140 |                 ["""'This "should"'""", "`do no breakin'`"],
141 |                 ["quoted_string", "quoted_string"]
142 |             ),
143 |             (
144 |                 "This is <markdown_emph> <markdown_emph>!.",
145 |                 "This is ___very___ *important*!.",
146 |                 ["___very___", "*important*"],
147 |                 ["markdown_emph", "markdown_emph"]
148 |             ),
149 |             (
150 |                 "<float>.<*>",
151 |                 "0.15.Test",
152 |                 ["0.15", "Test"],
153 |                 ["float", "*"]
154 |             ),
155 |             (
156 |                 "<ip>:<integer>",
157 |                 "192.0.0.1:5000",
158 |                 ["192.0.0.1", "5000"],
159 |                 ["ip", "integer"]
160 |             ),
161 |             (
162 |                 "<ip>:<integer>:<integer>",
163 |                 "192.0.0.1:5000:123",
164 |                 ["192.0.0.1", "5000", "123"],
165 |                 ["ip", "integer", "integer"]
166 |             ),
167 |             (
168 |                 "<float>.<*>.<float>",
169 |                 "0.15.Test.0.2",
170 |                 ["0.15", "Test", "0.2"],
171 |                 ["float", "*", "float"]
172 |             ),
173 |             (
174 |                 "<float> <float>",
175 |                 "0.15 10.16",
176 |                 ["0.15", "10.16"],
177 |                 ["float", "float"]
178 |             ),
179 |             (
180 |                 "<*words*>@<integer>",
181 |                 "some other cool pattern@0xe1f",
182 |                 ["some other cool pattern", "0xe1f"],
183 |                 ["*words*", "integer"]
184 |             ),
185 |             (
186 |                 "Another test with <*words*> that includes <integer><integer> and <integer> <*> <integer>",
187 |                 "Another test with some other 0Xadded pattern that includes 500xc0ffee and 0X4 times 5",
188 |                 ["some other 0Xadded pattern", "50", "0xc0ffee", "0X4", "times", "5"],
189 |                 ["*words*", "integer", "integer", "integer", "*", "integer"]
190 |             ),
191 |             (
192 |                 "some <*words*> <*words*>",
193 |                 "some multiple *word* pattern some confusing *word* pattern",
194 |                 ["multiple *word* pattern", "some confusing *word* pattern"],
195 |                 ["*words*", "*words*"]
196 |             ),
197 |             (
198 |                 "<*words*> <*>",
199 |                 "multiple *word* pattern <*words*>",
200 |                 ["multiple *word* pattern", "<*words*>"],
201 |                 ["*words*", "*"]
202 |             ),
203 |             (
204 |                 "<*> <*>",
205 |                 "HelloWorld Test",
206 |                 ["HelloWorld", "Test"],
207 |                 ["*", "*"]
208 |             ),
209 |             (
210 |                 "<*> <*>",
211 |                 "HelloWorld <anything>",
212 |                 ["HelloWorld", "<anything>"],
213 |                 ["*", "*"]
214 |             ),
215 |             (
216 |                 "<*><integer>",
217 |                 "HelloWorld1",
218 |                 ["HelloWorld", "1"],
219 |                 ["*", "integer"]
220 |             ),
221 |             (
222 |                 "<*> works <*>",
223 |                 "This works as-expected",
224 |                 ["This", "as-expected"],
225 |                 ["*", "*"]
226 |             ),
227 |             (
228 |                 "<memory:<integer>>",
229 |                 "<memory:8>",
230 |                 ["8"],
231 |                 ["integer"]
232 |             ),
233 |             (
234 |                 "<memory:<integer> <core:<float>>>",
235 |                 "<memory:8 <core:0.5>>",
236 |                 ["8", "0.5"],
237 |                 ["integer", "float"]
238 |             ),
239 |             (
240 |                 "<*> <memory:<<integer> <core:<float>>>",
241 |                 "New: <memory:<8 <core:0.5>>",
242 |                 ["New:", "8", "0.5"],
243 |                 ["*", "integer", "float"]
244 |             ),
245 |             (
246 |                 "<<>",
247 |                 "MaskPrefix",
248 |                 ["MaskPrefix"],
249 |                 ["<"]
250 |             ),
251 |             (
252 |                 "<<<>>",
253 |                 "<MaskPrefix>",
254 |                 ["MaskPrefix"],
255 |                 ["<"]
256 |             ),
257 |             (
258 |                 "There are no parameters here.",
259 |                 "There are no parameters here.",
260 |                 [],
261 |                 []
262 |             ),
263 |             (
264 |                 "<float> <float>",
265 |                 "0.15 10.16 3.19",
266 |                 None,
267 |                 None
268 |             ),
269 |             (
270 |                 "<float> <float>",
271 |                 "0.15 10.16 test 3.19",
272 |                 None,
273 |                 None
274 |             ),
275 |             (
276 |                 "<memory:<<integer> <core:<float>>>",
277 |                 "<memory:8 <core:0.5>>",
278 |                 None,
279 |                 None
280 |             ),
281 |             (
282 |                 "<<>",
283 |                 "<<>",
284 |                 None,
285 |                 None
286 |             ),
287 |             (
288 |                 "<*words*> <*words*>",
289 |                 "0.15 0.15",
290 |                 None,
291 |                 None
292 |             ),
293 |         ]
294 | 
295 |         for template, content, expected_parameters, expected_mask_names in test_vectors:
296 |             with self.subTest(template=template, content=content, expected_parameters=expected_parameters):
297 |                 extracted_parameters = template_miner.extract_parameters(template, content, exact_matching=True)
298 |                 if expected_parameters is None:
299 |                     self.assertIsNone(extracted_parameters)
300 |                 else:
301 |                     self.assertIsNotNone(extracted_parameters)
302 |                     self.assertListEqual([parameter.value for parameter in extracted_parameters],
303 |                                          expected_parameters)
304 |                     self.assertListEqual([parameter.mask_name for parameter in extracted_parameters],
305 |                                          expected_mask_names)
306 | 
307 |     def test_match_only(self):
308 |         config = TemplateMinerConfig()
309 |         config.drain_extra_delimiters = ["_"]
310 |         mi = MaskingInstruction("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
311 |         config.masking_instructions.append(mi)
312 |         tm = TemplateMiner(None, config)
313 | 
314 |         res = tm.add_log_message("aa aa aa")
315 |         print(res)
316 | 
317 |         res = tm.add_log_message("aa aa bb")
318 |         print(res)
319 | 
320 |         res = tm.add_log_message("xx yy zz")
321 |         print(res)
322 | 
323 |         res = tm.add_log_message("rrr qqq 123")
324 |         print(res)
325 | 
326 |         c = tm.match("aa   aa tt")
327 |         self.assertEqual(1, c.cluster_id)
328 | 
329 |         c = tm.match("aa aa 12")
330 |         self.assertEqual(1, c.cluster_id)
331 | 
332 |         c = tm.match("xx yy   zz")
333 |         self.assertEqual(2, c.cluster_id)
334 | 
335 |         c = tm.match("xx yy rr")
336 |         self.assertIsNone(c)
337 | 
338 |         c = tm.match("nothing")
339 |         self.assertIsNone(c)
340 | 
341 |         c = tm.match("rrr qqq   456   ")
342 |         self.assertEqual(3, c.cluster_id)
343 | 
344 |         c = tm.match("rrr qqq 555.2")
345 |         self.assertIsNone(c)
346 | 
347 |         c = tm.match("rrr qqq num")
348 |         self.assertIsNone(c)
349 | 
350 |     def test_match_strategies(self):
351 |         miner = TemplateMiner()
352 |         print(miner.add_log_message("training4Model start"))
353 |         print(miner.add_log_message("loadModel start"))
354 |         print(miner.add_log_message("loadModel stop"))
355 |         print(miner.add_log_message("this is a test"))
356 |         miner.drain.print_tree()
357 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback"))
358 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always"))
359 |         self.assertIsNone(miner.match("loadModel start", full_search_strategy="never"))
360 |         print(miner.add_log_message("loadModel start"))
361 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback"))
362 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always"))
363 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="never"))
364 | 
365 |         config = TemplateMinerConfig()
366 |         config.parametrize_numeric_tokens = False
367 |         miner = TemplateMiner(config=config)
368 |         print(miner.add_log_message("training4Model start"))
369 |         print(miner.add_log_message("loadModel start"))
370 |         print(miner.add_log_message("loadModel stop"))
371 |         print(miner.add_log_message("this is a test"))
372 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback"))
373 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always"))
374 |         self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="never"))
375 | 
376 |         self.assertIsNone(miner.match("", full_search_strategy="never"))
377 |         self.assertIsNone(miner.match("", full_search_strategy="always"))
378 |         self.assertIsNone(miner.match("", full_search_strategy="fallback"))
379 | 
380 |         print(miner.add_log_message(""))
381 |         self.assertIsNotNone(miner.match("", full_search_strategy="never"))
382 |         self.assertIsNotNone(miner.match("", full_search_strategy="always"))
383 |         self.assertIsNotNone(miner.match("", full_search_strategy="fallback"))
384 | 


--------------------------------------------------------------------------------
/日志解析_项目介绍.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongdong9/log_parser/7c6bfa9a3d748a7f9eef93078940c7ce2c5446de/日志解析_项目介绍.docx


--------------------------------------------------------------------------------