├── .gitignore ├── README.md ├── config_ini ├── drain3.ini └── drain3_state.bin ├── data ├── chinese_english_logs.csv ├── chinese_english_logs_parse_by_drain3.csv ├── chinese_english_logs_parse_by_statistic.csv ├── english_logs.csv ├── english_logs_parse_by_drain3.csv ├── english_logs_parse_by_statistic.csv ├── 解析结果与金标准对比的结果_by_drain3.xlsx └── 解析结果与金标准对比的结果_by_statistic.xlsx ├── drain3 ├── __init__.py ├── drain.py ├── file_persistence.py ├── kafka_persistence.py ├── masking.py ├── memory_buffer_persistence.py ├── persistence_handler.py ├── redis_persistence.py ├── simple_profiler.py ├── template_miner.py └── template_miner_config.py ├── requirements.txt ├── src ├── common_config.py ├── drain3_examples │ ├── drain_bigfile_demo.py │ └── drain_stdin_demo.py ├── log_parser_by_drain3.py ├── log_parser_by_statistic.py └── tool │ ├── read_save_file.py │ ├── str_related.py │ ├── tokenizer.py │ └── tool.py ├── tests ├── drain3_test.ini ├── test_drain.py ├── test_masking.py └── test_template_miner.py └── 日志解析_项目介绍.docx /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | src/__pycache__/* 3 | .idea/ 4 | src/drain3_examples/SSH.log 5 | src/drain3_examples/SSH.tar.gz 6 | config_ini/drain3_state.bin -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # log_parser 2 | ## Features 3 | This is a log parsing project. It parses three kinds of logs (Chinese, English, and mixed Chinese-English) and produces, for each log line, its template, its parameters, and the number of times that template has occurred. 4 | 5 | Logs are parsed in a streaming fashion, processing 4k+ log lines per second. 6 | 7 | ## Environment 8 | python3.7 9 | 10 | ## How to run 11 | For Chinese and mixed Chinese-English logs, a statistics-based method is used for parsing; run src/log_parser_by_statistic.py directly. 12 | 13 | For all three log types (Chinese, English, and mixed Chinese-English), the Drain3-based method is used; run src/log_parser_by_drain3.py directly. 14 | 15 | src/drain3_examples/drain_stdin_demo.py learns log templates from the input logs and uses the learned templates to parse log lines entered in real time, extracting the parameters of each line. 16 | 17 | ## More about this project 18 | See the Zhihu article《使用改进后的Drain3进行中英文日志解析》(Parsing Chinese and English logs with an improved Drain3): https://zhuanlan.zhihu.com/p/569437314 19 | 20 | 21 | -------------------------------------------------------------------------------- /config_ini/drain3.ini: -------------------------------------------------------------------------------- 1 | [SNAPSHOT] 2 | snapshot_interval_minutes = 10 3 | compress_state = True 4 | 5 | [MASKING] 6 | masking = [ 7 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, 8 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, 9 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 10 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 11 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, 12 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"}, 13 | {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"} 14 | ] 15 | ;mask_prefix = <: 16 | ;mask_suffix = :> 17 | mask_prefix = << 18 | mask_suffix = >> 19 | 20 | [DRAIN] 21 | sim_th = 0.4 22 | depth = 4 23 | max_children = 100 24 | max_clusters = 1024 25 | extra_delimiters = ["_"] 26 | 27 | [PROFILING] 28 | enabled = True 29 | report_sec = 30 30 | -------------------------------------------------------------------------------- /config_ini/drain3_state.bin: -------------------------------------------------------------------------------- 1 | 
(binary content omitted: config_ini/drain3_state.bin is a base64-encoded, zlib-compressed Drain3 snapshot written by FilePersistence; it is regenerated at runtime and is listed in .gitignore) --------------------------------------------------------------------------------
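The following is a minimal usage sketch, not a file from the repository: it ties the README and config_ini/drain3.ini above together, assumes it is run from the project root so that the bundled drain3 package (which imports from src/) is importable, and uses hypothetical sample log lines. TemplateMinerConfig loads the masking rules and Drain parameters from the ini file, FilePersistence stores snapshots in the state file shown above, and each call to TemplateMiner.add_log_message() returns the mined cluster for that line.

from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig

# Load masking rules, sim_th, depth and snapshot settings from config_ini/drain3.ini.
config = TemplateMinerConfig()
config.load("config_ini/drain3.ini")

# Snapshots of the mined templates are written to the (git-ignored) state file.
persistence = FilePersistence("config_ini/drain3_state.bin")
template_miner = TemplateMiner(persistence_handler=persistence, config=config)

sample_logs = [                       # hypothetical log lines
    "connected to 10.0.0.1",
    "connected to 10.0.0.2",
    "user root logged in",
]
for line in sample_logs:
    result = template_miner.add_log_message(line)
    print(result["change_type"])      # "cluster_created", "cluster_template_changed" or "none"

# Each LogCluster keeps its template and how many log lines it has matched so far;
# with the IP masking rule above, both "connected to ..." lines should end up in one cluster.
for cluster in template_miner.drain.clusters:
    print(cluster.cluster_id, cluster.size, cluster.get_template())

--------------------------------------------------------------------------------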
/data/解析结果与金标准对比的结果_by_drain3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongdong9/log_parser/7c6bfa9a3d748a7f9eef93078940c7ce2c5446de/data/解析结果与金标准对比的结果_by_drain3.xlsx -------------------------------------------------------------------------------- /data/解析结果与金标准对比的结果_by_statistic.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongdong9/log_parser/7c6bfa9a3d748a7f9eef93078940c7ce2c5446de/data/解析结果与金标准对比的结果_by_statistic.xlsx -------------------------------------------------------------------------------- /drain3/__init__.py: -------------------------------------------------------------------------------- 1 | from drain3.template_miner import TemplateMiner 2 | 3 | -------------------------------------------------------------------------------- /drain3/drain.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # This file implements the Drain algorithm for log parsing. 3 | # Based on https://github.com/logpai/logparser/blob/master/logparser/Drain/Drain.py by LogPAI team 4 | 5 | from typing import List, Dict, Sequence 6 | 7 | from cachetools import LRUCache, Cache 8 | 9 | from drain3.simple_profiler import Profiler, NullProfiler 10 | from src.tool.tokenizer import get_token_list 11 | from src.common_config import IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, SUBSTR_DETAIL_LIST_KEY, TOKEN_LIST_KEY 12 | 13 | class LogCluster: 14 | __slots__ = ["log_template_tokens", "cluster_id", "size"] 15 | 16 | def __init__(self, log_template_tokens: list, cluster_id: int): 17 | """ 18 | yd。功能: 19 | :param log_template_tokens: 即经过分词后的token_list 20 | :param cluster_id: 21 | """ 22 | self.log_template_tokens = tuple(log_template_tokens) 23 | self.cluster_id = cluster_id 24 | self.size = 1 #yd。用于统计当前cluster匹配的日志条数 25 | 26 | def get_template(self): 27 | return ' '.join(self.log_template_tokens) 28 | 29 | def __str__(self): 30 | return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}" 31 | 32 | 33 | class LogClusterCache(LRUCache): 34 | """ 35 | Least Recently Used (LRU) cache which allows callers to conditionally skip 36 | cache eviction algorithm when accessing elements. 37 | """ 38 | 39 | def __missing__(self, key): 40 | return None 41 | 42 | def get(self, key): 43 | """ 44 | Returns the value of the item with the specified key without updating 45 | the cache eviction algorithm. 46 | """ 47 | return Cache.__getitem__(self, key) 48 | 49 | 50 | class Node: 51 | __slots__ = ["key_to_child_node", "cluster_ids"] 52 | 53 | def __init__(self): 54 | # yd。key_to_child_node这个字典在root_node这一层的格式为{str(token_count): Node() } 55 | # 在子节点这一层的格式为{ token/self.param: Node() } 56 | self.key_to_child_node: Dict[str, Node] = {} 57 | self.cluster_ids: List[int] = [] 58 | 59 | 60 | class Drain: 61 | def __init__(self, 62 | depth=4, 63 | sim_th=0.4, 64 | max_children=100, 65 | max_clusters=None, 66 | extra_delimiters=(), 67 | profiler: Profiler = NullProfiler(), 68 | param_str="<*>", 69 | parametrize_numeric_tokens=True): 70 | """ 71 | Create a new Drain instance. 72 | 73 | :param depth: max depth levels of log clusters. Minimum is 2. 74 | For example, for depth==4, Root is considered depth level 1. 75 | Token count is considered depth level 2. 76 | First log token is considered depth level 3. 77 | Log clusters below first token node are considered depth level 4. 
78 | :param sim_th: similarity threshold - if percentage of similar tokens for a log message is below this 79 | number, a new log cluster will be created. 80 | :param max_children: max number of children of an internal node 81 | :param max_clusters: max number of tracked clusters (unlimited by default). 82 | When this number is reached, model starts replacing old clusters 83 | with a new ones according to the LRU policy. 84 | :param extra_delimiters: delimiters to apply when splitting log message into words (in addition to whitespace). 85 | :param parametrize_numeric_tokens: whether to treat tokens that contains at least one digit 86 | as template parameters. 87 | """ 88 | if depth < 3: 89 | raise ValueError("depth argument must be at least 3") 90 | 91 | self.log_cluster_depth = depth 92 | self.max_node_depth = depth - 2 # max depth of a prefix tree node, starting from zero 93 | self.sim_th = sim_th #yd。similarity threshold 94 | self.max_children = max_children 95 | self.root_node = Node() 96 | self.profiler = profiler 97 | self.extra_delimiters = extra_delimiters 98 | self.max_clusters = max_clusters 99 | self.param_str = param_str 100 | self.parametrize_numeric_tokens = parametrize_numeric_tokens 101 | 102 | # key: int, value: LogCluster 103 | self.id_to_cluster = {} if max_clusters is None else LogClusterCache(maxsize=max_clusters) 104 | self.clusters_counter = 0 105 | 106 | @property 107 | def clusters(self): 108 | return self.id_to_cluster.values() 109 | 110 | @staticmethod 111 | def has_numbers(s): 112 | """ 113 | yd。功能:判断字符串s是否包含任何数字 114 | :param s: 115 | :return: 116 | """ 117 | return any(char.isdigit() for char in s) 118 | 119 | def tree_search(self, root_node: Node, tokens: list, sim_th: float, include_params: bool): 120 | """ 121 | yd。功能: 122 | :param root_node: 123 | :param tokens: 将日志内容进行分词后的token_list 124 | :param sim_th: 即similarity threshold 125 | :param include_params: 126 | :return: 127 | """ 128 | # at first level, children are grouped by token (word) count 129 | token_count = len(tokens) 130 | cur_node = root_node.key_to_child_node.get(str(token_count)) 131 | 132 | # no template with same token count yet 133 | if cur_node is None: 134 | return None 135 | 136 | # handle case of empty log string - return the single cluster in that group 137 | if token_count == 0: 138 | return self.id_to_cluster.get(cur_node.cluster_ids[0]) 139 | 140 | # find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth) 141 | cur_node_depth = 1 142 | 143 | for token in tokens: 144 | # at max depth 145 | if cur_node_depth >= self.max_node_depth: 146 | break 147 | 148 | # this is last token 149 | if cur_node_depth == token_count: 150 | break 151 | 152 | key_to_child_node = cur_node.key_to_child_node 153 | cur_node = key_to_child_node.get(token) 154 | if cur_node is None: # no exact next token exist, try wildcard node 155 | cur_node = key_to_child_node.get(self.param_str) 156 | if cur_node is None: # no wildcard node exist 157 | return None 158 | 159 | cur_node_depth += 1 160 | 161 | # get best match among all clusters with same prefix, or None if no match is above sim_th 162 | cluster = self.fast_match(cur_node.cluster_ids, tokens, sim_th, include_params) 163 | return cluster 164 | 165 | def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster): 166 | """ 167 | yd。功能:利用新构建的LogCluster来更新prefix_tree 168 | :param root_node: 169 | :param cluster: 新构建的LogCluster对象 170 | :return: 171 | """ 172 | # 
第一步:判断token_count_str是否在root_node.key_to_child_node中,若不在则加入first_layer_node,若在里面,则获取first_layer_node 173 | token_count = len(cluster.log_template_tokens) # yd。获取LogCluster对象中token_list的长度,该token_list是由日志内容进分词后得到 174 | token_count_str = str(token_count) 175 | if token_count_str not in root_node.key_to_child_node: 176 | first_layer_node = Node() 177 | root_node.key_to_child_node[token_count_str] = first_layer_node 178 | else: 179 | first_layer_node = root_node.key_to_child_node[token_count_str] 180 | 181 | cur_node = first_layer_node 182 | 183 | # handle case of empty log string 184 | if token_count == 0: 185 | cur_node.cluster_ids = [cluster.cluster_id] 186 | return 187 | # 第二步:判断每个token/self.param是否在cur_node.key_to_child_node中,若不在,则加入;若已存在,则取出child_node 188 | current_depth = 1 # yd。初始值为1,每处理一个token,它的值就加一 189 | for token in cluster.log_template_tokens: # yd。log_template_tokens是将分词得到的token_list转换为tuple后的结果 190 | # if at max depth or this is last token in template - add current log cluster to the leaf node 191 | if current_depth >= self.max_node_depth or current_depth >= token_count:#yd。如果是token_list中的最后一个token 192 | # clean up stale clusters before adding a new one. 193 | new_cluster_ids = [] 194 | for cluster_id in cur_node.cluster_ids: 195 | if cluster_id in self.id_to_cluster: 196 | new_cluster_ids.append(cluster_id) 197 | new_cluster_ids.append(cluster.cluster_id) 198 | cur_node.cluster_ids = new_cluster_ids #yd。如果是叶子节点,则需要给cluster_ids赋值,非叶子节点,cluster_ids的值都为空 199 | break 200 | 201 | # if token not matched in this layer of existing tree. 202 | if token not in cur_node.key_to_child_node: 203 | if self.parametrize_numeric_tokens and self.has_numbers(token):#yd。如果token中含有数字 204 | if self.param_str not in cur_node.key_to_child_node: 205 | new_node = Node() 206 | cur_node.key_to_child_node[self.param_str] = new_node 207 | cur_node = new_node 208 | else: 209 | cur_node = cur_node.key_to_child_node[self.param_str] 210 | 211 | else: 212 | if self.param_str in cur_node.key_to_child_node: 213 | if len(cur_node.key_to_child_node) < self.max_children: 214 | new_node = Node() 215 | cur_node.key_to_child_node[token] = new_node 216 | cur_node = new_node 217 | else: 218 | cur_node = cur_node.key_to_child_node[self.param_str] 219 | else: 220 | if len(cur_node.key_to_child_node) + 1 < self.max_children: 221 | new_node = Node() 222 | cur_node.key_to_child_node[token] = new_node 223 | cur_node = new_node 224 | elif len(cur_node.key_to_child_node) + 1 == self.max_children: 225 | new_node = Node() 226 | cur_node.key_to_child_node[self.param_str] = new_node 227 | cur_node = new_node 228 | else: 229 | cur_node = cur_node.key_to_child_node[self.param_str] 230 | 231 | # if the token is matched 232 | else: 233 | cur_node = cur_node.key_to_child_node[token] 234 | 235 | current_depth += 1 236 | 237 | # seq1 is a template, seq2 is the log to match 238 | def get_seq_distance(self, seq1, seq2, include_params: bool): 239 | """ 240 | yd。功能:计算seq1与seq2的相似度,相似度 = 公共元素的个数/ seq1的长度 241 | :param seq1: 242 | :param seq2: 243 | :param include_params: 244 | :return: 245 | """ 246 | assert len(seq1) == len(seq2) 247 | 248 | # sequences are empty - full match 249 | if len(seq1) == 0: 250 | return 1.0, 0 251 | 252 | sim_tokens = 0 253 | param_count = 0 254 | 255 | for token1, token2 in zip(seq1, seq2): 256 | if token1 == self.param_str: 257 | param_count += 1 258 | continue 259 | if token1 == token2: 260 | sim_tokens += 1 261 | 262 | if include_params: 263 | sim_tokens += param_count 264 | 265 | ret_val = float(sim_tokens) / 
len(seq1) 266 | 267 | return ret_val, param_count 268 | 269 | def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool): 270 | """ 271 | yd。功能:从cluster_ids对应的所有cluster中,找出cluster.log_template_tokens与tokens相似度最高的 272 | Find the best match for a log message (represented as tokens) versus a list of clusters 273 | :param cluster_ids: List of clusters to match against (represented by their IDs) 274 | :param tokens: the log message, separated to tokens. 275 | :param sim_th: minimum required similarity threshold (None will be returned in no clusters reached it) 276 | :param include_params: consider tokens matched to wildcard parameters in similarity threshold. 277 | :return: Best match cluster or None 278 | """ 279 | match_cluster = None 280 | 281 | max_sim = -1 282 | max_param_count = -1 283 | max_cluster = None 284 | 285 | for cluster_id in cluster_ids: 286 | # Try to retrieve cluster from cache with bypassing eviction 287 | # algorithm as we are only testing candidates for a match. 288 | cluster = self.id_to_cluster.get(cluster_id) 289 | if cluster is None: 290 | continue 291 | cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params) 292 | if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count): 293 | max_sim = cur_sim 294 | max_param_count = param_count 295 | max_cluster = cluster 296 | 297 | if max_sim >= sim_th: 298 | match_cluster = max_cluster 299 | 300 | return match_cluster 301 | 302 | def create_template(self, seq1, seq2): 303 | assert len(seq1) == len(seq2) 304 | ret_val = list(seq2) 305 | 306 | for i, (token1, token2) in enumerate(zip(seq1, seq2)): 307 | if token1 != token2: 308 | ret_val[i] = self.param_str 309 | 310 | return ret_val 311 | 312 | def print_tree(self, file=None, max_clusters=5): 313 | self.print_node("root", self.root_node, 0, file, max_clusters) 314 | 315 | def print_node(self, token, node, depth, file, max_clusters): 316 | out_str = '\t' * depth 317 | 318 | if depth == 0: 319 | out_str += f'<{token}>' 320 | elif depth == 1: 321 | out_str += f'' 322 | else: 323 | out_str += f'"{token}"' 324 | 325 | if len(node.cluster_ids) > 0: 326 | out_str += f" (cluster_count={len(node.cluster_ids)})" 327 | 328 | print(out_str, file=file) 329 | 330 | for token, child in node.key_to_child_node.items(): 331 | self.print_node(token, child, depth + 1, file, max_clusters) 332 | 333 | for cid in node.cluster_ids[:max_clusters]: 334 | cluster = self.id_to_cluster[cid] 335 | out_str = '\t' * (depth + 1) + str(cluster) 336 | print(out_str, file=file) 337 | 338 | def get_content_as_tokens_raw(self, content): 339 | """ 340 | 这是drain3最原始的分词代码,只考虑了英文,没有考虑中文的情况 341 | :param content: 342 | :return: 343 | """ 344 | content = content.strip() 345 | for delimiter in self.extra_delimiters: 346 | content = content.replace(delimiter, " ") 347 | content_tokens = content.split() 348 | return content_tokens 349 | 350 | 351 | def get_content_as_tokens(self, content): 352 | """ 353 | 考虑中英文混杂,纯英文两种情况 354 | :param content: 355 | :return: 356 | """ 357 | content = content.strip() 358 | is_contain_chinese, substr_type_pattern, substr_detail_list, token_list = get_token_list(content) 359 | content_tokens = token_list 360 | #print(f"content_tokens = {content_tokens}") 361 | tokenize_result = {IS_CONTAIN_CHINESE_KEY : is_contain_chinese, SUBSTR_TYPE_PATTERN_KEY : substr_type_pattern, 362 | SUBSTR_DETAIL_LIST_KEY : substr_detail_list, TOKEN_LIST_KEY : token_list} 363 | return content_tokens,tokenize_result 364 | 365 
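    # Illustrative worked example (not part of the original source), assuming the
    # wildcard param_str is "<*>":
    #   cluster template tokens: ("connected", "to", "<*>")
    #   incoming log tokens:     ("connected", "to", "10.0.0.2")
    # get_seq_distance(template, log, include_params=False) skips the "<*>" position
    # (param_count = 1) and counts equal tokens, giving sim_tokens = 2 and a
    # similarity of 2/3. Since 2/3 >= sim_th (0.4 by default), fast_match returns
    # this cluster; create_template() then keeps "<*>" wherever the two sequences
    # differ, so the template is unchanged and only cluster.size is incremented.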
| 366 | def add_log_message(self, content: str): 367 | """ 368 | yd。功能:根据传入的content,获取匹配的logCluster,该LogCluster可能是先前已经存在的,也可能是需要新生成的 369 | :param content:被正则匹配mask后的日志内容,例如"connected to <:IP:>" 370 | :return:match_cluster:匹配的logCluster;update_type:表示更新match_cluster的原因 371 | """ 372 | content_tokens, tokenize_result = self.get_content_as_tokens(content) # yd。对content进行分词 373 | 374 | if self.profiler: 375 | self.profiler.start_section("tree_search") 376 | match_cluster = self.tree_search(self.root_node, content_tokens, self.sim_th, False) 377 | if self.profiler: 378 | self.profiler.end_section() 379 | 380 | # Match no existing log cluster 381 | # yd。即没有匹配到任何已经存在的log cluster,即没有匹配到任何已经存在的log模板,此时就要新创建一个LogCluster对象 382 | if match_cluster is None: 383 | if self.profiler: 384 | self.profiler.start_section("create_cluster") 385 | self.clusters_counter += 1 386 | cluster_id = self.clusters_counter 387 | match_cluster = LogCluster(content_tokens, cluster_id) #yd。构造一个新的LogCluster对象 388 | self.id_to_cluster[cluster_id] = match_cluster 389 | self.add_seq_to_prefix_tree(self.root_node, match_cluster) #利用新构建的match_cluster来更新prefix_tree 390 | update_type = "cluster_created" 391 | 392 | # Add the new log message to the existing cluster 393 | else: 394 | if self.profiler: 395 | self.profiler.start_section("cluster_exist") 396 | new_template_tokens = self.create_template(content_tokens, match_cluster.log_template_tokens) 397 | if tuple(new_template_tokens) == match_cluster.log_template_tokens: #yd。如果新创建的模板与最匹配的模板相同 398 | update_type = "none" 399 | else:#yd。如果新创建的模板与最新的模板不相同,则用新创建的模板来更新最匹配的模板 400 | match_cluster.log_template_tokens = tuple(new_template_tokens) 401 | update_type = "cluster_template_changed" 402 | match_cluster.size += 1 403 | # Touch cluster to update its state in the cache. 404 | # noinspection PyStatementEffect 405 | self.id_to_cluster[match_cluster.cluster_id] #yd。因为使用了LRUCache机制来控制cluster个数,故这里需要访问一下match_cluster对应的id 406 | 407 | if self.profiler: 408 | self.profiler.end_section() 409 | 410 | return match_cluster, update_type,tokenize_result 411 | 412 | def get_clusters_ids_for_seq_len(self, seq_len: int): 413 | """ 414 | Return all clusters with the specified count of tokens 415 | """ 416 | 417 | def append_clusters_recursive(node: Node, id_list_to_fill: list): 418 | id_list_to_fill.extend(node.cluster_ids) 419 | for child_node in node.key_to_child_node.values(): 420 | append_clusters_recursive(child_node, id_list_to_fill) 421 | 422 | cur_node = self.root_node.key_to_child_node.get(str(seq_len)) 423 | 424 | # no template with same token count 425 | if cur_node is None: 426 | return [] 427 | 428 | target = [] 429 | append_clusters_recursive(cur_node, target) 430 | return target 431 | 432 | def match(self, content: str, full_search_strategy="never"): 433 | """ 434 | Match log message against an already existing cluster. 435 | Match shall be perfect (sim_th=1.0). 436 | New cluster will not be created as a result of this call, nor any cluster modifications. 437 | 438 | :param content: log message to match 439 | :param full_search_strategy: when to perform full cluster search. 440 | (1) "never" is the fastest, will always perform a tree search [O(log(n)] but might produce 441 | false negatives (wrong mismatches) on some edge cases; 442 | (2) "fallback" will perform a linear search [O(n)] among all clusters with the same token count, but only in 443 | case tree search found no match. 
444 | It should not have false negatives, however tree-search may find a non-optimal match with 445 | more wildcard parameters than necessary; 446 | (3) "always" is the slowest. It will select the best match among all known clusters, by always evaluating 447 | all clusters with the same token count, and selecting the cluster with perfect all token match and least 448 | count of wildcard matches. 449 | :return: Matched cluster or None if no match found. 450 | """ 451 | 452 | assert full_search_strategy in ["always", "never", "fallback"] 453 | 454 | required_sim_th = 1.0 455 | content_tokens, tokenize_result = self.get_content_as_tokens(content) 456 | 457 | # consider for future improvement: 458 | # It is possible to implement a recursive tree_search (first try exact token match and fallback to 459 | # wildcard match). This will be both accurate and more efficient than the linear full search 460 | # also fast match can be optimized when exact match is required by early 461 | # quitting on less than exact cluster matches. 462 | def full_search(): 463 | all_ids = self.get_clusters_ids_for_seq_len(len(content_tokens)) 464 | cluster = self.fast_match(all_ids, content_tokens, required_sim_th, include_params=True) 465 | return cluster, tokenize_result 466 | 467 | if full_search_strategy == "always": 468 | return full_search() 469 | 470 | match_cluster = self.tree_search(self.root_node, content_tokens, required_sim_th, include_params=True) 471 | if match_cluster is not None: 472 | return match_cluster, tokenize_result 473 | 474 | if full_search_strategy == "never": 475 | return None, tokenize_result 476 | 477 | return full_search() 478 | 479 | def get_total_cluster_size(self): 480 | size = 0 481 | for c in self.id_to_cluster.values(): 482 | size += c.size 483 | return size 484 | -------------------------------------------------------------------------------- /drain3/file_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import os 4 | import pathlib 5 | 6 | from drain3.persistence_handler import PersistenceHandler 7 | 8 | 9 | class FilePersistence(PersistenceHandler): 10 | def __init__(self, file_path): 11 | self.file_path = file_path 12 | 13 | def save_state(self, state): 14 | pathlib.Path(self.file_path).write_bytes(state) 15 | 16 | def load_state(self): 17 | if not os.path.exists(self.file_path): 18 | return None 19 | 20 | return pathlib.Path(self.file_path).read_bytes() 21 | -------------------------------------------------------------------------------- /drain3/kafka_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import kafka 4 | 5 | from drain3.persistence_handler import PersistenceHandler 6 | 7 | 8 | class KafkaPersistence(PersistenceHandler): 9 | 10 | def __init__(self, topic, snapshot_poll_timeout_sec=60, **kafka_client_options): 11 | self.topic = topic 12 | self.kafka_client_options = kafka_client_options 13 | self.producer = kafka.KafkaProducer(**self.kafka_client_options) 14 | self.snapshot_poll_timeout_sec = snapshot_poll_timeout_sec 15 | 16 | def save_state(self, state): 17 | self.producer.send(self.topic, value=state) 18 | 19 | def load_state(self): 20 | consumer = kafka.KafkaConsumer(**self.kafka_client_options) 21 | partition = kafka.TopicPartition(self.topic, 0) 22 | consumer.assign([partition]) 23 | end_offsets = consumer.end_offsets([partition]) 24 | end_offset = list(end_offsets.values())[0] 25 | if 
end_offset > 0: 26 | consumer.seek(partition, end_offset - 1) 27 | snapshot_poll_timeout_ms = self.snapshot_poll_timeout_sec * 1000 28 | records = consumer.poll(snapshot_poll_timeout_ms) 29 | if not records: 30 | raise RuntimeError(f"No message received from Kafka during restore even though end_offset>0") 31 | last_msg = records[partition][0] 32 | state = last_msg.value 33 | else: 34 | state = None 35 | 36 | consumer.close() 37 | return state 38 | -------------------------------------------------------------------------------- /drain3/masking.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import abc 4 | import re 5 | from typing import Collection, Optional 6 | 7 | 8 | class AbstractMaskingInstruction(abc.ABC): 9 | 10 | def __init__(self, mask_with: str): 11 | self.mask_with = mask_with 12 | 13 | @abc.abstractmethod 14 | def mask(self, content: str, mask_prefix: str, mask_suffix: str) -> str: 15 | """ 16 | Mask content according to this instruction and return the result. 17 | 18 | :param content: text to apply masking to 19 | :param mask_prefix: the prefix of any masks inserted 20 | :param mask_suffix: the suffix of any masks inserted 21 | """ 22 | pass 23 | 24 | 25 | class MaskingInstruction(AbstractMaskingInstruction): 26 | 27 | def __init__(self, pattern: str, mask_with: str): 28 | super().__init__(mask_with) 29 | self.regex = re.compile(pattern) 30 | 31 | @property 32 | def pattern(self): 33 | return self.regex.pattern 34 | 35 | def mask(self, content: str, mask_prefix: str, mask_suffix: str) -> str: 36 | """ 37 | yd。功能:将content中正则匹配的子串,用指定字符串替换,比如将content中的ip替换为<:IP:> 38 | :param content: 39 | :param mask_prefix: 40 | :param mask_suffix: 41 | :return: 42 | """ 43 | mask = mask_prefix + self.mask_with + mask_suffix 44 | return self.regex.sub(mask, content) 45 | 46 | 47 | # Alias for `MaskingInstruction`. 
48 | RegexMaskingInstruction = MaskingInstruction 49 | 50 | 51 | class LogMasker: 52 | 53 | def __init__(self, masking_instructions: Collection[AbstractMaskingInstruction], 54 | mask_prefix: str, mask_suffix: str): 55 | self.mask_prefix = mask_prefix 56 | self.mask_suffix = mask_suffix 57 | self.masking_instructions = masking_instructions 58 | self.mask_name_to_instructions = {} #yd。格式为{mask_name: masking_instruction_list} 59 | for mi in self.masking_instructions: 60 | self.mask_name_to_instructions.setdefault(mi.mask_with, []) 61 | self.mask_name_to_instructions[mi.mask_with].append(mi) 62 | 63 | def mask(self, content: str) -> str: 64 | """ 65 | yd。功能:将content字符串中正则匹配的子串,用特定符号替换,比如将content中的ip数字用"<:IP:>"替换 66 | :param content: 待正则匹配替换的字符串 67 | :return: 68 | """ 69 | for mi in self.masking_instructions: 70 | content = mi.mask(content, self.mask_prefix, self.mask_suffix) 71 | return content 72 | 73 | @property 74 | def mask_names(self) -> Collection[str]: 75 | return self.mask_name_to_instructions.keys() 76 | 77 | def instructions_by_mask_name(self, mask_name: str) -> Optional[Collection[AbstractMaskingInstruction]]: 78 | """ 79 | yd。功能:根据mask_name查找到对应的masking_instruction_list 80 | :param mask_name: 81 | :return: 82 | """ 83 | return self.mask_name_to_instructions.get(mask_name, []) 84 | 85 | # Some masking examples 86 | # --------------------- 87 | # 88 | # masking_instances = [ 89 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)', "ID"), 90 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)', "IP"), 91 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)', "SEQ"), 92 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)', "SEQ"), 93 | # 94 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)', "HEX"), 95 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)', "NUM"), 96 | # MaskingInstruction(r'(?<=executed cmd )(".+?")', "CMD"), 97 | # ] 98 | -------------------------------------------------------------------------------- /drain3/memory_buffer_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | from drain3.persistence_handler import PersistenceHandler 4 | 5 | 6 | class MemoryBufferPersistence(PersistenceHandler): 7 | def __init__(self): 8 | self.state = None 9 | 10 | def save_state(self, state): 11 | self.state = state 12 | 13 | def load_state(self): 14 | return self.state -------------------------------------------------------------------------------- /drain3/persistence_handler.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | 6 | class PersistenceHandler(ABC): 7 | 8 | @abstractmethod 9 | def save_state(self, state): 10 | pass 11 | 12 | @abstractmethod 13 | def load_state(self): 14 | pass 15 | -------------------------------------------------------------------------------- /drain3/redis_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import redis 4 | 5 | from drain3.persistence_handler import PersistenceHandler 6 | 7 | 8 | class RedisPersistence(PersistenceHandler): 9 | def __init__(self, redis_host, redis_port, redis_db, redis_pass, is_ssl, redis_key): 10 | 
self.redis_host = redis_host 11 | self.redis_port = redis_port 12 | self.redis_db = redis_db 13 | self.redis_pass = redis_pass 14 | self.is_ssl = is_ssl 15 | self.redis_key = redis_key 16 | self.r = redis.Redis(host=self.redis_host, 17 | port=self.redis_port, 18 | db=self.redis_db, 19 | password=self.redis_pass, 20 | ssl=self.is_ssl) 21 | 22 | def save_state(self, state): 23 | self.r.set(self.redis_key, state) 24 | 25 | def load_state(self): 26 | return self.r.get(self.redis_key) 27 | -------------------------------------------------------------------------------- /drain3/simple_profiler.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Based on https://github.com/davidohana/SimpleProfiler/blob/main/python/simple_profiler.py 3 | 4 | import os 5 | import time 6 | 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | class Profiler(ABC): 11 | 12 | @abstractmethod 13 | def start_section(self, section_name: str): 14 | pass 15 | 16 | @abstractmethod 17 | def end_section(self, section_name=""): 18 | pass 19 | 20 | @abstractmethod 21 | def report(self, period_sec=30): 22 | pass 23 | 24 | 25 | class NullProfiler(Profiler): 26 | """A no-op profiler. Use it instead of SimpleProfiler in case you want to disable profiling.""" 27 | 28 | def start_section(self, section_name: str): 29 | pass 30 | 31 | def end_section(self, section_name=""): 32 | pass 33 | 34 | def report(self, period_sec=30): 35 | pass 36 | 37 | 38 | class SimpleProfiler(Profiler): 39 | def __init__(self, reset_after_sample_count=0, enclosing_section_name="total", printer=print, report_sec=30): 40 | self.printer = printer 41 | self.enclosing_section_name = enclosing_section_name 42 | self.reset_after_sample_count = reset_after_sample_count 43 | self.report_sec = report_sec 44 | 45 | self.section_to_stats = {} 46 | self.last_report_timestamp_sec = time.time() 47 | self.last_started_section_name = "" 48 | 49 | def start_section(self, section_name: str): 50 | """Start measuring a section""" 51 | 52 | if not section_name: 53 | raise ValueError("Section name is empty") 54 | self.last_started_section_name = section_name 55 | 56 | section = self.section_to_stats.get(section_name, None) 57 | if section is None: 58 | section = ProfiledSectionStats(section_name) 59 | self.section_to_stats[section_name] = section 60 | 61 | if section.start_time_sec != 0: 62 | raise ValueError(f"Section {section_name} is already started") 63 | 64 | section.start_time_sec = time.time() 65 | 66 | def end_section(self, name=""): 67 | """End measuring a section. 
Leave section name empty to end the last started section.""" 68 | 69 | now = time.time() 70 | 71 | section_name = name 72 | if not name: 73 | section_name = self.last_started_section_name 74 | 75 | if not section_name: 76 | raise ValueError("Neither section name is specified nor a section is started") 77 | 78 | section: ProfiledSectionStats = self.section_to_stats.get(section_name, None) 79 | if section is None: 80 | raise ValueError(f"Section {section_name} does not exist") 81 | 82 | if section.start_time_sec == 0: 83 | raise ValueError(f"Section {section_name} was not started") 84 | 85 | took_sec = now - section.start_time_sec 86 | if 0 < self.reset_after_sample_count == section.sample_count: 87 | section.sample_count_batch = 0 88 | section.total_time_sec_batch = 0 89 | 90 | section.sample_count += 1 91 | section.total_time_sec += took_sec 92 | section.sample_count_batch += 1 93 | section.total_time_sec_batch += took_sec 94 | section.start_time_sec = 0 95 | 96 | def report(self, period_sec=30): 97 | """Print results using [printer] function. By default prints to stdout.""" 98 | if time.time() - self.last_report_timestamp_sec < period_sec: 99 | return False 100 | 101 | enclosing_time_sec = 0 102 | if self.enclosing_section_name: 103 | enclosing_section: ProfiledSectionStats = self.section_to_stats.get(self.enclosing_section_name, None) 104 | if enclosing_section: 105 | enclosing_time_sec = enclosing_section.total_time_sec 106 | 107 | include_batch_rates = self.reset_after_sample_count > 0 108 | 109 | sections = self.section_to_stats.values() 110 | sorted_sections = sorted(sections, key=lambda it: it.total_time_sec, reverse=True) 111 | lines = map(lambda it: it.to_string(enclosing_time_sec, include_batch_rates), sorted_sections) 112 | text = os.linesep.join(lines) 113 | self.printer(text) 114 | 115 | self.last_report_timestamp_sec = time.time() 116 | return True 117 | 118 | 119 | class ProfiledSectionStats: 120 | def __init__(self, section_name, start_time_sec=0, sample_count=0, total_time_sec=0, 121 | sample_count_batch=0, total_time_sec_batch=0): 122 | self.section_name = section_name 123 | self.start_time_sec = start_time_sec 124 | self.sample_count = sample_count 125 | self.total_time_sec = total_time_sec 126 | self.sample_count_batch = sample_count_batch 127 | self.total_time_sec_batch = total_time_sec_batch 128 | 129 | def to_string(self, enclosing_time_sec: int, include_batch_rates: bool): 130 | took_sec_text = f"{self.total_time_sec:>8.2f} s" 131 | if enclosing_time_sec > 0: 132 | took_sec_text += f" ({100 * self.total_time_sec / enclosing_time_sec:>6.2f}%)" 133 | 134 | ms_per_k_samples = f"{1000000 * self.total_time_sec / self.sample_count: 7.2f}" 135 | 136 | if self.total_time_sec > 0: 137 | samples_per_sec = f"{self.sample_count / self.total_time_sec: 15,.2f}" 138 | else: 139 | samples_per_sec = "N/A" 140 | 141 | if include_batch_rates: 142 | ms_per_k_samples += f" ({1000000 * self.total_time_sec_batch / self.sample_count_batch: 7.2f})" 143 | if self.total_time_sec_batch > 0: 144 | samples_per_sec += f" ({self.sample_count_batch / self.total_time_sec_batch: 15,.2f})" 145 | else: 146 | samples_per_sec += " (N/A)" 147 | 148 | return f"{self.section_name: <15}: took {took_sec_text}, " \ 149 | f"{self.sample_count: >10,} samples, " \ 150 | f"{ms_per_k_samples} ms / 1000 samples, " \ 151 | f"{samples_per_sec} hz" 152 | -------------------------------------------------------------------------------- /drain3/template_miner.py: 
-------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import base64 4 | import logging 5 | import re 6 | import time 7 | import zlib 8 | from typing import Optional, List, NamedTuple 9 | 10 | import jsonpickle 11 | from cachetools import LRUCache, cachedmethod 12 | 13 | from drain3.drain import Drain, LogCluster 14 | from drain3.masking import LogMasker 15 | from drain3.persistence_handler import PersistenceHandler 16 | from drain3.simple_profiler import SimpleProfiler, NullProfiler, Profiler 17 | from drain3.template_miner_config import TemplateMinerConfig 18 | from src.common_config import CLUSTER_COUNT_KEY, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\ 19 | TOKEN_LIST_KEY, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,LOG_TEMPLATE_TOKENS_KEY,ENABLE_MASK_CONTENT 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | config_filename = 'drain3.ini' 24 | 25 | ExtractedParameter = NamedTuple("ExtractedParameter", [("value", str), ("mask_name", str)]) 26 | 27 | 28 | class TemplateMiner: 29 | 30 | def __init__(self, 31 | persistence_handler: PersistenceHandler = None, 32 | config: TemplateMinerConfig = None): 33 | """ 34 | Wrapper for Drain with persistence and masking support 35 | 36 | :param persistence_handler: The type of persistence to use. When None, no persistence is applied. 37 | :param config: Configuration object. When none, configuration is loaded from default .ini file (if exist) 38 | """ 39 | logger.info("Starting Drain3 template miner") 40 | 41 | if config is None: 42 | logger.info(f"Loading configuration from {config_filename}") 43 | config = TemplateMinerConfig() 44 | config.load(config_filename) 45 | 46 | self.config = config 47 | 48 | self.profiler: Profiler = NullProfiler() 49 | if self.config.profiling_enabled: 50 | self.profiler = SimpleProfiler() 51 | 52 | self.persistence_handler = persistence_handler 53 | 54 | param_str = self.config.mask_prefix + "*" + self.config.mask_suffix #yd。将param_str的值设为<*> 55 | self.drain = Drain( 56 | sim_th=self.config.drain_sim_th, 57 | depth=self.config.drain_depth, 58 | max_children=self.config.drain_max_children, 59 | max_clusters=self.config.drain_max_clusters, 60 | extra_delimiters=self.config.drain_extra_delimiters, 61 | profiler=self.profiler, 62 | param_str=param_str, 63 | parametrize_numeric_tokens=self.config.parametrize_numeric_tokens 64 | ) 65 | self.masker = LogMasker(self.config.masking_instructions, self.config.mask_prefix, self.config.mask_suffix) 66 | self.parameter_extraction_cache = LRUCache(self.config.parameter_extraction_cache_capacity) 67 | self.last_save_time = time.time() #yd。表示最近一次将self.drain对象进行序列化得到state,并保存state的时间 68 | if persistence_handler is not None: #yd。如果持久化handler不为None,则加载state 69 | self.load_state() 70 | 71 | def load_state(self): 72 | """ 73 | yd。加载之前保存的state,然后将state反序列化,用反序列化的结果来更新self.drain对象, 74 | :return: 75 | """ 76 | # yd。这里选择不许需要之前的状态 77 | return 78 | 79 | logger.info("Checking for saved state") 80 | 81 | state = self.persistence_handler.load_state() 82 | if state is None: 83 | logger.info("Saved state not found") 84 | return 85 | 86 | if self.config.snapshot_compress_state: 87 | state = zlib.decompress(base64.b64decode(state)) 88 | 89 | loaded_drain: Drain = jsonpickle.loads(state, keys=True) 90 | 91 | # json-pickle encoded keys as string by default, so we have to convert those back to int 92 | # this is only relevant for backwards compatibility when loading a snapshot of drain <= v0.9.1 93 | # which did not 
use json-pickle's keys=true 94 | if len(loaded_drain.id_to_cluster) > 0 and isinstance(next(iter(loaded_drain.id_to_cluster.keys())), str): 95 | loaded_drain.id_to_cluster = {int(k): v for k, v in list(loaded_drain.id_to_cluster.items())} 96 | if self.config.drain_max_clusters: 97 | cache = LRUCache(maxsize=self.config.drain_max_clusters) 98 | cache.update(loaded_drain.id_to_cluster) 99 | loaded_drain.id_to_cluster = cache 100 | 101 | self.drain.id_to_cluster = loaded_drain.id_to_cluster 102 | self.drain.clusters_counter = loaded_drain.clusters_counter 103 | self.drain.root_node = loaded_drain.root_node 104 | 105 | logger.info("Restored {0} clusters built from {1} messages".format( 106 | len(loaded_drain.clusters), loaded_drain.get_total_cluster_size())) 107 | 108 | def save_state(self, snapshot_reason): 109 | """ 110 | yd。功能:将self.drain对象序列化后得到state,将state保存到指定文件中 111 | :param snapshot_reason: 112 | :return: 113 | """ 114 | state = jsonpickle.dumps(self.drain, keys=True).encode('utf-8') #yd。将self.drain这个对象序列化 115 | if self.config.snapshot_compress_state:#yd。如果需要压缩state snapshot,则进行压缩 116 | state = base64.b64encode(zlib.compress(state)) 117 | 118 | logger.info(f"Saving state of {len(self.drain.clusters)} clusters " 119 | f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, " 120 | f"reason: {snapshot_reason}") 121 | 122 | self.persistence_handler.save_state(state) #yd。文件持久化,即将state保存到指定路径所在的文件中 123 | 124 | def get_snapshot_reason(self, change_type, cluster_id): 125 | """ 126 | yd。功能:获取保存snapshot的原因,主要原因有两个: 127 | 1、change_type不为none; 128 | 2、距离上次保存snapshot的时间超过配置的间隔时间 129 | :param change_type: 130 | :param cluster_id: 131 | :return: 132 | """ 133 | if change_type != "none": 134 | return "{} ({})".format(change_type, cluster_id) 135 | 136 | diff_time_sec = time.time() - self.last_save_time 137 | if diff_time_sec >= self.config.snapshot_interval_minutes * 60: 138 | return "periodic" 139 | 140 | return None 141 | 142 | def make_result_dict(self,cluster, tokenize_result): 143 | result_dict = { CLUSTER_ID_KEY: cluster.cluster_id, 144 | CLUSTER_SIZE_KEY: cluster.size, #yd。用于统计当前cluster匹配的日志条数 145 | LOG_TEMPLATE_TOKENS_KEY: cluster.log_template_tokens, 146 | TEMPLATE_MINED_KEY: cluster.get_template() # yd。返回挖掘处理的日志模板 147 | } 148 | result_dict.update(tokenize_result) 149 | return result_dict 150 | 151 | def add_log_message(self, log_message: str) -> dict: 152 | """ 153 | yd。功能:根据当前传入的日志内容,获取对应的日志模板的logCluster 154 | :param log_message: 一条日志的内容 155 | :return: 156 | """ 157 | self.profiler.start_section("total") 158 | 159 | if ENABLE_MASK_CONTENT: 160 | self.profiler.start_section("mask") 161 | # yd。将log_message字符串中正则匹配的子串,用特定符号替换。 162 | # 比如将"connected to 10.0.0.1"中的ip数字用"<:IP:>"替换,返回"connected to <:IP:>" 163 | masked_content = self.masker.mask(log_message) 164 | self.profiler.end_section() 165 | else: 166 | masked_content = log_message 167 | 168 | self.profiler.start_section("drain") 169 | # yd。根据传入的masked_content,获取匹配的logCluster 170 | cluster, change_type, tokenize_result = self.drain.add_log_message(masked_content) 171 | self.profiler.end_section("drain") 172 | 173 | result = { 174 | "change_type": change_type, 175 | CLUSTER_COUNT_KEY: len(self.drain.clusters) # yd。统计当前已经挖掘的模板的 总数 176 | } 177 | result_dict = self.make_result_dict(cluster, tokenize_result) 178 | result.update(result_dict) 179 | 180 | #yd。这里是将当前的日志模板信息的快照保存下来 181 | if self.persistence_handler is not None: 182 | self.profiler.start_section("save_state") 183 | snapshot_reason = 
self.get_snapshot_reason(change_type, cluster.cluster_id) 184 | if snapshot_reason: 185 | self.save_state(snapshot_reason) 186 | self.last_save_time = time.time() 187 | self.profiler.end_section() 188 | 189 | self.profiler.end_section("total") 190 | self.profiler.report(self.config.profiling_report_sec) #yd。这个方法啥事都没有干,可以不管 191 | return result 192 | 193 | def match(self, log_message: str, full_search_strategy="never") -> LogCluster: 194 | """ 195 | Mask log message and match against an already existing cluster. 196 | Match shall be perfect (sim_th=1.0). 197 | New cluster will not be created as a result of this call, nor any cluster modifications. 198 | 199 | :param log_message: log message to match 200 | :param full_search_strategy: when to perform full cluster search. 201 | (1) "never" is the fastest, will always perform a tree search [O(log(n)] but might produce 202 | false negatives (wrong mismatches) on some edge cases; 203 | (2) "fallback" will perform a linear search [O(n)] among all clusters with the same token count, but only in 204 | case tree search found no match. 205 | It should not have false negatives, however tree-search may find a non-optimal match with 206 | more wildcard parameters than necessary; 207 | (3) "always" is the slowest. It will select the best match among all known clusters, by always evaluating 208 | all clusters with the same token count, and selecting the cluster with perfect all token match and least 209 | count of wildcard matches. 210 | :return: Matched cluster or None if no match found. 211 | """ 212 | if ENABLE_MASK_CONTENT: 213 | # yd。将log_message字符串中正则匹配的子串,用特定符号替换。 214 | # 比如将"connected to 10.0.0.1"中的ip数字用"<:IP:>"替换,返回"connected to <:IP:>" 215 | masked_content = self.masker.mask(log_message) 216 | else: 217 | masked_content = log_message 218 | 219 | matched_cluster, tokenize_result = self.drain.match(masked_content, full_search_strategy) 220 | return matched_cluster, tokenize_result 221 | 222 | def get_parameter_list(self, log_template: str, log_message: str) -> List[str]: 223 | """ 224 | Extract parameters from a log message according to a provided template that was generated 225 | by calling `add_log_message()`. 226 | 227 | This function is deprecated. Please use extract_parameters instead. 228 | 229 | :param log_template: log template corresponding to the log message 230 | :param log_message: log message to extract parameters from 231 | :return: An ordered list of parameter values present in the log message. 
232 | """ 233 | 234 | extracted_parameters = self.extract_parameters(log_template, log_message, exact_matching=False) 235 | if not extracted_parameters: 236 | return [] 237 | return [parameter.value for parameter in extracted_parameters] 238 | 239 | def get_parameter(self,result_dict, log_line): 240 | if USE_OLD_FUNCTION_EXTRACT_PARAMETER: 241 | # template = result["template_mined"] 242 | template = result_dict.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE) 243 | params = self.extract_parameters(template, log_line) 244 | return params 245 | content_tokens = result_dict.get(TOKEN_LIST_KEY, []) 246 | # log_template_tokens = result["log_template_tokens"] 247 | log_template_tokens = result_dict.get(LOG_TEMPLATE_TOKENS_KEY, []) 248 | params = self.extract_parameters_by_compare(content_tokens, log_template_tokens) 249 | return params 250 | 251 | def extract_parameters_by_compare(self, content_tokens, log_template_tokens): 252 | parameter_list = [] 253 | for token1, token2 in zip(content_tokens, log_template_tokens): 254 | if token1 == token2: 255 | continue 256 | extracted_parameter = ExtractedParameter(token1, mask_name="-") 257 | parameter_list.append(extracted_parameter) 258 | return parameter_list 259 | 260 | 261 | def extract_parameters(self, 262 | log_template: str, 263 | log_message: str, 264 | exact_matching: bool = True) -> Optional[List[ExtractedParameter]]: 265 | """ 266 | Extract parameters from a log message according to a provided template that was generated 267 | by calling `add_log_message()`. 268 | 269 | For most accurate results, it is recommended that 270 | - Each `MaskingInstruction` has a unique `mask_with` value, 271 | - No `MaskingInstruction` has a `mask_with` value of `*`, 272 | - The regex-patterns of `MaskingInstruction` do not use unnamed back-references; 273 | instead use back-references to named groups e.g. `(?P=some-name)`. 274 | 275 | :param log_template: log template corresponding to the log message 276 | :param log_message: log message to extract parameters from 277 | :param exact_matching: whether to apply the correct masking-patterns to match parameters, or try to approximate; 278 | disabling exact_matching may be faster but may lead to situations in which parameters 279 | are wrongly identified. 280 | :return: A ordered list of ExtractedParameter for the log message 281 | or None if log_message does not correspond to log_template. 282 | """ 283 | #yd。将delimiter用空格替换 284 | for delimiter in self.config.drain_extra_delimiters: 285 | log_message = re.sub(delimiter, " ", log_message) 286 | 287 | template_regex, param_group_name_to_mask_name = self._get_template_parameter_extraction_regex( 288 | log_template, exact_matching) 289 | 290 | # Parameters are represented by specific named groups inside template_regex. 
291 | parameter_match = re.match(template_regex, log_message) 292 | 293 | # log template does not match template 294 | if not parameter_match: 295 | return None 296 | 297 | # create list of extracted parameters 298 | extracted_parameters = [] 299 | for group_name, parameter in parameter_match.groupdict().items(): #yd。对正则匹配的结果进行遍历 300 | if group_name in param_group_name_to_mask_name: 301 | mask_name = param_group_name_to_mask_name[group_name] 302 | extracted_parameter = ExtractedParameter(parameter, mask_name) 303 | extracted_parameters.append(extracted_parameter) 304 | 305 | return extracted_parameters 306 | 307 | @cachedmethod(lambda self: self.parameter_extraction_cache) 308 | def _get_template_parameter_extraction_regex(self, log_template: str, exact_matching: bool): 309 | """ 310 | yd。功能:构建模板参数抽取的正则表达式 311 | :param log_template: 312 | :param exact_matching: 313 | :return: template_regex: 314 | param_group_name_to_mask_name,以dict的形式保存着正则表达式的名称和mask_name,例如{'p_0': 'HEX', 'p_1': '*', 'p_2': 'CMD', 'p_3': 'SEQ', 'p_4': 'IP', 'p_5': 'NUM', 'p_6': 'ID'} 315 | """ 316 | param_group_name_to_mask_name = dict() 317 | param_name_counter = [0] 318 | #print(f" log_template传入的值 = {log_template}") 319 | def get_next_param_name(): 320 | param_group_name = "p_" + str(param_name_counter[0]) 321 | param_name_counter[0] += 1 322 | return param_group_name 323 | 324 | # Create a named group with the respective patterns for the given mask-name. 325 | def create_capture_regex(_mask_name): 326 | allowed_patterns = [] 327 | if exact_matching: 328 | # get all possible regex patterns from masking instructions that match this mask name 329 | masking_instructions = self.masker.instructions_by_mask_name(_mask_name) 330 | for mi in masking_instructions: 331 | # MaskingInstruction may already contain named groups. 332 | # We replace group names in those named groups, to avoid conflicts due to duplicate names. 333 | if hasattr(mi, 'regex'): 334 | mi_groups = mi.regex.groupindex.keys() 335 | pattern = mi.pattern #yd。取出构造正则表达式时的字符串 336 | else: 337 | # non regex masking instructions - support only non-exact matching 338 | mi_groups = [] 339 | pattern = ".+?" 340 | 341 | for group_name in mi_groups: 342 | param_group_name = get_next_param_name() 343 | 344 | def replace_captured_param_name(param_pattern): 345 | _search_str = param_pattern.format(group_name) 346 | _replace_str = param_pattern.format(param_group_name) 347 | return pattern.replace(_search_str, _replace_str) 348 | 349 | pattern = replace_captured_param_name("(?P={}") 350 | pattern = replace_captured_param_name("(?P<{}>") 351 | 352 | # support unnamed back-references in masks (simple cases only) 353 | pattern = re.sub(r"\\(?!0)\d{1,2}", r"(?:.+?)", pattern) 354 | allowed_patterns.append(pattern) 355 | 356 | if not exact_matching or _mask_name == "*": 357 | allowed_patterns.append(r".+?") 358 | 359 | # Give each capture group a unique name to avoid conflicts. 360 | param_group_name = get_next_param_name() 361 | param_group_name_to_mask_name[param_group_name] = _mask_name 362 | joined_patterns = "|".join(allowed_patterns) #yd。将正则表达式join起来 363 | capture_regex = "(?P<{}>{})".format(param_group_name, joined_patterns) 364 | return capture_regex 365 | 366 | # For every mask in the template, replace it with a named group of all 367 | # possible masking-patterns it could represent (in order). 
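        # Illustrative example (assumes the "NUM" instruction from tests/drain3_test.ini and
        # exact_matching=True): create_capture_regex("NUM") would return roughly
        #   (?P<p_0>((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$))
        # whereas the Drain catch-all mask "*" only yields (?P<p_1>.+?); in both cases the new
        # group name is recorded in param_group_name_to_mask_name, e.g. {"p_0": "NUM", "p_1": "*"}.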
368 | mask_names = set(self.masker.mask_names) 369 | 370 | # the Drain catch-all mask 371 | mask_names.add("*") 372 | 373 | escaped_prefix = re.escape(self.masker.mask_prefix) #yd。将字符串中所有可能被解释为正则运算符的字符进行转义 374 | escaped_suffix = re.escape(self.masker.mask_suffix) 375 | template_regex = re.escape(log_template) 376 | #print(f"template_regex最初的值 = {template_regex}") 377 | 378 | # replace each mask name with a proper regex that captures it 379 | for mask_name in mask_names: 380 | search_str = escaped_prefix + re.escape(mask_name) + escaped_suffix 381 | while True: 382 | rep_str = create_capture_regex(mask_name) 383 | # Replace one-by-one to get a new param group name for each replacement. 384 | template_regex_new = template_regex.replace(search_str, rep_str, 1) 385 | # Break when all replaces for this mask are done. 386 | if template_regex_new == template_regex: 387 | break 388 | template_regex = template_regex_new 389 | 390 | #print(f"template_regex处理的值 = {template_regex}") 391 | #yd。将正则表达式template_regex进行改造,将其中的空格替换为"\\s+",并且在template_regex前后分别加上起始符和结束符 392 | # match also messages with multiple spaces or other whitespace chars between tokens 393 | template_regex = re.sub(r"\\ ", r"\\s+", template_regex) 394 | template_regex = "^" + template_regex + "$" 395 | return template_regex, param_group_name_to_mask_name 396 | -------------------------------------------------------------------------------- /drain3/template_miner_config.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import ast 4 | import configparser 5 | import json 6 | import logging 7 | 8 | from drain3.masking import MaskingInstruction 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TemplateMinerConfig: 14 | def __init__(self): 15 | self.profiling_enabled = False 16 | self.profiling_report_sec = 60 17 | self.snapshot_interval_minutes = 5 18 | self.snapshot_compress_state = True 19 | self.drain_extra_delimiters = [] 20 | self.drain_sim_th = 0.4 21 | self.drain_depth = 4 22 | self.drain_max_children = 100 23 | self.drain_max_clusters = None 24 | self.masking_instructions = [] #yd。由ini配置文件中"masking"字段中的正则表达式构成 25 | self.mask_prefix = "<" 26 | self.mask_suffix = ">" 27 | self.parameter_extraction_cache_capacity = 3000 28 | self.parametrize_numeric_tokens = True 29 | 30 | def load(self, config_filename: str): 31 | """ 32 | yd。功能:解析config_filename配置文件中设置的字段 33 | :param config_filename: 配置文件(比如drain3.ini)的路径 34 | :return: 35 | """ 36 | parser = configparser.ConfigParser() 37 | read_files = parser.read(config_filename) 38 | if len(read_files) == 0: 39 | logger.warning(f"config file not found: {config_filename}") 40 | 41 | section_profiling = 'PROFILING' 42 | section_snapshot = 'SNAPSHOT' 43 | section_drain = 'DRAIN' 44 | section_masking = 'MASKING' 45 | 46 | self.profiling_enabled = parser.getboolean(section_profiling, 'enabled', 47 | fallback=self.profiling_enabled) 48 | self.profiling_report_sec = parser.getint(section_profiling, 'report_sec', 49 | fallback=self.profiling_report_sec) 50 | 51 | self.snapshot_interval_minutes = parser.getint(section_snapshot, 'snapshot_interval_minutes', 52 | fallback=self.snapshot_interval_minutes) 53 | self.snapshot_compress_state = parser.getboolean(section_snapshot, 'compress_state', 54 | fallback=self.snapshot_compress_state) 55 | 56 | drain_extra_delimiters_str = parser.get(section_drain, 'extra_delimiters', 57 | fallback=str(self.drain_extra_delimiters)) 58 | self.drain_extra_delimiters = 
ast.literal_eval(drain_extra_delimiters_str) 59 | 60 | self.drain_sim_th = parser.getfloat(section_drain, 'sim_th', 61 | fallback=self.drain_sim_th) 62 | self.drain_depth = parser.getint(section_drain, 'depth', 63 | fallback=self.drain_depth) 64 | self.drain_max_children = parser.getint(section_drain, 'max_children', 65 | fallback=self.drain_max_children) 66 | self.drain_max_clusters = parser.getint(section_drain, 'max_clusters', 67 | fallback=self.drain_max_clusters) 68 | self.parametrize_numeric_tokens = parser.getboolean(section_drain, 'parametrize_numeric_tokens', 69 | fallback=self.parametrize_numeric_tokens) 70 | 71 | masking_instructions_str = parser.get(section_masking, 'masking', 72 | fallback=str(self.masking_instructions)) 73 | self.mask_prefix = parser.get(section_masking, 'mask_prefix', fallback=self.mask_prefix) 74 | self.mask_suffix = parser.get(section_masking, 'mask_suffix', fallback=self.mask_suffix) 75 | self.parameter_extraction_cache_capacity = parser.get(section_masking, 'parameter_extraction_cache_capacity', 76 | fallback=self.parameter_extraction_cache_capacity) 77 | #yd。下面是将配置文件中的'masking'字段的内容解析出来,用正则表达式来构建MaskingInstruction对象, 78 | masking_instructions = [] 79 | masking_list = json.loads(masking_instructions_str) #yd。将masking_instructions_str转化为list 80 | for mi in masking_list: 81 | instruction = MaskingInstruction(mi['regex_pattern'], mi['mask_with']) 82 | masking_instructions.append(instruction) 83 | self.masking_instructions = masking_instructions 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==4.2.1 2 | jieba==0.42.1 3 | jsonpickle==2.0.0 4 | kafka==1.3.5 5 | kafka_python==2.0.2 6 | pandas==1.1.5 7 | redis==3.5.3 8 | requests==2.28.1 9 | tqdm==4.64.0 10 | -------------------------------------------------------------------------------- /src/common_config.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | from src.tool.tool import get_project_dir_path 3 | 4 | PROJECT_DIR_PATH = get_project_dir_path() 5 | DATA_DIR_PATH = os.path.join(PROJECT_DIR_PATH, "data") 6 | CONFIG_DIR_PATH = os.path.join(PROJECT_DIR_PATH, "config_ini") 7 | 8 | STAR_CHAR = "*" 9 | 10 | DEFAULT_STR_VALUE = "-" 11 | 12 | 13 | USE_OLD_FUNCTION_EXTRACT_PARAMETER = False 14 | ENABLE_MASK_CONTENT = False 15 | 16 | CHINESE_SUBSTR_TYPE = "中" 17 | SPACE_SUBSTR_TYPE = "空格" 18 | ENGLISH_SUBSTR_TYPE = "英" 19 | PUNCTUATION_MARK_TYPE = "标点" 20 | CONNECTOR_CHAR = "^" 21 | 22 | CHINESE_SPACE_CHINESE_PATTERN = CONNECTOR_CHAR.join([CHINESE_SUBSTR_TYPE, SPACE_SUBSTR_TYPE,CHINESE_SUBSTR_TYPE]) 23 | 24 | 25 | #CHINESE_REGEXP = re.compile(u"([\u4e00-\u9fff|\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+)") 26 | CHINESE_REGEXP = re.compile(u"([\u4e00-\u9fff]+)") 27 | PUNCTUATION_MARK_REGEXP = re.compile(u"(。|,|,|:|:|=)") 28 | 29 | #NONE_CHINESE_REGEXP = re.compile(u"([^\u4e00-\u9fff|\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+)") 30 | NONE_CHINESE_REGEXP = re.compile(u"([^\u4e00-\u9fff|。,,::=]+)") 31 | 32 | CLUSTER_ID_KEY = "cluster_id" 33 | CLUSTER_SIZE_KEY = "cluster_size" 34 | TEMPLATE_MINED_KEY = "template_mined" 35 | LOG_TEMPLATE_TOKENS_KEY = "log_template_tokens" 36 | CLUSTER_COUNT_KEY = "cluster_count" #用于统计当前已经有多少个cluster了,一个cluster就是一个log template 37 | 38 | IS_CONTAIN_CHINESE_KEY = "is_contain_chinese" 39 | SUBSTR_TYPE_PATTERN_KEY = 
"substr_type_pattern" 40 | SUBSTR_DETAIL_LIST_KEY = "substr_detail_list" 41 | TOKEN_LIST_KEY = "token_list" -------------------------------------------------------------------------------- /src/drain3_examples/drain_bigfile_demo.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import json 4 | import logging 5 | import os 6 | import subprocess 7 | import sys 8 | import time 9 | from os.path import dirname 10 | 11 | from drain3 import TemplateMiner 12 | from drain3.template_miner_config import TemplateMinerConfig 13 | from src.common_config import CONFIG_DIR_PATH 14 | 15 | logger = logging.getLogger(__name__) 16 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') 17 | 18 | in_gz_file = "SSH.tar.gz" 19 | in_log_file = "SSH.log" 20 | if not os.path.isfile(in_log_file): 21 | logger.info(f"Downloading file {in_gz_file}") 22 | p = subprocess.Popen(f"curl https://zenodo.org/record/3227177/files/{in_gz_file} --output {in_gz_file}", shell=True) 23 | p.wait() 24 | logger.info(f"Extracting file {in_gz_file}") 25 | p = subprocess.Popen(f"tar -xvzf {in_gz_file}", shell=True) 26 | p.wait() 27 | 28 | 29 | config = TemplateMinerConfig() 30 | #config.load(dirname(__file__) + "/drain3.ini") 31 | drain3_ini_file_path = os.path.join(CONFIG_DIR_PATH, "drain3.ini") 32 | config.load(drain3_ini_file_path) 33 | config.profiling_enabled = True 34 | template_miner = TemplateMiner(config=config) 35 | 36 | line_count = 0 37 | 38 | with open(in_log_file) as f: 39 | lines = f.readlines() 40 | 41 | start_time = time.time() 42 | batch_start_time = start_time 43 | batch_size = 10000 44 | 45 | for line in lines: 46 | line = line.rstrip() 47 | line = line.partition(": ")[2] 48 | result = template_miner.add_log_message(line) 49 | line_count += 1 50 | if line_count % batch_size == 0: 51 | time_took = time.time() - batch_start_time 52 | rate = batch_size / time_took 53 | logger.info(f"Processing line: {line_count}, rate {rate:.1f} lines/sec, " 54 | f"{len(template_miner.drain.clusters)} clusters so far.") 55 | batch_start_time = time.time() 56 | if result["change_type"] != "none": 57 | result_json = json.dumps(result) 58 | logger.info(f"Input ({line_count}): " + line) 59 | logger.info("Result: " + result_json) 60 | 61 | time_took = time.time() - start_time 62 | rate = line_count / time_took 63 | logger.info(f"--- Done processing file in {time_took:.2f} sec. 
Total of {line_count} lines, rate {rate:.1f} lines/sec, " 64 | f"{len(template_miner.drain.clusters)} clusters") 65 | 66 | sorted_clusters = sorted(template_miner.drain.clusters, key=lambda it: it.size, reverse=True) 67 | for cluster in sorted_clusters: 68 | logger.info(cluster) 69 | 70 | print("Prefix Tree:") 71 | template_miner.drain.print_tree() 72 | 73 | template_miner.profiler.report(0) 74 | -------------------------------------------------------------------------------- /src/drain3_examples/drain_stdin_demo.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import json 4 | import logging 5 | import sys, os 6 | from os.path import dirname 7 | 8 | from drain3 import TemplateMiner 9 | from drain3.template_miner_config import TemplateMinerConfig 10 | from src.common_config import CONFIG_DIR_PATH, USE_OLD_FUNCTION_EXTRACT_PARAMETER, TOKEN_LIST_KEY, \ 11 | TEMPLATE_MINED_KEY, LOG_TEMPLATE_TOKENS_KEY, DEFAULT_STR_VALUE 12 | 13 | # persistence_type = "NONE" 14 | # persistence_type = "REDIS" 15 | # persistence_type = "KAFKA" 16 | persistence_type = "FILE" 17 | 18 | logger = logging.getLogger(__name__) 19 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') 20 | 21 | if persistence_type == "KAFKA": 22 | from drain3.kafka_persistence import KafkaPersistence 23 | 24 | persistence = KafkaPersistence("drain3_state", bootstrap_servers="localhost:9092") 25 | 26 | elif persistence_type == "FILE": 27 | from drain3.file_persistence import FilePersistence 28 | 29 | # persistence = FilePersistence("drain3_state.bin") 30 | drain3_state_bin_file_path = os.path.join(CONFIG_DIR_PATH, "drain3_state.bin") 31 | persistence = FilePersistence(drain3_state_bin_file_path) 32 | 33 | elif persistence_type == "REDIS": 34 | from drain3.redis_persistence import RedisPersistence 35 | 36 | persistence = RedisPersistence(redis_host='', 37 | redis_port=25061, 38 | redis_db=0, 39 | redis_pass='', 40 | is_ssl=True, 41 | redis_key="drain3_state_key") 42 | else: 43 | persistence = None 44 | 45 | config = TemplateMinerConfig() 46 | # config.load(dirname(__file__) + "/drain3.ini") 47 | drain3_ini_file_path = os.path.join(CONFIG_DIR_PATH, "drain3.ini") 48 | config.load(drain3_ini_file_path) 49 | config.profiling_enabled = False 50 | 51 | template_miner = TemplateMiner(persistence, config) 52 | print(f"Drain3 started with '{persistence_type}' persistence") 53 | print(f"{len(config.masking_instructions)} masking instructions are in use") 54 | print(f"Starting training mode. Reading from std-in ('q' to finish)") # yd。利用输入的一条条日志,训练得到模板 55 | while True: 56 | log_line = input("> ") 57 | if log_line == 'q': 58 | break 59 | # is_contain_chinese, substr_type_pattern, substr_detail_list, token_list, token_join_str = get_token_list(log_line) 60 | # log_line = token_join_str 61 | result = template_miner.add_log_message(log_line) 62 | result_json = json.dumps(result, ensure_ascii=False) 63 | print(result_json) 64 | params = template_miner.get_parameter(result, log_line) 65 | print("Parameters: " + str(params)) 66 | # yd。训练完毕,打印挖掘的每个cluster 67 | print("Training done. Mined clusters:") 68 | for cluster in template_miner.drain.clusters: 69 | print(cluster) 70 | 71 | print(f"Starting inference mode, matching to pre-trained clusters. 
Input log lines or 'q' to finish") 72 | while True: 73 | log_line = input("> ") 74 | if log_line == 'q': 75 | break 76 | cluster, tokenize_result = template_miner.match(log_line) 77 | if cluster is None: 78 | print(f"No match found") 79 | else: 80 | result = template_miner.make_result_dict(cluster, tokenize_result) 81 | params = template_miner.get_parameter(result, log_line) 82 | print(f"Parameters: {params}") 83 | -------------------------------------------------------------------------------- /src/log_parser_by_drain3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from collections import defaultdict 4 | from tqdm import tqdm # 进度条 5 | from src.tool.read_save_file import open_excel, save_dataframe 6 | 7 | from src.common_config import DATA_DIR_PATH, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER, \ 8 | CLUSTER_ID_KEY, CLUSTER_SIZE_KEY, TEMPLATE_MINED_KEY, IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, \ 9 | SUBSTR_DETAIL_LIST_KEY, TOKEN_LIST_KEY, LOG_TEMPLATE_TOKENS_KEY 10 | 11 | from src.tool.str_related import get_tow_set_diff 12 | import json 13 | import sys, os 14 | 15 | from drain3 import TemplateMiner 16 | from drain3.template_miner_config import TemplateMinerConfig 17 | from drain3.file_persistence import FilePersistence 18 | 19 | from src.common_config import CONFIG_DIR_PATH 20 | 21 | 22 | class LogParserByDrain3: 23 | def __init__(self): 24 | persistence_type = "FILE" 25 | drain3_state_bin_file_path = os.path.join(CONFIG_DIR_PATH, "drain3_state.bin") 26 | persistence = FilePersistence(drain3_state_bin_file_path) 27 | 28 | config = TemplateMinerConfig() 29 | drain3_ini_file_path = os.path.join(CONFIG_DIR_PATH, "drain3.ini") 30 | config.load(drain3_ini_file_path) 31 | config.profiling_enabled = False 32 | 33 | self.template_miner = TemplateMiner(persistence, config) 34 | print(f"Drain3 started with '{persistence_type}' persistence") 35 | print(f"{len(config.masking_instructions)} masking instructions are in use") 36 | print(f"Starting training mode. 
Reading from std-in ('q' to finish)") # yd。利用输入的一条条日志,训练得到模板 37 | 38 | def parse_log_content(self, log_line): 39 | result = self.template_miner.add_log_message(log_line) 40 | result_json = json.dumps(result, ensure_ascii=False) 41 | # print(result_json) 42 | if USE_OLD_FUNCTION_EXTRACT_PARAMETER: 43 | template = result.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE) 44 | params = self.template_miner.extract_parameters(template, log_line) 45 | else: 46 | content_tokens = result.get(TOKEN_LIST_KEY, []) 47 | log_template_tokens = result.get(LOG_TEMPLATE_TOKENS_KEY, []) 48 | params = self.template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens) 49 | return result, params 50 | 51 | def parse_log_file(self, raw_log_csv_path, result_file_path): 52 | print(f"start to parse log {raw_log_csv_path}") 53 | log_item_df = open_excel(raw_log_csv_path) 54 | log_csv_header = ["_time", "content"] 55 | log_item_df = log_item_df[log_csv_header] 56 | analysis_result_list = [] 57 | log_item_count = len(log_item_df) 58 | progress_bar = tqdm(total=log_item_count) 59 | for line_index, line_detail in enumerate(log_item_df.values.tolist()): 60 | [time_str, content] = line_detail 61 | progress_bar.update(1) 62 | if content != content: 63 | content = "" 64 | if isinstance(content, str) == False: 65 | content = str(content) 66 | # content = "终端服务器安全层在协议流中检测到错误,并已取消客户端连接。 客户端 IP: 192.168.100.132。" 67 | # content = "DSN3201I -PB4A ABNORMAL EOT IN PROGRESS FOR 825 825 USER=NVTWS CONNECTION-ID=UTILITY CORRELATION-ID=PIMGEKD2 825 JOBNAME=PIMGEKD2 ASID=0102 TCB=0088C840" 68 | result_dict, extract_parameter_list = self.parse_log_content(content) 69 | 70 | parameter_list = [] 71 | if extract_parameter_list is not None: 72 | for parameter in extract_parameter_list: 73 | parameter_list.append(parameter.value) 74 | 75 | event_id = result_dict.get(CLUSTER_ID_KEY, 1) - 1 76 | event_template = result_dict.get(TEMPLATE_MINED_KEY, 0) 77 | Occurrences = result_dict.get(CLUSTER_SIZE_KEY, DEFAULT_STR_VALUE) 78 | substr_detail_list = result_dict.get(SUBSTR_DETAIL_LIST_KEY, DEFAULT_STR_VALUE) 79 | substr_type_pattern = result_dict.get(SUBSTR_TYPE_PATTERN_KEY, DEFAULT_STR_VALUE) 80 | pattern_length = len(substr_detail_list) 81 | is_contain_chinese = result_dict.get(IS_CONTAIN_CHINESE_KEY, DEFAULT_STR_VALUE) 82 | token_list = result_dict.get(TOKEN_LIST_KEY, DEFAULT_STR_VALUE) 83 | token_count = len(token_list) 84 | event_key = "-" 85 | star_ratio = "-" 86 | analysis_result_detail = [ 87 | substr_detail_list, substr_type_pattern, pattern_length, 88 | is_contain_chinese, 89 | token_list, token_count, event_key, 90 | event_id, event_template, star_ratio, Occurrences, parameter_list] 91 | 92 | analysis_result_list.append(line_detail + analysis_result_detail) 93 | progress_bar.close() 94 | analysis_result_df = pd.DataFrame(analysis_result_list, 95 | columns=["_time", "content", 96 | "子串类型明细", "子串类型模式", "模式长度", 97 | "是否包含中文", 98 | "切分的结果", "切分后的长度", "event_key", 99 | "EventId", "EventTemplate", "star_ratio", "Occurrences", 100 | "ParameterList"]) 101 | save_dataframe(analysis_result_df, result_file_path) 102 | 103 | def compare_predict_with_gold(self, predict_file_path, gold_file_path, compare_result_file_path): 104 | predict_item_df = open_excel(predict_file_path) 105 | result_table_header = ["_time", "content", "EventId", "EventTemplate", "Occurrences", "ParameterList"] 106 | predict_item_df = predict_item_df[result_table_header] 107 | predict_item_count = len(predict_item_df) 108 | print(predict_item_count) 109 | 110 | 
gold_item_df = open_excel(gold_file_path) 111 | gold_item_df = gold_item_df[result_table_header] 112 | gold_item_count = len(gold_item_df) 113 | print(gold_item_count) 114 | if predict_item_count != gold_item_count: 115 | print( 116 | f"---error: predict_item_count != gold_item_count, predict_item_count = {predict_item_count}, gold_item_count = {gold_item_count}") 117 | return None 118 | progress_bar = tqdm(total=gold_item_count) 119 | compare_result_list = [] 120 | for row_index in range(predict_item_count): 121 | predict_line_detail = predict_item_df.loc[row_index].tolist() 122 | gold_line_detail = gold_item_df.loc[row_index].tolist() 123 | progress_bar.update(1) 124 | 125 | [time_predict, content_predict, EventId_predict, EventTemplate_predict, Occurrences_predict, 126 | ParameterList_predict] = predict_line_detail 127 | [time_gold, content_gold, EventId_gold, EventTemplate_gold, Occurrences_gold, 128 | ParameterList_gold] = gold_line_detail 129 | if time_predict != time_gold: 130 | print( 131 | f"---error: time_predict != time_gold, time_predict = {time_predict}, time_gold = {time_gold}") 132 | return None 133 | if content_predict != content_gold: 134 | print( 135 | f"---error: content_predict != content_gold, content_predict = {content_predict}, content_gold = {content_gold}") 136 | return None 137 | 138 | is_template_same = False 139 | if EventTemplate_predict == EventTemplate_gold: 140 | is_template_same = True 141 | 142 | ParameterList_predict = eval(ParameterList_predict) 143 | ParameterList_gold = eval(ParameterList_gold) 144 | is_parameter_same, intersection_set, only_in_predict_set, only_in_gold_set = get_tow_set_diff( 145 | set(ParameterList_predict), set(ParameterList_gold)) 146 | compare_result_detail = [time_gold, content_gold, EventId_gold, 147 | EventTemplate_predict, EventTemplate_gold, is_template_same, 148 | Occurrences_gold, 149 | ParameterList_predict, ParameterList_gold, is_parameter_same, intersection_set, 150 | only_in_predict_set, only_in_gold_set] 151 | compare_result_list.append(compare_result_detail) 152 | progress_bar.close() 153 | compare_result_df = pd.DataFrame(compare_result_list, columns=["_time", "content", "EventId", 154 | "EventTemplate_predict", "EventTemplate_gold", 155 | "is_template_same", 156 | "Occurrences_gold", 157 | "ParameterList_predict", "ParameterList_gold", 158 | "is_parameter_same", "intersection_set", 159 | "only_in_predict_set", "only_in_gold_set"]) 160 | save_dataframe(compare_result_df, compare_result_file_path) 161 | 162 | 163 | if __name__ == '__main__': 164 | is_get_parse_result = True 165 | is_get_indicator = False 166 | log_parser = LogParserByDrain3() 167 | 168 | if is_get_parse_result: 169 | raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv") 170 | result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_drain3.csv") 171 | log_parser.parse_log_file(raw_log_csv_path, result_file_path) 172 | 173 | raw_log_csv_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs.csv") 174 | result_file_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs_parse_by_drain3.csv") 175 | log_parser.parse_log_file(raw_log_csv_path, result_file_path) 176 | 177 | if is_get_indicator: 178 | raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv") 179 | result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_drain3.csv") 180 | gold_file_path = raw_log_csv_path 181 | compare_result_file_path = os.path.join(DATA_DIR_PATH, "解析结果与金标准对比的结果_by_drain3.xlsx") 182 | 
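        # Note (assumption based on the column selection in compare_predict_with_gold above):
        # the raw csv is reused here as the gold standard, so english_logs.csv is expected to
        # already contain the "EventId", "EventTemplate", "Occurrences" and "ParameterList"
        # columns; otherwise selecting result_table_header raises a KeyError.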
log_parser.compare_predict_with_gold(result_file_path, gold_file_path, compare_result_file_path) 183 | -------------------------------------------------------------------------------- /src/log_parser_by_statistic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from collections import defaultdict 4 | from tqdm import tqdm # 进度条 5 | from src.tool.read_save_file import open_excel, save_dataframe 6 | 7 | from src.common_config import DATA_DIR_PATH, CONNECTOR_CHAR, \ 8 | STAR_CHAR 9 | from src.tool.str_related import get_tow_set_diff 10 | from src.tool.tool import calculate_normalize_ratio 11 | from src.tool.tokenizer import get_token_list 12 | 13 | 14 | class LogParserByStatistics: 15 | 16 | def get_event_template_and_parameter(self, is_contain_chinese, token_list, token_2_frequency, event_occurrences): 17 | """ 18 | 功能:判断event中每个token出现的频次与event_occurrences是否相对, 19 | 如果相等,则该token就是模板中的词; 20 | 如果不相等,则该token就是parameter。 21 | :param is_contain_chinese: 22 | :param token_list: 23 | :param token_2_frequency: 记录当前event_id对应的所有token出现的频次 24 | :param event_occurrences:当前event出现的次数 25 | :return: 26 | """ 27 | template_token_list = [] 28 | parameter_set = set([]) 29 | parameter_list = [] 30 | star_count = 0 31 | for token in token_list: 32 | frequency = token_2_frequency[token] 33 | if frequency == event_occurrences: #如果该词在当前event中出现的频次等于该event出现的频次,则该词就是模板词 34 | template_token_list.append(token) 35 | continue 36 | 37 | template_token_list.append(STAR_CHAR) #该词是参数,用星号表示 38 | star_count += 1 39 | if token not in parameter_set: #将参数分别保存在list和set中 40 | parameter_set.add(token) 41 | parameter_list.append(token) 42 | 43 | connector_char = " " 44 | if is_contain_chinese == True: 45 | connector_char = "" 46 | event_template = connector_char.join(template_token_list) 47 | star_ratio = calculate_normalize_ratio(star_count, len(token_list)) 48 | return event_template, parameter_list, star_ratio 49 | 50 | def update_token_2_frequency(self, token_2_frequency, token_list): 51 | token_set = set(token_list) 52 | for token in token_set: 53 | if token in token_2_frequency: 54 | token_2_frequency[token] += 1 55 | else: 56 | token_2_frequency[token] = 1 57 | return token_2_frequency 58 | 59 | def update_event_key_2_id(self, event_key, event_key_2_id): 60 | if event_key not in event_key_2_id: 61 | event_id = len(event_key_2_id) 62 | event_key_2_id[event_key] = event_id 63 | return event_key_2_id 64 | 65 | def update_event_id_2_occurrences(self, event_id, event_id_2_occurrences): 66 | if event_id not in event_id_2_occurrences: 67 | event_id_2_occurrences[event_id] = 1 68 | else: 69 | event_id_2_occurrences[event_id] += 1 70 | return event_id_2_occurrences 71 | 72 | def parse_log_content(self, content, event_key_2_id, event_id_2_occurrences, event_id_2_token_2_frequency): 73 | 74 | is_contain_chinese, substr_type_pattern, substr_detail_list, token_list = get_token_list(content) 75 | pattern_length = len(substr_detail_list) 76 | token_count = len(token_list) 77 | 78 | event_key = substr_type_pattern + CONNECTOR_CHAR + str(token_count) 79 | self.update_event_key_2_id(event_key, event_key_2_id) 80 | event_id = event_key_2_id[event_key] 81 | 82 | self.update_event_id_2_occurrences(event_id, event_id_2_occurrences) 83 | Occurrences = event_id_2_occurrences[event_id] 84 | 85 | token_2_frequency = event_id_2_token_2_frequency[event_id] 86 | token_2_frequency_new = self.update_token_2_frequency(token_2_frequency, token_list) 87 | 
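        # Worked example of the statistic rule in get_event_template_and_parameter() above
        # (hypothetical numbers): if this event has occurred 3 times and the per-event token
        # counts are {"Failed": 3, "password": 3, "user1": 1}, tokens whose count equals the
        # occurrence count stay in the template while the others become "*", giving the
        # template "Failed password *" and the parameter list ["user1"].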
event_id_2_token_2_frequency[event_id] = token_2_frequency_new 88 | 89 | event_template, parameter_list, star_ratio = self.get_event_template_and_parameter(is_contain_chinese, token_list, 90 | token_2_frequency, Occurrences) 91 | analysis_result_detail = [ 92 | substr_detail_list, substr_type_pattern, pattern_length, 93 | is_contain_chinese, 94 | token_list, token_count, event_key, 95 | event_id, event_template,star_ratio, Occurrences, parameter_list] 96 | return analysis_result_detail 97 | 98 | 99 | def parse_log_file(self, raw_log_csv_path, result_file_path): 100 | log_item_df = open_excel(raw_log_csv_path) 101 | log_csv_header = ["_time", "content"] 102 | log_item_df = log_item_df[log_csv_header] 103 | analysis_result_list = [] 104 | event_key_2_id = {} 105 | event_id_2_occurrences = {} 106 | event_id_2_token_2_frequency = defaultdict(dict) 107 | log_item_count = len(log_item_df) 108 | progress_bar = tqdm(total=log_item_count) 109 | for line_index, line_detail in enumerate(log_item_df.values.tolist()): 110 | [time_str, content] = line_detail 111 | progress_bar.update(1) 112 | if content != content: 113 | content = "" 114 | if isinstance(content,str)==False: 115 | content = str(content) 116 | #content = "终端服务器安全层在协议流中检测到错误,并已取消客户端连接。 客户端 IP: 192.168.100.132。" 117 | #content = "DSN3201I -PB4A ABNORMAL EOT IN PROGRESS FOR 825 825 USER=NVTWS CONNECTION-ID=UTILITY CORRELATION-ID=PIMGEKD2 825 JOBNAME=PIMGEKD2 ASID=0102 TCB=0088C840" 118 | analysis_result_detail = self.parse_log_content(content, event_key_2_id, event_id_2_occurrences, event_id_2_token_2_frequency) 119 | 120 | analysis_result_list.append(line_detail + analysis_result_detail) 121 | progress_bar.close() 122 | analysis_result_df = pd.DataFrame(analysis_result_list, 123 | columns=["_time", "content", 124 | "子串类型明细", "子串类型模式","模式长度", 125 | "是否包含中文", 126 | "切分的结果", "切分后的长度","event_key", 127 | "EventId", "EventTemplate","star_ratio", "Occurrences", "ParameterList"]) 128 | save_dataframe(analysis_result_df, result_file_path) 129 | 130 | def compare_predict_with_gold(self, predict_file_path, gold_file_path, compare_result_file_path): 131 | predict_item_df = open_excel(predict_file_path) 132 | result_table_header = ["_time", "content","EventId", "EventTemplate", "Occurrences", "ParameterList"] 133 | predict_item_df = predict_item_df[result_table_header] 134 | predict_item_count = len(predict_item_df) 135 | print(predict_item_count) 136 | 137 | gold_item_df = open_excel(gold_file_path) 138 | gold_item_df = gold_item_df[result_table_header] 139 | gold_item_count = len(gold_item_df) 140 | print(gold_item_count) 141 | if predict_item_count != gold_item_count: 142 | print(f"---error: predict_item_count != gold_item_count, predict_item_count = {predict_item_count}, gold_item_count = {gold_item_count}") 143 | return None 144 | progress_bar = tqdm(total=gold_item_count) 145 | compare_result_list = [] 146 | for row_index in range(predict_item_count): 147 | predict_line_detail = predict_item_df.loc[row_index].tolist() 148 | gold_line_detail = gold_item_df.loc[row_index].tolist() 149 | progress_bar.update(1) 150 | 151 | [time_predict, content_predict, EventId_predict, EventTemplate_predict, Occurrences_predict, ParameterList_predict] = predict_line_detail 152 | [time_gold, content_gold, EventId_gold, EventTemplate_gold, Occurrences_gold, ParameterList_gold] = gold_line_detail 153 | if time_predict != time_gold: 154 | print( 155 | f"---error: time_predict != time_gold, time_predict = {time_predict}, time_gold = {time_gold}") 156 | return None 157 | 
if content_predict != content_gold: 158 | print( 159 | f"---error: content_predict != content_gold, content_predict = {content_predict}, content_gold = {content_gold}") 160 | return None 161 | 162 | is_template_same = False 163 | if EventTemplate_predict == EventTemplate_gold: 164 | is_template_same = True 165 | 166 | ParameterList_predict = eval(ParameterList_predict) 167 | ParameterList_gold = eval(ParameterList_gold) 168 | is_parameter_same, intersection_set, only_in_predict_set, only_in_gold_set = get_tow_set_diff(set(ParameterList_predict), set(ParameterList_gold)) 169 | compare_result_detail = [time_gold, content_gold, EventId_gold, 170 | EventTemplate_predict, EventTemplate_gold, is_template_same, 171 | Occurrences_gold, 172 | ParameterList_predict, ParameterList_gold, is_parameter_same, intersection_set, only_in_predict_set, only_in_gold_set] 173 | compare_result_list.append(compare_result_detail) 174 | progress_bar.close() 175 | compare_result_df = pd.DataFrame(compare_result_list, columns=["_time", "content","EventId", 176 | "EventTemplate_predict", "EventTemplate_gold", "is_template_same", 177 | "Occurrences_gold", 178 | "ParameterList_predict", "ParameterList_gold", "is_parameter_same", "intersection_set", "only_in_predict_set", "only_in_gold_set"]) 179 | save_dataframe(compare_result_df, compare_result_file_path) 180 | 181 | 182 | if __name__ == '__main__': 183 | is_get_parse_result = True 184 | is_get_indicator = True 185 | log_parser = LogParserByStatistics() 186 | 187 | if is_get_parse_result: 188 | raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv") 189 | result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_statistic.csv") 190 | log_parser.parse_log_file(raw_log_csv_path, result_file_path) 191 | 192 | raw_log_csv_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs.csv") 193 | result_file_path = os.path.join(DATA_DIR_PATH, "chinese_english_logs_parse_by_statistic.csv") 194 | log_parser.parse_log_file(raw_log_csv_path, result_file_path) 195 | 196 | if is_get_indicator: 197 | raw_log_csv_path = os.path.join(DATA_DIR_PATH, "english_logs.csv") 198 | result_file_path = os.path.join(DATA_DIR_PATH, "english_logs_parse_by_statistic.csv") 199 | gold_file_path = raw_log_csv_path 200 | compare_result_file_path = os.path.join(DATA_DIR_PATH, "解析结果与金标准对比的结果_by_statistic.xlsx") 201 | log_parser.compare_predict_with_gold(result_file_path, gold_file_path,compare_result_file_path) -------------------------------------------------------------------------------- /src/tool/read_save_file.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from src.common_config import get_project_dir_path 3 | 4 | import pandas as pd 5 | def open_excel(input_file_path, sheet_name = "Sheet1", column_2_str = None): 6 | if input_file_path.endswith(".xlsx"): 7 | if sheet_name == "": 8 | if column_2_str is None: 9 | item_name_df = pd.read_excel(input_file_path) 10 | else: 11 | item_name_df = pd.read_excel(input_file_path, converters=column_2_str) 12 | else: 13 | if column_2_str is None: 14 | item_name_df = pd.read_excel(input_file_path, sheet_name) 15 | else: 16 | item_name_df = pd.read_excel(input_file_path, sheet_name, converters=column_2_str) 17 | #item_name_df = pd.read_excel(input_file_path, sheet_name) 18 | elif input_file_path.endswith(".csv"): 19 | try: 20 | item_name_df = pd.read_csv(input_file_path, encoding='utf-8',engine ='python') 21 | except: 22 | try: 23 | item_name_df = pd.read_csv(input_file_path, 
encoding='gb2312', engine='python') 24 | except: 25 | traceback.print_exc() 26 | else: 27 | print("input file type is not xlsx or csv") 28 | item_name_df = None 29 | return item_name_df 30 | 31 | 32 | def save_dataframe(dataset_df, file_path, sheet_name ="Sheet1"): 33 | if file_path.endswith(".xlsx"): 34 | dataset_df.to_excel(file_path, index=False, sheet_name = sheet_name) 35 | elif file_path.endswith(".csv"): 36 | dataset_df.to_csv(file_path, index=False) 37 | else: 38 | print("file path is not end with .xlsx or .csv") 39 | return None 40 | cur_dir = get_project_dir_path() 41 | short_path = file_path.replace(cur_dir, "") 42 | print("finish to save data in {}".format(short_path)) 43 | 44 | def save_to_multi_sheet(file_path, dataframe_sheet_tuples): 45 | with pd.ExcelWriter(file_path) as writer: 46 | for dataset_df, sheet_name in dataframe_sheet_tuples: 47 | dataset_df.to_excel(writer,index=False, sheet_name=sheet_name) 48 | print("finish to save result in {}".format(file_path)) 49 | 50 | 51 | def save_dataframe_by_csv(dataset_df, file_path): 52 | dataset_df.to_csv(file_path,index=False) 53 | cur_dir = get_project_dir_path() 54 | short_path = file_path.replace(cur_dir, "") 55 | print("finish to save data in {}".format(short_path)) -------------------------------------------------------------------------------- /src/tool/str_related.py: -------------------------------------------------------------------------------- 1 | import copy, re 2 | from collections import defaultdict 3 | 4 | from src.common_config import DEFAULT_STR_VALUE 5 | 6 | 7 | def process_none_str(input_str): 8 | if input_str != input_str: 9 | return DEFAULT_STR_VALUE 10 | if input_str is None: 11 | return DEFAULT_STR_VALUE 12 | if input_str == "": 13 | return DEFAULT_STR_VALUE 14 | return input_str 15 | 16 | def str_normalize(input_str): 17 | input_str = process_none_str(input_str) 18 | if isinstance(input_str, str): 19 | normalize_str = input_str.strip() 20 | normalize_str = normalize_str.replace("(", "(").replace(")",")") 21 | else: 22 | normalize_str = str(input_str) 23 | normalize_str = normalize_str.replace("\r", "").replace("\n", "") 24 | normalize_str = string_full_to_half(normalize_str) 25 | return normalize_str 26 | 27 | 28 | def get_tow_set_diff(set_a, set_b): 29 | intersection_set = set_a & set_b 30 | only_in_a_set = set_a - intersection_set 31 | only_in_b_set = set_b - intersection_set 32 | is_tow_set_same = False 33 | if len(intersection_set) == len(set_a) and len(intersection_set) == len(set_b): 34 | is_tow_set_same = True 35 | return is_tow_set_same, intersection_set, only_in_a_set, only_in_b_set 36 | 37 | def get_bracket_index(input_str, is_debug = False): 38 | """ 39 | 功能:获取括号的索引,按从左到右的顺序 40 | @param input_str 输入的字符串,例如"()))","(()", ")()())", "", "(())", "((()())" 41 | @return bracket_index_list,以list的形式,范围最长有效括号组合的索引,格式为[(left_bracket_index, right_bracket_index), (left_bracket_index, right_bracket_index),] 42 | """ 43 | if is_debug == True: 44 | print("--------input_str = {}".format(input_str)) 45 | raw_bracket_list = [] 46 | bracket_index_list = [] 47 | for bracket_index, temp_char in enumerate(input_str): 48 | if temp_char != "(" and temp_char != ")": 49 | continue 50 | raw_bracket_list.append((temp_char, bracket_index)) 51 | 52 | left_bracket_index_stack = [] 53 | bracket_count = len(raw_bracket_list) 54 | for i in range(bracket_count): 55 | (bracket_symbol, bracket_index) = raw_bracket_list[i] 56 | if bracket_symbol == "(": 57 | left_bracket_index_stack.append(bracket_index) 58 | elif bracket_symbol == 
")": 59 | if len(left_bracket_index_stack) == 0:#如果没有左括号,则当前的有右括号是无效的 60 | continue 61 | left_bracket_index = left_bracket_index_stack.pop(-1) 62 | bracket_index_list.append((left_bracket_index, bracket_index)) 63 | bracket_index_list = merge_interval(bracket_index_list) 64 | 65 | for target_index_pair in bracket_index_list: 66 | (left_bracket_index, right_bracket_index) = target_index_pair 67 | target_str = input_str[left_bracket_index: right_bracket_index+1] 68 | if is_debug == True: 69 | print("input_str = {0}, left_bracket_index = {1}, right_bracket_index = {2}, target_str = {3}".format(input_str, left_bracket_index, right_bracket_index, target_str)) 70 | return bracket_index_list 71 | 72 | def drop_bracket_content(mj_name): 73 | bracket_index_list = get_bracket_index(mj_name) 74 | right_index = len(mj_name) 75 | new_name = mj_name 76 | prefix_end_index = -1 77 | for temp in bracket_index_list[::-1]: 78 | 79 | [start_index, end_index] = temp 80 | new_name = new_name[:start_index] + " " + new_name[end_index+1:] 81 | # end_right_index = end_index + 1 82 | # if end_right_index == right_index: 83 | # right_index = start_index 84 | # suffix_bracket_content = mj_name[start_index: end_right_index] 85 | # suffix_bracket_content_list.insert(0, suffix_bracket_content) 86 | # prefix_end_index = start_index 87 | # else: 88 | # break 89 | return new_name 90 | 91 | def get_bracket_content_prefix(mj_name): 92 | """ 93 | 获取括号内容的前缀,括号内容 94 | """ 95 | bracket_content_list = [] 96 | bracket_index_list = get_bracket_index(mj_name) 97 | right_index = len(mj_name) 98 | prefix_end_index = -1 99 | for temp in bracket_index_list[::-1]: 100 | [start_index, end_index] = temp 101 | end_right_index = end_index + 1 102 | if end_right_index == right_index: 103 | right_index = start_index 104 | bracket_content = mj_name[start_index: end_right_index] 105 | bracket_content_list.insert(0, bracket_content) 106 | prefix_end_index = start_index 107 | else: 108 | break 109 | if prefix_end_index != -1: 110 | prefix_content = mj_name[:prefix_end_index] 111 | else: 112 | prefix_content = "" 113 | suffix_bracket_content_join = "".join(bracket_content_list) 114 | all_bracket_content = suffix_bracket_content_join.replace("(", "").replace(")", "").strip() 115 | return prefix_content, bracket_content_list, all_bracket_content 116 | 117 | def get_regexp_match_results(input_name, to_match_regexps ): 118 | """ 119 | 抽取正则匹配的结果 120 | """ 121 | result_list = to_match_regexps.finditer(input_name) 122 | match_detail_list = [] 123 | for result_detail in result_list: 124 | (left_index, right_index) = result_detail.span() 125 | match_str = result_detail.group() 126 | match_detail_list.insert(0, [left_index, right_index, match_str]) 127 | return match_detail_list 128 | 129 | def char_full_2_half(uchar): 130 | """单个字符 全角转半角""" 131 | inside_code = ord(uchar) 132 | if inside_code == 0x3000: 133 | inside_code = 0x0020 134 | else: 135 | inside_code -= 0xfee0 136 | if inside_code < 0x0020 or inside_code > 0x7e: #转完之后不是半角字符返回原来的字符 137 | return uchar 138 | return chr(inside_code) 139 | 140 | def string_full_to_half(ustring): 141 | """把字符串全角转半角""" 142 | return "".join([char_full_2_half(uchar) for uchar in ustring]) 143 | 144 | def get_regexp_match_result(to_match_regexp, temp_str): 145 | target_list = [] 146 | result_list = to_match_regexp.finditer(temp_str) 147 | for result_detail in result_list: 148 | (left_index, right_index) = result_detail.span() 149 | match_str = result_detail.group() 150 | target_list.insert(0, [left_index, right_index, 
match_str]) 151 | return target_list 152 | 153 | 154 | 155 | 156 | 157 | if __name__ == "__main__": 158 | if 0: 159 | input_name = "★(甲)速效救心丸(50粒*3瓶)" 160 | drop_bracket_content(input_name) 161 | if 0: 162 | ustring = "维生素b12注射液" 163 | new_str = string_full_to_half(ustring) 164 | print(ustring, new_str) 165 | -------------------------------------------------------------------------------- /src/tool/tokenizer.py: -------------------------------------------------------------------------------- 1 | from src.common_config import DATA_DIR_PATH,CHINESE_REGEXP,CONNECTOR_CHAR,\ 2 | PUNCTUATION_MARK_REGEXP,NONE_CHINESE_REGEXP, CHINESE_SUBSTR_TYPE,SPACE_SUBSTR_TYPE, ENGLISH_SUBSTR_TYPE,\ 3 | CHINESE_SPACE_CHINESE_PATTERN,PUNCTUATION_MARK_TYPE 4 | from src.tool.str_related import str_normalize, get_tow_set_diff 5 | import jieba 6 | 7 | def get_substr_pattern(content): 8 | substr_detail_list = [] 9 | reg_match_list = CHINESE_REGEXP.finditer(content) 10 | 11 | for match_item in reg_match_list: 12 | match_str = match_item.group() 13 | (start_index, end_index) = match_item.span() 14 | substr_detail_list.append([start_index, end_index, match_str, CHINESE_SUBSTR_TYPE]) # 不包括end_index 15 | 16 | reg_match_list = PUNCTUATION_MARK_REGEXP.finditer(content) 17 | for match_item in reg_match_list: 18 | match_str = match_item.group() 19 | (start_index, end_index) = match_item.span() 20 | substr_detail_list.append([start_index, end_index, match_str, PUNCTUATION_MARK_TYPE]) # 不包括end_index 21 | 22 | reg_match_list = NONE_CHINESE_REGEXP.finditer(content) #提取非中文的结果 23 | for match_item in reg_match_list: 24 | match_str = match_item.group() 25 | (start_index, end_index) = match_item.span() 26 | match_str_strip = match_str.strip() 27 | 28 | #获取前缀空格 29 | match_index = match_str.find(match_str_strip) 30 | prefix_space_start_index = start_index 31 | prefix_space_end_index = prefix_space_start_index + match_index 32 | if prefix_space_start_index != prefix_space_end_index: 33 | prefix_space_str = content[prefix_space_start_index:prefix_space_end_index] 34 | substr_detail_list.append([prefix_space_start_index, prefix_space_end_index, prefix_space_str, 35 | SPACE_SUBSTR_TYPE]) # 不包括end_index 36 | 37 | #获取中间的英文字符串 38 | mid_substr_start_index = prefix_space_end_index 39 | mid_str_end_index = mid_substr_start_index + len(match_str_strip) 40 | if mid_substr_start_index != mid_str_end_index: 41 | mid_substr = content[mid_substr_start_index:mid_str_end_index] 42 | substr_detail_list.append( [mid_substr_start_index, mid_str_end_index, mid_substr,ENGLISH_SUBSTR_TYPE]) # 不包括end_index 43 | 44 | #获取结尾的空格 45 | suffix_space_start_index = mid_str_end_index 46 | suffix_space_end_index = end_index 47 | if suffix_space_start_index != suffix_space_end_index: 48 | suffix_space_str = content[suffix_space_start_index:suffix_space_end_index] 49 | substr_detail_list.append( 50 | [suffix_space_start_index, suffix_space_end_index, suffix_space_str, 51 | SPACE_SUBSTR_TYPE]) # 不包括end_index 52 | 53 | substr_detail_list.sort(key=lambda x: x[0], reverse=False) 54 | 55 | substr_type_pattern = CONNECTOR_CHAR.join([item[3] for item in substr_detail_list]) 56 | # print(substr_detail_list) 57 | # print(substr_type_pattern) 58 | return substr_detail_list, substr_type_pattern 59 | 60 | def split_substr(substr_detail_list, need_split_substr_type, is_split_by_space): 61 | """ 62 | 63 | :param substr_detail_list: 64 | :param need_split_substr_type: 表示哪些类型的子串需要被切分 65 | :param is_split_by_space: 表示是否以空格的方式来切,如果该值为False,则表示用结巴来切分 66 | :return: 67 | """ 68 | split_list = 
[] 69 | #split_substr_count = 0 70 | for substr_item_detail in substr_detail_list: 71 | [start_index, end_index, match_str, substr_type] = substr_item_detail 72 | if substr_type == need_split_substr_type: 73 | if is_split_by_space: 74 | temp_token_list = match_str.split() 75 | else: 76 | temp_token_list = list(jieba.cut(match_str)) 77 | 78 | split_list.extend(temp_token_list) 79 | else: 80 | if substr_type == SPACE_SUBSTR_TYPE: 81 | continue 82 | split_list.append(match_str) 83 | return split_list 84 | 85 | def get_token_list(content): 86 | content = content.strip() 87 | # content = str_normalize(content) 88 | substr_detail_list, substr_type_pattern = get_substr_pattern(content) 89 | is_contain_chinese = False 90 | if substr_type_pattern.find(CHINESE_SUBSTR_TYPE) != -1: # 如果模式中包含中文 91 | is_contain_chinese = True 92 | if is_contain_chinese: # 如果模式中包含中文 93 | if substr_type_pattern.find(CHINESE_SPACE_CHINESE_PATTERN) != -1: # 如果模式中包含中文空格中文,则将中文按空格切分 94 | token_list = split_substr(substr_detail_list, CHINESE_SUBSTR_TYPE, is_split_by_space=True) 95 | else: # 情况2,中文与中文之间,没有空格隔开,则针对中文用jieba分词,英文的保持不变 96 | token_list = split_substr(substr_detail_list, CHINESE_SUBSTR_TYPE, is_split_by_space=False) 97 | else: # 即模式中不包含中文,则对英文按空格进行切分 98 | token_list = split_substr(substr_detail_list, ENGLISH_SUBSTR_TYPE, is_split_by_space=True) 99 | return is_contain_chinese, substr_type_pattern, substr_detail_list, token_list 100 | 101 | 102 | if __name__ == '__main__': 103 | content = "今天 456 名。明天" 104 | get_substr_pattern(content) 105 | content = "今天 4 56 名" 106 | get_substr_pattern(content) 107 | content = "终端服务器安全层在协议流中检测到错误,并已取消客户端连接。 客户端 IP: 192.168.100.132。" 108 | get_substr_pattern(content) -------------------------------------------------------------------------------- /src/tool/tool.py: -------------------------------------------------------------------------------- 1 | import os, json, random, sys 2 | import requests 3 | 4 | from collections import Counter 5 | 6 | 7 | # 日志耗时装饰器 8 | import time, datetime 9 | import functools 10 | 11 | def get_project_dir_path(): 12 | 13 | # cur_path = os.getcwd() 14 | # print("get_project_dir_path, cur_path = {}".format(cur_path)) 15 | # project_dir_path = os.path.abspath(os.path.join(os.getcwd(), "../..")) 16 | cur_file_path = os.path.abspath(__file__) 17 | #print("cur_file_path = {}".format(cur_file_path)) 18 | # cur_dir_path = os.path.dirname(cur_file_path) 19 | # print("get_project_dir_path, cur_dir_path = {}".format(cur_dir_path)) 20 | 21 | project_dir_path = os.path.abspath(os.path.join(cur_file_path, "../../..")) 22 | print("get_project_dir_path, project_dir_path = {}".format(project_dir_path)) 23 | return project_dir_path 24 | 25 | def merge_interval(interval_list): 26 | """ 27 | 区间合并,参考https://leetcode-cn.com/problems/merge-intervals/ 28 | """ 29 | interval_count = len(interval_list) 30 | if interval_count <= 1: 31 | return interval_list 32 | 33 | merge_interval_list = [] 34 | start_acsend_intervals = sorted(interval_list, key=lambda x: x[0], reverse=False) 35 | [prev_interval_start, prev_interval_end] = start_acsend_intervals[0] 36 | for i in range(1, interval_count): 37 | [cur_start, cur_end] = start_acsend_intervals[i] 38 | 39 | if prev_interval_end < cur_start: # [[,4],[8,]] 40 | merge_interval_list.append([prev_interval_start, prev_interval_end]) 41 | prev_interval_start = cur_start 42 | prev_interval_end = cur_end 43 | else: # prev_internal_end >= cur_start #[[1,4],[3,4]] 44 | prev_interval_end = max(prev_interval_end, cur_end) 45 | 
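    # Worked example (hypothetical input): merge_interval([[1, 4], [3, 5], [8, 9]]) sorts the
    # intervals by start, folds [1, 4] and [3, 5] into [1, 5] because 4 >= 3, keeps [8, 9]
    # separate, and the statement below appends the final pending interval, so the result
    # is [[1, 5], [8, 9]].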
merge_interval_list.append([prev_interval_start, prev_interval_end]) 46 | return merge_interval_list 47 | 48 | def calculate_normalize_ratio(frequency, frequency_sum): 49 | """ 50 | 计算归一化的比值 51 | """ 52 | if frequency_sum > 0: 53 | ratio = (frequency / frequency_sum) 54 | ratio = format(ratio, '.2f') # 保留2位小数 55 | else: 56 | ratio = "-" 57 | return ratio -------------------------------------------------------------------------------- /tests/drain3_test.ini: -------------------------------------------------------------------------------- 1 | [SNAPSHOT] 2 | snapshot_interval_minutes = 10 3 | compress_state = True 4 | 5 | [MASKING] 6 | masking = [ 7 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, 8 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, 9 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 10 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 11 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, 12 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"}, 13 | {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"} 14 | ] 15 | 16 | [DRAIN] 17 | sim_th = 0.4 18 | depth = 4 19 | max_children = 100 20 | max_clusters = 1024 21 | extra_delimiters = ["_"] 22 | 23 | [PROFILING] 24 | enabled = True 25 | report_sec = 30 26 | -------------------------------------------------------------------------------- /tests/test_drain.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import unittest 4 | 5 | from drain3.drain import Drain, LogCluster 6 | 7 | 8 | class DrainTest(unittest.TestCase): 9 | 10 | def test_add_shorter_than_depth_message(self): 11 | model = Drain(depth=4) 12 | res = model.add_log_message("word") 13 | print(res[1]) 14 | print(res[0]) 15 | self.assertEqual(res[1], "cluster_created") 16 | 17 | res = model.add_log_message("word") 18 | print(res[1]) 19 | print(res[0]) 20 | self.assertEqual(res[1], "none") 21 | 22 | res = model.add_log_message("otherword") 23 | print(res[1]) 24 | print(res[0]) 25 | self.assertEqual(res[1], "cluster_created") 26 | 27 | self.assertEqual(2, len(model.id_to_cluster)) 28 | 29 | def test_add_log_message(self): 30 | model = Drain() 31 | entries = str.splitlines( 32 | """ 33 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 34 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 35 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 36 | Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2 37 | Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2 38 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 39 | """ 40 | ) 41 | expected = str.splitlines( 42 | """ 43 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 44 | Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth] 45 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 46 | Dec 10 <*> LabSZ <*> Failed 
password for invalid user <*> from 0.0.0.0 port <*> ssh2 47 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 48 | Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth] 49 | """ 50 | ) 51 | actual = [] 52 | 53 | for entry in entries: 54 | cluster, change_type = model.add_log_message(entry) 55 | actual.append(cluster.get_template()) 56 | 57 | self.assertListEqual(list(map(str.strip, expected)), actual) 58 | self.assertEqual(8, model.get_total_cluster_size()) 59 | 60 | def test_add_log_message_sim_75(self): 61 | """When `sim_th` is set to 75% then only certain log entries match. 62 | 63 | In this test similarity threshold is set to 75% which makes the model 64 | less aggressive in grouping entries into clusters. In particular, it 65 | only finds clusters for "Failed password" entries. 66 | """ 67 | model = Drain( 68 | depth=4, 69 | sim_th=0.75, 70 | max_children=100, 71 | ) 72 | entries = str.splitlines( 73 | """ 74 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 75 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 76 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 77 | Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2 78 | Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2 79 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 80 | """ 81 | ) 82 | expected = str.splitlines( 83 | """ 84 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 85 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 86 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 87 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 88 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 89 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 90 | """ 91 | ) 92 | actual = [] 93 | 94 | for entry in entries: 95 | cluster, change_type = model.add_log_message(entry) 96 | actual.append(cluster.get_template()) 97 | 98 | self.assertListEqual(list(map(str.strip, expected)), actual) 99 | self.assertEqual(8, model.get_total_cluster_size()) 100 | 101 | def test_max_clusters(self): 102 | """Verify model respects the max_clusters option. 103 | 104 | Key difference between this and other tests is that with `max_clusters` 105 | set to 1 model is capable of keeping track of a single cluster at a 106 | time. Consequently, when log stream switched form the A format to the B 107 | and back model doesn't recognize it and returnes a new template with no 108 | slots. 
109 | """ 110 | model = Drain(max_clusters=1) 111 | entries = str.splitlines( 112 | """ 113 | A format 1 114 | A format 2 115 | B format 1 116 | B format 2 117 | A format 3 118 | """ 119 | ) 120 | expected = str.splitlines( 121 | """ 122 | A format 1 123 | A format <*> 124 | B format 1 125 | B format <*> 126 | A format 3 127 | """ 128 | ) 129 | actual = [] 130 | 131 | for entry in entries: 132 | cluster, change_type = model.add_log_message(entry) 133 | actual.append(cluster.get_template()) 134 | 135 | self.assertListEqual(list(map(str.strip, expected)), actual) 136 | self.assertEqual(1, model.get_total_cluster_size()) 137 | 138 | def test_max_clusters_lru_multiple_leaf_nodes(self): 139 | """When all templates end up in different nodes and the max number of 140 | clusters is reached, then clusters are removed according to the lru 141 | policy. 142 | """ 143 | model = Drain(max_clusters=2, depth=4, param_str="*") 144 | entries = [ 145 | "A A A", 146 | "A A B", 147 | "B A A", 148 | "B A B", 149 | "C A A", 150 | "C A B", 151 | "B A A", 152 | "A A A", 153 | ] 154 | expected = [ 155 | # lru: [] 156 | "A A A", 157 | # lru: ["A A A"] 158 | "A A *", 159 | # lru: ["A A *"] 160 | "B A A", 161 | # lru: ["B A A", "A A *"] 162 | "B A *", 163 | # lru: ["B A *", "A A *"] 164 | "C A A", 165 | # lru: ["C A A", "B A *"] 166 | "C A *", 167 | # lru: ["C A *", "B A *"] 168 | "B A *", 169 | # Message "B A A" was normalized because the template "B A *" is 170 | # still present in the cache. 171 | # lru: ["B A *", "C A *"] 172 | "A A A", 173 | # Message "A A A" was not normalized because the template "C A A" 174 | # pushed out the template "A A *" from the cache. 175 | # lru: ["A A A", "C A *"] 176 | ] 177 | actual = [] 178 | 179 | for entry in entries: 180 | cluster, _ = model.add_log_message(entry) 181 | actual.append(cluster.get_template()) 182 | 183 | self.assertListEqual(list(map(str.strip, expected)), actual) 184 | self.assertEqual(4, model.get_total_cluster_size()) 185 | 186 | def test_max_clusters_lru_single_leaf_node(self): 187 | """When all templates end up in the same leaf node and the max number of 188 | clusters is reached, then clusters are removed according to the lru 189 | policy. 190 | """ 191 | model = Drain(max_clusters=2, depth=4, param_str="*") 192 | entries = [ 193 | "A A A", 194 | "A A B", 195 | "A B A", 196 | "A B B", 197 | "A C A", 198 | "A C B", 199 | "A B A", 200 | "A A A", 201 | ] 202 | expected = [ 203 | # lru: [] 204 | "A A A", 205 | # lru: ["A A A"] 206 | "A A *", 207 | # lru: ["A A *"] 208 | "A B A", 209 | # lru: ["B A A", "A A *"] 210 | "A B *", 211 | # lru: ["B A *", "A A *"] 212 | "A C A", 213 | # lru: ["C A A", "B A *"] 214 | "A C *", 215 | # lru: ["C A *", "B A *"] 216 | "A B *", 217 | # Message "B A A" was normalized because the template "B A *" is 218 | # still present in the cache. 219 | # lru: ["B A *", "C A *"] 220 | "A A A", 221 | # Message "A A A" was not normalized because the template "C A A" 222 | # pushed out the template "A A *" from the cache. 
223 | # lru: ["A A A", "C A *"] 224 | ] 225 | actual = [] 226 | 227 | for entry in entries: 228 | cluster, _ = model.add_log_message(entry) 229 | actual.append(cluster.get_template()) 230 | 231 | self.assertListEqual(list(map(str.strip, expected)), actual) 232 | # self.assertEqual(5, model.get_total_cluster_size()) 233 | 234 | def test_match_only(self): 235 | model = Drain() 236 | res = model.add_log_message("aa aa aa") 237 | print(res[0]) 238 | 239 | res = model.add_log_message("aa aa bb") 240 | print(res[0]) 241 | 242 | res = model.add_log_message("aa aa cc") 243 | print(res[0]) 244 | 245 | res = model.add_log_message("xx yy zz") 246 | print(res[0]) 247 | 248 | c: LogCluster = model.match("aa aa tt") 249 | self.assertEqual(1, c.cluster_id) 250 | 251 | c: LogCluster = model.match("xx yy zz") 252 | self.assertEqual(2, c.cluster_id) 253 | 254 | c: LogCluster = model.match("xx yy rr") 255 | self.assertIsNone(c) 256 | 257 | c: LogCluster = model.match("nothing") 258 | self.assertIsNone(c) 259 | 260 | -------------------------------------------------------------------------------- /tests/test_masking.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import unittest 4 | 5 | from drain3.masking import MaskingInstruction, LogMasker 6 | 7 | 8 | class MaskingTest(unittest.TestCase): 9 | 10 | def test_instructions_by_mask_name(self): 11 | instructions = [] 12 | a = MaskingInstruction(r"a", "1") 13 | instructions.append(a) 14 | b = MaskingInstruction(r"b", "1") 15 | instructions.append(b) 16 | c = MaskingInstruction(r"c", "2") 17 | instructions.append(c) 18 | d = MaskingInstruction(r"d", "3") 19 | instructions.append(d) 20 | x = MaskingInstruction(r"x", "something else") 21 | instructions.append(x) 22 | y = MaskingInstruction(r"y", "something else") 23 | instructions.append(y) 24 | masker = LogMasker(instructions, "", "") 25 | self.assertCountEqual(["1", "2", "3", "something else"], masker.mask_names) 26 | self.assertCountEqual([a, b], masker.instructions_by_mask_name("1")) 27 | self.assertCountEqual([c], masker.instructions_by_mask_name("2")) 28 | self.assertCountEqual([d], masker.instructions_by_mask_name("3")) 29 | self.assertCountEqual([x, y], masker.instructions_by_mask_name("something else")) 30 | 31 | def test_mask(self): 32 | s = "D9 test 999 888 1A ccc 3" 33 | mi = MaskingInstruction(r"((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)", "NUM") 34 | masker = LogMasker([mi], "") 35 | masked = masker.mask(s) 36 | self.assertEqual("D9 test 1A ccc ", masked) 37 | -------------------------------------------------------------------------------- /tests/test_template_miner.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import io 4 | import logging 5 | import sys 6 | import unittest 7 | from os.path import dirname 8 | 9 | from drain3 import TemplateMiner 10 | from drain3.masking import MaskingInstruction 11 | from drain3.memory_buffer_persistence import MemoryBufferPersistence 12 | from drain3.template_miner_config import TemplateMinerConfig 13 | 14 | 15 | class TemplateMinerTest(unittest.TestCase): 16 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') 17 | 18 | def test_load_config(self): 19 | config = TemplateMinerConfig() 20 | config.load(dirname(__file__) + "/drain3_test.ini") 21 | self.assertEqual(1024, config.drain_max_clusters) 22 | self.assertListEqual(["_"], config.drain_extra_delimiters) 23 | 
self.assertEqual(7, len(config.masking_instructions))
24 |
25 | def test_save_load_snapshot_unlimited_clusters(self):
26 | self.save_load_snapshot(None)
27 |
28 | def test_save_load_snapshot_limited_clusters(self):
29 | self.save_load_snapshot(10)
30 |
31 | def save_load_snapshot(self, max_clusters):
32 | persistence = MemoryBufferPersistence()
33 |
34 | config = TemplateMinerConfig()
35 | config.drain_max_clusters = max_clusters
36 | template_miner1 = TemplateMiner(persistence, config)
37 | print(template_miner1.add_log_message("hello"))
38 | print(template_miner1.add_log_message("hello ABC"))
39 | print(template_miner1.add_log_message("hello BCD"))
40 | print(template_miner1.add_log_message("hello XYZ"))
41 | print(template_miner1.add_log_message("goodbye XYZ"))
42 |
43 | template_miner2 = TemplateMiner(persistence, config)
44 |
45 | self.assertListEqual(list(template_miner1.drain.id_to_cluster.keys()),
46 | list(template_miner2.drain.id_to_cluster.keys()))
47 |
48 | self.assertListEqual(list(template_miner1.drain.root_node.key_to_child_node.keys()),
49 | list(template_miner2.drain.root_node.key_to_child_node.keys()))
50 |
51 | def get_tree_lines(template_miner):
52 | sio = io.StringIO()
53 | template_miner.drain.print_tree(sio)
54 | sio.seek(0)
55 | return sio.readlines()
56 |
57 | self.assertListEqual(get_tree_lines(template_miner1),
58 | get_tree_lines(template_miner2))
59 |
60 | print(template_miner2.add_log_message("hello yyy"))
61 | print(template_miner2.add_log_message("goodbye ABC"))
62 |
63 | def test_extract_parameters(self):
64 | config = TemplateMinerConfig()
65 | mi = MaskingInstruction("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
66 | config.masking_instructions.append(mi)
67 | mi = MaskingInstruction(r"multiple words", "WORDS")
68 | config.masking_instructions.append(mi)
69 | config.mask_prefix = "[:"
70 | config.mask_suffix = ":]"
71 | template_miner = TemplateMiner(None, config)
72 |
73 | def add_and_test(msg, expected_params, exact_matching=False):
74 | print(f"msg: {msg}")
75 | res = template_miner.add_log_message(msg)
76 | print(f"result: {res}")
77 | extracted_parameters = template_miner.extract_parameters(
78 | res["template_mined"], msg, exact_matching=exact_matching)
79 | self.assertIsNotNone(extracted_parameters)
80 | params = [parameter.value for parameter in extracted_parameters]
81 | print(f"params: {params}")
82 | self.assertListEqual(params, expected_params)
83 |
84 | add_and_test("hello", [])
85 | add_and_test("hello ABC", [])
86 | add_and_test("hello BCD", ["BCD"])
87 | add_and_test("hello BCD", ["BCD"])
88 | add_and_test("hello\tBCD", ["BCD"])
89 | add_and_test("request took 123 ms", ["123"])
90 | add_and_test("file saved [test.xml]", [])
91 | add_and_test("new order received: [:xyz:]", [])
92 | add_and_test("order type: new, order priority:3", ["3"])
93 | add_and_test("order type: changed, order priority:5", ["changed,", "5"])
94 | add_and_test("sometimes one needs multiple words", ["multiple words"], True)
95 | add_and_test("sometimes one needs not", ["not"], True)
96 | add_and_test("sometimes one needs multiple words", ["multiple words"], True)
97 |
98 | def test_extract_parameters_direct(self):
99 | config = TemplateMinerConfig()
100 | mi = MaskingInstruction(r"hdfs://[\w.:@-]*((/[\w.~%+-]+)+/?)?", "hdfs_uri")
101 | config.masking_instructions.append(mi)
102 | mi = MaskingInstruction(r"(?P<quote>[\"'`]).*?(?P=quote)", "quoted_string")
103 | config.masking_instructions.append(mi)
104 | mi = MaskingInstruction(r"((?P[*_])\2{0,2}).*?\1",
"markdown_emph") 105 | config.masking_instructions.append(mi) 106 | mi = MaskingInstruction(r"multiple \*word\* pattern", "*words*") 107 | config.masking_instructions.append(mi) 108 | mi = MaskingInstruction(r"some \S+ \S+ pattern", "*words*") 109 | config.masking_instructions.append(mi) 110 | mi = MaskingInstruction(r"(\d{1,3}\.){3}\d{1,3}", "ip") 111 | config.masking_instructions.append(mi) 112 | mi = MaskingInstruction(r"(?P\d+)\.\d+", "float") 113 | config.masking_instructions.append(mi) 114 | mi = MaskingInstruction(r"0[xX][a-fA-F0-9]+", "integer") 115 | config.masking_instructions.append(mi) 116 | mi = MaskingInstruction(r"(?P\d+)", "integer") 117 | config.masking_instructions.append(mi) 118 | mi = MaskingInstruction(r"HelloWorld", "*") 119 | config.masking_instructions.append(mi) 120 | mi = MaskingInstruction(r"MaskPrefix", "<") 121 | config.masking_instructions.append(mi) 122 | template_miner = TemplateMiner(None, config) 123 | 124 | test_vectors = [ 125 | ( 126 | ":+", 127 | "hdfs://msra-sa-41:9000/pageinput2.txt:671088640+134217728", 128 | ["hdfs://msra-sa-41:9000/pageinput2.txt", "671088640", "134217728"], 129 | ["hdfs_uri", "integer", "integer"] 130 | ), 131 | ( 132 | "Hello ", 133 | "Hello 'World'", 134 | ["'World'"], 135 | ["quoted_string"] 136 | ), 137 | ( 138 | "", 139 | """'This "should"'`do no breakin'`""", 140 | ["""'This "should"'""", "`do no breakin'`"], 141 | ["quoted_string", "quoted_string"] 142 | ), 143 | ( 144 | "This is !.", 145 | "This is ___very___ *important*!.", 146 | ["___very___", "*important*"], 147 | ["markdown_emph", "markdown_emph"] 148 | ), 149 | ( 150 | ".<*>", 151 | "0.15.Test", 152 | ["0.15", "Test"], 153 | ["float", "*"] 154 | ), 155 | ( 156 | ":", 157 | "192.0.0.1:5000", 158 | ["192.0.0.1", "5000"], 159 | ["ip", "integer"] 160 | ), 161 | ( 162 | "::", 163 | "192.0.0.1:5000:123", 164 | ["192.0.0.1", "5000", "123"], 165 | ["ip", "integer", "integer"] 166 | ), 167 | ( 168 | ".<*>.", 169 | "0.15.Test.0.2", 170 | ["0.15", "Test", "0.2"], 171 | ["float", "*", "float"] 172 | ), 173 | ( 174 | " ", 175 | "0.15 10.16", 176 | ["0.15", "10.16"], 177 | ["float", "float"] 178 | ), 179 | ( 180 | "<*words*>@", 181 | "some other cool pattern@0xe1f", 182 | ["some other cool pattern", "0xe1f"], 183 | ["*words*", "integer"] 184 | ), 185 | ( 186 | "Another test with <*words*> that includes and <*> ", 187 | "Another test with some other 0Xadded pattern that includes 500xc0ffee and 0X4 times 5", 188 | ["some other 0Xadded pattern", "50", "0xc0ffee", "0X4", "times", "5"], 189 | ["*words*", "integer", "integer", "integer", "*", "integer"] 190 | ), 191 | ( 192 | "some <*words*> <*words*>", 193 | "some multiple *word* pattern some confusing *word* pattern", 194 | ["multiple *word* pattern", "some confusing *word* pattern"], 195 | ["*words*", "*words*"] 196 | ), 197 | ( 198 | "<*words*> <*>", 199 | "multiple *word* pattern <*words*>", 200 | ["multiple *word* pattern", "<*words*>"], 201 | ["*words*", "*"] 202 | ), 203 | ( 204 | "<*> <*>", 205 | "HelloWorld Test", 206 | ["HelloWorld", "Test"], 207 | ["*", "*"] 208 | ), 209 | ( 210 | "<*> <*>", 211 | "HelloWorld ", 212 | ["HelloWorld", ""], 213 | ["*", "*"] 214 | ), 215 | ( 216 | "<*>", 217 | "HelloWorld1", 218 | ["HelloWorld", "1"], 219 | ["*", "integer"] 220 | ), 221 | ( 222 | "<*> works <*>", 223 | "This works as-expected", 224 | ["This", "as-expected"], 225 | ["*", "*"] 226 | ), 227 | ( 228 | ">", 229 | "", 230 | ["8"], 231 | ["integer"] 232 | ), 233 | ( 234 | " >>", 235 | ">", 236 | ["8", "0.5"], 237 | ["integer", "float"] 
238 | ), 239 | ( 240 | "<*> >>", 241 | "New: >", 242 | ["New:", "8", "0.5"], 243 | ["*", "integer", "float"] 244 | ), 245 | ( 246 | "<<>", 247 | "MaskPrefix", 248 | ["MaskPrefix"], 249 | ["<"] 250 | ), 251 | ( 252 | "<<<>>", 253 | "", 254 | ["MaskPrefix"], 255 | ["<"] 256 | ), 257 | ( 258 | "There are no parameters here.", 259 | "There are no parameters here.", 260 | [], 261 | [] 262 | ), 263 | ( 264 | " ", 265 | "0.15 10.16 3.19", 266 | None, 267 | None 268 | ), 269 | ( 270 | " ", 271 | "0.15 10.16 test 3.19", 272 | None, 273 | None 274 | ), 275 | ( 276 | " >>", 277 | ">", 278 | None, 279 | None 280 | ), 281 | ( 282 | "<<>", 283 | "<<>", 284 | None, 285 | None 286 | ), 287 | ( 288 | "<*words*> <*words*>", 289 | "0.15 0.15", 290 | None, 291 | None 292 | ), 293 | ] 294 | 295 | for template, content, expected_parameters, expected_mask_names in test_vectors: 296 | with self.subTest(template=template, content=content, expected_parameters=expected_parameters): 297 | extracted_parameters = template_miner.extract_parameters(template, content, exact_matching=True) 298 | if expected_parameters is None: 299 | self.assertIsNone(extracted_parameters) 300 | else: 301 | self.assertIsNotNone(extracted_parameters) 302 | self.assertListEqual([parameter.value for parameter in extracted_parameters], 303 | expected_parameters) 304 | self.assertListEqual([parameter.mask_name for parameter in extracted_parameters], 305 | expected_mask_names) 306 | 307 | def test_match_only(self): 308 | config = TemplateMinerConfig() 309 | config.drain_extra_delimiters = ["_"] 310 | mi = MaskingInstruction("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM") 311 | config.masking_instructions.append(mi) 312 | tm = TemplateMiner(None, config) 313 | 314 | res = tm.add_log_message("aa aa aa") 315 | print(res) 316 | 317 | res = tm.add_log_message("aa aa bb") 318 | print(res) 319 | 320 | res = tm.add_log_message("xx yy zz") 321 | print(res) 322 | 323 | res = tm.add_log_message("rrr qqq 123") 324 | print(res) 325 | 326 | c = tm.match("aa aa tt") 327 | self.assertEqual(1, c.cluster_id) 328 | 329 | c = tm.match("aa aa 12") 330 | self.assertEqual(1, c.cluster_id) 331 | 332 | c = tm.match("xx yy zz") 333 | self.assertEqual(2, c.cluster_id) 334 | 335 | c = tm.match("xx yy rr") 336 | self.assertIsNone(c) 337 | 338 | c = tm.match("nothing") 339 | self.assertIsNone(c) 340 | 341 | c = tm.match("rrr qqq 456 ") 342 | self.assertEqual(3, c.cluster_id) 343 | 344 | c = tm.match("rrr qqq 555.2") 345 | self.assertIsNone(c) 346 | 347 | c = tm.match("rrr qqq num") 348 | self.assertIsNone(c) 349 | 350 | def test_match_strategies(self): 351 | miner = TemplateMiner() 352 | print(miner.add_log_message("training4Model start")) 353 | print(miner.add_log_message("loadModel start")) 354 | print(miner.add_log_message("loadModel stop")) 355 | print(miner.add_log_message("this is a test")) 356 | miner.drain.print_tree() 357 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback")) 358 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always")) 359 | self.assertIsNone(miner.match("loadModel start", full_search_strategy="never")) 360 | print(miner.add_log_message("loadModel start")) 361 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback")) 362 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always")) 363 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="never")) 364 | 365 | config = TemplateMinerConfig() 366 | 
config.parametrize_numeric_tokens = False 367 | miner = TemplateMiner(config=config) 368 | print(miner.add_log_message("training4Model start")) 369 | print(miner.add_log_message("loadModel start")) 370 | print(miner.add_log_message("loadModel stop")) 371 | print(miner.add_log_message("this is a test")) 372 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback")) 373 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always")) 374 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="never")) 375 | 376 | self.assertIsNone(miner.match("", full_search_strategy="never")) 377 | self.assertIsNone(miner.match("", full_search_strategy="always")) 378 | self.assertIsNone(miner.match("", full_search_strategy="fallback")) 379 | 380 | print(miner.add_log_message("")) 381 | self.assertIsNotNone(miner.match("", full_search_strategy="never")) 382 | self.assertIsNotNone(miner.match("", full_search_strategy="always")) 383 | self.assertIsNotNone(miner.match("", full_search_strategy="fallback")) 384 | -------------------------------------------------------------------------------- /日志解析_项目介绍.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongdong9/log_parser/7c6bfa9a3d748a7f9eef93078940c7ce2c5446de/日志解析_项目介绍.docx --------------------------------------------------------------------------------
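Usage note: the tests above drive the TemplateMiner API directly. The snippet below is a minimal sketch of the same flow for this project, assuming it is run from the repository root (so that config_ini/drain3.ini resolves) and using made-up log messages purely for illustration.

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

# Load the masking rules and Drain settings shipped with the project.
config = TemplateMinerConfig()
config.load("config_ini/drain3.ini")

miner = TemplateMiner(config=config)

# Learn templates from a stream of (made-up) messages.
for msg in ["connected to 10.0.0.1", "connected to 10.0.0.2", "disconnected"]:
    result = miner.add_log_message(msg)
    print(result["template_mined"])

# Match a new message without changing the learned state, then pull out
# its parameter values from the matched template.
cluster = miner.match("connected to 10.0.0.3")
if cluster is not None:
    template = cluster.get_template()
    params = miner.extract_parameters(template, "connected to 10.0.0.3") or []
    print(template, [p.value for p in params])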