├── .gitignore ├── README.md ├── compute.sh ├── conf.expmple ├── report ├── cli.sh └── email.sh └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .settings/ 2 | .project 3 | .buildpath -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | access log analysis 2 | ================== 3 | 4 | 分析访问日志,可通过终端显示或邮件发送分析报告。 5 | 6 | ### 一、下载 7 | 8 | * 下载release版 9 | 10 | [https://github.com/tabalt/access-log-analysis/releases](https://github.com/tabalt/access-log-analysis/releases) 11 | 12 | * 下载最新版本 13 | 14 | [https://github.com/tabalt/access-log-analysis/archive/master.zip](https://github.com/tabalt/access-log-analysis/archive/master.zip) 15 | 16 | 17 | ### 二、配置 18 | 19 | * 创建配置文件 20 | 21 | cd ~/access-log-analysis 22 | cp conf.expmple conf.sh 23 | vim conf.sh 24 | 25 | * 修改配置 26 | 27 | * 设置日志文件路径 28 | 29 | log_path=/usr/local/nginx/logs/archive 30 | 31 | * 设置接收报告邮箱 32 | 33 | report_email="" 34 | 35 | * 日志文件名的后缀,如前一天的日期后缀: 36 | 37 | log_file_suffix=`date -d "yesterday" +%Y%m%d` 38 | 39 | * 设置要分析的域名及日志文件 40 | 41 | log_config=" 42 | www.abc.com|$log_path/www_abc_com_access.log-$log_file_suffix 43 | api.abc.com|$log_path/api_abc_com_access.log-$log_file_suffix 44 | " 45 | 46 | ### 三、使用 47 | 48 | * 在终输出分析结果 49 | 50 | sh run.sh 51 | 52 | * 通过邮件发送分析结果 53 | 54 | sh run.sh email 55 | 56 | * crontab 中定时执行 57 | 58 | 每天11点发送分析报告: 59 | 60 | 0 11 * * * cd /home/tabalt/access-log-analysis; /bin/bash run.sh email > /dev/null 2>&1 61 | 62 | 63 | ### 四、报表内容 64 | 65 | 66 | * 目前报表中会包含如下内容: 67 | 68 | * 报表域名、日志文件、时间等 69 | * 概况 70 | * 访问ip前N名 71 | * 被访问的url前N名 72 | * 来源页面前N名 73 | * 404页面前N名 74 | * 蜘蛛统计 75 | * 搜索引擎来源统计 76 | 77 | * 下面是终端下输出的报表形式: 78 | 79 | ┌──────────────────────────────────────────── 80 | │ 报表域名: www.abc.com 81 | │ 日志文件: /usr/local/nginx/logs/archive/www_abc_com_access.log-20141029 82 | │ 创建时间: 
#!/bin/bash

################################################
# compute functions
#
# Each function reads a space-separated access log and writes its
# result to stdout.  Field positions assumed by the awk programs
# below (nginx "combined"-style layout — confirm against the actual
# log_format in use):
#   $1  client ip     $7  request url    $9  status code
#   $10 bytes sent    $11 referer
#
# NOTE: list-producing functions deliberately emit the literal
# two-character sequences "\t" and "\n" instead of real tabs/newlines;
# run.sh renders the final report with `echo -e`, which expands them.
################################################

#
# (internal) Format "count value" pairs from stdin: sort by count
# descending, keep the top $1 entries, join the two columns with a
# literal "\t" and end each record with a literal "\n" (see note above;
# "\\t" inside single quotes reaches awk as \\t, which awk prints as
# the two characters backslash-t).
#
function format_top_list()
{
    local top=$1
    sort -rn | head -"${top}" | awk 'BEGIN{OFS="\\t"; ORS="\\n"}{print $1,$2}'
}

#
# (internal) Same literal-escape formatting, without sorting/truncating.
#
function format_plain_list()
{
    awk 'BEGIN{OFS="\\t"; ORS="\\n"}{print $1,$2}'
}

#
# get total visit: number of requests = line count of the log file
#
function get_total_visit()
{
    local log_file=$1
    # awk strips wc's padding and the echoed file name
    wc -l "${log_file}" | awk '{print $1}'
}

#
# get total bandwidth: sum of $10 (bytes sent), reported in megabytes
#
function get_total_bandwidth()
{
    local log_file=$1
    local total
    total=$(awk -v total=0 '{total+=$10}END{print total/1024/1024}' "${log_file}")
    echo "${total}M"
}

#
# get total unique: number of distinct client IPs
#
function get_total_unique()
{
    local log_file=$1
    # The original used gawk's asort(), a gawk-only extension that
    # silently prints nothing under mawk/BSD awk.  Counting first
    # occurrences is portable and yields the same number.
    awk '!seen[$1]++ {n++} END{print n+0}' "${log_file}"
}

#
# get ip top N: the $2 most frequent client IPs with their hit counts
#
function get_ip_top()
{
    local log_file=$1
    local top=$2
    local list
    list=$(awk '{count[$1]++}END{for (ip in count){print count[ip],ip}}' "${log_file}" | format_top_list "${top}")
    # quoted: keeps echo from glob-expanding any * or ? in the data
    echo -e "${list}"
}

#
# get url top N: the $2 most requested urls with their hit counts
#
function get_url_top()
{
    local log_file=$1
    local top=$2
    local list
    list=$(awk '{count[$7]++}END{for (url in count){print count[url],url}}' "${log_file}" | format_top_list "${top}")
    echo -e "${list}"
}

#
# get referer top N: the $2 most frequent external referers
#
function get_referer_top()
{
    local log_file=$1
    local top=$2
    local list
    # $domain is a global set by the report builder; referers pointing
    # at our own domain are excluded so only external sources remain.
    # NOTE(review): $domain is spliced into the regex unescaped, so its
    # dots match any character — harmless for counting, but be aware.
    list=$(awk '$11 !~ /http:\/\/[^/]*'"$domain"'/{count[$11]++}END{for (url in count){print count[url],url}}' "${log_file}" | format_top_list "${top}")
    echo -e "${list}"
}

#
# get notfound top N: the $2 most requested urls that returned 404
#
function get_notfound_top()
{
    local log_file=$1
    local top=$2
    local list
    list=$(awk '$9 == 404 {url[$7]++}END{for (k in url){print url[k],k}}' "${log_file}" | format_top_list "${top}")
    echo -e "${list}"
}

#
# get spider: request counts of known crawlers
# (user agent is the 6th double-quote-delimited field)
#
function get_spider()
{
    local log_file=$1
    local list
    list=$(awk -F'"' '$6 ~ /Baiduspider/ {spider["baiduspider"]++} $6 ~ /Googlebot/ {spider["googlebot"]++}END{for (k in spider){print k,spider[k]}}' "${log_file}" | format_plain_list)
    echo -e "${list}"
}

#
# get search engine: request counts whose referer is a known search
# engine (referer is the 4th double-quote-delimited field).
# NB: the name keeps the historical misspelling ("enigne") because
# report/cli.sh and report/email.sh call it by this name.
#
function get_search_enigne()
{
    local log_file=$1
    local list
    list=$(awk -F'"' '$4 ~ /http:\/\/www\.baidu\.com/ {search["baidu_search_enigne"]++} $4 ~ /http:\/\/www\.google\.com/ {search["google_search_enigne"]++}END{for (k in search){print k,search[k]}}' "${log_file}" | format_plain_list)
    echo -e "${list}"
}
| ################################################ 4 | # global config 5 | ################################################ 6 | 7 | 8 | # your log file's path 9 | log_path=/usr/local/nginx/logs/archive 10 | 11 | # put you email here to get email report 12 | report_email="" 13 | 14 | # item number for top 15 | top_item_num=50 16 | 17 | # log file suffix 18 | log_file_suffix=`date -d "yesterday" +%Y%m%d` 19 | 20 | # domain and log file config 21 | log_config=" 22 | www.abc.com|$log_path/www_abc_com_access.log-$log_file_suffix 23 | api.abc.com|$log_path/api_abc_com_access.log-$log_file_suffix 24 | " 25 | 26 | -------------------------------------------------------------------------------- /report/cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################ 4 | # cli report functions 5 | ################################################ 6 | 7 | function get_report_title() 8 | { 9 | echo "" 10 | } 11 | 12 | function get_single_title() 13 | { 14 | single_title="\n 15 | ┌────────────────────────────────────────────\n 16 | │ 报表域名:\t$1\n 17 | │ 日志文件:\t$2\n 18 | │ 创建时间:\t`date +%Y-%m-%d" "%H":"%M`\n 19 | ├────────────────────────────────────────────\n 20 | " 21 | echo $single_title 22 | } 23 | 24 | function get_block_title() 25 | { 26 | echo "│\n 27 | │ + $1 +\n 28 | │ ────────────\n 29 | │\n" 30 | } 31 | 32 | function format_list_data() 33 | { 34 | data=$1 35 | #echo `awk $data 'BEGIN{FS=" ";RS="~";OFS="\\\t";ORS="\\\n │\\\t"}{print $1,$2}'` 36 | echo `awk $data 'BEGIN{FS="\\t";RS="\\n";OFS="\\\t";ORS="\\\n │\\\t"}{print $1,$2}'` 37 | } 38 | 39 | # 40 | # get single report 41 | # 42 | function get_single_report() 43 | { 44 | domain=$1 45 | log_file=$2 46 | 47 | single_report="\n 48 | 49 | `get_single_title $domain $log_file` 50 | 51 | `get_block_title 概况` 52 | 53 | │  总访问量:\t`get_total_visit $log_file`\n 54 | │   总带宽:\t`get_total_bandwidth $log_file`\n 55 | │  
独立访客:\t`get_total_unique $log_file`\n 56 | │\n 57 | 58 | 59 | `get_block_title 访问IP统计` 60 | 61 | │\t`get_ip_top $log_file $top_item_num | format_list_data`\n 62 | 63 | `get_block_title 访问url统计` 64 | 65 | │\t`get_url_top $log_file $top_item_num | format_list_data`\n 66 | 67 | `get_block_title 来源页面统计` 68 | 69 | │\t`get_referer_top $log_file $top_item_num | format_list_data`\n 70 | 71 | `get_block_title 404统计` 72 | 73 | │\t`get_notfound_top $log_file $top_item_num | format_list_data`\n 74 | 75 | `get_block_title 蜘蛛统计` 76 | 77 | │\t`get_spider $log_file | format_list_data`\n 78 | 79 | `get_block_title 搜索引擎来源统计` 80 | 81 | │\t`get_search_enigne $log_file | format_list_data`\n 82 | └────────────────────────────────────────────\n 83 | 84 | " 85 | echo $single_report 86 | } 87 | 88 | -------------------------------------------------------------------------------- /report/email.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################ 4 | # email report functions 5 | ################################################ 6 | 7 | function get_report_title() 8 | { 9 | echo " 10 | 36 |

$logdate 访问日志报表

\n 37 | " 38 | } 39 | 40 | function get_single_title() 41 | { 42 | single_title="\n 43 |


\n 44 | 45 | \n 46 | \n 47 | \n 48 | \n 49 | \n 50 | \n 51 | \n 52 | \n 53 | \n 54 | \n 55 | \n 56 | \n 57 | \n 58 | \n 92 | \n 95 | 96 | "; 97 | } 98 | 99 | function get_overview() 100 | { 101 | echo " 102 |
报表域名$1
日志文件$2
报表时间`date +%Y-%m-%d" "%H":"%M`
\n 59 | " 60 | echo $single_title 61 | } 62 | 63 | function get_block_title() 64 | { 65 | echo "\n 66 | \n 67 | \n 68 | \n 69 | \n 70 | " 71 | } 72 | 73 | function get_tr_end() 74 | { 75 | echo "\n 76 | \n 77 | "; 78 | } 79 | 80 | function get_table_end() 81 | { 82 | echo "\n 83 |
$1
\n 84 | "; 85 | } 86 | 87 | function format_list_data() 88 | { 89 | data=$1 90 | echo "\n 91 |
\n 93 | `awk $data 'BEGIN{FS="\\t";RS="\\n";OFS="\\\t";ORS="
\\\n"}{print $1,$2}'`\n 94 |
\n 103 | \n 104 | \n 105 | \n 106 | \n 107 | \n 108 | \n 109 | \n 110 | \n 111 | \n 112 | \n 113 | \n 114 | \n 115 | \n 116 | \n 117 | \n 118 | \n 119 |
访问概况
总访问量`get_total_visit $log_file`
总带宽`get_total_bandwidth $log_file`
独立访客`get_total_unique $log_file`
\n 120 | " 121 | } 122 | 123 | # 124 | # get single report 125 | # 126 | function get_single_report() 127 | { 128 | domain=$1 129 | log_file=$2 130 | 131 | single_report=" 132 | 133 | `get_single_title $domain $log_file` 134 | 135 | `get_overview` 136 | 137 | `get_block_title 访问IP统计` 138 | 139 | `get_ip_top $log_file $top_item_num | format_list_data` 140 | 141 | `get_table_end` 142 | 143 | `get_block_title 访问url统计` 144 | 145 | `get_url_top $log_file $top_item_num | format_list_data` 146 | 147 | `get_table_end` 148 | 149 | `get_block_title 来源页面统计` 150 | 151 | `get_referer_top $log_file $top_item_num | format_list_data` 152 | 153 | `get_table_end` 154 | 155 | `get_block_title 404统计` 156 | 157 | `get_notfound_top $log_file $top_item_num | format_list_data` 158 | 159 | `get_table_end` 160 | 161 | `get_block_title 蜘蛛统计` 162 | 163 | `get_spider $log_file | format_list_data` 164 | 165 | `get_table_end` 166 | 167 | `get_block_title 搜索引擎来源统计` 168 | 169 | `get_search_enigne $log_file | format_list_data` 170 | 171 | `get_table_end` 172 | 173 | 174 | 175 | 176 | 177 | " 178 | echo $single_report 179 | } 180 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # get dir and report type 5 | # 6 | 7 | this_file=`pwd`"/"$0 8 | this_dir=`dirname $this_file` 9 | 10 | report_type=$1 11 | if [ "$report_type" != "email" ] 12 | then 13 | report_type="cli" 14 | fi 15 | 16 | # 17 | # include conf and func 18 | # 19 | 20 | . $this_dir/conf.sh 21 | . $this_dir/compute.sh 22 | . 
"$this_dir/report/$report_type.sh"


#
# get report
#

# Date covered by the analyzed logs (yesterday).  This MUST be assigned
# before the report is built: report/email.sh's get_report_title expands
# $logdate into the report heading.  (It was previously assigned only
# after the loop below, so email report titles had an empty date.)
logdate=`date -d "yesterday" +%Y-%m-%d`

all_report="`get_report_title`"

# $log_config holds one "domain|logfile" entry per whitespace-separated
# word; the unquoted expansion splits it into entries on purpose.
for conf in $log_config
do
    domain=${conf%%"|"*}     # text before the first '|'
    log_file=${conf#*"|"}    # text after the first '|' (assumes entries contain one)
    all_report=$all_report`get_single_report $domain $log_file`
done


#
# show report
#

if [ "$report_type" = "email" ]
then
    # The report is one long word full of literal \t/\n escapes; the
    # unquoted `echo -e` both expands them and collapses stray real
    # whitespace.  The second line smuggled into -s adds a Content-Type
    # header so mail clients render the HTML body.
    echo -e $all_report | mail -s "$(echo -e "access log statistics $logdate `hostname` \nContent-Type: text/html;charset=utf-8")" ${report_email}
else
    echo -e $all_report
fi

--------------------------------------------------------------------------------