├── README.md ├── groups.txt └── rank_items.txt /README.md: -------------------------------------------------------------------------------- 1 | ## 说明 ## 2 | 1. 知道的命令太少,受限 3 | 2. 知道的命令太多,参数太多,自由度太大,选择太多,无所适从 4 | 3. 基础模式,告诉你如何选择命令 5 | 4. 写命令就像写sql 6 | 5. 写shell脚本就像写存储过程 7 | 8 | ## sql与shell command对应关系 ## 9 | table file || /tmp/file ==> data source 10 | select cut -d';' -f1,2 || awk -F';' '{print $1";"$2}' ==> map 11 | filter sed -n -e '/pattern/ p' || awk -F';' '$2 > 20 {print $0}' ==> filter 12 | group_by awk '{a[$2] += $3}END{for(i in a)print i";"a[i]}' ==> reduce 13 | order_by sort -r -k2,3 -n ==> sort 14 | join join -t';' -1 1 -2 1 -o 1.1 2.2 file1 file2 ==> join 15 | distinct uniq || sort -u ==> reduce 16 | limit head -n10 ==> slice 17 | offset sed -n 'offset,offset+limit p' ==> slice 18 | update sed -i -e 's/pattern/newvalue/g' ==> update 19 | insert sed '/pattern/ a new line content' -i ==> insert 20 | delete sed '/pattern/ d' -i ==> delete 21 | 22 | 23 | ## 数据样例 ## 24 | rank_items.txt(258770) 25 | "5b94c293-bdd0-4569-8203-0dbb9eeab432";"academy_hot_course";"6";"2013-06-03 21:37:41.189835+08";10 26 | "55a2d8d2-9734-43ba-b8ee-a55fc0596bd4";"hot_question:微观经济学";"469734";"2013-06-05 18:08:05.741067+08";1 27 | "28f7d770-49c9-47b9-9435-5f8c8de06a78";"hot_question:动物";"438176";"2013-06-05 18:08:41.168943+08";1 28 | "931bc2ff-9145-4765-813d-768b05fa9372";"hot_post:69";"463554";"2013-06-03 21:38:58.224445+08";1 29 | "da55cbcf-ce8e-47d2-a88d-d879801205f6";"group_hot_member:198";"p9hg1u";"2013-06-03 21:39:04.28642+08";1 30 | "e9594fdc-9961-4cc4-b43a-9cedd323a9b0";"hot_ask_user";"et4hhg";"2013-06-05 18:08:41.202577+08";1 31 | "f0613373-1108-4cd6-9856-c4b42dc3ecee";"hot_tag";"求真相";"2013-06-01 10:03:36.764314+08";1 32 | "a59f3536-7133-4c1f-a3da-a02ec57eac51";"hot_post:27";"459797";"2013-06-03 21:40:20.346828+08";1 33 | "8aec1661-7ada-4b8f-b815-993039b281d7";"hot_post:198";"464179";"2013-06-03 21:40:34.297701+08";1 34 | 35 | groups.txt(217) 36 | 
205;"真要瘦不瘦不罢休";"2012-11-23 13:42:38+08" 37 | 28;"健康朝九晚五";"2010-10-20 16:20:43+08" 38 | 280;"核谐家园";"2013-04-17 17:11:49.545351+08" 39 | 38;"创意科技";"2010-10-20 16:20:44+08" 40 | 39;"死理性派";"2010-10-20 16:20:44+08" 41 | 175;"魔兽世界";"2012-10-08 14:35:20+08" 42 | 29;"爱宠";"2010-10-20 16:20:44+08" 43 | 44 | 45 | ## 总数 ## 46 | select count(1) from rank_item 47 | 48 | wc -l rank_items.txt | cut -d' ' -f1 49 | cat rank_items.txt | wc -l 50 | wc -l < rank_items.txt 51 | 52 | ## 性情小组活跃用户数量 ## 53 | select count(distinct(object_name)) 54 | from rank_item 55 | where rank_name='group_hot_member:30' 56 | 57 | sed -n -e '/"group_hot_member\:30"\;/ p' rank_items.txt 58 | | cut -d';' -f3 | sort | uniq | wc -l 59 | 60 | ## 性能 ## 61 |  62 | 63 | ## 性情小组活跃用户TOP10 ## 64 | 65 | select object_name,sum(score) total_score 66 | from rank_item 67 | where rank_name='group_hot_member:30' 68 | group by object_name 69 | order by total_score desc 70 | limit 10 71 | 72 | sed -n -e '/"group_hot_member\:30"\;/ p' rank_items.txt 73 | | awk -F';' '{a[$3]+=$5}END{for(i in a)print i,a[i]}' 74 | | sort -t' ' -n -r -k 2 | head -n 10 75 | 76 | ## 性情小组活跃用户(总分大于80的) ## 77 | 78 | select object_name,sum(score) total_score 79 | from rank_item 80 | where rank_name='group_hot_member:30' 81 | group by object_name having sum(score) > 80 82 | order by total_score desc 83 | limit 10 84 | 85 | sed -n -e '/"group_hot_member\:30"\;/ p' rank_items.txt 86 | | awk -F';' '{a[$3]+=$5}END{for(i in a)print i,a[i]}' 87 | | awk -F' ' '$2 > 80 {print $0}' 88 | | sort -t' ' -n -r -k 2 | head -n 10 89 | 90 | ## 活跃小组按总分倒排 ## 91 | select substr(rank_name,18) group_id, sum(score) total_score 92 | from rank_item 93 | where rank_name like 'group_hot_member:%' 94 | group by group_id 95 | order by total_score desc 96 | limit 100 97 | 98 | sed -n -e '/"group_hot_member\:/ p' rank_items.txt 99 | | cut -d';' -f2,5 | sed -e s/\"//g -e 's/group_hot_member\://g' 100 | | awk -F';' '{a[$1]+=$2}END{for(i in a)print i,a[i]}' 101 | | sort -t' ' -n -r -k2 | head -n100 102 | 103 
| ## 活跃小组按总分倒排(显示小组名称) ## 104 | select b.name,a.total_score from( 105 | select substr(rank_name,18) group_id, sum(score) total_score 106 | from rank_item 107 | where rank_name like 'group_hot_member:%' 108 | group by group_id 109 | order by total_score desc 110 | limit 100 111 | ) a join "group" b on a.group_id=b.id 112 | order by a.total_score desc 113 | 114 | sed -n -e '/"group_hot_member\:/ p' rank_items.txt 115 | | cut -d';' -f2,5 | sed -e s/\"//g -e 's/group_hot_member\://g' 116 | | awk -F';' '{a[$1]+=$2}END{for(i in a)print i";"a[i]}' 117 | | sort -t';' -n -r -k2 | head -n100 118 | | sort -t';' -k1,1 > /tmp/a.txt & 119 | 120 | sed -e s/\"//g groups.txt | cut -d';' -f1,2 121 | | sort -t';' -k1,1 > /tmp/b.txt & 122 | 123 | wait; 124 | 125 | join -t';' -1 1 -2 1 -o2.2 1.2 /tmp/a.txt /tmp/b.txt | sort -t';' -k2 -n -r; 126 | 127 | ## 其它特殊的非常有用的shell命令 ## 128 | yes 无限重复,配合head使用 129 | seq 生成序列,类似于python:range 130 | paste 按列合并文件 131 | tee 管道分流 132 | xargs 分割参数,并行计算 133 | parallel 并行计算 134 | awk '{system("cmd")}' 逐行调用外部命令 135 | 136 | ## NB的管道合并和分流操作符 ## 137 | <() 138 | cat <(command1) <(command2) 139 | paste <(command1) <(command2) 140 | >() 141 | command0 | tee >(command1) >(command2) >(command3) | command4 142 | http://serverfault.com/questions/171095/how-do-i-join-two-named-pipes-into-single-input-stream-in-linux 143 | http://tldp.org/LDP/abs/html/process-sub.html 144 | 145 | ## 最后来一个比较酷的 抓取所有主题站所有文章的缩略图 ## 146 | curl http://www.guokr.com/site/ 2>/dev/null 147 | | awk '/