├── .idea
│   └── vcs.xml
├── Gemfile
├── README.md
└── analysis_xuanwu.rb
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | gem 'nokogiri'
4 | gem 'domainatrix'
5 | gem 'sqlite3'
6 | gem 'simplecov'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Analyzing the Subscription Sources
2 |
3 | ## Background
4 | Tencent Xuanwu Lab's security news digest is updated quickly and covers a lot of ground. I wanted to know which sources they subscribe to (yes, I really am that idly curious). Fortunately their content is managed via github.io, so it can be analyzed directly.
5 |
6 | ## Conclusions
7 | According to the data analysis, as of May 11, 2017 there are 9492 articles in total, drawn from 2048 distinct sources.
8 |
9 | A few other interesting findings also came out:
10 |
11 | ### The most frequently cited sources
12 | Sources cited at least 20 times are:
13 | ```
14 | Dinosn: 1465
15 | binitamshah: 1265
16 | unpacker: 176
17 | SecLists: 161
18 | _jsoo_: 128
19 | Mike_Mimoso: 98
20 | TrendLabs: 96
21 | cedoxX: 96
22 | gN3mes1s: 92
23 | threatpost: 92
24 | ProjectZeroBugs: 80
25 | subTee: 72
26 | cn0Xroot: 71
27 | aszy: 62
28 | revskills: 55
29 | securxcess: 55
30 | Unit42_Intel: 54
31 | FireEye: 50
32 | PythonArsenal: 48
33 | Enno_Insinuator: 46
34 | 安全客: 43
35 | FreeBuf: 41
36 | NCCGroupInfosec: 38
37 | virqdroid: 38
38 | daniel_bilar: 38
39 | GitHub: 37
40 | Seebug: 36
41 | seebug: 35
42 | McAfee_Labs: 33
43 | Github: 33
44 | cyb3rops: 32
45 | tiraniddo: 31
46 | mattifestation: 31
47 | WEareTROOPERS: 31
48 | 0xroot: 29
49 | jedisct1: 29
50 | mwrlabs: 28
51 | claud_xiao: 27
52 | virusbtn: 27
53 | MottoIN: 25
54 | benhawkes: 24
55 | PhysicalDrive0: 24
56 | capstone_engine: 22
57 | hosselot: 22
58 | 0x6D6172696F: 22
59 | PaloAltoNtwks: 21
60 | dragosr: 21
61 | jwgoerlich: 21
62 | cynicalsecurity: 21
63 | FuzzySec: 21
64 | JohnLaTwC: 20
65 | IntrusionSkills: 20
66 | quequero: 20
67 | x0rz: 20
68 | taviso: 20
69 | ```
70 | The top two sources account for about 29% of all articles (1465 + 1265 = 2730 of 9492), which suggests these two feeds are of very high quality.
71 |
72 | ### Referenced websites
73 | There are 2541 distinct external-link hosts, ranked as follows:
74 | ```
75 | github.com: 1126
76 | t.co: 338
77 | bit.ly: 302
78 | goo.gl: 254
79 | bugs.chromium.org: 175
80 | threatpost.com: 159
81 | ow.ly: 122
82 | twitter.com: 97
83 | securityaffairs.co: 91
84 | www.slideshare.net: 90
85 | paper.seebug.org: 75
86 | www.exploit-db.com: 70
87 | : 68
88 | www.blackhat.com: 66
89 | packetstormsecurity.com: 65
90 | www.zerodayinitiative.com: 63
91 | www.freebuf.com: 60
92 | gist.github.com: 56
93 | securelist.com: 54
94 | mp.weixin.qq.com: 54
95 | bobao.360.cn: 51
96 | ```
97 | See that? github is the number-one source for learning security; github is the number-one source for learning security; github is the number-one source for learning security. Important things deserve to be said three times.
98 |
99 | paper.seebug.org ranks quite high, which shows heige's work is well recognized, even ahead of www.exploit-db.com and the 安全客 digest. Some shortened links in there have not been followed and expanded yet; that will have to wait for a free moment.
100 |
101 |
102 | ### Tags
103 | A total of 59 tags were found, ranked as follows:
104 | ```
105 | Others : 1143
106 | Tools : 965
107 | Windows : 692
108 | Android : 586
109 | Malware : 527
110 | Browser : 479
111 | Popular Software : 410
112 | Attack : 400
113 | Web Security : 332
114 | Linux : 329
115 | MalwareAnalysis : 261
116 | Pentest : 243
117 | iOS : 213
118 | Hardware : 198
119 | Network : 176
120 | Vulnerability : 151
121 | WirelessSecurity : 147
122 | Detect : 139
123 | IoTDevice : 127
124 | macOS : 123
125 | Fuzzing : 109
126 | SecurityProduct : 107
127 | OpenSourceProject : 104
128 | ReverseEngineering : 102
129 | Crypto : 101
130 | Conference : 93
131 | Defend : 90
132 | Industry News : 89
133 | Mac OS X : 78
134 | Exploit : 76
135 | Debug : 69
136 | Virtualization : 68
137 | NetworkDevice : 62
138 | Mobile : 57
139 | Programming : 51
140 | SecurityReport : 50
141 | ThirdParty : 42
142 | Forensics : 40
143 | Operating System : 40
144 | Challenges : 40
145 | Firmware : 39
146 | Protocol : 38
147 | Mitigation : 36
148 | Obfuscation : 33
149 | MachineLearning : 33
150 | Sandbox : 30
151 | Backdoor : 23
152 | Rootkit : 23
153 | Cloud : 22
154 | ThreatIntelligence : 21
155 | Language : 13
156 | SCADA : 13
157 | Device : 11
158 | Attrack : 11
159 | Private : 11
160 | APT : 10
161 | Bug Bounty : 9
162 | : 6
163 | Symbolic Execution : 1
164 | ```
165 | This shows the focus is fairly clear and leans toward binary security. It does not quite satisfy the "thirst for knowledge" of junior white hats, who would prefer more Web-vulnerability content, which is why most of the feedback is that the digest is hard to understand.
166 |
167 | ## Running
168 | ```bash
169 | bundle install
170 | ruby analysis_xuanwu.rb
171 | ```
172 |
173 | If you need to follow the real twitter links (expanding t.co), use the following command:
174 | ```bash
175 | OPEN_TWITTER_URL_PARSE=1 http_proxy=http://127.0.0.1:8123 ruby analysis_xuanwu.rb
176 | ```
177 | This turns on OPEN_TWITTER_URL_PARSE and points http_proxy at a proxy that can reach twitter: http_proxy=http://127.0.0.1:8123
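For reference, this is roughly what happens under the hood (see `get_redirect_url` in `analysis_xuanwu.rb`): Ruby's `Net::HTTP` reads the `http_proxy` environment variable by default, and the `Location` header of the t.co response is taken as the real link. A minimal sketch (the t.co URL below is only a placeholder):
```ruby
require 'net/http'
require 'uri'

# Net::HTTP honors the http_proxy environment variable by default,
# so this request goes through the proxy when it is set.
def expand_short_url(url)
  response = Net::HTTP.get_response(URI(url))
  response.is_a?(Net::HTTPRedirection) ? response['location'] : url
end

puts expand_short_url('https://t.co/xxxxxxxx')  # placeholder short link
```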
178 |
179 | ## Queries
180 | ### Ranking of referenced websites
181 | ```sql
182 | select host,count(*) cnt from articles group by host order by cnt desc
183 | ```
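The same ranking can also be pulled directly from Ruby; a minimal sketch, assuming `xuanwu.sqlite3` has already been produced by `analysis_xuanwu.rb`:
```ruby
require 'sqlite3'

db = SQLite3::Database.new 'xuanwu.sqlite3'

# Top 20 referenced hosts, same query as above.
db.execute('select host, count(*) cnt from articles group by host order by cnt desc limit 20') do |host, cnt|
  puts "#{host}: #{cnt}"
end

# The same pattern works for tags and sources:
#   select tag,    count(*) cnt from articles group by tag    order by cnt desc
#   select atname, count(*) cnt from articles group by atname order by cnt desc
```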
--------------------------------------------------------------------------------
/analysis_xuanwu.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # encoding: utf-8
3 | require 'nokogiri'
4 | require 'domainatrix'
5 | require 'uri'
6 | require 'net/http'
7 |
8 | # Whether to follow twitter short links (t.co). Off by default; when enabled the real
9 | # source URL is resolved, which requires a proxy that can reach twitter (see README).
10 | OPEN_TWITTER_URL_PARSE = !ENV['OPEN_TWITTER_URL_PARSE'].nil?
11 |
12 | # Whether to store the parsed content in a sqlite database for later statistics
13 | USE_SQLITE3=true
14 | if USE_SQLITE3
15 | require "sqlite3"
16 | $db = SQLite3::Database.new "xuanwu.sqlite3"
17 |
18 | # Create a table
19 | $db.execute <<-SQL
20 | create table if not exists articles (
21 | title varchar(256),
22 | author varchar(256),
23 | atname varchar(256),
24 | source varchar(256),
25 | link varchar(256),
26 | host varchar(256),
27 | tag varchar(256),
28 | file varchar(256),
29 | description text,
30 | created_at timestamp,
31 | year int,
32 | month int,
33 | day int
34 | ) ;
35 | SQL
36 |
37 | def insert_article(obj)
38 | $db.execute "insert into articles (author, atname, description, tag, link, source, file, year, month, day, host) values ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", obj[:fullname], obj[:atname], obj[:description], obj[:tag], obj[:link], obj[:source], obj[:file], obj[:year], obj[:month], obj[:day], obj[:host]
39 | end
40 | end
41 |
42 | class XuanwuRss
43 | def initialize
44 | @dir = File.join(File.dirname(__FILE__), 'XuanwuLab.github.io')
45 | end
46 |
47 | def run
48 | objs = []
49 | git_update
50 | files = File.join(@dir, "cn", "secnews", "**", "*.html")
51 | Dir.glob(files){|file|
52 | # puts file
53 | objs += analysis_file(file) {|obj|
54 | yield(obj) if block_given?
55 | }
56 | }
57 | objs
58 | end
59 |
60 |
61 | def test(file)
62 | objs = []
63 | objs += analysis_file(file) {|obj|
64 | yield(obj) if block_given?
65 | }
66 | objs
67 | end
68 |
69 | def get_redirect_url(url)
70 | return url unless OPEN_TWITTER_URL_PARSE
71 | response = Net::HTTP.get_response(URI(url))
72 | case response
73 | when Net::HTTPRedirection then
74 | url = response['location']
75 | end
76 | url
77 | end
78 |
79 |
80 | def git_update
81 | unless Dir.exist?(@dir)
82 | puts `git clone https://github.com/XuanwuLab/XuanwuLab.github.io.git`
83 | end
84 |
85 | puts `cd XuanwuLab.github.io; git pull`
86 | end
87 |
88 | def analysis_file(file)
89 | prefix = File.absolute_path(File.dirname(__FILE__))
90 | filename = File.absolute_path(file).delete_prefix(prefix).delete_suffix('index.html')
91 | path = filename.split(/[\\\/]/)[4..6] # ["YYYY", "MM", "DD"] from .../cn/secnews/YYYY/MM/DD/
92 |
93 | objs = []
94 | doc = Nokogiri.parse(File.read(file).force_encoding('utf-8'))
95 | doc.xpath('//*[@id="weibowrapper"]/ul').each{|ul|
96 | objs += parse_content(ul) {|obj|
97 | obj[:file] = path.join('/')
98 | obj[:year],obj[:month],obj[:day] = path
99 | yield(obj) if block_given?
100 | }
101 | }
102 | objs
103 | end
104 |
105 | def extract_links(html)
106 | html.match(/((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?/umi).to_s # return the matched URL as a String ("" if none); callers expect a String, not MatchData
107 | end
108 |
109 | def parse_content(ul)
110 | objs = []
111 | if ul['class'] == 'weibolist'
112 | ul.xpath('li/div[@id="singleweibo"]').each{|content|
113 | #source = content.at_xpath('*[@id="singleweiboheader"]/*[@id="singleweibologo"]/img')['src']
114 | aid = content.at_xpath('*[@id="singleweiboheader"]/*[@id="singleweiboauthor"]/p')
115 | author = (aid && aid.text) || ""
116 | body = content.at_xpath('*[@id="singleweibobody"]/*[@class="singleweibotext"]/p').inner_html
117 | if author.size>0
118 | fullname,atname = author.split('@', 2)
119 | else
120 | fullname,atname = ["", ""]
121 | end
122 |
123 | tag, link, description = parse_body(body)
124 |
125 | if link.include?('t.co')
126 | link = get_redirect_url(link)
127 | end
128 |
129 | atname ||= ""
130 | host = ""
131 | begin
132 | host = URI.parse(link.strip).host
133 | rescue => e
134 | puts "\n[WARNING] #{e} of link: #{link}"
135 | end
136 |
137 | obj = {source:'twitter',
138 | fullname:fullname.strip,
139 | atname: atname.strip,
140 | tag: tag.strip,
141 | link: link.strip,
142 | host: host,
143 | description: description.strip}
144 | objs << obj
145 | yield(obj) if block_given?
146 |
147 | }
148 | elsif ul['id'] == 'manualfeedlist'
149 | ul.xpath('li/div[@class="singlemanualfeed"]').each{|content|
150 | aid = content.at_xpath('*[@class="singlefeedheader"]/*[@class="singlefeedauthor"]/p')
151 | author = (aid && aid.text) || ""
152 | if author.size>0
153 | fullname,atname = author.split('@', 2)
154 | else
155 | fullname,atname = ["", ""]
156 | end
157 | body = content.at_xpath('*[@class="singlefeedbody"]/*[@class="singlefeedtext"]/p').inner_html
158 | tag, link, description = parse_body(body)
159 | atname ||= ""
160 |
161 | unless link && link.size>8
162 | link = extract_links(description) || ""
163 | end
164 |
165 | if link.include?('t.co')
166 | link = get_redirect_url(link)
167 | end
168 |
169 | fullname = fullname.delete_prefix('Xuanwu Spider via ') if fullname.include?('Xuanwu Spider via ')
170 | host = ""
171 | begin
172 | host = URI.parse(link.strip).host
173 | rescue => e
174 | puts "\n[WARNING] #{e} of link: #{link}"
175 | end
176 |
177 |
178 |
179 | obj = {source:fullname.strip,
180 | atname: atname.strip,
181 | tag: tag.strip,
182 | link: link.strip,
183 | host: host,
184 | description: description.strip}
185 | objs << obj
186 | yield(obj) if block_given?
187 | }
188 | else
189 | raise "unknown ul of #{ul.html}"
190 | end
191 | objs
192 | end
193 |
194 | def parse_body(body)
195 | tag = ''
196 | link = ''
197 | m = body.match(/[\[]?(?<tag>.*?)\](?<description>.*?)\<a href="(?<link>.*?)"/umi)
198 |
199 | unless m
200 | m = body.match(/[\[]?(?<tag>.*?)\](?<description>.*)(?<link>http[s]?:\/\/.*)/umi)
201 | end
202 |
203 | unless m
204 | m = body.match(/[\[]?(?<tag>.*?)\](?<description>.*)/umi)
205 | end
206 |
207 | if m
208 | tagis = m[:tag].scan(/\<i\>(.*?)\<\/i\>/um)
209 | if tagis.size > 0
210 | tag = tagis.first.first.strip
211 | end
212 | link = m[:link].strip if m.names.include? 'link'
213 | end
214 |
215 | [tag, link, m[:description]]
216 | end
217 |
218 | def self.host_of_url(url)
219 | begin
220 | url = 'http://'+url+'/' if !url.include?('http://') and !url.include?('https://')
221 | url = URI::DEFAULT_PARSER.escape(url) unless url.include? '%' # a percent sign means the url is already encoded
222 | uri = URI(url)
223 | uri.host
224 | rescue => e
225 | nil
226 | end
227 | end
228 | end
229 |
230 | puts ARGV
231 | if ARGV.size==1 then
232 | objs = XuanwuRss.new.test(ARGV[0]) {|obj|
233 | print '.'
234 | insert_article(obj)
235 | }
236 | else
237 | objs = XuanwuRss.new.run{|obj|
238 | print '.'
239 | insert_article(obj)
240 | }
241 | end
242 |
243 | cnt_hash = objs.each_with_object(Hash.new(0)){|h1, h2| h2[h1[:atname]]+=1}.sort_by{|k,v| v}.reverse
244 | puts "="*30
245 | puts "distinct count: #{cnt_hash.size}"
246 | all_cnt = 0
247 | cnt_hash.each{|k,v|all_cnt+=v}
248 | puts "articles count: #{all_cnt}"
249 | puts "sort by source: "
250 | puts cnt_hash.map{|k,v| "#{k}:\t#{v}"}
251 |
252 | tag_hash = objs.each_with_object(Hash.new(0)){|h1, h2| h2[h1[:tag]]+=1}.sort_by{|k,v| v}.reverse
253 | puts "="*30
254 | puts "distinct tag count: #{tag_hash.size}"
255 | puts "sort by tag: "
256 | puts tag_hash.map{|k,v| "#{k}:\t#{v}"}
257 |
258 | host_hash = objs.each_with_object(Hash.new(0)){|h1, h2| h2[XuanwuRss.host_of_url(h1[:link])]+=1}.sort_by{|k,v| v}.reverse
259 | puts "="*30
260 | puts "distinct host count: #{host_hash.size}"
261 | puts "sort by host: "
262 | puts host_hash.map{|k,v| "#{k}:\t#{v}"}
--------------------------------------------------------------------------------