├── data
│   └── .gitkeep
├── .gitignore
├── gen
│   ├── nocase.sql
│   ├── filtredirs.rb
│   ├── proclinks.rb
│   ├── sqlindex.rb
│   ├── doublelink.rb
│   ├── doublelink.nim
│   ├── dumplinks.rb
│   └── binindex.rb
├── Gemfile
├── Gemfile.lock
├── analyze
│   ├── invalid_links.rb
│   ├── inspect.rb
│   ├── link_stats.rb
│   ├── verify.rb
│   ├── graph.rb
│   └── strong_conn.rs
├── LICENSE
├── Rakefile
└── README.md
/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | bin 3 | gen/doublelink 4 | gen/nimcache -------------------------------------------------------------------------------- /gen/nocase.sql: -------------------------------------------------------------------------------- 1 | create index pages_nocase on pages (title collate nocase); 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem "sqlite3" 3 | gem "progress" 4 | gem "ox" 5 | gem "triez" 6 | gem "rake" 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | ox (2.1.4) 5 | progress (3.0.0) 6 | rake (10.3.2) 7 | sqlite3 (1.3.10) 8 | triez (1.0.4) 9 | 10 | PLATFORMS 11 | ruby 12 | 13 | DEPENDENCIES 14 | ox 15 | progress 16 | rake 17 | sqlite3 18 | triez 19 | -------------------------------------------------------------------------------- /analyze/invalid_links.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby invalid_links.rb path/to/links.txt path/to/titles.txt max" unless ARGV.length == 3 4 | links_path, titles_path, max = ARGV 5 | MAX_PAGES = max.to_i 6 | 7 | STDERR.puts "Building Validity Hash" 8 | valid = Triez.new value_type: :object 9 | IO.foreach(titles_path) do |l| 10 | valid[l.strip] = true 11 | end 12 | 13 | STDERR.puts "Analyzing Links" 14 | count = 0 15 | IO.foreach(links_path) do |line| 16 | page, *links = line.chomp.split('|').map{ |x| x.strip } 17 | invalid = links.uniq.reject { |l| valid.has_key?(l)} 18 | puts "# #{page}" 19 | invalid.each { |x| puts x } 20 | count += 1 21 | break if count > MAX_PAGES 22 | end 23 | -------------------------------------------------------------------------------- /gen/filtredirs.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby filtredirs.rb path/to/titles.txt path/to/redirects.txt path/to/put/redirects-filt.txt" unless ARGV.length == 3 4 | titles_path, redir_path, out_path = ARGV 5 | 6 | puts "Building Validity Hash" 7 | valid = Triez.new value_type: :object 8 | IO.foreach(titles_path) do |l| 9 | valid[l.chomp] = true 10 | end 11 | 12 | puts "Processing..." 
13 | out = File.open(out_path,'w') 14 | IO.foreach(redir_path) do |l| 15 | from,to = l.split('|').map { |x| x.strip } 16 | # This doesn't do anything, everything is capped 17 | # to[0] = to[0].capitalize 18 | next unless valid.has_key?(to) # points to valid thing 19 | next if valid.has_key?(from) # conflicts with real page 20 | out.puts "#{from}|#{to}" 21 | end 22 | out.close 23 | -------------------------------------------------------------------------------- /analyze/inspect.rb: -------------------------------------------------------------------------------- 1 | require_relative "graph.rb" 2 | 3 | def print_list(g,ls) 4 | p ls.map { |l| g.name(l) } 5 | end 6 | 7 | raise "Usage: inspect.rb path/to/index.bin path/to/xindex.db query" unless ARGV.length == 3 8 | 9 | f = File.open(ARGV[0]) 10 | g = Graph.new(f,ARGV[1]) 11 | 12 | query = ARGV[2].strip 13 | if query.to_i.to_s == query 14 | q = query.to_i 15 | else 16 | q = g.find(query) 17 | end 18 | raise "Could not find page" unless q 19 | 20 | puts "Name: #{g.name(q)}" 21 | puts "Index: #{q}" 22 | puts "Meta: #{g.meta(q).to_s(2).rjust(32,'0')}" 23 | puts "Link Count: #{g.link_count(q)}" 24 | puts "Bidirectional Links: #{g.bi_link_count(q)}" 25 | puts "Bidirectional Links:" 26 | print_list(g, g.page_bi_links(q)) 27 | links = g.page_links(q) 28 | puts "Outgoing Link Offsets:" 29 | p links 30 | puts "Outgoing Links:" 31 | print_list(g, links) 32 | -------------------------------------------------------------------------------- /gen/proclinks.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby proclinks.rb path/to/titles.txt path/to/redirects.txt path/to/links.txt path/to/put/links.txt" unless ARGV.length == 4 4 | titles_path, redir_path, links_path, out_path = ARGV 5 | 6 | puts "Building Validity Hash" 7 | valid = Triez.new value_type: :object 8 | IO.foreach(titles_path) do |l| 9 | valid[l.chomp] = true 10 | end 11 | puts "Building Redirect Hash" 12 | redirects = Triez.new value_type: :object 13 | IO.foreach(redir_path) do |l| 14 | key,val = l.chomp.split('|') 15 | redirects[key] = val 16 | end 17 | 18 | puts "Processing..." 19 | out = File.open(out_path,'w') 20 | IO.foreach(links_path) do |l| 21 | page,meta,*links = l.split('|').map { |x| x.strip } 22 | # next if page.length == 0 23 | links.reject! { |li| li.length == 0 } 24 | links.each do |li| 25 | li[0] = li[0].capitalize 26 | end 27 | links.map! { |li| redirects[li] || li } 28 | links = links.select { |li| valid.has_key?(li) }.uniq 29 | links.unshift(page,meta) 30 | out.puts links.join('|') 31 | end 32 | out.close 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Tristan Hume 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /analyze/link_stats.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby link_stats.rb path/to/links.txt path/to/titles.txt" unless ARGV.length == 2 4 | links_path, titles_path = ARGV 5 | 6 | puts "Building Validity Hash" 7 | valid = Triez.new value_type: :object 8 | IO.foreach(titles_path) do |l| 9 | valid[l.strip] = true 10 | end 11 | 12 | puts "Analyzing Links" 13 | stats = Hash.new(0) 14 | IO.foreach(links_path) do |line| 15 | page, *links = line.chomp.split('|').map{ |x| x.strip } 16 | stats[:pages] += 1 17 | p [page,links,line] unless page 18 | if valid.has_key?(page) 19 | stats[:valid_pages] += 1 20 | stats[:links] += links.count 21 | 22 | links_uniq = links.uniq 23 | stats[:unique_links] += links_uniq.count 24 | stats[:valid_links] += links_uniq.count { |l| valid.has_key?(l)} 25 | end 26 | end 27 | stats[:valid_link_frac] = stats[:valid_links] / stats[:unique_links].to_f 28 | stats[:valid_page_frac] = stats[:valid_pages] / stats[:pages].to_f 29 | [:links,:unique_links,:valid_links].each do |stat| 30 | avg_name = ("average_" + stat.to_s).intern 31 | stats[avg_name] = stats[stat] / stats[:valid_pages].to_f 32 | end 33 | p stats 34 | -------------------------------------------------------------------------------- /analyze/verify.rb: -------------------------------------------------------------------------------- 1 | require "progress" 2 | require "sqlite3" 3 | 4 | class Parser 5 | FILE_HEADER_SIZE = 4*4 6 | LINK_SIZE = 4 7 | HEADER_SIZE = 4*4 8 | 9 | def initialize(f,db_path) 10 | @f = f 11 | @db = SQLite3::Database.new db_path 12 | end 13 | 14 | def run 15 | header = @f.read(FILE_HEADER_SIZE) 16 | num_pages = header.unpack("LLLL")[1] 17 | puts "Processing #{num_pages} pages." 18 | num_pages.times.with_progress do 19 | do_page 20 | end 21 | end 22 | 23 | def do_page 24 | this_page = @f.pos 25 | raise "Header fail at #{this_page}" unless get_int == 0 26 | raise "No entry in page DB for #{this_page}" unless name(this_page) 27 | num_links = get_int 28 | raise "Already processed" unless get_int == 0 29 | raise "Bad meta" unless get_int < 2**16 30 | borked_links = @f.read(4*num_links).unpack("L*").reject { |p| name(p) } 31 | raise "Borked links for #{this_page}: #{borked_links.inspect}" unless borked_links.empty? 32 | # (1..num_links).map {get_int} 33 | end 34 | 35 | private 36 | 37 | def get_int 38 | @f.read(4).unpack("L").first 39 | end 40 | 41 | def name(p) 42 | rs = @db.execute("SELECT title FROM pages WHERE offset = ? LIMIT 1",p) 43 | return nil if rs.empty? 
44 | # rs.first.first 45 | true 46 | end 47 | end 48 | 49 | raise "pass bin file to verify and db to check against" if ARGV.length != 2 50 | f = File.open(ARGV[0]) 51 | p = Parser.new(f, ARGV[1]) 52 | p.run 53 | -------------------------------------------------------------------------------- /analyze/graph.rb: -------------------------------------------------------------------------------- 1 | require "sqlite3" 2 | 3 | class File 4 | def each_chunk(chunk_size=1024*1024) 5 | yield read(chunk_size) until eof? 6 | end 7 | end 8 | 9 | class Graph 10 | HEADER_SIZE = 4 11 | def initialize(f,db_path, dbg = true) 12 | @debug = dbg 13 | debug "loading file" 14 | @d = [] 15 | f.each_chunk do |chunk| 16 | @d.concat(chunk.unpack("L*")) 17 | end 18 | @db = SQLite3::Database.new db_path 19 | end 20 | 21 | def at(p,i) 22 | @d[p/4+i] 23 | end 24 | 25 | def link_count(p) 26 | at(p,1) 27 | end 28 | 29 | def bi_link_count(p) 30 | at(p,2) 31 | end 32 | 33 | def meta(p) 34 | at(p,3) 35 | end 36 | 37 | def page_links(p) 38 | x = p/4 39 | c = @d[x+1] # link count 40 | @d[x+HEADER_SIZE..x+HEADER_SIZE+c-1] 41 | end 42 | 43 | def page_bi_links(p) 44 | x = p/4 45 | c = @d[x+2] # bi link count 46 | @d[x+HEADER_SIZE..x+HEADER_SIZE+c-1] 47 | end 48 | 49 | def page_un_links(p) 50 | x = p/4 51 | b = @d[x+2] # bi link count 52 | c = @d[x+1] # bi link count 53 | @d[x+HEADER_SIZE+b..x+HEADER_SIZE+c-1] 54 | end 55 | 56 | def name(p) 57 | rs = @db.execute("SELECT title FROM pages WHERE offset = ? LIMIT 1",p) 58 | return nil if rs.empty? 59 | rs.first.first 60 | end 61 | 62 | def find(s) 63 | rs = @db.execute("SELECT offset FROM pages WHERE title = ? LIMIT 1",s) 64 | return nil if rs.empty? 65 | rs.first.first 66 | end 67 | 68 | private 69 | 70 | def debug(msg) 71 | STDERR.puts msg if @debug 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /gen/sqlindex.rb: -------------------------------------------------------------------------------- 1 | require "sqlite3" 2 | require "fileutils" 3 | require "triez" 4 | 5 | class Parser 6 | FILE_HEADER_SIZE = 4*4 7 | LINK_SIZE = 4 8 | HEADER_SIZE = 4*4 9 | attr_accessor :pos 10 | 11 | def initialize(f,db_path) 12 | @f = f 13 | @pos = FILE_HEADER_SIZE 14 | @total = 0 15 | 16 | FileUtils.rm_f(db_path) 17 | @db = SQLite3::Database.new db_path 18 | @db.execute <<-SQL 19 | create table pages ( 20 | title varchar(256) PRIMARY KEY, 21 | offset int 22 | ); 23 | SQL 24 | @db.execute("CREATE INDEX pages_offset ON pages (offset)") 25 | @db.execute("PRAGMA synchronous = OFF;") 26 | end 27 | 28 | def finish 29 | @db.execute("PRAGMA synchronous = ON;") 30 | puts "Number of Pages: #{@total}" 31 | puts "File size: #{@pos}" 32 | end 33 | 34 | def document 35 | IO.foreach(@f).with_index do |l,i| 36 | page(l.chomp.split('|').map{ |x| x.strip }) 37 | print '.' 
if i % 1000 == 0 38 | end 39 | end 40 | 41 | def page(line) 42 | name = line[0] 43 | l = num_links(line) 44 | @db.execute("INSERT INTO pages (title, offset) VALUES (?,?)",[name,@pos]) 45 | @pos += HEADER_SIZE + LINK_SIZE*l 46 | @total += 1 47 | end 48 | 49 | def num_links(ls) 50 | # First is name, second is meta 51 | ls.length-2 52 | end 53 | end 54 | 55 | raise "Usage: ruby 3-sqlindex.rb path/to/links.txt path/to/put/xindex.db" unless ARGV.length == 2 56 | links_path, db_path = ARGV 57 | 58 | puts "Parsing" 59 | f = File.open(links_path) 60 | # f = STDIN 61 | p = Parser.new(f,db_path) 62 | p.document 63 | p.finish 64 | -------------------------------------------------------------------------------- /gen/doublelink.rb: -------------------------------------------------------------------------------- 1 | require "progress" 2 | 3 | class Parser 4 | FILE_HEADER_SIZE = 4*4 5 | LINK_SIZE = 4 6 | HEADER_SIZE = 4*4 7 | 8 | def initialize(f, out_path) 9 | @f = f 10 | @out = File.open(out_path,"w") 11 | end 12 | 13 | def run 14 | header = @f.read(FILE_HEADER_SIZE) 15 | num_pages = header.unpack("LLLL")[1] 16 | @out.write(header) 17 | num_pages.times.with_progress do 18 | do_page 19 | end 20 | end 21 | 22 | def do_page 23 | # puts "Doing page at #{@f.pos}" 24 | this_page = @f.pos 25 | links, meta = page_links 26 | double,single = links.partition { |l| bidirectional?(l,this_page)} 27 | output_page(double,single,meta) 28 | end 29 | 30 | def output_page(double,single, meta) 31 | total = double.length + single.length 32 | @out.write([0,total,double.length,meta].pack("LLL")) # header 33 | @out.write(double.pack("L*")) 34 | @out.write(single.pack("L*")) 35 | end 36 | 37 | def page_links 38 | user_data = get_int 39 | raise "Header fail at #{@f.pos - 4}: #{user_data} should be 0" unless user_data == 0 40 | num_links = get_int 41 | raise "Already processed" unless get_int == 0 42 | meta = get_int # metadata 43 | return ((1..num_links).map {get_int}), meta 44 | end 45 | 46 | private 47 | 48 | def get_int 49 | @f.read(4).unpack("L").first 50 | end 51 | 52 | def bidirectional?(page,other) 53 | old_pos = @f.pos 54 | @f.seek(page) 55 | links = page_links 56 | res = links.include?(other) 57 | @f.seek(old_pos) 58 | res 59 | end 60 | end 61 | raise "Usage: ruby 5-doublelink.rb path/to/index.bin path/to/put/newindex.bin" unless ARGV.length == 2 62 | bin_path, out_path = ARGV 63 | 64 | puts "Bidirectionally linking file." 65 | puts "This Ruby version is super slow. Install Nim, delete indexbi.bin and run again to get the super fast Nim version." 66 | f = File.open(bin_path) 67 | p = Parser.new(f, out_path) 68 | p.run 69 | -------------------------------------------------------------------------------- /gen/doublelink.nim: -------------------------------------------------------------------------------- 1 | import sequtils, queues, strutils, algorithm 2 | 3 | type 4 | Graph* = seq[int32] 5 | Page = int32 6 | 7 | const 8 | kPageUserDataField = 0 9 | kPageLinksField = 1 10 | kPageBidLinksField = 2 11 | kPageHeaderSize = 4 12 | kFirstPageIndex = 4 13 | 14 | proc offset*[A](some: ptr A; b: int): ptr A = 15 | result = cast[ptr A](cast[int](some) + (b * sizeof(A))) 16 | iterator iterPtr*[A](some: ptr A; num: int): A = 17 | for i in 0.. 
RAW_DUMP_PATH) 5 | dump = args[:dump_path] 6 | raise "#{dump} must exist" unless File.exist?(dump) 7 | sh "bzip2 -dc \"#{dump}\" | ruby gen/dumplinks.rb data/links-raw.txt data/redirects-raw.txt" 8 | end 9 | 10 | file "data/links-filt.txt" => ["data/links-raw.txt"] do 11 | sh "grep -Ev \"^(File|Template|Help|Draft):\" data/links-raw.txt > data/links-filt.txt" 12 | end 13 | 14 | file "data/titles.txt" => ["data/links-filt.txt"] do 15 | sh "cut -d'|' -f1 data/links-filt.txt > data/titles.txt" 16 | end 17 | 18 | file "data/redirects.txt" => ["data/links-raw.txt","data/titles.txt"] do 19 | ruby "gen/filtredirs.rb data/titles.txt data/redirects-raw.txt data/redirects.txt" 20 | end 21 | 22 | file "data/links.txt" => ["data/links-filt.txt","data/redirects.txt","data/titles.txt"] do 23 | ruby "gen/proclinks.rb data/titles.txt data/redirects.txt data/links-filt.txt data/links.txt" 24 | end 25 | 26 | file "data/xindex.db" => ["data/links.txt"] do 27 | ruby "gen/sqlindex.rb data/links.txt data/xindex.db" 28 | end 29 | 30 | file "data/index.bin" => ["data/links.txt","data/xindex.db"] do 31 | ruby "gen/binindex.rb data/links.txt data/xindex.db data/index.bin" 32 | end 33 | 34 | file "data/indexbi.bin" => ["data/index.bin"] do 35 | if system("which","nim") 36 | sh "nim c -d:release gen/doublelink.nim" 37 | sh "./gen/doublelink" 38 | else 39 | ruby "gen/doublelink.rb data/index.bin data/indexbi.bin" 40 | end 41 | end 42 | 43 | directory "bin" 44 | file "bin/strong_conn" => ["bin"] do 45 | sh "rustc -O -o bin/strong_conn analyze/strong_conn.rs" 46 | end 47 | 48 | task :verify => "data/index.bin" do 49 | ruby "analyze/verify.rb data/index.bin data/xindex.db" 50 | end 51 | 52 | task :inspect, :page do |t, args| 53 | ruby "analyze/inspect.rb data/indexbi.bin data/xindex.db \"#{args[:page]}\"" 54 | end 55 | 56 | task :link_stats do 57 | ruby "analyze/link_stats.rb data/links.txt data/titles.txt" 58 | end 59 | 60 | task :invalid_links do 61 | sh "ruby analyze/invalid_links.rb data/links.txt data/titles.txt 1000 > data/invalid-links.txt" 62 | end 63 | 64 | task :strong_conn => ["bin/strong_conn"] do 65 | sh "./bin/strong_conn data/index.bin" 66 | end 67 | 68 | task :nocase => ["data/xindex.db"] do 69 | cp "data/xindex.db", "data/xindex-nocase.db" 70 | sh "sqlite3 data/xindex-nocase.db < gen/nocase.sql" 71 | end 72 | -------------------------------------------------------------------------------- /gen/dumplinks.rb: -------------------------------------------------------------------------------- 1 | require 'ox' 2 | 3 | class Handler < ::Ox::Sax 4 | LINK_REGEX = /\[\[([^|\n\]]+)(?:\|[^\n\]]+)?\]\]|(<!--)|(-->)/ 5 | DIS_REGEX = /\{\{\s*((?:Disambiguation[^}]*)|Airport disambiguation|Biology disambiguation|Call sign disambiguation|Caselaw disambiguation|Chinese title disambiguation|Genus disambiguation|Geodis|Hndis|Hndis-cleanup|Hospital disambiguation|Letter disambiguation|Letter-NumberCombDisambig|Mathematical disambiguation|Mil-unit-dis|Numberdis|Phonetics disambiguation|Road disambiguation|School disambiguation|Species Latin name disambiguation|Wikipedia disambiguation|disambig|dab|disamb)\s*\}\}/i 6 | def initialize(link_file, redir_file) 7 | @link_file = File.open(link_file,"w") 8 | @redir_file = File.open(redir_file,"w") 9 | end 10 | 11 | def start_element(name) 12 | case name 13 | when :page 14 | @title = nil 15 | @links = [] 16 | @is_redirect = false 17 | @is_disambig = false 18 | when :redirect 19 | @is_redirect = true 20 | when :title 21 | @in_title = true 22 | when :text 23 | @in_text = true 24 | end 25 
| end 26 | def end_element(name) 27 | case name 28 | when :text 29 | do_page 30 | @in_text = false 31 | when :title 32 | @in_title = false 33 | end 34 | end 35 | def attr(name, value) 36 | if @is_redirect && name == :title 37 | @redirect = value 38 | end 39 | end 40 | def text(value) 41 | case 42 | when @in_title 43 | @title = value 44 | when @in_text 45 | @page_length = value.bytesize 46 | @real_text = true 47 | value.scan(LINK_REGEX) do |lin, op, clos| 48 | # p [lin,op,clos,@real_text] 49 | if lin && @real_text 50 | @links << lin if lin.length < 100 51 | elsif op 52 | @real_text = false 53 | elsif clos 54 | @real_text = true 55 | end 56 | end 57 | if DIS_REGEX =~ value 58 | @is_disambig = true 59 | end 60 | end 61 | end 62 | 63 | private 64 | 65 | def do_page 66 | return unless @title 67 | if @is_redirect 68 | do_redirect 69 | else 70 | do_real_page 71 | end 72 | end 73 | 74 | def do_real_page 75 | @link_file.puts "#{@title}|#{@page_length}-#{@is_disambig ? 'D' : ''}|#{@links.map{ |x| x.strip }.join('|')}" 76 | end 77 | 78 | def do_redirect 79 | return unless @redirect 80 | @redir_file.puts "#{@title}|#{@redirect}" 81 | end 82 | end 83 | 84 | raise "Usage: cat wikidump.xml | ruby 1-dumplinks.rb path/to/put/links.txt path/to/put/redirects.txt" unless ARGV.length == 2 85 | puts "Dumping links..." 86 | handler = Handler.new(ARGV[0],ARGV[1]) 87 | Ox.sax_parse(handler, STDIN) 88 | -------------------------------------------------------------------------------- /gen/binindex.rb: -------------------------------------------------------------------------------- 1 | require "sqlite3" 2 | require "triez" 3 | 4 | class Parser 5 | FILE_HEADER_SIZE = 4*4 6 | LINK_SIZE = 4 7 | HEADER_SIZE = 4*4 8 | NAMESPACES = {"Category" => 1, "Wikipedia" => 2, "Portal" => 3, "Book" => 4} 9 | attr_accessor :pos 10 | 11 | def initialize(f,db_path,bin_path) 12 | @f = f 13 | @db = SQLite3::Database.new db_path 14 | @out = File.open(bin_path,"w") 15 | file_header 16 | end 17 | 18 | def document 19 | IO.foreach(@f).with_index do |l,i| 20 | page(l.chomp.split('|')) 21 | if i % 10_000 == 0 22 | puts "#{(i/@total.to_f*100.0).round(3)}%" 23 | end 24 | end 25 | @out.close 26 | end 27 | 28 | def page(line) 29 | name, meta, *links = line 30 | fill(name, meta, links) 31 | end 32 | 33 | def file_header 34 | @total = @db.execute("SELECT count(*) FROM pages").first.first 35 | # version, num articles, header length, extra 36 | @out.write([2,@total,FILE_HEADER_SIZE,HEADER_SIZE].pack("L*")) 37 | end 38 | 39 | def fill(title, meta, ls) 40 | link_data = ls.map{ |l| get_offset(l)} 41 | @out.write([0,link_data.length,0, meta_bits(title, meta)].pack("LLLL")) # header 42 | @out.write(link_data.pack("L*")) 43 | end 44 | 45 | private 46 | 47 | def get_offset(name) 48 | rows = @db.execute("SELECT offset FROM pages WHERE title = ? LIMIT 1", name) 49 | rows.first.first 50 | end 51 | 52 | # 32 bits of metadata, packed like so (starting at the least significant bit): 53 | # 54 | # 3 bits = log10(length of article markup in bytes) 55 | # 4 bits = min(number of words in title, 15) 56 | # 1 bit = 1 if is a disambiguation page 57 | # 58 | # 3 bits = article namespace of [normal, category, wikipedia, portal, book ... potential others ... 
 7=other namespace] 59 | # 1 bit = 1 if page is a "List of" article 60 | # 1 bit = 1 if page is a year 61 | # The following bits are not set by this script but their places are reserved 62 | # 1 bit = if the article is a featured article 63 | # 1 bit = if the article is a "good" article 64 | # (32-15=17) bits of zeroes reserved for future use 65 | def meta_bits(name, meta) 66 | textlen_str, flags = meta.split('-') 67 | log_textlen = textlen_str.length - 1 # should be log10(textlen) 68 | raise "Out of range textlen" if log_textlen > 7 || log_textlen < 0 69 | title_words = [name.split.length,15].min 70 | 71 | if /^([A-Z][a-z]+):/ =~ name 72 | type = NAMESPACES[$1] || 7 73 | else 74 | type = 0 75 | end 76 | 77 | is_disambig = (!!flags) && flags.include?('D') 78 | is_list = name.start_with?("List of ") 79 | is_year = (/^[0-9]{1,4}$/ === name) 80 | 81 | pack_bits([[log_textlen, 3], [title_words, 4], [is_disambig, 1], [type, 3], [is_list, 1], [is_year, 1]]) 82 | end 83 | 84 | def pack_bits(arr) 85 | res = 0 86 | bits_so_far = 0 87 | arr.each do |n, num_bits| 88 | n = 1 if n == true 89 | n = 0 if n == false 90 | res = res | (n << bits_so_far) 91 | bits_so_far += num_bits 92 | end 93 | res 94 | end 95 | end 96 | 97 | raise "Usage: ruby binindex.rb path/to/links.txt path/to/xindex.db path/to/put/index.bin" unless ARGV.length == 3 98 | links_path, db_path, bin_path = ARGV 99 | 100 | f = File.open(links_path) 101 | p = Parser.new(f,db_path, bin_path) 102 | p.document 103 | -------------------------------------------------------------------------------- /analyze/strong_conn.rs: -------------------------------------------------------------------------------- 1 | #![feature(env)] 2 | #![feature(old_io)] 3 | #![feature(old_path)] 4 | #![feature(collections)] 5 | 6 | #![allow(dead_code)] 7 | 8 | use std::env; 9 | use std::mem; 10 | use std::old_io::File; 11 | 12 | static FILE_HEADER_SIZE : usize = 4*4; 13 | static PAGE_HEADER_SIZE : usize = 4; 14 | 15 | static PAGE_USER_DATA : usize = 0; 16 | static PAGE_LINKS : usize = 1; 17 | static PAGE_BID_LINKS : usize = 2; 18 | 19 | struct Graph<'a> { 20 | data : &'a mut [u32], 21 | } 22 | 23 | struct PageIter<'a> { 24 | g : &'a Graph<'a>, 25 | cur : usize, 26 | } 27 | 28 | impl<'a> Iterator for PageIter<'a> { 29 | type Item = usize; 30 | fn next(&mut self) -> Option<usize> { 31 | let next_page = self.cur + (PAGE_HEADER_SIZE+self.g.link_count(self.cur))*4; 32 | self.cur = next_page; 33 | if next_page >= self.g.data.len() * 4 { None } else { Some(next_page) } 34 | } 35 | } 36 | 37 | impl<'a> Graph<'a> { 38 | fn first_page(&self) -> usize { 39 | FILE_HEADER_SIZE 40 | } 41 | 42 | fn find_next(&self, page : usize) -> Option<usize> { 43 | let next_page = page + (PAGE_HEADER_SIZE+self.link_count(page))*4; 44 | if next_page >= self.data.len() * 4 { None } else { Some(next_page) } 45 | } 46 | 47 | fn find_next_unmarked(&self,start : usize) -> Option<usize> { 48 | let mut page = start; 49 | while self.user_data(page) != 0 { 50 | page = page + (PAGE_HEADER_SIZE+self.link_count(page))*4; 51 | if page >= self.data.len() * 4 { return None;} 52 | } 53 | Some(page) 54 | } 55 | 56 | fn pages(&self) -> PageIter { 57 | PageIter {g: self, cur: self.first_page()} 58 | } 59 | 60 | fn page_count(&self) -> u32 { 61 | self.data[1] 62 | } 63 | 64 | fn link_count(&self, page : usize) -> usize { 65 | self.data[page/4+PAGE_LINKS] as usize 66 | } 67 | 68 | fn bid_link_count(&self, page : usize) -> usize { 69 | self.data[page/4+PAGE_BID_LINKS] as usize 70 | } 71 | 72 | fn links(&'a self, page : usize) -> Vec<usize> { 73 | 
let start = page/4+PAGE_HEADER_SIZE; 74 | let end = start+self.link_count(page); 75 | let link_range = &self.data[start..end]; 76 | link_range.iter().map(|x| *x as usize).collect::<Vec<usize>>() 77 | } 78 | 79 | fn set_user_data(&mut self, page : usize, data : u32) { 80 | self.data[page/4+PAGE_USER_DATA] = data; 81 | } 82 | 83 | fn user_data(&self, page : usize) -> u32 { 84 | self.data[page/4+PAGE_USER_DATA] 85 | } 86 | } 87 | 88 | fn flood_fill(graph : &mut Graph, start_page : usize, mark : u32) -> u32 { 89 | assert!(mark != 0); 90 | let mut stack = vec![start_page]; 91 | let mut marked_count = 0; 92 | while !stack.is_empty() { 93 | let page = stack.pop().unwrap(); 94 | 95 | if graph.user_data(page) != 0 {continue;} 96 | graph.set_user_data(page,mark); // mark visited 97 | // println!("Visiting {} with {} links",page,graph.link_count(page)); 98 | marked_count += 1; 99 | 100 | for linked in graph.links(page) { 101 | // println!("Pushing link to {}", linked); 102 | stack.push(linked); 103 | } 104 | } 105 | marked_count 106 | } 107 | 108 | fn find_conn_components(graph : &mut Graph) { 109 | let mut start_page = graph.first_page(); 110 | let mut comp_count = 0; 111 | loop { 112 | let count = flood_fill(graph, start_page,1); 113 | if count > 100 { 114 | println!("Found a connected component of {} nodes out of {} pages = {}.", 115 | count,graph.page_count(),(count as f32 / graph.page_count() as f32)); 116 | } 117 | comp_count += 1; 118 | 119 | let next_page = graph.find_next_unmarked(start_page); 120 | match next_page { 121 | Some(page) => start_page = page, 122 | None => break, 123 | } 124 | } 125 | println!("Found {} components.",comp_count); 126 | } 127 | 128 | fn fill_incoming_links(graph : &mut Graph) { 129 | let mut page = graph.first_page(); 130 | // Increment link count on all linked to pages, then move to next 131 | loop { 132 | for linked in graph.links(page) { 133 | let incd = graph.user_data(linked)+1; 134 | graph.set_user_data(linked, incd); 135 | } 136 | 137 | match graph.find_next(page) { 138 | None => break, 139 | Some(new_page) => page = new_page, 140 | } 141 | } 142 | } 143 | 144 | static DATA_HIST_MAX : usize = 50; 145 | fn analyze_user_data(graph : &Graph) { 146 | let mut hist : Vec<u32> = vec![0; DATA_HIST_MAX]; 147 | for page in graph.pages() { 148 | let count = graph.user_data(page); 149 | if (count as usize) < DATA_HIST_MAX { 150 | hist[count as usize] += 1; 151 | } 152 | } 153 | println!("Incoming links:"); 154 | for c in 0..hist.len() { 155 | println!("{}: {}",c, hist[c]); 156 | } 157 | } 158 | 159 | fn main() { 160 | let args: Vec<String> = env::args().map(|x| x.to_string()).collect(); 161 | 162 | if args.len() != 2 { 163 | println!("Usage: ./strong_conn path/to/indexbi.bin"); 164 | env::set_exit_status(1); 165 | return; 166 | } 167 | 168 | let bin_path = Path::new(&args[1]); 169 | println!("Analyzing {}...",bin_path.display()); 170 | 171 | let mut file = File::open(&bin_path).ok().expect("Could not open graph file."); 172 | 173 | let mut graph_data : Vec<u32>; 174 | { 175 | let mut buf : Vec<u8> = file.read_to_end().ok().expect("Could not read file."); 176 | let len = buf.len(); 177 | println!("Read {} bytes of file!", len); 178 | if len % 4 != 0 { 179 | println!("Invalid file size!"); 180 | return; 181 | } 182 | let data_ptr : *mut u32 = unsafe {mem::transmute(buf.as_mut_ptr())}; 183 | graph_data = unsafe { Vec::from_raw_buf(data_ptr, len / 4)}; 184 | } 185 | let mut graph = Graph { data: graph_data.as_mut_slice() }; 186 | println!("Read {} words of file!", graph.data.len()); 187 | println!("Total 
pages: {}", graph.page_count()); 188 | 189 | find_conn_components(&mut graph); 190 | 191 | // println!("Finding incoming links..."); 192 | // fill_incoming_links(&mut graph); 193 | // println!("Analyzing incoming links..."); 194 | // analyze_user_data(&graph); 195 | } 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Wikicrush 2 | ========= 3 | 4 | Extracts link graphs in a variety of formats from Wikipedia data dumps. 5 | This includes a highly compact binary graph format designed for very efficient graph searches. 6 | 7 | It can compress a recent 10GB compressed Wikipedia dump into a 630MB binary link graph and a 550MB sqlite database for translating article names into binary graph offsets. 8 | 9 | Wikicrush was created for use in [Rate With Science](http://github.com/trishume/ratewithscience) where it allows sub-second breadth-first searches through all of Wikipedia on a cheap VPS with 1GB of RAM. 10 | 11 | ### Getting the Data 12 | 13 | You can either run the process yourself and get all the files plus control over the source dump by following the steps at the bottom or you can use the download I have prepared. 14 | 15 | The download is a zip file containing just `xindex.db` and `indexbi.bin` and was generated from `enwiki-20150205-pages-articles.xml.bz2` (i.e the February 2015 english Wikipedia dump). The file is 740MB and can be downloaded here: [http://thume.net/bigdownloads/wikidata.zip](http://thume.net/bigdownloads/wikidata.zip). **Note:** This uses the old graph format v1, see the `v1` branch readme for the old format. I'll try and process another more recent wiki dump into the new format soon. 16 | 17 | 18 | # The Files 19 | 20 | ## Features 21 | - Relatively small binary graph data fits in memory allowing fast processing. 22 | - Format design allows tight loops without external table lookups. 23 | - Properly skips commented out links that don't show up on rendered Wikipedia pages. 24 | - All links are validated to only include ones that go to valid pages. 25 | - Link edges go through redirects transparently. 26 | - Link lists sorted with bidirectional edges first. 27 | - Provides space to store node data during graph algorithm processing. 28 | - Tested and verified to accurately capture the links over many weeks of bug fixing and use in Rate With Science. 29 | 30 | ## Primary Data 31 | 32 | ### indexbi.bin 33 | 34 | This is the most important and awesome file, the crown jewel of the Wikicrush project. It is a dense binary link graph that does not contain the titles of the articles and links to offsets within itself. This way one can run graph algorithms in tight loops without having to deal with strings and lookup tables. The graph transparently follows redirects in that if a page links to a redirect, it will be included in the file as a link to the page that the redirect goes to. Also note some pages link to themselves. 35 | 36 | The file is a big array of 32-bit (4 byte) little-endian integers. This should be convenient to load into a big int array in the language of your choice. 37 | 38 | The first 4 ints are the file header. First the version, next the total number of pages **P**, then 2 unused. 39 | After this is **P** page data sections, each page is placed one after another until the end of the file. 40 | 41 | ##### Pages 42 | Each page starts with a 4 int page header: 43 | 44 | 1. The first int is zero and is reserved for the user. 
I have used this for marking pages as seen and referencing the parent page during breadth-first-search path finding. This way no external data tables are necessary. Useful when you `read` the file into a mutable array in memory. 45 | 2. The number of links **N** that the page has. 46 | 3. The number of bidirectional links **B** the page has. These are links where the page being linked to also links back to this page. This generally implies a stronger connection between the topics of the two pages. 47 | 4. A metadata integer **M** with a bunch of bit fields and some zeroes that should be ignored for adding future metadata 48 | 49 | This header is followed by **N** ints containing the byte offsets of the pages linked to. The first **B** of these are the pages that also link back to this page. Note that the offsets are in *bytes* rather than ints so you may have to do some dividing by 4 when following these links to other pages in your data int array. 50 | 51 | The next page section starts after the **N** links. This allows one to iterate through all the pages by skipping **N** ints forwards. 52 | 53 | 54 | ##### Overall Structure 55 | 56 | In a wacky notation where `{}` denote logical sections that are really just adjacent in the file and each element is a 4-byte int the file looks like this: 57 | ```{{version, P, ?, ?}, {{0,N,B,M},{link, ...}},{{0,N,B,M},{link, ...}}, ...}``` 58 | See `analyze/graph.rb` for an example of how to use this file in Ruby or `analyze/strong_conn.rs` for a Rust example. 59 | 60 | ##### Metadata 61 | 32 bits of metadata packed into integer bit fields of **M**, from least significant bits to most significant: 62 | 63 | 3 bits = log10(length of article markup in bytes) 64 | 4 bits = min(number of words in title, 15) 65 | 1 bit = 1 if is a disambiguation page 66 | 3 bits = article namespace index in [normal, category, wikipedia, portal, book ... reserved for future ... 7=other namespace] 67 | 1 bit = 1 if page is a "List of" article 68 | 1 bit = 1 if page is a year 69 | The following bits are not set by this script but their places are reserved 70 | 1 bit = if the article is a featured article 71 | 1 bit = if the article is a "good" article 72 | (32-15=17) bits of zeroes reserved for future use 73 | 74 | Example: if you want to extract the article namespace number from an integer `m` you could use code like (C-style bitwise operations): 75 | 76 | ```c 77 | (m >> 8) & 0b111 // or 0x7 or just 7 78 | ``` 79 | 80 | Because the namespace field is offset (3+4+1)=8 bits from the start and is 3 bits long. 81 | 82 | ### xindex.db 83 | 84 | This is an Sqlite database with a single table containing 3 columns and a row for every article: 85 | ```sql 86 | create table pages ( 87 | title varchar(256) PRIMARY KEY, 88 | offset int 89 | ); 90 | CREATE INDEX pages_offset ON pages (offset); 91 | ``` 92 | `title` is the article name, `offset` is the byte offset in the `indexbi.bin` file. 93 | 94 | It is how one maps from article titles to offsets in the `indexbi.bin` and `index.bin` files and back again. 95 | It has indexes for both ways so is reasonably fast. It is used like this, at least in Ruby: 96 | ```ruby 97 | def title_to_offset(s) 98 | # Use COLLATE NOCASE if accepting human input and don't want case sensitivity 99 | rs = @db.execute("SELECT offset FROM pages WHERE title = ? LIMIT 1",s) 100 | return nil if rs.empty? 101 | rs.first.first 102 | end 103 | ``` 104 | 105 | Note that this table does not contain redirects, that is something that might come in a future version. 
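Going the other direction, from a byte offset back to a title, is symmetric. Here is a minimal sketch mirroring the `name` helper in `analyze/graph.rb` (the `offset_to_title` name is just for illustration):
```ruby
def offset_to_title(offset)
  # The pages_offset index keeps this reverse lookup fast.
  rs = @db.execute("SELECT title FROM pages WHERE offset = ? LIMIT 1", offset)
  return nil if rs.empty?
  rs.first.first
end
```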
106 | 107 | ### xindex-nocase.db 108 | 109 | Generated by running `rake nocase` this is the same as `xindex.db` except with an extra index created like this: 110 | 111 | create index pages_nocase on pages (title collate nocase); 112 | 113 | It is useful for interactive apps like [Rate With Science](http://github.com/trishume/ratewithscience) because it makes case insensitive `COLLATE NOCASE` queries much much faster. 114 | The cost is additional file size. 115 | 116 | ### links.txt 117 | 118 | This is a text file with a line for every article with the article name followed by a metadata column and then all the links it has separated by `|` characters. All links are with redirects already followed and all links are verified to point to a valid page and are unique-d (no link included more than once). This is the easiest file to work with for some cases but certainly not the most efficient. 119 | 120 | The metadata column currently contains the length of the page markup in bytes followed by a `-` and then a series of characters each of which represents a page tag. Currently the only tag is `D` which signifies a disambiguation page. 121 | 122 | Here's an example with many links truncated since these pages actually have hundreds of links: 123 | 124 | ``` 125 | A|2889-|Letter (alphabet)|Vowel|ISO basic Latin alphabet|Alpha|Italic type 126 | Achilles|2924-|Kantharos|Vulci|Cabinet des Médailles|Phthia|Thetis|Chiton (costume) 127 | ``` 128 | 129 | Note that this is meant to be parsed with a `split` operation and as such a page with no links is just the page name with no `|`. 130 | 131 | 132 | ## Intermediate Files 133 | These may be useful data but they are less polished than the primary files. They are used in the generation of the primary files. They are generally in easier formats (text) but contain gotchas that make them harder to work with like links to invalid pages. 134 | 135 | 136 | Except the lines are way way longer since articles often have hundreds of links. 137 | 138 | ### redirects.txt 139 | 140 | Text file containing one line for every redirect on Wikipedia. With the redirect followed by the page it redirects to separated by a `|`. Filtered to only include redirects where the target is a valid page and the source is not a valid page. 141 | 142 | ### titles.txt 143 | 144 | Contains the titles of all valid pages one per line. 145 | 146 | ### links-raw.txt and redirects-raw.txt 147 | 148 | These are the files produced directly from the wiki dump. 149 | They still contain `File:`, `Wikipedia:`, etc... pages. 150 | 151 | ### links-filt.txt 152 | 153 | Same as `links-raw.txt` but filtered through grep to weed out pages matching `^(File|Template|Wikipedia|Help|Draft):`. 154 | 155 | ### index.bin 156 | 157 | Same as `indexbi.bin` but without bidirectional links sorted first and with the **B** field set to `0`. 158 | The only point of using this file is if you don't want to bother generating `indexbi.bin`. 159 | 160 | ## Generating the Files 161 | 162 | 1. Install Ruby+Bundler and optionally [Nim](http://nim-lang.org/) to make one process WAY faster. 163 | 1. Git clone the latest Wikicrush 164 | 1. Run `bundle install` in the `wikicrush` directory. 165 | 1. Download the latest `enwiki--pages-articles.xml.bz2` 166 | 1. Symlink (or move) the dump into the `data` directory of your Wikicrush clone as `data/pages-articles.xml.bz2` 167 | 1. Run `rake data/indexbi.bin` in the `wikicrush` directory. 168 | 1. Wait somewhere between 12-48 hours depending on how fast your computer is. 
At times this will take up to 3GB of RAM and 15GB of hard drive space. 169 | 1. Tada, you have the data files! 170 | 171 | If you want more fine-grained control, you can ask Rake to generate the files one at a time and delete the intermediate files once you no longer need them to save disk space. 172 | Refer to the Rakefile for which files depend on which others. If one of the steps crashes due to lack of disk space or memory, delete the half-finished file it was working on, resolve the issue and re-run the rake command. 173 | 174 | You may also want to modify some of the steps, particularly the article filtering step, for example to exclude "List of X" articles. 175 | 176 | This also works for other wikis such as the Simple English Wikipedia. I use the Simple English dump for testing because it is much smaller, so all the steps run in minutes rather than hours, while still being in a language I understand. 177 | 178 | ## Tips for Use 179 | 180 | - Read the `indexbi.bin` file into a big int32 array. 181 | - Make good use of the user data storage in your algorithms. If you want to use an external table, simply fill the user data segments with incrementing indices into that table. 182 | - Try to only use the Sqlite file at the edges of your algorithm when communicating with the user. I translate the user input to offsets before I start and the graph algorithm output back to titles after I'm done, thus avoiding touching strings during processing. 183 | - Check out the code I've written that uses and generates these files. There's Ruby and Rust in this repo, and Nim and more Rust in my `ratewithscience` repo. 184 | --------------------------------------------------------------------------------
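As a worked example of those tips, here is a minimal breadth-first-search sketch over `indexbi.bin` using the `Graph` helper from `analyze/graph.rb`. It keeps the visited/parent map in a plain Ruby Hash rather than the in-file user-data field, assumes the script is saved in the repository root with the data files already generated, and the two article titles are only hypothetical examples:

```ruby
require_relative "analyze/graph.rb"

# Walk the link graph breadth-first from one title to another and return the
# path as a list of byte offsets into indexbi.bin, or nil if no path exists.
def bfs_path(graph, from_title, to_title)
  start  = graph.find(from_title)
  target = graph.find(to_title)
  return nil unless start && target

  parent = { start => nil } # offset => offset we reached it from
  queue  = [start]
  until queue.empty?
    page = queue.shift
    if page == target
      # Follow the parent chain back to the start to recover the path.
      path = []
      while page
        path.unshift(page)
        page = parent[page]
      end
      return path
    end
    graph.page_links(page).each do |link|
      next if parent.key?(link)
      parent[link] = page
      queue << link
    end
  end
  nil
end

f = File.open("data/indexbi.bin")
g = Graph.new(f, "data/xindex.db")
path = bfs_path(g, "Potato", "Stanford University")
puts(path ? path.map { |offset| g.name(offset) }.join(" -> ") : "No path found")
```

Translating titles to offsets only at the start and offsets back to titles only at the end keeps the inner loop free of string handling, which is what makes searches over the whole graph fast.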