├── data
│   └── .gitkeep
├── .gitignore
├── gen
│   ├── nocase.sql
│   ├── filtredirs.rb
│   ├── proclinks.rb
│   ├── sqlindex.rb
│   ├── doublelink.rb
│   ├── doublelink.nim
│   ├── dumplinks.rb
│   └── binindex.rb
├── Gemfile
├── Gemfile.lock
├── analyze
│   ├── invalid_links.rb
│   ├── inspect.rb
│   ├── link_stats.rb
│   ├── verify.rb
│   ├── graph.rb
│   └── strong_conn.rs
├── LICENSE
├── Rakefile
└── README.md
/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | bin 3 | gen/doublelink 4 | gen/nimcache -------------------------------------------------------------------------------- /gen/nocase.sql: -------------------------------------------------------------------------------- 1 | create index pages_nocase on pages (title collate nocase); 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem "sqlite3" 3 | gem "progress" 4 | gem "ox" 5 | gem "triez" 6 | gem "rake" 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | ox (2.1.4) 5 | progress (3.0.0) 6 | rake (10.3.2) 7 | sqlite3 (1.3.10) 8 | triez (1.0.4) 9 | 10 | PLATFORMS 11 | ruby 12 | 13 | DEPENDENCIES 14 | ox 15 | progress 16 | rake 17 | sqlite3 18 | triez 19 | -------------------------------------------------------------------------------- /analyze/invalid_links.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby invalid_links.rb path/to/links.txt path/to/titles.txt max" unless ARGV.length == 3 4 | links_path, titles_path, max = ARGV 5 | MAX_PAGES = max.to_i 6 | 7 | STDERR.puts "Building Validity Hash" 8 | valid = Triez.new value_type: :object 9 | IO.foreach(titles_path) do |l| 10 | valid[l.strip] = true 11 | end 12 | 13 | STDERR.puts "Analyzing Links" 14 | count = 0 15 | IO.foreach(links_path) do |line| 16 | page, *links = line.chomp.split('|').map{ |x| x.strip } 17 | invalid = links.uniq.reject { |l| valid.has_key?(l)} 18 | puts "# #{page}" 19 | invalid.each { |x| puts x } 20 | count += 1 21 | break if count > MAX_PAGES 22 | end 23 | -------------------------------------------------------------------------------- /gen/filtredirs.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby filtredirs.rb path/to/titles.txt path/to/redirects.txt path/to/put/redirects-filt.txt" unless ARGV.length == 3 4 | titles_path, redir_path, out_path = ARGV 5 | 6 | puts "Building Validity Hash" 7 | valid = Triez.new value_type: :object 8 | IO.foreach(titles_path) do |l| 9 | valid[l.chomp] = true 10 | end 11 | 12 | puts "Processing..." 
13 | out = File.open(out_path,'w') 14 | IO.foreach(redir_path) do |l| 15 | from,to = l.split('|').map { |x| x.strip } 16 | # This doesn't do anything, everything is capped 17 | # to[0] = to[0].capitalize 18 | next unless valid.has_key?(to) # points to valid thing 19 | next if valid.has_key?(from) # conflicts with real page 20 | out.puts "#{from}|#{to}" 21 | end 22 | out.close 23 | -------------------------------------------------------------------------------- /analyze/inspect.rb: -------------------------------------------------------------------------------- 1 | require_relative "graph.rb" 2 | 3 | def print_list(g,ls) 4 | p ls.map { |l| g.name(l) } 5 | end 6 | 7 | raise "Usage: inspect.rb path/to/index.bin path/to/xindex.db query" unless ARGV.length == 3 8 | 9 | f = File.open(ARGV[0]) 10 | g = Graph.new(f,ARGV[1]) 11 | 12 | query = ARGV[2].strip 13 | if query.to_i.to_s == query 14 | q = query.to_i 15 | else 16 | q = g.find(query) 17 | end 18 | raise "Could not find page" unless q 19 | 20 | puts "Name: #{g.name(q)}" 21 | puts "Index: #{q}" 22 | puts "Meta: #{g.meta(q).to_s(2).rjust(32,'0')}" 23 | puts "Link Count: #{g.link_count(q)}" 24 | puts "Bidirectional Links: #{g.bi_link_count(q)}" 25 | puts "Bidirectional Links:" 26 | print_list(g, g.page_bi_links(q)) 27 | links = g.page_links(q) 28 | puts "Outgoing Link Offsets:" 29 | p links 30 | puts "Outgoing Links:" 31 | print_list(g, links) 32 | -------------------------------------------------------------------------------- /gen/proclinks.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby proclinks.rb path/to/titles.txt path/to/redirects.txt path/to/links.txt path/to/put/links.txt" unless ARGV.length == 4 4 | titles_path, redir_path, links_path, out_path = ARGV 5 | 6 | puts "Building Validity Hash" 7 | valid = Triez.new value_type: :object 8 | IO.foreach(titles_path) do |l| 9 | valid[l.chomp] = true 10 | end 11 | puts "Building Redirect Hash" 12 | redirects = Triez.new value_type: :object 13 | IO.foreach(redir_path) do |l| 14 | key,val = l.chomp.split('|') 15 | redirects[key] = val 16 | end 17 | 18 | puts "Processing..." 19 | out = File.open(out_path,'w') 20 | IO.foreach(links_path) do |l| 21 | page,meta,*links = l.split('|').map { |x| x.strip } 22 | # next if page.length == 0 23 | links.reject! { |li| li.length == 0 } 24 | links.each do |li| 25 | li[0] = li[0].capitalize 26 | end 27 | links.map! { |li| redirects[li] || li } 28 | links = links.select { |li| valid.has_key?(li) }.uniq 29 | links.unshift(page,meta) 30 | out.puts links.join('|') 31 | end 32 | out.close 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Tristan Hume 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /analyze/link_stats.rb: -------------------------------------------------------------------------------- 1 | require "triez" 2 | 3 | raise "Usage: ruby link_stats.rb path/to/links.txt path/to/titles.txt" unless ARGV.length == 2 4 | links_path, titles_path = ARGV 5 | 6 | puts "Building Validity Hash" 7 | valid = Triez.new value_type: :object 8 | IO.foreach(titles_path) do |l| 9 | valid[l.strip] = true 10 | end 11 | 12 | puts "Analyzing Links" 13 | stats = Hash.new(0) 14 | IO.foreach(links_path) do |line| 15 | page, *links = line.chomp.split('|').map{ |x| x.strip } 16 | stats[:pages] += 1 17 | p [page,links,line] unless page 18 | if valid.has_key?(page) 19 | stats[:valid_pages] += 1 20 | stats[:links] += links.count 21 | 22 | links_uniq = links.uniq 23 | stats[:unique_links] += links_uniq.count 24 | stats[:valid_links] += links_uniq.count { |l| valid.has_key?(l)} 25 | end 26 | end 27 | stats[:valid_link_frac] = stats[:valid_links] / stats[:unique_links].to_f 28 | stats[:valid_page_frac] = stats[:valid_pages] / stats[:pages].to_f 29 | [:links,:unique_links,:valid_links].each do |stat| 30 | avg_name = ("average_" + stat.to_s).intern 31 | stats[avg_name] = stats[stat] / stats[:valid_pages].to_f 32 | end 33 | p stats 34 | -------------------------------------------------------------------------------- /analyze/verify.rb: -------------------------------------------------------------------------------- 1 | require "progress" 2 | require "sqlite3" 3 | 4 | class Parser 5 | FILE_HEADER_SIZE = 4*4 6 | LINK_SIZE = 4 7 | HEADER_SIZE = 4*4 8 | 9 | def initialize(f,db_path) 10 | @f = f 11 | @db = SQLite3::Database.new db_path 12 | end 13 | 14 | def run 15 | header = @f.read(FILE_HEADER_SIZE) 16 | num_pages = header.unpack("LLLL")[1] 17 | puts "Processing #{num_pages} pages." 18 | num_pages.times.with_progress do 19 | do_page 20 | end 21 | end 22 | 23 | def do_page 24 | this_page = @f.pos 25 | raise "Header fail at #{this_page}" unless get_int == 0 26 | raise "No entry in page DB for #{this_page}" unless name(this_page) 27 | num_links = get_int 28 | raise "Already processed" unless get_int == 0 29 | raise "Bad meta" unless get_int < 2**16 30 | borked_links = @f.read(4*num_links).unpack("L*").reject { |p| name(p) } 31 | raise "Borked links for #{this_page}: #{borked_links.inspect}" unless borked_links.empty? 32 | # (1..num_links).map {get_int} 33 | end 34 | 35 | private 36 | 37 | def get_int 38 | @f.read(4).unpack("L").first 39 | end 40 | 41 | def name(p) 42 | rs = @db.execute("SELECT title FROM pages WHERE offset = ? LIMIT 1",p) 43 | return nil if rs.empty? 
44 | # rs.first.first 45 | true 46 | end 47 | end 48 | 49 | raise "pass bin file to verify and db to check against" if ARGV.length != 2 50 | f = File.open(ARGV[0]) 51 | p = Parser.new(f, ARGV[1]) 52 | p.run 53 | -------------------------------------------------------------------------------- /analyze/graph.rb: -------------------------------------------------------------------------------- 1 | require "sqlite3" 2 | 3 | class File 4 | def each_chunk(chunk_size=1024*1024) 5 | yield read(chunk_size) until eof? 6 | end 7 | end 8 | 9 | class Graph 10 | HEADER_SIZE = 4 11 | def initialize(f,db_path, dbg = true) 12 | @debug = dbg 13 | debug "loading file" 14 | @d = [] 15 | f.each_chunk do |chunk| 16 | @d.concat(chunk.unpack("L*")) 17 | end 18 | @db = SQLite3::Database.new db_path 19 | end 20 | 21 | def at(p,i) 22 | @d[p/4+i] 23 | end 24 | 25 | def link_count(p) 26 | at(p,1) 27 | end 28 | 29 | def bi_link_count(p) 30 | at(p,2) 31 | end 32 | 33 | def meta(p) 34 | at(p,3) 35 | end 36 | 37 | def page_links(p) 38 | x = p/4 39 | c = @d[x+1] # link count 40 | @d[x+HEADER_SIZE..x+HEADER_SIZE+c-1] 41 | end 42 | 43 | def page_bi_links(p) 44 | x = p/4 45 | c = @d[x+2] # bi link count 46 | @d[x+HEADER_SIZE..x+HEADER_SIZE+c-1] 47 | end 48 | 49 | def page_un_links(p) 50 | x = p/4 51 | b = @d[x+2] # bi link count 52 | c = @d[x+1] # bi link count 53 | @d[x+HEADER_SIZE+b..x+HEADER_SIZE+c-1] 54 | end 55 | 56 | def name(p) 57 | rs = @db.execute("SELECT title FROM pages WHERE offset = ? LIMIT 1",p) 58 | return nil if rs.empty? 59 | rs.first.first 60 | end 61 | 62 | def find(s) 63 | rs = @db.execute("SELECT offset FROM pages WHERE title = ? LIMIT 1",s) 64 | return nil if rs.empty? 65 | rs.first.first 66 | end 67 | 68 | private 69 | 70 | def debug(msg) 71 | STDERR.puts msg if @debug 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /gen/sqlindex.rb: -------------------------------------------------------------------------------- 1 | require "sqlite3" 2 | require "fileutils" 3 | require "triez" 4 | 5 | class Parser 6 | FILE_HEADER_SIZE = 4*4 7 | LINK_SIZE = 4 8 | HEADER_SIZE = 4*4 9 | attr_accessor :pos 10 | 11 | def initialize(f,db_path) 12 | @f = f 13 | @pos = FILE_HEADER_SIZE 14 | @total = 0 15 | 16 | FileUtils.rm_f(db_path) 17 | @db = SQLite3::Database.new db_path 18 | @db.execute <<-SQL 19 | create table pages ( 20 | title varchar(256) PRIMARY KEY, 21 | offset int 22 | ); 23 | SQL 24 | @db.execute("CREATE INDEX pages_offset ON pages (offset)") 25 | @db.execute("PRAGMA synchronous = OFF;") 26 | end 27 | 28 | def finish 29 | @db.execute("PRAGMA synchronous = ON;") 30 | puts "Number of Pages: #{@total}" 31 | puts "File size: #{@pos}" 32 | end 33 | 34 | def document 35 | IO.foreach(@f).with_index do |l,i| 36 | page(l.chomp.split('|').map{ |x| x.strip }) 37 | print '.' 
if i % 1000 == 0 38 | end 39 | end 40 | 41 | def page(line) 42 | name = line[0] 43 | l = num_links(line) 44 | @db.execute("INSERT INTO pages (title, offset) VALUES (?,?)",[name,@pos]) 45 | @pos += HEADER_SIZE + LINK_SIZE*l 46 | @total += 1 47 | end 48 | 49 | def num_links(ls) 50 | # First is name, second is meta 51 | ls.length-2 52 | end 53 | end 54 | 55 | raise "Usage: ruby 3-sqlindex.rb path/to/links.txt path/to/put/xindex.db" unless ARGV.length == 2 56 | links_path, db_path = ARGV 57 | 58 | puts "Parsing" 59 | f = File.open(links_path) 60 | # f = STDIN 61 | p = Parser.new(f,db_path) 62 | p.document 63 | p.finish 64 | -------------------------------------------------------------------------------- /gen/doublelink.rb: -------------------------------------------------------------------------------- 1 | require "progress" 2 | 3 | class Parser 4 | FILE_HEADER_SIZE = 4*4 5 | LINK_SIZE = 4 6 | HEADER_SIZE = 4*4 7 | 8 | def initialize(f, out_path) 9 | @f = f 10 | @out = File.open(out_path,"w") 11 | end 12 | 13 | def run 14 | header = @f.read(FILE_HEADER_SIZE) 15 | num_pages = header.unpack("LLLL")[1] 16 | @out.write(header) 17 | num_pages.times.with_progress do 18 | do_page 19 | end 20 | end 21 | 22 | def do_page 23 | # puts "Doing page at #{@f.pos}" 24 | this_page = @f.pos 25 | links, meta = page_links 26 | double,single = links.partition { |l| bidirectional?(l,this_page)} 27 | output_page(double,single,meta) 28 | end 29 | 30 | def output_page(double,single, meta) 31 | total = double.length + single.length 32 | @out.write([0,total,double.length,meta].pack("LLL")) # header 33 | @out.write(double.pack("L*")) 34 | @out.write(single.pack("L*")) 35 | end 36 | 37 | def page_links 38 | user_data = get_int 39 | raise "Header fail at #{@f.pos - 4}: #{user_data} should be 0" unless user_data == 0 40 | num_links = get_int 41 | raise "Already processed" unless get_int == 0 42 | meta = get_int # metadata 43 | return ((1..num_links).map {get_int}), meta 44 | end 45 | 46 | private 47 | 48 | def get_int 49 | @f.read(4).unpack("L").first 50 | end 51 | 52 | def bidirectional?(page,other) 53 | old_pos = @f.pos 54 | @f.seek(page) 55 | links = page_links 56 | res = links.include?(other) 57 | @f.seek(old_pos) 58 | res 59 | end 60 | end 61 | raise "Usage: ruby 5-doublelink.rb path/to/index.bin path/to/put/newindex.bin" unless ARGV.length == 2 62 | bin_path, out_path = ARGV 63 | 64 | puts "Bidirectionally linking file." 65 | puts "This Ruby version is super slow. Install Nim, delete indexbi.bin and run again to get the super fast Nim version." 66 | f = File.open(bin_path) 67 | p = Parser.new(f, out_path) 68 | p.run 69 | -------------------------------------------------------------------------------- /gen/doublelink.nim: -------------------------------------------------------------------------------- 1 | import sequtils, queues, strutils, algorithm 2 | 3 | type 4 | Graph* = seq[int32] 5 | Page = int32 6 | 7 | const 8 | kPageUserDataField = 0 9 | kPageLinksField = 1 10 | kPageBidLinksField = 2 11 | kPageHeaderSize = 4 12 | kFirstPageIndex = 4 13 | 14 | proc offset*[A](some: ptr A; b: int): ptr A = 15 | result = cast[ptr A](cast[int](some) + (b * sizeof(A))) 16 | iterator iterPtr*[A](some: ptr A; num: int): A = 17 | for i in 0.. 
RAW_DUMP_PATH) 5 | dump = args[:dump_path] 6 | raise "#{dump} must exist" unless File.exist?(dump) 7 | sh "bzip2 -dc \"#{dump}\" | ruby gen/dumplinks.rb data/links-raw.txt data/redirects-raw.txt" 8 | end 9 | 10 | file "data/links-filt.txt" => ["data/links-raw.txt"] do 11 | sh "grep -Ev \"^(File|Template|Help|Draft):\" data/links-raw.txt > data/links-filt.txt" 12 | end 13 | 14 | file "data/titles.txt" => ["data/links-filt.txt"] do 15 | sh "cut -d'|' -f1 data/links-filt.txt > data/titles.txt" 16 | end 17 | 18 | file "data/redirects.txt" => ["data/links-raw.txt","data/titles.txt"] do 19 | ruby "gen/filtredirs.rb data/titles.txt data/redirects-raw.txt data/redirects.txt" 20 | end 21 | 22 | file "data/links.txt" => ["data/links-filt.txt","data/redirects.txt","data/titles.txt"] do 23 | ruby "gen/proclinks.rb data/titles.txt data/redirects.txt data/links-filt.txt data/links.txt" 24 | end 25 | 26 | file "data/xindex.db" => ["data/links.txt"] do 27 | ruby "gen/sqlindex.rb data/links.txt data/xindex.db" 28 | end 29 | 30 | file "data/index.bin" => ["data/links.txt","data/xindex.db"] do 31 | ruby "gen/binindex.rb data/links.txt data/xindex.db data/index.bin" 32 | end 33 | 34 | file "data/indexbi.bin" => ["data/index.bin"] do 35 | if system("which","nim") 36 | sh "nim c -d:release gen/doublelink.nim" 37 | sh "./gen/doublelink" 38 | else 39 | ruby "gen/doublelink.rb data/index.bin data/indexbi.bin" 40 | end 41 | end 42 | 43 | directory "bin" 44 | file "bin/strong_conn" => ["bin"] do 45 | sh "rustc -O -o bin/strong_conn analyze/strong_conn.rs" 46 | end 47 | 48 | task :verify => "data/index.bin" do 49 | ruby "analyze/verify.rb data/index.bin data/xindex.db" 50 | end 51 | 52 | task :inspect, :page do |t, args| 53 | ruby "analyze/inspect.rb data/indexbi.bin data/xindex.db \"#{args[:page]}\"" 54 | end 55 | 56 | task :link_stats do 57 | ruby "analyze/link_stats.rb data/links.txt data/titles.txt" 58 | end 59 | 60 | task :invalid_links do 61 | sh "ruby analyze/invalid_links.rb data/links.txt data/titles.txt 1000 > data/invalid-links.txt" 62 | end 63 | 64 | task :strong_conn => ["bin/strong_conn"] do 65 | sh "./bin/strong_conn data/index.bin" 66 | end 67 | 68 | task :nocase => ["data/xindex.db"] do 69 | cp "data/xindex.db", "data/xindex-nocase.db" 70 | sh "sqlite3 data/xindex-nocase.db < gen/nocase.sql" 71 | end 72 | -------------------------------------------------------------------------------- /gen/dumplinks.rb: -------------------------------------------------------------------------------- 1 | require 'ox' 2 | 3 | class Handler < ::Ox::Sax 4 | LINK_REGEX = /\[\[([^|\n\]]+)(?:\|[^\n\]]+)?\]\]|(<!--)|(-->)/ 5 | DIS_REGEX = /\{\{\s*((?:Disambiguation[^}]*)|Airport disambiguation|Biology disambiguation|Call sign disambiguation|Caselaw disambiguation|Chinese title disambiguation|Genus disambiguation|Geodis|Hndis|Hndis-cleanup|Hospital disambiguation|Letter disambiguation|Letter-NumberCombDisambig|Mathematical disambiguation|Mil-unit-dis|Numberdis|Phonetics disambiguation|Road disambiguation|School disambiguation|Species Latin name disambiguation|Wikipedia disambiguation|disambig|dab|disamb)\s*\}\}/i 6 | def initialize(link_file, redir_file) 7 | @link_file = File.open(link_file,"w") 8 | @redir_file = File.open(redir_file,"w") 9 | end 10 | 11 | def start_element(name) 12 | case name 13 | when :page 14 | @title = nil 15 | @links = [] 16 | @is_redirect = false 17 | @is_disambig = false 18 | when :redirect 19 | @is_redirect = true 20 | when :title 21 | @in_title = true 22 | when :text 23 | @in_text = true 24 | end 25 
| end 26 | def end_element(name) 27 | case name 28 | when :text 29 | do_page 30 | @in_text = false 31 | when :title 32 | @in_title = false 33 | end 34 | end 35 | def attr(name, value) 36 | if @is_redirect && name == :title 37 | @redirect = value 38 | end 39 | end 40 | def text(value) 41 | case 42 | when @in_title 43 | @title = value 44 | when @in_text 45 | @page_length = value.bytesize 46 | @real_text = true 47 | value.scan(LINK_REGEX) do |lin, op, clos| 48 | # p [lin,op,clos,@real_text] 49 | if lin && @real_text 50 | @links << lin if lin.length < 100 51 | elsif op 52 | @real_text = false 53 | elsif clos 54 | @real_text = true 55 | end 56 | end 57 | if DIS_REGEX =~ value 58 | @is_disambig = true 59 | end 60 | end 61 | end 62 | 63 | private 64 | 65 | def do_page 66 | return unless @title 67 | if @is_redirect 68 | do_redirect 69 | else 70 | do_real_page 71 | end 72 | end 73 | 74 | def do_real_page 75 | @link_file.puts "#{@title}|#{@page_length}-#{@is_disambig ? 'D' : ''}|#{@links.map{ |x| x.strip }.join('|')}" 76 | end 77 | 78 | def do_redirect 79 | return unless @redirect 80 | @redir_file.puts "#{@title}|#{@redirect}" 81 | end 82 | end 83 | 84 | raise "Usage: cat wikidump.xml | ruby 1-dumplinks.rb path/to/put/links.txt path/to/put/redirects.txt" unless ARGV.length == 2 85 | puts "Dumping links..." 86 | handler = Handler.new(ARGV[0],ARGV[1]) 87 | Ox.sax_parse(handler, STDIN) 88 | -------------------------------------------------------------------------------- /gen/binindex.rb: -------------------------------------------------------------------------------- 1 | require "sqlite3" 2 | require "triez" 3 | 4 | class Parser 5 | FILE_HEADER_SIZE = 4*4 6 | LINK_SIZE = 4 7 | HEADER_SIZE = 4*4 8 | NAMESPACES = {"Category" => 1, "Wikipedia" => 2, "Portal" => 3, "Book" => 4} 9 | attr_accessor :pos 10 | 11 | def initialize(f,db_path,bin_path) 12 | @f = f 13 | @db = SQLite3::Database.new db_path 14 | @out = File.open(bin_path,"w") 15 | file_header 16 | end 17 | 18 | def document 19 | IO.foreach(@f).with_index do |l,i| 20 | page(l.chomp.split('|')) 21 | if i % 10_000 == 0 22 | puts "#{(i/@total.to_f*100.0).round(3)}%" 23 | end 24 | end 25 | @out.close 26 | end 27 | 28 | def page(line) 29 | name, meta, *links = line 30 | fill(name, meta, links) 31 | end 32 | 33 | def file_header 34 | @total = @db.execute("SELECT count(*) FROM pages").first.first 35 | # version, num articles, header length, extra 36 | @out.write([2,@total,FILE_HEADER_SIZE,HEADER_SIZE].pack("L*")) 37 | end 38 | 39 | def fill(title, meta, ls) 40 | link_data = ls.map{ |l| get_offset(l)} 41 | @out.write([0,link_data.length,0, meta_bits(title, meta)].pack("LLLL")) # header 42 | @out.write(link_data.pack("L*")) 43 | end 44 | 45 | private 46 | 47 | def get_offset(name) 48 | rows = @db.execute("SELECT offset FROM pages WHERE title = ? LIMIT 1", name) 49 | rows.first.first 50 | end 51 | 52 | # 32 bits of metadata, packed like so (starting at the least significant bit): 53 | # 54 | # 3 bits = log10(length of article markup in bytes) 55 | # 4 bits = min(number of words in title, 15) 56 | # 1 bit = 1 if is a disambiguation page 57 | # 58 | # 3 bits = article namespace of [normal, category, wikipedia, portal, book ... potential others ... 
 7=other namespace] 59 | # 1 bit = 1 if page is a "List of" article 60 | # 1 bit = 1 if page is a year 61 | # The following bits are not set by this script but their places are reserved 62 | # 1 bit = if the article is a featured article 63 | # 1 bit = if the article is a "good" article 64 | # (32-15=17) bits of zeroes reserved for future use 65 | def meta_bits(name, meta) 66 | textlen_str, flags = meta.split('-') 67 | log_textlen = textlen_str.length - 1 # should be log10(textlen) 68 | raise "Out of range textlen" if log_textlen > 7 || log_textlen < 0 69 | title_words = [name.split.length,15].min 70 | 71 | if /^([A-Z][a-z]+):/ =~ name 72 | type = NAMESPACES[$1] || 7 73 | else 74 | type = 0 75 | end 76 | 77 | is_disambig = (!!flags) && flags.include?('D') 78 | is_list = name.start_with?("List of ") 79 | is_year = (/^[0-9]{1,4}$/ === name) 80 | 81 | pack_bits([[log_textlen, 3], [title_words, 4], [is_disambig, 1], [type, 3], [is_list, 1], [is_year, 1]]) 82 | end 83 | 84 | def pack_bits(arr) 85 | res = 0 86 | bits_so_far = 0 87 | arr.each do |n, num_bits| 88 | n = 1 if n == true 89 | n = 0 if n == false 90 | res = res | (n << bits_so_far) 91 | bits_so_far += num_bits 92 | end 93 | res 94 | end 95 | end 96 | 97 | raise "Usage: ruby binindex.rb path/to/links.txt path/to/xindex.db path/to/put/index.bin" unless ARGV.length == 3 98 | links_path, db_path, bin_path = ARGV 99 | 100 | f = File.open(links_path) 101 | p = Parser.new(f,db_path, bin_path) 102 | p.document 103 | -------------------------------------------------------------------------------- /analyze/strong_conn.rs: -------------------------------------------------------------------------------- 1 | #![feature(env)] 2 | #![feature(old_io)] 3 | #![feature(old_path)] 4 | #![feature(collections)] 5 | 6 | #![allow(dead_code)] 7 | 8 | use std::env; 9 | use std::mem; 10 | use std::old_io::File; 11 | 12 | static FILE_HEADER_SIZE : usize = 4*4; 13 | static PAGE_HEADER_SIZE : usize = 4; 14 | 15 | static PAGE_USER_DATA : usize = 0; 16 | static PAGE_LINKS : usize = 1; 17 | static PAGE_BID_LINKS : usize = 2; 18 | 19 | struct Graph<'a> { 20 | data : &'a mut [u32], 21 | } 22 | 23 | struct PageIter<'a> { 24 | g : &'a Graph<'a>, 25 | cur : usize, 26 | } 27 | 28 | impl<'a> Iterator for PageIter<'a> { 29 | type Item = usize; 30 | fn next(&mut self) -> Option<usize> { 31 | let next_page = self.cur + (PAGE_HEADER_SIZE+self.g.link_count(self.cur))*4; 32 | self.cur = next_page; 33 | if next_page >= self.g.data.len() * 4 { None } else { Some(next_page) } 34 | } 35 | } 36 | 37 | impl<'a> Graph<'a> { 38 | fn first_page(&self) -> usize { 39 | FILE_HEADER_SIZE 40 | } 41 | 42 | fn find_next(&self, page : usize) -> Option<usize> { 43 | let next_page = page + (PAGE_HEADER_SIZE+self.link_count(page))*4; 44 | if next_page >= self.data.len() * 4 { None } else { Some(next_page) } 45 | } 46 | 47 | fn find_next_unmarked(&self,start : usize) -> Option<usize> { 48 | let mut page = start; 49 | while self.user_data(page) != 0 { 50 | page = page + (PAGE_HEADER_SIZE+self.link_count(page))*4; 51 | if page >= self.data.len() * 4 { return None;} 52 | } 53 | Some(page) 54 | } 55 | 56 | fn pages(&self) -> PageIter { 57 | PageIter {g: self, cur: self.first_page()} 58 | } 59 | 60 | fn page_count(&self) -> u32 { 61 | self.data[1] 62 | } 63 | 64 | fn link_count(&self, page : usize) -> usize { 65 | self.data[page/4+PAGE_LINKS] as usize 66 | } 67 | 68 | fn bid_link_count(&self, page : usize) -> usize { 69 | self.data[page/4+PAGE_BID_LINKS] as usize 70 | } 71 | 72 | fn links(&'a self, page : usize) -> Vec<usize> { 73 | 
let start = page/4+PAGE_HEADER_SIZE; 74 | let end = start+self.link_count(page); 75 | let link_range = &self.data[start..end]; 76 | link_range.iter().map(|x| *x as usize).collect::<Vec<usize>>() 77 | } 78 | 79 | fn set_user_data(&mut self, page : usize, data : u32) { 80 | self.data[page/4+PAGE_USER_DATA] = data; 81 | } 82 | 83 | fn user_data(&self, page : usize) -> u32 { 84 | self.data[page/4+PAGE_USER_DATA] 85 | } 86 | } 87 | 88 | fn flood_fill(graph : &mut Graph, start_page : usize, mark : u32) -> u32 { 89 | assert!(mark != 0); 90 | let mut stack = vec![start_page]; 91 | let mut marked_count = 0; 92 | while !stack.is_empty() { 93 | let page = stack.pop().unwrap(); 94 | 95 | if graph.user_data(page) != 0 {continue;} 96 | graph.set_user_data(page,mark); // mark visited 97 | // println!("Visiting {} with {} links",page,graph.link_count(page)); 98 | marked_count += 1; 99 | 100 | for linked in graph.links(page) { 101 | // println!("Pushing link to {}", linked); 102 | stack.push(linked); 103 | } 104 | } 105 | marked_count 106 | } 107 | 108 | fn find_conn_components(graph : &mut Graph) { 109 | let mut start_page = graph.first_page(); 110 | let mut comp_count = 0; 111 | loop { 112 | let count = flood_fill(graph, start_page,1); 113 | if count > 100 { 114 | println!("Found a connected component of {} nodes out of {} pages = {}.", 115 | count,graph.page_count(),(count as f32 / graph.page_count() as f32)); 116 | } 117 | comp_count += 1; 118 | 119 | let next_page = graph.find_next_unmarked(start_page); 120 | match next_page { 121 | Some(page) => start_page = page, 122 | None => break, 123 | } 124 | } 125 | println!("Found {} components.",comp_count); 126 | } 127 | 128 | fn fill_incoming_links(graph : &mut Graph) { 129 | let mut page = graph.first_page(); 130 | // Increment link count on all linked to pages, then move to next 131 | loop { 132 | for linked in graph.links(page) { 133 | let incd = graph.user_data(linked)+1; 134 | graph.set_user_data(linked, incd); 135 | } 136 | 137 | match graph.find_next(page) { 138 | None => break, 139 | Some(new_page) => page = new_page, 140 | } 141 | } 142 | } 143 | 144 | static DATA_HIST_MAX : usize = 50; 145 | fn analyze_user_data(graph : &Graph) { 146 | let mut hist : Vec<u32> = vec![0; DATA_HIST_MAX]; 147 | for page in graph.pages() { 148 | let count = graph.user_data(page); 149 | if (count as usize) < DATA_HIST_MAX { 150 | hist[count as usize] += 1; 151 | } 152 | } 153 | println!("Incoming links:"); 154 | for c in 0..hist.len() { 155 | println!("{}: {}",c, hist[c]); 156 | } 157 | } 158 | 159 | fn main() { 160 | let args: Vec<String> = env::args().map(|x| x.to_string()).collect(); 161 | 162 | if args.len() != 2 { 163 | println!("Usage: ./strong_conn path/to/indexbi.bin"); 164 | env::set_exit_status(1); 165 | return; 166 | } 167 | 168 | let bin_path = Path::new(&args[1]); 169 | println!("Analyzing {}...",bin_path.display()); 170 | 171 | let mut file = File::open(&bin_path).ok().expect("Could not open graph file."); 172 | 173 | let mut graph_data : Vec<u32>; 174 | { 175 | let mut buf : Vec<u8> = file.read_to_end().ok().expect("Could not read file."); 176 | let len = buf.len(); 177 | println!("Read {} bytes of file!", len); 178 | if len % 4 != 0 { 179 | println!("Invalid file size!"); 180 | return; 181 | } 182 | let data_ptr : *mut u32 = unsafe {mem::transmute(buf.as_mut_ptr())}; 183 | graph_data = unsafe { Vec::from_raw_buf(data_ptr, len / 4)}; 184 | } 185 | let mut graph = Graph { data: graph_data.as_mut_slice() }; 186 | println!("Read {} words of file!", graph.data.len()); 187 | println!("Total 
pages: {}", graph.page_count()); 188 | 189 | find_conn_components(&mut graph); 190 | 191 | // println!("Finding incoming links..."); 192 | // fill_incoming_links(&mut graph); 193 | // println!("Analyzing incoming links..."); 194 | // analyze_user_data(&graph); 195 | } 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Wikicrush 2 | ========= 3 | 4 | Extracts link graphs in a variety of formats from Wikipedia data dumps. 5 | This includes a highly compact binary graph format designed for very efficient graph searches. 6 | 7 | It can compress a recent 10GB compressed Wikipedia dump into a 630MB binary link graph and a 550MB sqlite database for translating article names into binary graph offsets. 8 | 9 | Wikicrush was created for use in [Rate With Science](http://github.com/trishume/ratewithscience) where it allows sub-second breadth-first searches through all of Wikipedia on a cheap VPS with 1GB of RAM. 10 | 11 | ### Getting the Data 12 | 13 | You can either run the process yourself and get all the files plus control over the source dump by following the steps at the bottom or you can use the download I have prepared. 14 | 15 | The download is a zip file containing just `xindex.db` and `indexbi.bin` and was generated from `enwiki-20150205-pages-articles.xml.bz2` (i.e the February 2015 english Wikipedia dump). The file is 740MB and can be downloaded here: [http://thume.net/bigdownloads/wikidata.zip](http://thume.net/bigdownloads/wikidata.zip). **Note:** This uses the old graph format v1, see the `v1` branch readme for the old format. I'll try and process another more recent wiki dump into the new format soon. 16 | 17 | 18 | # The Files 19 | 20 | ## Features 21 | - Relatively small binary graph data fits in memory allowing fast processing. 22 | - Format design allows tight loops without external table lookups. 23 | - Properly skips commented out links that don't show up on rendered Wikipedia pages. 24 | - All links are validated to only include ones that go to valid pages. 25 | - Link edges go through redirects transparently. 26 | - Link lists sorted with bidirectional edges first. 27 | - Provides space to store node data during graph algorithm processing. 28 | - Tested and verified to accurately capture the links over many weeks of bug fixing and use in Rate With Science. 29 | 30 | ## Primary Data 31 | 32 | ### indexbi.bin 33 | 34 | This is the most important and awesome file, the crown jewel of the Wikicrush project. It is a dense binary link graph that does not contain the titles of the articles and links to offsets within itself. This way one can run graph algorithms in tight loops without having to deal with strings and lookup tables. The graph transparently follows redirects in that if a page links to a redirect, it will be included in the file as a link to the page that the redirect goes to. Also note some pages link to themselves. 35 | 36 | The file is a big array of 32-bit (4 byte) little-endian integers. This should be convenient to load into a big int array in the language of your choice. 37 | 38 | The first 4 ints are the file header. First the version, next the total number of pages **P**, then 2 unused. 39 | After this is **P** page data sections, each page is placed one after another until the end of the file. 40 | 41 | ##### Pages 42 | Each page starts with a 4 int page header: 43 | 44 | 1. The first int is zero and is reserved for the user. 
I have used this for marking pages as seen and referencing the parent page during breadth-first-search path finding. This way no external data tables are necessary. Useful when you `read` the file into a mutable array in memory. 45 | 2. The number of links **N** that the page has. 46 | 3. The number of bidirectional links **B** the page has. These are links where the page being linked to also links back to this page. This generally implies a stronger connection between the topics of the two pages. 47 | 4. A metadata integer **M** with a bunch of bit fields and some zeroes that should be ignored for adding future metadata 48 | 49 | This header is followed by **N** ints containing the byte offsets of the pages linked to. The first **B** of these are the pages that also link back to this page. Note that the offsets are in *bytes* rather than ints so you may have to do some dividing by 4 when following these links to other pages in your data int array. 50 | 51 | The next page section starts after the **N** links. This allows one to iterate through all the pages by skipping **N** ints forwards. 52 | 53 | 54 | ##### Overall Structure 55 | 56 | In a wacky notation where `{}` denote logical sections that are really just adjacent in the file and each element is a 4-byte int the file looks like this: 57 | ```{{version, P, ?, ?}, {{0,N,B,M},{link, ...}},{{0,N,B,M},{link, ...}}, ...}``` 58 | See `analyze/graph.rb` for an example of how to use this file in Ruby or `analyze/strong_conn.rs` for a Rust example. 59 | 60 | ##### Metadata 61 | 32 bits of metadata packed into integer bit fields of **M**, from least significant bits to most significant: 62 | 63 | 3 bits = log10(length of article markup in bytes) 64 | 4 bits = min(number of words in title, 15) 65 | 1 bit = 1 if is a disambiguation page 66 | 3 bits = article namespace index in [normal, category, wikipedia, portal, book ... reserved for future ... 7=other namespace] 67 | 1 bit = 1 if page is a "List of" article 68 | 1 bit = 1 if page is a year 69 | The following bits are not set by this script but their places are reserved 70 | 1 bit = if the article is a featured article 71 | 1 bit = if the article is a "good" article 72 | (32-15=17) bits of zeroes reserved for future use 73 | 74 | Example: if you want to extract the article namespace number from an integer `m` you could use code like (C-style bitwise operations): 75 | 76 | ```c 77 | (m >> 8) & 0b111 // or 0x7 or just 7 78 | ``` 79 | 80 | Because the namespace field is offset (3+4+1)=8 bits from the start and is 3 bits long. 81 | 82 | ### xindex.db 83 | 84 | This is an Sqlite database with a single table containing 3 columns and a row for every article: 85 | ```sql 86 | create table pages ( 87 | title varchar(256) PRIMARY KEY, 88 | offset int 89 | ); 90 | CREATE INDEX pages_offset ON pages (offset); 91 | ``` 92 | `title` is the article name, `offset` is the byte offset in the `indexbi.bin` file. 93 | 94 | It is how one maps from article titles to offsets in the `indexbi.bin` and `index.bin` files and back again. 95 | It has indexes for both ways so is reasonably fast. It is used like this, at least in Ruby: 96 | ```ruby 97 | def title_to_offset(s) 98 | # Use COLLATE NOCASE if accepting human input and don't want case sensitivity 99 | rs = @db.execute("SELECT offset FROM pages WHERE title = ? LIMIT 1",s) 100 | return nil if rs.empty? 101 | rs.first.first 102 | end 103 | ``` 104 | 105 | Note that this table does not contain redirects, that is something that might come in a future version. 
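Going the other direction, from a byte offset back to a title, is symmetric. Here is a minimal sketch mirroring the `name` helper in `analyze/graph.rb` (the `offset_to_title` name is just for illustration):
```ruby
def offset_to_title(offset)
  # The pages_offset index keeps this reverse lookup fast.
  rs = @db.execute("SELECT title FROM pages WHERE offset = ? LIMIT 1", offset)
  return nil if rs.empty?
  rs.first.first
end
```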
106 | 107 | ### xindex-nocase.db 108 | 109 | Generated by running `rake nocase` this is the same as `xindex.db` except with an extra index created like this: 110 | 111 | create index pages_nocase on pages (title collate nocase); 112 | 113 | It is useful for interactive apps like [Rate With Science](http://github.com/trishume/ratewithscience) because it makes case insensitive `COLLATE NOCASE` queries much much faster. 114 | The cost is additional file size. 115 | 116 | ### links.txt 117 | 118 | This is a text file with a line for every article with the article name followed by a metadata column and then all the links it has separated by `|` characters. All links are with redirects already followed and all links are verified to point to a valid page and are unique-d (no link included more than once). This is the easiest file to work with for some cases but certainly not the most efficient. 119 | 120 | The metadata column currently contains the length of the page markup in bytes followed by a `-` and then a series of characters each of which represents a page tag. Currently the only tag is `D` which signifies a disambiguation page. 121 | 122 | Here's an example with many links truncated since these pages actually have hundreds of links: 123 | 124 | ``` 125 | A|2889-|Letter (alphabet)|Vowel|ISO basic Latin alphabet|Alpha|Italic type 126 | Achilles|2924-|Kantharos|Vulci|Cabinet des Médailles|Phthia|Thetis|Chiton (costume) 127 | ``` 128 | 129 | Note that this is meant to be parsed with a `split` operation and as such a page with no links is just the page name with no `|`. 130 | 131 | 132 | ## Intermediate Files 133 | These may be useful data but they are less polished than the primary files. They are used in the generation of the primary files. They are generally in easier formats (text) but contain gotchas that make them harder to work with like links to invalid pages. 134 | 135 | 136 | Except the lines are way way longer since articles often have hundreds of links. 137 | 138 | ### redirects.txt 139 | 140 | Text file containing one line for every redirect on Wikipedia. With the redirect followed by the page it redirects to separated by a `|`. Filtered to only include redirects where the target is a valid page and the source is not a valid page. 141 | 142 | ### titles.txt 143 | 144 | Contains the titles of all valid pages one per line. 145 | 146 | ### links-raw.txt and redirects-raw.txt 147 | 148 | These are the files produced directly from the wiki dump. 149 | They still contain `File:`, `Wikipedia:`, etc... pages. 150 | 151 | ### links-filt.txt 152 | 153 | Same as `links-raw.txt` but filtered through grep to weed out pages matching `^(File|Template|Wikipedia|Help|Draft):`. 154 | 155 | ### index.bin 156 | 157 | Same as `indexbi.bin` but without bidirectional links sorted first and with the **B** field set to `0`. 158 | The only point of using this file is if you don't want to bother generating `indexbi.bin`. 159 | 160 | ## Generating the Files 161 | 162 | 1. Install Ruby+Bundler and optionally [Nim](http://nim-lang.org/) to make one process WAY faster. 163 | 1. Git clone the latest Wikicrush 164 | 1. Run `bundle install` in the `wikicrush` directory. 165 | 1. Download the latest `enwiki--pages-articles.xml.bz2` 166 | 1. Symlink (or move) the dump into the `data` directory of your Wikicrush clone as `data/pages-articles.xml.bz2` 167 | 1. Run `rake data/indexbi.bin` in the `wikicrush` directory. 168 | 1. Wait somewhere between 12-48 hours depending on how fast your computer is. 
At times this will take up to 3GB of RAM and 15GB of hard drive space. 169 | 1. Tada, you have the data files! 170 | 171 | If you want more fine-grained control, you can ask Rake to generate the files one at a time and delete the intermediate files once you no longer need them to save disk space. 172 | Refer to the Rakefile for which files depend on which others. If one of the steps crashes due to lack of disk space or memory, delete the half-finished file it was working on, resolve the issue and re-run the rake command. 173 | 174 | You may also want to modify some of the steps, particularly the article filtering step, for example to exclude "List of X" articles. 175 | 176 | This also works for other wikis such as the Simple English Wikipedia. I use the Simple English dump for testing because it is much smaller, so all the steps run in minutes rather than hours, while still being in a language I understand. 177 | 178 | ## Tips for Use 179 | 180 | - Read the `indexbi.bin` file into a big int32 array. 181 | - Make good use of the user data storage in your algorithms. If you want to use an external table, simply fill the user data segments with incrementing indices into that table. 182 | - Try to only use the Sqlite file at the edges of your algorithm when communicating with the user. I translate the user input to offsets before I start and the graph algorithm output back to titles after I'm done, thus avoiding touching strings during processing. 183 | - Check out the code I've written that uses and generates these files. There's Ruby and Rust in this repo, and Nim and more Rust in my `ratewithscience` repo. 184 | --------------------------------------------------------------------------------
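As a worked example of those tips, here is a minimal breadth-first-search sketch over `indexbi.bin` using the `Graph` helper from `analyze/graph.rb`. It keeps the visited/parent map in a plain Ruby Hash rather than the in-file user-data field, assumes the script is saved in the repository root with the data files already generated, and the two article titles are only hypothetical examples:

```ruby
require_relative "analyze/graph.rb"

# Walk the link graph breadth-first from one title to another and return the
# path as a list of byte offsets into indexbi.bin, or nil if no path exists.
def bfs_path(graph, from_title, to_title)
  start  = graph.find(from_title)
  target = graph.find(to_title)
  return nil unless start && target

  parent = { start => nil } # offset => offset we reached it from
  queue  = [start]
  until queue.empty?
    page = queue.shift
    if page == target
      # Follow the parent chain back to the start to recover the path.
      path = []
      while page
        path.unshift(page)
        page = parent[page]
      end
      return path
    end
    graph.page_links(page).each do |link|
      next if parent.key?(link)
      parent[link] = page
      queue << link
    end
  end
  nil
end

f = File.open("data/indexbi.bin")
g = Graph.new(f, "data/xindex.db")
path = bfs_path(g, "Potato", "Stanford University")
puts(path ? path.map { |offset| g.name(offset) }.join(" -> ") : "No path found")
```

Translating titles to offsets only at the start and offsets back to titles only at the end keeps the inner loop free of string handling, which is what makes searches over the whole graph fast.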