├── .gitignore ├── lib ├── heliotrope │ ├── gmail-dumper.rb │ ├── query.rb │ ├── zmbox.rb │ ├── person.rb │ ├── hooks.rb │ ├── mbox-splitter.rb │ ├── maildir-walker.rb │ ├── message-adder.rb │ ├── decoder.rb │ ├── imap-dumper.rb │ ├── message.rb │ └── meta-index.rb ├── heliotrope.rb └── heliotrope-client.rb ├── Rakefile ├── bin ├── heliotrope-upgrade-index ├── heliotrope-console ├── heliotrope-add-contacts ├── heliotrope-add ├── heliotrope-reindex ├── heliotrope-import └── heliotrope-server ├── README └── test └── test_heliotrope.rb /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | .*.swo 3 | -------------------------------------------------------------------------------- /lib/heliotrope/gmail-dumper.rb: -------------------------------------------------------------------------------- 1 | module Heliotrope 2 | class GMailDumper < IMAPDumper 3 | GMAIL_HOST = "imap.gmail.com" 4 | GMAIL_PORT = 993 5 | 6 | def initialize opts 7 | super opts.merge(:host => GMAIL_HOST, :port => GMAIL_PORT, :ssl => true, :folder => "none") 8 | end 9 | 10 | def folder 11 | folders = @imap.xlist "", "*" 12 | allmail = folders.find { |x| x.attr.include? :Allmail } 13 | raise "can't find the all-mail folder" unless allmail 14 | allmail.name 15 | end 16 | 17 | def imap_query_columns 18 | %w(UID FLAGS X-GM-LABELS BODY.PEEK[]) 19 | end 20 | 21 | ## we can figure out our own labels 22 | def can_provide_labels?; true end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/heliotrope/query.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | ## just a wrapper around a Whistlepig query. trying to protect the user from 4 | ## explicit whistlepig dependencies 5 | require 'whistlepig' 6 | 7 | module Heliotrope 8 | class Query 9 | class ParseError < StandardError; end 10 | 11 | def initialize field, query, q=nil 12 | @whistlepig_q = q || begin 13 | Whistlepig::Query.new(field, query).term_map { |f, t| t.downcase } 14 | rescue Whistlepig::ParseError => e 15 | raise ParseError, e.message 16 | end 17 | end 18 | 19 | attr_reader :whistlepig_q 20 | def clone; Query.new(nil, nil, @whistlepig_q.clone) end 21 | def and other; Query.new(nil, nil, @whistlepig_q.and(other)) end 22 | 23 | def original_query_s; @whistlepig_q.query end 24 | def parsed_query_s 25 | s = @whistlepig_q.to_s 26 | s.force_encoding(Encoding::UTF_8) if Decoder.in_ruby19_hell? 27 | s 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/heliotrope.rb: -------------------------------------------------------------------------------- 1 | module Heliotrope 2 | ## ruby 1.9 versions of this don't work with Timeout::timeout, so 3 | ## we use this ruby 1.8 backport. 4 | def popen3(*cmd) 5 | pw, pr, pe = IO::pipe, IO::pipe, IO::pipe # [0] = read, [1] = write 6 | 7 | pid = fork do 8 | fork do 9 | pw[1].close; STDIN.reopen pw[0]; pw[0].close 10 | pr[0].close; STDOUT.reopen pr[1]; pr[1].close 11 | pe[0].close; STDERR.reopen pe[1]; pe[1].close 12 | exec(*cmd) 13 | end 14 | exit!(0) 15 | end 16 | 17 | pw[0].close; pr[1].close; pe[1].close 18 | Process.waitpid pid 19 | pi = [pw[1], pr[0], pe[0]] 20 | pw[1].sync = true 21 | 22 | begin 23 | yield(*pi) 24 | ensure 25 | pi.each { |p| p.close unless p.closed? } 26 | end 27 | end 28 | 29 | module_function :popen3 30 | end 31 | 32 | require "heliotrope/message-adder" 33 | require "heliotrope/decoder" 34 | require "heliotrope/person" 35 | require "heliotrope/message" 36 | require "heliotrope/mbox-splitter" 37 | require "heliotrope/imap-dumper" 38 | require "heliotrope/gmail-dumper" 39 | require "heliotrope/maildir-walker" 40 | require "heliotrope/meta-index" 41 | require "heliotrope/zmbox" 42 | require "heliotrope/query" 43 | require "heliotrope/hooks" 44 | -------------------------------------------------------------------------------- /lib/heliotrope/zmbox.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | require 'stringio' 4 | require 'zlib' 5 | 6 | ## a simple mbox with compressed messages 7 | 8 | module Heliotrope 9 | class ZMBox 10 | def initialize fn 11 | @io = File.open fn, "a+:BINARY" 12 | end 13 | 14 | def add string 15 | buf = StringIO.new 16 | zbuf = Zlib::GzipWriter.new buf 17 | zbuf.write string 18 | zbuf.close 19 | 20 | @io.seek 0, IO::SEEK_END 21 | offset = @io.tell 22 | @io.write [buf.string.bytesize].pack("L") 23 | @io.write buf.string 24 | @io.flush 25 | 26 | #printf "; compressed %dk => %dk (%.0f%% compression rate) and wrote at offset %d\n", string.bytesize / 1024, buf.string.bytesize / 1024, 100.0 - (100.0 * buf.string.bytesize / rawbody.bytesize), offset 27 | offset 28 | end 29 | 30 | def read offset 31 | @io.seek offset 32 | size = @io.read(4).unpack("L").first 33 | buf = StringIO.new @io.read(size) 34 | z = Zlib::GzipReader.new(buf) 35 | string = z.read 36 | z.close 37 | 38 | ## these come back in the system encoding. GzipReader doesn't seem to take 39 | ## an encoding spec. sigh. they need to be ascii. 40 | string.force_encoding(Encoding::BINARY) if Decoder.in_ruby19_hell? 41 | string 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake/gempackagetask.rb' 3 | 4 | spec = Gem::Specification.new do |s| 5 | s.name = "heliotrope" 6 | s.version = "0.1" 7 | s.date = Time.now 8 | s.email = "wmorgan-heliotrope@masanjin.net" 9 | s.authors = ["William Morgan"] 10 | s.summary = "Heliotrope is a personal, threaded, search-centric email server." 11 | s.homepage = "http://sup.rubyforge.org" 12 | s.files = Dir["lib/*.rb"] + Dir["lib/heliotrope/*.rb"] 13 | s.executables = Dir["bin/*"].map { |f| File.basename f } 14 | s.rubyforge_project = "sup" 15 | s.description = "Heliotrope is a personal, threaded, search-centric email server." 16 | 17 | s.add_dependency 'trollop', '>= 1.16.2' 18 | s.add_dependency 'whistlepig', '>= 0.12' 19 | s.add_dependency 'rmail', '>= 1.0.0' 20 | s.add_dependency 'leveldb-ruby', '>= 0.13' 21 | 22 | s.add_dependency 'locale' 23 | 24 | s.add_dependency 'rest-client' 25 | s.add_dependency 'rack' 26 | s.add_dependency 'json' 27 | s.add_dependency 'sinatra' 28 | end 29 | 30 | task :rdoc do |t| 31 | sh "rdoc lib README.txt History.txt -m README.txt" 32 | end 33 | 34 | task :test do 35 | sh %!ruby -rubygems -Ilib:ext:bin:test test/test_heliotrope.rb! 36 | end 37 | 38 | Rake::GemPackageTask.new(spec) do |pkg| 39 | pkg.need_tar = true 40 | end 41 | 42 | # vim: syntax=ruby 43 | -------------------------------------------------------------------------------- /bin/heliotrope-upgrade-index: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # encoding: UTF-8 3 | 4 | require 'rubygems' 5 | require 'trollop' 6 | require "heliotrope" 7 | 8 | opts = Trollop::options do 9 | banner < String, :required => true 15 | end 16 | 17 | def upgrade__to_0_1 store, index, hooks 18 | $stderr.puts "Sorry! To upgrade to index version 0.1, you must reindex everything." 19 | end 20 | 21 | ### execution starts here ### 22 | 23 | store = LevelDB::DB.load File.join(opts.dir, "store") 24 | index = Whistlepig::Index.new File.join(opts.dir, "index", "whistlepig") 25 | hooks = Heliotrope::Hooks.new File.join(opts.dir, "hooks") 26 | begin 27 | Heliotrope::MetaIndex.new store, index, hooks 28 | puts "No upgrade needed." 29 | rescue Heliotrope::MetaIndex::VersionMismatchError => e 30 | begin 31 | hv = (e.have_version || "").gsub(/\W/, "_") 32 | wv = (e.want_version || "").gsub(/\W/, "_") 33 | method = "upgrade_#{hv}_to_#{wv}" 34 | $stderr.puts "Trying to upgrade from #{e.have_version.inspect} to #{e.want_version.inspect}." 35 | begin 36 | send method, store, index, hooks 37 | rescue NoMethodError 38 | abort "No upgrade possible." 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/heliotrope/person.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | module Heliotrope 4 | class Person 5 | def initialize name, email, handle 6 | @name = name 7 | @email = email 8 | @handle = handle 9 | end 10 | 11 | attr_reader :name, :email, :handle 12 | 13 | def to_email_address 14 | qname = name =~ /"/ ? name.inspect : name 15 | [qname, "<#{email}>"].compact.join(" ") 16 | end 17 | 18 | def display_name; name || handle || email end 19 | 20 | ## takes a string, returns a [name, email, emailnodomain] combo 21 | ## e.g. for William Morgan , returns 22 | ## ["William Morgan", wmorgan@example.com, wmorgan] 23 | def self.from_string string # ripped from sup 24 | return if string.nil? || string.empty? 25 | 26 | name, email, handle = case string 27 | when /^(["'])(.*?[^\\])\1\s*<((\S+?)@\S+?)>/ 28 | a, b, c = $2, $3, $4 29 | a = a.gsub(/\\(["'])/, '\1') 30 | [a, b, c] 31 | when /(.+?)\s*<((\S+?)@\S+?)>/ 32 | [$1, $2, $3] 33 | when /<((\S+?)@\S+?)>/ 34 | [nil, $1, $2] 35 | when /((\S+?)@\S+)/ 36 | [nil, $1, $2] 37 | else 38 | [nil, string, nil] # i guess... 39 | end 40 | 41 | Person.new name, email, handle 42 | end 43 | 44 | def self.many_from_string string 45 | return [] if string.nil? || string !~ /\S/ 46 | emails = string.gsub(/[\t\r\n]+/, " ").split(/,\s*(?=(?:[^"]*"[^"]*")*(?![^"]*"))/) 47 | emails.map { |e| from_string e }.compact 48 | end 49 | 50 | def indexable_text; [name, email, handle].join(" ") end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/heliotrope/hooks.rb: -------------------------------------------------------------------------------- 1 | require 'fileutils' 2 | 3 | module Heliotrope 4 | 5 | ## largely cribbed from sup 6 | class Hooks 7 | class Env 8 | def initialize name 9 | @__name = name 10 | @__cache = {} 11 | end 12 | 13 | def __run __hook, __filename, __locals 14 | __binding = binding 15 | __lprocs, __lvars = __locals.partition { |k, v| v.is_a?(Proc) } 16 | eval __lvars.map { |k, v| "#{k} = __locals[#{k.inspect}];" }.join, __binding 17 | ## we also support closures for delays evaluation. unfortunately 18 | ## we have to do this via method calls, so you don't get all the 19 | ## semantics of a regular variable. not ideal. 20 | __lprocs.each do |k, v| 21 | self.class.instance_eval do 22 | define_method k do 23 | @__cache[k] ||= v.call 24 | end 25 | end 26 | end 27 | ret = eval __hook, __binding, __filename 28 | @__cache = {} 29 | ret 30 | end 31 | end 32 | 33 | def initialize dir 34 | @dir = dir 35 | @hooks = {} 36 | @envs = {} 37 | 38 | FileUtils.mkdir_p dir unless File.exists? dir 39 | end 40 | 41 | def run name, locals={} 42 | hook = hook_for(name) or return 43 | env = @envs[hook] ||= Env.new(name) 44 | 45 | result = nil 46 | fn = fn_for name 47 | begin 48 | result = env.__run hook, fn, locals 49 | rescue Exception => e 50 | $stderr.puts "error running #{fn}: #{e.message}" 51 | $stderr.puts e.backtrace.join("\n") 52 | @hooks[name] = nil # disable it 53 | end 54 | result 55 | end 56 | 57 | def enabled? name; hook_for name end 58 | 59 | private 60 | 61 | def hook_for name 62 | @hooks[name] ||= begin 63 | IO.read fn_for(name) 64 | rescue SystemCallError => e 65 | #$stderr.puts "can't read hook: #{e.message}" 66 | nil 67 | end 68 | end 69 | 70 | def fn_for name 71 | File.join @dir, "#{name}.rb" 72 | end 73 | end 74 | 75 | end 76 | -------------------------------------------------------------------------------- /bin/heliotrope-console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'trollop' 5 | require 'whistlepig' 6 | require 'irb' 7 | 8 | require "heliotrope" 9 | 10 | module IRB 11 | def IRB.start_with_binding binding 12 | IRB.setup __FILE__ 13 | w = WorkSpace.new binding 14 | irb = Irb.new w 15 | @CONF[:MAIN_CONTEXT] = irb.context 16 | irb.eval_input 17 | end 18 | end 19 | 20 | opts = Trollop::options do 21 | banner < true, :type => String 27 | end 28 | 29 | include Heliotrope 30 | store = LevelDB::DB.new File.join(opts.dir, "store") 31 | index = Whistlepig::Index.new File.join(opts.dir, "index", "whistlepig") 32 | hooks = Hooks.new File.join(opts.dir, "hooks") 33 | metaindex = MetaIndex.load_or_die! store, index, hooks 34 | zmbox = ZMBox.new File.join(opts.dir, "messages") 35 | 36 | puts < num 65 | end 66 | end 67 | 68 | def done?; @stream.eof? end 69 | def finish! 70 | @stream.close 71 | { "last_offset" => @last_offset } 72 | end 73 | 74 | private 75 | 76 | ## total hack. but all such things are. 77 | def is_mbox_break_line? l 78 | l[0, 5] == "From " or return false # quick check 79 | l =~ BREAK_RE or return false # longer check 80 | true 81 | end 82 | end 83 | end 84 | -------------------------------------------------------------------------------- /lib/heliotrope/maildir-walker.rb: -------------------------------------------------------------------------------- 1 | require 'time' 2 | 3 | module Heliotrope 4 | class MaildirWalker 5 | def initialize dirs 6 | @dirs = dirs 7 | @last_file_read = nil 8 | end 9 | 10 | def can_provide_labels?; true end 11 | def load! state 12 | @files = get_files 13 | if state 14 | @last_file_read = state["last_file_read"] 15 | if @last_file_read 16 | index = @files.index @last_file_read 17 | if index 18 | @files = @files[(index + 1) .. -1] || [] 19 | end 20 | end 21 | end 22 | end 23 | 24 | def each_message 25 | until done? 26 | fn = @files.shift 27 | message = IO.read fn 28 | yield message, ["inbox"], state_from_filename(fn), fn 29 | @last_file_read = fn 30 | end 31 | end 32 | 33 | def skip! num 34 | @files = @files[num .. -1] 35 | end 36 | 37 | def done? 38 | @files ||= get_files 39 | @files.empty? 40 | end 41 | 42 | def finish! 43 | { "last_file_read" => @last_file_read } #state 44 | end 45 | 46 | private 47 | 48 | def state_from_filename fn 49 | state = [] 50 | flags = if fn =~ /\,([\w]+)$/ 51 | $1.split(//) 52 | else 53 | [] 54 | end 55 | 56 | state << "unread" unless flags.member?("S") 57 | state << "starred" if flags.member?("F") 58 | state << "deleted" if flags.member?("T") 59 | state << "draft" if flags.member?("D") 60 | state 61 | end 62 | 63 | def get_files 64 | puts "; scanning #{@dirs.size} directories..." 65 | dirs = @dirs.map { |d| d.gsub(/([\*\?\[\]])/, '\\\\\1') } # have to escape for globbing 66 | files = dirs.map { |dir| Dir[File.join(dir, "cur", "*")] + Dir[File.join(dir, "new", "*")] }.flatten.sort 67 | puts "; found #{files.size} messages" 68 | files.sort_by { |fn| File.mtime(fn) } 69 | end 70 | 71 | def get_date_in_file fn 72 | File.open(fn, "r:BINARY") do |f| 73 | while(l = f.gets) 74 | if l =~ /^Date:\s+(.+\S)\s*$/ 75 | return begin 76 | Time.parse($1) 77 | rescue 78 | Time.at 0 79 | end 80 | end 81 | end 82 | end 83 | ## spam message don't have date headers 84 | # puts "; warning: no date in #{fn}" 85 | end 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /bin/heliotrope-add-contacts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'rubygems' 4 | require 'trollop' 5 | require 'whistlepig' 6 | require 'irb' 7 | 8 | require "heliotrope" 9 | 10 | opts = Trollop::options do 11 | banner < true, :type => String 21 | opt :user_email_addresses, "A comma-delimited list of email addresses that belong to you. This is not necessary, but will improve the contact collection process.", :default => "" 22 | end 23 | 24 | include Heliotrope 25 | store = LevelDB::DB.new File.join(opts.dir, "store") 26 | index = Whistlepig::Index.new File.join(opts.dir, "index", "whistlepig") 27 | hooks = Hooks.new File.join(opts.dir, "hooks") 28 | metaindex = MetaIndex.load_or_die! store, index, hooks 29 | zmbox = ZMBox.new File.join(opts.dir, "messages") 30 | 31 | email_addresses = opts.user_email_addresses.split "," 32 | 33 | doc_id = 0 34 | startt = lastt = Time.now 35 | puts "Scanning..." 36 | while true 37 | doc_id += 1 38 | metainfo = metaindex.load_messageinfo doc_id 39 | break unless metainfo 40 | next if metainfo[:state].member?("spam") || metainfo[:state].member?("deleted") || 41 | metainfo[:labels].member?("spam") || metainfo[:labels].member?("deleted") 42 | 43 | rawbody = zmbox.read metainfo[:loc] 44 | rawbody.force_encoding "binary" if rawbody.respond_to?(:force_encoding) # sigh... 45 | message = Heliotrope::Message.new(rawbody).parse! 46 | 47 | if email_addresses.include?(message.from.email) # from you 48 | (message.to + message.cc + message.bcc).each do |contact| 49 | new = metaindex.touch_contact! contact, message.date 50 | puts "Adding contact you emailed: #{contact.to_email_address}" if new 51 | end 52 | elsif !message.is_list_or_automated_email? 53 | new = metaindex.touch_contact! message.from, message.date 54 | puts "Adding contact who emailed you: #{message.from.to_email_address}" if new 55 | end 56 | 57 | if (Time.now - lastt) > 5 58 | elapsed = Time.now - startt 59 | printf "; scanned %d messages in %.1fs = %.1f m/s\n", doc_id, elapsed, doc_id / elapsed 60 | lastt = Time.now 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/heliotrope/message-adder.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'json' 3 | require "heliotrope" 4 | 5 | ## all the common functionality between heliotrope-add and -import 6 | module Heliotrope 7 | class MessageAdder 8 | def initialize opts 9 | @source = if opts.mbox_fn 10 | Heliotrope::MboxSplitter.new opts.mbox_fn 11 | elsif opts.maildir_dirs 12 | Heliotrope::MaildirWalker.new opts.maildir_dirs 13 | elsif opts.imap_host 14 | port = opts.imap_port || (opts.dont_use_ssl ? 143 : 993) 15 | username = opts.imap_username || ask("IMAP username: ") 16 | password = opts.imap_password || ask_secret("IMAP password: ") 17 | Heliotrope::IMAPDumper.new :host => opts.imap_host, :port => port, :ssl => !opts.dont_use_ssl, :username => username, :password => password, :folder => opts.imap_folder 18 | elsif opts.gmail_username 19 | username = opts.gmail_username || ask("GMail username: ") 20 | password = opts.gmail_password || ask_secret("GMail password: ") 21 | Heliotrope::GMailDumper.new :username => username, :password => password 22 | else 23 | Heliotrope::MBoxStream.new $stdin 24 | end 25 | @opts = opts 26 | end 27 | 28 | def each_message 29 | num_scanned = num_indexed = num_bad = num_seen = 0 30 | startt = lastt = Time.now 31 | state = if @opts.state_file && File.exist?(@opts.state_file) 32 | puts "Loading state..." 33 | JSON.parse IO.read(@opts.state_file) 34 | end 35 | 36 | puts "Loading mail source..." 37 | @source.load! state 38 | @source.skip! @opts.num_skip if @opts.num_skip 39 | 40 | puts "Adding mail..." 41 | begin 42 | @source.each_message do |rawbody, labels, state, desc| 43 | break if @opts.num_messages && (num_scanned >= @opts.num_messages) 44 | num_scanned += 1 45 | 46 | ## try to avoid being fucked by ruby 1.9 47 | rawbody.force_encoding("binary") if rawbody.respond_to?(:force_encoding) 48 | 49 | ## if the source can't set its own labels, we will just add everything to 50 | ## the inbox 51 | unless @source.can_provide_labels? 52 | labels += %w(inbox) 53 | state += %w(unread) 54 | end 55 | puts "; adding #{desc} with labels {#{labels.join ", "}} and state {#{state.join ", "}}" if @opts.verbose 56 | 57 | seen, indexed, bad = yield rawbody, state, labels 58 | num_seen += 1 if seen 59 | num_indexed += 1 if indexed 60 | num_bad += 1 if bad 61 | 62 | if (Time.now - lastt) > 5 # seconds 63 | elapsed = Time.now - startt 64 | printf "; scanned %d, indexed %d, skipped %d bad and %d seen messages in %.1fs = %.1f m/s\n", num_scanned, num_indexed, num_bad, num_seen, elapsed, num_scanned / elapsed 65 | lastt = Time.now 66 | end 67 | end 68 | ensure 69 | state = @source.finish! 70 | if @opts.state_file 71 | puts "Saving state..." 72 | File.open(@opts.state_file, "w") { |f| f.puts state.to_json } 73 | end 74 | end 75 | 76 | elapsed = Time.now - startt 77 | printf "; scanned %d, indexed %d, skipped %d bad and %d seen messages in %.1fs = %.1f m/s\n", num_scanned, num_indexed, num_bad, num_seen, elapsed, num_scanned / elapsed 78 | 79 | puts "Done." 80 | end 81 | 82 | private 83 | 84 | def ask q 85 | print q 86 | $stdout.flush 87 | (gets || abort).chomp 88 | end 89 | 90 | def ask_secret q 91 | begin 92 | `stty -echo` 93 | ask q 94 | ensure 95 | `stty echo` 96 | end 97 | end 98 | end 99 | end 100 | -------------------------------------------------------------------------------- /bin/heliotrope-add: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # encoding: UTF-8 3 | 4 | require 'rubygems' 5 | require 'trollop' 6 | require 'curb' 7 | require 'uri' 8 | require 'cgi' 9 | require "heliotrope" 10 | require 'json' 11 | 12 | opts = Trollop::options do 13 | banner < "localhost", :short => "H" 26 | opt :port, "Heliotrope server port", :default => 8042 27 | opt :num_messages, "Index at most this many messages and then quit", :type => Integer, :short => "n" 28 | opt :num_skip, "Skip this many many messages and then start adding", :type => Integer, :short => "k" 29 | opt :state_file, "Store state to, and read state from, this file. This enables successive runs against the same source to pick up only what's changed.", :type => String 30 | opt :add_labels, "Add these labels to every message (should be a comma-separated list)", :type => String 31 | opt :remove_labels, "Do not add any of these labels to any message (should be a comma-separated list)", :type => String 32 | opt :verbose, "Enable verbose output" 33 | 34 | banner < String, :short => "m" 39 | opt :mbox_start_offset, "Start file offset for scanning", :default => 0, :short => "s" 40 | banner < :strings 45 | banner < String 50 | opt :imap_port, "IMAP server port (default: 993 with ssl, 143 without)", :type => Integer 51 | opt :dont_use_ssl, "Don't use SSL" 52 | opt :imap_username, "IMAP username (default: prompt)", :type => String 53 | opt :imap_password, "IMAP password (default: prompt)", :type => String 54 | opt :imap_folder, "IMAP folder", :default => "INBOX" 55 | 56 | banner < String 61 | opt :gmail_password, "GMail password (default: prompt)", :type => String 62 | 63 | banner < rawbody, :state => state.to_json, :labels => labels.to_json } 85 | response = Curl::Easy.http_post server_url, params.map { |k, v| "#{k}=#{CGI.escape v}" }.join("&") 86 | if response.response_code != 200 87 | Trollop::die "Unexpected HTTP response code #{response.response_code} posting to #{server_url}" 88 | end 89 | response = response.body_str 90 | response = JSON.parse response 91 | 92 | if response["response"] == "ok" 93 | if response["status"] == "seen" 94 | seen = true 95 | else 96 | indexed = true 97 | end 98 | else 99 | bad = true 100 | ## everybody has broken messages, nobody things they do. don't cause panic. 101 | ## puts "Error for message at offset #{offset}: " + response["error_message"] 102 | end 103 | 104 | [seen, indexed, bad] 105 | end 106 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Heliotrope is a personal email server. It provides all the functionality you 2 | want in a modern email client: 3 | 4 | - proper message threading 5 | - labels 6 | - fast, full-text search over all messages with a complete query language 7 | - support for signed and encrypted email 8 | - an extensible JSON-over-HTTP API 9 | 10 | Heliotrope is a backend service against which email clients / MUAs can be 11 | written. To use it, you must use a client. For an example client, see 12 | Turnsole: http://github.com/wmorgan/turnsole. 13 | 14 | WHY ANOTHER PROTOCOL? WHY NOT JUST USE IMAP? 15 | 16 | Because IMAP is terrible and you want all those features listed above. 17 | 18 | REQUIREMENTS 19 | 20 | Heliotrope is written in a mixture of Ruby and C. You will need a working C 21 | compiler, and a modern Ruby. 1.8.7 or 1.9.2 are good choices. Ruby 1.9.2 22 | will give you better performance and better i18n support. 23 | 24 | Heliotrope uses html2text to generate text from HTML attachments. You will 25 | need this or some reasonable alternative. 26 | 27 | PREPARING YOURSELF 28 | 29 | 0. Find your happy place. 30 | 1. Install html2text. On Debian-based Linux distros, it's `apt-get 31 | install html2text`. On OS X, `port install html2text` and `brew install 32 | html2text` have both been known to work. 33 | 2. Install a modern Ruby. I recommend 1.9.3. On Debian-based Linux distros, 34 | you can try the ruby1.9.1 package. You can also download and compile it from 35 | http://www.ruby-lang.org/en/downloads/. For Debian-based Linux distros, 36 | make sure you have the libreadline-dev and zlib1g-dev packages installed. 37 | 3. Run `gem install trollop whistlepig rest-client sinatra rmail leveldb-ruby locale lrucache curb` 38 | 39 | GETTING IT 40 | 41 | Once Heliotrope is more stable, you will be able to install it via rubygems. 42 | For you, you should run this: 43 | 44 | git clone https://github.com/wmorgan/heliotrope.git 45 | 46 | RUNNING IT 47 | 48 | Decide where you want to store everything. Call this directory . 49 | Now run `ruby -Ilib bin/heliotrope-server -d `. 50 | 51 | Heliotrope exposes a simple debugging HTML interface. If you point your 52 | browser to http://localhost:8042/, you should see your empty mailstore. 53 | Congrats! You're running Heliotrope. 54 | 55 | IMPORTING EXISTING MAIL STORES 56 | 57 | To bulk import mail, use heltrope-import. You must stop your server first. 58 | 59 | To add a pre-existing mbox: 60 | ruby -Ilib bin/heliotrope-import -m -d 61 | 62 | To add a pre-existing maildir: 63 | ruby -Ilib bin/heliotrope-import -a -d 64 | 65 | To add a pre-existing Gmail account: 66 | ruby -Ilib bin/heliotrope-import -g -d 67 | 68 | To add a pre-existing, non-Gmail IMAP account: 69 | ruby -Ilib bin/heliotrope-import -i -d 70 | 71 | For testing purposes, you may want to limit the number of emails added by 72 | using the --num-messages option. If you additionally use the --state-file 73 | option to save state, successive invocations of heliotrope-import will 74 | resume where the stopped. 75 | 76 | After import, start the server again. You should see stuff at 77 | http://localhost:8042/. 78 | 79 | For a full client, see https://github.com/wmorgan/turnsole/. 80 | 81 | ADDING EMAIL 82 | 83 | Once the server is running, you can use heliotrope-add to add individual 84 | emails as they arrive. Unlike heliotrope-import, heliotrope-add talks to 85 | your running server. 86 | 87 | To add a single email: 88 | cat email.txt | ruby -Ilib bin/heliotrope-add 89 | 90 | To add messages from existing GMail, IMAP, mbox or maildir sources, use the 91 | same arguments as to heliotrope-import above. Note that if you specify 92 | --state-file, every run of heliotrope-add will pull in any new messages from 93 | the source. This makes it easy to mirror an existing account. 94 | 95 | For example, the following bash script: 96 | 97 | while true; do 98 | ruby -Ilib bin/heliotrope-import -g -d 99 | sleep 300 100 | done 101 | 102 | will mirror mail from a GMail account into Heliotrope with a 5-minute poll 103 | time. 104 | 105 | JSON API SPEC 106 | ------------- 107 | 108 | Heliotrope features an complete JSON-over-HTTP API. Documentation coming 109 | soon! For now, you can reverse engineer lib/heliotrope-client.rb. 110 | -------------------------------------------------------------------------------- /lib/heliotrope-client.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'curb' 3 | require 'uri' 4 | require 'json' 5 | require 'set' 6 | require 'lrucache' 7 | 8 | class HeliotropeClient 9 | class Error < StandardError; end 10 | 11 | MESSAGE_MUTABLE_STATE = Set.new %w(starred unread deleted) 12 | MESSAGE_IMMUTABLE_STATE = Set.new %w(attachment signed encrypted draft sent) 13 | MESSAGE_STATE = MESSAGE_MUTABLE_STATE + MESSAGE_IMMUTABLE_STATE 14 | 15 | attr_reader :url 16 | def initialize url 17 | @url = url 18 | @cache = LRUCache.new :max_size => 100 19 | @curl = Curl::Easy.new 20 | end 21 | 22 | def search query, num=20, start=0 23 | v = get_json "search", :q => query, :start => start, :num => num 24 | v["results"] 25 | end 26 | 27 | def count query 28 | get_json("count", :q => query)["count"] 29 | end 30 | 31 | def thread id; get_json("thread/#{id}")["messageinfos"] end 32 | def threadinfo id; get_json("thread/#{id}/info") end 33 | 34 | def messageinfos id 35 | @cache[[:message_info, id]] ||= get_json("message/#{id}", :only_infos => true) 36 | end 37 | 38 | def message id, mime_type_pref="text/plain" 39 | @cache[[:message, id, mime_type_pref]] ||= get_json("message/#{id}", :mime_type_pref => mime_type_pref) 40 | end 41 | 42 | def send_message message, opts={} 43 | opts[:labels] ||= [] 44 | opts[:state] ||= [] 45 | post_json "message/send", :message => message, :labels => opts[:labels].to_json, :state => opts[:state].to_json 46 | end 47 | 48 | def add_message message, opts={} 49 | opts[:labels] ||= [] 50 | opts[:state] ||= [] 51 | post_json "message", :message => message, :labels => opts[:labels].to_json, :state => opts[:state].to_json 52 | end 53 | 54 | def bounce_message message, opts={} 55 | opts[:force_recipients] ||= [] 56 | post_json "message/bounce", :message => message, :force_recipients => opts[:force_recipients].to_json 57 | end 58 | 59 | def message_part message_id, part_id 60 | ## not a json blob, but a binary region 61 | @cache[[:message_part, message_id, part_id]] ||= get_raw "/message/#{message_id}/part/#{part_id}" 62 | end 63 | 64 | def raw_message message_id 65 | ## not a json blob, but a binary region 66 | @cache[[:raw_message, message_id]] ||= get_raw "/message/#{message_id}/raw" 67 | end 68 | 69 | def labels; get_json("labels")["labels"] end 70 | def info; get_json("info") end 71 | def size; get_json("size")["size"] end 72 | def contacts_with_prefix prefix, num=100; get_json("contacts", :prefix => prefix, :num => num)["contacts"] end 73 | 74 | def prune_labels!; post_json("labels/prune")["labels"] end 75 | 76 | def set_labels! thread_id, labels 77 | post_json "thread/#{thread_id}/labels", :labels => labels.to_json 78 | end 79 | 80 | def set_state! message_id, state 81 | post_json "message/#{message_id}/state", :state => state.to_json 82 | end 83 | 84 | def set_thread_state! thread_id, state 85 | post_json "thread/#{thread_id}/state", :state => state.to_json 86 | end 87 | 88 | private 89 | 90 | def encode_params params; params.map { |k, v| "#{k}=#{CGI.escape v.to_s}" }.join("&") end 91 | 92 | def get_json path, params={} 93 | handle_errors do 94 | response = get_raw(path + ".json", params) 95 | response.force_encoding Encoding::UTF_8 if in_ruby19_hell? 96 | JSON.parse response 97 | end 98 | end 99 | 100 | def post_json path, params={} 101 | handle_errors do 102 | curl = Curl::Easy.http_post URI.join(@url, path + ".json").to_s, encode_params(params) 103 | if curl.response_code != 200 104 | raise Error, "Unexpected HTTP response code #{@url.response_code} posting to #{curl.url}" 105 | end 106 | response = curl.body_str 107 | response.force_encoding Encoding::UTF_8 if in_ruby19_hell? 108 | JSON.parse response 109 | end 110 | end 111 | 112 | def get_raw resource, params={} 113 | @curl.url = URI.join(@url, resource).to_s + (params.empty? ? "" : "?" + encode_params(params)) 114 | @curl.http_get 115 | if @curl.response_code != 200 116 | raise Error, "Unexpected HTTP response code #{@curl.response_code} getting #{@curl.url}" 117 | end 118 | @curl.body_str 119 | end 120 | 121 | def handle_errors 122 | begin 123 | v = yield 124 | raise Error, "invalid response: #{v.inspect[0..200]}" unless v.is_a?(Hash) 125 | case v["response"] 126 | when "ok"; v 127 | when "error"; raise Error, v.inspect 128 | else raise Error, "invalid response: #{v.inspect[0..200]}" 129 | end 130 | rescue SystemCallError, Curl::Err::CurlError, Curl::JSON::ParserError, SocketError, IOError => e 131 | raise Error, "#{e.message} (#{e.class})" 132 | end 133 | end 134 | 135 | def in_ruby19_hell? 136 | @in_ruby19_hell = "".respond_to?(:encoding) if @in_ruby19_hell.nil? 137 | @in_ruby19_hell 138 | end 139 | end 140 | -------------------------------------------------------------------------------- /bin/heliotrope-reindex: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # encoding: UTF-8 3 | 4 | require 'rubygems' 5 | require 'trollop' 6 | require "heliotrope" 7 | require 'fileutils' 8 | require 'json' 9 | 10 | opts = Trollop::options do 11 | banner < ".", :short => "d" 23 | opt :reorder, "Reorder all documents based on date header" 24 | opt :index_deleted, "Index deleted messages rather than skipping them" 25 | opt :index_spam, "Index spam messages rather than skipping them", :short => "s" 26 | opt :num, "Only process this many messages", :type => :int, :short => "n" 27 | opt :verbose, "Enable verbose output" 28 | end 29 | Trollop::die "invalid argument: #{ARGV.first}" unless ARGV.empty? 30 | 31 | class Loader 32 | def initialize metaindex, zmbox, opts={} 33 | @metaindex = metaindex 34 | @zmbox = zmbox 35 | @load_spam = opts[:load_spam] 36 | @load_deleted = opts[:load_deleted] 37 | @sort = opts[:sort] 38 | @num_deleted = @num_spam = 0 39 | end 40 | 41 | attr_reader :num_deleted, :num_spam 42 | 43 | def each_message(&b) 44 | if @sort 45 | each_message_sorted(&b) 46 | else 47 | each_message_regular(&b) 48 | end 49 | end 50 | 51 | private 52 | 53 | def each_message_regular 54 | doc_id = 0 55 | num_deleted = num_spam = 0 56 | startt = lastt = Time.now 57 | puts "Reindexing..." 58 | while true 59 | doc_id += 1 60 | entry, labels = get_message_summary_for doc_id 61 | break unless entry 62 | yield doc_id, entry, labels 63 | end 64 | end 65 | 66 | def each_message_sorted 67 | ids_and_dates = [] 68 | doc_id = 0 69 | puts "Loading dates..." 70 | while true 71 | doc_id += 1 72 | metainfo = @metaindex.load_messageinfo doc_id 73 | break unless metainfo 74 | ids_and_dates << [doc_id, metainfo[:date]] 75 | end 76 | puts "Sorting..." 77 | ids_and_dates = ids_and_dates.sort_by { |id, date| date } 78 | puts "Reindexing..." 79 | ids_and_dates.each do |id, date| 80 | entry, labels = get_message_summary_for id 81 | yield id, entry, labels 82 | end 83 | end 84 | 85 | def get_message_summary_for doc_id 86 | metainfo = @metaindex.load_messageinfo doc_id 87 | return unless metainfo 88 | 89 | entry, labels = if metainfo[:state].member?("deleted") && !@load_deleted 90 | @num_deleted += 1 91 | [Whistlepig::Entry.new, []] 92 | elsif metainfo[:state].member?("spam") && !@load_spam 93 | @num_spam += 1 94 | [Whistlepig::Entry.new, []] 95 | else 96 | rawbody = @zmbox.read metainfo[:loc] 97 | rawbody.force_encoding "binary" if rawbody.respond_to?(:force_encoding) # sigh... 98 | message = Heliotrope::Message.new(rawbody).parse! 99 | entry = Whistlepig::Entry.new 100 | entry.add_string "from", @metaindex.indexable_text_for(message.from).downcase 101 | entry.add_string "to", message.recipients.map { |x| @metaindex.indexable_text_for x }.join(" ").downcase 102 | entry.add_string "subject", message.subject.downcase 103 | entry.add_string "date", message.date.to_s 104 | entry.add_string "body", @metaindex.indexable_text_for(message).downcase 105 | [entry, metainfo[:labels] + metainfo[:state]] 106 | end 107 | end 108 | end 109 | 110 | FileUtils.mkdir_p File.join(opts.dir, "index-reindexed") 111 | store = LevelDB::DB.new File.join(opts.dir, "store") 112 | 113 | new_index_fn = File.join(opts.dir, "index-reindexed", "whistlepig") 114 | if Whistlepig::Index.exists? new_index_fn 115 | abort "Error: a reindex already exists in #{File.join opts.dir, "index-reindexed"}. Delete that directory first." 116 | end 117 | index = Whistlepig::Index.new new_index_fn 118 | hooks = Heliotrope::Hooks.new File.join(opts.dir, "hooks") 119 | metaindex = Heliotrope::MetaIndex.load_or_die! store, nil, hooks # nil index! (for "security") 120 | zmbox = Heliotrope::ZMBox.new File.join(opts.dir, "messages") 121 | loader = Loader.new metaindex, zmbox, :load_spam => opts.index_spam, :load_deleted => opts.index_deleted, :sort => opts.reorder 122 | 123 | startt = lastt = Time.now 124 | num_docs = 0 125 | loader.each_message do |store_docid, entry, labels| 126 | num_docs += 1 127 | index_docid = index.add_entry entry 128 | labels.each { |l| index.add_label index_docid, l } 129 | metaindex.write_docid_mapping! store_docid, index_docid 130 | 131 | if (Time.now - lastt) > 5 132 | elapsed = Time.now - startt 133 | num_indexed = num_docs - loader.num_deleted - loader.num_spam 134 | printf "; reindexed %d messages, skipped %d spam and %d deleted in %.1fs = %.1f m/s\n", num_indexed, loader.num_spam, loader.num_deleted, elapsed, num_indexed / elapsed 135 | lastt = Time.now 136 | end 137 | 138 | break if opts.num && num_docs >= opts.num 139 | end 140 | 141 | elapsed = Time.now - startt 142 | num_indexed = num_docs - loader.num_deleted - loader.num_spam 143 | printf "; reindexed %d messages, skipped %d spam and %d deleted in %.1fs = %.1f m/s\n", num_indexed, loader.num_spam, loader.num_deleted, elapsed, num_indexed / elapsed 144 | 145 | puts < e 52 | nil 53 | end 54 | 55 | (ret && ret.valid_encoding?) ? ret : force_to_ascii(text).force_encoding(target_charset) 56 | else 57 | begin 58 | Iconv.iconv("#{target_charset}//TRANSLIT//IGNORE", source_charset, text + " ").join[0 .. -2] # work around iconv bug with last two characters 59 | rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence => e 60 | #$stderr.puts "WARNING couldn't transcode text from #{source_charset} to #{target_charset} (#{text[0 ... 20].inspect}...): got #{e.class}: #{e.message}" 61 | text = force_to_ascii text 62 | Iconv.iconv("#{target_charset}//TRANSLIT//IGNORE", "utf-8", text + " ").join[0 .. -2] 63 | end 64 | end 65 | end 66 | 67 | ## here's the last resort. take a string and manually, slowly, gently, turn 68 | ## it into some fucked-up thing that we at least know is ascii. 69 | ## 70 | ## the sad reality is that email messages often have the wrong content type, 71 | ## and then we have to do this in order to make them actually displayable. 72 | ## 73 | ## we could improve this with some encoding detection logic, but that's far 74 | ## beyond the scope of what i'm interested in spending my time on. 75 | def force_to_ascii s 76 | out = "" 77 | s.each_byte do |b| 78 | if (b & 128) != 0 79 | out << "\\x#{b.to_s 16}" 80 | else 81 | out << b.chr 82 | end 83 | end 84 | #out.force_encoding Encoding::UTF_8 if in_ruby19_hell? # not necessary? 85 | out 86 | end 87 | 88 | ## the next methods are stolen from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/101949 89 | ## and lightly adapted. 90 | # 91 | # $Id: rfc2047.rb,v 1.4 2003/04/18 20:55:56 sam Exp $ 92 | # 93 | # An implementation of RFC 2047 decoding. 94 | # 95 | # This module depends on the iconv library by Nobuyoshi Nakada, which I've 96 | # heard may be distributed as a standard part of Ruby 1.8. Many thanks to him 97 | # for helping with building and using iconv. 98 | # 99 | # Thanks to "Josef 'Jupp' Schugt" for pointing out an error with 100 | # stateful character sets. 101 | # 102 | # Copyright (c) Sam Roberts 2004 103 | # 104 | # This file is distributed under the same terms as Ruby. 105 | RFC2047_WORD = %r{=\?([!\#$%&'*+-/0-9A-Z\\^\`a-z{|}~]+)\?([BbQq])\?([!->@-~]+)\?=} 106 | RFC2047_WORDSEQ = %r{(#{RFC2047_WORD.source})\s+(?=#{RFC2047_WORD.source})} 107 | 108 | def is_rfc2047_encoded? s; s =~ RFC2047_WORD end 109 | 110 | # Decodes a string, +from+, containing RFC 2047 encoded words into a target 111 | # character set, +target+. See iconv_open(3) for information on the 112 | # supported target encodings. If one of the encoded words cannot be 113 | # converted to the target encoding, it is left in its encoded form. 114 | def decode_rfc2047 target_charset, from 115 | return unless is_rfc2047_encoded? from 116 | 117 | from = force_to_ascii from # strip any naughty characters---you should be rfc2047-encoding those! 118 | from = from.gsub RFC2047_WORDSEQ, '\1' 119 | out = from.gsub RFC2047_WORD do |word| 120 | source_charset, encoding, text = $1, $2, $3 121 | 122 | # B64 or QP decode, as necessary: 123 | text = case encoding 124 | when 'b', 'B'; text.unpack('m*')[0] 125 | when 'q', 'Q'; 126 | ## RFC 2047 has a variant of quoted printable where a ' ' character 127 | ## can be represented as an '_', rather than =32, so convert 128 | ## any of these that we find before doing the QP decoding. 129 | text.tr("_", " ").unpack('M*')[0] 130 | end 131 | 132 | transcode target_charset, source_charset, text 133 | end 134 | 135 | out 136 | end 137 | end 138 | end 139 | end 140 | -------------------------------------------------------------------------------- /bin/heliotrope-import: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # encoding: UTF-8 3 | 4 | require 'rubygems' 5 | require 'trollop' 6 | require "heliotrope" 7 | require 'json' 8 | 9 | SUP_LABEL_MAP = { "killed" => "muted" } 10 | 11 | opts = Trollop::options do 12 | banner < String, :required => true 30 | opt :num_messages, "Index at most this many messages and then quit", :type => Integer, :short => "n" 31 | opt :num_skip, "Skip this many many messages and then start indexing", :type => Integer, :short => "k" 32 | opt :state_file, "Store state to, and read state from, this file. This enables successive runs against the same source to pick up only what's changed.", :type => String 33 | opt :sup_label_file, "Load labels from a sup label dump from this file", :type => String 34 | opt :add_labels, "Add these labels to every message (should be a comma-separated list)", :type => String 35 | opt :remove_labels, "Do not add any of these labels to any message (should be a comma-separated list)", :type => String 36 | opt :verbose, "Enable verbose output" 37 | opt :no_skip_spam, "If a message is marked as spam, add it anyways (default: ignore it)" 38 | opt :no_skip_deleted, "If a message is marked as deleted, add it anyways (default: ignore it)" 39 | opt :user_email_addresses, "A comma-delimited list of email addresses that belong to you. This is not necessary, but will improve the contact collection process.", :default => "" 40 | 41 | banner < String, :short => "m" 46 | opt :mbox_start_offset, "Start file offset for scanning", :default => 0, :short => "s" 47 | banner < :strings 52 | banner < String 57 | opt :imap_port, "IMAP server port (default: 993 with ssl, 143 without)", :type => Integer 58 | opt :dont_use_ssl, "Don't use SSL" 59 | opt :imap_username, "IMAP username (default: prompt)", :type => String 60 | opt :imap_password, "IMAP password (default: prompt)", :type => String 61 | opt :imap_folder, "IMAP folder", :default => "INBOX" 62 | 63 | banner < String 68 | opt :gmail_password, "GMail password (default: prompt)", :type => String 69 | 70 | conflicts :mbox_fn, :maildir_dirs, :imap_host, :gmail_username 71 | end 72 | 73 | Trollop::die "unknown argument: #{ARGV.first}" unless ARGV.empty? 74 | 75 | add_labels = (opts.add_labels || "").split(/\s*,\s*/) 76 | remove_labels = (opts.remove_labels || "").split(/\s*,\s*/) 77 | 78 | imported_labels = if opts.sup_label_file 79 | puts "Loading sup label dump..." 80 | v = IO.foreach(opts.sup_label_file).inject({}) do |h, l| 81 | begin 82 | l =~ /^(\S+?) \((.*)\)$/ or abort "Can't parse labels line: #{l[0..250].inspect}" 83 | rescue ArgumentError # sigh 84 | abort "Can't parse labels line: #{l[0..250].inspect}" 85 | end 86 | 87 | msgid, ll = $1, $2 88 | h[msgid] = ll.split(/\s+/).map { |l| SUP_LABEL_MAP[l] || l } 89 | h 90 | end 91 | puts "Loaded #{v.size} labels." 92 | v 93 | else 94 | {} 95 | end 96 | 97 | email_addresses = opts.user_email_addresses.split(",") 98 | 99 | adder = Heliotrope::MessageAdder.new opts 100 | 101 | FileUtils.mkdir_p opts.dir 102 | FileUtils.mkdir_p File.join(opts.dir, "index") 103 | store = LevelDB::DB.new File.join(opts.dir, "store") 104 | index = Whistlepig::Index.new File.join(opts.dir, "index", "whistlepig") 105 | hooks = Heliotrope::Hooks.new File.join(opts.dir, "hooks") 106 | metaindex = Heliotrope::MetaIndex.new store, index, hooks 107 | zmbox = Heliotrope::ZMBox.new File.join(opts.dir, "messages") 108 | 109 | adder.each_message do |rawbody, source_state, source_labels| 110 | seen = indexed = bad = false 111 | 112 | begin 113 | message = Heliotrope::Message.new(rawbody).parse! 114 | 115 | if metaindex.contains_safe_msgid? message.safe_msgid 116 | seen = true 117 | else 118 | state, labels = if imported_labels.member? message.msgid 119 | ## if we have imported labels, use those for both labels and state 120 | v = imported_labels[message.msgid] 121 | [v, v] 122 | else 123 | [source_state, source_labels] 124 | end 125 | 126 | labels += add_labels 127 | labels -= remove_labels 128 | 129 | next if (state.include?("spam") || labels.include?("spam")) && !opts.no_skip_spam 130 | next if (state.include?("deleted") || labels.include?("deleted")) && !opts.no_skip_deleted 131 | 132 | loc = zmbox.add rawbody 133 | metaindex.add_message message, state, labels, :loc => loc 134 | 135 | if email_addresses.include?(message.from.email) # from you 136 | (message.to + message.cc + message.bcc).each { |contact| metaindex.touch_contact! contact, message.date } 137 | elsif !message.is_list_or_automated_email? 138 | metaindex.touch_contact! message.from, message.date 139 | end 140 | 141 | indexed = true 142 | end 143 | rescue Heliotrope::InvalidMessageError => e 144 | bad = true 145 | rescue Exception => e # sigh 146 | File.open("bad-message.txt", "w") { |f| f.write rawbody } 147 | $stderr.puts "* wrote broken message to bad-message.txt" 148 | raise e 149 | end 150 | 151 | [seen, indexed, bad] 152 | end 153 | -------------------------------------------------------------------------------- /lib/heliotrope/imap-dumper.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | require "net/imap" 3 | require 'json' 4 | require 'timeout' # TODO: system timer for 1.8? 5 | 6 | # Monkeypatch Net::IMAP to support GMail IMAP extensions. 7 | # http://code.google.com/apis/gmail/imap/ 8 | module Net 9 | class IMAP 10 | 11 | # Implement GMail XLIST command 12 | def xlist(refname, mailbox) 13 | synchronize do 14 | send_command("XLIST", refname, mailbox) 15 | return @responses.delete("XLIST") 16 | end 17 | end 18 | 19 | class ResponseParser 20 | def response_untagged 21 | match(T_STAR) 22 | match(T_SPACE) 23 | token = lookahead 24 | if token.symbol == T_NUMBER 25 | return numeric_response 26 | elsif token.symbol == T_ATOM 27 | case token.value 28 | when /\A(?:OK|NO|BAD|BYE|PREAUTH)\z/ni 29 | return response_cond 30 | when /\A(?:FLAGS)\z/ni 31 | return flags_response 32 | when /\A(?:LIST|LSUB|XLIST)\z/ni # Added XLIST 33 | return list_response 34 | when /\A(?:QUOTA)\z/ni 35 | return getquota_response 36 | when /\A(?:QUOTAROOT)\z/ni 37 | return getquotaroot_response 38 | when /\A(?:ACL)\z/ni 39 | return getacl_response 40 | when /\A(?:SEARCH|SORT)\z/ni 41 | return search_response 42 | when /\A(?:THREAD)\z/ni 43 | return thread_response 44 | when /\A(?:STATUS)\z/ni 45 | return status_response 46 | when /\A(?:CAPABILITY)\z/ni 47 | return capability_response 48 | else 49 | return text_response 50 | end 51 | else 52 | parse_error("unexpected token %s", token.symbol) 53 | end 54 | end 55 | 56 | def response_tagged 57 | tag = atom 58 | match(T_SPACE) 59 | token = match(T_ATOM) 60 | name = token.value.upcase 61 | match(T_SPACE) 62 | return TaggedResponse.new(tag, name, resp_text, @str) 63 | end 64 | 65 | def msg_att 66 | match(T_LPAR) 67 | attr = {} 68 | while true 69 | token = lookahead 70 | case token.symbol 71 | when T_RPAR 72 | shift_token 73 | break 74 | when T_SPACE 75 | shift_token 76 | token = lookahead 77 | end 78 | case token.value 79 | when /\A(?:ENVELOPE)\z/ni 80 | name, val = envelope_data 81 | when /\A(?:FLAGS)\z/ni 82 | name, val = flags_data 83 | when /\A(?:X-GM-LABELS)\z/ni # Added X-GM-LABELS extension 84 | name, val = flags_data 85 | when /\A(?:INTERNALDATE)\z/ni 86 | name, val = internaldate_data 87 | when /\A(?:RFC822(?:\.HEADER|\.TEXT)?)\z/ni 88 | name, val = rfc822_text 89 | when /\A(?:RFC822\.SIZE)\z/ni 90 | name, val = rfc822_size 91 | when /\A(?:BODY(?:STRUCTURE)?)\z/ni 92 | name, val = body_data 93 | when /\A(?:UID)\z/ni 94 | name, val = uid_data 95 | when /\A(?:X-GM-MSGID)\z/ni # Added X-GM-MSGID extension 96 | name, val = uid_data 97 | when /\A(?:X-GM-THRID)\z/ni # Added X-GM-THRID extension 98 | name, val = uid_data 99 | else 100 | parse_error("unknown attribute `%s'", token.value) 101 | end 102 | attr[name] = val 103 | end 104 | return attr 105 | end 106 | end 107 | end 108 | end 109 | 110 | module Heliotrope 111 | class IMAPDumper 112 | def can_provide_labels?; false end 113 | def imap_query_columns; %w(UID FLAGS BODY.PEEK[]) end 114 | 115 | def initialize opts 116 | %w(host port username password folder ssl).each do |x| 117 | v = opts[x.to_sym] 118 | raise ArgumentError, "need :#{x} option" if v.nil? 119 | instance_variable_set "@#{x}", v 120 | end 121 | @ids = nil 122 | end 123 | 124 | attr_reader :folder 125 | 126 | def load! state 127 | if state 128 | @last_added_uid = state["last_added_uid"] 129 | @last_uidvalidity = state["last_uidvalidity"] 130 | end 131 | get_ids! # sets @ids 132 | 133 | puts "; found #{@ids.size} unadded messages on server" 134 | end 135 | 136 | def skip! num 137 | @ids = @ids[num .. -1] || [] 138 | @msgs = [] 139 | end 140 | 141 | NUM_MESSAGES_PER_ITERATION = 20 142 | 143 | def each_message 144 | until done? 145 | get_more_messages! if @msgs.nil? || @msgs.empty? # sets @msgs 146 | break if @msgs.empty? 147 | 148 | body, labels, state, uid = @msgs.shift 149 | yield body, labels, state, uid 150 | @last_added_uid = uid 151 | end 152 | end 153 | 154 | def done?; @ids && @ids.empty? && @msgs && @msgs.empty? end 155 | 156 | def finish! 157 | state = { "last_added_uid" => @last_added_uid, "last_uidvalidity" => @last_uidvalidity } 158 | begin 159 | @imap.close if @imap 160 | rescue Net::IMAP::BadResponseError, SystemCallError, IOError 161 | end 162 | state 163 | end 164 | 165 | private 166 | 167 | def get_ids! 168 | puts "; connecting to #{@host}:#{@port} (ssl: #{!!@ssl})..." 169 | begin 170 | @imap = Net::IMAP.new @host, :port => @port, :ssl => @ssl 171 | rescue TypeError 172 | ## 1.8 compatibility. sigh. 173 | @imap = Net::IMAP.new @host, @port, @ssl 174 | end 175 | puts "; login as #{@username} ..." 176 | @imap.login @username, @password 177 | 178 | @imap.examine folder 179 | 180 | @uidvalidity = @imap.responses["UIDVALIDITY"].first 181 | @uidnext = @imap.responses["UIDNEXT"].first 182 | 183 | @ids = if @uidvalidity == @last_uidvalidity 184 | puts "; found #{@uidnext - @last_added_uid - 1} new messages..." 185 | ((@last_added_uid + 1) .. (@uidnext - 1)).to_a 186 | else 187 | if @last_uidvalidity 188 | puts "; UID validity has changed! your server sucks. re-downloading all uids as punishment..." 189 | else 190 | puts "; awww, is this your first time? don't be shy now. downloading all uids..." 191 | end 192 | @imap.uid_search(["NOT", "DELETED"]) || [] 193 | end 194 | 195 | @last_uidvalidity = @uidvalidity 196 | end 197 | 198 | def get_more_messages! 199 | if @ids.empty? 200 | @msgs = [] 201 | return 202 | end 203 | 204 | imapdata = [] 205 | while imapdata.empty? && !@ids.empty? 206 | ids = @ids.shift NUM_MESSAGES_PER_ITERATION 207 | query = ids.first .. ids.last 208 | puts "; requesting messages #{query.inspect} from imap server" 209 | startt = Time.now 210 | imapdata = begin 211 | Timeout.timeout(120) { @imap.uid_fetch(query, imap_query_columns) || [] } 212 | rescue Timeout::Error => e 213 | puts "warning: timeout. retrying" 214 | retry 215 | rescue Net::IMAP::NoResponseError => e 216 | puts "warning: skipping messages #{query}: #{e.message}" 217 | [] 218 | end 219 | elapsed = Time.now - startt 220 | puts "; got #{imapdata.size} messages" 221 | #printf "; the imap server loving gave us %d messages in %.1fs = a whopping %.1fm/s\n", imapdata.size, elapsed, imapdata.size / elapsed 222 | end 223 | 224 | @msgs = imapdata.map do |data| 225 | state = data.attr["FLAGS"].map { |flag| flag.to_s.downcase } 226 | if state.member? "seen" 227 | state -= ["seen"] 228 | else 229 | state += ["unread"] 230 | end 231 | 232 | if state.member? "flagged" 233 | state -= ["flagged"] 234 | state += ["starred"] 235 | end 236 | 237 | ## it's a little funny to do this gmail-specific label parsing here, but 238 | ## i'm hoping that other imap servers might one day support this extension 239 | labels = (data.attr["X-GM-LABELS"] || []).map { |label| Net::IMAP.decode_utf7(label.to_s).downcase } 240 | if labels.member? "sent" 241 | labels -= ["Sent"] 242 | state += ["sent"] 243 | end 244 | if labels.member? "starred" 245 | labels -= ["Starred"] 246 | state += ["starred"] 247 | end 248 | labels -= ["important"] # fuck that noise 249 | 250 | body = data.attr["BODY[]"].gsub "\r\n", "\n" 251 | uid = data.attr["UID"] 252 | 253 | [body, labels, state, uid] 254 | end 255 | end 256 | end 257 | end 258 | -------------------------------------------------------------------------------- /lib/heliotrope/message.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | require 'rmail' 4 | require 'digest/md5' 5 | require 'json' 6 | require 'timeout' 7 | 8 | module Heliotrope 9 | class InvalidMessageError < StandardError; end 10 | class Message 11 | def initialize rawbody 12 | @rawbody = rawbody 13 | @mime_parts = {} 14 | end 15 | 16 | def parse! 17 | @m = RMail::Parser.read @rawbody 18 | 19 | @msgid = find_msgids(decode_header(validate_field(:message_id, @m.header["message-id"]))).first 20 | ## this next error happens if we have a field, but we can't find a in it 21 | raise InvalidMessageError, "can't parse msgid: #{@m.header['message-id']}" unless @msgid 22 | @safe_msgid = munge_msgid @msgid 23 | 24 | @from = Person.from_string decode_header(validate_field(:from, @m.header["from"])) 25 | @date = begin 26 | Time.parse(validate_field(:date, @m.header["date"])).to_i 27 | rescue ArgumentError 28 | #puts "warning: invalid date field #{@m.header['date']}" 29 | 0 30 | end 31 | 32 | @to = Person.many_from_string decode_header(@m.header["to"]) 33 | @cc = Person.many_from_string decode_header(@m.header["cc"]) 34 | @bcc = Person.many_from_string decode_header(@m.header["bcc"]) 35 | @subject = decode_header @m.header["subject"] 36 | @reply_to = Person.from_string @m.header["reply-to"] 37 | 38 | @refs = find_msgids decode_header(@m.header["references"] || "") 39 | in_reply_to = find_msgids decode_header(@m.header["in-reply-to"] || "") 40 | @refs += in_reply_to unless @refs.member? in_reply_to.first 41 | @safe_refs = @refs.map { |r| munge_msgid(r) } 42 | 43 | ## various other headers that you don't think we will need until we 44 | ## actually need them. 45 | 46 | ## this is sometimes useful for determining who was the actual target of 47 | ## the email, in the case that someone has aliases 48 | @recipient_email = @m.header["envelope-to"] || @m.header["x-original-to"] || @m.header["delivered-to"] 49 | 50 | @list_id = @m.header["list-id"] 51 | @list_subscribe = @m.header["list-subscribe"] 52 | @list_unsubscribe = @m.header["list-unsubscribe"] 53 | @list_post = @m.header["list-post"] || @m.header["x-mailing-list"] 54 | 55 | self 56 | end 57 | 58 | attr_reader :msgid, :from, :to, :cc, :bcc, :subject, :date, :refs, :recipient_email, :list_post, :list_unsubscribe, :list_subscribe, :list_id, :reply_to, :safe_msgid, :safe_refs 59 | 60 | def is_list_or_automated_email? 61 | list_post || list_id || (from.email =~ /=|reply|postmaster|bounce/) 62 | end 63 | 64 | ## we don't encode any non-text parts here, because json encoding of 65 | ## binary objects is crazy-talk, and because those are likely to be 66 | ## big anyways. 67 | def to_h message_id, preferred_type 68 | parts = mime_parts(preferred_type).map do |type, fn, cid, content, size| 69 | if type =~ /^text\// 70 | { :type => type, :filename => fn, :cid => cid, :content => content, :here => true } 71 | else 72 | { :type => type, :filename => fn, :cid => cid, :size => content.size, :here => false } 73 | end 74 | end.compact 75 | 76 | { :from => (from ? from.to_email_address : ""), 77 | :to => to.map(&:to_email_address), 78 | :cc => cc.map(&:to_email_address), 79 | :bcc => bcc.map(&:to_email_address), 80 | :subject => subject, 81 | :date => date, 82 | :refs => refs, 83 | :parts => parts, 84 | :message_id => message_id, 85 | :snippet => snippet, 86 | :reply_to => (reply_to ? reply_to.to_email_address : ""), 87 | 88 | :recipient_email => recipient_email, 89 | :list_post => list_post, 90 | :list_subscribe => list_subscribe, 91 | :list_unsubscribe => list_unsubscribe, 92 | 93 | :email_message_id => @msgid, 94 | } 95 | end 96 | 97 | def direct_recipients; to end 98 | def indirect_recipients; cc + bcc end 99 | def recipients; direct_recipients + indirect_recipients end 100 | 101 | def indexable_text 102 | @indexable_text ||= begin 103 | v = ([from.indexable_text] + 104 | recipients.map { |r| r.indexable_text } + 105 | [subject] + 106 | mime_parts("text/plain").map do |type, fn, id, content| 107 | if fn 108 | fn 109 | elsif type =~ /text\// 110 | content 111 | end 112 | end 113 | ).flatten.compact.join(" ") 114 | 115 | v.gsub(/\s+[\W\d_]+(\s|$)/, " "). # drop funny tokens 116 | gsub(/\s+/, " ") 117 | end 118 | end 119 | 120 | SIGNED_MIME_TYPE = %r{multipart/signed;.*protocol="?application/pgp-signature"?}m 121 | ENCRYPTED_MIME_TYPE = %r{multipart/encrypted;.*protocol="?application/pgp-encrypted"?}m 122 | SIGNATURE_ATTACHMENT_TYPE = %r{application\/pgp-signature\b} 123 | 124 | def snippet 125 | mime_parts("text/plain").each do |type, fn, id, content| 126 | if (type =~ /text\//) && fn.nil? 127 | head = content[0, 1000].split "\n" 128 | head.shift while !head.empty? && head.first.empty? || head.first =~ /^\s*>|\-\-\-|(wrote|said):\s*$/ 129 | snippet = head.join(" ").gsub(/^\s+/, "").gsub(/\s+/, " ")[0, 100] 130 | return snippet 131 | end 132 | end 133 | "" 134 | end 135 | 136 | def has_attachment? 137 | @has_attachment ||= 138 | mime_parts("text/plain").any? do |type, fn, id, content| 139 | fn && (type !~ SIGNATURE_ATTACHMENT_TYPE) 140 | end 141 | end 142 | 143 | def signed? 144 | @signed ||= mime_part_types.any? { |t| t =~ SIGNED_MIME_TYPE } 145 | end 146 | 147 | def encrypted? 148 | @encrypted ||= mime_part_types.any? { |t| t =~ ENCRYPTED_MIME_TYPE } 149 | end 150 | 151 | def mime_parts preferred_type 152 | @mime_parts[preferred_type] ||= decode_mime_parts @m, preferred_type 153 | end 154 | 155 | private 156 | 157 | ## hash the fuck out of all message ids. trust me, you want this. 158 | def munge_msgid msgid 159 | Digest::MD5.hexdigest msgid 160 | end 161 | 162 | def find_msgids msgids 163 | msgids.scan(/<(.+?)>/).map(&:first) 164 | end 165 | 166 | def mime_part_types part=@m 167 | ptype = part.header["content-type"] || "" 168 | [ptype] + (part.multipart? ? part.body.map { |sub| mime_part_types sub } : []) 169 | end 170 | 171 | ## unnests all the mime stuff and returns a list of [type, filename, content] 172 | ## tuples. 173 | ## 174 | ## for multipart/alternative parts, will only return the subpart that matches 175 | ## preferred_type. if none of them, will only return the first subpart. 176 | def decode_mime_parts part, preferred_type, level=0 177 | if part.multipart? 178 | if mime_type_for(part) =~ /multipart\/alternative/ 179 | target = part.body.find { |p| mime_type_for(p).index(preferred_type) } || part.body.first 180 | if target # this can be nil 181 | decode_mime_parts target, preferred_type, level + 1 182 | else 183 | [] 184 | end 185 | else # decode 'em all 186 | part.body.compact.map { |subpart| decode_mime_parts subpart, preferred_type, level + 1 }.flatten 1 187 | end 188 | else 189 | type = mime_type_for part 190 | filename = mime_filename_for part 191 | id = mime_id_for part 192 | content = mime_content_for part, preferred_type 193 | [[type, filename, id, content]] 194 | end 195 | end 196 | 197 | private 198 | 199 | def validate_field what, thing 200 | raise InvalidMessageError, "missing '#{what}' header" if thing.nil? 201 | thing = thing.to_s.strip 202 | raise InvalidMessageError, "blank '#{what}' header: #{thing.inspect}" if thing.empty? 203 | thing 204 | end 205 | 206 | def mime_type_for part 207 | (part.header["content-type"] || "text/plain").gsub(/\s+/, " ").strip.downcase 208 | end 209 | 210 | def mime_id_for part 211 | header = part.header["content-id"] 212 | case header 213 | when /<(.+?)>/; $1 214 | else header 215 | end 216 | end 217 | 218 | ## a filename, or nil 219 | def mime_filename_for part 220 | cd = part.header["Content-Disposition"] 221 | ct = part.header["Content-Type"] 222 | 223 | ## RFC 2183 (Content-Disposition) specifies that disposition-parms are 224 | ## separated by ";". So, we match everything up to " and ; (if present). 225 | filename = if ct && ct =~ /name="?(.*?[^\\])("|;|\z)/im # find in content-type 226 | $1 227 | elsif cd && cd =~ /filename="?(.*?[^\\])("|;|\z)/m # find in content-disposition 228 | $1 229 | end 230 | 231 | ## filename could be RFC2047 encoded 232 | decode_header(filename).chomp if filename 233 | end 234 | 235 | ## rfc2047-decode a header, convert to utf-8, and normalize whitespace 236 | def decode_header v 237 | return "" if v.nil? 238 | 239 | v = if Decoder.is_rfc2047_encoded? v 240 | Decoder.decode_rfc2047 "utf-8", v 241 | else # assume it's ascii and transcode 242 | Decoder.transcode "utf-8", "ascii", v 243 | end 244 | 245 | v.gsub(/\s+/, " ").strip 246 | end 247 | 248 | CONVERSIONS = { 249 | ["text/html", "text/plain"] => :html_to_text 250 | } 251 | 252 | ## the content of a mime part itself. if the content-type is text/*, 253 | ## it will be converted to utf8. otherwise, it will be left in the 254 | ## original encoding 255 | def mime_content_for mime_part, preferred_type 256 | return "" unless mime_part.body # sometimes this happens. not sure why. 257 | 258 | mt = mime_type_for(mime_part) || "text/plain" # i guess 259 | content_type = if mt =~ /^(.+);/ then $1.downcase else mt end 260 | source_charset = if mt =~ /charset="?(.*?)"?(;|$)/i then $1 else "US-ASCII" end 261 | 262 | content = mime_part.decode 263 | converted_content, converted_charset = if(converter = CONVERSIONS[[content_type, preferred_type]]) 264 | send converter, content, source_charset 265 | else 266 | [content, source_charset] 267 | end 268 | 269 | if content_type =~ /^text\// 270 | Decoder.transcode "utf-8", converted_charset, converted_content 271 | else 272 | converted_content 273 | end 274 | end 275 | 276 | require 'locale' 277 | SYSTEM_CHARSET = Locale.charset 278 | HTML_CONVERSION_CMD = "html2text" 279 | HTML_CONVERSION_TIMEOUT = 10 # seconds... this thing can be slow 280 | def html_to_text html, charset 281 | ## ignore charset. html2text produces output in the system charset. 282 | #puts "; forced to decode html. running #{HTML_CONVERSION_CMD} on #{html.size}b mime part..." 283 | content = begin 284 | Timeout.timeout(HTML_CONVERSION_TIMEOUT) do 285 | Heliotrope.popen3(HTML_CONVERSION_CMD) do |inn, out, err| 286 | inn.print html 287 | inn.close 288 | out.read 289 | end 290 | end 291 | rescue Timeout::Error 292 | $stderr.puts "; warning: timeout when converting message from html to text" 293 | "[html conversion failed on this command (htmlconversionfailure)]" 294 | end 295 | [content, SYSTEM_CHARSET] 296 | end 297 | end 298 | end 299 | -------------------------------------------------------------------------------- /test/test_heliotrope.rb: -------------------------------------------------------------------------------- 1 | require 'test/unit' 2 | require 'fileutils' 3 | require 'digest/md5' 4 | require "heliotrope" 5 | 6 | include Heliotrope 7 | 8 | class HeliotropeTest < ::Test::Unit::TestCase 9 | TEST_DIR = "/tmp/heliotrope-test" 10 | 11 | class MockMessage 12 | def initialize opts={} 13 | @@ids ||= 0 14 | 15 | @opts = { 16 | :signed? => false, 17 | :has_attachment? => false, 18 | :encrypted? => false, 19 | 20 | :msgid => "msg-#{@@ids += 1}", 21 | :from => Person.from_string("Egg Zample "), 22 | :to => Person.from_string("Eggs Ample "), 23 | :cc => [], 24 | :bcc => [], 25 | :subject => "test message", 26 | :date => Time.now, 27 | :indexable_text => "i love mice", 28 | :direct_recipients => [], 29 | :indirect_recipients => [], 30 | :snippet => "i love mice", 31 | :refs => [], 32 | }.merge opts 33 | 34 | @opts[:recipients] ||= ([@opts[:to]] + @opts[:cc] + @opts[:bcc]).flatten.compact 35 | end 36 | 37 | def safe_msgid; Digest::MD5.hexdigest msgid end 38 | def safe_refs; refs.map { |r| Digest::MD5.hexdigest r } end 39 | 40 | def method_missing m, *a 41 | raise "no value for #{m.inspect}" unless @opts.member? m 42 | @opts[m] 43 | end 44 | end 45 | 46 | def setup 47 | FileUtils.rm_rf TEST_DIR 48 | FileUtils.mkdir TEST_DIR 49 | hooks = Hooks.new File.join(TEST_DIR, "hooks") 50 | 51 | store = LevelDB::DB.new File.join(TEST_DIR, "store") 52 | index = Whistlepig::Index.new File.join(TEST_DIR, "index") 53 | hooks = Heliotrope::Hooks.new File.join(TEST_DIR, "hooks") 54 | @metaindex = Heliotrope::MetaIndex.new store, index, hooks 55 | end 56 | 57 | def teardown 58 | @metaindex.close 59 | FileUtils.rm_rf TEST_DIR 60 | end 61 | 62 | def test_size 63 | assert_equal 0, @metaindex.size 64 | 65 | m1 = MockMessage.new 66 | x = @metaindex.add_message m1 67 | assert_equal 1, @metaindex.size 68 | 69 | m2 = MockMessage.new 70 | @metaindex.add_message m2 71 | assert_equal 2, @metaindex.size 72 | end 73 | 74 | def test_adding_duplicate_messages_does_nothing 75 | m1 = MockMessage.new :msgid => "a" 76 | @metaindex.add_message m1 77 | assert_equal 1, @metaindex.size 78 | 79 | m2 = MockMessage.new :msgid => "a" 80 | @metaindex.add_message m2 81 | assert_equal 1, @metaindex.size 82 | end 83 | 84 | def test_added_messages_are_available_in_search 85 | @metaindex.set_query Query.new("body", "hello") 86 | results = @metaindex.get_some_results 100 87 | assert_equal 0, results.size 88 | 89 | m1 = MockMessage.new :indexable_text => "hello bob" 90 | docid, threadid = @metaindex.add_message m1 91 | 92 | @metaindex.reset_query! 93 | results = @metaindex.get_some_results 100 94 | assert_equal 1, results.size 95 | assert_equal threadid, results.first[:thread_id] 96 | end 97 | 98 | def test_added_message_state_is_preserved 99 | m1 = MockMessage.new 100 | docid, threadid = @metaindex.add_message m1, %w(unread), [] 101 | 102 | summary = @metaindex.load_messageinfo docid 103 | assert_equal Set.new(%w(unread)), summary[:state] 104 | end 105 | 106 | def test_added_message_state_is_searchable_via_labels 107 | @metaindex.set_query Query.new("body", "~unread") 108 | assert_equal 0, @metaindex.count_results 109 | 110 | m1 = MockMessage.new 111 | docid, threadid = @metaindex.add_message m1, %w(unread), [] 112 | 113 | assert_equal 1, @metaindex.count_results 114 | end 115 | 116 | def test_message_state_is_modifiable 117 | m1 = MockMessage.new 118 | docid, threadid = @metaindex.add_message m1 119 | assert_equal Set.new, @metaindex.load_messageinfo(docid)[:state] 120 | 121 | @metaindex.update_message_state docid, %w(unread) 122 | assert_equal Set.new(%w(unread)), @metaindex.load_messageinfo(docid)[:state] 123 | 124 | @metaindex.update_message_state docid, %w(starred) 125 | assert_equal Set.new(%w(starred)), @metaindex.load_messageinfo(docid)[:state] 126 | 127 | @metaindex.update_message_state docid, %w(unread deleted) 128 | assert_equal Set.new(%w(unread deleted)), @metaindex.load_messageinfo(docid)[:state] 129 | end 130 | 131 | def test_message_state_ignores_random_stuff 132 | m1 = MockMessage.new 133 | docid, threadid = @metaindex.add_message m1, %w(hello there bob inbox unread is nice), [] 134 | assert_equal Set.new(%w(unread)), @metaindex.load_messageinfo(docid)[:state] 135 | end 136 | 137 | def test_added_thread_labels_are_applied_to_the_whole_thread 138 | m1 = MockMessage.new 139 | docid, threadid = @metaindex.add_message m1, [], %w(tired hungry) 140 | 141 | summary = @metaindex.load_threadinfo threadid 142 | assert_equal Set.new(%w(tired hungry)), summary[:labels] 143 | end 144 | 145 | def test_thread_labels_are_available_in_search 146 | m1 = MockMessage.new :indexable_text => "hello bob" 147 | docid, threadid = @metaindex.add_message m1 148 | 149 | @metaindex.set_query Query.new("body", "~tired") 150 | assert_equal 0, @metaindex.count_results 151 | 152 | @metaindex.update_thread_labels threadid, %w(tired) 153 | assert_equal 1, @metaindex.count_results 154 | 155 | results = @metaindex.get_some_results 100 156 | assert_equal threadid, results.first[:thread_id] 157 | end 158 | 159 | def test_thread_labels_from_added_messages_are_available_in_search 160 | @metaindex.set_query Query.new("body", "~tired") 161 | results = @metaindex.get_some_results 100 162 | assert_equal 0, results.size 163 | 164 | m1 = MockMessage.new :indexable_text => "hello bob" 165 | docid, threadid = @metaindex.add_message m1, [], %w(tired hungry) 166 | 167 | @metaindex.reset_query! 168 | results = @metaindex.get_some_results 100 169 | assert_equal 1, results.size 170 | end 171 | 172 | def test_thread_labels_are_modifiable 173 | m1 = MockMessage.new :indexable_text => "hello bob" 174 | docid, threadid = @metaindex.add_message m1 175 | assert_equal Set.new, @metaindex.load_threadinfo(threadid)[:labels] 176 | 177 | @metaindex.update_thread_labels threadid, %w(hungry) 178 | assert_equal Set.new(%w(hungry)), @metaindex.load_threadinfo(threadid)[:labels] 179 | 180 | @metaindex.update_thread_labels threadid, %w(tired) 181 | assert_equal Set.new(%w(tired)), @metaindex.load_threadinfo(threadid)[:labels] 182 | 183 | @metaindex.update_thread_labels threadid, %w(hungry tired) 184 | assert_equal Set.new(%w(hungry tired)), @metaindex.load_threadinfo(threadid)[:labels] 185 | end 186 | 187 | def test_messages_are_threaded 188 | m1 = MockMessage.new :msgid => "1" 189 | docid1, threadid1 = @metaindex.add_message m1 190 | 191 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 192 | docid2, threadid2 = @metaindex.add_message m2 193 | 194 | assert_equal threadid1, threadid2 195 | 196 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 197 | docid3, threadid3 = @metaindex.add_message m3 198 | 199 | assert_equal threadid2, threadid3 200 | 201 | m4 = MockMessage.new :msgid => "4", :refs => ["1"] 202 | docid4, threadid4 = @metaindex.add_message m4 203 | 204 | assert_equal threadid3, threadid4 205 | end 206 | 207 | def test_message_state_is_propagated_to_thread_as_a_disjunction_in_threadinfo 208 | m1 = MockMessage.new :msgid => "1" 209 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 210 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 211 | 212 | docid1, threadid1 = @metaindex.add_message m1, %w(unread) 213 | docid2, threadid2 = @metaindex.add_message m2, %w(unread) 214 | docid3, threadid3 = @metaindex.add_message m3, %w(unread) 215 | 216 | assert_equal threadid1, threadid2 217 | assert_equal threadid2, threadid3 218 | 219 | assert_equal Set.new(%w(unread)), @metaindex.load_threadinfo(threadid1)[:state] 220 | 221 | @metaindex.update_message_state docid1, [] 222 | assert_equal Set.new(%w(unread)), @metaindex.load_threadinfo(threadid1)[:state] 223 | 224 | @metaindex.update_message_state docid2, [] 225 | assert_equal Set.new(%w(unread)), @metaindex.load_threadinfo(threadid1)[:state] 226 | 227 | @metaindex.update_message_state docid3, [] 228 | assert_equal Set.new, @metaindex.load_threadinfo(threadid1)[:state] 229 | 230 | ## now add some back 231 | @metaindex.update_message_state docid3, %w(starred) 232 | assert_equal Set.new(%w(starred)), @metaindex.load_threadinfo(threadid1)[:state] 233 | end 234 | 235 | ## this captures a bug i had 236 | def test_message_state_is_propagated_to_threadinfo_even_if_it_is_just_on_the_root 237 | m1 = MockMessage.new :msgid => "1", :has_attachment? => true 238 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 239 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 240 | 241 | docid1, threadid1 = @metaindex.add_message m1 242 | docid2, threadid2 = @metaindex.add_message m2 243 | docid3, threadid3 = @metaindex.add_message m3 244 | 245 | assert_equal threadid1, threadid2 246 | assert_equal threadid2, threadid3 247 | 248 | assert_equal Set.new(%w(attachment)), @metaindex.load_threadinfo(threadid1)[:state] 249 | end 250 | 251 | def test_message_state_is_propagated_to_thread_as_a_disjunction_in_search 252 | @metaindex.set_query Query.new("body", "~unread") 253 | assert_equal 0, @metaindex.count_results 254 | 255 | m1 = MockMessage.new :msgid => "1" 256 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 257 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 258 | 259 | docid1, threadid1 = @metaindex.add_message m1, %w(unread) 260 | docid2, threadid2 = @metaindex.add_message m2, %w(unread) 261 | docid3, threadid3 = @metaindex.add_message m3, %w(unread) 262 | 263 | assert_equal threadid1, threadid2 264 | assert_equal threadid2, threadid3 265 | 266 | assert_equal 1, @metaindex.count_results 267 | @metaindex.update_message_state docid1, [] 268 | assert_equal 1, @metaindex.count_results 269 | @metaindex.update_message_state docid2, [] 270 | assert_equal 1, @metaindex.count_results 271 | @metaindex.update_message_state docid3, [] 272 | assert_equal 0, @metaindex.count_results 273 | 274 | @metaindex.set_query Query.new("body", "~starred") 275 | assert_equal 0, @metaindex.count_results 276 | @metaindex.update_message_state docid3, %w(starred) 277 | assert_equal 1, @metaindex.count_results 278 | end 279 | 280 | def test_adding_messages_can_join_threads 281 | m1 = MockMessage.new :msgid => "1" 282 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 283 | 284 | docid1, threadid1 = @metaindex.add_message m1 285 | docid3, threadid3 = @metaindex.add_message m3 286 | 287 | assert_not_equal threadid1, threadid3 288 | 289 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 290 | docid2, threadid2 = @metaindex.add_message m2 291 | 292 | threadid1 = @metaindex.load_messageinfo(docid1)[:thread_id] 293 | threadid2 = @metaindex.load_messageinfo(docid2)[:thread_id] 294 | threadid3 = @metaindex.load_messageinfo(docid3)[:thread_id] 295 | 296 | assert_not_nil threadid1 297 | assert_equal threadid1, threadid2 298 | assert_equal threadid2, threadid3 299 | end 300 | 301 | def test_adding_messages_applies_labels_to_everything_in_thread_and_that_is_reflected_in_search 302 | m1 = MockMessage.new :msgid => "1" 303 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 304 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 305 | 306 | docid1, threadid1 = @metaindex.add_message m1 307 | docid2, threadid2 = @metaindex.add_message m2 308 | 309 | @metaindex.set_query Query.new("body", "~hungry") 310 | assert_equal 0, @metaindex.count_results 311 | 312 | docid3, threadid3 = @metaindex.add_message m3, [], %w(hungry) 313 | assert_equal 1, @metaindex.count_results 314 | 315 | results = @metaindex.get_some_results 100 316 | assert_equal threadid3, results.first[:thread_id] 317 | docids = results.first[:structure].flatten 318 | assert_includes docid1, docids 319 | assert_includes docid2, docids 320 | assert_includes docid3, docids 321 | end 322 | 323 | def test_adding_messages_can_join_threads_and_labels_are_unionized 324 | m1 = MockMessage.new :msgid => "1" 325 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 326 | 327 | docid1, threadid1 = @metaindex.add_message m1, [], %w(fluffy) 328 | assert_equal Set.new(%w(fluffy)), @metaindex.load_threadinfo(threadid1)[:labels] 329 | 330 | docid3, threadid3 = @metaindex.add_message m3, [], %w(bunny) 331 | assert_equal Set.new(%w(bunny)), @metaindex.load_threadinfo(threadid3)[:labels] 332 | 333 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 334 | docid2, threadid2 = @metaindex.add_message m2 335 | 336 | assert_equal threadid2, @metaindex.load_messageinfo(docid1)[:thread_id] 337 | assert_equal threadid2, @metaindex.load_messageinfo(docid2)[:thread_id] 338 | assert_equal threadid2, @metaindex.load_messageinfo(docid3)[:thread_id] 339 | 340 | assert_equal Set.new(%w(fluffy bunny)), @metaindex.load_threadinfo(threadid2)[:labels] 341 | end 342 | 343 | def test_adding_messages_can_join_threads_and_their_labels_are_unionized_and_that_is_reflected_in_search 344 | m1 = MockMessage.new :msgid => "1" 345 | m3 = MockMessage.new :msgid => "3", :refs => ["2"] 346 | 347 | docid1, threadid1 = @metaindex.add_message m1, [], %w(fluffy) 348 | docid3, threadid3 = @metaindex.add_message m3, [], %w(bunny) 349 | 350 | @metaindex.set_query Query.new("body", "~fluffy") 351 | results = @metaindex.get_some_results 100 352 | assert_equal 1, results.size 353 | assert_equal threadid1, results.first[:thread_id] 354 | 355 | m2 = MockMessage.new :msgid => "2", :refs => ["1"] 356 | docid2, threadid2 = @metaindex.add_message m2 357 | 358 | @metaindex.reset_query! 359 | results = @metaindex.get_some_results 100 360 | assert_equal 1, results.size 361 | 362 | assert_equal threadid2, results.first[:thread_id] 363 | docids = results.first[:structure].flatten 364 | assert_includes docid1, docids 365 | assert_includes docid2, docids 366 | assert_includes docid3, docids 367 | end 368 | 369 | def test_labellist_updated_by_adding_messages_with_labels 370 | assert_empty @metaindex.all_labels 371 | 372 | @metaindex.add_message MockMessage.new, [], %w(potato) 373 | assert_equal Set.new(%w(potato)), @metaindex.all_labels 374 | 375 | @metaindex.add_message MockMessage.new, [], %w(potato leek) 376 | assert_equal Set.new(%w(potato leek)), @metaindex.all_labels 377 | end 378 | 379 | def test_labellist_updated_by_tweaking_thread_labels 380 | docid, threadid = @metaindex.add_message MockMessage.new, [], %w(potato) 381 | assert_equal Set.new(%w(potato)), @metaindex.all_labels 382 | 383 | @metaindex.update_thread_labels threadid, %w(muffin) 384 | assert_includes "muffin", @metaindex.all_labels 385 | end 386 | 387 | def test_labellist_pruning_removes_labels_without_corresponding_threads 388 | docid, threadid = @metaindex.add_message MockMessage.new, [], %w(potato) 389 | assert_equal Set.new(%w(potato)), @metaindex.all_labels 390 | 391 | @metaindex.update_thread_labels threadid, %w(muffin) 392 | @metaindex.prune_labels! 393 | assert_includes "muffin", @metaindex.all_labels 394 | assert_does_not_include "potato", @metaindex.all_labels 395 | end 396 | 397 | private 398 | 399 | def assert_includes v, set # standard one seems to have these things reversed 400 | assert set.include?(v), "#{set.inspect[0..50]} does not include #{v.inspect}" 401 | end 402 | 403 | def assert_does_not_include v, set 404 | assert !set.include?(v), "#{set.inspect[0..50]} includes #{v.inspect}" 405 | end 406 | 407 | def assert_empty x; x.empty? end unless respond_to?(:assert_empty) 408 | end 409 | -------------------------------------------------------------------------------- /bin/heliotrope-server: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # encoding: UTF-8 3 | 4 | require 'rubygems' 5 | require 'json' 6 | require 'trollop' 7 | require 'sinatra/base' 8 | require 'whistlepig' 9 | require "heliotrope" 10 | require 'cgi' 11 | require 'rack' 12 | require 'fileutils' 13 | 14 | class Set 15 | ## apparently we're going to have to do this the hard way 16 | def to_json(*a); to_a.to_json(*a) end 17 | end 18 | 19 | module Enumerable 20 | def uniq_by; inject({}) { |h, e| h[yield(e)] ||= e; h }.values end 21 | end 22 | 23 | class HeliotropeServer < Sinatra::Base 24 | API_VERSION = 0.1 25 | 26 | def initialize metaindex, zmbox, hooks 27 | @metaindex = metaindex 28 | @zmbox = zmbox 29 | @hooks = hooks 30 | @search_state = [] # keep this around for optimization purposes 31 | super() 32 | end 33 | 34 | helpers do 35 | include Rack::Utils 36 | end 37 | 38 | get "/" do 39 | redirect "/search?q=~inbox" 40 | end 41 | 42 | class RequestError < StandardError; end 43 | 44 | error RequestError do 45 | e = request.env['sinatra.error'] 46 | "error: " + e.message 47 | end 48 | 49 | DEFAULT_SEARCH_PAGE_SIZE = 20 50 | MAX_SEARCH_PAGE_SIZE = 300 51 | 52 | get "/info.json" do 53 | content_type :json 54 | { :response => :ok, 55 | :status => :up, 56 | :api_version => API_VERSION.to_s 57 | }.to_json 58 | end 59 | 60 | ## add a message to the store after sending it. idempotent: if the message 61 | ## has already been added, reports and error and does nothing. if the send 62 | ## fails, reports and error and does nothing. 63 | post "/message/send.json" do 64 | content_type :json 65 | 66 | rawbody = params["message"] or raise RequestError, "need a 'message' param" 67 | labels = JSON.parse(params["labels"] || "[]") 68 | state = JSON.parse(params["state"] || "[]") 69 | extra = JSON.parse(params["extra"] || "{}") 70 | 71 | begin 72 | message = raw_body_to_message rawbody 73 | if @metaindex.contains_safe_msgid? message.safe_msgid 74 | { :response => :ok, :status => :seen } 75 | else 76 | run_sendmail! rawbody, message 77 | result = add_message message, rawbody, labels, state + ["sent"], extra 78 | (message.to + message.cc + message.cc).each { |c| @metaindex.touch_contact! c } 79 | result 80 | end 81 | rescue Heliotrope::InvalidMessageError, SendmailFailedError => e 82 | { :response => :error, :message => e.message } 83 | end.to_json 84 | end 85 | 86 | ## sends a message without adding it to the index, or even checking whether 87 | ## it's there or not. accepts a force_recipients argument to override the 88 | ## recipients and ignore the To: line. 89 | post "/message/bounce.json" do 90 | content_type :json 91 | 92 | rawbody = params["message"] or raise RequestError, "need a 'message' param" 93 | force_recipients = JSON.parse(params["force_recipients"] || "[]") 94 | 95 | begin 96 | message = raw_body_to_message rawbody 97 | run_sendmail! rawbody, message, force_recipients 98 | { :response => :ok } 99 | rescue Heliotrope::InvalidMessageError, SendmailFailedError => e 100 | { :response => :error, :message => e.message } 101 | end.to_json 102 | end 103 | 104 | SENDMAIL = "sendmail -oem -ti" # don't like it? write a sendmail hook. 105 | SENDMAIL_BOUNCE = "sendmail -oem -i" 106 | class SendmailFailedError < StandardError; end 107 | 108 | def run_sendmail! rawbody, message, force_recipients=[] 109 | if @hooks.enabled? "sendmail" 110 | unless @hooks.run "sendmail", :rawbody => rawbody, :message => message, :force_recipients => force_recipients 111 | raise SendmailFailedError, "sendmail hook was not successful" 112 | end 113 | else 114 | cmd = if force_recipients.empty? 115 | SENDMAIL 116 | else 117 | SENDMAIL_BOUNCE + force_recipients.map { |x| " " + x }.join 118 | end 119 | puts "; running: `#{cmd}` on a #{rawbody.size}b message" 120 | ret = IO.popen(cmd, "w") { |p| p.puts rawbody } 121 | unless $?.success? 122 | raise SendmailFailedError, "sendmail command failed with non-zero exit status" 123 | end 124 | end 125 | end 126 | 127 | ## add a message to the store 128 | post "/message.json" do 129 | content_type :json 130 | 131 | rawbody = params["message"] or raise RequestError, "need a message" 132 | labels = JSON.parse(params["labels"] || "[]") 133 | state = JSON.parse(params["state"] || "[]") 134 | extra = JSON.parse(params["extra"] || "{}") 135 | 136 | begin 137 | message = raw_body_to_message rawbody 138 | if @metaindex.contains_safe_msgid? message.safe_msgid 139 | doc_id, thread_id = @metaindex.fetch_docid_for_safe_msgid message.safe_msgid 140 | { :response => :ok, :status => :seen, :doc_id => doc_id, :thread_id => thread_id } 141 | else 142 | if @hooks.enabled? "before-add-message" 143 | message, labels, state = @hooks.run "before-add-message", :message => message, :labels => labels, :state => state 144 | end 145 | result = add_message message, rawbody, labels, state, extra 146 | 147 | ## add any "important" contacts to the set of all contacts we've ever seen 148 | unless message.is_list_or_automated_email? 149 | ## just add the sender, for now. we can think about whether to add 150 | ## recipients as well later. 151 | @metaindex.touch_contact! message.from 152 | end 153 | 154 | result 155 | end 156 | rescue Heliotrope::InvalidMessageError => e 157 | { :response => :error, :error_message => e.message } 158 | end.to_json 159 | end 160 | 161 | def raw_body_to_message rawbody 162 | rawbody.force_encoding "binary" if rawbody.respond_to?(:force_encoding) # sigh... 163 | Heliotrope::Message.new(rawbody).parse! 164 | end 165 | 166 | def add_message message, rawbody, labels, state, extra 167 | loc = @zmbox.add rawbody 168 | extra.merge! :loc => loc 169 | doc_id, thread_id = @metaindex.add_message message, state, labels, extra 170 | { :response => :ok, :status => :unseen, :doc_id => doc_id, :thread_id => thread_id } 171 | end 172 | 173 | def get_query_from_params 174 | ## work around a rack (?) bug where quotes are omitted in queries like "hello bob" 175 | query = if env["rack.request.query_string"] =~ /\bq=(.+?)(&|$)/ 176 | CGI.unescape $1 177 | else 178 | params["q"] 179 | end 180 | 181 | raise RequestError, "need a query" unless query 182 | 183 | transformed = @hooks.run "transform-query", :query => query 184 | transformed = Heliotrope::Decoder.encode_as_utf8 transformed 185 | transformed || query 186 | end 187 | 188 | def get_search_results 189 | query = get_query_from_params 190 | start = (params["start"] || 0).to_i 191 | num = (params["num"] || DEFAULT_SEARCH_PAGE_SIZE).to_i 192 | num = DEFAULT_SEARCH_PAGE_SIZE if num <= 0 193 | num = [num, MAX_SEARCH_PAGE_SIZE].min 194 | 195 | query = Heliotrope::Query.new "body", query 196 | startt = Time.now 197 | continued = false 198 | results = if @search_state == [query.parsed_query_s, start] 199 | continued = true 200 | @metaindex.get_some_results num 201 | else 202 | @metaindex.set_query query 203 | @metaindex.get_some_results(start + num)[start, num] 204 | end 205 | elapsed = Time.now - startt 206 | printf "search(#{query.parsed_query_s}, #{start}, #{num}) took %.1fms\n", (elapsed * 1000) 207 | 208 | ## remove this field from the results because it's not useful and can be large 209 | results.each { |r| r.delete :structure } 210 | 211 | @search_state = [query.parsed_query_s, start + num] 212 | 213 | [query, results, { :start => start, :num => num, :elapsed => elapsed, :continued => continued }] 214 | end 215 | 216 | get "/size.json" do 217 | content_type :json 218 | { :response => :ok, :size => @metaindex.size }.to_json 219 | end 220 | 221 | get "/search.json" do 222 | content_type :json 223 | begin 224 | query, results, info = get_search_results 225 | { :response => :ok, :results => results } 226 | rescue Heliotrope::Query::ParseError => e 227 | { :response => :error, :message => e.message } 228 | end.to_json 229 | end 230 | 231 | get "/count.json" do 232 | content_type :json 233 | query = get_query_from_params 234 | query = Heliotrope::Query.new "body", query 235 | startt = Time.now 236 | @metaindex.set_query query 237 | count = @metaindex.count_results 238 | elapsed = Time.now - startt 239 | printf "count(#{query.parsed_query_s}) took %.1fms\n", (elapsed * 1000) 240 | @search_state = nil # don't confuse this with the regular search stuff 241 | { :response => :ok, :count => count }.to_json 242 | end 243 | 244 | get "/search" do 245 | content_type :html 246 | 247 | begin 248 | query, results, info = get_search_results 249 | start, num = info[:start], info[:num] 250 | nav = %{
} 251 | if start > 0 252 | nav += link_to_search(query.original_query_s, "<< top", 0, num) + "|" + link_to_search(query.original_query_s, "<< previous", [start - num, 0].max, num) 253 | end 254 | if (start > 0) && (results.size == num) 255 | nav += "|" 256 | end 257 | if results.size == num 258 | nav += link_to_search(query.original_query_s, "next >>", start + num, num) 259 | end 260 | nav += "
" 261 | 262 | header("Search: #{query.original_query_s}", query.original_query_s) + 263 | "
Parsed query: #{escape_html query.parsed_query_s}
" + 264 | "
Search took #{sprintf '%.2f', info[:elapsed]}s and #{info[:continued] ? 'was' : 'was NOT'} continued
" + 265 | "#{nav}" + 266 | results.map { |r| threadinfo_to_html r }.join + 267 | "
#{nav}" + footer 268 | 269 | rescue Heliotrope::Query::ParseError => e 270 | raise RequestError, "can't parse query: #{e.message}" 271 | end 272 | end 273 | 274 | ## thread info object 275 | get "/thread/:thread_id/info.json" do |thread_id| 276 | content_type :json 277 | threadinfo = get_threadinfo thread_id 278 | threadinfo.merge(:response => :ok).to_json 279 | end 280 | 281 | ## messages in the thread 282 | get "/thread/:thread_id.json" do |thread_id| 283 | content_type :json 284 | messageinfos = get_thread_summary thread_id 285 | { :messageinfos => messageinfos, :response => :ok }.to_json 286 | end 287 | 288 | get "/thread/:thread_id" do |thread_id| 289 | content_type :html 290 | 291 | messageinfos = get_thread_summary thread_id 292 | first = messageinfos.find { |mi, level| mi[:subject] }[0] 293 | 294 | header("Thread: " + first[:subject]) + 295 | messageinfos.map do |mi, level| 296 | %{
} + 297 | messageinfo_to_html(mi) + 298 | "
" 299 | end.join + 300 | footer 301 | end 302 | 303 | def get_thread_summary thread_id 304 | @metaindex.load_thread_messageinfos(thread_id.to_i) or raise Sinatra::NotFound, "can't find thread #{thread_id.inspect}" 305 | end 306 | 307 | def get_threadinfo thread_id 308 | threadinfo = @metaindex.load_threadinfo(thread_id.to_i) or raise Sinatra::NotFound, "can't find thread #{thread_id.inspect}" 309 | ## the structure field for long threads is deeply nested, and the 310 | ## json encoder complains about deeply-nested structures. we could 311 | ## raise its nesting limit, but this field isn't needed by anyone 312 | ## (so far!), so we'll just remove it. if someone wants it later, we 313 | ## can revisit. 314 | threadinfo.delete :structure 315 | threadinfo 316 | end 317 | 318 | def get_message_summary message_id 319 | @metaindex.load_messageinfo(message_id.to_i) or raise Sinatra::NotFound, "can't find message #{message_id.inspect}" 320 | end 321 | 322 | get "/message/:message_id.json" do |message_id| 323 | content_type :json 324 | message_id = message_id.to_i 325 | messageinfo = get_message_summary message_id 326 | message = load_actual_message messageinfo[:loc] 327 | 328 | ## merge the messageinfo (which has state, etc) with the actual content of the message 329 | if params[:only_infos] 330 | result = messageinfo 331 | else 332 | result = messageinfo.merge message.to_h(message_id, params["mime_type_pref"] || "text/html") 333 | end 334 | result.merge(:response => :ok).to_json 335 | end 336 | 337 | get "/message/:message_id" do |message_id| 338 | content_type :html 339 | message_id = message_id.to_i 340 | message = get_message_summary(message_id) or raise Sinatra::NotFound, "can't find message #{message_id.inspect}" 341 | 342 | begin 343 | threadinfo = @metaindex.load_threadinfo(message[:thread_id]) 344 | docids = threadinfo[:structure].flatten 345 | idx = docids.index(message_id) + 1 346 | idx += 1 while(docids[idx] && (docids[idx] < 0)) 347 | header(message[:subject]) + message_to_html(message, docids[idx]) + footer 348 | rescue Heliotrope::InvalidMessageError => e 349 | raise RequestError, "can't parse message #{message_id.inspect}: #{e.message}" 350 | end 351 | end 352 | 353 | get "/message/:message_id/part/:part_id" do |message_id, part_id| 354 | message = get_message_summary message_id 355 | part_id = part_id.to_i 356 | 357 | begin 358 | m = load_actual_message message[:loc] 359 | parts = m.mime_parts "text/html" 360 | 361 | raise Sinatra::NotFound, "can't find that part" unless part_id >= 0 && part_id <= parts.size 362 | type, fn, id, content = parts[part_id] 363 | if type =~ /^(\S+?);/ 364 | type = $1 365 | end 366 | content_type type 367 | response['Content-Disposition'] = (type =~ /^(text|image)\// ? "inline" : "attachment") 368 | response['Content-Disposition'] << %{; filename="#{fn}"} if fn 369 | content 370 | rescue Heliotrope::InvalidMessageError => e 371 | raise RequestError, "can't parse message #{message_id.inspect}: #{e.message}" 372 | end 373 | end 374 | 375 | get "/message/:message_id/raw" do |message_id| 376 | content_type :text 377 | message = get_message_summary message_id 378 | @zmbox.read message[:loc] 379 | end 380 | 381 | get "/labels.json" do 382 | content_type :json 383 | { :labels => @metaindex.all_labels, :response => :ok }.to_json 384 | end 385 | 386 | post "/labels/prune.json" do 387 | content_type :json 388 | @metaindex.prune_labels! 389 | { :labels => @metaindex.all_labels, :response => :ok }.to_json 390 | end 391 | 392 | post "/thread/:thread_id/labels.json" do |thread_id| 393 | content_type :json 394 | 395 | labels = params["labels"] or raise RequestError, "need a 'labels' param" 396 | labels = parse_json_set labels 397 | 398 | thread = get_thread_summary thread_id # ensure it exists... 399 | begin 400 | @metaindex.update_thread_labels thread_id, labels 401 | get_threadinfo(thread_id).merge :response => :ok 402 | rescue Heliotrope::MetaIndex::InvalidLabelError => e 403 | { :response => :error, :message => e.message } 404 | end.to_json 405 | end 406 | 407 | post "/thread/:thread_id/state.json" do |thread_id| 408 | content_type :json 409 | 410 | state = params["state"] or raise RequestError, "need a 'state' param" 411 | state = parse_json_set state 412 | 413 | thread = get_thread_summary thread_id # ensure it exists... 414 | @metaindex.update_thread_state thread_id, state 415 | 416 | get_threadinfo(thread_id).merge(:response => :ok).to_json 417 | end 418 | 419 | post "/message/:message_id/state.json" do |message_id| 420 | content_type :json 421 | 422 | state = params["state"] or raise RequestError, "need a 'state' param" 423 | state = parse_json_set state 424 | 425 | message = get_message_summary message_id # ensure it exists... 426 | @metaindex.update_message_state(message_id, state) 427 | 428 | get_message_summary(message_id).merge(:response => :ok).to_json 429 | end 430 | 431 | get "/contacts.json" do 432 | content_type :json 433 | 434 | prefix = params["prefix"] 435 | num = (params["num"] || 20).to_i 436 | 437 | contacts = @metaindex.contacts(:prefix => prefix, :num => num). 438 | sort_by { |c| -c[:timestamp] }. 439 | uniq_by { |c| c[:email] }. 440 | map { |c| { :name => c[:name], :email => c[:email] } } 441 | 442 | { :response => :ok, :contacts => contacts }.to_json 443 | end 444 | 445 | private 446 | 447 | def parse_json_set val 448 | begin 449 | Set.new JSON.parse(val) 450 | rescue JSON::ParserError => e 451 | raise RequestError, "cannot parse json [#{val.inspect}]: #{e.message}" 452 | end 453 | end 454 | 455 | def load_actual_message offset 456 | rawbody = @zmbox.read offset 457 | Heliotrope::Message.new(rawbody).parse! 458 | end 459 | 460 | def message_to_html message, next_messageid 461 | mid = message[:message_id] 462 | m = load_actual_message message[:loc] 463 | parts = m.mime_parts "text/html" 464 | 465 | ## build up the cid pointers for internal content 466 | cids = {} 467 | parts.each_with_index { |(type, fn, id, content), i| cids[id] = i if id } 468 | 469 | s = "
#{link_to_threadview message[:thread_id], "<< back to thread"}" 470 | s += " | #{link_to_messageview next_messageid, "next message in thread >>"}" if next_messageid 471 | s += "
" 472 | s += %{
From: #{link_to_search m.from.email.downcase, m.from.to_email_address}
} 473 | s += "
To: #{m.to.map { |p| link_to_search p.email.downcase, p.to_email_address }.join(", ")}
" 474 | s += "
Cc: #{m.cc.map { |p| link_to_search p.email.downcase, p.to_email_address }.join(", ")}
" unless m.cc.empty? 475 | s += "
Bcc: #{m.bcc.map { |p| link_to_search p.email.downcase, p.to_email_address }.join(", ")}
" unless m.bcc.empty? 476 | s += "
Date: #{Time.at m.date}
" 477 | s += "
Subject: #{escape_html m.subject}
" 478 | 479 | parts.each_with_index do |(type, fn, id, content), i| 480 | s += %{
} 481 | if fn 482 | s += link_to_attachment mid, i, "[attachment: #{fn} (#{type})]" 483 | if type =~ /^image\// # show a preview 484 | s += inline_image(mid, i) 485 | end 486 | else 487 | s += make_html type, content, mid, cids 488 | end 489 | end 490 | 491 | s += "
" + link_to_raw(mid, "(view raw)") + "
" 492 | s 493 | end 494 | 495 | ## this whole thing seems pretty dangerous... 496 | def make_html type, content, message_id, cids 497 | case type 498 | when /^text\/html/; cids.inject(content) { |c, (k, v)| c.gsub("cid:#{k}", inline_image_link(message_id, v)) } 499 | when /^text\/plain/; escape_html(content).gsub("\n", "
") 500 | else escape_html(content) 501 | end 502 | end 503 | 504 | MAX_PARTICIPANT_WIDTH = 40 505 | MAX_SUBJECT_WIDTH = 80 506 | 507 | def threadinfo_to_html thread 508 | participants = thread[:participants][0, 3].map do |v| 509 | p = Heliotrope::Person.from_string(v) 510 | link_to_search p.email.downcase, p.display_name 511 | end.join(", ") 512 | 513 | size = thread[:size] 514 | labels = (thread[:labels] - Heliotrope::MetaIndex::MESSAGE_STATE).map do |l| 515 | link_to_search "~#{l}", "+#{l}" 516 | end.join(" ") 517 | 518 | subject = thread[:subject][0, MAX_SUBJECT_WIDTH] 519 | subject = "(no subject)" if subject =~ /^\s*$/ 520 | subject = link_to_threadview thread[:thread_id], subject 521 | snippet = (thread[:snippet] || "?") 522 | 523 | date = escape_html Time.at(thread[:date]).strftime("%Y/%m/%d %H:%M") 524 | 525 | flags = escape_html( 526 | (thread[:state].include?("starred") ? "*" : " ") + 527 | (thread[:state].include?("unread") ? "N" : " ") + 528 | (thread[:state].include?("attachment") ? "@" : " ") + 529 | (thread[:state].include?("signed") ? "S" : " ") + 530 | (thread[:state].include?("encrypted") ? "E" : " ") 531 | ) 532 | 533 | <<-EOS 534 | 535 | #{flags} 536 |
#{participants}
537 | #{size} 538 |
#{labels} #{subject} #{escape_html snippet}
539 | #{date} 540 | 541 | EOS 542 | end 543 | 544 | def messageinfo_to_html message 545 | if message[:type] == "fake" # fake root 546 | "(an unreceived message)" 547 | else 548 | from = Heliotrope::Person.from_string message[:from] 549 | flags = 550 | (message[:state].include?("starred") ? "*" : " ") + 551 | (message[:state].include?("unread") ? "N" : " ") + 552 | (message[:state].include?("attachment") ? "@" : " ") + 553 | (message[:state].include?("signed") ? "S" : " ") + 554 | (message[:state].include?("encrypted") ? "E" : " ") 555 | date = escape_html Time.at(message[:date]).strftime("%Y/%m/%d %H:%M") 556 | subject = link_to_messageview message[:message_id], message[:subject] 557 | snippet = message[:snippet][0, MAX_SUBJECT_WIDTH - message[:subject].size] 558 | 559 | <<-EOS 560 | #{flags} 561 | #{link_to_search from.email.downcase, from.display_name} 562 | #{link_to_messageview message[:message_id], message[:subject]} 563 | #{escape_html snippet} ... 564 | #{date} 565 | EOS 566 | end 567 | end 568 | 569 | def link_to_search query, text, start=0, num=DEFAULT_SEARCH_PAGE_SIZE 570 | %{#{escape_html text}} 571 | end 572 | 573 | def link_to_threadview thread_id, text 574 | %{#{escape_html text}} 575 | end 576 | 577 | def link_to_messageview message_id, text 578 | %{#{escape_html text}} 579 | end 580 | 581 | def link_to_attachment message_id, part_id, text 582 | %{#{escape_html text}} 583 | end 584 | 585 | def link_to_raw message_id, text 586 | %{#{escape_html text}} 587 | end 588 | 589 | def inline_image_link message_id, part_id; "/message/#{message_id}/part/#{part_id}" end 590 | def inline_image message_id, part_id; %{} end 591 | 592 | def header title, query="" 593 | title = escape_html title 594 | <<-EOS 595 | Heliotrope: #{title} 596 | 597 | 613 |

#{title}

614 |
615 |
616 | #{link_to_search "~inbox", "[inbox]"} 617 | Search: 618 | 619 |
620 | EOS 621 | end 622 | 623 | def footer 624 | "" 625 | end 626 | end 627 | 628 | ### execution begins here ### 629 | 630 | opts = Trollop::options do 631 | banner < "localhost", :short => "-H" 637 | opt :port, "Port to listen on", :default => 8042 638 | opt :dir, "Base directory for all index files", :default => "." 639 | opt :mode, "Runtime mode", :default => "development" 640 | end 641 | 642 | FileUtils.mkdir_p opts.dir 643 | FileUtils.mkdir_p File.join(opts.dir, "index") 644 | store = LevelDB::DB.new File.join(opts.dir, "store") 645 | index = begin 646 | Whistlepig::Index.new File.join(opts.dir, "index", "whistlepig") 647 | rescue Whistlepig::VersionError => e 648 | puts < opts.port, :Host => opts.host 695 | -------------------------------------------------------------------------------- /lib/heliotrope/meta-index.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | require 'whistlepig' 4 | require 'leveldb' 5 | require 'set' 6 | 7 | class Array 8 | def ordered_uniq 9 | s = Set.new 10 | select { |e| !s.member?(e) && s.add(e) } 11 | end 12 | 13 | def max_by 14 | inject([nil, nil]) do |(maxe, maxv), e| 15 | v = yield e 16 | if maxv.nil? || v > maxv 17 | [e, v] 18 | else 19 | [maxe, maxv] 20 | end 21 | end.first 22 | end 23 | end 24 | 25 | module Heliotrope 26 | class MetaIndex 27 | class VersionMismatchError < StandardError 28 | attr_reader :have_version, :want_version 29 | 30 | def initialize have_version, want_version 31 | @have_version = have_version 32 | @want_version = want_version 33 | end 34 | 35 | def message 36 | "index is version #{have_version.inspect} but I am expecting #{want_version.inspect}" 37 | end 38 | end 39 | 40 | ## these are things that can be set on a per-message basis. each one 41 | ## corresponds to a particular label, but labels are propagated at the 42 | ## thread level whereas state is not. 43 | MESSAGE_MUTABLE_STATE = Set.new %w(starred unread deleted) 44 | ## flags that are set per-message but are not modifiable by the user 45 | MESSAGE_IMMUTABLE_STATE = Set.new %w(attachment signed encrypted draft sent) 46 | MESSAGE_STATE = MESSAGE_MUTABLE_STATE + MESSAGE_IMMUTABLE_STATE 47 | ## if you change any of those state things, be sure to update 48 | ## heliotrope-client as well. 49 | 50 | SNIPPET_MAX_SIZE = 100 # chars 51 | 52 | def initialize store, index, hooks, opts={} 53 | @store = store 54 | @index = index 55 | @hooks = hooks 56 | @query = nil # we always have (at most) one active query 57 | @debug = false 58 | reset_timers! 59 | check_version! if @index 60 | end 61 | 62 | def close 63 | @index.close 64 | @store.close 65 | end 66 | 67 | attr_reader :index_time, :store_time, :thread_time 68 | attr_accessor :debug 69 | 70 | def reset_timers! 71 | @index_time = @store_time = @thread_time = 0 72 | end 73 | 74 | def version; [major_version, minor_version].join(".") end 75 | def major_version; 0 end 76 | def minor_version; 1 end 77 | 78 | ## helper factory that assumes console access 79 | def self.load_or_die! store, index, hooks 80 | begin 81 | Heliotrope::MetaIndex.new store, index, hooks 82 | rescue Heliotrope::MetaIndex::VersionMismatchError => e 83 | $stderr.puts "Version mismatch error: #{e.message}." 84 | $stderr.puts "Try running #{File.dirname $0}/heliotrope-upgrade-index." 85 | abort 86 | end 87 | end 88 | 89 | def check_version! # throws a VersionMismatchError 90 | my_version = [major_version, minor_version].join(".") 91 | 92 | if @index.size == 0 93 | write_string "version", my_version 94 | else 95 | disk_version = load_string "version" 96 | raise VersionMismatchError.new(disk_version, my_version) unless my_version == disk_version 97 | end 98 | end 99 | 100 | def add_message message, state=[], labels=[], extra={} 101 | key = "docid/#{message.safe_msgid}" 102 | if contains_key? key 103 | docid = load_int key 104 | threadid = load_int "thread/#{docid}" 105 | return [docid, threadid] 106 | end 107 | 108 | state = Set.new state 109 | state &= MESSAGE_MUTABLE_STATE # filter to the only states the user can set 110 | state << "attachment" if message.has_attachment? # set any immutable state 111 | state << "signed" if message.signed? 112 | state << "encrypted" if message.encrypted? 113 | 114 | ## add message to index 115 | index_docid = index! message 116 | docid = gen_new_docid! 117 | 118 | ## write index_docid <-> docid mapping 119 | write_docid_mapping! docid, index_docid 120 | 121 | ## add message to store 122 | messageinfo = write_messageinfo! message, state, docid, extra 123 | 124 | ## build thread structure, collecting any labels from threads that have 125 | ## been joined by adding this message. 126 | threadid, thread_structure, old_labels = thread_message! message 127 | 128 | ## get the thread snippet 129 | snippet = calc_thread_snippet thread_structure 130 | 131 | ## get the thread state 132 | thread_state = merge_thread_state thread_structure 133 | 134 | ## calculate the labels 135 | labels = Set.new(labels) - MESSAGE_STATE # you can't set these 136 | labels += thread_state # but i can 137 | #labels += merge_thread_labels(thread_structure) # you can have these, though 138 | labels += old_labels # you can have these, though 139 | 140 | ## write thread to store 141 | threadinfo = write_threadinfo! threadid, thread_structure, labels, thread_state, snippet 142 | 143 | ## add labels to every message in the thread (for search to work) 144 | write_thread_message_labels! thread_structure, labels 145 | 146 | ## add the labels to the set of all labels we've ever seen 147 | add_labels_to_labellist! labels 148 | 149 | ## congrats, you have a doc and a thread! 150 | [docid, threadid] 151 | end 152 | 153 | ## add or update a contact 154 | def touch_contact! contact, timestamp=Time.now.to_i 155 | old_record = load_hash "c/#{contact.email.downcase}" 156 | if (old_record[:timestamp] || 0) < timestamp 157 | record = { :name => contact.name, :email => contact.email, :timestamp => timestamp } 158 | write_hash "c/#{contact.email.downcase}", record 159 | write_hash "c/#{contact.name.downcase}", record if contact.name 160 | old_record[:timestamp].nil? # return true if it's a brand-new record 161 | end 162 | end 163 | 164 | def contacts opts={} 165 | num = opts[:num] || 20 166 | prefix = opts[:prefix] 167 | 168 | iter = if prefix 169 | prefix = prefix.downcase.gsub("/", "") # oh yeah 170 | @store.each(:from => "c/#{prefix}", :to => "c/#{prefix}~") # ~ is the largest character ha ha ha :( :( :( 171 | else 172 | @store.each(:from => "c/") 173 | end 174 | 175 | iter.take(num).map { |k, v| load_hash k } 176 | end 177 | 178 | ## returns the new message state 179 | def update_message_state docid, state 180 | state = Set.new(state) & MESSAGE_MUTABLE_STATE 181 | 182 | changed, new_state = really_update_message_state docid, state 183 | if changed 184 | threadid = load_int "threadid/#{docid}" 185 | threadinfo = load_hash "thread/#{threadid}" 186 | rebuild_all_thread_metadata threadid, threadinfo 187 | end 188 | 189 | new_state 190 | end 191 | 192 | def update_thread_state threadid, state 193 | state = Set.new(state) & MESSAGE_MUTABLE_STATE 194 | 195 | threadinfo = load_hash "thread/#{threadid}" 196 | docids = threadinfo[:structure].flatten.select { |id| id > 0 } 197 | 198 | changed = false 199 | docids.each do |docid| 200 | this_changed, _ = really_update_message_state docid, state 201 | changed ||= this_changed 202 | end 203 | 204 | if changed 205 | threadinfo = rebuild_all_thread_metadata threadid, threadinfo 206 | else 207 | load_set "tstate/#{threadid}" 208 | end 209 | end 210 | 211 | def update_thread_labels threadid, labels 212 | labels = Set.new(labels) - MESSAGE_STATE 213 | 214 | ## add the labels to the set of all labels we've ever seen. do this 215 | ## first because it also does some validation. 216 | add_labels_to_labellist! labels 217 | 218 | key = "tlabels/#{threadid}" 219 | old_tlabels = load_set key 220 | new_tlabels = (old_tlabels & MESSAGE_STATE) + labels 221 | write_set key, new_tlabels 222 | 223 | threadinfo = load_hash "thread/#{threadid}" 224 | write_thread_message_labels! threadinfo[:structure], new_tlabels 225 | 226 | new_tlabels 227 | end 228 | 229 | def contains_safe_msgid? safe_msgid; contains_key? "docid/#{safe_msgid}" end 230 | 231 | def fetch_docid_for_safe_msgid safe_msgid 232 | key = "docid/#{safe_msgid}" 233 | if contains_key? key 234 | docid = load_int key 235 | threadid = load_int "threadid/#{docid}" 236 | [docid, threadid] 237 | end 238 | end 239 | 240 | def size; @index.size end 241 | 242 | def set_query query 243 | @index.teardown_query @query.whistlepig_q if @query # new query, drop old one 244 | @query = query 245 | @index.setup_query @query.whistlepig_q 246 | @seen_threads = {} 247 | end 248 | 249 | def reset_query! 250 | @index.teardown_query @query.whistlepig_q 251 | @index.setup_query @query.whistlepig_q 252 | @seen_threads = {} 253 | end 254 | 255 | def get_some_results num 256 | return [] unless @query 257 | 258 | startt = Time.now 259 | threadids = [] 260 | until threadids.size >= num 261 | index_docid = @index.run_query(@query.whistlepig_q, 1).first 262 | break unless index_docid 263 | doc_id, thread_id = get_thread_id_from_index_docid index_docid 264 | next if @seen_threads[thread_id] 265 | @seen_threads[thread_id] = true 266 | threadids << thread_id 267 | end 268 | 269 | loadt = Time.now 270 | results = threadids.map { |id| load_threadinfo id } 271 | endt = Time.now 272 | #printf "# search %.1fms, load %.1fms\n", 1000 * (loadt - startt), 1000 * (endt - startt) 273 | results 274 | end 275 | 276 | def load_threadinfo threadid 277 | h = load_thread(threadid) or return 278 | h.merge! :thread_id => threadid, 279 | :state => load_set("tstate/#{threadid}"), 280 | :labels => load_set("tlabels/#{threadid}"), 281 | :snippet => load_string("tsnip/#{threadid}"), 282 | :unread_participants => load_set("turps/#{threadid}") 283 | end 284 | 285 | def load_messageinfo docid 286 | key = "doc/#{docid}" 287 | return unless contains_key? key 288 | h = load_hash key 289 | h.merge :state => load_set("state/#{docid}"), 290 | :labels => load_set("mlabels/#{docid}"), 291 | :thread_id => load_int("threadid/#{docid}"), 292 | :snippet => load_string("msnip/#{docid}"), 293 | :message_id => docid 294 | end 295 | 296 | def load_thread_messageinfos threadid 297 | h = load_thread(threadid) or return 298 | load_structured_messageinfo h[:structure] 299 | end 300 | 301 | def count_results 302 | startt = Time.now 303 | thread_ids = Set.new 304 | query = @query.clone 305 | @index.setup_query query.whistlepig_q 306 | begin 307 | while true 308 | docids = @index.run_query query.whistlepig_q, 1000 309 | docids.each do |index_docid| 310 | doc_id, thread_id = get_thread_id_from_index_docid index_docid 311 | thread_ids << thread_id 312 | end 313 | break if docids.size < 1000 314 | end 315 | elapsed = Time.now - startt 316 | ensure 317 | @index.teardown_query query.whistlepig_q 318 | end 319 | thread_ids.size 320 | end 321 | 322 | def all_labels 323 | load_set "labellist" 324 | end 325 | 326 | ## expensive! runs a query for each label and sees if there are any docs for 327 | ## it 328 | def prune_labels! 329 | pruned_labels = all_labels.reject do |l| 330 | query = Whistlepig::Query.new "body", "~#{l}" 331 | @index.setup_query query 332 | docids = begin 333 | @index.run_query query, 1 334 | ensure 335 | @index.teardown_query query 336 | end 337 | 338 | docids.empty? 339 | end 340 | 341 | write_set "labellist", pruned_labels 342 | end 343 | 344 | def indexable_text_for thing 345 | orig = thing.indexable_text 346 | transformed = @hooks.run "transform-text", :text => orig 347 | transformed = Decoder.encode_as_utf8 transformed 348 | transformed || orig 349 | end 350 | 351 | def write_docid_mapping! store_docid, index_docid 352 | write_int "i2s/#{index_docid}", store_docid # redirect index to store 353 | write_int "s2i/#{store_docid}", index_docid # reidrect store to index 354 | end 355 | 356 | private 357 | 358 | def get_thread_id_from_index_docid index_docid 359 | store_docid = load_int("i2s/#{index_docid}") 360 | thread_id = load_int "threadid/#{store_docid}" 361 | raise "no thread_id for doc #{store_docid.inspect} (index doc #{index_docid.inspect})" unless thread_id # your index is corrupt! 362 | [store_docid, thread_id] 363 | end 364 | 365 | def get_index_docid_from_store_docid store_docid 366 | load_int "s2i/#{store_docid}" 367 | end 368 | 369 | 370 | def gen_new_docid! 371 | v = load_int("next_docid") || 1 372 | write_int "next_docid", v + 1 373 | v 374 | end 375 | 376 | def is_valid_whistlepig_token? l 377 | # copy logic from whistlepig's query-parser.lex 378 | l =~ /^[^\(\)"\-~:\*][^\(\)":]*$/ 379 | end 380 | 381 | def really_update_message_state docid, state 382 | ## update message state 383 | key = "state/#{docid}" 384 | old_mstate = load_set key 385 | new_mstate = (old_mstate - MESSAGE_MUTABLE_STATE) + state 386 | 387 | changed = new_mstate != old_mstate 388 | write_set key, new_mstate if changed 389 | [changed, new_mstate] 390 | end 391 | 392 | ## rebuild snippet, labels, read/unread participants, etc. for a 393 | ## thread. useful if something about one of the thread's messages has 394 | ## changed. 395 | ## 396 | ## returns the new thread state 397 | def rebuild_all_thread_metadata threadid, threadinfo 398 | ## recalc thread snippet 399 | key = "tsnip/#{threadid}" 400 | old_snippet = load_string key 401 | new_snippet = calc_thread_snippet threadinfo[:structure] 402 | if old_snippet != new_snippet 403 | write_string key, new_snippet 404 | end 405 | 406 | ## recalc thread state and labels 407 | old_tstate = load_set "tstate/#{threadid}" 408 | new_tstate = merge_thread_state threadinfo[:structure] 409 | new_tlabels = nil 410 | 411 | if new_tstate != old_tstate 412 | write_set "tstate/#{threadid}", new_tstate 413 | 414 | ## update thread labels 415 | key = "tlabels/#{threadid}" 416 | old_tlabels = load_set key 417 | new_tlabels = (old_tlabels - MESSAGE_MUTABLE_STATE) + new_tstate 418 | write_set key, new_tlabels 419 | 420 | write_thread_message_labels! threadinfo[:structure], new_tlabels 421 | end 422 | 423 | ## recalc the unread participants 424 | docids = threadinfo[:structure].flatten.select { |x| x > 0 } 425 | messages = docids.map { |id| load_hash("doc/#{id}") } 426 | states = docids.map { |id| load_hash("state/#{id}") } 427 | 428 | write_unread_participants! threadid, messages, states 429 | 430 | new_tstate 431 | end 432 | 433 | def write_unread_participants! threadid, messages, states 434 | unread_participants = messages.zip(states).map do |m, state| 435 | m[:from] if state.member?("unread") 436 | end.compact.to_set 437 | write_set "turps/#{threadid}", unread_participants 438 | end 439 | 440 | class InvalidLabelError < StandardError 441 | def initialize label 442 | super "#{label} is an invalid label" 443 | end 444 | end 445 | 446 | def add_labels_to_labellist! labels 447 | labels.each { |l| raise InvalidLabelError, l unless is_valid_whistlepig_token?(l) } 448 | key = "labellist" 449 | labellist = load_set key 450 | labellist_new = labellist + labels.select { |l| is_valid_whistlepig_token? l } 451 | write_set key, labellist_new unless labellist == labellist_new 452 | end 453 | 454 | def calc_thread_snippet thread_structure 455 | docids = thread_structure.flatten.select { |id| id > 0 } 456 | first_unread = docids.find { |docid| load_set("state/#{docid}").member?("unread") } 457 | load_string("msnip/#{first_unread || docids.first}") 458 | end 459 | 460 | ## get the state for a thread by merging the state from each message 461 | def merge_thread_state thread_structure 462 | thread_structure.flatten.inject(Set.new) do |set, docid| 463 | set + (docid < 0 ? [] : load_set("state/#{docid}")) 464 | end 465 | end 466 | 467 | ## get the labels for a thread by merging the labels from each message 468 | def merge_thread_labels thread_structure 469 | thread_structure.flatten.inject(Set.new) do |set, docid| 470 | set + (docid < 0 ? [] : load_set("mlabels/#{docid}")) 471 | end 472 | end 473 | 474 | ## sync labels to all messages within the thread. necessary if you want 475 | ## search to work properly. 476 | def write_thread_message_labels! thread_structure, labels 477 | thread_structure.flatten.each do |docid| 478 | next if docid < 0 # psuedo-root 479 | key = "mlabels/#{docid}" 480 | oldlabels = load_set key 481 | write_set key, labels 482 | 483 | ## write to index 484 | index_docid = get_index_docid_from_store_docid docid 485 | (oldlabels - labels).each do |l| 486 | puts "; removing ~#{l} from #{index_docid} (store #{docid})" if @debug 487 | @index.remove_label index_docid, l 488 | end 489 | (labels - oldlabels).each do |l| 490 | puts "; adding ~#{l} to #{index_docid} (store #{docid})" if @debug 491 | @index.add_label index_docid, l 492 | end 493 | end 494 | end 495 | 496 | def load_structured_messageinfo thread_structure, level=0 497 | id, *children = thread_structure 498 | doc = if id < 0 499 | {:type => "fake"} 500 | else 501 | load_messageinfo(id) 502 | end 503 | 504 | children.inject([[doc, level]]) { |a, c| a + load_structured_messageinfo(c, level + 1) } 505 | end 506 | 507 | def load_thread threadid 508 | key = "thread/#{threadid}" 509 | return unless contains_key? key 510 | load_hash key 511 | end 512 | 513 | ## given a single message, which contains a (partial) path from it to an 514 | ## ancestor (which itself may or may not be the root), build up the thread 515 | ## structures. doesn't hit the search index, just the kv store. 516 | def thread_message! message 517 | startt = Time.now 518 | 519 | ## build the path of msgids from leaf to ancestor 520 | ids = [message.safe_msgid] + message.safe_refs.reverse 521 | seen = {} 522 | ids = ids.map { |x| seen[x] = true && x unless seen[x] }.compact 523 | 524 | ## write parent/child relationships 525 | if ids.size > 1 526 | ids[0 .. -2].zip(ids[1 .. -1]).each do |id, parent| 527 | pkey = "pmsgid/#{id}" 528 | next if contains_key? pkey # don't overwrite--potential for mischief? 529 | write_string pkey, parent 530 | 531 | ckey = "cmsgids/#{parent}" 532 | v = load_set(ckey) 533 | v << id 534 | write_set ckey, v 535 | end 536 | end 537 | 538 | ## find the root of the whole thread 539 | root = ids.first 540 | seen = {} # guard against loops 541 | while(id = load_string("pmsgid/#{root}")) 542 | #puts "parent of #{root} is #{id}" 543 | break if seen[id]; seen[id] = true 544 | root = id 545 | end 546 | 547 | ## get the thread structure in terms of docids docs we've actually seen. 548 | ## generate psuedo-docids to join trees with parents we haven't seen yet 549 | ## when necessary. 550 | thread_structure = build_thread_structure_from root 551 | #puts "thread structure is #{thread_structure.inspect}" 552 | threadid = thread_structure.first # might actually be a psuedo-docid 553 | #puts "root msgid is #{root.inspect}, root docid is #{threadid}" 554 | 555 | ## if any of these docs are roots of old threads, delete those old threads, 556 | ## but keep track of all the labels we've seen 557 | old_labels = thread_structure.flatten.inject(Set.new) do |labels, id| 558 | tkey = "thread/#{id}" 559 | labels + if contains_key? tkey 560 | lkey = "tlabels/#{id}" 561 | v = load_set lkey 562 | @store.delete lkey 563 | @store.delete tkey 564 | v 565 | else 566 | Set.new 567 | end 568 | end 569 | 570 | ## write the thread ids for all documents. we need this at search time to 571 | ## do the message->thread mapping. 572 | thread_structure.flatten.each do |id| 573 | next if id < 0 # pseudo root 574 | write_int "threadid/#{id}", threadid 575 | end 576 | 577 | @thread_time += (Time.now - startt) 578 | [threadid, thread_structure, old_labels] 579 | end 580 | 581 | ## builds an array representation of the thread, filling in only those 582 | ## messages that we actually have in the store, and making psuedo-message 583 | ## roots for the cases when we have seen multiple children but not the 584 | ## parent. 585 | def build_thread_structure_from safe_msgid, seen={} 586 | return nil if seen[safe_msgid] 587 | 588 | docid = load_int "docid/#{safe_msgid}" 589 | children = load_set "cmsgids/#{safe_msgid}" 590 | #puts "> children of #{msgid} are #{children.inspect}" 591 | 592 | seen[safe_msgid] = true 593 | child_thread_structures = children.map { |c| build_thread_structure_from(c, seen) }.compact 594 | 595 | #puts "< bts(#{msgid}): docid=#{docid.inspect}, child_structs=#{child_thread_structures.inspect}" 596 | if docid 597 | if child_thread_structures.empty? 598 | [docid.to_i] 599 | else 600 | [docid.to_i] + child_thread_structures 601 | end 602 | else 603 | case child_thread_structures.size 604 | when 0; nil 605 | when 1; child_thread_structures.first 606 | else # need to make a psuedo root 607 | psuedo_root = -child_thread_structures.first.first # weird? 608 | [psuedo_root] + child_thread_structures 609 | end 610 | end 611 | end 612 | 613 | def write_threadinfo! threadid, thread_structure, labels, state, snippet 614 | subject = date = from = to = has_attachment = nil 615 | 616 | docids = thread_structure.flatten.select { |x| x > 0 } 617 | messages = docids.map { |id| load_hash("doc/#{id}") } 618 | states = docids.map { |id| load_hash("state/#{id}") } 619 | 620 | participants = messages.map { |m| m[:from] }.ordered_uniq 621 | direct_recipients = messages.map { |m| m[:to] }.flatten.to_set 622 | indirect_recipients = messages.map { |m| m[:cc] }.flatten.to_set 623 | 624 | first_message = messages.first # just take the root 625 | last_message = messages.max_by { |m| m[:date] } 626 | 627 | threadinfo = { 628 | :subject => first_message[:subject], 629 | :date => last_message[:date], 630 | :participants => participants, 631 | :direct_recipients => direct_recipients, 632 | :indirect_recipients => indirect_recipients, 633 | :size => docids.size, 634 | :structure => thread_structure, 635 | } 636 | 637 | write_hash "thread/#{threadid}", threadinfo 638 | write_set "tlabels/#{threadid}", labels 639 | write_set "tstate/#{threadid}", state 640 | write_string "tsnip/#{threadid}", snippet 641 | 642 | write_unread_participants! threadid, messages, states 643 | 644 | threadinfo 645 | end 646 | 647 | def index! message 648 | ## make the entry 649 | startt = Time.now 650 | entry = Whistlepig::Entry.new 651 | entry.add_string "from", indexable_text_for(message.from).downcase 652 | entry.add_string "to", message.recipients.map { |x| indexable_text_for x }.join(" ").downcase 653 | entry.add_string "subject", message.subject.downcase 654 | entry.add_string "date", message.date.to_s 655 | entry.add_string "body", indexable_text_for(message).downcase 656 | @index_time += Time.now - startt 657 | 658 | @index.add_entry entry 659 | end 660 | 661 | def write_messageinfo! message, state, docid, extra 662 | ## write it to the store 663 | startt = Time.now 664 | messageinfo = { 665 | :subject => message.subject, 666 | :date => message.date, 667 | :from => message.from.to_email_address, 668 | :to => message.direct_recipients.map { |x| x.to_email_address }, 669 | :cc => message.indirect_recipients.map { |x| x.to_email_address }, 670 | :has_attachment => message.has_attachment?, 671 | }.merge extra 672 | 673 | ## add it to the store 674 | write_hash "doc/#{docid}", messageinfo 675 | write_set "state/#{docid}", state 676 | write_int "docid/#{message.safe_msgid}", docid 677 | write_string "msnip/#{docid}", message.snippet[0, SNIPPET_MAX_SIZE] 678 | @store_time += Time.now - startt 679 | 680 | messageinfo 681 | end 682 | 683 | ## storing stuff is tricky 684 | ## 685 | ## strings can be stored directly but they MUST be marked (via 686 | ## #force_encoding) as binary, otherwise OklahomerMixer will truncate (!!!) 687 | ## #them if they contain any super-ASCII characters. (we could marshal 688 | ## #strings, but it costs quite a few bytes.) 689 | ## 690 | ## other objects are just marshalled, which is fine, and in ruby 1.9, string 691 | ## encodings will be preserved. HOWEVER, we need to recursively find all 692 | ## strings and mark them as utf-8 anyways, since they might've been 693 | ## marshalled by a 1.8 process, in which case they will come back as binary. 694 | ## 695 | ## once the entire world is safely in 1.9 and we never have a chance of 696 | ## someone first using 1.8, then switching to 1.9, we can remove some of this 697 | ## sillyness. 698 | 699 | STORE_ENCODING = Encoding::UTF_8 if Decoder.in_ruby19_hell? 700 | 701 | def munge o 702 | return o unless Decoder.in_ruby19_hell? 703 | case o 704 | when String; o.dup.force_encoding STORE_ENCODING 705 | when Hash; o.each { |k, v| o[k] = munge(v) } 706 | when Set; Set.new(o.map { |e| munge(e) }) 707 | when Array; o.map { |e| munge(e) } 708 | else; o 709 | end 710 | end 711 | 712 | def protect_string s 713 | if Decoder.in_ruby19_hell? 714 | s.force_encoding "binary" 715 | else 716 | s 717 | end 718 | end 719 | 720 | def load_string key; munge(@store[key]) end 721 | def write_string key, value 722 | puts "; #{key} => #{value.inspect}" if @debug 723 | @store[key] = protect_string(value.to_s) 724 | end 725 | 726 | def load_array key; @store.member?(key) ? munge(Marshal.load(@store[key])) : [] end 727 | def write_array key, value 728 | puts "; #{key} => #{value.inspect}" if @debug 729 | @store[key] = Marshal.dump(value.to_a) 730 | end 731 | 732 | def load_hash key; @store.member?(key) ? munge(Marshal.load(@store[key])) : {} end 733 | def write_hash key, value 734 | puts "; #{key} => #{value.inspect}" if @debug 735 | @store[key] = Marshal.dump(value.to_hash) 736 | end 737 | 738 | def load_int key; @store.member?(key) ? Marshal.load(@store[key]) : nil end 739 | def write_int key, value 740 | puts "; #{key} => #{value.inspect}" if @debug 741 | @store[key] = Marshal.dump(value.to_i) 742 | end 743 | 744 | def load_set key; @store.member?(key) ? munge(Set.new(Marshal.load(@store[key]))) : Set.new end 745 | def write_set key, value 746 | puts "; #{key} => #{value.inspect}" if @debug 747 | @store[key] = Marshal.dump(value.to_set.to_a) 748 | end 749 | 750 | def contains_key? key; @store.member? key end 751 | end 752 | end 753 | --------------------------------------------------------------------------------