├── .gitignore ├── Gemfile ├── bin └── duplitector ├── lib ├── duplitector │ ├── elasticsearch_id_generator.rb │ ├── normalizer.rb │ ├── stats.rb │ ├── client_wrapper.rb │ ├── test_data.rb │ ├── org_helper.rb │ ├── quality_measurer.rb │ ├── query_builder.rb │ ├── mapping_provider.rb │ └── deduplicator.rb └── duplitector.rb ├── Gemfile.lock ├── LICENSE.md ├── README.md └── data └── testdata.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'stretcher' 4 | gem 'trollop' -------------------------------------------------------------------------------- /bin/duplitector: -------------------------------------------------------------------------------- 1 | ruby lib/duplitector.rb --url 'http://localhost:9200' --index 'duplitector' --threshold 1.0 -v -------------------------------------------------------------------------------- /lib/duplitector/elasticsearch_id_generator.rb: -------------------------------------------------------------------------------- 1 | class ElasticsearchIdGenerator 2 | @@id = 0 3 | 4 | def self.get_next 5 | @@id += 1 6 | end 7 | end -------------------------------------------------------------------------------- /lib/duplitector/normalizer.rb: -------------------------------------------------------------------------------- 1 | class Normalizer 2 | 3 | # works in-place 4 | def normalize(org) 5 | handle_no_values org 6 | end 7 | 8 | private 9 | def handle_no_values(org) 10 | org.each { |k, v| org[k] = v.strip unless v.nil? }.delete_if { |k, v| v.nil? or v.empty? } 11 | end 12 | end -------------------------------------------------------------------------------- /lib/duplitector/stats.rb: -------------------------------------------------------------------------------- 1 | class Stats 2 | 3 | def initialize 4 | @not_duplicated = 0 5 | @duplicated = 0 6 | end 7 | 8 | def not_duplicate 9 | @not_duplicated += 1 10 | end 11 | 12 | def duplicate 13 | @duplicated += 1 14 | end 15 | 16 | def to_s 17 | "Organizations created: #{@not_duplicated}, Organizations resolved as duplicates: #{@duplicated}" 18 | end 19 | end -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | excon (0.16.10) 5 | faraday (0.8.7) 6 | multipart-post (~> 1.1) 7 | faraday_middleware (0.9.0) 8 | faraday (>= 0.7.4, < 0.9) 9 | faraday_middleware-multi_json (0.0.5) 10 | faraday_middleware 11 | multi_json 12 | hashie (2.0.5) 13 | multi_json (1.7.4) 14 | multipart-post (1.2.0) 15 | stretcher (1.18.1) 16 | excon (>= 0.16) 17 | faraday (~> 0.8) 18 | faraday_middleware (~> 0.9.0) 19 | faraday_middleware-multi_json (~> 0.0.5) 20 | hashie (>= 1.2.0) 21 | multi_json (>= 1.0) 22 | trollop (2.0) 23 | 24 | PLATFORMS 25 | ruby 26 | 27 | DEPENDENCIES 28 | stretcher 29 | trollop 30 | -------------------------------------------------------------------------------- /lib/duplitector/client_wrapper.rb: -------------------------------------------------------------------------------- 1 | class ClientWrapper 2 | 3 | def initialize(url, index, log) 4 | @log = log 5 | @idx = index 6 | @server = Stretcher::Server.new(url, logger: log) 7 | 8 | # Prepare an empty index 9 | @server.index(@idx).delete rescue nil 10 | @server.index(@idx).create() 11 | sleep 1 12 | end 13 | 14 | def create_mapping(mapping) 15 | @server.index(@idx).type(@idx).put_mapping(mapping) 16 | end 17 | 18 | def put(id, document) 19 | @server.index(@idx).type(@idx).put(id, document) 20 | end 21 | 22 | def search(query) 23 | @server.index(@idx).type(@idx).search({}, query) 24 | end 25 | 26 | def all 27 | @server.index(@idx).type(@idx).search({ size: 1000000 }, nil) 28 | end 29 | 30 | def refresh 31 | @server.index(@idx).refresh 32 | end 33 | end -------------------------------------------------------------------------------- /lib/duplitector/test_data.rb: -------------------------------------------------------------------------------- 1 | require_relative 'org_helper' 2 | 3 | class TestData 4 | 5 | def initialize(log) 6 | @log = log 7 | end 8 | 9 | def read_organizations(filename, n = nil) 10 | 11 | def limits_exceeded(counter, n) 12 | !n.nil? && counter >= n 13 | end 14 | 15 | def prepare_test_data(filename) 16 | file = File.new(filename, 'r') 17 | # skip headers: 18 | file.gets 19 | file 20 | end 21 | 22 | organizations = [] 23 | 24 | file = prepare_test_data filename 25 | 26 | counter = 0 27 | while !limits_exceeded(counter, n) and line = file.gets 28 | org = OrgHelper.from_array(line.split(/\t/)) 29 | @log.debug "Entry parsed: #{org}" 30 | 31 | organizations.push org 32 | counter += 1 33 | end 34 | 35 | file.close 36 | 37 | organizations 38 | end 39 | end -------------------------------------------------------------------------------- /lib/duplitector/org_helper.rb: -------------------------------------------------------------------------------- 1 | class OrgHelper 2 | @@keys = %w( id name type address city state postal_code country gov_id1 gov_id2 gov_id3 url telephone fax mail 3 | date_updated group_id ) 4 | 5 | def self.from_array(array) 6 | @@keys.zip(array).inject({}) do |k, v| 7 | k.merge!({ v[0] => v[1] }) 8 | end 9 | end 10 | 11 | # e.g. { id: "abc" } becomes { id: [ "abc" ] } 12 | def self.with_attributes_as_arrays(org) 13 | Hash[org.map { |k, v| [ k, [v] ] } ] 14 | end 15 | 16 | def self.merge(hash1, hash2) 17 | hash1.merge(hash2) do |key, a, b| 18 | if a.kind_of?(Array) then 19 | # merging duplicate keys is about putting all their values to a common array 20 | # e.g. given: 21 | # hash1 = { id: [ "1" ], name: [ "abc" ], es_id: 3 } 22 | # hash2 = { id: [ "1" ], name: [ "def" ] } 23 | # output is { id: [ "1", "1" ], name: [ "abc", "def" ], es_id: 3 } 24 | a.push(*b) 25 | else 26 | a != nil ? a : b 27 | end 28 | end 29 | end 30 | 31 | end -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Paweł Rychlik 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/duplitector/quality_measurer.rb: -------------------------------------------------------------------------------- 1 | class QualityMeasurer 2 | 3 | def initialize(client, log) 4 | @client = client 5 | @log = log 6 | end 7 | 8 | # There are three types of duplicate resolution states: 9 | # OK - a duplicate group resolved successfully; 10 | # FAIL - a duplicate not detected, but not wrongly associated with another duplicate group; still better than ERROR; 11 | # ERROR - a duplicate was erroneously associated with another duplicate group; should be avoided by all means. 12 | def measure 13 | ok = 0 14 | fail = 0 15 | error = 0 16 | 17 | query = { 18 | query: { match_all: {} }, 19 | size: 1000000, 20 | facets: { 21 | group_id: { 22 | terms: { 23 | field: 'group_id', 24 | size: 1000000 25 | } 26 | } 27 | } 28 | } 29 | 30 | results = @client.search(query) 31 | 32 | results.facets.group_id.terms.each { |facet| 33 | status = facet['count'] == 1 34 | if status then ok += 1 else fail += 1 end 35 | 36 | @log.info "#{status ? 'OK' : 'FAIL'}: \"group_id\"=>\"#{facet['term']}\" assigned to #{facet['count']} orgs." 37 | } 38 | 39 | results.results.each { |org| 40 | status = org['group_id'].uniq.length == 1 41 | error += 1 unless status 42 | @log.info "ERROR: Org[\"es_id\"=>#{org['es_id']}] has more than one \"group_id\"=>#{org['group_id']}." unless status 43 | } 44 | 45 | @log.info "Duplicate resolution: OK=#{ok}, FAIL=#{fail}, ERROR=#{error}." 46 | end 47 | end -------------------------------------------------------------------------------- /lib/duplitector/query_builder.rb: -------------------------------------------------------------------------------- 1 | class QueryBuilder 2 | 3 | def self.detect_duplicates_of(org_flat) 4 | { 5 | query: v2(org_flat), 6 | size: 5 7 | } 8 | end 9 | 10 | private 11 | 12 | def self.v2(org) 13 | { 14 | bool: { 15 | should: [ 16 | { 17 | fuzzy_like_this: { 18 | like_text: org['name'] || '', 19 | fields: %w(name), 20 | min_similarity: 0.5, 21 | boost: 5.0 22 | } 23 | }, 24 | { 25 | term: { 26 | gov_id1: { 27 | term: org['gov_id1'] || '', 28 | boost: 10.0 29 | } 30 | } 31 | }, 32 | { 33 | fuzzy_like_this: { 34 | like_text: org['city'] || '', 35 | fields: %w(city), 36 | min_similarity: 0.7, 37 | boost: 4.0 38 | } 39 | }, 40 | { 41 | match_phrase: { 42 | state: { 43 | query: org['state'] || '', 44 | boost: 4.0 45 | } 46 | } 47 | }, 48 | ], 49 | must: [], 50 | minimum_should_match: 2 51 | } 52 | } 53 | end 54 | 55 | end -------------------------------------------------------------------------------- /lib/duplitector/mapping_provider.rb: -------------------------------------------------------------------------------- 1 | class MappingProvider 2 | 3 | def mapping 4 | { 5 | temp: { 6 | properties: { 7 | address1: { 8 | type: "string" 9 | }, 10 | address2: { 11 | type: "string" 12 | }, 13 | city: { 14 | type: "string" 15 | }, 16 | country: { 17 | type: "string" 18 | }, 19 | date_updated: { 20 | type: "string" 21 | }, 22 | es_id: { 23 | type: "long", 24 | index: "not_analyzed" 25 | }, 26 | fax: { 27 | type: "string" 28 | }, 29 | gov_id1: { 30 | type: "string", 31 | index: "not_analyzed" 32 | }, 33 | gov_id2: { 34 | type: "string", 35 | index: "not_analyzed" 36 | }, 37 | gov_id3: { 38 | type: "string", 39 | index: "not_analyzed" 40 | }, 41 | id: { 42 | type: "string", 43 | index: "not_analyzed" 44 | }, 45 | mail: { 46 | type: "string" 47 | }, 48 | name: { 49 | type: "string" 50 | }, 51 | postal_code: { 52 | type: "string" 53 | }, 54 | state: { 55 | type: "string" 56 | }, 57 | telephone: { 58 | type: "string" 59 | }, 60 | url: { 61 | type: "string" 62 | } 63 | } 64 | } 65 | } 66 | end 67 | end -------------------------------------------------------------------------------- /lib/duplitector/deduplicator.rb: -------------------------------------------------------------------------------- 1 | require_relative 'org_helper' 2 | require_relative 'elasticsearch_id_generator' 3 | require_relative 'query_builder' 4 | require_relative 'stats' 5 | require_relative 'mapping_provider' 6 | require_relative 'client_wrapper' 7 | 8 | class Deduplicator 9 | 10 | def initialize(stats, log, client, score_cut_off) 11 | @stats = stats 12 | @log = log 13 | @client = client 14 | @score_cut_off = score_cut_off 15 | 16 | @client.create_mapping MappingProvider.new.mapping 17 | end 18 | 19 | def dedupe(org_flat) 20 | @log.info "Processing: #{org_flat}" 21 | 22 | query = QueryBuilder.detect_duplicates_of org_flat 23 | @log.debug "Query: #{query.to_json}" 24 | 25 | search_response = @client.search query 26 | @log.debug "Search response: #{search_response}" 27 | 28 | org = OrgHelper.with_attributes_as_arrays org_flat 29 | 30 | if (duplicate = is_duplicate(search_response)) 31 | # merge duplicate with existing organization 32 | merged = OrgHelper.merge(org, duplicate) 33 | @client.put(merged['es_id'], merged) 34 | @log.info "Merged duplicates into an existing organization: #{merged}" 35 | @stats.duplicate 36 | else 37 | # save as a new organization: 38 | id = org['es_id'] = ElasticsearchIdGenerator.get_next 39 | @client.put(id, org) 40 | @log.info "Created new organization: #{org}" 41 | @stats.not_duplicate 42 | end 43 | 44 | @client.refresh 45 | end 46 | 47 | def is_duplicate(response) 48 | results = response.results.sort_by(&:_score) 49 | duplicates = results.select { |item| item._score > @score_cut_off } 50 | if duplicates.empty? 51 | @log.info "No potential duplicates found. Highest score: #{results.first._score unless results.empty?}" 52 | nil 53 | else 54 | duplicate = duplicates.first 55 | @log.info "Found potential duplicates. Highest score: #{duplicate._score}" 56 | # take the one with the highest score, process it, return it 57 | duplicate.to_hash.select { |key, value| key.match(/^[^_]/) } 58 | end 59 | end 60 | end -------------------------------------------------------------------------------- /lib/duplitector.rb: -------------------------------------------------------------------------------- 1 | require 'stretcher' 2 | require 'trollop' 3 | 4 | require_relative 'duplitector/test_data' 5 | require_relative 'duplitector/deduplicator' 6 | require_relative 'duplitector/stats' 7 | require_relative 'duplitector/normalizer' 8 | require_relative 'duplitector/client_wrapper' 9 | require_relative 'duplitector/quality_measurer' 10 | 11 | opts = Trollop::options do 12 | # The file to read the test data from. 13 | # File contains tab-separated values describing organizations to be processed, the first row being an ignored header. 14 | # Note that OrgHelper.keys defines the columns meaning, and these keys are reused in QueryBuilder and MappingProvider. 15 | # The last column 'group_id' has a special meaning - every two entries that have the same group_id form a pair of 16 | # a unique organization and its duplicate. Based on that, it is possible to measure the quality of elasticsearch 17 | # queries used for finding duplicates, and gather overall statistics. 18 | opt :filename, 'Path to test-data filename', default: 'data/testdata.txt' 19 | # nil means all 20 | opt :count, 'Number of test entries to process', type: Integer, default: nil 21 | opt :url, 'URL to elasticsearch server', default: 'http://localhost:9200' 22 | # Evaluating whether a search query result is a duplicate of a given organization is based on a static cut-off 23 | # threshold. It's an essential modifier to the deduplication algorithm. 24 | opt :threshold, 'elasticsearch scoring threshold for differentiating between a duplicate and a unique item', 25 | default: 1.0 26 | opt :index, 'Name of elasticsearch index to use', default: 'duplitector' 27 | opt :verbose, 'Prints more information' 28 | end 29 | 30 | log = Logger.new(STDOUT) 31 | # use DEBUG for more detailed log information 32 | log.level = if opts[:verbose] then Logger::DEBUG else Logger::INFO end 33 | log.datetime_format = "%H:%M:%S" 34 | 35 | organizations = TestData.new(log).read_organizations opts[:filename], opts[:count] 36 | log.info '' 37 | 38 | client = ClientWrapper.new(opts[:url], opts[:index], log) 39 | stats = Stats.new 40 | deduplicator = Deduplicator.new(stats, log, client, opts[:threshold]) 41 | normalizer = Normalizer.new 42 | 43 | organizations.each do |org| 44 | normalizer.normalize org 45 | deduplicator.dedupe org 46 | log.info '' 47 | end 48 | 49 | QualityMeasurer.new(client, log).measure 50 | 51 | log.info '' 52 | log.info "Done. Stats: #{stats}" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | duplitector 2 | =========== 3 | 4 | A duplicate data detector engine based on Elasticsearch. It's been successfully used as a proof of concept, piloting a full-blown enterprize solution. 5 | 6 | Context 7 | ======= 8 | 9 | In certain systems we have to deal with lots of low-quality data, containing some typos, malformatted or missing fields, erraneous bits of information, sometimes coming from different sources, like careless humans, faulty sensors, multiple external data providers, etc. This kind of datasets often contain vast numbers of duplicate or similar entries. If this is the case - then these systems might struggle to deal with such unnatural, often unforeseen, conditions. It might, in turn, affect the quality of service delivered by the system. 10 | 11 | This project is meant to be a playground for developing a deduplication algorithm, and is currently aimed at the domain of various sorts of organizations (e.g. NPO databases). Still, it's small and generic enough, so that it can be easily adjusted to handle other data schemes or data sources. 12 | 13 | The repository contains a set of crafted organizations and their duplicates (partially fetched from [IRS](http://www.irs.gov/Charities-&-Non-Profits/Exempt-Organizations-Select-Check), partially intentionally modified, partially made up), so that it's convenient to test the algorithm's pieces. 14 | 15 | How do I run this thing? 16 | ======================== 17 | 18 | Requires: 19 | * ruby 1.9+ (tested on ruby 1.9.3), RubyGems with [bundler](http://rubygems.org/gems/bundler) 20 | * elasticsearch server running on localhost:9200 (configurable) 21 | 22 | ``` 23 | $ git clone https://github.com/pawelrychlik/duplitector.git 24 | $ cd duplitector 25 | $ bundle install 26 | $ ruby lib/duplitector.rb 27 | ``` 28 | 29 | Configuration via command-line arguments: 30 | ``` 31 | $ ruby lib/duplitector.rb --help 32 | Options: 33 | --filename, -f : Path to test-data filename (default: data/FoundationCenter.txt) 34 | --count, -c : Number of test entries to process 35 | --url, -u : URL to elasticsearch server (default: http://localhost:9200) 36 | --threshold, -t : elasticsearch scoring threshold for differentiating between a duplicate and a unique item 37 | (default: 1.0) 38 | --index, -i : Name of elasticsearch index to use (default: duplitector) 39 | --verbose, -v: Prints more information 40 | --help, -h: Show this message 41 | ``` 42 | 43 | Example output 44 | ============== 45 | 46 | (Cut for the sake of brevity). 47 | 48 | ``` 49 | Processing: {"id"=>"00-2237333", "name"=>"Lincoln Loan Fund", "type"=>"SOUNK", "city"=>"Fayetteville", "state"=>"AR", "country"=>"United States", "gov_id1"=>"EIN:002237333", "group_id"=>"18"} 50 | No potential duplicates found. Highest score: 51 | Created new organization: {"id"=>["00-2237333"], "name"=>["Lincoln Loan Fund"], "type"=>["SOUNK"], "city"=>["Fayetteville"], "state"=>["AR"], "country"=>["United States"], "gov_id1"=>["EIN:002237333"], "group_id"=>["18"], "es_id"=>99} 52 | 53 | Processing: {"id"=>"01-0140283", "name"=>"Pine Grove Cemetery Association", "type"=>"EO", "city"=>"Brunswick", "state"=>"ME", "gov_id1"=>"EIN:010140283", "group_id"=>"48"} 54 | Found potential duplicates. Highest score: 3.485476 55 | Merged duplicates into an existing organization: {"id"=>["01-0140283", "01-0140283"], "name"=>["Pine Grove Cemetery Association", "Pine Grove Cemetery Association"], "type"=>["EO", "EO"], "city"=>["Brunswick", "Brunswick"], "state"=>["ME", "ME"], "gov_id1"=>["EIN:010140283", "EIN:010140283"], "group_id"=>["48", "48"], "country"=>["United States"], "es_id"=>66} 56 | 57 | FAIL: "group_id"=>"98" assigned to 2 orgs. 58 | OK: "group_id"=>"11" assigned to 1 orgs. 59 | OK: "group_id"=>"10" assigned to 1 orgs. 60 | Duplicate resolution: OK=99, FAIL=1, ERROR=0. 61 | 62 | Done. Stats: Organizations created: 101, Organizations resolved as duplicates: 99 63 | ``` 64 | 65 | Useful resources on the subject of deduplication: 66 | * [an article](http://zmievski.org/2011/03/duplicates-detection-with-elasticsearch) by [Andrei Zmievski](http://twitter.com/a) 67 | -------------------------------------------------------------------------------- /data/testdata.txt: -------------------------------------------------------------------------------- 1 | ID Name Type Address City State/Province Postal Code Country Gov ID 1 Gov ID 2 Gov ID 3 URL Telephone Fax Email Date Group ID 2 | 00-2293921 Lockmann Fondation PC Habra CA United States EIN:002293921 19 3 | 01-0211500 Dyer Libraries PC Saco ME United States EIN:010211500 85 4 | 01-0104905 Laurel Hill Cemetery EO Saco United States EIN:010104905 36 5 | 01-0002847 Hulls Cove Neighbourhood Assoc. PC Hulls Cove ME EIN:010002847 23 6 | 01-0095329 Improved Order of Redmen of Maine EO,GROUP,LODGE Standish ME United States EIN:010095329 34 7 | 00-1058976 Newton Firefighters Children Foundation PC Waltham MA United States EIN:001058976 13 8 | 01-0180531 West Oxford Agriculture Society PC Fryeberg ME EIN:010180531 57 9 | 01-0015091 Hannover Football Club PC Cedar Knolls NJ EIN:010015091 24 10 | 00-1049015 Marshfield Womenade Inc. PC Marshfield MA United States EIN:001049015 11 11 | 01-0211504 Eunice Frye Home PF Portland ME United States EIN:010211504 88 12 | 01-0211481 Bangor Childrens Home PC Bangor ME United States EIN:010211481 76 13 | 01-0211481 Bangor Children Home PC ME United States EIN:010211481 76 14 | 01-0017496 Agamenticus Yacht Club of York PC York Harbor ME United States EIN:010017496 25 15 | 01-0211478 United Way of Eastern Maine PC Bangor ME United States EIN:010211478 75 16 | 00-0852649 Bethany Presbyterian Church PC Brookline MA United States EIN:000852649 6 17 | 01-0211500 Dyer Library Association PC Saco ME United States EIN:010211500 85 18 | 00-1017919 Progresive Aproach Inc. PC Cambridge MA United States EIN:001017919 8 19 | 01-0095329 Improved Order of Redman EO,GROUP,LODGE Standish ME United States EIN:010095329 34 20 | 00-0841363 Agape House of Prayer PC Mattapan MA United States EIN:000841363 5 21 | 01-0140290 Pine Grove Cemetery Corp EO Portland ME United States EIN:010140290 49 22 | 01-0131950 Plant Memorial Home POF Bath ME United States EIN:010131950 44 23 | 01-0024645 Bangor Symphony Orchestra PC Bangor ME United States EIN:010024645 28 24 | 01-0211494 Central Maine Medical Centre PC Lewistone EIN:010211494 83 25 | 00-0587764 Iglesia Bethesda Incorporated PC Lowel MA United States EIN:000587764 2 26 | 00-2293921 Lockman Foundation PC La Habra CA United States EIN:002293921 19 27 | 00-1017919 Progressive Approach Inc. PC Cambridge MA United States EIN:001017919 8 28 | 01-0187940 Rockland Cemetery Association EO Rockland ME United States EIN:010187940 60 29 | 01-0187940 Rockland Cemetery EO Rockland United States EIN:010187940 60 30 | 00-0765634 New Hope Congregational Church PC Boston MA United States EIN:000765634 4 31 | 01-0132625 Order of the Eastern Star of Maine EO,GROUP,LODGE Gardiner ME United States EIN:010132625 46 32 | 00-1727883 New Garden Park Inc. SO Worcester MA United States EIN:001727883 14 33 | 01-0043788 Community Christmas Project PC Pittsfield ME United States EIN:010043788 30 34 | 00-1045782 Schools Helping Schools Inc. PC Andover MA United States EIN:001045782 10 35 | 00-0852649 Bethany Presbiterian Church PC Brooklyn MA United States EIN:000852649 6 36 | 01-0147000 Rangeley Library Association PC Rangeley ME United States EIN:010147000 51 37 | 00-2030711 Church of Lord Jesus Christ of Macedonia PC Newark NJ United States EIN:002030711 15 38 | 01-0153128 Greater Rumford Community Centre PC Rumford United States EIN:010153128 52 39 | 01-0024155 Bangor Band Foundation PC Bangor ME United States EIN:010024155 27 40 | 00-1041338 PandoraBox Productions Inc. PC Brookline MA United States EIN:001041338 9 41 | 01-0130065 North Auburn Cemetery Association Inc. EO Auburn ME United States EIN:010130065 41 42 | 01-0211530 Maine Historical Society PC Portland ME United States 98 43 | 01-0130065 North Auburn Cemetery Association EO Auburn ME United States EIN:010130065 41 44 | 00-0635913 M. Apostolico Jesucristo Es El Senor Inc. PC Lawrance MA United States EIN:000635913 3 45 | 01-0114721 Mechanic Falls Maple Grove Cemetery Inc. EO Norway ME United States EIN:010114721 38 46 | 01-0211497 President & Trustees of Colby College PC Watersville ME United States EIN:010211497 84 47 | 01-0056642 Demerritt Cemetery Assoc EO Peru ME United States EIN:010056642 31 48 | 01-0207022 Fairmount Cemetery EO Presque Isle ME United States EIN:010207022 71 49 | 01-0207022 Fairmount Cemetery Assoc EO Presque Isle ME United States EIN:010207022 71 50 | 00-1045782 School Helping School PC Andoberg MA United States EIN:001045782 10 51 | 01-0153960 Saint Andrews Hospital PC Boothbay Harbor ME United States EIN:010153960 53 52 | 01-0191550 Northern Maine Fair PC Prescue Isle ME United States EIN:010191550 61 53 | 01-0211513 Jackson Laboratory PC Bar Harbor ME United States EIN:010211513 94 54 | 00-1727883 New Garden Inc. SO Worcester MA United States EIN:001727883 14 55 | 01-0205019 Riverside Cemetery EO Ft Fairfield ME United States EIN:010205019 69 56 | 01-0211504 Eunice Fry Home PF Portland United States EIN:010211504 88 57 | 01-0211483 Community Health and Counseling Services PC Bangour MA United States EIN:010211483 77 58 | 01-0024907 Bar Harbor Village Improvement Project POF Bar Harbor ME United States EIN:010024907 29 59 | 01-0199433 Portland Players Inc. PC S Portland ME United States EIN:010199433 66 60 | 00-0262650 Society of Saint Vincent De Paul of Memphis PC Memphis TN United States EIN:000262650 1 61 | 01-0124841 Mt. Pleasant Cemetery Corp EO Portland ME United States EIN:010124841 40 62 | 01-0211486 Mountain Desert Island YMCA PC Bar Harbor ME United States EIN:010211486 80 63 | 01-0056642 Demerit Cemetary Association EO Peru ME United States EIN:010056642 31 64 | 00-2030711 Church of Our Lord Jesus Christ of Macedonia PC Newark NJ United States EIN:002030711 15 65 | 01-0211513 Jackson Labs PC Bar Harbor ME United States EIN:010211513 94 66 | 01-0211502 Eastport Public Library Association PC Eastport ME United States EIN:010211502 86 67 | 01-0211502 Eastport Library PC Eastport ME United States EIN:010211502 86 68 | 01-0102346 Knights of Pythias of Maine Grand Lodge EO,GROUP,LODGE Hanover ME United States EIN:010102346 35 69 | 00-2215455 Micro-electronic Centre of North Carolina PC Research Triangle NC United States EIN:002215455 17 70 | 01-0207025 Green Lawn Rest Association EO Clinton ME United States EIN:010207025 72 71 | 01-0207025 Green Loan Rest Foundation EO Clinton ME United States EIN:010207025 72 72 | 00-7764840 Tri-County Lighthouse Baptist Church & Ministries Inc. PC Deerfield St. NJ United States EIN:007764840 22 73 | 01-0199596 Maine Medical Center Womens Board Canteen SOUNK Portland ME United States EIN:010199596 67 74 | 01-0211484 Bangor Theological Seminary PC Bangor ME United States EIN:010211484 78 75 | 00-2067446 Usa National Jr Tennis League POF White Plains NY United States EIN:002067446 16 76 | 01-0205019 Riverside Cemetery Corporation EO Ft Fairfield ME United States EIN:010205019 69 77 | 01-0211494 Central Maine Medical Center PC Lewiston ME United States EIN:010211494 83 78 | 01-0211486 Mt. Desert Island YMCA PC Bar Harbor ME United States EIN:010211486 80 79 | 01-0211484 Bangor Theological Seminary PC Bangore ME United States EIN:010211484 78 80 | 01-0211515 Jewish Community Council PC Bangor ME United States EIN:010211515 95 81 | 01-0211515 Jewish Community Council of Bangor PC Bangor ME United States EIN:010211515 95 82 | 01-0174648 Eastern Star of New Hampshire Grand Chapter EO,GROUP,LODGE Nashua NH United States EIN:010174648 54 83 | 01-0174648 Order of the Eastern Star of New Hampshire Grand Chapter EO,GROUP,LODGE Nashua NH United States EIN:010174648 54 84 | 01-0102346 Knights of Pytheas of Grand Maine Lodge EO,GROUP,LODGE Hannover ME United States EIN:010102346 35 85 | 01-0211509 Gould Academy PC Bethel ME EIN:010211509 92 86 | 01-0078060 Henrietta D Goodall Hospital Inc. PC Sanford ME United States EIN:010078060 32 87 | 01-0131950 Plant Memorial Home POF Bath MT United States EIN:010131950 44 88 | 01-0207716 Victoria Mansion PC Portland ME United States EIN:010207716 73 89 | 01-0191822 Veterans of Foreign Wars of the United States Dept of Maine EO,GROUP Vassalboro ME United States EIN:010191822 62 90 | 01-0180687 Windsor Improvement Corporation PC Windsor VT EIN:010180687 58 91 | 00-2296211 Assistance League of Ventura County PC Ventura CA United States EIN:002296211 21 92 | 01-0140283 Pine Grove Cemetery Association EO Brunswick ME United States EIN:010140283 48 93 | 00-0889899 Academic and Behavioural Clinic PC Boston MA United States EIN:000889899 7 94 | 01-0211517 Cedars Nursing Center PC MA United States EIN:010211517 96 95 | 01-0133442 Oxford Agriculture Society PC Norway ME United States EIN:010133442 47 96 | 01-0133442 Oxford County Agricultural Society PC Norway ME United States EIN:010133442 47 97 | 01-0211505 Trustees of Fryburg Academy PC Fryeburg United States EIN:010211505 89 98 | 01-0205722 Brookside Cemetery Association Inc. EO Mount Desert ME United States EIN:010205722 70 99 | 01-0153128 Greater Rumford Community Center PC Rumford ME United States EIN:010153128 52 100 | 01-0180687 Windsor Improvement Corporation PC Windsor VT United States EIN:010180687 58 101 | 01-0199596 Maine Medical Center Womens Board Canteen & Gift Shop SOUNK Portland ME United States EIN:010199596 67 102 | 01-0191953 Cumberland Cemetery EO Cumberland Ctr ME United States EIN:010191953 63 103 | 01-0211487 Bible Society of Maine PC Portland ME United States EIN:010211487 81 104 | 01-0211487 Bible Society of Maine PC Portland ME United States EIN:010211487 81 105 | 01-0211517 Cedars Nursing Care Center Inc. PC Portland ME United States EIN:010211517 96 106 | 01-0191822 Veterans of Foreign Wars of the US Departament of Maine EO,GROUP Vassalborough ME United States EIN:010191822 62 107 | 01-0211483 Community Health and Counseling Services PC Bangor ME United States EIN:010211483 77 108 | 00-0262650 The Society of St. Vincent De Paul of Memphis Inc. PC Memphis TN United States EIN:000262650 1 109 | 01-0145133 Prouts Neck Association PC Scarborough ME United States EIN:010145133 50 110 | 01-0202467 Mount Desert Island Biological Laboratory PC Salsbury Cove ME United States EIN:010202467 68 111 | 01-0211526 Maine Church Council PC ME United States EIN:010211526 97 112 | 01-0211526 Maine Council of Churches PC Portland ME United States EIN:010211526 97 113 | 01-0179500 Webber Hospital Association PC Biddeford ME EIN:010179500 56 114 | 00-1056907 Payabya Inc. PC Hudson MA United States EIN:001056907 12 115 | 00-2067446 Usta National Junior Tennis League Inc. POF White Plains NY United States EIN:002067446 16 116 | 01-0130427 Bridgeton Hospital PC Brighton ME United States EIN:010130427 42 117 | 01-0104976 St Andrew Church PC Los Alamitos CA United States EIN:010104976 37 118 | 00-7764840 Three-County Light House Baptist Church PC Deerfield NJ United States EIN:007764840 22 119 | 01-0199433 Portland Players PC St Portland ME United States EIN:010199433 66 120 | 01-0211485 Young Christians Association Bangor PC Bangor ME United States EIN:010211485 79 121 | 01-0211485 Young Mens Christian Association Bangor PC Bangor ME United States EIN:010211485 79 122 | 01-0211505 The Trustees of Fryeburg Academy PC Fryeburg ME United States EIN:010211505 89 123 | 01-0131335 Oak Grove Cemetery Association EO Gardener ME United States EIN:010131335 43 124 | 01-0153960 St. Andrews Hospital PC Boothbay Hbr ME United States EIN:010153960 53 125 | 01-0211476 The Chapman House PC Auburn ME United States EIN:010211476 74 126 | 01-0211508 Good Will Home Association PC Hincklay ME United States EIN:010211508 91 127 | 01-0019705 Ancient Free And Accepted Masons of Maine EO,GROUP,LODGE Portland ME United States EIN:010019705 26 128 | 01-0207716 Victoria Mansion PC Portland United States EIN:010207716 73 129 | 01-0024645 Bangor Symphony Orchestra PC Bancore United States EIN:010024645 28 130 | 01-0211532 Kents Hill School PC Kents Hill ME United States EIN:010211532 99 131 | 01-0198331 Maine Coast Regional Health Facilities PC Elsworth ME United States EIN:010198331 65 132 | 01-0177170 Waldo County General Hospital PC Belfast ME United States EIN:010177170 55 133 | 01-0177170 Waldo Hospital PC Belfast ME United States EIN:010177170 55 134 | 01-0191550 Northern Maine Fair Association PC Presque Isle ME United States EIN:010191550 61 135 | 01-0092600 Hope Cemetery Corp. EO Kenbunk United States EIN:010092600 33 136 | 01-0211530 Maine Historical Society PC Portland ME United States EIN:010211530 98 137 | 01-0196359 Iris Networks PC Portland ME United States EIN:010196359 64 138 | 01-0196359 Iris Network PC ME United States EIN:010196359 64 139 | 01-0211533 McArthur Library Foundation PC Biddeford ME United States EIN:010211533 100 140 | 01-0124810 Mt. Hope Cemetery Corp EO Bangor ME United States EIN:010124810 39 141 | 01-0211507 Good Samaritan Agency PC Bangor ME United States EIN:010211507 90 142 | 01-0211507 Good Samaritans PC Bangor ME United States EIN:010211507 90 143 | 01-0132610 Orchard Grove Cemetery Association EO Kittery ME United States EIN:010132610 45 144 | 01-0186800 Young Mens Christian Association State of Maine PC Winthrop ME United States EIN:010186800 59 145 | 01-0211478 United Way of Eastern Maine PC Bangor United States EIN:010211478 75 146 | 01-0043788 Community Christmas Project Inc. PC Pittsfield ME United States EIN:010043788 30 147 | 01-0180531 West Oxford Agricultural Society PC Fryeburg ME United States EIN:010180531 57 148 | 01-0211509 Gould Academy PC Bethel ME United States EIN:010211509 92 149 | 01-0211497 President & Trustees of Colby College PC Waterville ME United States EIN:010211497 84 150 | 01-0104976 Saint Andrew Church PC Los Alamitos CA United States EIN:010104976 37 151 | 01-0114721 Mechanic Falls Maple Hill Cemetery EO Norway ME United States EIN:010114721 38 152 | 00-1049015 Marshfield Womenaid Inc. PC Marshfield MA United States EIN:001049015 11 153 | 01-0179500 Webber Hospital Association PC Biddeford ME United States EIN:010179500 56 154 | 01-0015091 Hanover Soccer Club Inc. PC Cedar Knolls NJ United States EIN:010015091 24 155 | 01-0024907 Bar Harbor Village Improvement Association POF Bar Harbor ME United States EIN:010024907 29 156 | 01-0202467 Mount Desert Island Biological Lab PC Salisbury Cove ME EIN:010202467 68 157 | 01-0211533 McArthur Library Association PC Biddeford ME United States EIN:010211533 100 158 | 01-0205722 Brookside Cemetery Inc. EO Mount Desert ME United States EIN:010205722 70 159 | 01-0198331 Maine Coast Regional Health Facilities PC Ellsworth ME United States EIN:010198331 65 160 | 01-0092600 Hope Cemetery Corporation EO Kennebunk ME United States EIN:010092600 33 161 | 01-0191953 Cumberland Cemetery Association EO Cumberland Ctr ME United States EIN:010191953 63 162 | 01-0024155 Bangor Band PC Bangor ME United States EIN:010024155 27 163 | 01-0211512 Home for the Aged PC Portland ME United States EIN:010211512 93 164 | 01-0019705 Ancient Free & Accepted Masons of Maine Grand Lodge EO,GROUP,LODGE Portland ME United States EIN:010019705 26 165 | 01-0211508 Good Will Home Association PC Hinckley ME United States EIN:010211508 91 166 | 01-0211512 Home for the Elderly PC Portland ME United States EIN:010211512 93 167 | 00-1056907 Payiabya Org PC Huston MA United States EIN:001056907 12 168 | 01-0211476 Chapman House PC Auborn United States EIN:010211476 74 169 | 01-0002847 Hulls Cove Neighborhood Association PC Hulls Cove ME United States EIN:010002847 23 170 | 00-0587764 Iglesia Bethesda Inc. PC Lowell MA United States EIN:000587764 2 171 | 01-0132625 Order of the Eastern Star of Maine EO,GROUP,LODGE West Gardiner ME United States EIN:010132625 46 172 | 00-2296179 Religious Science Church Center of San Diego PC San Diego CA United States EIN:002296179 20 173 | 01-0124841 Mt. Pleasant Cemetery Corp EO S Portland ME United States EIN:010124841 40 174 | 01-0104905 Laurel Hill Cemetery Association EO Saco ME United States EIN:010104905 36 175 | 01-0130427 Bridgton Hospital PC Bridgton ME United States EIN:010130427 42 176 | 00-2237333 Lincoln Loan Fund SOUNK Fayetteville AR United States EIN:002237333 18 177 | 00-2237333 Lincolns Loan Foundation SOUNK Fayetteville AR United States EIN:002237333 18 178 | 00-2296211 Assistance League in Venture PC Ventura CA United States EIN:002296211 21 179 | 01-0017496 Agamentica Yacht Club of York PC York Harbour ME United States EIN:010017496 25 180 | 00-0841363 Agape House of Preyer PC Matappan MA United States EIN:000841363 5 181 | 01-0124810 Mountain Hope Cemetery Corporation EO Bangor ME United States EIN:010124810 39 182 | 00-1058976 Newton Firefighters Childrens Fund Inc. PC Waltham MA United States EIN:001058976 13 183 | 00-1041338 Pandoras Box Productions Inc. PC Brookline MA United States EIN:001041338 9 184 | 01-0078060 Henrietta Goodhall Hospital Inc. PC Stanford United States EIN:010078060 32 185 | 00-2215455 Microelectronic s Center of North Carolina PC Research Triangle Pk NC United States EIN:002215455 17 186 | 01-0131335 Oak Grove Cemetery Asso EO Gardiner ME United States EIN:010131335 43 187 | 01-0147000 Rangelay Library Assoc. PC Rangeley United States EIN:010147000 51 188 | 01-0140290 Pine Grove Cemetery Corp EO S Portland ME United States EIN:010140290 49 189 | 00-0765634 New Hope Congregation PC Boston United States EIN:000765634 4 190 | 01-0211488 Trustees of Bloomfield Academy PC Skowhegan ME United States EIN:010211488 82 191 | 00-0889899 Academic and Behavioral Clinic Inc. PC Boston MA United States EIN:000889899 7 192 | 01-0211503 Franklin Memorial Hospital PC Farmington ME United States EIN:010211503 87 193 | 01-0211503 Franklins Hospital PC Farmington ME United States EIN:010211503 87 194 | 01-0211532 Kents Hill School PC Kent's Hill ME United States EIN:010211532 99 195 | 01-0145133 Prouts Neck Foundation PC Scarboro ME United States EIN:010145133 50 196 | 01-0186800 Young Mens Christian Association of Maine PC Winthrop ME EIN:010186800 59 197 | 01-0140283 Pine Grove Cemetery Association EO Brunswick ME EIN:010140283 48 198 | 00-0635913 Ministerio Apostolico Jesucristo Es El Senor Inc. PC Lawrence MA United States EIN:000635913 3 199 | 01-0211488 Trustees of Bloomfield Academy PC Skewhegan ME United States EIN:010211488 82 200 | 01-0132610 Orchard Grove Cemetery EO Kittery ME United States EIN:010132610 45 201 | 00-2296179 Religious Science Church Centre in San Diego PC San Diego CA United States EIN:002296179 20 --------------------------------------------------------------------------------