├── LICENSE ├── README.md ├── bneighbors.py └── similarity.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Waylon Flinn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bneighbors 2 | Find exact nearest neighbors in relatively high dimensional spaces. Supports 3 | in-memory and out-of-core data sets (via [bcolz](https://github.com/Blosc/bcolz) 4 | and [bvec](https://github.com/waylonflinn/bvec)). 5 | 6 | Gives realtime performance in 20-100 dimensional feature spaces, over hundreds of 7 | thousands of items. 8 | 9 | Includes the following similarity measures 10 | 11 | * cosine 12 | * jaccard 13 | * generalized 14 | 15 | 16 | The generalized similarity measure is based on an alternate normalization of 17 | cosine similarity, and includes both cosine similarity and lift as special cases. 18 | 19 | 20 | ## todo 21 | * efficient calculation of a relevant subset of 22 | [Bregman Divergences](https://en.wikipedia.org/wiki/Bregman_divergence) 23 | * subsetting of feature vectors for inclusion in results with boolean vectors (carrays) 24 | -------------------------------------------------------------------------------- /bneighbors.py: -------------------------------------------------------------------------------- 1 | 2 | import bvec 3 | import bcolz 4 | 5 | import similarity as sim 6 | 7 | class Neighborhood: 8 | ''' 9 | "Don't you want to be my neighbor?" 10 | bneighbors finds nearest neighbors between two arbitrary vector spaces, 11 | contained in bcolz databases, using bvec. 12 | 13 | ''' 14 | 15 | def __init__(self, source_path): 16 | ''' 17 | Create the Neighborhood, for finding nearest neighbors. 18 | 19 | Args: 20 | source_path (string): path to a bcolz database with three carray 21 | columns: 'id', 'vector' and 'norm' 22 | 23 | ''' 24 | 25 | self.source_path = source_path 26 | 27 | # open bcolz datastores 28 | self.vectors = bvec.carray(rootdir=source_path + "/vector") 29 | self.norms = bvec.carray(rootdir=source_path + "/norm") 30 | self.source_table = bcolz.ctable(rootdir=source_path) 31 | 32 | #print("Created similarity object from BCOLZ files: source {0}; target: {1}".format(source_path, target_path)) 33 | 34 | # create similarity object 35 | self.similarity = sim.Similarity(self.vectors, self.norms) 36 | 37 | # create domain <-> index maps 38 | 39 | # dictionary taking ids to indeces (source) 40 | self.id_index_map = self._create_id_index_map(self.source_table) 41 | 42 | self.index_id_map = self._create_index_id_map(self.source_table) 43 | 44 | @staticmethod 45 | def _create_id_index_map(ctable): 46 | ''' 47 | create a dictionary taking ids to indeces (source) 48 | ''' 49 | 50 | i = 0 51 | id_index_map = {} 52 | for block in bcolz.iterblocks(ctable['id']): 53 | for item in block: 54 | id_index_map[str(item)] = i 55 | i += 1 56 | 57 | return id_index_map 58 | 59 | @staticmethod 60 | def _create_index_id_map(ctable): 61 | ''' 62 | create a dictionary taking an index to an id (target) 63 | ''' 64 | 65 | i = 0 66 | index_id_map = {} 67 | for block in bcolz.iterblocks(ctable['id']): 68 | for item in block: 69 | index_id_map[i] = str(item) 70 | i += 1 71 | 72 | return index_id_map 73 | 74 | 75 | def neighbors(self, source_id, n=100, sim_type=sim.SimilarityType.Cosine, p=None): 76 | ''' 77 | Find the nearest neighbors of the given source_id 78 | ''' 79 | 80 | if source_id not in self.id_index_map: 81 | return [] 82 | 83 | source_index = self.id_index_map[source_id] 84 | 85 | sorted_target_indeces = self.similarity.similarities(source_index, n=n, sim_type=sim_type, p=p) 86 | 87 | # convert indeces to domain names 88 | sorted_target_ids = ( (self.index_id_map[index], score) for (index, score) in sorted_target_indeces ) 89 | 90 | return sorted_target_ids 91 | 92 | 93 | def location(self, source_id): 94 | ''' 95 | Return the vector (numpy.ndarray) for the given source_id 96 | 97 | source_id: external identifier for the vector 98 | ''' 99 | 100 | if source_id not in self.id_index_map: 101 | return [] 102 | 103 | source_index = self.id_index_map[source_id] 104 | 105 | return self.vectors[source_index] 106 | -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SimilarityType: 4 | Cosine = 1 5 | Jaccard = 2 6 | Euclidean = 3 7 | Lift = 4 8 | Generalized = 5 9 | 10 | class Similarity: 11 | 12 | 13 | def __init__(self, vector_carray, norm_carray): 14 | ''' 15 | Create a similarity store based on vectors stored in two bcolz datastores. 16 | Input index is into source_file, result indexes will be from the target_file. 17 | 18 | Args: 19 | source_file (string): path to the bcolz datastore containing query vectors 20 | target_file (string): path to the bcolz datastore containing response vectors 21 | ''' 22 | 23 | self.vector_carray = vector_carray 24 | self.norm_carray = norm_carray 25 | 26 | 27 | def similarities(self, index, n=100, sim_type=SimilarityType.Cosine, p=None): 28 | ''' 29 | Return a list of objects similar to the given one, along with their scores. 30 | Tuples containing index and similarity score are returned, restricted to the top n. 31 | Input index is into source_file, result indexes will be from the target_file. 32 | 33 | Args: 34 | index (int): the index of the object from the source vector store to get similarities for 35 | ''' 36 | 37 | source_vector = self.vector_carray[index] 38 | source_norm = self.norm_carray[index] 39 | 40 | # calc dots 41 | dots = self.vector_carray.dot(source_vector) 42 | 43 | if(sim_type == SimilarityType.Cosine): 44 | # cosine similarity 45 | similarities = self.cosine(dots, source_norm) 46 | elif(sim_type == SimilarityType.Jaccard): 47 | similarities = self.jaccard(dots, source_norm) 48 | elif(sim_type == SimilarityType.Generalized): 49 | if(p == None): 50 | raise ValueError("Must supply p when using Generalized similarity.") 51 | 52 | similarities = self.generalized(dots, source_norm, p) 53 | 54 | 55 | # sorted 56 | index_similarities_sorted = np.argsort(similarities)[::-1][:n] 57 | 58 | 59 | return zip(index_similarities_sorted, similarities[index_similarities_sorted]) 60 | 61 | def cosine(self, dots, source_norm): 62 | # divide by norms 63 | similarities = dots.divide(source_norm) 64 | 65 | similarities = similarities.divide(self.norm_carray) 66 | 67 | similarities = similarities.tondarray() 68 | similarities[np.isnan(similarities)] = 0 69 | 70 | return similarities 71 | 72 | def jaccard(self, dots, source_norm): 73 | dots = dots.tondarray() 74 | 75 | norms = self.norm_carray.tondarray() 76 | 77 | # create denominator 78 | # self.norm_carray^2 + source_norm^2 - similarities 79 | denominator = (norms ** 2 + source_norm ** 2) - dots 80 | 81 | # divide by norms 82 | similarities = dots / denominator 83 | 84 | return similarities 85 | 86 | def generalized(self, dots, source_norm, p=2.0): 87 | """ 88 | This implements a generalized similarity measure of which cosine 89 | and lift are special cases. It introduces a parameter which 90 | modulates the affect of popularity. 91 | 92 | Arguments: 93 | dots (carray): array of dot products 94 | source_norm (float64): value of norm of the source vector 95 | p (float64): popularity parameter. Cosine = 2.0, Lift = 1.0 96 | larger values of the parameter cause more popular items 97 | to be more heavily weighted. 98 | popularity here is expressed by the magnitude of the vector 99 | in the vector. this always holds for raw preference vectors 100 | and also generally hold for machine learned results derived 101 | from preference vectors. 102 | """ 103 | # divide by norms 104 | similarities = dots.divide(source_norm) 105 | 106 | similarities = similarities.tondarray() 107 | norms = self.norm_carray.tondarray() 108 | 109 | if(p == 2): 110 | denominator = norms * source_norm 111 | else: 112 | denominator = (norms * source_norm) ** (2.0/p) 113 | 114 | similarities = dots / denominator 115 | 116 | similarities[np.isnan(similarities)] = 0 117 | 118 | return similarities 119 | --------------------------------------------------------------------------------