├── LICENSE
├── README.md
├── bneighbors.py
└── similarity.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Waylon Flinn
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # bneighbors
 2 | Find exact nearest neighbors in relatively high dimensional spaces. Supports
 3 | in-memory and out-of-core data sets (via [bcolz](https://github.com/Blosc/bcolz)
 4 | and [bvec](https://github.com/waylonflinn/bvec)).
 5 | 
 6 | Gives realtime performance in 20-100 dimensional feature spaces, over hundreds of
 7 | thousands of items.
 8 | 
 9 | Includes the following similarity measures:
10 | 
11 | * cosine
12 | * jaccard
13 | * generalized
14 | 
15 | 
16 | The generalized similarity measure is based on an alternate normalization of
17 | cosine similarity, and includes both cosine similarity and lift as special cases.
18 | 
19 | 
20 | ## todo
21 | * efficient calculation of a relevant subset of
22 | [Bregman Divergences](https://en.wikipedia.org/wiki/Bregman_divergence)
23 | * subsetting of feature vectors for inclusion in results with boolean vectors (carrays)
24 | 


--------------------------------------------------------------------------------
/bneighbors.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import bvec
  3 | import bcolz
  4 | 
  5 | import similarity as sim
  6 | 
  7 | class Neighborhood:
  8 | 	'''
  9 | 	"Don't you want to be my neighbor?"
 10 | 		bneighbors finds nearest neighbors between two arbitrary vector spaces,
 11 | 		contained in bcolz databases, using bvec.
 12 | 
 13 | 	'''
 14 | 
 15 | 	def __init__(self, source_path):
 16 | 		'''
 17 | 			Create the Neighborhood, for finding nearest neighbors.
 18 | 
 19 | 			Args:
 20 | 			source_path (string): path to a bcolz database with three carray
 21 | 			columns: 'id', 'vector' and 'norm'
 22 | 
 23 | 		'''
 24 | 
 25 | 		self.source_path = source_path
 26 | 
 27 | 		# open bcolz datastores
 28 | 		self.vectors = bvec.carray(rootdir=source_path + "/vector")
 29 | 		self.norms = bvec.carray(rootdir=source_path + "/norm")
 30 | 		self.source_table = bcolz.ctable(rootdir=source_path)
 31 | 
 32 | 		#print("Created similarity object from BCOLZ files: source {0}; target: {1}".format(source_path, target_path))
 33 | 
 34 | 		# create similarity object
 35 | 		self.similarity = sim.Similarity(self.vectors, self.norms)
 36 | 
 37 | 		# create domain <-> index maps
 38 | 
 39 | 		# dictionary taking ids to indeces (source)
 40 | 		self.id_index_map = self._create_id_index_map(self.source_table)
 41 | 
 42 | 		self.index_id_map = self._create_index_id_map(self.source_table)
 43 | 
 44 | 	@staticmethod
 45 | 	def _create_id_index_map(ctable):
 46 | 		'''
 47 | 		create a dictionary taking ids to indeces (source)
 48 | 		'''
 49 | 
 50 | 		i = 0
 51 | 		id_index_map = {}
 52 | 		for block in bcolz.iterblocks(ctable['id']):
 53 | 			for item in block:
 54 | 				id_index_map[str(item)] = i
 55 | 				i += 1
 56 | 
 57 | 		return id_index_map
 58 | 
 59 | 	@staticmethod
 60 | 	def _create_index_id_map(ctable):
 61 | 		'''
 62 | 		create a dictionary taking an index to an id (target)
 63 | 		'''
 64 | 
 65 | 		i = 0
 66 | 		index_id_map = {}
 67 | 		for block in bcolz.iterblocks(ctable['id']):
 68 | 			for item in block:
 69 | 				index_id_map[i] = str(item)
 70 | 				i += 1
 71 | 
 72 | 		return index_id_map
 73 | 
 74 | 
 75 | 	def neighbors(self, source_id, n=100, sim_type=sim.SimilarityType.Cosine, p=None):
 76 | 		'''
 77 | 			Find the nearest neighbors of the given source_id
 78 | 		'''
 79 | 
 80 | 		if source_id not in self.id_index_map:
 81 | 			return []
 82 | 
 83 | 		source_index = self.id_index_map[source_id]
 84 | 
 85 | 		sorted_target_indeces = self.similarity.similarities(source_index, n=n, sim_type=sim_type, p=p)
 86 | 
 87 | 		# convert indeces to domain names
 88 | 		sorted_target_ids = ( (self.index_id_map[index], score) for (index, score) in sorted_target_indeces )
 89 | 
 90 | 		return sorted_target_ids
 91 | 
 92 | 
 93 | 	def location(self, source_id):
 94 | 		'''
 95 | 			Return the vector (numpy.ndarray) for the given source_id
 96 | 
 97 | 			source_id: external identifier for the vector
 98 | 		'''
 99 | 
100 | 		if source_id not in self.id_index_map:
101 | 			return []
102 | 
103 | 		source_index = self.id_index_map[source_id]
104 | 
105 | 		return self.vectors[source_index]
106 | 


--------------------------------------------------------------------------------
/similarity.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | class SimilarityType:
  4 | 	Cosine = 1
  5 | 	Jaccard = 2
  6 | 	Euclidean = 3
  7 | 	Lift = 4
  8 | 	Generalized = 5
  9 | 
 10 | class Similarity:
 11 | 
 12 | 
 13 | 	def __init__(self, vector_carray, norm_carray):
 14 | 		'''
 15 | 		Create a similarity store based on vectors stored in two bcolz datastores.
 16 | 		Input index is into source_file, result indexes will be from the target_file.
 17 | 
 18 | 		Args:
 19 | 			source_file (string): path to the bcolz datastore containing query vectors
 20 | 			target_file (string): path to the bcolz datastore containing response vectors
 21 | 		'''
 22 | 
 23 | 		self.vector_carray = vector_carray
 24 | 		self.norm_carray = norm_carray
 25 | 
 26 | 
 27 | 	def similarities(self, index, n=100, sim_type=SimilarityType.Cosine, p=None):
 28 | 		'''
 29 | 		Return a list of objects similar to the given one, along with their scores.
 30 | 		Tuples containing index and similarity score are returned, restricted to the top n.
 31 | 		Input index is into source_file, result indexes will be from the target_file.
 32 | 
 33 | 		Args:
 34 | 			index (int): the index of the object from the source vector store to get similarities for
 35 | 		'''
 36 | 
 37 | 		source_vector = self.vector_carray[index]
 38 | 		source_norm = self.norm_carray[index]
 39 | 
 40 | 		# calc dots
 41 | 		dots = self.vector_carray.dot(source_vector)
 42 | 
 43 | 		if(sim_type == SimilarityType.Cosine):
 44 | 			# cosine similarity
 45 | 			similarities = self.cosine(dots, source_norm)
 46 | 		elif(sim_type == SimilarityType.Jaccard):
 47 | 			similarities = self.jaccard(dots, source_norm)
 48 | 		elif(sim_type == SimilarityType.Generalized):
 49 | 			if(p == None):
 50 | 				raise ValueError("Must supply p when using Generalized similarity.")
 51 | 
 52 | 			similarities = self.generalized(dots, source_norm, p)
 53 | 
 54 | 
 55 | 		# sorted
 56 | 		index_similarities_sorted = np.argsort(similarities)[::-1][:n]
 57 | 
 58 | 
 59 | 		return zip(index_similarities_sorted, similarities[index_similarities_sorted])
 60 | 
 61 | 	def cosine(self, dots, source_norm):
 62 | 		# divide by norms
 63 | 		similarities = dots.divide(source_norm)
 64 | 
 65 | 		similarities = similarities.divide(self.norm_carray)
 66 | 
 67 | 		similarities = similarities.tondarray()
 68 | 		similarities[np.isnan(similarities)] = 0
 69 | 
 70 | 		return similarities
 71 | 
 72 | 	def jaccard(self, dots, source_norm):
 73 | 		dots = dots.tondarray()
 74 | 
 75 | 		norms = self.norm_carray.tondarray()
 76 | 
 77 | 		# create denominator
 78 | 		# self.norm_carray^2 + source_norm^2 - similarities
 79 | 		denominator = (norms ** 2 + source_norm ** 2) - dots
 80 | 
 81 | 		# divide by norms
 82 | 		similarities = dots / denominator
 83 | 
 84 | 		return similarities
 85 | 
 86 | 	def generalized(self, dots, source_norm, p=2.0):
 87 | 		"""
 88 | 			This implements a generalized similarity measure of which cosine
 89 | 			and lift are special cases. It introduces a parameter which
 90 | 			modulates the affect of popularity.
 91 | 
 92 | 			Arguments:
 93 | 				dots (carray): array of dot products
 94 | 				source_norm (float64): value of norm of the source vector
 95 | 				p (float64): popularity parameter. Cosine = 2.0, Lift = 1.0
 96 | 					larger values of the parameter cause more popular items
 97 | 					to be more heavily weighted.
 98 | 					popularity here is expressed by the magnitude of the vector
 99 | 					in the vector. this always holds for raw preference vectors
100 | 					and also generally hold for machine learned results derived
101 | 					from preference vectors.
102 | 		"""
103 | 		# divide by norms
104 | 		similarities = dots.divide(source_norm)
105 | 
106 | 		similarities = similarities.tondarray()
107 | 		norms = self.norm_carray.tondarray()
108 | 
109 | 		if(p == 2):
110 | 			denominator = norms * source_norm
111 | 		else:
112 | 			denominator = (norms * source_norm) ** (2.0/p)
113 | 
114 | 		similarities = dots / denominator
115 | 
116 | 		similarities[np.isnan(similarities)] = 0
117 | 
118 | 		return similarities
119 | 


--------------------------------------------------------------------------------