├── README.md
├── adwords.py
├── algorithmics
│   ├── candies.py
│   ├── chg_bits.py
│   ├── cryptarithmetic.py
│   ├── diff-div.py
│   ├── flowers.py
│   ├── forest_slicing.py
│   ├── grammer-parser.py
│   ├── hanoi_tower.py
│   ├── inverse_function.py
│   ├── iter_circle_sum.py
│   ├── licence
│   ├── media.py
│   ├── n_c_p.py
│   ├── numpy_circle_sum.py
│   ├── palindromes.py
│   ├── pooring_water.py
│   ├── recur_circle_sum.py
│   ├── string_red.py
│   ├── suffixes.py
│   └── unfriendly.py
├── classification
│   ├── decision_trees.py
│   └── knn.py
├── decorators.py
├── dijkstra.py
├── filters
│   ├── __init__.py
│   └── utils.py
├── frequency.py
├── graph_analysis.py
├── licence
├── map_reduce
│   ├── README.md
│   ├── item_frequency.py
│   ├── map_reduce.py
│   └── pi_estimation.py
├── movielens
│   ├── u.item
│   └── u.user
├── page_rank
│   ├── README.md
│   ├── page_rank.py
│   └── page_rank_numpy.py
├── quora
│   ├── answer_classifier.py
│   ├── datacenter_c.py
│   ├── dcc.c
│   ├── feed_optimizer.py
│   ├── nearby.py
│   ├── results.txt
│   ├── test.txt
│   └── typehead.py
├── radix_tree.py
├── recommendation.py
├── shingles_minhash.py
└── similarities
    ├── __init__.py
    ├── correlation.py
    ├── euclidean.py
    ├── jaccard_similarity.py
    └── tanimoto.py

/README.md:
--------------------------------------------------------------------------------
 1 | math and data analysis functions
 2 | ================================
 3 | 
 4 | *shingling*
 5 | - k-shingles generation
 6 | - minhashing
 7 | 
 8 | *jaccard similarity*
 9 | - jaccard similarity calculation
10 | - jaccard distance calculation
11 | - jaccard conditional comparison
12 | 
13 | *adwords problem*
14 | - greedy_adwords
15 | - balance_adwords
16 | - generalized_balance_adwords
17 | 
18 | *frequency problem*
19 | - items frequency
20 | - the algorithm of Savasere, Omiecinski and Navathe
21 | 
22 | *graph problem*
23 | - graph construction
24 | - shortest_path
25 | - longest path
26 | - centrality
27 | - independent graphs detection
28 | - clustering_coef
29 | - dijkstra
30 | - dijkstra with heap
31 | 
32 | *recommendation problem*
33 | - hamming distance
34 | - euclidean distance
35 | - pearson correlation
36 | - tanimoto score
37 | - euclidean similarity
38 | - pearson similarity
39 | - tanimoto similarity
40 | - top similars
41 | - top similar with map reduce
42 | - recommendation user filtered
43 | - recommendation item filtered
44 | 
45 | *Radix tree*
46 | - insert
47 | - remove
48 | - search
49 | - longest prefix
50 | 
51 | *Decision tree*
52 | - Divide data
53 | - Gini impurity
54 | - Entropy
55 | - Variance
56 | - Build tree
57 | - Prune
58 | - Classify
59 | - Draw tree
60 | 
61 | *Page Rank*
62 | 
63 | A very simple version/implementation of the page rank algorithm.
64 | - Page rank
65 | - Advanced version of page rank, topic sensitive
66 | - spam farms
67 | - trust rank
68 | - Hyperlink induced topic search
69 | - Map reduce to efficiently calculate the page rank
70 | - Jaccard similarity, to be found in the data analysis repo
71 | 
72 | *Map-Reduce*
73 | 
74 | Implementation of map reduce, and some examples.
75 | - Map Reduce class
76 | - Estimation of pi
77 | - Calculation of item frequency from multiple files
78 | 
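A minimal sketch of the Jaccard similarity listed above (cf. similarities/jaccard_similarity.py in the tree; this standalone form is illustrative only, not the repo's implementation):

    def jaccard_similarity(a, b):
        # |A intersection B| / |A union B| over two sets
        a, b = set(a), set(b)
        return len(a & b) / float(len(a | b))

    # jaccard_similarity('abc', 'abd') -> 0.5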
--------------------------------------------------------------------------------
/adwords.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------
 2 | # Name: adwords
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | import collections
 8 | from math import exp
 9 | 
10 | 
11 | def bid_for_bider(bider, bids, item):
12 |     """
13 |     Returns the bid the bidder put on the item
14 |     """
15 |     return bids[item] if item in bids.keys() else 0
16 | 
17 | 
18 | def fraction_for_bider(bider, remaining_budget, initial_budget, bids, item):
19 |     """
20 |     return the bid times 1 - e^-(fraction of remaining budget)
21 |     """
22 |     return bids[item] * (1 - exp(-(float(remaining_budget) / initial_budget))) if item in bids.keys() else 0  # float() avoids Python 2 integer division
23 | 
24 | 
25 | def sort_biders(biders, bids=None, item=None, by_budget=False, by_bid=False, by_fraction=False):
26 |     """
27 |     sort bidders by budget, by bid on the item, or by generalized-balance fraction
28 |     """
29 |     result = []
30 |     if by_budget and not by_bid:
31 |         return sorted(biders, key=lambda x: x[1], reverse=True)
32 |     if by_bid and item is not None and not by_budget:
33 |         return sorted(biders, key=lambda (x, y, z): bid_for_bider(x, bids[x], item), reverse=True)
34 |     if by_fraction:
35 |         return sorted(biders, key=lambda (x, y, z): fraction_for_bider(x, y, z, bids[x], item), reverse=True)
36 |     return biders
37 | 
38 | 
39 | def greedy_adwords(biders, bids, items):
40 |     """
41 |     greedy algorithms make their decision in response to each input element by maximizing some
42 |     function of the input element and the past.
43 |     all click through rates are the same
44 |     # bidder structure: (bidder, remaining budget, initial budget)
45 |     # bids structure per bidder: {item: value, ...}
46 |     """
47 |     result = []
48 |     for item in items:
49 |         biders = sort_biders(biders, bids, item=item, by_bid=True)
50 |         for b in range(len(biders)):
51 |             bider, remaining_budget, initial_budget = biders[b]
52 |             if item in bids[bider].keys() and remaining_budget >= bids[bider][item]:
53 |                 result.append((item, bider))
54 |                 biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget)
55 |                 break
56 |     return result
57 | 
58 | 
59 | def balance_adwords(biders, bids, items):
60 |     """
61 |     assigns a query to the advertiser who bids on the query and
62 |     has the largest remaining budget. Ties may be broken arbitrarily.
63 |     # bidder structure: (bidder, remaining budget, initial budget)
64 |     # bids structure per bidder: {item: value, ...}
65 |     """
66 |     result = []
67 |     for item in items:
68 |         biders = sort_biders(biders, by_budget=True)
69 |         for b in range(len(biders)):
70 |             bider, remaining_budget, initial_budget = biders[b]
71 |             if item in bids[bider].keys() and remaining_budget >= bids[bider][item]:
72 |                 result.append((item, bider))
73 |                 biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget)
74 |                 break
75 |     return result
76 | 
77 | 
78 | def generalized_balance_adwords(biders, bids, items):
79 |     """
80 |     differs from the balance algorithm in two ways:
81 |     it biases the choice of the bidder in favor of the one with the higher bid,
82 |     and it is less absolute about the remaining budget; rather, it considers the
83 |     fraction of the remaining budget
84 |     # bidder structure: (bidder, remaining budget, initial budget)
85 |     # bids structure per bidder: {item: value, ...}
86 |     """
87 |     result = []
88 |     for item in items:
89 |         biders = sort_biders(biders, bids, item=item, by_fraction=True)
90 |         for b in range(len(biders)):
91 |             bider, remaining_budget, initial_budget = biders[b]
92 |             if item in bids[bider].keys() and remaining_budget >= bids[bider][item]:
93 |                 result.append((item, bider))
94 |                 biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget)
95 |                 break
96 |     return result
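
# A quick numeric check of the generalized-balance score used above (values
# rounded): fraction_for_bider computes bid * (1 - e^-(remaining/initial)), so
#   a bid of 5 with a full budget left scores  5 * (1 - e**-1.0) ~= 3.16
#   the same bid with half the budget left     5 * (1 - e**-0.5) ~= 1.97
# i.e. higher bids are favored, but only while the bidder still has budget.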
85 | """ 86 | result = [] 87 | for item in items: 88 | biders = sort_biders(biders, bids, item=item, by_fraction=True) 89 | for b in range(len(biders)): 90 | bider, remaining_budget, initial_budget = biders[b] 91 | if item in bids[bider].keys() and remaining_budget >= bids[bider][item]: 92 | result.append((item, bider)) 93 | biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget) 94 | break 95 | return result 96 | 97 | 98 | def test_greedy(): 99 | biders = (("m", 30, 30), ("l", 10, 10), ("k", 25, 25), ("p", 20, 20)) 100 | bids = {"m": {'a': 2, 'b': 3, 'd': 1}, "l": {"c": 1, 'a': 5}, "p": {}, "k": {'b': 5, 'c': 2, 'd': 1}} 101 | items = tuple(('a', 'b', 'd', 'a', 'a')) 102 | print greedy_adwords(biders=biders, bids=bids, items=items) 103 | 104 | 105 | def test_balance(): 106 | biders = (("m", 30, 30), ("l", 10, 10), ("k", 25, 25), ("p", 20, 20)) 107 | bids = {"m": {'a': 2, 'b': 3, 'd': 1}, "l": {"c": 1, 'a': 5}, "p": {}, "k": {'b': 5, 'c': 2, 'd': 1}} 108 | items = tuple(('a', 'b', 'd', 'a', 'a')) 109 | print balance_adwords(biders=biders, bids=bids, items=items) 110 | 111 | 112 | def test_ageneralized_balance(): 113 | biders = (("m", 30, 30), ("l", 10, 10), ("k", 25, 25), ("p", 20, 20)) 114 | bids = {"m": {'a': 2, 'b': 3, 'd': 1}, "l": {"c": 1, 'a': 5}, "p": {}, "k": {'b': 5, 'c': 2, 'd': 1}} 115 | items = tuple(('a', 'b', 'd', 'a', 'a')) 116 | print ageneralized_balance_adwords(biders=biders, bids=bids, items=items) 117 | 118 | 119 | if __name__ == '__main__': 120 | test_greedy() 121 | test_balance() 122 | test_ageneralized_balance() 123 | -------------------------------------------------------------------------------- /algorithmics/candies.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | def candies(): 5 | N = int(raw_input()) 6 | next = collections.defaultdict(int) 7 | to_update = collections.defaultdict(int) 8 | 9 | def update(prev, current): 10 | if current in to_update: 11 | for i in range(to_update[current], current): 12 | next[i] += 1 13 | else: 14 | next[prev] += 1 15 | if prev >= 1 and next[prev] == next[prev - 1] and f[prev] < f[prev - 1]: 16 | update(prev - 1, current) 17 | else: 18 | to_update[current] = prev 19 | 20 | f = [] 21 | for i in range(N): 22 | current = int(raw_input()) 23 | f.append(current) 24 | if i == 0: 25 | next[i] = 1 26 | continue 27 | prev = f[i - 1] 28 | if current <= prev: 29 | next[i] = 1 30 | if next[i - 1] == 1: 31 | if current < prev: update(i - 1, i) 32 | # else: next[i] += 1 33 | elif current > prev: 34 | next[i] = next[i - 1] + 1 35 | 36 | print sum(next.values()) 37 | 38 | 39 | candies() 40 | -------------------------------------------------------------------------------- /algorithmics/chg_bits.py: -------------------------------------------------------------------------------- 1 | # setBit() returns an integer with the bit at 'offset' set to 1. 2 | def setBit(int_type, offset): 3 | mask = 1 << offset 4 | return (int_type | mask) 5 | 6 | 7 | # clearBit() returns an integer with the bit at 'offset' cleared. 
8 | def clearBit(int_type, offset): 9 | mask = ~(1 << offset) 10 | return (int_type & mask) 11 | 12 | 13 | def set_bit(ind, val, int_type): 14 | return clearBit(int_type, ind) if val == "0" else setBit(int_type, ind) 15 | 16 | 17 | def get_c(inx, A, B, n): 18 | res = A + B 19 | mask = 1 << inx 20 | return '1' if (res & mask) else '0' 21 | 22 | 23 | def chg_bit(): 24 | N, Q = [int(x) for x in raw_input().split()] 25 | A = int(raw_input(), 2) 26 | B = int(raw_input(), 2) 27 | result = "" 28 | while Q > 0: 29 | q = [x for x in raw_input().split()] 30 | if len(q) == 2: # get operation 31 | result += get_c(int(q[1]), A, B, N + 1) 32 | elif q[0] == "set_a": 33 | A = set_bit(int(q[1]), q[2], A) 34 | else: 35 | B = set_bit(int(q[1]), q[2], B) 36 | Q -= 1 37 | print result 38 | 39 | 40 | chg_bit() 41 | -------------------------------------------------------------------------------- /algorithmics/cryptarithmetic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: cryptarithmetic 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | import string, re, itertools 8 | import time 9 | 10 | examples = """TWO + TWO == FOUR 11 | A**2 + B**2 == C**2 12 | A**2 + BE**2 == BY**2 13 | A**2 + BY**2 == BE**2 14 | X / X == X 15 | X / X == 1 16 | A**N + B**N == C**N and N > 1 17 | ATOM**0.5 == A + TO + M 18 | GLITTER is not GOLD 19 | ONE < TWO and FOUR < FIVE 20 | ONE < TWO < THREE 21 | RAMN == R**3+ RM**3 == N**3 + RX**3 22 | sum(range(AA)) == BB 23 | sum(range(POP)) == BOBO 24 | ODD + ODD == EVEN 25 | PLUTO is not set([PLANETS]) """.splitlines() 26 | 27 | 28 | def solve(formula, verbose=False): 29 | """Given a formula like 'ODD + ODD == EVEN', fill in digits to solve it. 30 | Input formula is a string; output is a digit-filled-in string or None.""" 31 | for f in fill_in(formula): 32 | if valid(f): 33 | if not verbose: print f 34 | return f 35 | 36 | 37 | def fill_in(formula): 38 | "Generate all possible fillings-in of letters in formula with digits." 39 | letters = ''.join(set(re.findall('[A-Z]', formula))) 40 | for digits in itertools.permutations('1234567890', len(letters)): 41 | table = string.maketrans(letters, ''.join(digits)) 42 | yield formula.translate(table) 43 | 44 | 45 | def valid(f): 46 | """Formula f is valid if and only if it has no 47 | numbers with leading zero, and evals true.""" 48 | try: 49 | return not re.search(r'\b0[0-9]', f) and eval(f) is True 50 | except ArithmeticError: 51 | return False 52 | 53 | 54 | def timedcall(fct, formula): 55 | """ 56 | Calculate time of execution 57 | """ 58 | t0 = time.clock() 59 | fct(formula) 60 | t1 = time.clock() 61 | return t1 - t0 62 | 63 | 64 | def compile_formula(formula, verbose=False): 65 | """Compile formula into a function. Also return letters found, as a str, 66 | in same order as parms of function. The first digit of a multi-digit 67 | number can't be 0. 
So if YOU is a word in the formula, and the function
 68 |     is called with Y equal to 0, the function should return False."""
 69 | 
 70 |     first_letters = set(re.findall(r'\b([A-Z])[A-Z]', formula))
 71 |     letters = ''.join(set(re.findall('[A-Z]', formula)))
 72 |     parms = ', '.join(letters)
 73 |     tokens = map(compile_word, re.split('([A-Z]+)', formula))
 74 |     body = ''.join(tokens)
 75 |     if first_letters:
 76 |         tests = ' and '.join(L + '!=0' for L in first_letters)
 77 |         body = '%s and (%s)' % (tests, body)
 78 |     f = 'lambda %s: %s' % (parms, body)
 79 |     if verbose: print f
 80 |     return eval(f), letters
 81 | 
 82 | 
 83 | def compile_word(word):
 84 |     """Compile a word of uppercase letters as numeric digits.
 85 |     E.g., compile_word('YOU') => '(1*U+10*O+100*Y)'
 86 |     Non-uppercase words unchanged: compile_word('+') => '+'"""
 87 |     if word.isupper():
 88 |         terms = [('%s*%s' % (10 ** i, d))
 89 |                  for (i, d) in enumerate(word[::-1])]
 90 |         return '(' + '+'.join(terms) + ')'
 91 |     else:
 92 |         return word
 93 | 
 94 | 
 95 | def faster_solve(formula):
 96 |     """Given a formula like 'ODD + ODD == EVEN', fill in digits to solve it.
 97 |     Input formula is a string; output is a digit-filled-in string or None.
 98 |     This version precompiles the formula; only one eval per formula."""
 99 |     f, letters = compile_formula(formula)
100 |     for digits in itertools.permutations((1, 2, 3, 4, 5, 6, 7, 8, 9, 0), len(letters)):
101 |         try:
102 |             if f(*digits) is True:
103 |                 table = string.maketrans(letters, ''.join(map(str, digits)))
104 |                 return formula.translate(table)
105 |         except ArithmeticError:
106 |             pass
107 | 
108 | 
109 | def test1():
110 |     t0 = time.clock()
111 |     for example in examples:
112 |         print '%6.4f sec : %s' % (timedcall(faster_solve, example), example)
113 |     print '%6.4f sec in total.' % (time.clock() - t0)
114 | 
115 | 
116 | def test2():
117 |     assert faster_solve('A + B == BA') == None  # should NOT return '1 + 0 == 01'
118 |     assert faster_solve('YOU == ME**2') in ('289 == 17**2', '576 == 24**2', '841 == 29**2')
119 |     assert faster_solve('X / X == X') == '1 / 1 == 1'
120 |     return 'tests pass'
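
# Usage sketch -- which valid filling comes back first depends on the
# permutation order, so treat the exact digits as illustrative:
#
#   >>> solve('ODD + ODD == EVEN')
#   '655 + 655 == 1310'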
--------------------------------------------------------------------------------
/algorithmics/diff-div.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | 
 4 | def check_nbr(nbr, nbrs, K):
 5 |     """
 6 |     checks if we could find a potential diff
 7 |     only compare nbr with the lowest nbr in the list
 8 |     """
 9 |     if nbr - nbrs[0] >= K:
10 |         return True
11 | 
12 | 
13 | def generate_indices(low_val, high_val):
14 |     """
15 |     given a list, a low value and a high value, generate four indices that cut the interval by four
16 |     """
17 |     step = int(round((high_val - low_val) / 4))
18 |     return low_val, low_val + step, low_val + 2 * step, low_val + 3 * step, high_val
19 | 
20 | 
21 | def dichotomie(nbr, nbrs, K, low_val, high_val, itr):
22 |     """
23 |     proceed by binary search, recursion shouldn't go beyond the limit itr
24 |     for the sake of optimization, we will cut each interval into four parts
25 |     """
26 |     itr += 1
27 |     ind1, ind2, ind3, ind4, ind5 = generate_indices(low_val, high_val)
28 |     if ind2 == 0 or ind1 >= ind2 or ind2 >= ind3 or ind3 >= ind4 or ind4 >= ind5 or itr == 100:  # can't be divided by 4, iterate
29 |         for i in range(high_val, low_val - 1, -1):
30 |             if nbr - nbrs[i] == K:
31 |                 return True
32 | 
33 |         return False
34 |     elif nbr - nbrs[ind4] == K:  # first quarter border
35 |         return True
36 |     elif nbr - nbrs[ind4] > K:  # first quarter
37 |         return dichotomie(nbr, nbrs, K, ind4, ind5, itr)
38 |     elif nbr - nbrs[ind3] == K:  # 2d quarter border
39 |         return True
40 |     elif nbr - nbrs[ind3] > K:  # 2d quarter
41 |         return dichotomie(nbr, nbrs, K, ind3, ind4, itr)
42 |     elif nbr - nbrs[ind2] == K:  # 3d quarter border
43 |         return True
44 |     elif nbr - nbrs[ind2] > K:  # 3d quarter
45 |         return dichotomie(nbr, nbrs, K, ind2, ind3, itr)
46 |     elif nbr - nbrs[ind1] == K:  # 4th quarter border
47 |         return True
48 |     elif nbr - nbrs[ind1] > K:  # 4th quarter
49 |         return dichotomie(nbr, nbrs, K, ind1, ind2, itr)
50 |     else:
51 |         return False
52 | 
53 | 
54 | def diffs():
55 |     N, K = raw_input().split()
56 |     N, K = int(N), int(K)
57 |     nbrs = [int(n) for n in raw_input().split()]
58 |     nbrs.sort()
59 |     len_nbrs = len(nbrs)
60 |     sum_diff = 0
61 |     i = len_nbrs - 1
62 |     while i > 0:
63 |         itr = 0
64 |         nbr = nbrs[i]
65 |         if i / 10 > 10000:
66 |             itr = 0
67 |         elif i / 10 > 1000:
68 |             itr = 1
69 |         elif i / 10 > 100:
70 |             itr = 2
71 |         elif i / 10 > 10:
72 |             itr = 3
73 |         else:
74 |             itr = 4
75 |         if not check_nbr(nbr, nbrs, K):
76 |             break
77 |         if dichotomie(nbr, nbrs, K, 0, i - 1, itr=0):
78 |             sum_diff += 1
79 |         i -= 1
80 |     return sum_diff
81 | 
82 | 
83 | t = time.clock()
84 | print diffs()
85 | print time.clock() - t
--------------------------------------------------------------------------------
/algorithmics/flowers.py:
--------------------------------------------------------------------------------
 1 | def flowers():
 2 |     nbr_fl, nbr_fr = [int(x) for x in raw_input().split()]
 3 |     prices = [float(x) for x in raw_input().split()]
 4 |     prices.sort()
 5 |     amount = 0
 6 |     x = 0
 7 |     while nbr_fl:
 8 |         if nbr_fl <= nbr_fr:
 9 |             amount += sum([(1 + x) * c for c in prices[:nbr_fl]])
10 |             nbr_fl = 0
11 |         else:
12 |             amount += sum([(1 + x) * c for c in prices[nbr_fl - nbr_fr:nbr_fl]])
13 |             nbr_fl -= nbr_fr
14 |         x += 1
15 |     amount_flat = int(amount)
16 |     amount_float = amount % 1
17 |     if amount_float != 0:
18 |         print amount
19 |     else:
20 |         print amount_flat
21 | 
22 | 
23 | flowers()
--------------------------------------------------------------------------------
/algorithmics/forest_slicing.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | sample_input = """
 4 | 20 19
 5 | 2 1
 6 | 3 1
 7 | 4 3
 8 | 5 2
 9 | 6 5
10 | 7 1
11 | 8 1
12 | 9 2
13 | 10 7
14 | 11 10
15 | 12 3
16 | 13 7
17 | 14 8
18 | 15 12
19 | 16 6
20 | 17 6
21 | 18 10
22 | 19 1
23 | 20 8
24 | """
25 | # expected return 4
26 | N, M = raw_input().split()
27 | summy = 0
28 | forest = collections.defaultdict(list)
29 | depth = collections.defaultdict(int)
30 | 
31 | 
32 | def forest_construction():
33 |     """
34 |     construct the graph from std input
35 |     """
36 |     global head
37 |     for i in range(int(M)):
38 |         node2, node1 = raw_input().split()
39 |         forest[node1].append(node2)
40 | 
41 | 
42 | def nbr_nodes(node):
43 |     """
44 |     returns the number of nodes in the current sub-graph
45 |     """
46 |     nbr = 1
47 |     for n in forest[node]:
48 |         nbr += nbr_nodes(n)
49 |     return nbr
50 | 
51 | 
52 | def nodes_depth():
53 |     """
54 |     construct depth for each node
55 |     """
56 |     for node in forest.keys():
57 |         depth[node] = nbr_nodes(node)
58 | 
59 | 
60 | def get_head():
61 |     """
62 |     returns the head of the graph
63 |     """
64 |     head = ''
65 |     max_v = 0
66 |     for k, v in depth.items():
67 |         if v > max_v:
68 |             head = k
69 |             max_v = v
70 |     return head
71 | 
72 | 
73 | def forest_slicing(node):
74 |     """
75 |     calculate the number of removed edges in such a forest.
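    An edge into a subtree can be cut exactly when that subtree holds an even
    number of nodes, which is what the depth[n] % 2 == 0 test below checks
    (despite its name, depth[n] stores the size of the subtree rooted at n).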
76 | """ 77 | summy = 0 78 | for n in forest[node]: # direct successors 79 | if not n in depth: 80 | depth[n] = 1 81 | elif depth[n] % 2 == 0: 82 | summy += 1 83 | summy += forest_slicing(n) 84 | else: 85 | summy += forest_slicing(n) 86 | return summy 87 | 88 | 89 | forest_construction() 90 | nodes_depth() 91 | print forest_slicing(get_head()) 92 | -------------------------------------------------------------------------------- /algorithmics/grammer-parser.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: cryptarithmetic 3 | # 4 | # Author: mourad mourafiq 5 | # 6 | # Copyright: (c) mourad mourafiq 7 | # ------------------------------------------------------------------------------- 8 | 9 | from functools import update_wrapper 10 | from string import split 11 | import re 12 | 13 | 14 | def grammar(description, whitespace=r'\s*'): 15 | """Convert a description to a grammar. Each line is a rule for a 16 | non-terminal symbol; it looks like this: 17 | Symbol => A1 A2 ... | B1 B2 ... | C1 C2 ... 18 | where the right-hand side is one or more alternatives, separated by 19 | the '|' sign. Each alternative is a sesys.setrecursionlimit(1500)quence of atoms, separated by 20 | spaces. An atom is either a symbol on some left-hand side, or it is 21 | a regular expression that will be passed to re.match to match a token. 22 | 23 | Notation for *, +, or ? not allowed in a rule alternative (but ok 24 | within a token). Use '\' to continue long lines. You must include spaces 25 | or tabs around '=>' and '|'. That's within the grammar description itself. 26 | The grammar that gets defined allows whitespace between tokens by default; 27 | specify '' as the second argument to grammar() to disallow this (or supply 28 | any regular expression to describe allowable whitespace between tokens).""" 29 | G = {' ': whitespace} 30 | description = description.replace('\t', ' ') # no tabs! 31 | for line in split(description, '\n'): 32 | lhs, rhs = split(line, ' => ', 1) 33 | alternatives = split(rhs, ' | ') 34 | G[lhs] = tuple(map(split, alternatives)) 35 | return G 36 | 37 | 38 | def decorator(d): 39 | "Make function d a decorator: d wraps a function fn." 40 | 41 | def _d(fn): 42 | return update_wrapper(d(fn), fn) 43 | 44 | update_wrapper(_d, d) 45 | return _d 46 | 47 | 48 | @decorator 49 | def memo(f): 50 | """Decorator that caches the return value for each call to f(args). 51 | Then when called again with same args, we can just look it up.""" 52 | cache = {} 53 | 54 | def _f(*args): 55 | try: 56 | return cache[args] 57 | except KeyError: 58 | cache[args] = result = f(*args) 59 | return result 60 | except TypeError: 61 | # some element of args can't be a dict key 62 | return f(args) 63 | 64 | return _f 65 | 66 | 67 | def parse(start_symbol, text, grammar): 68 | """Example call: parse('Exp', '3*x + b', G). 69 | Returns a (tree, remainder) pair. If remainder is '', it parsed the whole 70 | string. Failure iff remainder is None. This is a deterministic PEG parser, 71 | so rule order (left-to-right) matters. 
 74 | 
 75 |     tokenizer = grammar[' '] + '(%s)'
 76 | 
 77 |     def parse_sequence(sequence, text):
 78 |         result = []
 79 |         for atom in sequence:
 80 |             tree, text = parse_atom(atom, text)
 81 |             if text is None: return Fail
 82 |             result.append(tree)
 83 |         return result, text
 84 | 
 85 |     @memo
 86 |     def parse_atom(atom, text):
 87 |         if atom in grammar:  # Non-Terminal: tuple of alternatives
 88 |             for alternative in grammar[atom]:
 89 |                 tree, rem = parse_sequence(alternative, text)
 90 |                 if rem is not None: return [atom] + tree, rem
 91 |             return Fail
 92 |         else:  # Terminal: match characters against start of text
 93 |             m = re.match(tokenizer % atom, text)
 94 |             return Fail if (not m) else (m.group(1), text[m.end():])
 95 | 
 96 |     # Body of parse:
 97 |     return parse_atom(start_symbol, text)
 98 | 
 99 | 
100 | Fail = (None, None)
101 | 
102 | G = grammar("""Exp => Term [+-] Exp | Term
103 | Term => Factor [*/] Term | Factor
104 | Factor => Funcall | Var | Num | [(] Exp [)]
105 | Funcall => Var [(] Exp [)]
106 | Exps => Exp [,] Exps | Exp
107 | Var => [a-zA-Z_]\w*
108 | Num => [-+]?[0-9]+([.][0-9]*)?""", whitespace='\s*')
109 | 
110 | JSON = grammar("""object => { } | { members }
111 | members => pair , members | pair
112 | pair => string : value
113 | array => [[] []] | [[] elements []]
114 | elements => value , elements | value
115 | value => string | number | object | array | true | false | null
116 | string => "[^"]*"
117 | number => int frac exp | int frac | int exp | int
118 | int => -?[0-9][0-9]*
119 | frac => [.][0-9]+
120 | exp => [eE][-+]?[0-9]+""", whitespace='\s*')
121 | 
122 | 
123 | def json_parse(text):
124 |     return parse('value', text, JSON)
125 | 
126 | 
127 | def test():
128 |     assert json_parse('["testing", 1, 2, 3]') == (
129 |         ['value', ['array', '[', ['elements', ['value',
130 |             ['string', '"testing"']], ',', ['elements', ['value', ['number',
131 |             ['int', '1']]], ',', ['elements', ['value', ['number',
132 |             ['int', '2']]], ',', ['elements', ['value', ['number',
133 |             ['int', '3']]]]]]], ']']], '')
134 | 
135 |     assert json_parse('-123.456e+789') == (
136 |         ['value', ['number', ['int', '-123'], ['frac', '.456'], ['exp', 'e+789']]], '')
137 | 
138 |     assert json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}') == (
139 |         ['value', ['object', '{', ['members', ['pair', ['string', '"age"'],
140 |             ':', ['value', ['number', ['int', '21']]]], ',', ['members',
141 |             ['pair', ['string', '"state"'], ':', ['value', ['string', '"CO"']]],
142 |             ',', ['members', ['pair', ['string', '"occupation"'], ':',
143 |             ['value', ['string', '"rides the rodeo"']]]]]], '}']], '')
144 |     return 'tests pass'
145 | 
146 | 
147 | print G
148 | print JSON
149 | print test()
--------------------------------------------------------------------------------
/algorithmics/hanoi_tower.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------
 2 | # Name: hanoi tower
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | RODS = 'ABCDEFGHIJKLM'
 8 | 
 9 | 
10 | def hanoi_tower(nbr_disks=5, nbr_rods=6):
11 |     """
12 |     Resolves the hanoi tower problem for n rods and n disks.
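    The state space is explored breadth-first (paths.pop(0)), so the first
    path that reaches the goal uses the fewest moves.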
13 |     goal is to move all disks from the left rod to the right rod
14 |     """
15 |     goal = [''] * nbr_rods
16 |     start = [''] * nbr_rods
17 |     start[0] = RODS[0:nbr_disks]
18 |     goal[-1] = RODS[0:nbr_disks]
19 |     goal = tuple(goal)
20 |     explored = set()  # set of explored paths
21 |     paths = [[start]]
22 |     while paths:
23 |         to_explore = paths.pop(0)
24 |         current_state = to_explore[-1]
25 |         for (state, action) in next_state(current_state, nbr_disks).items():
26 |             if state not in explored:
27 |                 explored.add(state)
28 |                 path2 = to_explore + [action, state]
29 |                 if goal == state:
30 |                     return path2
31 |                 else:
32 |                     paths.append(path2)
33 |     return []
34 | 
35 | 
36 | def next_state(state, nbr_disks):
37 |     result = {}
38 |     for i in range(len(state)):  # iterate over the rods, not the disks
39 |         if state[i] != '':
40 |             i_disk = state[i][-1]
41 |             for j in range(len(state)):
42 |                 j_disk = state[j][-1] if state[j] != '' else state[j]
43 |                 if i != j and i_disk > j_disk:
44 |                     c_state = list(state)
45 |                     c_state[i] = state[i][:-1]
46 |                     c_state[j] = state[j] + i_disk
47 |                     result[tuple(c_state)] = 'move %s => %s' % (i_disk, j + 1)
48 | 
49 |     return result
50 | 
51 | 
52 | print hanoi_tower()
--------------------------------------------------------------------------------
/algorithmics/inverse_function.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------
 2 | # Name: inverse function
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | from __future__ import division
 8 | 
 9 | 
10 | def inverse(f, delta=1 / 1024.):
11 |     """
12 |     given a function f, monotonically increasing on a positive interval,
13 |     return x = f^-1(y), an approximation of its inverse function
14 |     """
15 | 
16 |     def _f(y):
17 |         lv, hv = find_bounds(f, y)
18 |         return binary_search(f, y, lv, hv, delta)
19 | 
20 |     return _f
21 | 
22 | 
23 | def find_bounds(f, y):
24 |     """
25 |     given a function f,
26 |     return lv & hv such that lv <= f^-1(y) <= hv
27 |     """
28 |     x = 1
29 |     while f(x) < y:
30 |         x = x * 2
31 |     lv = 0 if x == 1 else x / 2
32 |     return lv, x
33 | 
34 | 
35 | def binary_search(f, y, lv, hv, delta):
36 |     """
37 |     Binary search.
38 | returns x such that f(x) is within delta of y : y-delta <= f(x) <= y+delta 39 | for exact approximation delta should be as small as possible 40 | """ 41 | while lv <= hv: 42 | x = (lv + hv) / 2 43 | if f(x) > y: 44 | hv = x - delta 45 | elif f(x) < y: 46 | lv = x + delta 47 | else: 48 | return x 49 | return hv if f(hv) - y < y - f(lv) else lv 50 | 51 | 52 | def square(x): return x * x 53 | 54 | 55 | def power4(x): return x ** 4 56 | 57 | 58 | sqrty = inverse(square) 59 | log4 = inverse(power4) 60 | print square(3) 61 | print power4(3) 62 | print sqrty(9) 63 | print log4(81) 64 | 65 | -------------------------------------------------------------------------------- /algorithmics/iter_circle_sum.py: -------------------------------------------------------------------------------- 1 | # circle sum 2 | def cal_sum(): 3 | T = int(raw_input()) 4 | 5 | def c_sum(a, N, M): 6 | res = [] 7 | for i in xrange(N): 8 | res.append(list(a)) 9 | for i in xrange(M): 10 | for j in xrange(N): 11 | res[j][((i + j) % N)] += (res[j][((i + j - 1) % N)] + res[j][((i + j + 1) % N)]) 12 | res[j][((i + j) % N)] %= 1000000007 13 | for i in xrange(N): 14 | print ' '.join(map(str, res[i])) 15 | 16 | for t in xrange(T): 17 | N, M = [int(x) for x in raw_input().split()] 18 | a = [int(x) for x in raw_input().split()] 19 | c_sum(a, N, M) 20 | if t < (T - 1): print "" 21 | 22 | 23 | cal_sum() 24 | -------------------------------------------------------------------------------- /algorithmics/licence: -------------------------------------------------------------------------------- 1 | Copyright (c) Mourad MOURAFIQ and individual contributors. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
11 | -------------------------------------------------------------------------------- /algorithmics/media.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | 4 | def add(e, min_s, max_s, max_min, min_max, l_min, l_max): 5 | if l_min == 0: 6 | min_s.append(e) 7 | max_min = e 8 | l_min += 1 9 | return True, max_min, min_max, l_min, l_max 10 | if e <= max_min: 11 | l_min += 1 12 | min_s.append(e) 13 | else: 14 | if e < min_max or l_max == 0: 15 | min_max = e 16 | l_max += 1 17 | max_s.append(e) 18 | return True, max_min, min_max, l_min, l_max 19 | 20 | 21 | def remove(e, min_s, max_s, max_min, min_max, l_min, l_max): 22 | if e in min_s: 23 | l_min -= 1 24 | min_s.remove(e) 25 | if e == max_min: 26 | if l_min > 0: 27 | max_min = max(min_s) 28 | else: 29 | max_min = 0 30 | return True, max_min, min_max, l_min, l_max 31 | if e in max_s: 32 | l_max -= 1 33 | max_s.remove(e) 34 | if e == min_max: 35 | if l_max > 0: 36 | min_max = min(max_s) 37 | else: 38 | min_max = 0 39 | return True, max_min, min_max, l_min, l_max 40 | return True, max_min, min_max, l_min, l_max 41 | 42 | 43 | def operate(op, e, min_s, max_s, max_min, min_max, l_min, l_max): 44 | if op == "a": return add(e, min_s, max_s, max_min, min_max, l_min, l_max) 45 | if op == "r": return remove(e, min_s, max_s, max_min, min_max, l_min, l_max) 46 | return False, max_min, min_max, l_min, l_max 47 | 48 | 49 | def size_s(min_s, max_s, max_min, min_max, l_min, l_max): 50 | if l_min == l_max == 0: 51 | return False, max_min, min_max, l_min, l_max 52 | if l_max > l_min: 53 | e = min_max 54 | max_s.remove(e) 55 | l_max -= 1 56 | if l_max > 0: 57 | min_max = min(max_s) 58 | else: 59 | min_max = 0 60 | min_s.append(e) 61 | l_min += 1 62 | max_min = e 63 | return True, max_min, min_max, l_min, l_max 64 | if l_min > l_max + 1: 65 | e = max_min 66 | min_s.remove(e) 67 | l_min -= 1 68 | if l_min > 0: 69 | max_min = max(min_s) 70 | else: 71 | max_min = 0 72 | max_s.append(e) 73 | l_max += 1 74 | min_max = e 75 | return True, max_min, min_max, l_min, l_max 76 | return True, max_min, min_max, l_min, l_max 77 | 78 | 79 | def calculate(min_s, max_s, max_min, min_max, l_min, l_max): 80 | if l_min > l_max: 81 | print max_min 82 | if l_min == l_max: 83 | med = max_min + min_max 84 | if med % 2 == 0: 85 | print "%.0lf" % (med / 2) 86 | else: 87 | print "%.1lf" % (med / 2) 88 | 89 | 90 | def median(): 91 | n_op = int(raw_input()) 92 | min_s = [] 93 | max_min = 0 94 | l_min = 0 95 | max_s = [] 96 | min_max = 0 97 | l_max = 0 98 | for i in xrange(n_op): 99 | op, e = [x for x in raw_input().split()] 100 | e = int(e) 101 | stat, max_min, min_max, l_min, l_max = operate(op, e, min_s, max_s, max_min, min_max, l_min, l_max) 102 | if stat: 103 | stat, max_min, min_max, l_min, l_max = size_s(min_s, max_s, max_min, min_max, l_min, l_max) 104 | if stat: 105 | calculate(min_s, max_s, max_min, min_max, l_min, l_max) 106 | else: 107 | print "Wrong!" 108 | else: 109 | print "Wrong!" 
110 | 111 | 112 | median() 113 | -------------------------------------------------------------------------------- /algorithmics/n_c_p.py: -------------------------------------------------------------------------------- 1 | def n_div_P(n, P): 2 | if n >= P: 3 | j = 0 4 | m = n % P 5 | q = n / P 6 | j = q * (P - m - 1) 7 | return j + ((m + 1) * n_div_P(q, P)) 8 | else: 9 | return 0 10 | 11 | 12 | def n_C_p(): 13 | T = int(raw_input()) 14 | result = [] 15 | for i in range(T): 16 | n, P = raw_input().split() 17 | n, P = int(n), int(P) 18 | j = n_div_P(n, P) 19 | result.append(j) 20 | for i in range(T): 21 | print result[i] 22 | 23 | 24 | n_C_p() 25 | -------------------------------------------------------------------------------- /algorithmics/numpy_circle_sum.py: -------------------------------------------------------------------------------- 1 | # circle sum 2 | 3 | import numpy as nm 4 | 5 | 6 | def cal_sum(): 7 | T = int(raw_input()) 8 | 9 | def list_mat(n, m_div, m_mod): 10 | a_n = [] 11 | if m_div > 0: 12 | for i in xrange(n): 13 | a_i = nm.eye(n, dtype=nm.int) 14 | i_l = (i - 1) % n 15 | i_r = (i + 1) % n 16 | a_i[i_r][i] = 1 17 | a_i[i_l][i] = 1 18 | a_n.append(a_i) 19 | else: 20 | for i in xrange(n): 21 | a_i = nm.eye(n, dtype=nm.int) 22 | for j in xrange(m_mod): 23 | i_l = (i + j - 1) % n 24 | i_r = (i + j + 1) % n 25 | a_i[i_r][i] = 1 26 | a_i[i_l][i] = 1 27 | a_n.append(a_i) 28 | return a_n 29 | 30 | def mat_mult(n, a, a_n, ind, m_div, m_mod): 31 | res = nm.mat(nm.eye(n), dtype=nm.int) 32 | if m_div > 0: 33 | for i in xrange(n): 34 | j = (i + ind) % n 35 | res *= nm.mat(a_n[j]) 36 | res = res ** m_div 37 | for i in xrange(m_mod): 38 | j = (i + ind) % n 39 | res *= nm.mat(a_n[j]) 40 | return a * res 41 | 42 | for t in xrange(T): 43 | N, M = [int(x) for x in raw_input().split()] 44 | a = [int(x) for x in raw_input().split()] 45 | M_mod = M % N 46 | M_div = M / N 47 | a_n = list_mat(N, M_div, M_mod) 48 | for i in xrange(N): 49 | res = nm.nditer(mat_mult(N, a, a_n, i, M_div, M_mod)) 50 | print ' '.join(map(str, res)) 51 | if t < (T - 1): print "" 52 | 53 | 54 | cal_sum() 55 | -------------------------------------------------------------------------------- /algorithmics/palindromes.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: palindromes 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | 8 | def longest_subpalindrome(string): 9 | """ 10 | Returns the longest subpalindrome string from the current string 11 | Return (i,j) 12 | """ 13 | #first we check if string is "" 14 | if string == "": return (0, 0) 15 | 16 | def length(slice): a, b = slice; return b - a 17 | 18 | slices = [grow(string, start, end) 19 | for start in range(len(string)) 20 | for end in (start, start + 1) 21 | ] 22 | return max(slices, key=length) 23 | 24 | 25 | def grow(string, start, end): 26 | """ 27 | starts with a 0 or 1 length palindrome and try to grow bigger 28 | """ 29 | while (start > 0 and end < len(string) 30 | and string[start - 1].upper() == string[end].upper()): 31 | start -= 1; 32 | end += 1 33 | return (start, end) 34 | -------------------------------------------------------------------------------- /algorithmics/pooring_water.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------------------------------
 2 | # Name: pouring water
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | def pooring_prob(size_x, size_y, goal, start=(0, 0)):
 8 |     """
 9 |     Resolves the pouring problem for two glasses x & y.
10 |     goal is the size we are looking for.
11 |     size_x & size_y are the sizes of glass x and glass y respectively.
12 |     """
13 |     if goal in start:
14 |         return [start]
15 |     explored = set()  # set of visited states
16 |     paths = [[start]]
17 |     while paths:
18 |         to_explore = paths.pop(0)
19 |         (x, y) = to_explore[-1]
20 |         for (state, action) in next_state(x, y, size_x, size_y).items():
21 |             if state not in explored:
22 |                 explored.add(state)
23 |                 path2 = to_explore + [action, state]
24 |                 if goal in state:
25 |                     return path2
26 |                 else:
27 |                     paths.append(path2)
28 | 
29 |     return []
30 | 
31 | 
32 | def next_state(x, y, size_x, size_y):
33 |     assert x <= size_x and y <= size_y
34 |     return {
35 |         (0, x + y) if x + y <= size_y else (x - (size_y - y), size_y): 'x->y',
36 |         (y + x, 0) if x + y <= size_x else (size_x, y - (size_x - x)): 'y->x',
37 |         (size_x, y): 'fill x', (x, size_y): 'fill y',
38 |         (0, y): 'empty x', (x, 0): 'empty y',
39 |     }
40 | 
41 | 
42 | print pooring_prob(440, 900, 600)
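
# A concrete step, for illustration: with glass sizes (4, 9), the state (4, 0)
# (glass x full) can move to (0, 4) via 'x->y', (4, 9) via 'fill y',
# (0, 0) via 'empty x', or stay at (4, 0) via 'fill x' / 'empty y'.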
--------------------------------------------------------------------------------
/algorithmics/string_red.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | 
 4 | def reduce_str(strg):
 5 |     count_abc = collections.defaultdict(int)
 6 |     str_g = list(strg)
 7 |     for s in str_g:
 8 |         count_abc[s] += 1
 9 |     if (count_abc['a'] == 0 and count_abc['c'] == 0) or (count_abc['a'] == 0 and count_abc['b'] == 0) or (
10 |             count_abc['b'] == 0 and count_abc['c'] == 0):
11 |         return len(strg)
12 |     elif (count_abc['a'] % 2 == count_abc['b'] % 2 == count_abc['c'] % 2):
13 |         return 2
14 |     else:
15 |         return 1
16 | 
17 | 
18 | def string_red():
19 |     T = int(raw_input())
20 |     r = set()
21 |     for t in xrange(T):
22 |         case = raw_input()
23 |         print reduce_str(case)
24 | 
25 | 
26 | string_red()
27 | 
--------------------------------------------------------------------------------
/algorithmics/suffixes.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | 
 3 | 
 4 | def check_in(i_from, i_to, str, sfx):
 5 |     sum = 0
 6 |     for i in range(i_from, min(i_to, len(sfx))):
 7 |         if sfx[i] == str[i]:
 8 |             sum += 1
 9 |         else:
10 |             break
11 |     return sum
12 | 
13 | 
14 | def suffix():
15 |     N = int(raw_input())
16 |     for i in range(N):
17 |         sum = 0
18 |         str = raw_input()
19 |         len_str = len(str)
20 |         sum += len_str
21 |         sfx = str[1:]
22 |         cpt = 2
23 |         while sfx != "":
24 |             len_sfx = len(sfx)
25 |             step = 90
26 |             for i in range(0, len_sfx, step):
27 |                 if str[i:i + step] == sfx[i:i + step]:
28 |                     sum += step
29 |                 else:
30 |                     sum += check_in(i, i + step, str, sfx)
31 |                     break
32 |             sfx = str[cpt:]
33 |             cpt += 1
34 |         print sum
35 | 
36 | 
37 | import time
38 | 
39 | t = time.clock()
40 | suffix()
41 | print time.clock() - t
--------------------------------------------------------------------------------
/algorithmics/unfriendly.py:
--------------------------------------------------------------------------------
 1 | from math import sqrt
 2 | from fractions import gcd
 3 | 
 4 | 
 5 | def get_factors(x):
 6 |     factors = set([x])
 7 | 
 8 |     sqrtX = int(sqrt(x))
 9 | 
10 |     for i in range(1, sqrtX + 1):
11 | 
12 |         if x % i == 0:
13 |             factors.add(i)
14 |             factors.add(x / i)
15 | 
16 |     return factors
17 | 
18 | 
19 | def friendly():
20 |     _, friendly = [int(i) for i in raw_input().split()]
21 |     unfriendlies = [int(i) for i in raw_input().split()]
22 | 
23 |     friendly_factors = get_factors(friendly)
24 | 
25 |     unfriendly_factors = set()
26 | 
27 |     for unfriendly in unfriendlies:
28 |         g = gcd(friendly, unfriendly)
29 | 
30 |         unfriendly_factors.add(g)
31 |         unfriendly_factors.update(get_factors(g))
32 |     print len(friendly_factors - unfriendly_factors)
33 | 
34 | 
35 | friendly()
--------------------------------------------------------------------------------
/classification/decision_trees.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Created on Nov 01, 2012
 3 | 
 4 | @author: Mourad Mourafiq
 5 | """
 6 | from collections import defaultdict
 7 | from math import log
 8 | 
 9 | log2 = lambda x: log(x) / log(2)
10 | 
11 | 
12 | class Item(object):
13 |     """
14 |     Describe an item
15 | 
16 |     @type id: string
17 |     @param id: the id of the element
18 | 
19 |     @type value: string
20 |     @param value: the value of the item
21 | 
22 |     @type coords: list
23 |     @param coords: a list representing the parameters that locates the item
24 |     """
25 | 
26 |     def __init__(self, id, coords, value=None):
27 |         self.id = id
28 |         self.coords = coords
29 |         self.value = value
30 | 
31 | 
32 | class Node(object):
33 |     """
34 |     A node object in a decision tree
35 | 
36 |     @type column: int
@param column: the column index of the criteria to be tested 38 | 39 | @type value: string 40 | @param value: the value that the column must match to get a true result 41 | 42 | @type results: dict 43 | @param results: stores a dictionary of results for this branch. This is None for everything 44 | except endpoints 45 | 46 | @type t_node: Node 47 | @param t_node: the next nodes in the tree if the result is true 48 | 49 | @type f_node: Node 50 | @param f_node: the next nodes in the tree if the result is false 51 | """ 52 | 53 | def __init__(self, col=-1, value=None, results=None, t_node=None, f_node=None): 54 | self.col = col 55 | self.value = value 56 | self.results = results 57 | self.t_node = t_node 58 | self.f_node = f_node 59 | 60 | def draw(self, indent=''): 61 | # Is this a leaf node? 62 | if self.results != None: 63 | print str(self.results) 64 | else: 65 | # Print the criteria 66 | print str(self.col) + ':' + str(self.value) + '? ' 67 | # Print the branches 68 | print indent + 'T->', 69 | self.t_node.draw(indent + ' ') 70 | print indent + 'F->', 71 | self.f_node.draw(indent + ' ') 72 | 73 | 74 | class DecisionTree(object): 75 | """ 76 | A decision tree object 77 | """ 78 | 79 | @staticmethod 80 | def count_results(data, item=True): 81 | """ 82 | count the occurrences of each result in the data set 83 | """ 84 | results_count = defaultdict(int) 85 | if item: 86 | for i in data: 87 | results_count[i.value] += 1 88 | else: 89 | for i in data: 90 | results_count[i[-1]] += 1 91 | return results_count 92 | 93 | @staticmethod 94 | def divide_data(data, column, value): 95 | """ 96 | Divides a set of rows on a specific column. 97 | """ 98 | # a function that decides if the row goes to the first or the second group (true or false) 99 | spliter = None 100 | if isinstance(value, int) or isinstance(value, float): 101 | spliter = lambda item: item.coords[column] >= value 102 | else: 103 | spliter = lambda item: item.coords[column] == value 104 | #divide the rows into two sets and return them 105 | set_true = [] 106 | set_false = [] 107 | for item in data: 108 | if spliter(item): 109 | set_true.append(item) 110 | else: 111 | set_false.append(item) 112 | return (set_true, set_false) 113 | 114 | @staticmethod 115 | def gini_impurity(data, item=True): 116 | """ 117 | Probability that a randomly placed item will be in the wrong category 118 | """ 119 | results_count = DecisionTree.count_results(data, item) 120 | len_data = len(data) 121 | imp = 0.0 122 | for k1, v1 in results_count.iteritems(): 123 | p1 = float(v1) / len_data 124 | for k2, v2 in results_count.iteritems(): 125 | if k1 == k2: continue 126 | p2 = float(v2) / len_data 127 | imp += p1 * p2 128 | return imp 129 | 130 | @staticmethod 131 | def entropy(data, item=True): 132 | """ 133 | estimate the disorder in the data set : sum of p(x)log(p(x)) 134 | """ 135 | results_count = DecisionTree.count_results(data, item) 136 | len_data = len(data) 137 | ent = 0.0 138 | for v in results_count.itervalues(): 139 | p = float(v) / len_data 140 | ent -= p * log2(p) 141 | return ent 142 | 143 | @staticmethod 144 | def variance(data): 145 | """ 146 | calculates the statistical variance for a set of rows 147 | more preferably to be used with numerical outcomes 148 | """ 149 | len_data = len(data) 150 | if len_data == 0: return 0 151 | score = [float(item.value) for item in data] 152 | mean = sum(score) / len(score) 153 | variance = sum([(s - mean) ** 2 for s in score]) / len(score) 154 | return variance 155 | 156 | 157 | @staticmethod 158 | def 
build_tree(data, disorder_function="entropy"):
159 |         """
160 |         a recursive function that builds the tree by choosing the best dividing criteria
161 |         disorder_function :
162 |             for data that contains words and booleans; it is recommended to use entropy or gini_impurity
163 |             for data that contains numbers; it is recommended to use variance
164 |         """
165 |         if disorder_function == "entropy":
166 |             disorder_estimator = DecisionTree.entropy
167 |         elif disorder_function == "gini_impurity":
168 |             disorder_estimator = DecisionTree.gini_impurity
169 |         elif disorder_function == "variance":
170 |             disorder_estimator = DecisionTree.variance
171 |         len_data = len(data)
172 |         if len_data == 0: return Node()
173 |         current_disorder_level = disorder_estimator(data)
174 |         # track enhancement of disorder's level
175 |         best_enhancement = 0.0
176 |         best_split = None
177 |         best_split_sets = None
178 |         # number columns
179 |         nbr_coords = len(data[0].coords)
180 |         for coord_ind in xrange(nbr_coords):
181 |             # get unique values of the current column
182 |             coord_values = {}
183 |             for item in data:
184 |                 coord_values[item.coords[coord_ind]] = 1
185 |             for coord_value in coord_values.iterkeys():
186 |                 set1, set2 = DecisionTree.divide_data(data, coord_ind, coord_value)
187 |                 p1 = float(len(set1)) / len_data
188 |                 p2 = (1 - p1)
189 |                 enhancement = current_disorder_level - (p1 * disorder_estimator(set1)) - (p2 * disorder_estimator(set2))
190 |                 if (enhancement > best_enhancement) and (len(set1) > 0 and len(set2) > 0):
191 |                     best_enhancement = enhancement
192 |                     best_split = (coord_ind, coord_value)
193 |                     best_split_sets = (set1, set2)
194 |         if best_enhancement > 0:
195 |             t_node = DecisionTree.build_tree(best_split_sets[0], disorder_function)
196 |             f_node = DecisionTree.build_tree(best_split_sets[1], disorder_function)
197 |             return Node(col=best_split[0], value=best_split[1],
198 |                         t_node=t_node, f_node=f_node)
199 |         else:
200 |             return Node(results=DecisionTree.count_results(data))
201 | 
202 |     @staticmethod
203 |     def prune(tree, min_enhancement, disorder_function="entropy"):
204 |         """
205 |         checking pairs of nodes that have a common parent to see if merging
206 |         them would increase the entropy by less than a specified threshold
207 |         """
208 |         if disorder_function == "entropy":
209 |             disorder_estimator = DecisionTree.entropy
210 |         elif disorder_function == "gini_impurity":
211 |             disorder_estimator = DecisionTree.gini_impurity
212 |         elif disorder_function == "variance":
213 |             disorder_estimator = DecisionTree.variance
214 |         if tree.t_node.results == None:
215 |             DecisionTree.prune(tree.t_node, min_enhancement, disorder_function)
216 |         if tree.f_node.results == None:
217 |             DecisionTree.prune(tree.f_node, min_enhancement, disorder_function)
218 |         # If both the subbranches are now leaves, see if they should be merged
219 |         if (tree.t_node.results != None and tree.f_node.results != None):
220 |             # Build a combined dataset
221 |             t_node, f_node = [], []
222 |             for key, value in tree.t_node.results.items():
223 |                 t_node += [[key]] * value
224 |             for key, value in tree.f_node.results.items():
225 |                 f_node += [[key]] * value
226 |             # Test the enhancement
227 |             delta = disorder_estimator(t_node + f_node, item=False) - (
228 |                 disorder_estimator(t_node, item=False) + disorder_estimator(f_node, item=False)) / 2
229 |             if delta < min_enhancement:
230 |                 # Merge the branches
231 |                 tree.t_node, tree.f_node = None, None
232 |                 tree.results = DecisionTree.count_results(t_node + f_node, item=False)
233 | 
234 |     @staticmethod
235 |     def classify(observation, tree):
236 |         """
237 |         Classify a new observation given a decision tree
238 | 
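        If the observation is missing the value tested at a node (None), both
        branches are followed and their results are combined, weighted by how
        much training data went down each branch.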
""" 239 | if tree.results != None: 240 | return tree.results 241 | # the observation value for the current criteria column 242 | observation_value = observation.coords[tree.col] 243 | if observation_value == None: 244 | t_results, f_results = DecisionTree.classify(observation, tree.t_node), DecisionTree.classify(observation, 245 | tree.f_node) 246 | t_count = sum(t_results.values()) 247 | f_count = sum(f_results.values()) 248 | t_prob = float(t_count) / (t_count + f_count) 249 | f_prob = float(f_count) / (t_count + f_count) 250 | result = {} 251 | for key, value in t_results.items(): result[key] = value * t_prob 252 | for key, value in f_results.items(): result[key] = value * f_prob 253 | return result 254 | else: 255 | #with branch to follow 256 | branch = None 257 | if (isinstance(observation_value, int) or isinstance(observation_value, float)): 258 | branch = tree.t_node if (observation_value >= tree.value) else tree.f_node 259 | else: 260 | branch = tree.t_node if (observation_value == tree.value) else tree.f_node 261 | return DecisionTree.classify(observation, branch) 262 | -------------------------------------------------------------------------------- /classification/knn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | 5 | class Item(object): 6 | """ 7 | Describe an item 8 | 9 | @type id: string 10 | @param id: the id of the elemet 11 | 12 | @type value: float 13 | @param value: the value of the item 14 | 15 | @type coords: list 16 | @param coords: a list representing the parameters that locates the item 17 | """ 18 | 19 | def __init__(self, id, coords, value=None): 20 | self.id = id 21 | self.coords = coords 22 | self.value = value 23 | 24 | 25 | class KNN(object): 26 | @staticmethod 27 | def euclidean(v1, v2): 28 | d = 0.0 29 | for i in range(len(v1)): 30 | d += (v1[i] - v2[i]) ** 2 31 | return math.sqrt(d) 32 | 33 | @staticmethod 34 | def all_distances(data, item): 35 | distancelist = [] 36 | for i in range(len(data)): 37 | item2 = data[i] 38 | distancelist.append((KNN.euclidean(item.coords, item2.coords), i)) 39 | distancelist.sort() 40 | return distancelist 41 | 42 | @staticmethod 43 | def inverse_weight(dist, num=1.0, const=0.1): 44 | return num / (dist + const) 45 | 46 | @staticmethod 47 | def subtract_weight(dist, const=1.0): 48 | if dist > const: 49 | return 0 50 | else: 51 | return const - dist 52 | 53 | @staticmethod 54 | def gaussian(dist, sigma=10.0): 55 | return math.e ** (-dist ** 2 / (2 * sigma ** 2)) 56 | 57 | @staticmethod 58 | def knn_estimate(data, item, k=3): 59 | # Get sorted distances 60 | all_dist = KNN.all_distances(data, item) 61 | avg = 0.0 62 | # Take the average of the top k results 63 | for i in range(k): 64 | idx = all_dist[i][1] 65 | avg += data[idx].value 66 | avg = avg / k 67 | return avg 68 | 69 | @staticmethod 70 | def weighted_knn(data, item, k=5, weight_f="gaussian"): 71 | if weight_f == "subtract_weight": 72 | weightf = KNN.subtract_weight 73 | elif weight_f == "inverse_weight": 74 | weightf = KNN.inverse_weight 75 | elif weight_f == "gaussian": 76 | weightf = KNN.gaussian 77 | # Get distances 78 | all_dist = KNN.all_distances(data, item) 79 | avg = 0.0 80 | totalweight = 0.0 81 | # Get weighted average 82 | for i in range(k): 83 | dist = all_dist[i][0] 84 | idx = all_dist[i][1] 85 | weight = weightf(dist) 86 | avg += weight * int(data[idx].value) 87 | totalweight += weight 88 | avg = avg / totalweight 89 | return avg 90 | 91 | @staticmethod 92 | def prob_guess(data, 
 93 |         if weightf is None: weightf = KNN.gaussian  # a staticmethod object isn't callable as a default argument
 94 |         all_dist = KNN.all_distances(data, item)
 95 |         nweight = tweight = 0.0
 96 |         for i in range(k):
 97 |             dist = all_dist[i][0]
 98 |             idx = all_dist[i][1]
 99 |             weight = weightf(dist)
100 |             v = data[idx].value
101 |             # Is this point in the range?
102 |             if v >= low and v <= high:
103 |                 nweight += weight
104 |             tweight += weight
105 |         if tweight == 0: return 0
106 |         # The probability is the weights in the range
107 |         # divided by all the weights
108 |         return nweight / tweight
109 | 
110 |     @staticmethod
111 |     def divide_data(data, test=0.05):
112 |         trainset = []
113 |         testset = []
114 |         for row in data:
115 |             if random.random() < test:
116 |                 testset.append(row)
117 |             else:
118 |                 trainset.append(row)
119 |         return trainset, testset
120 | 
121 |     @staticmethod
122 |     def test_algorithm(algf, trainset, testset):
123 |         error = 0.0
124 |         for item in testset:
125 |             guess = algf(trainset, item)
126 |             error += (item.value - guess) ** 2
127 |         return error / len(testset)
128 | 
129 |     @staticmethod
130 |     def cross_validate(algf, data, trials=100, test=0.05):
131 |         error = 0.0
132 |         for i in range(trials):
133 |             trainset, testset = KNN.divide_data(data, test)
134 |             error += KNN.test_algorithm(algf, trainset, testset)
135 |         return error / trials
136 | 
137 |     @staticmethod
138 |     def rescale(data, scale):
139 |         scaleddata = []
140 |         for item in data:
141 |             scaled_coords = [scale[i] * item.coords[i] for i in range(len(scale))]
142 |             scaled_item = Item(id=item.id, value=item.value, coords=scaled_coords)
143 |             scaleddata.append(scaled_item)
144 |         return scaleddata
145 | 
146 |     @staticmethod
147 |     def cost_function(algf, data):
148 |         """
149 |         this function should be used along with an optimization function to determine the best scale;
150 |         notably one could use the annealing algorithm or the genetic algorithm
151 |         """
152 | 
153 |         def costf(scale):
154 |             sdata = KNN.rescale(data, scale)
155 |             return KNN.cross_validate(algf, sdata, trials=10)
156 | 
157 |         return costf
158 | 
159 |     @staticmethod
160 |     def annealingoptimize(domain, costf, T=10000.0, cool=0.95, step=1):
161 |         # Initialize the values randomly
162 |         vec = [float(random.randint(domain[i][0], domain[i][1]))
163 |                for i in range(len(domain))]
164 | 
165 |         while T > 0.1:
166 |             # Choose one of the indices
167 |             i = random.randint(0, len(domain) - 1)
168 | 
169 |             # Choose a direction to change it
170 |             dir = random.randint(-step, step)
171 | 
172 |             # Create a new list with one of the values changed
173 |             vecb = vec[:]
174 |             vecb[i] += dir
175 |             if vecb[i] < domain[i][0]:
176 |                 vecb[i] = domain[i][0]
177 |             elif vecb[i] > domain[i][1]:
178 |                 vecb[i] = domain[i][1]
179 | 
180 |             # Calculate the current cost and the new cost
181 |             ea = costf(vec)
182 |             eb = costf(vecb)
183 |             p = pow(math.e, -(eb - ea) / T)
184 | 
185 |             # Is it better, or does it make the probability
186 |             # cutoff?
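            # (any improvement is always accepted; a worse solution is accepted
            # with probability exp(-(eb - ea) / T), which shrinks as T cools)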
187 |             if (eb < ea or random.random() < p):
188 |                 vec = vecb
189 | 
190 |             # Decrease the temperature
191 |             T = T * cool
192 |         return vec
193 | 
-------------------------------------------------------------------------------- /decorators.py: --------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------------
2 | # Name: decorators
3 | #
4 | # Author: mourad mourafiq
5 | # -------------------------------------------------------------------------------
6 | 
7 | from functools import update_wrapper
8 | import time
9 | 
10 | 
11 | def decorator(d):
12 |     "Make function d a decorator: d wraps a function fn."
13 | 
14 |     def _d(fn):
15 |         return update_wrapper(d(fn), fn)
16 | 
17 |     update_wrapper(_d, d)
18 |     return _d
19 | 
20 | 
21 | @decorator
22 | def trace(f):
23 |     """
24 |     helps debug recursive calls
25 |     """
26 |     indent = ' '
27 | 
28 |     def _f(*args):
29 |         signature = '%s(%s)' % (f.__name__, ', '.join(map(repr, args)))
30 |         print '%s--> %s' % (trace.level * indent, signature)
31 |         trace.level += 1
32 |         try:
33 |             result = f(*args)
34 |             print '%s<-- %s == %s' % ((trace.level - 1) * indent,
35 |                                       signature, result)
36 |         finally:
37 |             trace.level -= 1
38 |         return result
39 | 
40 |     trace.level = 0
41 |     return _f
42 | 
43 | 
44 | @decorator
45 | def timing(f):
46 |     """
47 |     calculates time of computation
48 |     """
49 |     def _f(*args):
50 |         t0 = time.clock()
51 |         result = f(*args)
52 |         t = time.clock()
53 |         # report the elapsed time rather than the two raw clock values
54 |         print '%s took %.6f s' % (f.__name__, t - t0)
55 |         return result
56 |     return _f
57 | 
58 | @decorator
59 | def memo(f):
60 |     """
61 |     a simple caching decorator
62 |     """
63 |     cache = {}
64 | 
65 |     def _f(*args):
66 |         try:
67 |             return cache[args]
68 |         except KeyError:
69 |             cache[args] = result = f(*args)
70 |             return result
71 |         except TypeError:
72 |             return f(*args)  # unhashable args cannot be cached
73 |     return _f
74 | 
75 | 
76 | @decorator
77 | def count_calls(f):
78 |     """
79 |     counts the number of calls to the function f
80 |     """
81 |     def _f(*args):
82 |         callcounts[_f] += 1
83 |         return f(*args)
84 |     callcounts[_f] = 0
85 |     return _f
86 | 
87 | callcounts = {}
88 | 
-------------------------------------------------------------------------------- /dijkstra.py: --------------------------------------------------------------------------------
1 | def dijkstra(graph, node):
2 |     """
3 |     Simulate the dijkstra algorithm in a graph
4 |     """
5 |     distance_to = {}
6 |     distance_to[node] = 0
7 |     distance_path = {}
8 |     while (distance_to):
9 |         # in case we have a disjoint graph
10 |         op_node = min_distance(distance_to)
11 |         distance_path[op_node] = distance_to[op_node]
12 |         del distance_to[op_node]
13 |         for x, x_len in graph[op_node].items():
14 |             if x not in distance_path:
15 |                 if x not in distance_to:
16 |                     distance_to[x] = distance_path[op_node] + x_len
17 |                 elif distance_to[x] > distance_path[op_node] + x_len:
18 |                     distance_to[x] = distance_path[op_node] + x_len
19 |     return distance_path
20 | 

# helper used by dijkstra above (also defined in graph_analysis.py in this repo)
def min_distance(distances):
    """return the node with the smallest tentative distance"""
    return min(distances, key=distances.get)
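
# a minimal usage sketch (hypothetical weighted graph, not part of the original module):
if __name__ == '__main__':
    graph = {
        'a': {'b': 1, 'c': 4},
        'b': {'a': 1, 'c': 2, 'd': 5},
        'c': {'a': 4, 'b': 2, 'd': 1},
        'd': {'b': 5, 'c': 1},
    }
    # expected shortest distances from 'a': {'a': 0, 'b': 1, 'c': 3, 'd': 4}
    print dijkstra(graph, 'a')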
-------------------------------------------------------------------------------- /filters/__init__.py: --------------------------------------------------------------------------------
1 | __author__ = 'mourad'
2 | 
-------------------------------------------------------------------------------- /filters/utils.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | 
5 | def plot_data(xs, ys, c, lw, label, linestyle, **kwargs):
6 |     if ys is not None:
7 |         plt.plot(xs, ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
8 |     else:
9 |         plt.plot(xs, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
10 | 
11 | 
12 | def plot_measurements(xs, ys=None, c='r', lw=2, label='Measurements', linestyle='--', **kwargs):
13 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
14 | 
15 | 
16 | def plot_predictions(xs, ys=None, c='b', lw=2, label='Predictions', linestyle=':', **kwargs):
17 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
18 | 
19 | 
20 | def plot_filter(xs, ys=None, c='g', lw=4, label='Filter', linestyle='-', **kwargs):
21 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
22 | 
23 | 
24 | def plot_track(xs, ys=None, c='k', lw=2, label='Track', linestyle='-', **kwargs):
25 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
26 | 
27 | 
28 | def generate_measurements(x_0, dx, num_measurements, noise, acceleration=0):
29 |     data = []
30 |     for i in xrange(num_measurements):
31 |         data.append(x_0 + dx * i + np.random.randn() * noise)
32 |         dx += acceleration
33 |     return data
34 | 
35 | 
36 | def g_h_filter(measurements, x_0, g, h, dx, dt=1.):
37 |     """
38 |     Performs g-h filter on 1 state variable with a fixed g and h.
39 |     :param measurements: sequence of noisy observations.
40 |     :param x_0: initial value.
41 |     :param g: g scale factor in g-h filter (weight given to the residual when updating the value).
42 |     :param h: h scale factor in g-h filter (weight given to the residual when updating the change rate).
43 |     :param dx: initial change rate.
44 |     :param dt: time step.
45 |     :return: (predictions, filtered_measurements) as numpy arrays.
46 |     """
47 |     x_i = x_0
48 |     predictions = []
49 |     filtered_measurements = []
50 |     for measurement in measurements:
51 |         # predict the value
52 |         x_prediction = x_i + dx * dt
53 |         predictions.append(x_prediction)
54 | 
55 |         # calculate the residual
56 |         residual = measurement - x_prediction
57 | 
58 |         # update the change rate
59 |         dx += h * residual / dt
60 |         # update the current estimate
61 |         x_i = x_prediction + g * residual
62 | 
63 |         filtered_measurements.append(x_i)
64 | 
65 |     return np.array(predictions), np.array(filtered_measurements)
66 | 
67 | 
68 | def plot_g_h_results(measurements, predictions, filtered_data, title='', z_label='Scale'):
69 |     plot_measurements(measurements, label=z_label)
70 |     plot_predictions(predictions)
71 |     plot_filter(filtered_data)
72 |     plt.legend(loc=4)
73 |     plt.title(title)
74 |     plt.gca().set_xlim(left=0, right=len(measurements))
75 |     plt.show()
76 | 
77 | 
78 | test = [
79 |     {'title': 'test', 'x_0': 160, 'dx': 1, 'num_x': 30, 'noise': 3},  # testing assumptions
80 |     {'title': 'bad initial', 'x_0': 5, 'x_0_guess': 30, 'dx': 1, 'num_x': 100, 'noise': 10},  # bad initial guess
81 |     {'title': 'extreme noise', 'x_0': 5, 'dx': 1, 'num_x': 100, 'noise': 100},  # extreme noise
82 |     {'title': 'acceleration', 'x_0': 10, 'dx': 0, 'num_x': 20, 'noise': 0, 'acceleration': 2, 'g': 0.2, 'h': 0.02},
83 |     # acceleration, shows the lag error or systemic error
84 | 
85 |     # varying g, greater g favors measurement instead of prediction
86 |     {'title': 'g = 0.1', 'x_0': 5, 'x_0_guess': 0, 'dx': 5, 'num_x': 100, 'noise': 50, 'g': 0.1},  # g 0.1
87 |     {'title': 'g = 0.5', 'x_0': 5, 'x_0_guess': 0, 'dx': 5, 'num_x': 100, 'noise': 50, 'g': 0.5},  # g 0.5
88 |     {'title': 'g = 0.9', 'x_0': 5, 'x_0_guess': 0, 'dx': 5, 'num_x': 100, 'noise': 50, 'g': 0.9},  # g 0.9
89 | 
90 |     # varying h, greater h makes the filter react rapidly to transient changes
91 |     {
92 |         'title': 'h = 0.05', 'x_0': 0, 'x_0_guess': 0, 'dx': 0, 'num_x': 50, 'noise': 50, 'h': 0.05,
93 |         'measurements': np.linspace(0, 1, 50)
94 |     },  # g 0.1
95 |     {
96 |         'title': 'h = 0.05, bad initial dx',
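        # same small h but a deliberately wrong initial dx (2, against a true
        # slope of ~0.02): a small h corrects a bad rate estimate only slowly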
'x_0': 0, 'x_0_guess': 0, 'dx': 2, 'num_x': 50, 'noise': 50, 'h': 0.05, 97 | 'measurements': np.linspace(0, 1, 50) 98 | }, # g 0.5 99 | { 100 | 'title': 'h = 0.5', 'x_0': 0, 'x_0_guess': 0, 'dx': 2, 'num_x': 50, 'noise': 50, 'h': 0.5, 101 | 'measurements': np.linspace(0, 1, 50) 102 | }, # g 0.9 103 | 104 | 105 | ] 106 | for t in test: 107 | g = t.get('g', 0.2) 108 | h = t.get('h', 0.01) 109 | x_0 = t.get('x_0_guess', t['x_0']) 110 | measurements = t.get('measurements') 111 | if measurements is None: 112 | measurements = generate_measurements(t['x_0'], t['dx'], t['num_x'], t['noise'], t.get('acceleration', 0)) 113 | plt.xlim([0, t['num_x']]) 114 | plot_track([0, t['num_x']], [measurements[0], measurements[t['num_x'] - 1]], label='Actual weight') 115 | xs = xrange(1, t['num_x']+1) 116 | line = np.poly1d(np.polyfit(xs, measurements, 1)) 117 | plot_data(xs, line(xs), label='least squares', c='y', lw=3, linestyle='-') 118 | predictions, filtered_measurements = g_h_filter(measurements=measurements, x_0=x_0, dx=t['dx'], 119 | g=g, h=h, dt=1.) 120 | plot_g_h_results(measurements, predictions, filtered_measurements, title=t['title']) 121 | 122 | measurements = [5, 6, 7, 8, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] 123 | 124 | predictions, filtered_measurements = g_h_filter(measurements=measurements, x_0=4., dx=1., dt=1., g=.302, h=0.054) 125 | plot_g_h_results(measurements, predictions, filtered_measurements, 'g = 0.302, h = 0.054') 126 | 127 | predictions, filtered_measurements = g_h_filter(measurements=measurements, x_0=4., dx=1., dt=1., g=.546, h=0.205) 128 | plot_g_h_results(measurements, predictions, filtered_measurements, 'g = 0.546, h = 0.205') -------------------------------------------------------------------------------- /frequency.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: shingling minhashing 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | 8 | from __future__ import division 9 | from itertools import combinations 10 | 11 | # exemple of baskets 12 | BASKETS = ( 13 | set(('cat', 'and', 'dog', 'bites')), 14 | set(('yahoo', 'news', 'claims', 'a', 'cat', 'mated', 'with', 'dog', 'and', 'produced', 'viable', 'offspring')), 15 | set(('cat', 'killer', 'is', 'a', 'free', 'big', 'dog')), 16 | set(('professional', 'free', 'advice', 'on', 'dog', 'training', 'puppy', 'training')), 17 | set(('cat', 'and', 'kitten', 'training', 'behavior')), 18 | set(('dog', 'cat', 'provides', 'training', 'in', 'eugene', 'oregon')), 19 | set(('dog', 'cat', 'is', 'slang', 'term', 'used', 'by', 'police', 'officers', 'for', 'malefemale', 'relationship')), 20 | set(('shop', 'for', 'your', 'show', 'dog', 'grooming', 'and', 'yet', 'pet', 'supplier')) 21 | ) 22 | 23 | 24 | def frequency(baskets, item): 25 | """ 26 | Frequency of item in baskets 27 | """ 28 | freq = 0 29 | for basket in baskets: 30 | if item <= basket: freq += 1 31 | return freq 32 | 33 | 34 | def frequent(frequency, support): 35 | """ 36 | If frequency of item is bigger than support then it is ferquent 37 | """ 38 | return True if frequency > support else False 39 | 40 | 41 | def confidence(baskets, item1, item2): 42 | """ 43 | Confidence of the rule Item1 -> item2 is the ratio freq2/freq1 44 | """ 45 | item = item1 | item2 46 | freq1 = frequency(baskets=baskets, item=item1) 47 
|     freq2 = frequency(baskets=baskets, item=item)
48 |     # confidence(item1 -> item2) = freq(item1 and item2) / freq(item1)
49 | 
50 |     return freq2 / freq1 if freq1 > 0 else 0
51 | 
52 | 
53 | def interest(baskets, item1, item2):
54 |     """
55 |     the interest of an association rule item1 -> item2 is the difference
56 |     between its confidence and the fraction of baskets that contain item2.
57 |     """
58 |     return confidence(baskets=baskets, item1=item1, item2=item2) - (
59 |         frequency(baskets=baskets, item=item2) / len(baskets))
60 | 
61 | 
62 | def frequent_items(baskets, support):
63 |     """
64 |     Determines which items are frequent
65 |     """
66 |     # items in baskets
67 |     items = set()
68 |     for basket in baskets:
69 |         items |= basket
70 | 
71 |     # first we determine which items are frequent
72 |     items_frequency = {}
73 |     for item in items:
74 |         freq = frequency(baskets, set([item]))
75 |         items_frequency[item] = (freq, frequent(freq, support))
76 |     return [(item, i_frequency) for item, (i_frequency, i_frequent) in items_frequency.items() if i_frequent]
77 | 
78 | 
79 | def frequent_pairs(baskets, support):
80 |     """
81 |     Determines which pairs are frequent
82 |     A-priori algorithm
83 |     """
84 |     singletons = [item for item, i_frequency in frequent_items(baskets, support)]
85 |     pairs_frequency = {}
86 |     for pair in combinations(singletons, 2):
87 |         if pair not in pairs_frequency:
88 |             freq = frequency(baskets, set(pair))
89 |             pairs_frequency[pair] = (freq, frequent(freq, support))
90 |     return [(pair, p_frequency) for pair, (p_frequency, p_frequent) in pairs_frequency.items() if p_frequent]
91 | 
92 | 
93 | def son_algo(baskets, support, fraction_baskets):
94 |     """
95 |     the algorithm of Savasere, Omiecinski and Navathe: divide the input into chunks
96 |     of fraction_baskets baskets, find the frequent items of each chunk for the
97 |     scaled-down support, then keep only candidates that are frequent over the whole input
98 |     """
99 |     candidates = set()
100 |     nbr_baskets = len(baskets)
101 |     for i in range(0, nbr_baskets, fraction_baskets):
102 |         candidates |= set(item for item, i_frequency in frequent_items(baskets[i:i + fraction_baskets], (fraction_baskets / nbr_baskets) * support))
103 |     return [item for item in candidates if frequent(frequency(baskets, set([item])), support)]
104 | 
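
# a minimal usage sketch on the example BASKETS above (the support threshold
# is an illustrative assumption):
if __name__ == '__main__':
    print frequent_items(BASKETS, support=3)
    print frequent_pairs(BASKETS, support=3)
    print confidence(BASKETS, set(['dog']), set(['cat']))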
47 | """ 48 | if node1 == node2: 49 | return [node1] 50 | explored = [] 51 | to_explore = [[node1]] 52 | while to_explore: 53 | path = to_explore.pop(0) 54 | s = path[-1] 55 | for successor in graph[s].keys(): 56 | if successor not in explored: 57 | explored.append(successor) 58 | path2 = path + [successor] 59 | if node2 == successor: 60 | return path2 61 | to_explore.append(path2) 62 | return [] 63 | 64 | 65 | def longest_path(graph, node=None): 66 | """ 67 | Returns the longest path in the graph if node is None 68 | I f node is not None, then it returns the longest path from node 69 | """ 70 | if node is not None: 71 | return max([shortest_path(graph, node, successor) for successor in graph.keys()], key=len) 72 | return max([shortest_path(graph, a, b) for a in graph.keys() for b in graph.keys()], key=len) 73 | 74 | 75 | def centrality(graph, node): 76 | """ 77 | Returns the centrality of node in graph 78 | """ 79 | return sum([len(shortest_path(graph, node, successor)) for successor in graph.keys()]) / len(graph.keys()) 80 | 81 | 82 | def indep_graphs(graph): 83 | """ 84 | Returns the independent graphs in the current graph 85 | """ 86 | graphs = [] 87 | 88 | def which_graph(node): 89 | for g in graphs: 90 | if node in g: return g 91 | return {} 92 | 93 | for node in graph.keys(): 94 | g = add_node(which_graph(node), graph, node) 95 | if g not in graphs: graphs.append(g) 96 | return graphs 97 | 98 | 99 | def graph_for_node(graph, node): 100 | """ 101 | Returns the independent graph containing node 102 | """ 103 | return add_node({}, graph, node) 104 | 105 | 106 | def check_pairwise_connectivity(graph, node1, node2): 107 | """ 108 | checks the connectivity between two nodes, 109 | and returns True if connected, otherwise False 110 | """ 111 | return True if node2 in graph_for_node(graph, node1) else False 112 | 113 | 114 | def clustering_coef(graph, node, verbose=False): 115 | """ 116 | calculates the clustering coef for a particular node in the graph 117 | let Dn = node degree 118 | Vn = number of links between neighbors of the node 119 | """ 120 | neighbors = graph[node].keys() 121 | Dn = len(neighbors) 122 | if Dn == 0: return Dn; 123 | Vn = 0 124 | for neighbor1 in neighbors: 125 | index1 = neighbors.index(neighbor1) 126 | for neighbor2 in neighbors: 127 | index2 = neighbors.index(neighbor2) 128 | if index1 < index2 and neighbor2 in graph[neighbor1]: Vn += 1 129 | coef = (2 * Vn) / (Dn * (Dn - 1)) 130 | if verbose: print '%s\'s degree : %s, links between neighbors : %s. 
Culestering coef : %s' % (node, Dn, Vn, coef) 131 | return coef 132 | 133 | 134 | def random_clustering_coef(graph, node, nbr_iterations=1000000): 135 | """ 136 | calculates the estimate clustering coef for a particular node in the graph 137 | """ 138 | vindex = {} 139 | d = 0 140 | for w in graph[node].keys(): 141 | vindex[d] = w 142 | d += 1 143 | 144 | total = 0 145 | for i in range(1, nbr_iterations): 146 | if d > 1: 147 | pick = random.randint(0, d - 1) 148 | v1 = vindex[pick] 149 | v2 = vindex[(pick + random.randint(1, d - 1)) % d] 150 | if v2 in graph[v1]: total += 1 151 | print i, (total + 0.0) / i 152 | 153 | 154 | def average_cluestering(graph, verbose=True): 155 | average = sum([clustering_coef(graph, node, verbose=verbose) for node in graph]) / len(graph) 156 | if verbose: print average 157 | return average 158 | 159 | 160 | def dijkstra(graph, node): 161 | """ 162 | Simulate the dijkstra algorithm in a graph 163 | """ 164 | distance_to = {} 165 | distance_to[node] = 0 166 | distance_path = {} 167 | while (distance_to): 168 | # in case we have a disjoint graph 169 | op_node = min_distance(distance_to) 170 | distance_path[op_node] = distance_to[op_node] 171 | del distance_to[op_node] 172 | for x, x_len in graph[op_node].items(): 173 | if x not in distance_path: 174 | if x not in distance_to: 175 | distance_to[x] = distance_path[op_node] + x_len 176 | elif distance_to[x] > distance_path[op_node] + x_len: 177 | distance_to[x] = distance_path[op_node] + x_len 178 | return distance_path 179 | 180 | 181 | def min_distance(distances): 182 | """ 183 | return the element with the min distance 184 | """ 185 | min = (-1, -1) 186 | for node, node_len in distances.items(): 187 | if min[1] > node_len or min[1] == -1: 188 | min = (node, node_len) 189 | return min[0] 190 | 191 | 192 | def dijkstra_heap(graph, node): 193 | """ 194 | Simulate the dijkstra algorithm in a graph 195 | """ 196 | track_distance = {} 197 | track_distance[node] = 0 198 | distance_to = [] 199 | heapq.heappush(distance_to, (0, node)) 200 | distance_path = {} 201 | while (distance_to): 202 | # in case we have a disjoint graph 203 | #op_node = min_distance(distance_to) 204 | #distance_path[op_node] = distance_to[op_node] 205 | #del distance_to[op_node] 206 | ind, op_node = heapq.heappop(distance_to) 207 | if op_node not in distance_path or ind < distance_path[op_node]: 208 | distance_path[op_node] = ind 209 | for x, x_len in graph[op_node].items(): 210 | if x not in distance_path: 211 | if x not in track_distance: 212 | track_distance[x] = distance_path[op_node] + x_len 213 | heapq.heappush(distance_to, (track_distance[x], x)) 214 | elif track_distance[x] > distance_path[op_node] + x_len: 215 | track_distance[x] = distance_path[op_node] + x_len 216 | heapq.heappush(distance_to, (track_distance[x], x)) 217 | return distance_path 218 | 219 | 220 | # heap functions 221 | 222 | def parent(i): return (i - 1) / 2 223 | 224 | 225 | def left_child(i): return 2 * i + 1 226 | 227 | 228 | def right_child(i): return 2 * i + 2 229 | 230 | 231 | def is_leaf(heap_list, i): return (left_child(i) >= len(heap_list)) and (right_child(i) >= len(heap_list)) 232 | 233 | 234 | def has_one_child(heap_list, i): return (left_child(i) < len(heap_list)) and (right_child(i) >= len(heap_list)) 235 | 236 | 237 | # Call this routine if the heap rooted at i satisfies the heap property 238 | # *except* perhaps i to its immediate children 239 | def down_heapify(heap_list, i): 240 | # If i is a leaf, heap property holds 241 | if is_leaf(heap_list, i): 242 | 
        return
243 |     # If i has one child...
244 |     if has_one_child(heap_list, i):
245 |         # check heap property
246 |         if heap_list[i] > heap_list[left_child(i)]:
247 |             # If it fails, swap, fixing i and its child (a leaf)
248 |             (heap_list[i], heap_list[left_child(i)]) = (heap_list[left_child(i)], heap_list[i])
249 |         return
250 |     # If i has two children...
251 |     # check heap property
252 |     if min(heap_list[left_child(i)], heap_list[right_child(i)]) >= heap_list[i]:
253 |         return
254 |     # If it fails, see which child is the smaller
255 |     # and swap i's value into that child
256 |     # Afterwards, recurse into that child, which might now violate the heap property
257 |     if heap_list[left_child(i)] < heap_list[right_child(i)]:
258 |         # Swap into left child
259 |         (heap_list[i], heap_list[left_child(i)]) = (heap_list[left_child(i)], heap_list[i])
260 |         down_heapify(heap_list, left_child(i))
261 |         return
262 |     else:
263 |         (heap_list[i], heap_list[right_child(i)]) = (heap_list[right_child(i)], heap_list[i])
264 |         down_heapify(heap_list, right_child(i))
265 |         return
266 | 
267 | 
268 | def build_heap(heap_list):
269 |     for i in range(len(heap_list) - 1, -1, -1):
270 |         down_heapify(heap_list, i)
271 |     return heap_list
272 | 
273 | 
274 | def remove_min_heap(heap_list):
275 |     last = heap_list.pop()
276 |     if heap_list: heap_list[0] = last; down_heapify(heap_list, 0)
277 |     return heap_list
278 | 
279 | 
280 | def sort_heap(heap_list):
281 |     sorted_heap = []
282 |     while len(heap_list) > 0:
283 |         sorted_heap.append(heap_list[0])  # the root is the current minimum
284 |         remove_min_heap(heap_list)
285 |     return sorted_heap
286 | 
287 | 
288 | class bipartite_characteristics(object):
289 |     """
290 |     Returns the characteristics of the bipartite graph based on:
291 |     the n number of first nodes
292 |     the m number of second nodes
293 |     """
294 | 
295 |     def __init__(self, graph, n_nodes, m_nodes):
296 |         self.graph = graph
297 |         self.n_nodes = n_nodes
298 |         self.m_nodes = m_nodes
299 | 
300 |     def max_edges(self): return self.m_nodes * self.n_nodes
301 | 
302 |     def min_edges(self): return (self.n_nodes + self.m_nodes) - 1
303 | 
304 |     def max_length(self): return min(2 * self.n_nodes, 2 * self.m_nodes, (self.n_nodes + self.m_nodes) - 1)
305 | 
306 |     def max_clustering_coef(self): return 0
307 | 
-------------------------------------------------------------------------------- /licence: --------------------------------------------------------------------------------
1 | Copyright (c) Mourad MOURAFIQ and individual contributors.
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 | 
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | 
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 | 
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /map_reduce/README.md: -------------------------------------------------------------------------------- 1 | map-reduce 2 | ========== 3 | 4 | Implementation of map reduce, and some examples. 5 | 6 | 7 | Contains : 8 | 9 | - Map Reduce class 10 | - Estimation of pi number 11 | - Calculation of frequency of Items from multiple files 12 | -------------------------------------------------------------------------------- /map_reduce/item_frequency.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: calculating items frequency 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | import multiprocessing 8 | import string 9 | 10 | from map_reduce import MapReduce 11 | 12 | 13 | def map_words(filename): 14 | """Read a file and return a sequence of (word, occurances) values. 15 | """ 16 | STOP_WORDS = set([ 17 | 'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in', 18 | 'is', 'it', 'of', 'or', 'py', 'rst', 'that', 'the', 'to', 'with', 19 | ]) 20 | TR = string.maketrans(string.punctuation, ' ' * len(string.punctuation)) 21 | 22 | print multiprocessing.current_process().name, 'processing ', filename 23 | output = [] 24 | 25 | with open(filename, 'rt') as f: 26 | for line in f: 27 | if line.lstrip().startswith('..'): # Skip rst comment lines 28 | continue 29 | line = line.translate(TR) # Strip punctuation 30 | for word in line.split(): 31 | word = word.lower() 32 | if word.isalpha() and word not in STOP_WORDS: 33 | output.append((word, 1)) 34 | return output 35 | 36 | 37 | def words_frequency(item): 38 | """Convert the partitioned data for a word to a 39 | tuple containing the word and the number of occurances. 
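    (This is the reduce step: `item` arrives from the partition phase as a
    (word, [1, 1, ...]) pair.)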
40 | """ 41 | word, occurances = item 42 | return (word, sum(occurances)) 43 | 44 | 45 | if __name__ == '__main__': 46 | import operator 47 | import glob 48 | 49 | input_files = glob.glob('./*.txt') # linux notation for directories 50 | 51 | mapper = MapReduce(map_words, words_frequency) 52 | word_counts = mapper(input_files) 53 | word_counts.sort(key=operator.itemgetter(1)) 54 | word_counts.reverse() 55 | 56 | print '\nTOP 20 Iems by frequency\n' 57 | top20 = word_counts[:20] 58 | longest = max(len(word) for word, count in top20) 59 | for word, count in top20: 60 | print '%-*s: %5s' % (longest + 1, word, count) 61 | -------------------------------------------------------------------------------- /map_reduce/map_reduce.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: map reduce class 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | import collections 9 | import itertools 10 | import multiprocessing 11 | 12 | 13 | class MapReduce(object): 14 | """ 15 | The map reduce object, should be initialized with: 16 | map_fn 17 | reduce_fn 18 | nbr_workers 19 | """ 20 | 21 | def __init__(self, map_fn, reduce_fn, num_workers=None): 22 | """ 23 | initiaize the mapreduce object 24 | map_fn : Function to map inputs to intermediate data, takes as 25 | input one arg and returns a tuple (key, value) 26 | reduce_fn : Function to reduce intermediate data to final result 27 | takes as arg keys as produced from the map, and the values associated with it 28 | """ 29 | self.map_fn = map_fn 30 | self.reduce_fn = reduce_fn 31 | self.pool = multiprocessing.Pool(num_workers) 32 | 33 | def partition(self, mapped_values): 34 | """ 35 | returns the mapped_values organised by their keys. (keys, associated values) 36 | """ 37 | organised_data = collections.defaultdict(list) 38 | for key, value in mapped_values: 39 | organised_data[key].append(value) 40 | return organised_data.items() 41 | 42 | def __call__(self, inputs=None, chunk_size=1): 43 | """ 44 | process the data through the map reduce functions. 45 | inputs : iterable 46 | chank_size : amount of data to hand to each worker 47 | """ 48 | mapped_data = self.pool.map(self.map_fn, inputs, chunksize=chunk_size) 49 | partioned_data = self.partition(itertools.chain(*mapped_data)) 50 | reduced_data = self.pool.map(self.reduce_fn, partioned_data) 51 | return reduced_data 52 | 53 | -------------------------------------------------------------------------------- /map_reduce/pi_estimation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: estimation of pi with map reduce 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | import multiprocessing 9 | import numpy 10 | import random 11 | from map_reduce import MapReduce 12 | 13 | NBR_POINTS = 1000000 14 | RADIUQ = numpy.sqrt(NBR_POINTS) 15 | NBR_WORKERS = 4 16 | NBR_PER_WORKER = NBR_POINTS / NBR_WORKERS 17 | 18 | 19 | def probability_calculation(item): 20 | """Read a file and return a sequence of (word, occurances) values. 
21 | """ 22 | 23 | print multiprocessing.current_process().name, 'calculating', item 24 | output = [] 25 | IN_CIRCLE = 0 26 | for i in range(int(NBR_PER_WORKER)): 27 | x = numpy.random.randint(0, RADIUQ) 28 | y = numpy.random.randint(0, RADIUQ) 29 | if (numpy.sqrt(x ** 2 + y ** 2) < RADIUQ): 30 | IN_CIRCLE += 1 31 | output.append(('pi', IN_CIRCLE)) 32 | return output 33 | 34 | 35 | def estimate_pi(item): 36 | """Convert the partitioned data for a word to a 37 | tuple containing the word and the number of occurances. 38 | """ 39 | key, occurances = item 40 | return (sum(occurances) / NBR_POINTS) * 4 41 | 42 | 43 | if __name__ == '__main__': 44 | mapper = MapReduce(probability_calculation, estimate_pi) 45 | pi = mapper([i for i in range(NBR_WORKERS)]) 46 | print pi 47 | -------------------------------------------------------------------------------- /movielens/u.item: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmourafiq/data-analysis/1df2ca020a554f1fdab7cc9e53115e249cc199ac/movielens/u.item -------------------------------------------------------------------------------- /movielens/u.user: -------------------------------------------------------------------------------- 1 | 1|24|M|technician|85711 2 | 2|53|F|other|94043 3 | 3|23|M|writer|32067 4 | 4|24|M|technician|43537 5 | 5|33|F|other|15213 6 | 6|42|M|executive|98101 7 | 7|57|M|administrator|91344 8 | 8|36|M|administrator|05201 9 | 9|29|M|student|01002 10 | 10|53|M|lawyer|90703 11 | 11|39|F|other|30329 12 | 12|28|F|other|06405 13 | 13|47|M|educator|29206 14 | 14|45|M|scientist|55106 15 | 15|49|F|educator|97301 16 | 16|21|M|entertainment|10309 17 | 17|30|M|programmer|06355 18 | 18|35|F|other|37212 19 | 19|40|M|librarian|02138 20 | 20|42|F|homemaker|95660 21 | 21|26|M|writer|30068 22 | 22|25|M|writer|40206 23 | 23|30|F|artist|48197 24 | 24|21|F|artist|94533 25 | 25|39|M|engineer|55107 26 | 26|49|M|engineer|21044 27 | 27|40|F|librarian|30030 28 | 28|32|M|writer|55369 29 | 29|41|M|programmer|94043 30 | 30|7|M|student|55436 31 | 31|24|M|artist|10003 32 | 32|28|F|student|78741 33 | 33|23|M|student|27510 34 | 34|38|F|administrator|42141 35 | 35|20|F|homemaker|42459 36 | 36|19|F|student|93117 37 | 37|23|M|student|55105 38 | 38|28|F|other|54467 39 | 39|41|M|entertainment|01040 40 | 40|38|M|scientist|27514 41 | 41|33|M|engineer|80525 42 | 42|30|M|administrator|17870 43 | 43|29|F|librarian|20854 44 | 44|26|M|technician|46260 45 | 45|29|M|programmer|50233 46 | 46|27|F|marketing|46538 47 | 47|53|M|marketing|07102 48 | 48|45|M|administrator|12550 49 | 49|23|F|student|76111 50 | 50|21|M|writer|52245 51 | 51|28|M|educator|16509 52 | 52|18|F|student|55105 53 | 53|26|M|programmer|55414 54 | 54|22|M|executive|66315 55 | 55|37|M|programmer|01331 56 | 56|25|M|librarian|46260 57 | 57|16|M|none|84010 58 | 58|27|M|programmer|52246 59 | 59|49|M|educator|08403 60 | 60|50|M|healthcare|06472 61 | 61|36|M|engineer|30040 62 | 62|27|F|administrator|97214 63 | 63|31|M|marketing|75240 64 | 64|32|M|educator|43202 65 | 65|51|F|educator|48118 66 | 66|23|M|student|80521 67 | 67|17|M|student|60402 68 | 68|19|M|student|22904 69 | 69|24|M|engineer|55337 70 | 70|27|M|engineer|60067 71 | 71|39|M|scientist|98034 72 | 72|48|F|administrator|73034 73 | 73|24|M|student|41850 74 | 74|39|M|scientist|T8H1N 75 | 75|24|M|entertainment|08816 76 | 76|20|M|student|02215 77 | 77|30|M|technician|29379 78 | 78|26|M|administrator|61801 79 | 79|39|F|administrator|03755 80 | 80|34|F|administrator|52241 81 | 81|21|M|student|21218 
82 | 82|50|M|programmer|22902 83 | 83|40|M|other|44133 84 | 84|32|M|executive|55369 85 | 85|51|M|educator|20003 86 | 86|26|M|administrator|46005 87 | 87|47|M|administrator|89503 88 | 88|49|F|librarian|11701 89 | 89|43|F|administrator|68106 90 | 90|60|M|educator|78155 91 | 91|55|M|marketing|01913 92 | 92|32|M|entertainment|80525 93 | 93|48|M|executive|23112 94 | 94|26|M|student|71457 95 | 95|31|M|administrator|10707 96 | 96|25|F|artist|75206 97 | 97|43|M|artist|98006 98 | 98|49|F|executive|90291 99 | 99|20|M|student|63129 100 | 100|36|M|executive|90254 101 | 101|15|M|student|05146 102 | 102|38|M|programmer|30220 103 | 103|26|M|student|55108 104 | 104|27|M|student|55108 105 | 105|24|M|engineer|94043 106 | 106|61|M|retired|55125 107 | 107|39|M|scientist|60466 108 | 108|44|M|educator|63130 109 | 109|29|M|other|55423 110 | 110|19|M|student|77840 111 | 111|57|M|engineer|90630 112 | 112|30|M|salesman|60613 113 | 113|47|M|executive|95032 114 | 114|27|M|programmer|75013 115 | 115|31|M|engineer|17110 116 | 116|40|M|healthcare|97232 117 | 117|20|M|student|16125 118 | 118|21|M|administrator|90210 119 | 119|32|M|programmer|67401 120 | 120|47|F|other|06260 121 | 121|54|M|librarian|99603 122 | 122|32|F|writer|22206 123 | 123|48|F|artist|20008 124 | 124|34|M|student|60615 125 | 125|30|M|lawyer|22202 126 | 126|28|F|lawyer|20015 127 | 127|33|M|none|73439 128 | 128|24|F|marketing|20009 129 | 129|36|F|marketing|07039 130 | 130|20|M|none|60115 131 | 131|59|F|administrator|15237 132 | 132|24|M|other|94612 133 | 133|53|M|engineer|78602 134 | 134|31|M|programmer|80236 135 | 135|23|M|student|38401 136 | 136|51|M|other|97365 137 | 137|50|M|educator|84408 138 | 138|46|M|doctor|53211 139 | 139|20|M|student|08904 140 | 140|30|F|student|32250 141 | 141|49|M|programmer|36117 142 | 142|13|M|other|48118 143 | 143|42|M|technician|08832 144 | 144|53|M|programmer|20910 145 | 145|31|M|entertainment|V3N4P 146 | 146|45|M|artist|83814 147 | 147|40|F|librarian|02143 148 | 148|33|M|engineer|97006 149 | 149|35|F|marketing|17325 150 | 150|20|F|artist|02139 151 | 151|38|F|administrator|48103 152 | 152|33|F|educator|68767 153 | 153|25|M|student|60641 154 | 154|25|M|student|53703 155 | 155|32|F|other|11217 156 | 156|25|M|educator|08360 157 | 157|57|M|engineer|70808 158 | 158|50|M|educator|27606 159 | 159|23|F|student|55346 160 | 160|27|M|programmer|66215 161 | 161|50|M|lawyer|55104 162 | 162|25|M|artist|15610 163 | 163|49|M|administrator|97212 164 | 164|47|M|healthcare|80123 165 | 165|20|F|other|53715 166 | 166|47|M|educator|55113 167 | 167|37|M|other|L9G2B 168 | 168|48|M|other|80127 169 | 169|52|F|other|53705 170 | 170|53|F|healthcare|30067 171 | 171|48|F|educator|78750 172 | 172|55|M|marketing|22207 173 | 173|56|M|other|22306 174 | 174|30|F|administrator|52302 175 | 175|26|F|scientist|21911 176 | 176|28|M|scientist|07030 177 | 177|20|M|programmer|19104 178 | 178|26|M|other|49512 179 | 179|15|M|entertainment|20755 180 | 180|22|F|administrator|60202 181 | 181|26|M|executive|21218 182 | 182|36|M|programmer|33884 183 | 183|33|M|scientist|27708 184 | 184|37|M|librarian|76013 185 | 185|53|F|librarian|97403 186 | 186|39|F|executive|00000 187 | 187|26|M|educator|16801 188 | 188|42|M|student|29440 189 | 189|32|M|artist|95014 190 | 190|30|M|administrator|95938 191 | 191|33|M|administrator|95161 192 | 192|42|M|educator|90840 193 | 193|29|M|student|49931 194 | 194|38|M|administrator|02154 195 | 195|42|M|scientist|93555 196 | 196|49|M|writer|55105 197 | 197|55|M|technician|75094 198 | 198|21|F|student|55414 199 | 199|30|M|writer|17604 200 | 
200|40|M|programmer|93402 201 | 201|27|M|writer|E2A4H 202 | 202|41|F|educator|60201 203 | 203|25|F|student|32301 204 | 204|52|F|librarian|10960 205 | 205|47|M|lawyer|06371 206 | 206|14|F|student|53115 207 | 207|39|M|marketing|92037 208 | 208|43|M|engineer|01720 209 | 209|33|F|educator|85710 210 | 210|39|M|engineer|03060 211 | 211|66|M|salesman|32605 212 | 212|49|F|educator|61401 213 | 213|33|M|executive|55345 214 | 214|26|F|librarian|11231 215 | 215|35|M|programmer|63033 216 | 216|22|M|engineer|02215 217 | 217|22|M|other|11727 218 | 218|37|M|administrator|06513 219 | 219|32|M|programmer|43212 220 | 220|30|M|librarian|78205 221 | 221|19|M|student|20685 222 | 222|29|M|programmer|27502 223 | 223|19|F|student|47906 224 | 224|31|F|educator|43512 225 | 225|51|F|administrator|58202 226 | 226|28|M|student|92103 227 | 227|46|M|executive|60659 228 | 228|21|F|student|22003 229 | 229|29|F|librarian|22903 230 | 230|28|F|student|14476 231 | 231|48|M|librarian|01080 232 | 232|45|M|scientist|99709 233 | 233|38|M|engineer|98682 234 | 234|60|M|retired|94702 235 | 235|37|M|educator|22973 236 | 236|44|F|writer|53214 237 | 237|49|M|administrator|63146 238 | 238|42|F|administrator|44124 239 | 239|39|M|artist|95628 240 | 240|23|F|educator|20784 241 | 241|26|F|student|20001 242 | 242|33|M|educator|31404 243 | 243|33|M|educator|60201 244 | 244|28|M|technician|80525 245 | 245|22|M|student|55109 246 | 246|19|M|student|28734 247 | 247|28|M|engineer|20770 248 | 248|25|M|student|37235 249 | 249|25|M|student|84103 250 | 250|29|M|executive|95110 251 | 251|28|M|doctor|85032 252 | 252|42|M|engineer|07733 253 | 253|26|F|librarian|22903 254 | 254|44|M|educator|42647 255 | 255|23|M|entertainment|07029 256 | 256|35|F|none|39042 257 | 257|17|M|student|77005 258 | 258|19|F|student|77801 259 | 259|21|M|student|48823 260 | 260|40|F|artist|89801 261 | 261|28|M|administrator|85202 262 | 262|19|F|student|78264 263 | 263|41|M|programmer|55346 264 | 264|36|F|writer|90064 265 | 265|26|M|executive|84601 266 | 266|62|F|administrator|78756 267 | 267|23|M|engineer|83716 268 | 268|24|M|engineer|19422 269 | 269|31|F|librarian|43201 270 | 270|18|F|student|63119 271 | 271|51|M|engineer|22932 272 | 272|33|M|scientist|53706 273 | 273|50|F|other|10016 274 | 274|20|F|student|55414 275 | 275|38|M|engineer|92064 276 | 276|21|M|student|95064 277 | 277|35|F|administrator|55406 278 | 278|37|F|librarian|30033 279 | 279|33|M|programmer|85251 280 | 280|30|F|librarian|22903 281 | 281|15|F|student|06059 282 | 282|22|M|administrator|20057 283 | 283|28|M|programmer|55305 284 | 284|40|M|executive|92629 285 | 285|25|M|programmer|53713 286 | 286|27|M|student|15217 287 | 287|21|M|salesman|31211 288 | 288|34|M|marketing|23226 289 | 289|11|M|none|94619 290 | 290|40|M|engineer|93550 291 | 291|19|M|student|44106 292 | 292|35|F|programmer|94703 293 | 293|24|M|writer|60804 294 | 294|34|M|technician|92110 295 | 295|31|M|educator|50325 296 | 296|43|F|administrator|16803 297 | 297|29|F|educator|98103 298 | 298|44|M|executive|01581 299 | 299|29|M|doctor|63108 300 | 300|26|F|programmer|55106 301 | 301|24|M|student|55439 302 | 302|42|M|educator|77904 303 | 303|19|M|student|14853 304 | 304|22|F|student|71701 305 | 305|23|M|programmer|94086 306 | 306|45|M|other|73132 307 | 307|25|M|student|55454 308 | 308|60|M|retired|95076 309 | 309|40|M|scientist|70802 310 | 310|37|M|educator|91711 311 | 311|32|M|technician|73071 312 | 312|48|M|other|02110 313 | 313|41|M|marketing|60035 314 | 314|20|F|student|08043 315 | 315|31|M|educator|18301 316 | 316|43|F|other|77009 317 | 
317|22|M|administrator|13210 318 | 318|65|M|retired|06518 319 | 319|38|M|programmer|22030 320 | 320|19|M|student|24060 321 | 321|49|F|educator|55413 322 | 322|20|M|student|50613 323 | 323|21|M|student|19149 324 | 324|21|F|student|02176 325 | 325|48|M|technician|02139 326 | 326|41|M|administrator|15235 327 | 327|22|M|student|11101 328 | 328|51|M|administrator|06779 329 | 329|48|M|educator|01720 330 | 330|35|F|educator|33884 331 | 331|33|M|entertainment|91344 332 | 332|20|M|student|40504 333 | 333|47|M|other|V0R2M 334 | 334|32|M|librarian|30002 335 | 335|45|M|executive|33775 336 | 336|23|M|salesman|42101 337 | 337|37|M|scientist|10522 338 | 338|39|F|librarian|59717 339 | 339|35|M|lawyer|37901 340 | 340|46|M|engineer|80123 341 | 341|17|F|student|44405 342 | 342|25|F|other|98006 343 | 343|43|M|engineer|30093 344 | 344|30|F|librarian|94117 345 | 345|28|F|librarian|94143 346 | 346|34|M|other|76059 347 | 347|18|M|student|90210 348 | 348|24|F|student|45660 349 | 349|68|M|retired|61455 350 | 350|32|M|student|97301 351 | 351|61|M|educator|49938 352 | 352|37|F|programmer|55105 353 | 353|25|M|scientist|28480 354 | 354|29|F|librarian|48197 355 | 355|25|M|student|60135 356 | 356|32|F|homemaker|92688 357 | 357|26|M|executive|98133 358 | 358|40|M|educator|10022 359 | 359|22|M|student|61801 360 | 360|51|M|other|98027 361 | 361|22|M|student|44074 362 | 362|35|F|homemaker|85233 363 | 363|20|M|student|87501 364 | 364|63|M|engineer|01810 365 | 365|29|M|lawyer|20009 366 | 366|20|F|student|50670 367 | 367|17|M|student|37411 368 | 368|18|M|student|92113 369 | 369|24|M|student|91335 370 | 370|52|M|writer|08534 371 | 371|36|M|engineer|99206 372 | 372|25|F|student|66046 373 | 373|24|F|other|55116 374 | 374|36|M|executive|78746 375 | 375|17|M|entertainment|37777 376 | 376|28|F|other|10010 377 | 377|22|M|student|18015 378 | 378|35|M|student|02859 379 | 379|44|M|programmer|98117 380 | 380|32|M|engineer|55117 381 | 381|33|M|artist|94608 382 | 382|45|M|engineer|01824 383 | 383|42|M|administrator|75204 384 | 384|52|M|programmer|45218 385 | 385|36|M|writer|10003 386 | 386|36|M|salesman|43221 387 | 387|33|M|entertainment|37412 388 | 388|31|M|other|36106 389 | 389|44|F|writer|83702 390 | 390|42|F|writer|85016 391 | 391|23|M|student|84604 392 | 392|52|M|writer|59801 393 | 393|19|M|student|83686 394 | 394|25|M|administrator|96819 395 | 395|43|M|other|44092 396 | 396|57|M|engineer|94551 397 | 397|17|M|student|27514 398 | 398|40|M|other|60008 399 | 399|25|M|other|92374 400 | 400|33|F|administrator|78213 401 | 401|46|F|healthcare|84107 402 | 402|30|M|engineer|95129 403 | 403|37|M|other|06811 404 | 404|29|F|programmer|55108 405 | 405|22|F|healthcare|10019 406 | 406|52|M|educator|93109 407 | 407|29|M|engineer|03261 408 | 408|23|M|student|61755 409 | 409|48|M|administrator|98225 410 | 410|30|F|artist|94025 411 | 411|34|M|educator|44691 412 | 412|25|M|educator|15222 413 | 413|55|M|educator|78212 414 | 414|24|M|programmer|38115 415 | 415|39|M|educator|85711 416 | 416|20|F|student|92626 417 | 417|27|F|other|48103 418 | 418|55|F|none|21206 419 | 419|37|M|lawyer|43215 420 | 420|53|M|educator|02140 421 | 421|38|F|programmer|55105 422 | 422|26|M|entertainment|94533 423 | 423|64|M|other|91606 424 | 424|36|F|marketing|55422 425 | 425|19|M|student|58644 426 | 426|55|M|educator|01602 427 | 427|51|M|doctor|85258 428 | 428|28|M|student|55414 429 | 429|27|M|student|29205 430 | 430|38|M|scientist|98199 431 | 431|24|M|marketing|92629 432 | 432|22|M|entertainment|50311 433 | 433|27|M|artist|11211 434 | 434|16|F|student|49705 435 | 
435|24|M|engineer|60007 436 | 436|30|F|administrator|17345 437 | 437|27|F|other|20009 438 | 438|51|F|administrator|43204 439 | 439|23|F|administrator|20817 440 | 440|30|M|other|48076 441 | 441|50|M|technician|55013 442 | 442|22|M|student|85282 443 | 443|35|M|salesman|33308 444 | 444|51|F|lawyer|53202 445 | 445|21|M|writer|92653 446 | 446|57|M|educator|60201 447 | 447|30|M|administrator|55113 448 | 448|23|M|entertainment|10021 449 | 449|23|M|librarian|55021 450 | 450|35|F|educator|11758 451 | 451|16|M|student|48446 452 | 452|35|M|administrator|28018 453 | 453|18|M|student|06333 454 | 454|57|M|other|97330 455 | 455|48|M|administrator|83709 456 | 456|24|M|technician|31820 457 | 457|33|F|salesman|30011 458 | 458|47|M|technician|Y1A6B 459 | 459|22|M|student|29201 460 | 460|44|F|other|60630 461 | 461|15|M|student|98102 462 | 462|19|F|student|02918 463 | 463|48|F|healthcare|75218 464 | 464|60|M|writer|94583 465 | 465|32|M|other|05001 466 | 466|22|M|student|90804 467 | 467|29|M|engineer|91201 468 | 468|28|M|engineer|02341 469 | 469|60|M|educator|78628 470 | 470|24|M|programmer|10021 471 | 471|10|M|student|77459 472 | 472|24|M|student|87544 473 | 473|29|M|student|94708 474 | 474|51|M|executive|93711 475 | 475|30|M|programmer|75230 476 | 476|28|M|student|60440 477 | 477|23|F|student|02125 478 | 478|29|M|other|10019 479 | 479|30|M|educator|55409 480 | 480|57|M|retired|98257 481 | 481|73|M|retired|37771 482 | 482|18|F|student|40256 483 | 483|29|M|scientist|43212 484 | 484|27|M|student|21208 485 | 485|44|F|educator|95821 486 | 486|39|M|educator|93101 487 | 487|22|M|engineer|92121 488 | 488|48|M|technician|21012 489 | 489|55|M|other|45218 490 | 490|29|F|artist|V5A2B 491 | 491|43|F|writer|53711 492 | 492|57|M|educator|94618 493 | 493|22|M|engineer|60090 494 | 494|38|F|administrator|49428 495 | 495|29|M|engineer|03052 496 | 496|21|F|student|55414 497 | 497|20|M|student|50112 498 | 498|26|M|writer|55408 499 | 499|42|M|programmer|75006 500 | 500|28|M|administrator|94305 501 | 501|22|M|student|10025 502 | 502|22|M|student|23092 503 | 503|50|F|writer|27514 504 | 504|40|F|writer|92115 505 | 505|27|F|other|20657 506 | 506|46|M|programmer|03869 507 | 507|18|F|writer|28450 508 | 508|27|M|marketing|19382 509 | 509|23|M|administrator|10011 510 | 510|34|M|other|98038 511 | 511|22|M|student|21250 512 | 512|29|M|other|20090 513 | 513|43|M|administrator|26241 514 | 514|27|M|programmer|20707 515 | 515|53|M|marketing|49508 516 | 516|53|F|librarian|10021 517 | 517|24|M|student|55454 518 | 518|49|F|writer|99709 519 | 519|22|M|other|55320 520 | 520|62|M|healthcare|12603 521 | 521|19|M|student|02146 522 | 522|36|M|engineer|55443 523 | 523|50|F|administrator|04102 524 | 524|56|M|educator|02159 525 | 525|27|F|administrator|19711 526 | 526|30|M|marketing|97124 527 | 527|33|M|librarian|12180 528 | 528|18|M|student|55104 529 | 529|47|F|administrator|44224 530 | 530|29|M|engineer|94040 531 | 531|30|F|salesman|97408 532 | 532|20|M|student|92705 533 | 533|43|M|librarian|02324 534 | 534|20|M|student|05464 535 | 535|45|F|educator|80302 536 | 536|38|M|engineer|30078 537 | 537|36|M|engineer|22902 538 | 538|31|M|scientist|21010 539 | 539|53|F|administrator|80303 540 | 540|28|M|engineer|91201 541 | 541|19|F|student|84302 542 | 542|21|M|student|60515 543 | 543|33|M|scientist|95123 544 | 544|44|F|other|29464 545 | 545|27|M|technician|08052 546 | 546|36|M|executive|22911 547 | 547|50|M|educator|14534 548 | 548|51|M|writer|95468 549 | 549|42|M|scientist|45680 550 | 550|16|F|student|95453 551 | 551|25|M|programmer|55414 552 | 
552|45|M|other|68147 553 | 553|58|M|educator|62901 554 | 554|32|M|scientist|62901 555 | 555|29|F|educator|23227 556 | 556|35|F|educator|30606 557 | 557|30|F|writer|11217 558 | 558|56|F|writer|63132 559 | 559|69|M|executive|10022 560 | 560|32|M|student|10003 561 | 561|23|M|engineer|60005 562 | 562|54|F|administrator|20879 563 | 563|39|F|librarian|32707 564 | 564|65|M|retired|94591 565 | 565|40|M|student|55422 566 | 566|20|M|student|14627 567 | 567|24|M|entertainment|10003 568 | 568|39|M|educator|01915 569 | 569|34|M|educator|91903 570 | 570|26|M|educator|14627 571 | 571|34|M|artist|01945 572 | 572|51|M|educator|20003 573 | 573|68|M|retired|48911 574 | 574|56|M|educator|53188 575 | 575|33|M|marketing|46032 576 | 576|48|M|executive|98281 577 | 577|36|F|student|77845 578 | 578|31|M|administrator|M7A1A 579 | 579|32|M|educator|48103 580 | 580|16|M|student|17961 581 | 581|37|M|other|94131 582 | 582|17|M|student|93003 583 | 583|44|M|engineer|29631 584 | 584|25|M|student|27511 585 | 585|69|M|librarian|98501 586 | 586|20|M|student|79508 587 | 587|26|M|other|14216 588 | 588|18|F|student|93063 589 | 589|21|M|lawyer|90034 590 | 590|50|M|educator|82435 591 | 591|57|F|librarian|92093 592 | 592|18|M|student|97520 593 | 593|31|F|educator|68767 594 | 594|46|M|educator|M4J2K 595 | 595|25|M|programmer|31909 596 | 596|20|M|artist|77073 597 | 597|23|M|other|84116 598 | 598|40|F|marketing|43085 599 | 599|22|F|student|R3T5K 600 | 600|34|M|programmer|02320 601 | 601|19|F|artist|99687 602 | 602|47|F|other|34656 603 | 603|21|M|programmer|47905 604 | 604|39|M|educator|11787 605 | 605|33|M|engineer|33716 606 | 606|28|M|programmer|63044 607 | 607|49|F|healthcare|02154 608 | 608|22|M|other|10003 609 | 609|13|F|student|55106 610 | 610|22|M|student|21227 611 | 611|46|M|librarian|77008 612 | 612|36|M|educator|79070 613 | 613|37|F|marketing|29678 614 | 614|54|M|educator|80227 615 | 615|38|M|educator|27705 616 | 616|55|M|scientist|50613 617 | 617|27|F|writer|11201 618 | 618|15|F|student|44212 619 | 619|17|M|student|44134 620 | 620|18|F|writer|81648 621 | 621|17|M|student|60402 622 | 622|25|M|programmer|14850 623 | 623|50|F|educator|60187 624 | 624|19|M|student|30067 625 | 625|27|M|programmer|20723 626 | 626|23|M|scientist|19807 627 | 627|24|M|engineer|08034 628 | 628|13|M|none|94306 629 | 629|46|F|other|44224 630 | 630|26|F|healthcare|55408 631 | 631|18|F|student|38866 632 | 632|18|M|student|55454 633 | 633|35|M|programmer|55414 634 | 634|39|M|engineer|T8H1N 635 | 635|22|M|other|23237 636 | 636|47|M|educator|48043 637 | 637|30|M|other|74101 638 | 638|45|M|engineer|01940 639 | 639|42|F|librarian|12065 640 | 640|20|M|student|61801 641 | 641|24|M|student|60626 642 | 642|18|F|student|95521 643 | 643|39|M|scientist|55122 644 | 644|51|M|retired|63645 645 | 645|27|M|programmer|53211 646 | 646|17|F|student|51250 647 | 647|40|M|educator|45810 648 | 648|43|M|engineer|91351 649 | 649|20|M|student|39762 650 | 650|42|M|engineer|83814 651 | 651|65|M|retired|02903 652 | 652|35|M|other|22911 653 | 653|31|M|executive|55105 654 | 654|27|F|student|78739 655 | 655|50|F|healthcare|60657 656 | 656|48|M|educator|10314 657 | 657|26|F|none|78704 658 | 658|33|M|programmer|92626 659 | 659|31|M|educator|54248 660 | 660|26|M|student|77380 661 | 661|28|M|programmer|98121 662 | 662|55|M|librarian|19102 663 | 663|26|M|other|19341 664 | 664|30|M|engineer|94115 665 | 665|25|M|administrator|55412 666 | 666|44|M|administrator|61820 667 | 667|35|M|librarian|01970 668 | 668|29|F|writer|10016 669 | 669|37|M|other|20009 670 | 670|30|M|technician|21114 671 | 
671|21|M|programmer|91919 672 | 672|54|F|administrator|90095 673 | 673|51|M|educator|22906 674 | 674|13|F|student|55337 675 | 675|34|M|other|28814 676 | 676|30|M|programmer|32712 677 | 677|20|M|other|99835 678 | 678|50|M|educator|61462 679 | 679|20|F|student|54302 680 | 680|33|M|lawyer|90405 681 | 681|44|F|marketing|97208 682 | 682|23|M|programmer|55128 683 | 683|42|M|librarian|23509 684 | 684|28|M|student|55414 685 | 685|32|F|librarian|55409 686 | 686|32|M|educator|26506 687 | 687|31|F|healthcare|27713 688 | 688|37|F|administrator|60476 689 | 689|25|M|other|45439 690 | 690|35|M|salesman|63304 691 | 691|34|M|educator|60089 692 | 692|34|M|engineer|18053 693 | 693|43|F|healthcare|85210 694 | 694|60|M|programmer|06365 695 | 695|26|M|writer|38115 696 | 696|55|M|other|94920 697 | 697|25|M|other|77042 698 | 698|28|F|programmer|06906 699 | 699|44|M|other|96754 700 | 700|17|M|student|76309 701 | 701|51|F|librarian|56321 702 | 702|37|M|other|89104 703 | 703|26|M|educator|49512 704 | 704|51|F|librarian|91105 705 | 705|21|F|student|54494 706 | 706|23|M|student|55454 707 | 707|56|F|librarian|19146 708 | 708|26|F|homemaker|96349 709 | 709|21|M|other|N4T1A 710 | 710|19|M|student|92020 711 | 711|22|F|student|15203 712 | 712|22|F|student|54901 713 | 713|42|F|other|07204 714 | 714|26|M|engineer|55343 715 | 715|21|M|technician|91206 716 | 716|36|F|administrator|44265 717 | 717|24|M|technician|84105 718 | 718|42|M|technician|64118 719 | 719|37|F|other|V0R2H 720 | 720|49|F|administrator|16506 721 | 721|24|F|entertainment|11238 722 | 722|50|F|homemaker|17331 723 | 723|26|M|executive|94403 724 | 724|31|M|executive|40243 725 | 725|21|M|student|91711 726 | 726|25|F|administrator|80538 727 | 727|25|M|student|78741 728 | 728|58|M|executive|94306 729 | 729|19|M|student|56567 730 | 730|31|F|scientist|32114 731 | 731|41|F|educator|70403 732 | 732|28|F|other|98405 733 | 733|44|F|other|60630 734 | 734|25|F|other|63108 735 | 735|29|F|healthcare|85719 736 | 736|48|F|writer|94618 737 | 737|30|M|programmer|98072 738 | 738|35|M|technician|95403 739 | 739|35|M|technician|73162 740 | 740|25|F|educator|22206 741 | 741|25|M|writer|63108 742 | 742|35|M|student|29210 743 | 743|31|M|programmer|92660 744 | 744|35|M|marketing|47024 745 | 745|42|M|writer|55113 746 | 746|25|M|engineer|19047 747 | 747|19|M|other|93612 748 | 748|28|M|administrator|94720 749 | 749|33|M|other|80919 750 | 750|28|M|administrator|32303 751 | 751|24|F|other|90034 752 | 752|60|M|retired|21201 753 | 753|56|M|salesman|91206 754 | 754|59|F|librarian|62901 755 | 755|44|F|educator|97007 756 | 756|30|F|none|90247 757 | 757|26|M|student|55104 758 | 758|27|M|student|53706 759 | 759|20|F|student|68503 760 | 760|35|F|other|14211 761 | 761|17|M|student|97302 762 | 762|32|M|administrator|95050 763 | 763|27|M|scientist|02113 764 | 764|27|F|educator|62903 765 | 765|31|M|student|33066 766 | 766|42|M|other|10960 767 | 767|70|M|engineer|00000 768 | 768|29|M|administrator|12866 769 | 769|39|M|executive|06927 770 | 770|28|M|student|14216 771 | 771|26|M|student|15232 772 | 772|50|M|writer|27105 773 | 773|20|M|student|55414 774 | 774|30|M|student|80027 775 | 775|46|M|executive|90036 776 | 776|30|M|librarian|51157 777 | 777|63|M|programmer|01810 778 | 778|34|M|student|01960 779 | 779|31|M|student|K7L5J 780 | 780|49|M|programmer|94560 781 | 781|20|M|student|48825 782 | 782|21|F|artist|33205 783 | 783|30|M|marketing|77081 784 | 784|47|M|administrator|91040 785 | 785|32|M|engineer|23322 786 | 786|36|F|engineer|01754 787 | 787|18|F|student|98620 788 | 788|51|M|administrator|05779 789 | 
789|29|M|other|55420 790 | 790|27|M|technician|80913 791 | 791|31|M|educator|20064 792 | 792|40|M|programmer|12205 793 | 793|22|M|student|85281 794 | 794|32|M|educator|57197 795 | 795|30|M|programmer|08610 796 | 796|32|F|writer|33755 797 | 797|44|F|other|62522 798 | 798|40|F|writer|64131 799 | 799|49|F|administrator|19716 800 | 800|25|M|programmer|55337 801 | 801|22|M|writer|92154 802 | 802|35|M|administrator|34105 803 | 803|70|M|administrator|78212 804 | 804|39|M|educator|61820 805 | 805|27|F|other|20009 806 | 806|27|M|marketing|11217 807 | 807|41|F|healthcare|93555 808 | 808|45|M|salesman|90016 809 | 809|50|F|marketing|30803 810 | 810|55|F|other|80526 811 | 811|40|F|educator|73013 812 | 812|22|M|technician|76234 813 | 813|14|F|student|02136 814 | 814|30|M|other|12345 815 | 815|32|M|other|28806 816 | 816|34|M|other|20755 817 | 817|19|M|student|60152 818 | 818|28|M|librarian|27514 819 | 819|59|M|administrator|40205 820 | 820|22|M|student|37725 821 | 821|37|M|engineer|77845 822 | 822|29|F|librarian|53144 823 | 823|27|M|artist|50322 824 | 824|31|M|other|15017 825 | 825|44|M|engineer|05452 826 | 826|28|M|artist|77048 827 | 827|23|F|engineer|80228 828 | 828|28|M|librarian|85282 829 | 829|48|M|writer|80209 830 | 830|46|M|programmer|53066 831 | 831|21|M|other|33765 832 | 832|24|M|technician|77042 833 | 833|34|M|writer|90019 834 | 834|26|M|other|64153 835 | 835|44|F|executive|11577 836 | 836|44|M|artist|10018 837 | 837|36|F|artist|55409 838 | 838|23|M|student|01375 839 | 839|38|F|entertainment|90814 840 | 840|39|M|artist|55406 841 | 841|45|M|doctor|47401 842 | 842|40|M|writer|93055 843 | 843|35|M|librarian|44212 844 | 844|22|M|engineer|95662 845 | 845|64|M|doctor|97405 846 | 846|27|M|lawyer|47130 847 | 847|29|M|student|55417 848 | 848|46|M|engineer|02146 849 | 849|15|F|student|25652 850 | 850|34|M|technician|78390 851 | 851|18|M|other|29646 852 | 852|46|M|administrator|94086 853 | 853|49|M|writer|40515 854 | 854|29|F|student|55408 855 | 855|53|M|librarian|04988 856 | 856|43|F|marketing|97215 857 | 857|35|F|administrator|V1G4L 858 | 858|63|M|educator|09645 859 | 859|18|F|other|06492 860 | 860|70|F|retired|48322 861 | 861|38|F|student|14085 862 | 862|25|M|executive|13820 863 | 863|17|M|student|60089 864 | 864|27|M|programmer|63021 865 | 865|25|M|artist|11231 866 | 866|45|M|other|60302 867 | 867|24|M|scientist|92507 868 | 868|21|M|programmer|55303 869 | 869|30|M|student|10025 870 | 870|22|M|student|65203 871 | 871|31|M|executive|44648 872 | 872|19|F|student|74078 873 | 873|48|F|administrator|33763 874 | 874|36|M|scientist|37076 875 | 875|24|F|student|35802 876 | 876|41|M|other|20902 877 | 877|30|M|other|77504 878 | 878|50|F|educator|98027 879 | 879|33|F|administrator|55337 880 | 880|13|M|student|83702 881 | 881|39|M|marketing|43017 882 | 882|35|M|engineer|40503 883 | 883|49|M|librarian|50266 884 | 884|44|M|engineer|55337 885 | 885|30|F|other|95316 886 | 886|20|M|student|61820 887 | 887|14|F|student|27249 888 | 888|41|M|scientist|17036 889 | 889|24|M|technician|78704 890 | 890|32|M|student|97301 891 | 891|51|F|administrator|03062 892 | 892|36|M|other|45243 893 | 893|25|M|student|95823 894 | 894|47|M|educator|74075 895 | 895|31|F|librarian|32301 896 | 896|28|M|writer|91505 897 | 897|30|M|other|33484 898 | 898|23|M|homemaker|61755 899 | 899|32|M|other|55116 900 | 900|60|M|retired|18505 901 | 901|38|M|executive|L1V3W 902 | 902|45|F|artist|97203 903 | 903|28|M|educator|20850 904 | 904|17|F|student|61073 905 | 905|27|M|other|30350 906 | 906|45|M|librarian|70124 907 | 907|25|F|other|80526 908 | 
908|44|F|librarian|68504
909|50|F|educator|53171
910|28|M|healthcare|29301
911|37|F|writer|53210
912|51|M|other|06512
913|27|M|student|76201
914|44|F|other|08105
915|50|M|entertainment|60614
916|27|M|engineer|N2L5N
917|22|F|student|20006
918|40|M|scientist|70116
919|25|M|other|14216
920|30|F|artist|90008
921|20|F|student|98801
922|29|F|administrator|21114
923|21|M|student|E2E3R
924|29|M|other|11753
925|18|F|salesman|49036
926|49|M|entertainment|01701
927|23|M|programmer|55428
928|21|M|student|55408
929|44|M|scientist|53711
930|28|F|scientist|07310
931|60|M|educator|33556
932|58|M|educator|06437
933|28|M|student|48105
934|61|M|engineer|22902
935|42|M|doctor|66221
936|24|M|other|32789
937|48|M|educator|98072
938|38|F|technician|55038
939|26|F|student|33319
940|32|M|administrator|02215
941|20|M|student|97229
942|48|F|librarian|78209
943|22|M|student|77841
--------------------------------------------------------------------------------
/page_rank/README.md:
--------------------------------------------------------------------------------
page-rank
=========

A very simple version/implementation of the page rank algorithm.

functions:

- Page rank
- Advanced version of page rank, topic sensitive
- spam farms
- spam mass
- trust rank
- Hyperlink induced topic search
- Map reduce to efficiently calculate the page rank
- Jaccard similarity to be found in data analysis repo


Implementations using plain lists, and matrices from the **numpy** library.


Calculation workflow :

1. Parse web pages for links
2. Parse links
3. Compute page rank (iterate until convergence)
4. Sort by page rank
5. Create index
--------------------------------------------------------------------------------
/page_rank/page_rank.py:
--------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
# Name: simple implementation of page rank
#
# Author: mourad mourafiq
# -------------------------------------------------------------------------------

from __future__ import division
import multiprocessing
import numpy
import random
from map_reduce import MapReduce
from jaccard_similarity import jaccard_sim

# example of a set of pages belonging to the same topic (the simple topic sensitive page rank version)
S = set(('2', '4'))
Es = [0, 1, 0, 1]


def page_rank(matrix, taxation=False, b=1, Es=[], S=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the page rank for each element based on the matrix in input.
    we should validate that the matrix is stochastic;
    if not, we use the taxation method to avoid dead ends (introducing the random surfers)
    v' = bMv + (1-b)e/n
    v : eigenvector
    The term (1-b)e/n is a vector each of whose components has value (1-b)/n and
    represents the introduction, with probability 1 - b, of a new random surfer at
    a random page.
    The mathematical formulation for the iteration that yields topic-sensitive
    PageRank is similar to the equation we used for general PageRank. The only
    difference is how we add the new surfers.
    Suppose S is a set of integers consisting
    of the row/column numbers for the pages we have identified as belonging to a
    certain topic (called the teleport set). Let eS be a vector that has 1 in the
    components in S and 0 in other components. Then the topic-sensitive PageRank
    for S is the limit of the iteration
    v' = bMv + (1 - b)eS/|S|
    Here, as usual, M is the transition matrix of the Web, and |S| is the size of
    set S.
    """
    elements_length = len(matrix[0])
    eigenvectors = [1 / elements_length] * elements_length
    if Es and taxation:
        taxation_v = [((1 - b) / len(S) * e) for e in Es]
    else:
        taxation_v = [(1 - b) / elements_length] * elements_length if taxation else [0] * elements_length

    eigenvectors_p = [0] * elements_length
    itr = 0
    # initializing map reduce
    mapper = MapReduce(page_rank_calculation, page_rank_vector)
    while eigenvectors_p != eigenvectors and itr < nbr_iterations:
        if eigenvectors_p != [0] * elements_length: eigenvectors = list(eigenvectors_p)
        for k, v in mapper([(i, eigenvectors, matrix, taxation_v, b) for i in range(elements_length)]):
            eigenvectors_p[k] = v
        itr += 1
    if verbose: print eigenvectors
    return eigenvectors


def page_rank_vector(item):
    """Reduce step: sum the partial contributions for one component of the
    eigenvector and return it as a (component, value) tuple.
    """
    key, occurances = item
    return (key, sum(occurances))


def page_rank_calculation(itemi):
    """Map step: compute one component of v' = bMv + taxation_v and return
    it as a sequence holding a single (component, value) pair.
    """
    item, eigenvectors, matrix, taxation_v, b = itemi
    elements_length = len(matrix)
    # print multiprocessing.current_process().name, 'calculating', item
    output = []
    vector_p = 0
    for j in range(elements_length):
        vector_p += eigenvectors[j] * matrix[item][j] * b
    vector_p += taxation_v[item]
    output.append((item, vector_p))
    return output


def matrix_vector_multiplication(matrix, vector, length, b, taxation_v):
    """
    calculate the multiplication of matrix by vector
    """
    vector_p = [0] * length
    for i in range(length):
        for j in range(length):
            vector_p[i] += vector[j] * matrix[i][j] * b
        vector_p[i] += taxation_v[i]
    return vector_p
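# Minimal sketch (added; not part of the original module) of one taxation
# iteration v' = bMv + (1-b)e/n done by hand, assuming the same 4-page
# transition matrix as the tests below. With b = 0.8, every component of
# the taxation vector is (1 - 0.8)/4 = 0.05.
def _demo_taxation_iteration():
    M = [[0, 1 / 2, 0, 0],
         [1 / 3, 0, 0, 1 / 2],
         [1 / 3, 0, 1, 1 / 2],
         [1 / 3, 1 / 2, 0, 0]]
    n = len(M)
    v = [1 / n] * n  # start from the uniform vector
    taxation_v = [(1 - 0.8) / n] * n  # (1-b)e/n with b = 0.8
    # one step of v' = bMv + (1-b)e/n
    print matrix_vector_multiplication(M, v, n, 0.8, taxation_v)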
def construct_web(n, b, nbr_iterations=100000, verbose=False):
    """
    Web consists of a clique (set of nodes with all possible arcs from one to another)
    of n nodes and a single additional node that is the successor of each of the n nodes
    in the clique. Determine the PageRank of each page, as a function of n
    and b.
    """
    all_nodes = 1 / (n + 1)
    all_nodes_p = 0
    last_node = 1 / (n + 1)
    last_node_p = 0
    itr = 0
    while (all_nodes != all_nodes_p or last_node != last_node_p) and itr < nbr_iterations:
        if all_nodes_p != 0: all_nodes = all_nodes_p
        if last_node_p != 0: last_node = last_node_p
        all_nodes_p = b * all_nodes * ((n - 1) / n) + (1 - b) / (1 + n)
        last_node_p = b * last_node + (1 - b) / (1 + n)
        itr += 1
    if verbose:
        print all_nodes
        print last_node
    return all_nodes


# example of sets of keywords, to be used for the advanced page rank
Sk = tuple((tuple(('0', '6', '7')), tuple(('1', '3', '4', '8')), tuple(('2', '5', '9', '10'))))


def page_rank_advanced(matrix, b=1, P=set(), S=set(), nbr_iterations=100000, verbose=False):
    """
    calculation of the topic sensitive page rank.
    S is the set of sets of topics
    P is the set of topic keywords for each page
    the algorithm we shall implement is the following:

    => calculate the jaccard similarity for P and Si
    => classify the page for a topic
    => construct Es, such that Es is the set of corresponding teleport vectors
       for each set of topics (see the sketch after this function)
    """
    elements_length = len(matrix[0])
    topics_length = len(S)
    Es = []
    # calculate the jaccard similarity for each page and topic set
    for s in S:
        Esp = [0] * elements_length
        for p in range(elements_length):
            Esp[p] = jaccard_sim(P[p], s)
        Es.append(Esp)
        print s
        print Esp
    # calculate the page rank for each topic
    for i in range(topics_length):
        page_rank(matrix, taxation=True, b=b, Es=Es[i], S=S[i], nbr_iterations=10000000, verbose=True)
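# Hedged sketch (added): how one teleport vector is built for a single topic
# set, mirroring the loop in page_rank_advanced above. The page keyword
# tuples P used here are made-up toy values, not data from the repo.
def _demo_teleport_vector():
    s = Sk[0]  # keywords for the first topic
    P = (('1', '2', '3'), ('0', '6', '7'), ('2', '5'), ('0', '9'))
    Esp = [jaccard_sim(p, s) for p in P]  # one jaccard weight per page
    print Esp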
def spam_farm(Pa, Ps, Pn, b, verbose=False):
    """
    The spam farm consists of the spammer's own pages "target page", organized in a special
    way, and some links from the accessible pages to the
    spammer's pages. Without some links from the outside, the spam farm would
    be useless, since it would not even be crawled by a typical search engine.
    Concerning the accessible pages, it might seem surprising that one can affect
    a page without owning it. However, today there are many sites, such as
    blogs or newspapers, that invite others to post their comments on the site. In
    order to get as much PageRank flowing to his own pages from outside, the
    spammer posts many comments.
    In the spam farm, there is one page, the target page, at which the spammer
    attempts to place as much PageRank as possible. There is a large number
    Ps of supporting pages, that accumulate the portion of the PageRank that is
    distributed equally to all pages.
    Pa : the number of accessible pages
    Ps : the number of supporting pages
    Pn : the total number of pages in the web
    => we are looking for PR_t : which is the page rank of the target page
    - the page rank of each supporting page is :
        b*PR_t/Ps + (1-b)/Pn
    Since the page rank of the target page comes from 3 sources:
        1. Pa from outside accessible pages
        2. b times the page rank of each of the Ps supporting pages:
            b*((b*PR_t)/Ps + (1-b)/Pn)
        3. (1-b)/Pn, the share of the fraction (1-b) of the page rank that
           belongs to PR_t (small, but kept in the formula below)

    => from (1), (2) & (3) :
        PR_t = Pa + (b*Ps)*((b*PR_t)/Ps + (1-b)/Pn) + (1-b)/Pn
        PR_t = Pa/(1-b**2) + (b/(1+b))*(Ps/Pn) + 1/(Pn*(1+b))
        PR_t = Pa*x + y*(Ps/Pn) + 1/(Pn*(1+b))
        where x = 1/(1-b**2) & y = b/(1+b)
    """
    x = 1 / (1 - b ** 2)
    y = b / (1 + b)
    PR_t = Pa * x + y * (Ps / Pn) + 1 / (Pn * (1 + b))
    if verbose:
        print 'Amplification of the external page rank contribution by %4.2f' % x
        print 'amount of PageRank that is %4.2f of the fraction Ps/n in the spam farm.' % y
        print 'page rank of target page %4.2f' % PR_t
    return PR_t


def trust_rank(matrix, b=0.8, Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    TrustRank based on some teleport set of trustworthy pages.
    Computed the same way as a topic sensitive page rank. The only difference is that the
    teleport surfers are restricted to trustworthy pages.
    Tp : trusted pages.
    Ts : trustworthy vector
    """
    return page_rank(matrix=matrix, taxation=True, b=b, Es=Ts, S=Tp, nbr_iterations=nbr_iterations, verbose=verbose)


def spam_mass(matrix, taxation=False, b=1, Es=[], S=set(), Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the spam mass of a page : (Pr - Tr) / Pr
    """
    pr = page_rank(matrix=matrix, taxation=True, b=b, Es=Es, S=S, nbr_iterations=nbr_iterations, verbose=verbose)
    tr = trust_rank(matrix=matrix, b=b, Ts=Ts, Tp=Tp, nbr_iterations=nbr_iterations, verbose=verbose)
    elements_length = len(matrix[0])
    sm = [0] * elements_length
    for i in range(elements_length):
        sm[i] = (pr[i] - tr[i]) / pr[i]
    if verbose: print sm
    return sm


def hits(L, lam, mu, nbr_iterations=100000000, verbose=True):
    """
    Hyperlink induced topic search.
    Computation of hubbiness and authority:
    Authority : page's quality that tells you best about a topic
    Hubbiness : page's quality that tells you best about other pages and how to find them
    Authority of a page is the sum of its predecessors' hubbiness
    Hubbiness of a page is the sum of its successors' authority
    L[i][j] = 1 if page_i links to page_j, otherwise 0
    """
    elements_length = len(L[0])
    L_t = transpose(L, elements_length)
    H = [1] * elements_length
    H_s = [0] * elements_length
    A = [0] * elements_length
    T = [0] * elements_length
    itr = 0
    while H != H_s and itr < nbr_iterations:
        if H_s != [0] * elements_length: H = H_s
        A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
        m = max(A)
        for i in range(elements_length):
            A[i] /= m
        H_s = matrix_vector_multiplication(L, A, elements_length, 1, T)
        m = max(H_s)
        for i in range(elements_length):
            H_s[i] /= m
        itr += 1
    A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
    m = max(A)
    for i in range(elements_length):
        A[i] /= m
    if verbose:
        print H
        print A


def transpose(matrix, elements_length, verbose=False):
    matrix_t = []
    for i in range(elements_length):
        t = [0] * elements_length
        for j in range(elements_length):
            t[j] = matrix[j][i]
        matrix_t.append(t)
    if verbose:
        print matrix
        print matrix_t
    return matrix_t
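# Quick numeric illustration (added): for Pa=10, Ps=30, Pn=500 and b=0.85,
# the amplification factor is x = 1/(1 - b**2) ~ 3.60, so the target page
# ends up with roughly 3.6 times the externally contributed PageRank.
def _demo_spam_farm_amplification():
    b = 0.85
    print 1 / (1 - b ** 2)  # ~3.60
    print spam_farm(Pa=10, Ps=30, Pn=500, b=b)  # dominated by Pa*x, ~36


def 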
test_construct():
    matrix = []
    matrix.append([0, 1 / 4, 1 / 4, 1 / 4, 0])
    matrix.append([1 / 4, 0, 1 / 4, 1 / 4, 0])
    matrix.append([1 / 4, 1 / 4, 0, 1 / 4, 0])
    matrix.append([1 / 4, 1 / 4, 1 / 4, 0, 0])
    matrix.append([1 / 4, 1 / 4, 1 / 4, 1 / 4, 0])
    page_rank(matrix, taxation=True, b=0.8, verbose=True)
    construct_web(4, 0.8, verbose=True)


def test_page_rank():
    matrix = []
    matrix.append([0, 1 / 2, 0, 0])
    matrix.append([1 / 3, 0, 0, 1 / 2])
    matrix.append([1 / 3, 0, 1, 1 / 2])
    matrix.append([1 / 3, 1 / 2, 0, 0])
    page_rank(matrix, taxation=True, b=0.85, verbose=True)
    page_rank(matrix, taxation=True, b=0.85, Es=Es, S=S, verbose=True)


def test_page_rank_advanced():
    matrix = []
    matrix.append([0, 1 / 2, 0, 0])
    matrix.append([1 / 3, 0, 0, 1 / 2])
    matrix.append([1 / 3, 0, 1, 1 / 2])
    matrix.append([1 / 3, 1 / 2, 0, 0])
    P = tuple((tuple(('1', '2', '3', '4')), tuple(('0', '6', '7', '8')), tuple(('2', '5', '9', '10')),
               tuple(('2', '5', '9', '10', '0'))))
    page_rank_advanced(matrix, b=0.85, P=P, S=Sk, nbr_iterations=100000, verbose=False)


def test_spam_farm():
    spam_farm(Pa=10, Ps=30, Pn=500, b=0.855, verbose=True)


def test_spam_mass():
    matrix = []
    matrix.append([0, 1 / 2, 0, 0])
    matrix.append([1 / 3, 0, 0, 1 / 2])
    matrix.append([1 / 3, 0, 1, 1 / 2])
    matrix.append([1 / 3, 1 / 2, 0, 0])
    spam_mass(matrix, taxation=True, b=0.85, Ts=Es, Tp=S, verbose=True)


def test_hits():
    matrix = []
    matrix.append([0, 1, 1, 1, 0])
    matrix.append([1, 0, 0, 1, 0])
    matrix.append([0, 0, 0, 0, 1])
    matrix.append([0, 1, 1, 0, 0])
    matrix.append([0, 0, 0, 0, 0])
    hits(matrix, 0, 0)


if __name__ == '__main__':
    test_construct()
    test_page_rank()
    test_page_rank_advanced()
    test_spam_farm()
    test_spam_mass()

--------------------------------------------------------------------------------
/page_rank/page_rank_numpy.py:
--------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
# Name: simple implementation of page rank
#
# Author: mourad mourafiq
# -------------------------------------------------------------------------------

from __future__ import division
from data_analysis import jaccard_sim
from numpy import *

# example of a set of pages belonging to the same topic (the simple topic sensitive page rank version)
S = set(('2', '4'))
Es = matrix([[0], [1], [0], [1]])
def page_rank(matrix, taxation=False, b=1, Es=[], S=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the page rank for each element based on the matrix in input.
    we should validate that the matrix is stochastic;
    if not, we use the taxation method to avoid dead ends (introducing the random surfers)
    v' = bMv + (1-b)e/n
    v : eigenvector
    The term (1-b)e/n is a vector each of whose components has value (1-b)/n and
    represents the introduction, with probability 1 - b, of a new random surfer at
    a random page.
    The mathematical formulation for the iteration that yields topic-sensitive
    PageRank is similar to the equation we used for general PageRank. The only
    difference is how we add the new surfers. Suppose S is a set of integers consisting
    of the row/column numbers for the pages we have identified as belonging to a
    certain topic (called the teleport set). Let eS be a vector that has 1 in the
    components in S and 0 in other components. Then the topic-sensitive PageRank
    for S is the limit of the iteration
    v' = bMv + (1 - b)eS/|S|
    Here, as usual, M is the transition matrix of the Web, and |S| is the size of
    set S.
    """
    elements_length = len(matrix)
    eigenvectors = (1 / elements_length) * mat(ones((elements_length, 1)))
    if S and taxation:
        taxation_v = (1 - b) / len(S) * Es
    else:
        taxation_v = (1 - b) / elements_length * mat(ones((elements_length, 1))) if taxation else mat(
            ones((elements_length, 1))) * 0

    eigenvectors_p = mat(ones((elements_length, 1))) * 0
    itr = 0
    while (eigenvectors_p != eigenvectors).any() and itr < nbr_iterations:
        if (eigenvectors_p != (mat(ones((elements_length, 1))) * 0)).any(): eigenvectors = eigenvectors_p
        eigenvectors_p = matrix_vector_multiplication(matrix, eigenvectors, elements_length, b, taxation_v)
        itr += 1
    if verbose: print eigenvectors
    return eigenvectors


def matrix_vector_multiplication(matrix, vector, length, b, taxation_v):
    """
    calculate the multiplication of matrix by vector
    """
    return b * matrix * vector + taxation_v


def construct_web(n, b, nbr_iterations=100000, verbose=False):
    """
    Web consists of a clique (set of nodes with all possible arcs from one to another)
    of n nodes and a single additional node that is the successor of each of the n nodes
    in the clique. Determine the PageRank of each page, as a function of n
    and b.
    """
    all_nodes = 1 / (n + 1)
    all_nodes_p = 0
    last_node = 1 / (n + 1)
    last_node_p = 0
    itr = 0
    while (all_nodes != all_nodes_p or last_node != last_node_p) and itr < nbr_iterations:
        if all_nodes_p != 0: all_nodes = all_nodes_p
        if last_node_p != 0: last_node = last_node_p
        all_nodes_p = b * all_nodes * ((n - 1) / n) + (1 - b) / (1 + n)
        last_node_p = b * last_node + (1 - b) / (1 + n)
        itr += 1
    if verbose:
        print all_nodes
        print last_node
    return all_nodes

# example of sets of keywords, to be used for the advanced page rank
Sk = tuple((tuple(('0', '6', '7')), tuple(('1', '3', '4', '8')), tuple(('2', '5', '9', '10'))))
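# Sketch (added for illustration): with numpy the whole update v' = bMv + t
# collapses into a single expression; one step by hand:
def _demo_numpy_step():
    M = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5],
                [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    v = (1 / 4) * mat(ones((4, 1)))  # uniform start vector
    t = ((1 - 0.8) / 4) * mat(ones((4, 1)))  # taxation vector, b = 0.8
    print 0.8 * M * v + t  # one iteration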
def page_rank_advanced(matrix, b=1, P=set(), S=set(), nbr_iterations=100000, verbose=False):
    """
    calculation of the topic sensitive page rank.
    S is the set of sets of topics
    P is the set of topic keywords for each page
    the algorithm we shall implement is the following:

    => calculate the jaccard similarity for P and Si
    => classify the page for a topic
    => construct Es, such that Es is the set of corresponding teleport vectors
       for each set of topics
    """
    elements_length = len(matrix)
    i = 0
    for s in S:
        Esp = [0] * elements_length
        # calculate the jaccard similarity for each page and topic set
        for p in range(elements_length):
            Esp[p] = jaccard_sim(P[p], s)
        Esp = mat(Esp)
        print s
        print Esp
        # calculate the page rank for each topic
        page_rank(matrix, taxation=True, b=b, Es=Esp.getT(), S=S[i], nbr_iterations=10000000, verbose=True)
        i += 1


def spam_farm(Pa, Ps, Pn, b, verbose=False):
    """
    The spam farm consists of the spammer's own pages "target page", organized in a special
    way, and some links from the accessible pages to the
    spammer's pages. Without some links from the outside, the spam farm would
    be useless, since it would not even be crawled by a typical search engine.
    Concerning the accessible pages, it might seem surprising that one can affect
    a page without owning it. However, today there are many sites, such as
    blogs or newspapers, that invite others to post their comments on the site. In
    order to get as much PageRank flowing to his own pages from outside, the
    spammer posts many comments.
    In the spam farm, there is one page, the target page, at which the spammer
    attempts to place as much PageRank as possible. There is a large number
    Ps of supporting pages, that accumulate the portion of the PageRank that is
    distributed equally to all pages.
    Pa : the number of accessible pages
    Ps : the number of supporting pages
    Pn : the total number of pages in the web
    => we are looking for PR_t : which is the page rank of the target page
    - the page rank of each supporting page is :
        b*PR_t/Ps + (1-b)/Pn
    Since the page rank of the target page comes from 3 sources:
        1. Pa from outside accessible pages
        2. b times the page rank of each of the Ps supporting pages:
            b*((b*PR_t)/Ps + (1-b)/Pn)
        3. (1-b)/Pn, the share of the fraction (1-b) of the page rank that
           belongs to PR_t (small, but kept in the formula below)

    => from (1), (2) & (3) :
        PR_t = Pa + (b*Ps)*((b*PR_t)/Ps + (1-b)/Pn) + (1-b)/Pn
        PR_t = Pa/(1-b**2) + (b/(1+b))*(Ps/Pn) + 1/(Pn*(1+b))
        PR_t = Pa*x + y*(Ps/Pn) + 1/(Pn*(1+b))
        where x = 1/(1-b**2) & y = b/(1+b)
    """
    x = 1 / (1 - b ** 2)
    y = b / (1 + b)
    PR_t = Pa * x + y * (Ps / Pn) + 1 / (Pn * (1 + b))
    if verbose:
        print 'Amplification of the external page rank contribution by %4.2f' % x
        print 'amount of PageRank that is %4.2f of the fraction Ps/n in the spam farm.' % y
        print 'page rank of target page %4.2f' % PR_t
    return PR_t
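# Worked detail (added): with the module-level S = {'2', '4'} and b = 0.85,
# the taxation term (1-b)Es/|S| puts weight 0.15/2 = 0.075 on each page of
# the teleport set and 0 elsewhere.
def _demo_teleport_weights(b=0.85):
    print (1 - b) / len(S) * Es  # matrix([[0.], [0.075], [0.], [0.075]])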
def trust_rank(matrix, b=0.8, Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    TrustRank based on some teleport set of trustworthy pages.
    Computed the same way as a topic sensitive page rank. The only difference is that the
    teleport surfers are restricted to trustworthy pages.
    Tp : trusted pages.
    Ts : trustworthy vector
    """
    return page_rank(matrix=matrix, taxation=True, b=b, Es=Ts, S=Tp, nbr_iterations=nbr_iterations, verbose=verbose)


def spam_mass(matrix, taxation=False, b=1, Es=[], S=set(), Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the spam mass of a page : (Pr - Tr) / Pr
    """
    pr = page_rank(matrix=matrix, taxation=True, b=b, Es=Es, S=S, nbr_iterations=nbr_iterations, verbose=verbose)
    tr = trust_rank(matrix=matrix, b=b, Ts=Ts, Tp=Tp, nbr_iterations=nbr_iterations, verbose=verbose)
    sm = (pr - tr) / pr
    if verbose: print sm
    return sm


def hits(L, lam, mu, nbr_iterations=100000000, verbose=True):
    """
    Hyperlink induced topic search.
    Computation of hubbiness and authority:
    Authority : page's quality that tells you best about a topic
    Hubbiness : page's quality that tells you best about other pages and how to find them
    Authority of a page is the sum of its predecessors' hubbiness
    Hubbiness of a page is the sum of its successors' authority
    L[i][j] = 1 if page_i links to page_j, otherwise 0
    """
    elements_length = len(L)
    L_t = L.getT()
    H = mat(ones((elements_length, 1)))
    H_s = 0 * H
    A = 0 * H
    T = 0 * H
    itr = 0
    while (H != H_s).any() and itr < nbr_iterations:
        if (H_s != 0 * mat(ones((elements_length, 1)))).any(): H = H_s
        A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
        m = A.max()
        A = A / m
        H_s = matrix_vector_multiplication(L, A, elements_length, 1, T)
        m = H_s.max()
        H_s = H_s / m
        itr += 1
    A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
    m = A.max()
    A = A / m
    if verbose:
        print H
        print A


def test_construct():
    construct_web(4, 0.8, verbose=True)


def test_page_rank():
    m = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5], [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    page_rank(m, taxation=True, b=0.85, verbose=True)
    page_rank(m, taxation=True, b=0.85, Es=Es, S=S, verbose=True)


def test_page_rank_advanced():
    m = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5], [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    P = tuple((tuple(('1', '2', '3', '4')), tuple(('0', '6', '7', '8')), tuple(('2', '5', '9', '10')),
               tuple(('2', '5', '9', '10', '0'))))
    page_rank_advanced(m, b=0.85, P=P, S=Sk, nbr_iterations=100000, verbose=True)


def test_spam_farm():
    spam_farm(Pa=10, Ps=30, Pn=500, b=0.85, verbose=True)


def test_spam_mass():
    m = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5], [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    return spam_mass(m, taxation=True, b=0.8, Ts=Es, Tp=S, verbose=True)


if __name__ == '__main__':
    test_construct()
    test_page_rank()
    test_page_rank_advanced()
    test_spam_farm()
    test_spam_mass()

--------------------------------------------------------------------------------
/quora/datacenter_c.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Jan 01, 2013

@author: Mourad Mourafiq

About: This is an attempt to solve the Quora challenge Datacenter Cooling.
"""

GO_ROOM = 0
NOGO_ROOM = 1
ENTRY_ROOM = 2
EXIT_ROOM = 3


class Room(object):
    """
    Room

    @type _x: int
    @param _x: x coordinate

    @type _y: int
    @param _y: y coordinate

    @type _type : int
    @param _type: the type of the room (0 1 2 3)

    @type _visited : boolean
    @param _visited: track whether the room was visited

    @type _neighbours: list
    @param _neighbours: the neighbouring rooms
    """

    def __init__(self, type):
        self._type = type
        self._neighbours = []
        self._visited = False

    def add(self, neighbour):
        self._neighbours.append(neighbour)

    def init(self):
        self._neighbours = []

    def visit(self):
        self._visited = True

    def unvisit(self):
        self._visited = False

    def is_visited(self):
        return True if self._visited else False

    def is_exit(self):
        return True if self._type == EXIT_ROOM else False

    def is_entry(self):
        return True if self._type == ENTRY_ROOM else False

    def is_nogo(self):
        return True if self._type == NOGO_ROOM else False


class Cooling(object):
    """
    backtracking solution to datacenter cooling

    @type _rooms: dict
    @param _rooms: dictionary of the rooms of the datacenter

    @type _entry: tuple
    @param _entry: entry room coordinate

    @type _nbr_rooms: int
    @param _nbr_rooms: number of visitable rooms in our datacenter (NOGO rooms excluded)

    @type _nbr_rooms_visited: int
    @param _nbr_rooms_visited: number of rooms visited so far

    @type _nbr_lines: int
    @param _nbr_lines: number of lines in our datacenter

    @type _nbr_columns: int
    @param _nbr_columns: number of columns in our datacenter

    @type _nbr_ways: int
    @param _nbr_ways: number of ways (result)
    """

    def __init__(self, nbr_lines, nbr_columns):
        self._rooms = {}
        self._entry = (0, 0)
        self._nbr_rooms = nbr_lines * nbr_columns
        # nothing is visited yet; the entry room is counted once find_way visits it
        self._nbr_rooms_visited = 0
        self._nbr_lines = nbr_lines
        self._nbr_columns = nbr_columns
        self._nbr_ways = 0

    def add(self, type, line, column, look_for_entry=True):
        self._rooms[(line, column)] = Room(type)
        # NOGO rooms can never be entered, so they don't count towards a full path
        if self._rooms[(line, column)].is_nogo():
            self._nbr_rooms -= 1
        if look_for_entry:
            if self._rooms[(line, column)].is_entry():
                self._entry = (line, column)
                look_for_entry = False

    def _construct_neighbours(self, coord):
        l, c = coord
        room = self._rooms[(l, c)]
        room.init()
        if l > 0:
            if not (self._rooms[(l - 1, c)].is_nogo() or self._rooms[(l - 1, c)].is_visited()):
                room.add((l - 1, c))
        if l + 1 < self._nbr_lines:
            if not (self._rooms[(l + 1, c)].is_nogo() or self._rooms[(l + 1, c)].is_visited()):
                room.add((l + 1, c))
        if c > 0:
            if not (self._rooms[(l, c - 1)].is_nogo() or self._rooms[(l, c - 1)].is_visited()):
                room.add((l, c - 1))
        if c + 1 < self._nbr_columns:
            if not (self._rooms[(l, c + 1)].is_nogo() or self._rooms[(l, c + 1)].is_visited()):
                room.add((l, c + 1))

    def _visit(self, room):
        room.visit()
        self._nbr_rooms_visited += 1

    def _unvisit(self, room):
        room.unvisit()
        self._nbr_rooms_visited -= 1
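    # The search below is plain backtracking: starting from the entry room,
    # visit a free neighbour, recurse, then un-visit it on the way back; a
    # way only counts when the exit is reached after every visitable room
    # has been seen. Hand-checked toy case: the 2x2 datacenter
    #     2 3
    #     0 0
    # admits exactly one way, (0,0) -> (1,0) -> (1,1) -> (0,1).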
    def find_way(self, current_room_coord=None):
        if current_room_coord is None:
            current_room_coord = self._entry
            self._visit(self._rooms[current_room_coord])
        # check if exit
        elif self._rooms[current_room_coord].is_exit():
            if self._nbr_rooms_visited == self._nbr_rooms:
                self._nbr_ways += 1
                return True
            else:
                return False
        # no exit yet, try this room's neighbours
        self._construct_neighbours(current_room_coord)
        current_room = self._rooms[current_room_coord]
        for neighbour in current_room._neighbours:
            self._visit(self._rooms[neighbour])
            self.find_way(neighbour)
            self._unvisit(self._rooms[neighbour])
        # at this point we couldn't find the exit
        return False


W, H = [int(x) for x in raw_input().split()]
cool = Cooling(H, W)
for l in xrange(H):
    rooms = [int(x) for x in raw_input().split()]
    for c in xrange(W):
        cool.add(rooms[c], l, c)
cool.find_way()
print cool._nbr_ways
--------------------------------------------------------------------------------
/quora/dcc.c:
--------------------------------------------------------------------------------
/*
 * Author : mourad mourafiq (07/01/2012)
 *
 * This is an attempt to solve the datacenter cooling problem
 */
#include <stdio.h>
#include <stdlib.h>

//constants
#define VISITED_ROOM '4'
#define GO_ROOM '0'
#define NOGO_ROOM '1'
#define ENTRY_ROOM '2'
#define EXIT_ROOM '3'

//global variables
char **rooms;
int *to_check;
int nbr_rooms;
int H;
int W;
int nbr_rooms_visited=0;
int nbr_ways=0;

int way_exists(int l, int c){
    int i, j;
    int result = 0;
    int nbr_to_check = (nbr_rooms - nbr_rooms_visited)*2;
    int count_added = 0;
    if (c > 0){
        if (rooms[l][c-1] == GO_ROOM){
            rooms[l][c-1] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l;
            to_check[count_added++] = c-1;
        }
        else if (rooms[l][c-1] == EXIT_ROOM){
            result = 1;
        }
    }
    if (l > 0){
        if (rooms[l-1][c] == GO_ROOM){
            rooms[l-1][c] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l-1;
            to_check[count_added++] = c;
        }
        else if (rooms[l-1][c] == EXIT_ROOM){
            result = 1;
        }
    }
    if (c+1 < W){
        if (rooms[l][c+1] == GO_ROOM){
            rooms[l][c+1] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l;
            to_check[count_added++] = c+1;
        }
        else if (rooms[l][c+1] == EXIT_ROOM){
            result = 1;
        }
    }
    if (l+1 < H){
        if (rooms[l+1][c] == GO_ROOM){
            rooms[l+1][c] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l+1;
            to_check[count_added++] = c;
        }
        else if (rooms[l+1][c] == EXIT_ROOM){
            result = 1;
        }
    }
    if (result == 1){
        for (i=0; i 0){
            if (rooms[cl][cc-1] == GO_ROOM){
                rooms[cl][cc-1] = VISITED_ROOM;
                //nbr_to_check--;
                to_check[count_added++] = cl;
                to_check[count_added++] = cc-1;
            }
            else if (rooms[cl][cc-1] == EXIT_ROOM)
                exit_found = 1;
        }
        if (cl > 0){
            if (rooms[cl-1][cc] == GO_ROOM){
                rooms[cl-1][cc] = VISITED_ROOM;
                //nbr_to_check--;
                to_check[count_added++] = cl-1;
                to_check[count_added++] = cc;
            }
            else if (rooms[cl-1][cc] == EXIT_ROOM)
                exit_found = 1;
        }
        if (cc+1 < W){
            if (rooms[cl][cc+1] == GO_ROOM){
                rooms[cl][cc+1] = VISITED_ROOM;
                //nbr_to_check--;
                to_check[count_added++] = cl;
                to_check[count_added++] = cc+1;
            }
            else if (rooms[cl][cc+1] == EXIT_ROOM)
                exit_found = 1;
        }
        if
(cl+1 < H){ 119 | if (rooms[cl+1][cc] == GO_ROOM){ 120 | rooms[cl+1][cc] = VISITED_ROOM; 121 | //nbr_to_check--; 122 | to_check[count_added++] = cl+1; 123 | to_check[count_added++] = cc; 124 | } 125 | else if (rooms[cl+1][cc] == EXIT_ROOM) 126 | exit_found = 1; 127 | } 128 | if ((nbr_to_check == count_added) && (exit_found == 1)){ 129 | result = 1; 130 | break; 131 | } 132 | if (cpt >= count_added){ 133 | result = 0; 134 | break; 135 | } 136 | } 137 | for (i=0; i 0){ 148 | if (rooms[l][c-1] == GO_ROOM){ 149 | rooms[l][c-1] = VISITED_ROOM; 150 | nbr_rooms_visited++; 151 | find_ways(l, c-1); 152 | rooms[l][c-1] = GO_ROOM; 153 | nbr_rooms_visited--; 154 | } 155 | else if ((rooms[l][c-1] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 156 | nbr_ways++; 157 | return 1; 158 | } 159 | } 160 | if (l > 0){ 161 | if (rooms[l-1][c] == GO_ROOM){ 162 | rooms[l-1][c] = VISITED_ROOM; 163 | nbr_rooms_visited++; 164 | find_ways(l-1, c); 165 | rooms[l-1][c] = GO_ROOM; 166 | nbr_rooms_visited--; 167 | } 168 | else if ((rooms[l-1][c] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 169 | nbr_ways++; 170 | return 1; 171 | } 172 | } 173 | if (c+1 < W){ 174 | if (rooms[l][c+1] == GO_ROOM){ 175 | rooms[l][c+1] = VISITED_ROOM; 176 | nbr_rooms_visited++; 177 | find_ways(l, c+1); 178 | rooms[l][c+1] = GO_ROOM; 179 | nbr_rooms_visited--; 180 | } 181 | else if ((rooms[l][c+1] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 182 | nbr_ways++; 183 | return 1; 184 | } 185 | } 186 | if (l+1 < H){ 187 | if (rooms[l+1][c] == GO_ROOM){ 188 | rooms[l+1][c] = VISITED_ROOM; 189 | nbr_rooms_visited++; 190 | find_ways(l+1, c); 191 | rooms[l+1][c] = GO_ROOM; 192 | nbr_rooms_visited--; 193 | } 194 | else if ((rooms[l+1][c] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 195 | nbr_ways++; 196 | return 1; 197 | } 198 | } 199 | return 0; 200 | } 201 | 202 | int main(int argc, char* argv[]){ 203 | int i, j, start_l, start_c; 204 | scanf("%d %d\n", &W, &H); 205 | nbr_rooms = W * H - 2; 206 | char line[sizeof(char)*(W+2)*2]; 207 | rooms = (char **) malloc(H * sizeof(char *)); 208 | for (i=0; i story._proportioned_score): 58 | return True 59 | if (self._proportioned_score < story._proportioned_score): 60 | return False 61 | if (self._id < story._id): 62 | return True 63 | return False 64 | 65 | def _better_score(self, story): 66 | if (self._score > story._score): 67 | return True 68 | if (self._score < story._score): 69 | return False 70 | if (self._id < story._id): 71 | return True 72 | return False 73 | 74 | 75 | class Solution(object): 76 | """ 77 | Potential solution for the upcoming reload 78 | 79 | @type _stories: list 80 | @param _stories: The list of potential items. 81 | 82 | @type _len_stories : int 83 | @param _len_stories: The length of the list of stories. 84 | 85 | @type _score: int 86 | @param _score: The current solution's score. 87 | 88 | @type _height: int 89 | @param _height: The current solution's height. 
    """

    def __init__(self):
        self._stories = []
        self._len_stories = 0
        self._score = 0
        self._height = 0

    def __repr__(self):
        return "%s %s %s" % (
            self._score, self._len_stories, ' '.join(sorted([str(story._id) for story in self._stories])))

    def __gt__(self, solution):
        # check whose score is better
        if self._score > solution._score:
            return True
        if self._score < solution._score:
            return False
        # same score; check who has fewer stories
        if self._len_stories < solution._len_stories:
            return True
        if self._len_stories > solution._len_stories:
            return False
        # same score, same number of stories; check who is smaller lexicographically
        if sorted([story._id for story in self._stories]) <= sorted([story._id for story in solution._stories]):
            return True
        else:
            return False

    @classmethod
    def clone(cls, solution):
        clone_solution = cls()
        clone_solution._stories = copy.copy(solution._stories)
        clone_solution._len_stories = solution._len_stories
        clone_solution._score = solution._score
        clone_solution._height = solution._height
        return clone_solution

    def add(self, story):
        """
        add story to the solution
        """
        self._stories.append(story)
        self._score += story._score
        self._height += story._height
        self._len_stories += 1

    def remove(self, story):
        """
        remove story from the solution
        """
        self._stories.remove(story)
        self._score -= story._score
        self._height -= story._height
        self._len_stories -= 1


class Optimizer(object):
    """
    Keep track of stories that can potentially make a solution.
    The stories should be sorted by time of publication.

    @type _stories: list
    @param _stories: The list of stories that can potentially make a solution.

    @type _len_stories : int
    @param _len_stories: The length of the list of stories.

    @type __height: int
    @param __height: The height of the browser.

    @type __window: int
    @param __window: The window of recent stories.

    @type _best_story: Story
    @param _best_story: The best story so far.
    """
    __height = 0
    __window = 0

    def __init__(self, window, height):
        self._stories = []
        self._len_stories = 0
        Optimizer.__window = window
        Optimizer.__height = height
        self._best_story = Story()

    def _purge_old_stories(self, current_time):
        """
        remove old stories from the current list of stories
        """
        # check if the oldest stories can still be part of the solution
        to_be_removed = []
        for old_story in self._stories:
            if (current_time - old_story._time) <= Optimizer.__window:
                break
            else:
                to_be_removed.append(old_story)
        for old_story in to_be_removed:
            self._stories.remove(old_story)
            self._len_stories -= 1
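    # Note on cost (added): _brute_force below enumerates subsets of the
    # candidate stories with itertools.combinations, i.e. on the order of
    # 2**n checks for n live stories. That is workable for small windows,
    # which is why the simulated annealing variant exists as an alternative.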
    def _brute_force(self):
        """
        check all possibilities:
            1) best solution for combination of 2 stories (if it exists).
            2) best solution for combination of 3 stories (if it exists).
            .
            .
            l-1) best solution for combination of l-1 stories (if it exists).

        l : being the length of the current stories.
        """
        best_solution = Solution()
        best_solution.add(self._best_story)
        for i in xrange(2, self._len_stories + 1):
            for tuple_stories in itertools.combinations(self._stories, i):
                if self.addable(tuple_stories):
                    current_solution = Solution()
                    for story in tuple_stories:
                        current_solution.add(story)
                    if current_solution > best_solution:
                        best_solution = current_solution
        return best_solution

    def _annealing_simulated(self, T=1000.0, cool=0.35):
        """
        perform the simulated annealing algorithm:
            1) start with a random solution.
            2) move to a neighbour solution.
            (favors better solutions, and accepts worse solutions with a certain
            probability to avoid local minima, until the temperature is totally down)
        """
        # order stories based on their proportioned score
        ordered_stories = sorted(self._stories, reverse=True)
        # produce a random solution
        current_solution, stories_in_current = self.random_solution(ordered_stories, self._len_stories)
        best_solution = Solution.clone(current_solution)
        while (T > 0.1):
            temp_solution = Solution.clone(current_solution)
            stories_in_temp = copy.copy(stories_in_current)
            stories_at_true = [i for i in xrange(self._len_stories) if stories_in_temp[i]]
            # check whether any stories are left to swap in
            if len(stories_at_true) == self._len_stories:
                break
            # choose a story and remove it
            if stories_at_true:
                indice = choice(stories_at_true)
                stories_in_temp[indice] = False
                temp_solution.remove(ordered_stories[indice])
            else:
                indice = -1
            # add any number of other stories available
            for i in xrange(indice + 1, self._len_stories):
                if stories_in_temp[i]:
                    continue
                story = ordered_stories[i]
                if self.addable((story,), temp_solution):
                    stories_in_temp[i] = True
                    temp_solution.add(story)
                elif temp_solution._height == self.__height:
                    break
            # compare temp and current solutions
            if temp_solution > current_solution:
                current_solution = temp_solution
                stories_in_current = stories_in_temp
                # also, since temp is better than current, compare it to best
                if current_solution > best_solution:
                    best_solution = Solution.clone(current_solution)
            # current solution is better than temp;
            # the algorithm states that we can still give it a try, depending on a probability
            else:
                # since the temp solution score is < the current solution score,
                # this probability will be near one at the beginning, when T is high,
                # but will get lower and lower as T cools down,
                # hence we accept fewer and fewer bad solutions
                p = pow(math.e, float(temp_solution._score - current_solution._score) / T)
                if p > random():
                    current_solution = temp_solution
                    stories_in_current = stories_in_temp
            # decrease the temperature
            T = T * cool
        return best_solution

    def add(self, story):
        # check if the story's height is within the browser's height
        if story._height <= Optimizer.__height:
            self._stories.append(story)
            self._len_stories += 1
            if (story > self._best_story):
                self._best_story = story

    def produce_solution(self, current_time, solution=BRUTE_FORCE):
        self._purge_old_stories(current_time)
        if solution == BRUTE_FORCE:
            return self._brute_force()
        elif solution == ANNEALING_SIMULATED:
            return self._annealing_simulated()
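    # How the acceptance rule in _annealing_simulated behaves (added note):
    # p = e**((temp_score - current_score)/T), so for a score drop of 50,
    # p = e**(-50/1000) ~ 0.95 while T = 1000 but only ~ 0.007 once T has
    # cooled to 10: worse solutions are accepted freely early, rarely late.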
    @classmethod
    def addable(cls, tuple_stories, solution=Solution()):
        total_height = solution._height
        for story in tuple_stories:
            total_height += story._height
        if total_height <= cls.__height:
            return True
        return False

    @classmethod
    def random_solution(cls, list_stories, length_stories):
        """
        produce a random solution
        """
        stories_in = [False] * length_stories
        solution = Solution()
        for i in xrange(length_stories):
            story = list_stories[i]
            if cls.addable((story,), solution):
                solution.add(story)
                stories_in[i] = True
            elif solution._height == cls.__height:
                break
        return solution, stories_in


N, W, H = [int(x) for x in raw_input().split()]
p = Optimizer(W, H)
while (N):
    command = raw_input().split()
    if command[0] == "S":  # story
        t, s, h = [int(x) for x in command[1:]]
        p.add(Story(t, s, h))
    elif command[0] == "R":  # Reload
        tr = int(command[1])
        print p.produce_solution(tr, solution=ANNEALING_SIMULATED)
    N -= 1
--------------------------------------------------------------------------------
/quora/nearby.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Jan 09, 2013

@author: Mourad Mourafiq

About: This is an attempt to solve the Quora challenge Nearby.
"""
import math
import heapq

THRESHOLD = 0.001
SQUARE_SIDE = 10


class Square(object):
    """
    Square is a data structure that represents a part of the plane.
    A square is divided into 4 parts.

    @type _origine_x: float
    @param _origine_x: the x coordinate of the origin of this square

    @type _origine_y: float
    @param _origine_y: the y coordinate of the origin of this square

    @type _current_distance: float
    @param _current_distance: current distance from the query coordinates

    @type _tn: int
    @param _tn: number of topics in this square

    @type _topics: list
    @param _topics: list of topics
    """

    def __init__(self, origine_x, origine_y):
        self._origine_x = origine_x
        self._origine_y = origine_y
        self._current_distance = 0
        self._tn = 0
        self._topics = []

    def __gt__(self, square):
        delta = self._current_distance - square._current_distance
        if delta < 0:
            return True
        return False

    def add(self, topic):
        self._tn += 1
        self._topics.append(topic)

    def set_current_distance(self, origin_x, origin_y):
        self._current_distance = Topic.euclidean_dis(self._origine_x, self._origine_y, origin_x, origin_y)
        for topic in self._topics:
            topic.set_current_distance(origin_x, origin_y)

    def get_topics(self, tn):
        if self._tn >= tn:
            return self._topics[:tn], 0, False
        else:
            return self._topics, tn - self._tn, True
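# Illustration (added): with SQUARE_SIDE = 10, topics are bucketed into
# 10x10 cells whose centres sit at (5, 5), (15, 5), (25, 5), ... A point
# (x, y) falls in the cell centred at (x - x % 10 + 5, y - y % 10 + 5);
# e.g. (23.4, 7.9) maps to (25.0, 5.0), as in NearbySquare.add_topic below.
def _demo_square_centre(x=23.4, y=7.9):
    return (x - x % 10 + 5, y - y % 10 + 5)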
class Topic(object):
    """
    Topic

    @type _id: int
    @param _id: the id of the topic

    @type _x: float
    @param _x: the x coordinate in the plane

    @type _y: float
    @param _y: the y coordinate in the plane

    @type _current_distance: float
    @param _current_distance: the current distance from the origin (origin being the query coordinates)

    @type _qn: int
    @param _qn: the number of questions associated with this topic

    @type _questions: list
    @param _questions: the list of the questions associated with this topic
    """

    def __init__(self, id, x, y):
        self._id = id
        self._x = x
        self._y = y
        self._current_distance = 0
        self._qn = 0
        self._questions = []

    def __gt__(self, topic):
        delta = self._current_distance - topic._current_distance
        if delta < -THRESHOLD:
            return True
        if delta > THRESHOLD:
            return False
        return True if self._id > topic._id else False

    def add(self, question):
        self._qn += 1
        self._questions.append(question)

    def get_questions(self, qn, questions):
        go_on = True
        for question in self._questions:
            if question not in questions:
                questions.append(question)
                qn -= 1
                if qn == 0:
                    go_on = False
                    break
        return sorted(questions, reverse=True), qn, go_on

    def set_current_distance(self, origin_x, origin_y):
        self._current_distance = self.euclidean_dis(self._x, self._y, origin_x, origin_y)

    @staticmethod
    def euclidean_dis(x1, y1, x2, y2):
        return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))


class Nearby(object):
    """
    Nearby solver

    @type _tn: int
    @param _tn: the number of topics created

    @type _topics: dict
    @param _topics: the dictionary of topics created
    """

    def __init__(self, tn):
        self._tn = tn
        self._topics = {}

    def add_topic(self, topic_id, x, y):
        self._topics[topic_id] = Topic(topic_id, x, y)

    def add_question(self, question, nbr_topics, topics):
        if nbr_topics <= 0:
            return
        for i in xrange(nbr_topics):
            topic_id = int(topics[i])
            self._topics[topic_id].add(question)

    def _process_query_topic(self, nbr_results, list_topics):
        if nbr_results > self._tn:
            nbr_results = self._tn
        return ' '.join([str(list_topics[i]._id) for i in xrange(nbr_results - 1, -1, -1)])

    def _process_query_question(self, nbr_results, list_topics):
        results = []
        go_on = True
        for i in xrange(self._tn - 1, -1, -1):
            results, nbr_results, go_on = list_topics[i].get_questions(nbr_results, results)
            if not go_on:
                break
        return ' '.join([str(x) for x in results])

    def process_query(self, q_type, q_nbr_results, q_x, q_y):
        list_topics = []
        for topic in self._topics.itervalues():
            topic.set_current_distance(q_x, q_y)
            heapq.heappush(list_topics, topic)
        if q_type == "t":
            return self._process_query_topic(q_nbr_results, list_topics)
        if q_type == "q":
            return self._process_query_question(q_nbr_results, list_topics)


class NearbySquare(Nearby):
    """
    Nearby solver using the square data structure

    @type _tn: int
    @param _tn: the number of topics created

    @type _ts: int
    @param _ts: the number of squares created

    @type _topics: dict
    @param _topics: the dictionary of topics created

    @type _squares: dict
    @param _squares: the dictionary of squares created
    """

    def __init__(self, tn):
        self._tn = tn
        self._ts = 0
        self._topics = {}
        self._squares = {}

    def add_topic(self, topic_id, x, y):
        topic = Topic(topic_id, x, y)
        self._topics[topic_id] = topic
        # locate which square this topic should go in
        left_x = x % 10
        left_y = y % 10
        square_x = (x - left_x) + 5
square_y = (y - left_y) + 5 209 | # check if this square exists 210 | try: 211 | square = self._squares[(square_x, square_y)] 212 | except: 213 | square = Square(square_x, square_y) 214 | self._squares[(square_x, square_y)] = square 215 | self._ts += 1 216 | square.add(topic) 217 | 218 | def _process_query_topic(self, nbr_results, list_squares): 219 | results = [] 220 | go_on = True 221 | if nbr_results > self._tn: 222 | nbr_results = self._tn 223 | for i in xrange(self._ts - 1, -1, -1): 224 | temp_results, nbr_results, go_on = list_squares[i].get_topics(nbr_results) 225 | results += temp_results 226 | if not go_on: 227 | break 228 | results = sorted(results, reverse=True) 229 | return ' '.join([str(result._id) for result in results]) 230 | 231 | def _process_query_question(self, nbr_results, list_squares): 232 | results = [] 233 | go_on = True 234 | for i in xrange(self._ts - 1, -1, -1): 235 | for topic in sorted(list_squares[i]._topics, reverse=True): 236 | results, nbr_results, go_on = topic.get_questions(nbr_results, results) 237 | if not go_on: 238 | break 239 | if not go_on: 240 | break 241 | return ' '.join([str(x) for x in results]) 242 | 243 | def process_query(self, q_type, q_nbr_results, q_x, q_y): 244 | list_squares = [] 245 | for square in self._squares.itervalues(): 246 | square.set_current_distance(q_x, q_y) 247 | heapq.heappush(list_squares, square) 248 | if q_type == "t": 249 | return self._process_query_topic(q_nbr_results, list_squares) 250 | if q_type == "q": 251 | return self._process_query_question(q_nbr_results, list_squares) 252 | 253 | 254 | T, Q, N = [int(x) for x in raw_input().split()] 255 | nearby = NearbySquare(T) 256 | while (T): # list of topics 257 | command = raw_input().split() 258 | nearby.add_topic(int(command[0]), float(command[1]), float(command[2])) 259 | T -= 1 260 | while (Q): # list of questions 261 | command = raw_input().split() 262 | nearby.add_question(int(command[0]), int(command[1]), command[2:]) 263 | Q -= 1 264 | while (N): # process queries 265 | command = raw_input().split() 266 | print nearby.process_query(command[0], int(command[1]), float(command[2]), float(command[3])) 267 | N -= 1 -------------------------------------------------------------------------------- /quora/results.txt: -------------------------------------------------------------------------------- 1 | 3rCWr +1 2 | snInN -1 3 | ibfT7 -1 4 | IcbKR +1 5 | SIXmF +1 6 | dLCdh +1 7 | ziFJ8 -1 8 | 1WtTD -1 9 | 9uIKh +1 10 | df4Mc -1 11 | 3nxpY -1 12 | aesmq +1 13 | MyTDz +1 14 | TDMhx +1 15 | Y0rW3 +1 16 | KCcKf +1 17 | 2cz5M -1 18 | kqIJj -1 19 | C1Sg2 +1 20 | VAmIt -1 21 | ku9j1 +1 22 | TjHRV +1 23 | sq8Xj -1 24 | uKzm4 +1 25 | 3at4H +1 26 | 8nXGS +1 27 | 4x8ij +1 28 | PeMnA -1 29 | UUPpU -1 30 | HQ4lZ +1 31 | lEv01 +1 32 | XCY52 -1 33 | FNoY7 +1 34 | JJbco -1 35 | PHQ7z +1 36 | Xejsj +1 37 | A4IsT -1 38 | 7cU9R -1 39 | 8mSRL -1 40 | jRMuo -1 41 | dljxu +1 42 | wZyoj -1 43 | us2ca +1 44 | EVenw -1 45 | QLseT +1 46 | lanI5 +1 47 | RF9di +1 48 | 3e6Aa -1 49 | W5mvO -1 50 | LkKbu -1 51 | gbus8 -1 52 | LN4W4 -1 53 | 9FNA4 -1 54 | rd3qM +1 55 | pV8eI -1 56 | 5wnO7 -1 57 | 03KFY -1 58 | i25BS +1 59 | AziH8 -1 60 | YyCpz -1 61 | qLfPb +1 62 | CJBeL -1 63 | EaAPx -1 64 | sQDFf -1 65 | CELfn -1 66 | Ac6Hy -1 67 | ULryN -1 68 | qCBAx -1 69 | Hoz2c -1 70 | 8kLbb +1 71 | KAWwb +1 72 | lNMTe +1 73 | MzTHU -1 74 | Sq8XV -1 75 | s8ZUG -1 76 | lDcDx +1 77 | xZoGD +1 78 | vHLAR +1 79 | Ag2kt +1 80 | baqkE +1 81 | Hsbs2 -1 82 | BlvEz -1 83 | CJqsS -1 84 | vbzII +1 85 | R16fw -1 86 | IxBjS +1 
87 | yJl2b -1 88 | RXdcX +1 89 | T7uzV +1 90 | ccZGw +1 91 | f9xFZ -1 92 | dO9iE -1 93 | 2TaN2 +1 94 | a1XoY -1 95 | 1xXiG -1 96 | a8kZK -1 97 | mXe41 +1 98 | 8NfbF +1 99 | Sqdti +1 100 | gnCsH -1 101 | YGNBE +1 102 | zOA3j +1 103 | b3Cm5 +1 104 | Wjo2X +1 105 | IcuwU +1 106 | FyieE +1 107 | 5OUK8 +1 108 | SjCvq -1 109 | kKVUa +1 110 | CEVSg +1 111 | F2MAp -1 112 | hOKPP +1 113 | X22r3 +1 114 | kDxwQ +1 115 | s42QM -1 116 | Olrdu -1 117 | P6Fag -1 118 | IHvly +1 119 | 5bWYy -1 120 | 5zeok +1 121 | 6Z4hF -1 122 | 1RjNM +1 123 | yc6uV +1 124 | JC92f +1 125 | pw6Bl +1 126 | KYPQw -1 127 | ZPFtI +1 128 | ZBvXR -1 129 | wofSH -1 130 | Q4Ika +1 131 | Y7U06 +1 132 | orSui +1 133 | BZ0Op -1 134 | 0T3oe -1 135 | A9NM2 +1 136 | nHCpf -1 137 | tieFX +1 138 | i28eq +1 139 | XftK1 -1 140 | DjFAx +1 141 | WDvPc +1 142 | UEEYY +1 143 | GKXw3 +1 144 | N5qAi +1 145 | 1DGAU -1 146 | XlcdP -1 147 | CVfA1 -1 148 | b39YD -1 149 | d1NlJ +1 150 | ue6lj -1 151 | hx3rt -1 152 | wc0Vt -1 153 | 8iD9Q -1 154 | PNt7q -1 155 | Y97G2 +1 156 | svZ34 +1 157 | sxgEq +1 158 | ZDIWx -1 159 | rD2Az -1 160 | pRsn8 +1 161 | MiByI +1 162 | vM7l5 +1 163 | kbQbh -1 164 | DhzQW -1 165 | W3cWn -1 166 | ItQ1c -1 167 | 4RG18 +1 168 | 9bZNj +1 169 | IHJ5G -1 170 | bsFcm -1 171 | LWfTR -1 172 | ZqXuD -1 173 | b3miO -1 174 | ruJ8j +1 175 | kpmxZ -1 176 | zmAo1 -1 177 | Xaref +1 178 | BhtUL -1 179 | FJefe +1 180 | EGsdK +1 181 | JMmSL +1 182 | SxZPl -1 183 | A9yVd -1 184 | YRv3l +1 185 | Np4je -1 186 | FC1TZ +1 187 | v4CRw +1 188 | DjT9c +1 189 | TEVGW +1 190 | DAO68 +1 191 | 7aT15 +1 192 | quDTm +1 193 | kUG9i -1 194 | FH95r +1 195 | dYH4a +1 196 | 1FDKf +1 197 | lVhAa -1 198 | pL58M -1 199 | UXRFO +1 200 | oNz1I +1 201 | oomHm -1 202 | BHZT7 +1 203 | Ky1yG +1 204 | Esu6G -1 205 | rucD8 -1 206 | NntCQ -1 207 | MiAWn -1 208 | a7X2g +1 209 | fH4fZ +1 210 | ew4Ra -1 211 | py7OD -1 212 | fsgxM +1 213 | bXpcf -1 214 | 9wBEn -1 215 | tgchk +1 216 | YHFHF -1 217 | VuXx7 -1 218 | rEnhQ -1 219 | APrIe +1 220 | SxoIg +1 221 | qdQFT +1 222 | v8u87 +1 223 | BTWxu +1 224 | JxQvo -1 225 | 3SvJ7 -1 226 | Gy42c -1 227 | lWPIk -1 228 | r1TCV -1 229 | LhKtL -1 230 | hfawL -1 231 | KJDMV -1 232 | BN93Q +1 233 | 4eMVe -1 234 | rqtFq -1 235 | rUI3j +1 236 | CERA7 -1 237 | S9Azu -1 238 | z6LjY +1 239 | e0tCJ +1 240 | n9CNr -1 241 | lEJzf +1 242 | gDUHm +1 243 | lWFHR +1 244 | MATqj +1 245 | 6Xt0t +1 246 | iIwLp +1 247 | 9Zm1e -1 248 | OHFov +1 249 | uan5I -1 250 | frFAd +1 251 | HkBSy +1 252 | C3a0q -1 253 | wh2WS +1 254 | ReTOY +1 255 | BT4jS -1 256 | 0xECG +1 257 | vj72F +1 258 | 7k7OU +1 259 | 3JIx0 -1 260 | UiKFt +1 261 | 1vGez -1 262 | mPsf0 -1 263 | AzO5u +1 264 | hpRrl -1 265 | 7cwj6 -1 266 | XMwNb -1 267 | kLXoo -1 268 | gJr3Q -1 269 | ZS89w +1 270 | DGunR +1 271 | 8KygV +1 272 | W7o8e +1 273 | uY40J -1 274 | b8Vdt -1 275 | NB5GM +1 276 | UW8Sd +1 277 | 8ilC4 -1 278 | Z37t7 -1 279 | TYLgu -1 280 | 28rEW +1 281 | YMh5A -1 282 | Z4KU6 -1 283 | YuXTr +1 284 | BymPK -1 285 | 0IiAP +1 286 | cqded +1 287 | NyiLN +1 288 | 6x5i5 +1 289 | 7w0vA +1 290 | 1uQnB +1 291 | oy1UM +1 292 | PKxcI -1 293 | 3CZAd +1 294 | hehRo -1 295 | a9F6V +1 296 | WhX2F +1 297 | ge4Nb -1 298 | Pe9Ds +1 299 | lZ14T +1 300 | mfglK +1 301 | 42faP +1 302 | lAque +1 303 | ntnal +1 304 | Jasel -1 305 | YYoW1 -1 306 | DHJVP +1 307 | SUIoA -1 308 | Tppy2 -1 309 | 7SLdU +1 310 | MJKb9 +1 311 | Nlg2a +1 312 | RNcmi +1 313 | SFZMz -1 314 | 9ukXM +1 315 | WMHcA +1 316 | Fifzv -1 317 | fDgUN -1 318 | SuI41 +1 319 | 6ikOI +1 320 | yvCqh -1 321 | UOpC8 -1 322 | 75C7e -1 323 | m5S2h +1 324 | gqSig 
-1 325 | BVxmn -1 326 | E3R54 +1 327 | 49kJj +1 328 | hgr96 +1 329 | ydfLT -1 330 | wjeiV +1 331 | Zb62A -1 332 | RrWdE -1 333 | 2H2wU +1 334 | HcVYf -1 335 | 5KjAs -1 336 | M2oQC +1 337 | 93kvL -1 338 | TC3y5 -1 339 | nvi5Q -1 340 | 31L05 +1 341 | uOBuO +1 342 | 7139c +1 343 | FWxzp +1 344 | RSwJ2 +1 345 | 2iRKp +1 346 | gJYDb -1 347 | NBV2w -1 348 | Xu24W +1 349 | 1B3K5 +1 350 | Hpe1Y +1 351 | 8fo4i -1 352 | mZnrj -1 353 | 4PTcN -1 354 | PwRKw +1 355 | lvuaq -1 356 | Xyz8E -1 357 | l8TsJ +1 358 | YXBaC -1 359 | 3AiLQ -1 360 | AUdZp -1 361 | OrhRn +1 362 | 0hKzy +1 363 | yVJG9 +1 364 | DwZ91 -1 365 | CiCix +1 366 | 1rURm -1 367 | uRb8x +1 368 | hNdOm -1 369 | 0mlqT +1 370 | HVfdA +1 371 | GQTwU +1 372 | FbAKm +1 373 | RkxT4 -1 374 | unLvt -1 375 | sAI1b +1 376 | m1Vzy +1 377 | vnCWO +1 378 | SheVD +1 379 | xU9p1 -1 380 | tWQIu +1 381 | D4Q0p +1 382 | J6oW5 +1 383 | JYSZN +1 384 | DSkNC -1 385 | NMX1A -1 386 | ilHXM -1 387 | gqx1j +1 388 | twJio -1 389 | AC2iU +1 390 | UjSKR -1 391 | kMJtM -1 392 | oCbBC -1 393 | 8ecRn -1 394 | eAnji +1 395 | ZON1D +1 396 | hKsn0 +1 397 | x8qVu -1 398 | xyI2l -1 399 | LQZ9d -1 400 | yCiZ7 -1 401 | z1G6d +1 402 | AEaC5 +1 403 | uwRzL -1 404 | QYyXT +1 405 | pvcTy -1 406 | pDQOT +1 407 | DexdS +1 408 | 2t6rO -1 409 | 3uMDO -1 410 | k7V2B -1 411 | sW0fL -1 412 | 3RJAN +1 413 | mB6hU +1 414 | CvxqX -1 415 | YYcs9 -1 416 | cZAn3 -1 417 | sToxl +1 418 | rZcB9 -1 419 | LiYCH -1 420 | 8Zquw -1 421 | PmZDG -1 422 | rsCuL -1 423 | dliev +1 424 | Hb30K +1 425 | LmSJT +1 426 | x8VHE +1 427 | gLbT6 -1 428 | 3ZCSD +1 429 | P0bbj +1 430 | 0SCTp -1 431 | Y9VUN +1 432 | hk8gh -1 433 | 8Jxn7 +1 434 | zyM6v -1 435 | 1UT5q -1 436 | xzTJu -1 437 | L5ty3 -1 438 | q3iR0 +1 439 | kIlr3 +1 440 | AFno7 -1 441 | Q30WV +1 442 | 9WyKp +1 443 | OrbMK -1 444 | 1crzL -1 445 | 75zq7 -1 446 | 0M9md -1 447 | 7x4uA +1 448 | g42wQ -1 449 | W6kby -1 450 | VzRvG +1 451 | wnddf -1 452 | Fw5Sc -1 453 | btL2k -1 454 | lmsyA -1 455 | QIczK -1 456 | i9PM1 +1 457 | ULDFC -1 458 | BjyGS +1 459 | chgHB +1 460 | y32d8 +1 461 | W5R4s +1 462 | lBi1Q +1 463 | GnXj3 -1 464 | 3ekNC -1 465 | wi6YM -1 466 | E29HU -1 467 | PZaNW -1 468 | ru6BY -1 469 | EIRJY +1 470 | L7OaE +1 471 | EcJrP +1 472 | vfi01 -1 473 | 1Vly7 -1 474 | ibAED -1 475 | GPqoG +1 476 | ADfzT -1 477 | FTRNE +1 478 | chBAx -1 479 | vQ6KG -1 480 | KwXO4 -1 481 | 3TzgC +1 482 | PgYPK -1 483 | M8Sqt -1 484 | CPbgW +1 485 | N2j8m +1 486 | taDRA -1 487 | ZLcrw +1 488 | H4sPY -1 489 | BkBE0 +1 490 | zLXXn -1 491 | yQjNY -1 492 | pWyKW +1 493 | hiH6G +1 494 | Ag1tS -1 495 | 7OD1M -1 496 | VBFpQ +1 497 | nP4UA +1 498 | opFHG -1 499 | siSyi -1 500 | zmgX6 -1 -------------------------------------------------------------------------------- /quora/typehead.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on Jan 04, 2013 4 | 5 | @author: Mourad Mourafiq 6 | 7 | About: This is an attempt to solve the Quora challenge Typehead. 8 | ''' 9 | import re 10 | import copy 11 | import datetime 12 | 13 | COMMANDS = "(ADD)|(DEL)|(W?QUERY)" 14 | ANY_STRING = "(\\S*.*)" 15 | SEPARATORS = "(?: |\\t)" 16 | IDS = "\\w+" 17 | TYPES = "user|topic|question|board" 18 | FLOATS = "[0-9]+(?:.[0-9]*)?" 
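# Sample commands this grammar accepts (hypothetical examples, for illustration only):
#   ADD question q1 0.5 machine learning basics
#   DEL q1
#   QUERY 10 machine
#   WQUERY 10 1 topic:9.5 machine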
INTS = "[0-9]+"
BOOSTS = "((?:" + TYPES + "|(?:" + IDS + ")):" + FLOATS + SEPARATORS + ")*"
ANY_COMMAND = "(?P<command>" + COMMANDS + ")" + SEPARATORS + "(?P<parameters>" + ANY_STRING + ")"
ADD_COMMAND = "(?P<type>" + TYPES + ")" + SEPARATORS + \
              "(?P<id>" + IDS + ")" + SEPARATORS + \
              "(?P<score>" + FLOATS + ")" + SEPARATORS + \
              "(?P<content>" + ANY_STRING + ")"
DEL_COMMAND = "(?P<id>" + IDS + ")"
QUERY_COMMAND = "(?P<nbr_results>" + INTS + ")" + SEPARATORS + \
                "(?P<query>" + ANY_STRING + ")"
WQUERY_COMMAND = "(?P<nbr_results>" + INTS + ")" + SEPARATORS + \
                 "(?P<nbr_boosts>" + INTS + ")" + SEPARATORS + \
                 "(?P<boosts>" + BOOSTS + ")" + \
                 "(?P<query>" + ANY_STRING + ")"
COMMAND_MATCHER = re.compile(ANY_COMMAND)
ADD_COMMAND_MATCHER = re.compile(ADD_COMMAND)
DEL_COMMAND_MATCHER = re.compile(DEL_COMMAND)
QUERY_COMMAND_MATCHER = re.compile(QUERY_COMMAND)
WQUERY_COMMAND_MATCHER = re.compile(WQUERY_COMMAND)

NOK = "{'':[]}"

class Prefixer():

    def __init__(self):
        self.__data = {}

    def __repr__(self):
        return 'Prefixer(%s)' % (self.__data,)

    def __eq__(self, other):
        return self.__data == other.__data

    def get_data(self):
        return self.__data

    def insert(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                break
            len_prefix = len(prefix)
            if prefix != key:
                # split key into prefix:suffix, move data
                suffix = key[len_prefix:]
                current_node = node[key]
                node[prefix] = {suffix: current_node}
                del node[key]
            word = word[len_prefix:]
            node = node[prefix]
        if word:
            # no node shares a prefix with the rest of word: start a fresh subtree
            node[word] = eval(NOK)
            node[word][''].append(item_id)
        else:
            # word was fully consumed: store the id at this node
            try:
                node[word].append(item_id)
            except KeyError:
                node[word] = []
                node[word].append(item_id)
        return True

    def remove(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            node = node.get(prefix, None)
            if not node:
                return False
            word = word[len(prefix):]
        try:
            node[''].remove(item_id)
            return True
        except (KeyError, ValueError):
            return False

    def _search_dico(self, word):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            if not key:
                return False
            if prefix != key:
                if prefix == word:
                    return node[key]
                else:
                    return False
            node = node[prefix]
            word = word[len(prefix):]
        return node

    def search(self, word):
        dico = self._search_dico(word)
        if dico != False:
            return self.traverse_dico(dico)
        return []

    @staticmethod
    def traverse_dico(dico):
        results = []
        for key, value in dico.iteritems():
            if key == '':
                results += value
            else:
                results += Prefixer.traverse_dico(value)
        return results

    @staticmethod
    def longest_prefix(word, candidates):
        """
        return the longest prefix match between word and any of the
        candidates, if any. Only one candidate will match.
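        For example (illustrative): longest_prefix('tea', ['test']) -> ('te', 'test'),
        while longest_prefix('xyz', ['test']) -> ('', None).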
135 | """ 136 | if word: 137 | wc = word[0] 138 | for c in candidates: 139 | if c.startswith(wc): 140 | for i in reversed(xrange(1, min(len(word), len(c))+1)): 141 | if c.startswith(word[:i]): 142 | return (word[:i], c) 143 | return ('', None) 144 | 145 | 146 | class TypeHead(object): 147 | """ 148 | typehead object that manages all items 149 | 150 | @type items: dict 151 | @param items: dict of {id : item} 152 | """ 153 | 154 | def __init__(self): 155 | self.items = {} 156 | self.prefixer = Prefixer() 157 | 158 | def _add(self, item): 159 | item_id = item.id 160 | item_content = item.content 161 | #add item to the dict 162 | self.items[item_id] = item 163 | tokens = re.split(SEPARATORS, item_content.lower()) 164 | #add tokens to the prefixer 165 | for token in tokens: 166 | self.prefixer.insert(token, item_id) 167 | 168 | def _delete(self, item_id): 169 | item_content = self.items[item_id].content 170 | #delete the item from the dict 171 | del self.items[item_id] 172 | tokens = re.split(SEPARATORS, item_content.lower()) 173 | #remove items from the prefixer for each token 174 | for token in tokens: 175 | self.prefixer.remove(token, item_id) 176 | 177 | def _set_items_query(self, query): 178 | items_ids = set() 179 | tokens = re.split(SEPARATORS, query.lower()) 180 | cpt = True 181 | for token in tokens: 182 | if cpt: 183 | items_ids = set(self.prefixer.search(token)) 184 | else: 185 | items_ids = items_ids.intersection(set(self.prefixer.search(token))) 186 | if items_ids == set(): 187 | return items_ids 188 | cpt = False 189 | return items_ids 190 | 191 | 192 | def _query(self, nbr_results, query): 193 | #collect potential items' ids 194 | items_ids = self._set_items_query(query) 195 | #check if items_ids is not empty 196 | if items_ids == set(): 197 | return "" 198 | #rank them according to the scoring method 199 | sorted_results = SortedItems(nbr_results) 200 | for item_id in items_ids: 201 | sorted_results.add(self.items[item_id]) 202 | return sorted_results 203 | 204 | def _wquery(self, nbr_results, nbr_boosts, boosts, query): 205 | nbr_boosts = int(nbr_boosts) 206 | #collect potential items' ids 207 | items_ids = self._set_items_query(query) 208 | #check if items_ids is not empty 209 | if items_ids == set(): 210 | return "" 211 | #check the boosts and create boosts_dict 212 | boosts_dict = {} 213 | if nbr_boosts > 0: 214 | boosts = boosts.split() 215 | for boost in boosts: 216 | type, score = boost.split(':') 217 | boosts_dict[type] = float(score) 218 | #rank them according to the scoring method 219 | sorted_results = SortedItems(nbr_results) 220 | for item_id in items_ids: 221 | item = copy.deepcopy(self.items[item_id]) 222 | #chech the boost 223 | if nbr_boosts > 0: 224 | if item.id in boosts_dict.keys(): 225 | item.score *= boosts_dict[item.id] 226 | if item.type in boosts_dict.keys(): 227 | item.score *= boosts_dict[item.type] 228 | sorted_results.add(item) 229 | return sorted_results 230 | 231 | def process_command(self, in_command): 232 | """ 233 | validate the current command and map it to the right function 234 | """ 235 | any_command = COMMAND_MATCHER.match(in_command) 236 | # 237 | if any_command: 238 | command = any_command.group("command") 239 | parameters = any_command.group("parameters") 240 | if (command == "ADD"): 241 | add_command = ADD_COMMAND_MATCHER.match(parameters) 242 | self._add(Item(add_command.group("type"), add_command.group("id"), 243 | add_command.group("score"), add_command.group("content"))) 244 | elif (command == "DEL"): 245 | del_command = 
DEL_COMMAND_MATCHER.match(parameters) 246 | self._delete(del_command.group("id")) 247 | elif (command == "QUERY"): 248 | query_command = QUERY_COMMAND_MATCHER.match(parameters) 249 | results = self._query(query_command.group("nbr_results"), query_command.group("query")) 250 | print results 251 | elif (command == "WQUERY"): 252 | wquery_command = WQUERY_COMMAND_MATCHER.match(parameters) 253 | results = self._wquery(wquery_command.group("nbr_results"), wquery_command.group("nbr_boosts"), 254 | wquery_command.group("boosts"), wquery_command.group("query")) 255 | print results 256 | 257 | 258 | class Item(object): 259 | """ 260 | either a topic, a user, a board or a question 261 | 262 | @type type: str 263 | @param type: The item's type. 264 | 265 | @type id: str 266 | @param id: The item's id. 267 | 268 | @type score: float 269 | @param score: The item's score. 270 | 271 | @type content: str 272 | @param contetn: The item's content. 273 | 274 | @type time: time 275 | @param time: The item's time of creation. 276 | """ 277 | 278 | def __init__(self, type, id, score, content): 279 | self.type = type 280 | self.id = id 281 | self.score = float(score) 282 | self.content = content 283 | self.time = datetime.datetime.now() 284 | 285 | def __repr__(self): 286 | return self.id 287 | 288 | def better_than(self, item): 289 | """ 290 | compare the current item to the input item. 291 | follows this method: 292 | . highest score goes first. 293 | . same score; time FIFO. 294 | return true if the current item is better than the input, otherwise returns false 295 | """ 296 | if (self.score > item.score): 297 | return True 298 | if (self.score < item.score): 299 | return False 300 | return True if (self.time > item.time) else False 301 | 302 | 303 | class SortedItems(object): 304 | """ 305 | Keeps a list of sorted elements depending on the scoring method. 306 | 307 | @type items: list 308 | @param items: the list sorted items 309 | 310 | @type max_size: int 311 | @param max_sier: the max size of the list (-1 means unlimited number of items) 312 | """ 313 | 314 | def __init__(self, max_size=-1): 315 | self.items = [] 316 | self.max_size = int(max_size) 317 | 318 | def __repr__(self): 319 | return " ".join([item.id for item in self.items]) 320 | 321 | def set_max_size(self, max_size): 322 | self.max_size = int(max_size) 323 | 324 | def add(self, item): 325 | """ 326 | add new item to the list of items. 327 | if the list is full, add the item only if it has better score than at least one item 328 | in the list, and pop the item with the worst score. 329 | """ 330 | items_l = len(self.items) 331 | pos = items_l 332 | for i in xrange(items_l): 333 | if (item.better_than(self.items[i])): 334 | pos = i 335 | break 336 | if (self.max_size < 0 or pos < self.max_size): 337 | temp = self.items[:pos] 338 | temp.append(item) 339 | temp += self.items[pos:] 340 | self.items = temp 341 | #now in the case of exceeding max_size 342 | if (self.max_size > 0 and (items_l+1)>self.max_size): 343 | self.items.pop() 344 | 345 | 346 | t = TypeHead() 347 | N = int(raw_input()) 348 | while(N): 349 | t.process_command(raw_input()) 350 | N -= 1 -------------------------------------------------------------------------------- /radix_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on Dec 01, 2012 4 | 5 | @author: Mourad Mourafiq 6 | 7 | About: This is an attempt to implement the radix tree algo. 
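A radix tree is a space-optimized trie: runs of single-child nodes are merged,
so each edge carries a whole substring rather than a single character. Here the
tree is a nested dict, and the reserved key '' at a node holds the ids stored
under that prefix.

Illustrative usage (of the Prefixer class below):
    p = Prefixer()
    p.insert('test', 1)
    p.insert('team', 2)
    p.search('te')      # -> ids [1, 2] (order may vary)
    p.remove('test', 1)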
Features :
-> insert
-> remove
-> search
'''
NOK = "{'':[]}"


class Prefixer():
    def __init__(self):
        self.__data = {}

    def __repr__(self):
        return 'Prefixer(%s)' % (self.__data,)

    def __eq__(self, other):
        return self.__data == other.__data

    def get_data(self):
        return self.__data

    def insert(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                break
            len_prefix = len(prefix)
            if prefix != key:
                # split key into prefix:suffix, move data
                suffix = key[len_prefix:]
                current_node = node[key]
                node[prefix] = {suffix: current_node}
                del node[key]
            word = word[len_prefix:]
            node = node[prefix]
        if word:
            # no node shares a prefix with the rest of word: start a fresh subtree
            node[word] = eval(NOK)
            node[word][''].append(item_id)
        else:
            # word was fully consumed: store the id at this node
            try:
                node[word].append(item_id)
            except KeyError:
                node[word] = []
                node[word].append(item_id)
        return True

    def remove(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            node = node.get(prefix, None)
            if not node:
                return False
            word = word[len(prefix):]
        try:
            node[''].remove(item_id)
            return True
        except (KeyError, ValueError):
            return False

    def _search_dico(self, word):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            if not key:
                return False
            if prefix != key:
                if prefix == word:
                    return node[key]
                else:
                    return False
            node = node[prefix]
            word = word[len(prefix):]
        return node

    def search(self, word):
        dico = self._search_dico(word)
        if dico != False:
            return self.traverse_dico(dico)
        return []

    @staticmethod
    def traverse_dico(dico):
        results = []
        for key, value in dico.iteritems():
            if key == '':
                results += value
            else:
                results += Prefixer.traverse_dico(value)
        return results

    @staticmethod
    def longest_prefix(word, candidates):
        """
        return the longest prefix match between word and any of the
        candidates, if any. Only one candidate will match.
109 | """ 110 | if word: 111 | wc = word[0] 112 | for c in candidates: 113 | if c.startswith(wc): 114 | for i in reversed(xrange(1, min(len(word), len(c)) + 1)): 115 | if c.startswith(word[:i]): 116 | return (word[:i], c) 117 | return ('', None) 118 | -------------------------------------------------------------------------------- /recommendation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: Recommendations 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | from __future__ import division 8 | import multiprocessing 9 | import collections 10 | from map_reduce import MapReduce 11 | from similarities.correlation import pearson_sim 12 | from similarities.euclidean import euclidean_sim 13 | 14 | # A dictionary of movie critics and their ratings of a small 15 | # set of movies 16 | def loadMovieLens(path='movielens'): 17 | # Get movie titles 18 | movies = {} 19 | for line in open(path + '/u.item'): 20 | (id, title) = line.split('|')[0:2] 21 | movies[id] = title 22 | # Load data 23 | prefs = collections.defaultdict(dict) 24 | for line in open(path + '/u.data'): 25 | (user, movieid, rating, ts) = line.split('\t') 26 | prefs[user][movies[movieid]] = float(rating) 27 | return prefs 28 | 29 | 30 | critics = loadMovieLens() 31 | 32 | 33 | def transform_items(items): 34 | result = collections.defaultdict(dict) 35 | for x in items: 36 | for y in items[x]: 37 | # Flip item and person 38 | result[y][x] = items[x][y] 39 | return result 40 | 41 | 42 | def top_similars_map(data): 43 | """ 44 | map for top similars 45 | """ 46 | items, x, i, similarity = data 47 | l = len(items) 48 | y_items = items.keys()[i * (int(round(l / 4))):(i + 1) * (int(round(l / 4)))] 49 | print multiprocessing.current_process().name, 'processing ', x, i 50 | return [(similarity(items, x, y), y) for y in y_items if y != x] 51 | 52 | 53 | def top_similars_reduce(data): 54 | """ 55 | reduce for top similars 56 | """ 57 | sim, item = data 58 | return (sim, item) 59 | 60 | 61 | def top_similars_mapreduce(items, x, n=5, similarity=pearson_sim): 62 | """ 63 | Returns the best matches for x from the items. 64 | Number of results and similarity function are optional params. 65 | """ 66 | mapper = MapReduce(top_similars_map, top_similars_reduce) 67 | scores = mapper([(items, x, i, similarity) for i in range(4)]) 68 | # Sort the list so the highest scores appear at the top 69 | scores.sort() 70 | scores.reverse() 71 | return scores[:n] 72 | 73 | 74 | def top_similars(items, x, n=5, similarity=pearson_sim): 75 | """ 76 | Returns the best matches for x from the items. 77 | Number of results and similarity function are optional params. 
78 | """ 79 | scores = [(similarity(items, x, y, cache=True), y) for y in items.keys() if y != x] 80 | # Sort the list so the highest scores appear at the top 81 | scores.sort() 82 | scores.reverse() 83 | return scores[:n] 84 | 85 | 86 | def similar_items(items, n=5, similarity=euclidean_sim, top_similars=top_similars): 87 | """ 88 | Returns a dictionary of top n similar items for each item 89 | """ 90 | similar_items_output = collections.defaultdict(dict) 91 | cpt = 0 92 | for item in items: 93 | cpt += 1 94 | if cpt % 100 == 0: print "%d / %d" % (cpt, len(items)) 95 | similars = top_similars(items=items, x=item, n=n, similarity=similarity) 96 | similar_items_output[item] = similars 97 | return similar_items_output 98 | 99 | 100 | def get_recommendations_user_filtred_map(data): 101 | """ 102 | map for the get_recommendations_user_filter function 103 | """ 104 | items, x, i, similarity = data 105 | l = len(items) 106 | y_items = items.keys()[i * (int(round(l / 4))):(i + 1) * (int(round(l / 4)))] 107 | print multiprocessing.current_process().name, 'processing ', x, i 108 | output = [] 109 | for y in y_items: 110 | if x == y: continue 111 | sim = similarity(items, x, y, cache=True) 112 | if sim <= 0: continue 113 | for item, score in items[y].items(): 114 | if item in items[x] and items[x][item] > 0: continue # ignore items x already interacted with 115 | output.append((item, (sim, score * sim))) 116 | return output 117 | 118 | 119 | def get_recommendations_user_filtred_reduce(data): 120 | """ 121 | reduce for the get_recommendations_user_filtred function 122 | """ 123 | item, scores = data 124 | ssim = 0 125 | ssim_x_score = 0 126 | for sim, sim_x_score in scores: 127 | ssim += sim 128 | ssim_x_score += sim_x_score 129 | return (item, ssim, ssim_x_score) 130 | 131 | 132 | def get_recommendations_user_filtred_mapreduce(items, x, n=5, similarity=pearson_sim): 133 | """ 134 | Returns recommendationx for x from the items, based on items from similar users 135 | """ 136 | mapper = MapReduce(get_recommendations_user_filtred_map, get_recommendations_user_filtred_reduce) 137 | scores = mapper([(items, x, i, similarity) for i in range(4)]) 138 | # Divide each total score by total weighting to get an average 139 | rankings = [(sim_x_score / sim, item) for (item, sim, sim_x_score) in scores] 140 | rankings.sort() 141 | rankings.reverse() 142 | return rankings[:n] 143 | 144 | 145 | def get_recommendations_user_filtred(items, x, n=5, similarity=pearson_sim): 146 | """ 147 | Returns recommendationx for x from the items, based on items from similar users 148 | """ 149 | similarities_sum = collections.defaultdict(int) 150 | sum_prod_sim_score = collections.defaultdict(int) 151 | for y in items.keys(): 152 | if x == y: continue # don't compare x with itself 153 | sim = similarity(items, x, y) 154 | if sim <= 0: continue # ignore similarities belew or equal 0 155 | for item, score in items[y].items(): 156 | if item in items[x] and items[x][item] > 0: continue # ignore items x already interacted with 157 | similarities_sum[item] += sim 158 | sum_prod_sim_score[item] += score * sim 159 | # Divide each total score by total weighting to get an average 160 | rankings = [(score / similarities_sum[item], item) for item, score in sum_prod_sim_score.items()] 161 | rankings.sort() 162 | rankings.reverse() 163 | return rankings[:n] 164 | 165 | 166 | def get_recommendations_item_filtred(items, similarity_matrix, x, n=5): 167 | """ 168 | Returns recommendations for x from items, based on items similar to user's items 169 
| """ 170 | similarities_sum = collections.defaultdict(int) 171 | sum_prod_sim_score = collections.defaultdict(int) 172 | for item, score in items[x].items(): # loop over item from x 173 | for (sim, sim_item) in similarity_matrix[item]: # loop over similar items to item 174 | if sim_item in items[x]: continue 175 | # Weighted sum of scores times similarity 176 | similarities_sum[sim_item] += sim 177 | sum_prod_sim_score[sim_item] += sim * score 178 | # Divide each total score by total weighting to get an average 179 | rankings = [(score / similarities_sum[item], item) for item, score in sum_prod_sim_score.items()] 180 | rankings.sort() 181 | rankings.reverse() 182 | return rankings[:n] 183 | 184 | 185 | def test_euclidean(): 186 | # people 187 | print 'user euclidean similarity' 188 | print euclidean_sim(critics, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 189 | print 'user top similarities' 190 | print top_similars(items=critics, x='99', similarity=euclidean_sim) 191 | print 'user recommendations' 192 | print get_recommendations_user_filtred(items=critics, x='99', similarity=euclidean_sim) 193 | # movies 194 | movies = transform_items(critics) 195 | print 'movies euclidean similarity' 196 | print euclidean_sim(movies, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 197 | print 'movies top similarities' 198 | print top_similars(items=movies, x='Twelve Monkeys (1995)', similarity=euclidean_sim) 199 | print 'movies recommendations' 200 | print get_recommendations_user_filtred(items=movies, x='Twelve Monkeys (1995)', similarity=euclidean_sim) 201 | print 'similar items' 202 | similarity_matrix = similar_items(items=movies, similarity=euclidean_sim) 203 | print get_recommendations_item_filtred(items=critics, similarity_matrix=similarity_matrix, x='99') 204 | 205 | 206 | def test_pearson(): 207 | # people 208 | print 'pearson sim' 209 | print pearson_sim(critics, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 210 | print 'user top sim' 211 | print top_similars(items=critics, x='99') 212 | 213 | print 'user recommendations' 214 | print get_recommendations_user_filtred(items=critics, x='99') 215 | # movies 216 | movies = transform_items(critics) 217 | print 'movies pearson sim' 218 | print pearson_sim(movies, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 219 | print 'movies top similarities' 220 | print top_similars(items=movies, x='Twelve Monkeys (1995)') 221 | print 'movies recommendations' 222 | print get_recommendations_user_filtred(items=movies, x='Twelve Monkeys (1995)') 223 | print 'similar items' 224 | similarity_matrix = similar_items(items=movies, similarity=pearson_sim) 225 | print get_recommendations_item_filtred(items=critics, similarity_matrix=similarity_matrix, x='99') 226 | -------------------------------------------------------------------------------- /shingles_minhash.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: shingling minhashing 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | 8 | from __future__ import division 9 | from math import * 10 | 11 | # example of stop words 12 | STOP_WORDS = set(('i', 'you', 'they', 'the', 'no', 'none', 'all', 'a', 'for', 'not', 'nor')) 13 | # example of hash function 14 | HASH_FCT_EX = lambda val: (2 * val + 4) % 5 15 | 16 | 17 | def k_shingles(string, k=2, use_stop_words=False, verbose=False): 18 | """ 19 | 
Return the set of k-shingles of the current text
    """
    shingles = []
    string_len = len(string)
    for i in range(0, string_len - k + 1):
        shingles.append(string[i:i + k])
    k_sh = set(shingles)
    if use_stop_words:
        k_sh = k_sh - STOP_WORDS
    if verbose:
        print "All possible shingles 27^%s = %s" % (k, 27 ** k)
        print "%s-shingles for %s : %s" % (k, string, k_sh)
    return k_sh


def charateristic_matrix(list_sets, verbose=False):
    """
    Return the characteristic matrix for the current list of sets
    """
    nbr_columns = len(list_sets)
    # constructing the elements based on the union of sets
    elements = set()
    for i in list_sets:
        elements = elements | i
    elements = sorted(list(elements))
    nbr_rows = len(elements)
    char_matrix = []
    # initialising the characteristic matrix
    for i in range(0, nbr_rows):
        char_matrix.append([0] * nbr_columns)
    # constructing the characteristic matrix
    for e in range(0, nbr_rows):
        for s in range(0, nbr_columns):
            char_matrix[e][s] = 1 if elements[e] in list_sets[s] else 0

    if verbose: print char_matrix
    return char_matrix


def signature_vector(characteristic_matrix, hash_fct, verbose=False):
    """
    Compute the minhash signature for the current characteristic matrix, with the hash_fct hash function
    """
    nbr_columns = len(characteristic_matrix[0])
    nbr_rows = len(characteristic_matrix)
    signature = [-1] * nbr_columns
    for r in range(0, nbr_rows):
        hash_value = hash_fct(r)
        for c in range(0, nbr_columns):
            if characteristic_matrix[r][c] == 1:
                # the row r has 1 in the column c, so it is potentially subject to change
                if signature[c] > hash_value or signature[c] < 0:
                    signature[c] = hash_value
    if verbose: print signature
    return signature


def and_or_construction(p, r, b, and_first=True):
    """
    Probability that a pair with similarity p becomes a candidate, for families of
    functions defined from the minhash functions (the threshold sits near the point
    of maximum slope of this S-curve):
    if and_first = True : an r-way AND construction followed by a b-way OR construction,
        i.e. 1 - (1 - p**r)**b
    else : a b-way OR construction followed by an r-way AND construction,
        i.e. (1 - (1 - p)**b)**r
    """
    return 1 - (1 - p ** r) ** b if and_first else (1 - (1 - p) ** b) ** r


def and_or_s_curve(r, b, and_first=True):
    """
    For a given r and b : generates the s-curve (p from 0.2 to 1.0 in steps of 0.1)
    """
    p = 0.1
    for i in range(0, 9):
        p += 0.1
        print and_or_construction(p, r, b, and_first)


def bloom_filter(n, m, k):
    """
    A Bloom filter consists of:
    1. An array of n bits, initially all 0s.
    2. A collection of hash functions h1, h2, . . . , hk. Each hash function maps
    "key" values to n buckets, corresponding to the n bits of the bit-array.
    3. A set S of m key values.
    The purpose of the Bloom filter is to allow through all stream elements whose
    keys are in S, while rejecting most of the stream elements whose keys are not
    in S.
    To initialize the bit array, begin with all bits 0. Take each key value in S
    and hash it using each of the k hash functions. Set to 1 each bit that is hi(K)
    for some hash function hi and some key value K in S.
    To test a key K that arrives in the stream, check that all of
    h1(K), h2(K), . . . , hk(K)
    are 1s in the bit-array. If all are 1s, then let the stream element through.
If 113 | one or more of these bits are 0, then K could not be in S, so reject the stream 114 | element. 115 | """ 116 | # probability that a bit remains 0 117 | a = long(k * m) 118 | b = float(a) / float(n) 119 | prob_bit_0 = exp(- b) 120 | #the probability of a false positive is the probability of a 1 bit 121 | prob_false_positive = (1 - prob_bit_0) ** k 122 | print prob_false_positive 123 | -------------------------------------------------------------------------------- /similarities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmourafiq/data-analysis/1df2ca020a554f1fdab7cc9e53115e249cc199ac/similarities/__init__.py -------------------------------------------------------------------------------- /similarities/correlation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: Recommendations 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | from __future__ import division 8 | from math import sqrt 9 | 10 | PEARSON_SIMILARITY_CACHE = {} 11 | 12 | 13 | def get_commun_items(x, y): 14 | """ 15 | Returns the commun items between x and y 16 | """ 17 | return [i for i in x.keys() if i in y.keys()] 18 | 19 | 20 | def pearson_correlation(x, y, commun_items): 21 | """ 22 | The population correlation coefficient corr(x,y) between x and y with expected 23 | values m(x) and m(y) and standard deviations std(x) and std(y) is defined as: 24 | corr(x,y) = cov(x, y) / (std(x) * std(y)) = E((x - m(x))(y - m(y))) / (std(x) * std(y)) 25 | 26 | Returns the pearson correlation for x and y for a given list of commun items 27 | """ 28 | # Find the number of elements 29 | n = len(commun_items) 30 | # if they are no ratings in common, return 0 31 | if n == 0: return 0 32 | # Add up all the preferences 33 | sumX = sum([x[i] for i in commun_items]) 34 | sumY = sum([y[i] for i in commun_items]) 35 | # Sum up the squares 36 | sumX2 = sum([pow(x[i], 2) for i in commun_items]) 37 | sumY2 = sum([pow(y[i], 2) for i in commun_items]) 38 | # Sum up the products 39 | prodSum = sum([x[i] * y[i] for i in commun_items]) 40 | # Calculate Pearson score 41 | num = prodSum - (sumX * sumY / n) 42 | den = sqrt((sumX2 - pow(sumX, 2) / n) * (sumY2 - pow(sumY, 2) / n)) 43 | if den == 0: return 0 44 | r = num / den 45 | return r 46 | 47 | 48 | def pearson_sim(items, x, y, cache=False): 49 | """ 50 | Returns the similarity between x and y based on the pearson correaltion 51 | """ 52 | if cache: 53 | if (x, y) in PEARSON_SIMILARITY_CACHE: 54 | return PEARSON_SIMILARITY_CACHE[(x, y)] 55 | i_x = items[x] 56 | i_y = items[y] 57 | sim = pearson_correlation(i_x, i_y, get_commun_items(i_x, i_y)) 58 | PEARSON_SIMILARITY_CACHE[(x, y)] = sim 59 | PEARSON_SIMILARITY_CACHE[(y, x)] = sim 60 | return sim 61 | i_x = items[x] 62 | i_y = items[y] 63 | return pearson_correlation(i_x, i_y, get_commun_items(i_x, i_y)) 64 | 65 | 66 | def mean(input_array): 67 | for i in range(0, (len(input_array) - 1)): 68 | input_array[i] = float(input_array[i]) 69 | total_sum = 0.00 70 | for value in input_array: 71 | total_sum = total_sum + value 72 | return float(total_sum / len(input_array)) 73 | 74 | 75 | def standard_deviation(input_array): 76 | mu = mean(input_array) 77 | variance_numerator = 0.00 78 | for val in input_array: 79 | variance_numerator = 
variance_numerator + (val - mu) ** 2 # Sigma((x-mu)^2) 80 | variance = variance_numerator / len(input_array) 81 | return sqrt(variance) 82 | 83 | 84 | def covariance(x_array, y_array): 85 | if len(x_array) != len(y_array): 86 | return False 87 | x_mu = mean(x_array) 88 | y_mu = mean(y_array) 89 | covariance_numerator = 0.00 90 | for i in range(len(x_array)): 91 | covariance_numerator = covariance_numerator + (x_array[i] - x_mu) * (y_array[i] - y_mu) 92 | return covariance_numerator / len(x_array) 93 | 94 | 95 | def correlation(x_array, y_array): 96 | if covariance(x_array, y_array): 97 | return covariance(x_array, y_array) / ((standard_deviation(x_array)) * (standard_deviation(y_array))) 98 | else: 99 | return False 100 | 101 | 102 | X = [1, 2, 2, 4] 103 | Y = [2, 4, 6, 8] 104 | print correlation(X, Y) 105 | 106 | -------------------------------------------------------------------------------- /similarities/euclidean.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: Recommendations 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | from math import sqrt 9 | 10 | EUCLIDEAN_SIMILARITY_CACHE = {} 11 | 12 | 13 | def get_commun_items(x, y): 14 | """ 15 | Returns the commun items between x and y 16 | """ 17 | return [i for i in x.keys() if i in y.keys()] 18 | 19 | 20 | def euclidean_dis(x, y, commun_items): 21 | """ 22 | Returns the euclidean distance between x and y for a given list of commun items 23 | """ 24 | return sqrt(sum([pow(x[i] - y[i], 2) for i in commun_items])) 25 | 26 | 27 | def euclidean_sim(items, x, y, cache=False): 28 | """ 29 | Returns the euclidean similarity between x and y. 
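    The similarity is 1 / (1 + d), where d is the euclidean distance over the
    items x and y have in common, so identical profiles score 1.0.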
30 | """ 31 | if cache: 32 | if (x, y) in EUCLIDEAN_SIMILARITY_CACHE: 33 | return EUCLIDEAN_SIMILARITY_CACHE[(x, y)] 34 | i_x = items[x] 35 | i_y = items[y] 36 | sim = 1 / (1 + euclidean_dis(i_x, i_y, get_commun_items(i_x, i_y))) 37 | EUCLIDEAN_SIMILARITY_CACHE[(x, y)] = sim 38 | EUCLIDEAN_SIMILARITY_CACHE[(y, x)] = sim 39 | return sim 40 | i_x = items[x] 41 | i_y = items[y] 42 | return 1 / (1 + euclidean_dis(i_x, i_y, get_commun_items(i_x, i_y))) 43 | -------------------------------------------------------------------------------- /similarities/jaccard_similarity.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: jaccard similarity 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | 9 | EX_TUP_1 = ('a', 'a', 'a', 'b') 10 | EX_TUP_1 = ('a', 'a', 'b', 'b', 'c') 11 | 12 | 13 | def jaccard_sim(tup_1, tup_2, verbose=False): 14 | """ 15 | calculate the jaccard similiarity of 2 tuples 16 | """ 17 | sum = len(tup_1) + len(tup_2) 18 | set_1 = set(tup_1) 19 | set_2 = set(tup_2) 20 | inter = 0 21 | for i in (set_1 & set_2): 22 | count_1 = tup_1.count(i) 23 | count_2 = tup_2.count(i) 24 | inter += count_1 if count_1 < count_2 else count_2 25 | j_sim = inter / sum 26 | if verbose: print j_sim 27 | return j_sim 28 | 29 | 30 | def jaccard_distance(tup_1, tup_2): 31 | """ 32 | Calculate the jaccard distance 33 | """ 34 | return 1 - jaccard_sim(tup_1, tup_2) 35 | 36 | 37 | def jaccard_conditional_comparaison(tup, list_tups, min_jaccard_sim, verbose=False): 38 | """ 39 | Suppose that "s" is a string of length "ls", and we are looking for 40 | strings with at least "sim" Jaccard similarity. 41 | To be sure that we do not have to compare "s" with "t", we must be certain that "sim" > ("ls" ? "p")/"ls". That 42 | is, "p" must be at least [(1 ? "sim")"ls"] + 1. Of course we want "p" to be as small as 43 | possible, so we do not index string s in more buckets than we need to. Thus, 44 | we shall hereafter take "p" = [(1 ? "sim")"ls"+ 1 to be the length of the prefix that 45 | gets indexed. 46 | P.S : "p" being the prefix of potential strings to be compared to "s" 47 | Case 1: p ? q. Here, the maximum size of the intersection is 48 | Ls ? i + 1 ? (p ? q) 49 | Since Ls = i + p, we can write the above expression for the intersection size as 50 | q + 1. The minimum size of the union is Ls + j ? 1, as it was when we did not 51 | take suffix length into account. Thus, we require 52 | (q + 1) /(Ls + j ? 1) ? J whenever p ? q. 53 | Case 2: p < q. Here, the maximum size of the intersection is Ls ? i + 1, as 54 | when suffix length was not considered. However, the minimum size of the union 55 | is now Ls + j ? 1 + q ? p. If we again use the relationship Ls = i + p, we can 56 | replace Ls ? p by i and get the formula i + j ? 1 + q for the size of the union. 57 | If the Jaccard similarity is at least J, then 58 | (Ls ? i + 1) / (i + j ? 1 + q) ? J 59 | whenever p < q. 
"""
    tup_length = len(tup)
    pre = int(((1 - min_jaccard_sim) * tup_length) + 1)
    max_length = int(tup_length / min_jaccard_sim)
    min_length = tup_length - pre
    potential_tups = []
    for t in list_tups:
        t_length = len(t)
        # first we check the current tup length
        if t_length >= min_length and t_length <= max_length:
            # second we loop over all possible values for i & j
            matched = False
            for i in range(0, pre):
                if matched:
                    break
                for j in range(0, pre):
                    p = tup_length - i
                    q = t_length - j
                    if (p >= q and ((q + 1) / (tup_length + j - 1)) >= min_jaccard_sim) or (
                            p < q and ((tup_length - i + 1) / (i + j - 1 + q)) >= min_jaccard_sim):
                        # t can reach the threshold for some prefix positions: keep it once
                        potential_tups.append(t)
                        matched = True
                        break
    if verbose: print potential_tups
    return potential_tups
--------------------------------------------------------------------------------
/similarities/tanimoto.py:
--------------------------------------------------------------------------------
'''
Created on Aug 22, 2012

@author: mourad mourafiq
'''

TANIMOTO_SIMILARITY_CACHE = {}


def tanimoto_sim(items, x, y, cache=False):
    """
    Returns the similarity between x and y based on the tanimoto score:
    the number of positions where both vectors are non-zero, divided by the
    number of positions where at least one of them is non-zero.
    """
    c1, c2, shr = 0, 0, 0

    for i in range(len(x)):
        if x[i] != 0: c1 += 1  # in v1
        if y[i] != 0: c2 += 1  # in v2
        if x[i] != 0 and y[i] != 0: shr += 1  # in both

    # the tanimoto score; 1.0 minus this value would be the tanimoto distance
    return float(shr) / (c1 + c2 - shr)
--------------------------------------------------------------------------------