├── README.md
├── adwords.py
├── algorithmics
│   ├── candies.py
│   ├── chg_bits.py
│   ├── cryptarithmetic.py
│   ├── diff-div.py
│   ├── flowers.py
│   ├── forest_slicing.py
│   ├── grammer-parser.py
│   ├── hanoi_tower.py
│   ├── inverse_function.py
│   ├── iter_circle_sum.py
│   ├── licence
│   ├── media.py
│   ├── n_c_p.py
│   ├── numpy_circle_sum.py
│   ├── palindromes.py
│   ├── pooring_water.py
│   ├── recur_circle_sum.py
│   ├── string_red.py
│   ├── suffixes.py
│   └── unfriendly.py
├── classification
│   ├── decision_trees.py
│   └── knn.py
├── decorators.py
├── dijkstra.py
├── filters
│   ├── __init__.py
│   └── utils.py
├── frequency.py
├── graph_analysis.py
├── licence
├── map_reduce
│   ├── README.md
│   ├── item_frequency.py
│   ├── map_reduce.py
│   └── pi_estimation.py
├── movielens
│   ├── u.item
│   └── u.user
├── page_rank
│   ├── README.md
│   ├── page_rank.py
│   └── page_rank_numpy.py
├── quora
│   ├── answer_classifier.py
│   ├── datacenter_c.py
│   ├── dcc.c
│   ├── feed_optimizer.py
│   ├── nearby.py
│   ├── results.txt
│   ├── test.txt
│   └── typehead.py
├── radix_tree.py
├── recommendation.py
├── shingles_minhash.py
└── similarities
    ├── __init__.py
    ├── correlation.py
    ├── euclidean.py
    ├── jaccard_similarity.py
    └── tanimoto.py

/README.md:
--------------------------------------------------------------------------------
 1 | math and data analysis functions
 2 | ================================
 3 | 
 4 | *shingling*
 5 | - k-shingles generation
 6 | - minhashing
 7 | 
 8 | *jaccard similarity*
 9 | - jaccard similarity calculation
10 | - jaccard distance calculation
11 | - jaccard conditional comparison
12 | 
13 | *adwords problem*
14 | - greedy_adwords
15 | - balance_adwords
16 | - generalized_balance_adwords
17 | 
18 | *frequency problem*
19 | - items frequency
20 | - the algorithm of Savasere, Omiecinski and Navathe
21 | 
22 | *graph problem*
23 | - graph construction
24 | - shortest_path
25 | - longest path
26 | - centrality
27 | - independent graphs detection
28 | - clustering_coef
29 | - dijkstra
30 | - dijkstra with heap
31 | 
32 | *recommendation problem*
33 | - hamming distance
34 | - euclidean distance
35 | - pearson correlation
36 | - tanimoto score
37 | - euclidean similarity
38 | - pearson similarity
39 | - tanimoto similarity
40 | - top similars
41 | - top similar with map reduce
42 | - recommendation user filtered
43 | - recommendation item filtered
44 | 
45 | *Radix tree*
46 | - insert
47 | - remove
48 | - search
49 | - longest prefix
50 | 
51 | *Decision tree*
52 | - Divide data
53 | - Gini impurity
54 | - Entropy
55 | - Variance
56 | - Build tree
57 | - Prune
58 | - Classify
59 | - Draw tree
60 | 
61 | *Page Rank*
62 | 
63 | A very simple version/implementation of the page rank algorithm.
64 | - Page rank
65 | - Advanced version of page rank, topic sensitive
66 | - spam farms
67 | - trust rank
68 | - Hyperlink induced topic search
69 | - Map reduce to efficiently calculate the page rank
70 | - Jaccard similarity, to be found in the data analysis repo
71 | 
72 | *Map-Reduce*
73 | 
74 | Implementation of map reduce, and some examples.
75 | - Map Reduce class
76 | - Estimation of pi
77 | - Calculation of item frequency from multiple files
78 | 
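A minimal sketch of the Jaccard similarity listed above (cf. similarities/jaccard_similarity.py in the tree; this standalone form is illustrative only, not the repo's implementation):

    def jaccard_similarity(a, b):
        # |A intersection B| / |A union B| over two sets
        a, b = set(a), set(b)
        return len(a & b) / float(len(a | b))

    # jaccard_similarity('abc', 'abd') -> 0.5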
--------------------------------------------------------------------------------
/adwords.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------
 2 | # Name: adwords
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | import collections
 8 | from math import exp
 9 | 
10 | 
11 | def bid_for_bider(bider, bids, item):
12 |     """
13 |     Returns the bid the bidder put on the item
14 |     """
15 |     return bids[item] if item in bids.keys() else 0
16 | 
17 | 
18 | def fraction_for_bider(bider, remaining_budget, initial_budget, bids, item):
19 |     """
20 |     return the bid times 1 - e^-(fraction of remaining budget)
21 |     """
22 |     return bids[item] * (1 - exp(-(float(remaining_budget) / initial_budget))) if item in bids.keys() else 0  # float() avoids Python 2 integer division
23 | 
24 | 
25 | def sort_biders(biders, bids=None, item=None, by_budget=False, by_bid=False, by_fraction=False):
26 |     """
27 |     sort bidders by budget, by bid on the item, or by generalized-balance fraction
28 |     """
29 |     result = []
30 |     if by_budget and not by_bid:
31 |         return sorted(biders, key=lambda x: x[1], reverse=True)
32 |     if by_bid and item is not None and not by_budget:
33 |         return sorted(biders, key=lambda (x, y, z): bid_for_bider(x, bids[x], item), reverse=True)
34 |     if by_fraction:
35 |         return sorted(biders, key=lambda (x, y, z): fraction_for_bider(x, y, z, bids[x], item), reverse=True)
36 |     return biders
37 | 
38 | 
39 | def greedy_adwords(biders, bids, items):
40 |     """
41 |     greedy algorithms make their decision in response to each input element by maximizing some
42 |     function of the input element and the past.
43 |     all click through rates are the same
44 |     # bidder structure: (bidder, remaining budget, initial budget)
45 |     # bids structure per bidder: {item: value, ...}
46 |     """
47 |     result = []
48 |     for item in items:
49 |         biders = sort_biders(biders, bids, item=item, by_bid=True)
50 |         for b in range(len(biders)):
51 |             bider, remaining_budget, initial_budget = biders[b]
52 |             if item in bids[bider].keys() and remaining_budget >= bids[bider][item]:
53 |                 result.append((item, bider))
54 |                 biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget)
55 |                 break
56 |     return result
57 | 
58 | 
59 | def balance_adwords(biders, bids, items):
60 |     """
61 |     assigns a query to the advertiser who bids on the query and
62 |     has the largest remaining budget. Ties may be broken arbitrarily.
63 |     # bidder structure: (bidder, remaining budget, initial budget)
64 |     # bids structure per bidder: {item: value, ...}
65 |     """
66 |     result = []
67 |     for item in items:
68 |         biders = sort_biders(biders, by_budget=True)
69 |         for b in range(len(biders)):
70 |             bider, remaining_budget, initial_budget = biders[b]
71 |             if item in bids[bider].keys() and remaining_budget >= bids[bider][item]:
72 |                 result.append((item, bider))
73 |                 biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget)
74 |                 break
75 |     return result
76 | 
77 | 
78 | def generalized_balance_adwords(biders, bids, items):
79 |     """
80 |     differs from the balance algorithm in two ways:
81 |     it biases the choice of the bidder in favor of the one with the higher bid,
82 |     and it is less absolute about the remaining budget; rather, it considers the
83 |     fraction of the remaining budget
84 |     # bidder structure: (bidder, remaining budget, initial budget)
85 |     # bids structure per bidder: {item: value, ...}
86 |     """
87 |     result = []
88 |     for item in items:
89 |         biders = sort_biders(biders, bids, item=item, by_fraction=True)
90 |         for b in range(len(biders)):
91 |             bider, remaining_budget, initial_budget = biders[b]
92 |             if item in bids[bider].keys() and remaining_budget >= bids[bider][item]:
93 |                 result.append((item, bider))
94 |                 biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget)
95 |                 break
96 |     return result
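
# A quick numeric check of the generalized-balance score used above (values
# rounded): fraction_for_bider computes bid * (1 - e^-(remaining/initial)), so
#   a bid of 5 with a full budget left scores  5 * (1 - e**-1.0) ~= 3.16
#   the same bid with half the budget left     5 * (1 - e**-0.5) ~= 1.97
# i.e. higher bids are favored, but only while the bidder still has budget.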
85 | """ 86 | result = [] 87 | for item in items: 88 | biders = sort_biders(biders, bids, item=item, by_fraction=True) 89 | for b in range(len(biders)): 90 | bider, remaining_budget, initial_budget = biders[b] 91 | if item in bids[bider].keys() and remaining_budget >= bids[bider][item]: 92 | result.append((item, bider)) 93 | biders[b] = (bider, remaining_budget - bids[bider][item], initial_budget) 94 | break 95 | return result 96 | 97 | 98 | def test_greedy(): 99 | biders = (("m", 30, 30), ("l", 10, 10), ("k", 25, 25), ("p", 20, 20)) 100 | bids = {"m": {'a': 2, 'b': 3, 'd': 1}, "l": {"c": 1, 'a': 5}, "p": {}, "k": {'b': 5, 'c': 2, 'd': 1}} 101 | items = tuple(('a', 'b', 'd', 'a', 'a')) 102 | print greedy_adwords(biders=biders, bids=bids, items=items) 103 | 104 | 105 | def test_balance(): 106 | biders = (("m", 30, 30), ("l", 10, 10), ("k", 25, 25), ("p", 20, 20)) 107 | bids = {"m": {'a': 2, 'b': 3, 'd': 1}, "l": {"c": 1, 'a': 5}, "p": {}, "k": {'b': 5, 'c': 2, 'd': 1}} 108 | items = tuple(('a', 'b', 'd', 'a', 'a')) 109 | print balance_adwords(biders=biders, bids=bids, items=items) 110 | 111 | 112 | def test_ageneralized_balance(): 113 | biders = (("m", 30, 30), ("l", 10, 10), ("k", 25, 25), ("p", 20, 20)) 114 | bids = {"m": {'a': 2, 'b': 3, 'd': 1}, "l": {"c": 1, 'a': 5}, "p": {}, "k": {'b': 5, 'c': 2, 'd': 1}} 115 | items = tuple(('a', 'b', 'd', 'a', 'a')) 116 | print ageneralized_balance_adwords(biders=biders, bids=bids, items=items) 117 | 118 | 119 | if __name__ == '__main__': 120 | test_greedy() 121 | test_balance() 122 | test_ageneralized_balance() 123 | -------------------------------------------------------------------------------- /algorithmics/candies.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | def candies(): 5 | N = int(raw_input()) 6 | next = collections.defaultdict(int) 7 | to_update = collections.defaultdict(int) 8 | 9 | def update(prev, current): 10 | if current in to_update: 11 | for i in range(to_update[current], current): 12 | next[i] += 1 13 | else: 14 | next[prev] += 1 15 | if prev >= 1 and next[prev] == next[prev - 1] and f[prev] < f[prev - 1]: 16 | update(prev - 1, current) 17 | else: 18 | to_update[current] = prev 19 | 20 | f = [] 21 | for i in range(N): 22 | current = int(raw_input()) 23 | f.append(current) 24 | if i == 0: 25 | next[i] = 1 26 | continue 27 | prev = f[i - 1] 28 | if current <= prev: 29 | next[i] = 1 30 | if next[i - 1] == 1: 31 | if current < prev: update(i - 1, i) 32 | # else: next[i] += 1 33 | elif current > prev: 34 | next[i] = next[i - 1] + 1 35 | 36 | print sum(next.values()) 37 | 38 | 39 | candies() 40 | -------------------------------------------------------------------------------- /algorithmics/chg_bits.py: -------------------------------------------------------------------------------- 1 | # setBit() returns an integer with the bit at 'offset' set to 1. 2 | def setBit(int_type, offset): 3 | mask = 1 << offset 4 | return (int_type | mask) 5 | 6 | 7 | # clearBit() returns an integer with the bit at 'offset' cleared. 
8 | def clearBit(int_type, offset): 9 | mask = ~(1 << offset) 10 | return (int_type & mask) 11 | 12 | 13 | def set_bit(ind, val, int_type): 14 | return clearBit(int_type, ind) if val == "0" else setBit(int_type, ind) 15 | 16 | 17 | def get_c(inx, A, B, n): 18 | res = A + B 19 | mask = 1 << inx 20 | return '1' if (res & mask) else '0' 21 | 22 | 23 | def chg_bit(): 24 | N, Q = [int(x) for x in raw_input().split()] 25 | A = int(raw_input(), 2) 26 | B = int(raw_input(), 2) 27 | result = "" 28 | while Q > 0: 29 | q = [x for x in raw_input().split()] 30 | if len(q) == 2: # get operation 31 | result += get_c(int(q[1]), A, B, N + 1) 32 | elif q[0] == "set_a": 33 | A = set_bit(int(q[1]), q[2], A) 34 | else: 35 | B = set_bit(int(q[1]), q[2], B) 36 | Q -= 1 37 | print result 38 | 39 | 40 | chg_bit() 41 | -------------------------------------------------------------------------------- /algorithmics/cryptarithmetic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: cryptarithmetic 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | import string, re, itertools 8 | import time 9 | 10 | examples = """TWO + TWO == FOUR 11 | A**2 + B**2 == C**2 12 | A**2 + BE**2 == BY**2 13 | A**2 + BY**2 == BE**2 14 | X / X == X 15 | X / X == 1 16 | A**N + B**N == C**N and N > 1 17 | ATOM**0.5 == A + TO + M 18 | GLITTER is not GOLD 19 | ONE < TWO and FOUR < FIVE 20 | ONE < TWO < THREE 21 | RAMN == R**3+ RM**3 == N**3 + RX**3 22 | sum(range(AA)) == BB 23 | sum(range(POP)) == BOBO 24 | ODD + ODD == EVEN 25 | PLUTO is not set([PLANETS]) """.splitlines() 26 | 27 | 28 | def solve(formula, verbose=False): 29 | """Given a formula like 'ODD + ODD == EVEN', fill in digits to solve it. 30 | Input formula is a string; output is a digit-filled-in string or None.""" 31 | for f in fill_in(formula): 32 | if valid(f): 33 | if not verbose: print f 34 | return f 35 | 36 | 37 | def fill_in(formula): 38 | "Generate all possible fillings-in of letters in formula with digits." 39 | letters = ''.join(set(re.findall('[A-Z]', formula))) 40 | for digits in itertools.permutations('1234567890', len(letters)): 41 | table = string.maketrans(letters, ''.join(digits)) 42 | yield formula.translate(table) 43 | 44 | 45 | def valid(f): 46 | """Formula f is valid if and only if it has no 47 | numbers with leading zero, and evals true.""" 48 | try: 49 | return not re.search(r'\b0[0-9]', f) and eval(f) is True 50 | except ArithmeticError: 51 | return False 52 | 53 | 54 | def timedcall(fct, formula): 55 | """ 56 | Calculate time of execution 57 | """ 58 | t0 = time.clock() 59 | fct(formula) 60 | t1 = time.clock() 61 | return t1 - t0 62 | 63 | 64 | def compile_formula(formula, verbose=False): 65 | """Compile formula into a function. Also return letters found, as a str, 66 | in same order as parms of function. The first digit of a multi-digit 67 | number can't be 0. 
So if YOU is a word in the formula, and the function
 68 |     is called with Y equal to 0, the function should return False."""
 69 | 
 70 |     first_letters = set(re.findall(r'\b([A-Z])[A-Z]', formula))
 71 |     letters = ''.join(set(re.findall('[A-Z]', formula)))
 72 |     parms = ', '.join(letters)
 73 |     tokens = map(compile_word, re.split('([A-Z]+)', formula))
 74 |     body = ''.join(tokens)
 75 |     if first_letters:
 76 |         tests = ' and '.join(L + '!=0' for L in first_letters)
 77 |         body = '%s and (%s)' % (tests, body)
 78 |     f = 'lambda %s: %s' % (parms, body)
 79 |     if verbose: print f
 80 |     return eval(f), letters
 81 | 
 82 | 
 83 | def compile_word(word):
 84 |     """Compile a word of uppercase letters as numeric digits.
 85 |     E.g., compile_word('YOU') => '(1*U+10*O+100*Y)'
 86 |     Non-uppercase words unchanged: compile_word('+') => '+'"""
 87 |     if word.isupper():
 88 |         terms = [('%s*%s' % (10 ** i, d))
 89 |                  for (i, d) in enumerate(word[::-1])]
 90 |         return '(' + '+'.join(terms) + ')'
 91 |     else:
 92 |         return word
 93 | 
 94 | 
 95 | def faster_solve(formula):
 96 |     """Given a formula like 'ODD + ODD == EVEN', fill in digits to solve it.
 97 |     Input formula is a string; output is a digit-filled-in string or None.
 98 |     This version precompiles the formula; only one eval per formula."""
 99 |     f, letters = compile_formula(formula)
100 |     for digits in itertools.permutations((1, 2, 3, 4, 5, 6, 7, 8, 9, 0), len(letters)):
101 |         try:
102 |             if f(*digits) is True:
103 |                 table = string.maketrans(letters, ''.join(map(str, digits)))
104 |                 return formula.translate(table)
105 |         except ArithmeticError:
106 |             pass
107 | 
108 | 
109 | def test1():
110 |     t0 = time.clock()
111 |     for example in examples:
112 |         print '%6.4f sec : %s' % (timedcall(faster_solve, example), example)
113 |     print '%6.4f sec in total.' % (time.clock() - t0)
114 | 
115 | 
116 | def test2():
117 |     assert faster_solve('A + B == BA') == None  # should NOT return '1 + 0 == 01'
118 |     assert faster_solve('YOU == ME**2') in ('289 == 17**2', '576 == 24**2', '841 == 29**2')
119 |     assert faster_solve('X / X == X') == '1 / 1 == 1'
120 |     return 'tests pass'
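
# Usage sketch -- which valid filling comes back first depends on the
# permutation order, so treat the exact digits as illustrative:
#
#   >>> solve('ODD + ODD == EVEN')
#   '655 + 655 == 1310'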
--------------------------------------------------------------------------------
/algorithmics/diff-div.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | 
 4 | def check_nbr(nbr, nbrs, K):
 5 |     """
 6 |     checks if we could find a potential diff
 7 |     only compare nbr with the lowest nbr in the list
 8 |     """
 9 |     if nbr - nbrs[0] >= K:
10 |         return True
11 | 
12 | 
13 | def generate_indices(low_val, high_val):
14 |     """
15 |     given a list, a low value and a high value, generate four indices that cut the interval by four
16 |     """
17 |     step = int(round((high_val - low_val) / 4))
18 |     return low_val, low_val + step, low_val + 2 * step, low_val + 3 * step, high_val
19 | 
20 | 
21 | def dichotomie(nbr, nbrs, K, low_val, high_val, itr):
22 |     """
23 |     proceed by binary search, recursion shouldn't go beyond the limit itr
24 |     for the sake of optimization, we will cut each interval into four parts
25 |     """
26 |     itr += 1
27 |     ind1, ind2, ind3, ind4, ind5 = generate_indices(low_val, high_val)
28 |     if ind2 == 0 or ind1 >= ind2 or ind2 >= ind3 or ind3 >= ind4 or ind4 >= ind5 or itr == 100:  # can't be divided by 4, iterate
29 |         for i in range(high_val, low_val - 1, -1):
30 |             if nbr - nbrs[i] == K:
31 |                 return True
32 | 
33 |         return False
34 |     elif nbr - nbrs[ind4] == K:  # first quarter border
35 |         return True
36 |     elif nbr - nbrs[ind4] > K:  # first quarter
37 |         return dichotomie(nbr, nbrs, K, ind4, ind5, itr)
38 |     elif nbr - nbrs[ind3] == K:  # 2d quarter border
39 |         return True
40 |     elif nbr - nbrs[ind3] > K:  # 2d quarter
41 |         return dichotomie(nbr, nbrs, K, ind3, ind4, itr)
42 |     elif nbr - nbrs[ind2] == K:  # 3d quarter border
43 |         return True
44 |     elif nbr - nbrs[ind2] > K:  # 3d quarter
45 |         return dichotomie(nbr, nbrs, K, ind2, ind3, itr)
46 |     elif nbr - nbrs[ind1] == K:  # 4th quarter border
47 |         return True
48 |     elif nbr - nbrs[ind1] > K:  # 4th quarter
49 |         return dichotomie(nbr, nbrs, K, ind1, ind2, itr)
50 |     else:
51 |         return False
52 | 
53 | 
54 | def diffs():
55 |     N, K = raw_input().split()
56 |     N, K = int(N), int(K)
57 |     nbrs = [int(n) for n in raw_input().split()]
58 |     nbrs.sort()
59 |     len_nbrs = len(nbrs)
60 |     sum_diff = 0
61 |     i = len_nbrs - 1
62 |     while i > 0:
63 |         itr = 0
64 |         nbr = nbrs[i]
65 |         if i / 10 > 10000:
66 |             itr = 0
67 |         elif i / 10 > 1000:
68 |             itr = 1
69 |         elif i / 10 > 100:
70 |             itr = 2
71 |         elif i / 10 > 10:
72 |             itr = 3
73 |         else:
74 |             itr = 4
75 |         if not check_nbr(nbr, nbrs, K):
76 |             break
77 |         if dichotomie(nbr, nbrs, K, 0, i - 1, itr=0):
78 |             sum_diff += 1
79 |         i -= 1
80 |     return sum_diff
81 | 
82 | 
83 | t = time.clock()
84 | print diffs()
85 | print time.clock() - t
--------------------------------------------------------------------------------
/algorithmics/flowers.py:
--------------------------------------------------------------------------------
 1 | def flowers():
 2 |     nbr_fl, nbr_fr = [int(x) for x in raw_input().split()]
 3 |     prices = [float(x) for x in raw_input().split()]
 4 |     prices.sort()
 5 |     amount = 0
 6 |     x = 0
 7 |     while nbr_fl:
 8 |         if nbr_fl <= nbr_fr:
 9 |             amount += sum([(1 + x) * c for c in prices[:nbr_fl]])
10 |             nbr_fl = 0
11 |         else:
12 |             amount += sum([(1 + x) * c for c in prices[nbr_fl - nbr_fr:nbr_fl]])
13 |             nbr_fl -= nbr_fr
14 |         x += 1
15 |     amount_flat = int(amount)
16 |     amount_float = amount % 1
17 |     if amount_float != 0:
18 |         print amount
19 |     else:
20 |         print amount_flat
21 | 
22 | 
23 | flowers()
--------------------------------------------------------------------------------
/algorithmics/forest_slicing.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | sample_input = """
 4 | 20 19
 5 | 2 1
 6 | 3 1
 7 | 4 3
 8 | 5 2
 9 | 6 5
10 | 7 1
11 | 8 1
12 | 9 2
13 | 10 7
14 | 11 10
15 | 12 3
16 | 13 7
17 | 14 8
18 | 15 12
19 | 16 6
20 | 17 6
21 | 18 10
22 | 19 1
23 | 20 8
24 | """
25 | # expected return 4
26 | N, M = raw_input().split()
27 | summy = 0
28 | forest = collections.defaultdict(list)
29 | depth = collections.defaultdict(int)
30 | 
31 | 
32 | def forest_construction():
33 |     """
34 |     construct the graph from std input
35 |     """
36 |     global head
37 |     for i in range(int(M)):
38 |         node2, node1 = raw_input().split()
39 |         forest[node1].append(node2)
40 | 
41 | 
42 | def nbr_nodes(node):
43 |     """
44 |     returns the number of nodes in the current sub-graph
45 |     """
46 |     nbr = 1
47 |     for n in forest[node]:
48 |         nbr += nbr_nodes(n)
49 |     return nbr
50 | 
51 | 
52 | def nodes_depth():
53 |     """
54 |     construct depth for each node
55 |     """
56 |     for node in forest.keys():
57 |         depth[node] = nbr_nodes(node)
58 | 
59 | 
60 | def get_head():
61 |     """
62 |     returns the head of the graph
63 |     """
64 |     head = ''
65 |     max_v = 0
66 |     for k, v in depth.items():
67 |         if v > max_v:
68 |             head = k
69 |             max_v = v
70 |     return head
71 | 
72 | 
73 | def forest_slicing(node):
74 |     """
75 |     calculate the number of removed edges in such a forest.
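    An edge into a subtree can be cut exactly when that subtree holds an even
    number of nodes, which is what the depth[n] % 2 == 0 test below checks
    (despite its name, depth[n] stores the size of the subtree rooted at n).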
76 | """ 77 | summy = 0 78 | for n in forest[node]: # direct successors 79 | if not n in depth: 80 | depth[n] = 1 81 | elif depth[n] % 2 == 0: 82 | summy += 1 83 | summy += forest_slicing(n) 84 | else: 85 | summy += forest_slicing(n) 86 | return summy 87 | 88 | 89 | forest_construction() 90 | nodes_depth() 91 | print forest_slicing(get_head()) 92 | -------------------------------------------------------------------------------- /algorithmics/grammer-parser.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: cryptarithmetic 3 | # 4 | # Author: mourad mourafiq 5 | # 6 | # Copyright: (c) mourad mourafiq 7 | # ------------------------------------------------------------------------------- 8 | 9 | from functools import update_wrapper 10 | from string import split 11 | import re 12 | 13 | 14 | def grammar(description, whitespace=r'\s*'): 15 | """Convert a description to a grammar. Each line is a rule for a 16 | non-terminal symbol; it looks like this: 17 | Symbol => A1 A2 ... | B1 B2 ... | C1 C2 ... 18 | where the right-hand side is one or more alternatives, separated by 19 | the '|' sign. Each alternative is a sesys.setrecursionlimit(1500)quence of atoms, separated by 20 | spaces. An atom is either a symbol on some left-hand side, or it is 21 | a regular expression that will be passed to re.match to match a token. 22 | 23 | Notation for *, +, or ? not allowed in a rule alternative (but ok 24 | within a token). Use '\' to continue long lines. You must include spaces 25 | or tabs around '=>' and '|'. That's within the grammar description itself. 26 | The grammar that gets defined allows whitespace between tokens by default; 27 | specify '' as the second argument to grammar() to disallow this (or supply 28 | any regular expression to describe allowable whitespace between tokens).""" 29 | G = {' ': whitespace} 30 | description = description.replace('\t', ' ') # no tabs! 31 | for line in split(description, '\n'): 32 | lhs, rhs = split(line, ' => ', 1) 33 | alternatives = split(rhs, ' | ') 34 | G[lhs] = tuple(map(split, alternatives)) 35 | return G 36 | 37 | 38 | def decorator(d): 39 | "Make function d a decorator: d wraps a function fn." 40 | 41 | def _d(fn): 42 | return update_wrapper(d(fn), fn) 43 | 44 | update_wrapper(_d, d) 45 | return _d 46 | 47 | 48 | @decorator 49 | def memo(f): 50 | """Decorator that caches the return value for each call to f(args). 51 | Then when called again with same args, we can just look it up.""" 52 | cache = {} 53 | 54 | def _f(*args): 55 | try: 56 | return cache[args] 57 | except KeyError: 58 | cache[args] = result = f(*args) 59 | return result 60 | except TypeError: 61 | # some element of args can't be a dict key 62 | return f(args) 63 | 64 | return _f 65 | 66 | 67 | def parse(start_symbol, text, grammar): 68 | """Example call: parse('Exp', '3*x + b', G). 69 | Returns a (tree, remainder) pair. If remainder is '', it parsed the whole 70 | string. Failure iff remainder is None. This is a deterministic PEG parser, 71 | so rule order (left-to-right) matters. 
 74 | 
 75 |     tokenizer = grammar[' '] + '(%s)'
 76 | 
 77 |     def parse_sequence(sequence, text):
 78 |         result = []
 79 |         for atom in sequence:
 80 |             tree, text = parse_atom(atom, text)
 81 |             if text is None: return Fail
 82 |             result.append(tree)
 83 |         return result, text
 84 | 
 85 |     @memo
 86 |     def parse_atom(atom, text):
 87 |         if atom in grammar:  # Non-Terminal: tuple of alternatives
 88 |             for alternative in grammar[atom]:
 89 |                 tree, rem = parse_sequence(alternative, text)
 90 |                 if rem is not None: return [atom] + tree, rem
 91 |             return Fail
 92 |         else:  # Terminal: match characters against start of text
 93 |             m = re.match(tokenizer % atom, text)
 94 |             return Fail if (not m) else (m.group(1), text[m.end():])
 95 | 
 96 |     # Body of parse:
 97 |     return parse_atom(start_symbol, text)
 98 | 
 99 | 
100 | Fail = (None, None)
101 | 
102 | G = grammar("""Exp => Term [+-] Exp | Term
103 | Term => Factor [*/] Term | Factor
104 | Factor => Funcall | Var | Num | [(] Exp [)]
105 | Funcall => Var [(] Exp [)]
106 | Exps => Exp [,] Exps | Exp
107 | Var => [a-zA-Z_]\w*
108 | Num => [-+]?[0-9]+([.][0-9]*)?""", whitespace='\s*')
109 | 
110 | JSON = grammar("""object => { } | { members }
111 | members => pair , members | pair
112 | pair => string : value
113 | array => [[] []] | [[] elements []]
114 | elements => value , elements | value
115 | value => string | number | object | array | true | false | null
116 | string => "[^"]*"
117 | number => int frac exp | int frac | int exp | int
118 | int => -?[0-9][0-9]*
119 | frac => [.][0-9]+
120 | exp => [eE][-+]?[0-9]+""", whitespace='\s*')
121 | 
122 | 
123 | def json_parse(text):
124 |     return parse('value', text, JSON)
125 | 
126 | 
127 | def test():
128 |     assert json_parse('["testing", 1, 2, 3]') == (
129 |         ['value', ['array', '[', ['elements', ['value',
130 |             ['string', '"testing"']], ',', ['elements', ['value', ['number',
131 |             ['int', '1']]], ',', ['elements', ['value', ['number',
132 |             ['int', '2']]], ',', ['elements', ['value', ['number',
133 |             ['int', '3']]]]]]], ']']], '')
134 | 
135 |     assert json_parse('-123.456e+789') == (
136 |         ['value', ['number', ['int', '-123'], ['frac', '.456'], ['exp', 'e+789']]], '')
137 | 
138 |     assert json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}') == (
139 |         ['value', ['object', '{', ['members', ['pair', ['string', '"age"'],
140 |             ':', ['value', ['number', ['int', '21']]]], ',', ['members',
141 |             ['pair', ['string', '"state"'], ':', ['value', ['string', '"CO"']]],
142 |             ',', ['members', ['pair', ['string', '"occupation"'], ':',
143 |             ['value', ['string', '"rides the rodeo"']]]]]], '}']], '')
144 |     return 'tests pass'
145 | 
146 | 
147 | print G
148 | print JSON
149 | print test()
--------------------------------------------------------------------------------
/algorithmics/hanoi_tower.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------
 2 | # Name: hanoi tower
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | RODS = 'ABCDEFGHIJKLM'
 8 | 
 9 | 
10 | def hanoi_tower(nbr_disks=5, nbr_rods=6):
11 |     """
12 |     Resolves the hanoi tower problem for n rods and n disks.
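    The state space is explored breadth-first (paths.pop(0)), so the first
    path that reaches the goal uses the fewest moves.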
13 |     goal is to move all disks from the left rod to the right rod
14 |     """
15 |     goal = [''] * nbr_rods
16 |     start = [''] * nbr_rods
17 |     start[0] = RODS[0:nbr_disks]
18 |     goal[-1] = RODS[0:nbr_disks]
19 |     goal = tuple(goal)
20 |     explored = set()  # set of explored paths
21 |     paths = [[start]]
22 |     while paths:
23 |         to_explore = paths.pop(0)
24 |         current_state = to_explore[-1]
25 |         for (state, action) in next_state(current_state, nbr_disks).items():
26 |             if state not in explored:
27 |                 explored.add(state)
28 |                 path2 = to_explore + [action, state]
29 |                 if goal == state:
30 |                     return path2
31 |                 else:
32 |                     paths.append(path2)
33 |     return []
34 | 
35 | 
36 | def next_state(state, nbr_disks):
37 |     result = {}
38 |     for i in range(len(state)):  # iterate over the rods, not the disks
39 |         if state[i] != '':
40 |             i_disk = state[i][-1]
41 |             for j in range(len(state)):
42 |                 j_disk = state[j][-1] if state[j] != '' else state[j]
43 |                 if i != j and i_disk > j_disk:
44 |                     c_state = list(state)
45 |                     c_state[i] = state[i][:-1]
46 |                     c_state[j] = state[j] + i_disk
47 |                     result[tuple(c_state)] = 'move %s => %s' % (i_disk, j + 1)
48 | 
49 |     return result
50 | 
51 | 
52 | print hanoi_tower()
--------------------------------------------------------------------------------
/algorithmics/inverse_function.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------
 2 | # Name: inverse function
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | from __future__ import division
 8 | 
 9 | 
10 | def inverse(f, delta=1 / 1024.):
11 |     """
12 |     given a function f, monotonically increasing on a positive interval,
13 |     return x = f^-1(y), an approximation of its inverse function
14 |     """
15 | 
16 |     def _f(y):
17 |         lv, hv = find_bounds(f, y)
18 |         return binary_search(f, y, lv, hv, delta)
19 | 
20 |     return _f
21 | 
22 | 
23 | def find_bounds(f, y):
24 |     """
25 |     given a function f,
26 |     return lv & hv such that lv <= f^-1(y) <= hv
27 |     """
28 |     x = 1
29 |     while f(x) < y:
30 |         x = x * 2
31 |     lv = 0 if x == 1 else x / 2
32 |     return lv, x
33 | 
34 | 
35 | def binary_search(f, y, lv, hv, delta):
36 |     """
37 |     Binary search.
38 | returns x such that f(x) is within delta of y : y-delta <= f(x) <= y+delta 39 | for exact approximation delta should be as small as possible 40 | """ 41 | while lv <= hv: 42 | x = (lv + hv) / 2 43 | if f(x) > y: 44 | hv = x - delta 45 | elif f(x) < y: 46 | lv = x + delta 47 | else: 48 | return x 49 | return hv if f(hv) - y < y - f(lv) else lv 50 | 51 | 52 | def square(x): return x * x 53 | 54 | 55 | def power4(x): return x ** 4 56 | 57 | 58 | sqrty = inverse(square) 59 | log4 = inverse(power4) 60 | print square(3) 61 | print power4(3) 62 | print sqrty(9) 63 | print log4(81) 64 | 65 | -------------------------------------------------------------------------------- /algorithmics/iter_circle_sum.py: -------------------------------------------------------------------------------- 1 | # circle sum 2 | def cal_sum(): 3 | T = int(raw_input()) 4 | 5 | def c_sum(a, N, M): 6 | res = [] 7 | for i in xrange(N): 8 | res.append(list(a)) 9 | for i in xrange(M): 10 | for j in xrange(N): 11 | res[j][((i + j) % N)] += (res[j][((i + j - 1) % N)] + res[j][((i + j + 1) % N)]) 12 | res[j][((i + j) % N)] %= 1000000007 13 | for i in xrange(N): 14 | print ' '.join(map(str, res[i])) 15 | 16 | for t in xrange(T): 17 | N, M = [int(x) for x in raw_input().split()] 18 | a = [int(x) for x in raw_input().split()] 19 | c_sum(a, N, M) 20 | if t < (T - 1): print "" 21 | 22 | 23 | cal_sum() 24 | -------------------------------------------------------------------------------- /algorithmics/licence: -------------------------------------------------------------------------------- 1 | Copyright (c) Mourad MOURAFIQ and individual contributors. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
11 | -------------------------------------------------------------------------------- /algorithmics/media.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | 4 | def add(e, min_s, max_s, max_min, min_max, l_min, l_max): 5 | if l_min == 0: 6 | min_s.append(e) 7 | max_min = e 8 | l_min += 1 9 | return True, max_min, min_max, l_min, l_max 10 | if e <= max_min: 11 | l_min += 1 12 | min_s.append(e) 13 | else: 14 | if e < min_max or l_max == 0: 15 | min_max = e 16 | l_max += 1 17 | max_s.append(e) 18 | return True, max_min, min_max, l_min, l_max 19 | 20 | 21 | def remove(e, min_s, max_s, max_min, min_max, l_min, l_max): 22 | if e in min_s: 23 | l_min -= 1 24 | min_s.remove(e) 25 | if e == max_min: 26 | if l_min > 0: 27 | max_min = max(min_s) 28 | else: 29 | max_min = 0 30 | return True, max_min, min_max, l_min, l_max 31 | if e in max_s: 32 | l_max -= 1 33 | max_s.remove(e) 34 | if e == min_max: 35 | if l_max > 0: 36 | min_max = min(max_s) 37 | else: 38 | min_max = 0 39 | return True, max_min, min_max, l_min, l_max 40 | return True, max_min, min_max, l_min, l_max 41 | 42 | 43 | def operate(op, e, min_s, max_s, max_min, min_max, l_min, l_max): 44 | if op == "a": return add(e, min_s, max_s, max_min, min_max, l_min, l_max) 45 | if op == "r": return remove(e, min_s, max_s, max_min, min_max, l_min, l_max) 46 | return False, max_min, min_max, l_min, l_max 47 | 48 | 49 | def size_s(min_s, max_s, max_min, min_max, l_min, l_max): 50 | if l_min == l_max == 0: 51 | return False, max_min, min_max, l_min, l_max 52 | if l_max > l_min: 53 | e = min_max 54 | max_s.remove(e) 55 | l_max -= 1 56 | if l_max > 0: 57 | min_max = min(max_s) 58 | else: 59 | min_max = 0 60 | min_s.append(e) 61 | l_min += 1 62 | max_min = e 63 | return True, max_min, min_max, l_min, l_max 64 | if l_min > l_max + 1: 65 | e = max_min 66 | min_s.remove(e) 67 | l_min -= 1 68 | if l_min > 0: 69 | max_min = max(min_s) 70 | else: 71 | max_min = 0 72 | max_s.append(e) 73 | l_max += 1 74 | min_max = e 75 | return True, max_min, min_max, l_min, l_max 76 | return True, max_min, min_max, l_min, l_max 77 | 78 | 79 | def calculate(min_s, max_s, max_min, min_max, l_min, l_max): 80 | if l_min > l_max: 81 | print max_min 82 | if l_min == l_max: 83 | med = max_min + min_max 84 | if med % 2 == 0: 85 | print "%.0lf" % (med / 2) 86 | else: 87 | print "%.1lf" % (med / 2) 88 | 89 | 90 | def median(): 91 | n_op = int(raw_input()) 92 | min_s = [] 93 | max_min = 0 94 | l_min = 0 95 | max_s = [] 96 | min_max = 0 97 | l_max = 0 98 | for i in xrange(n_op): 99 | op, e = [x for x in raw_input().split()] 100 | e = int(e) 101 | stat, max_min, min_max, l_min, l_max = operate(op, e, min_s, max_s, max_min, min_max, l_min, l_max) 102 | if stat: 103 | stat, max_min, min_max, l_min, l_max = size_s(min_s, max_s, max_min, min_max, l_min, l_max) 104 | if stat: 105 | calculate(min_s, max_s, max_min, min_max, l_min, l_max) 106 | else: 107 | print "Wrong!" 108 | else: 109 | print "Wrong!" 
110 | 111 | 112 | median() 113 | -------------------------------------------------------------------------------- /algorithmics/n_c_p.py: -------------------------------------------------------------------------------- 1 | def n_div_P(n, P): 2 | if n >= P: 3 | j = 0 4 | m = n % P 5 | q = n / P 6 | j = q * (P - m - 1) 7 | return j + ((m + 1) * n_div_P(q, P)) 8 | else: 9 | return 0 10 | 11 | 12 | def n_C_p(): 13 | T = int(raw_input()) 14 | result = [] 15 | for i in range(T): 16 | n, P = raw_input().split() 17 | n, P = int(n), int(P) 18 | j = n_div_P(n, P) 19 | result.append(j) 20 | for i in range(T): 21 | print result[i] 22 | 23 | 24 | n_C_p() 25 | -------------------------------------------------------------------------------- /algorithmics/numpy_circle_sum.py: -------------------------------------------------------------------------------- 1 | # circle sum 2 | 3 | import numpy as nm 4 | 5 | 6 | def cal_sum(): 7 | T = int(raw_input()) 8 | 9 | def list_mat(n, m_div, m_mod): 10 | a_n = [] 11 | if m_div > 0: 12 | for i in xrange(n): 13 | a_i = nm.eye(n, dtype=nm.int) 14 | i_l = (i - 1) % n 15 | i_r = (i + 1) % n 16 | a_i[i_r][i] = 1 17 | a_i[i_l][i] = 1 18 | a_n.append(a_i) 19 | else: 20 | for i in xrange(n): 21 | a_i = nm.eye(n, dtype=nm.int) 22 | for j in xrange(m_mod): 23 | i_l = (i + j - 1) % n 24 | i_r = (i + j + 1) % n 25 | a_i[i_r][i] = 1 26 | a_i[i_l][i] = 1 27 | a_n.append(a_i) 28 | return a_n 29 | 30 | def mat_mult(n, a, a_n, ind, m_div, m_mod): 31 | res = nm.mat(nm.eye(n), dtype=nm.int) 32 | if m_div > 0: 33 | for i in xrange(n): 34 | j = (i + ind) % n 35 | res *= nm.mat(a_n[j]) 36 | res = res ** m_div 37 | for i in xrange(m_mod): 38 | j = (i + ind) % n 39 | res *= nm.mat(a_n[j]) 40 | return a * res 41 | 42 | for t in xrange(T): 43 | N, M = [int(x) for x in raw_input().split()] 44 | a = [int(x) for x in raw_input().split()] 45 | M_mod = M % N 46 | M_div = M / N 47 | a_n = list_mat(N, M_div, M_mod) 48 | for i in xrange(N): 49 | res = nm.nditer(mat_mult(N, a, a_n, i, M_div, M_mod)) 50 | print ' '.join(map(str, res)) 51 | if t < (T - 1): print "" 52 | 53 | 54 | cal_sum() 55 | -------------------------------------------------------------------------------- /algorithmics/palindromes.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: palindromes 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | 8 | def longest_subpalindrome(string): 9 | """ 10 | Returns the longest subpalindrome string from the current string 11 | Return (i,j) 12 | """ 13 | #first we check if string is "" 14 | if string == "": return (0, 0) 15 | 16 | def length(slice): a, b = slice; return b - a 17 | 18 | slices = [grow(string, start, end) 19 | for start in range(len(string)) 20 | for end in (start, start + 1) 21 | ] 22 | return max(slices, key=length) 23 | 24 | 25 | def grow(string, start, end): 26 | """ 27 | starts with a 0 or 1 length palindrome and try to grow bigger 28 | """ 29 | while (start > 0 and end < len(string) 30 | and string[start - 1].upper() == string[end].upper()): 31 | start -= 1; 32 | end += 1 33 | return (start, end) 34 | -------------------------------------------------------------------------------- /algorithmics/pooring_water.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------------------------------
 2 | # Name: pouring water
 3 | #
 4 | # Author: mourad mourafiq
 5 | # -------------------------------------------------------------------------------
 6 | 
 7 | def pooring_prob(size_x, size_y, goal, start=(0, 0)):
 8 |     """
 9 |     Resolves the pouring problem for two glasses x & y.
10 |     goal is the size we are looking for.
11 |     size_x & size_y are the sizes of glass x and glass y respectively.
12 |     """
13 |     if goal in start:
14 |         return [start]
15 |     explored = set()  # set of visited states
16 |     paths = [[start]]
17 |     while paths:
18 |         to_explore = paths.pop(0)
19 |         (x, y) = to_explore[-1]
20 |         for (state, action) in next_state(x, y, size_x, size_y).items():
21 |             if state not in explored:
22 |                 explored.add(state)
23 |                 path2 = to_explore + [action, state]
24 |                 if goal in state:
25 |                     return path2
26 |                 else:
27 |                     paths.append(path2)
28 | 
29 |     return []
30 | 
31 | 
32 | def next_state(x, y, size_x, size_y):
33 |     assert x <= size_x and y <= size_y
34 |     return {
35 |         (0, x + y) if x + y <= size_y else (x - (size_y - y), size_y): 'x->y',
36 |         (y + x, 0) if x + y <= size_x else (size_x, y - (size_x - x)): 'y->x',
37 |         (size_x, y): 'fill x', (x, size_y): 'fill y',
38 |         (0, y): 'empty x', (x, 0): 'empty y',
39 |     }
40 | 
41 | 
42 | print pooring_prob(440, 900, 600)
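
# A concrete step, for illustration: with glass sizes (4, 9), the state (4, 0)
# (glass x full) can move to (0, 4) via 'x->y', (4, 9) via 'fill y',
# (0, 0) via 'empty x', or stay at (4, 0) via 'fill x' / 'empty y'.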
--------------------------------------------------------------------------------
/algorithmics/string_red.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | 
 4 | def reduce_str(strg):
 5 |     count_abc = collections.defaultdict(int)
 6 |     str_g = list(strg)
 7 |     for s in str_g:
 8 |         count_abc[s] += 1
 9 |     if (count_abc['a'] == 0 and count_abc['c'] == 0) or (count_abc['a'] == 0 and count_abc['b'] == 0) or (
10 |             count_abc['b'] == 0 and count_abc['c'] == 0):
11 |         return len(strg)
12 |     elif (count_abc['a'] % 2 == count_abc['b'] % 2 == count_abc['c'] % 2):
13 |         return 2
14 |     else:
15 |         return 1
16 | 
17 | 
18 | def string_red():
19 |     T = int(raw_input())
20 |     r = set()
21 |     for t in xrange(T):
22 |         case = raw_input()
23 |         print reduce_str(case)
24 | 
25 | 
26 | string_red()
27 | 
--------------------------------------------------------------------------------
/algorithmics/suffixes.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | 
 3 | 
 4 | def check_in(i_from, i_to, str, sfx):
 5 |     sum = 0
 6 |     for i in range(i_from, min(i_to, len(sfx))):
 7 |         if sfx[i] == str[i]:
 8 |             sum += 1
 9 |         else:
10 |             break
11 |     return sum
12 | 
13 | 
14 | def suffix():
15 |     N = int(raw_input())
16 |     for i in range(N):
17 |         sum = 0
18 |         str = raw_input()
19 |         len_str = len(str)
20 |         sum += len_str
21 |         sfx = str[1:]
22 |         cpt = 2
23 |         while sfx != "":
24 |             len_sfx = len(sfx)
25 |             step = 90
26 |             for i in range(0, len_sfx, step):
27 |                 if str[i:i + step] == sfx[i:i + step]:
28 |                     sum += step
29 |                 else:
30 |                     sum += check_in(i, i + step, str, sfx)
31 |                     break
32 |             sfx = str[cpt:]
33 |             cpt += 1
34 |         print sum
35 | 
36 | 
37 | import time
38 | 
39 | t = time.clock()
40 | suffix()
41 | print time.clock() - t
--------------------------------------------------------------------------------
/algorithmics/unfriendly.py:
--------------------------------------------------------------------------------
 1 | from math import sqrt
 2 | from fractions import gcd
 3 | 
 4 | 
 5 | def get_factors(x):
 6 |     factors = set([x])
 7 | 
 8 |     sqrtX = int(sqrt(x))
 9 | 
10 |     for i in range(1, sqrtX + 1):
11 | 
12 |         if x % i == 0:
13 |             factors.add(i)
14 |             factors.add(x / i)
15 | 
16 |     return factors
17 | 
18 | 
19 | def friendly():
20 |     _, friendly = [int(i) for i in raw_input().split()]
21 |     unfriendlies = [int(i) for i in raw_input().split()]
22 | 
23 |     friendly_factors = get_factors(friendly)
24 | 
25 |     unfriendly_factors = set()
26 | 
27 |     for unfriendly in unfriendlies:
28 |         g = gcd(friendly, unfriendly)
29 | 
30 |         unfriendly_factors.add(g)
31 |         unfriendly_factors.update(get_factors(g))
32 |     print len(friendly_factors - unfriendly_factors)
33 | 
34 | 
35 | friendly()
--------------------------------------------------------------------------------
/classification/decision_trees.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Created on Nov 01, 2012
 3 | 
 4 | @author: Mourad Mourafiq
 5 | """
 6 | from collections import defaultdict
 7 | from math import log
 8 | 
 9 | log2 = lambda x: log(x) / log(2)
10 | 
11 | 
12 | class Item(object):
13 |     """
14 |     Describe an item
15 | 
16 |     @type id: string
17 |     @param id: the id of the element
18 | 
19 |     @type value: string
20 |     @param value: the value of the item
21 | 
22 |     @type coords: list
23 |     @param coords: a list representing the parameters that locates the item
24 |     """
25 | 
26 |     def __init__(self, id, coords, value=None):
27 |         self.id = id
28 |         self.coords = coords
29 |         self.value = value
30 | 
31 | 
32 | class Node(object):
33 |     """
34 |     A node object in a decision tree
35 | 
36 |     @type column: int
@param column: the column index of the criteria to be tested 38 | 39 | @type value: string 40 | @param value: the value that the column must match to get a true result 41 | 42 | @type results: dict 43 | @param results: stores a dictionary of results for this branch. This is None for everything 44 | except endpoints 45 | 46 | @type t_node: Node 47 | @param t_node: the next nodes in the tree if the result is true 48 | 49 | @type f_node: Node 50 | @param f_node: the next nodes in the tree if the result is false 51 | """ 52 | 53 | def __init__(self, col=-1, value=None, results=None, t_node=None, f_node=None): 54 | self.col = col 55 | self.value = value 56 | self.results = results 57 | self.t_node = t_node 58 | self.f_node = f_node 59 | 60 | def draw(self, indent=''): 61 | # Is this a leaf node? 62 | if self.results != None: 63 | print str(self.results) 64 | else: 65 | # Print the criteria 66 | print str(self.col) + ':' + str(self.value) + '? ' 67 | # Print the branches 68 | print indent + 'T->', 69 | self.t_node.draw(indent + ' ') 70 | print indent + 'F->', 71 | self.f_node.draw(indent + ' ') 72 | 73 | 74 | class DecisionTree(object): 75 | """ 76 | A decision tree object 77 | """ 78 | 79 | @staticmethod 80 | def count_results(data, item=True): 81 | """ 82 | count the occurrences of each result in the data set 83 | """ 84 | results_count = defaultdict(int) 85 | if item: 86 | for i in data: 87 | results_count[i.value] += 1 88 | else: 89 | for i in data: 90 | results_count[i[-1]] += 1 91 | return results_count 92 | 93 | @staticmethod 94 | def divide_data(data, column, value): 95 | """ 96 | Divides a set of rows on a specific column. 97 | """ 98 | # a function that decides if the row goes to the first or the second group (true or false) 99 | spliter = None 100 | if isinstance(value, int) or isinstance(value, float): 101 | spliter = lambda item: item.coords[column] >= value 102 | else: 103 | spliter = lambda item: item.coords[column] == value 104 | #divide the rows into two sets and return them 105 | set_true = [] 106 | set_false = [] 107 | for item in data: 108 | if spliter(item): 109 | set_true.append(item) 110 | else: 111 | set_false.append(item) 112 | return (set_true, set_false) 113 | 114 | @staticmethod 115 | def gini_impurity(data, item=True): 116 | """ 117 | Probability that a randomly placed item will be in the wrong category 118 | """ 119 | results_count = DecisionTree.count_results(data, item) 120 | len_data = len(data) 121 | imp = 0.0 122 | for k1, v1 in results_count.iteritems(): 123 | p1 = float(v1) / len_data 124 | for k2, v2 in results_count.iteritems(): 125 | if k1 == k2: continue 126 | p2 = float(v2) / len_data 127 | imp += p1 * p2 128 | return imp 129 | 130 | @staticmethod 131 | def entropy(data, item=True): 132 | """ 133 | estimate the disorder in the data set : sum of p(x)log(p(x)) 134 | """ 135 | results_count = DecisionTree.count_results(data, item) 136 | len_data = len(data) 137 | ent = 0.0 138 | for v in results_count.itervalues(): 139 | p = float(v) / len_data 140 | ent -= p * log2(p) 141 | return ent 142 | 143 | @staticmethod 144 | def variance(data): 145 | """ 146 | calculates the statistical variance for a set of rows 147 | more preferably to be used with numerical outcomes 148 | """ 149 | len_data = len(data) 150 | if len_data == 0: return 0 151 | score = [float(item.value) for item in data] 152 | mean = sum(score) / len(score) 153 | variance = sum([(s - mean) ** 2 for s in score]) / len(score) 154 | return variance 155 | 156 | 157 | @staticmethod 158 | def 
build_tree(data, disorder_function="entropy"):
159 |         """
160 |         a recursive function that builds the tree by choosing the best dividing criteria
161 |         disorder_function :
162 |             for data that contains words and booleans; it is recommended to use entropy or gini_impurity
163 |             for data that contains numbers; it is recommended to use variance
164 |         """
165 |         if disorder_function == "entropy":
166 |             disorder_estimator = DecisionTree.entropy
167 |         elif disorder_function == "gini_impurity":
168 |             disorder_estimator = DecisionTree.gini_impurity
169 |         elif disorder_function == "variance":
170 |             disorder_estimator = DecisionTree.variance
171 |         len_data = len(data)
172 |         if len_data == 0: return Node()
173 |         current_disorder_level = disorder_estimator(data)
174 |         # track enhancement of disorder's level
175 |         best_enhancement = 0.0
176 |         best_split = None
177 |         best_split_sets = None
178 |         # number columns
179 |         nbr_coords = len(data[0].coords)
180 |         for coord_ind in xrange(nbr_coords):
181 |             # get unique values of the current column
182 |             coord_values = {}
183 |             for item in data:
184 |                 coord_values[item.coords[coord_ind]] = 1
185 |             for coord_value in coord_values.iterkeys():
186 |                 set1, set2 = DecisionTree.divide_data(data, coord_ind, coord_value)
187 |                 p1 = float(len(set1)) / len_data
188 |                 p2 = (1 - p1)
189 |                 enhancement = current_disorder_level - (p1 * disorder_estimator(set1)) - (p2 * disorder_estimator(set2))
190 |                 if (enhancement > best_enhancement) and (len(set1) > 0 and len(set2) > 0):
191 |                     best_enhancement = enhancement
192 |                     best_split = (coord_ind, coord_value)
193 |                     best_split_sets = (set1, set2)
194 |         if best_enhancement > 0:
195 |             t_node = DecisionTree.build_tree(best_split_sets[0], disorder_function)
196 |             f_node = DecisionTree.build_tree(best_split_sets[1], disorder_function)
197 |             return Node(col=best_split[0], value=best_split[1],
198 |                         t_node=t_node, f_node=f_node)
199 |         else:
200 |             return Node(results=DecisionTree.count_results(data))
201 | 
202 |     @staticmethod
203 |     def prune(tree, min_enhancement, disorder_function="entropy"):
204 |         """
205 |         checking pairs of nodes that have a common parent to see if merging
206 |         them would increase the entropy by less than a specified threshold
207 |         """
208 |         if disorder_function == "entropy":
209 |             disorder_estimator = DecisionTree.entropy
210 |         elif disorder_function == "gini_impurity":
211 |             disorder_estimator = DecisionTree.gini_impurity
212 |         elif disorder_function == "variance":
213 |             disorder_estimator = DecisionTree.variance
214 |         if tree.t_node.results == None:
215 |             DecisionTree.prune(tree.t_node, min_enhancement, disorder_function)
216 |         if tree.f_node.results == None:
217 |             DecisionTree.prune(tree.f_node, min_enhancement, disorder_function)
218 |         # If both the subbranches are now leaves, see if they should be merged
219 |         if (tree.t_node.results != None and tree.f_node.results != None):
220 |             # Build a combined dataset
221 |             t_node, f_node = [], []
222 |             for key, value in tree.t_node.results.items():
223 |                 t_node += [[key]] * value
224 |             for key, value in tree.f_node.results.items():
225 |                 f_node += [[key]] * value
226 |             # Test the enhancement
227 |             delta = disorder_estimator(t_node + f_node, item=False) - (
228 |                 disorder_estimator(t_node, item=False) + disorder_estimator(f_node, item=False)) / 2
229 |             if delta < min_enhancement:
230 |                 # Merge the branches
231 |                 tree.t_node, tree.f_node = None, None
232 |                 tree.results = DecisionTree.count_results(t_node + f_node, item=False)
233 | 
234 |     @staticmethod
235 |     def classify(observation, tree):
236 |         """
237 |         Classify a new observation given a decision tree
238 | 
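        If the observation is missing the value tested at a node (None), both
        branches are followed and their results are combined, weighted by how
        much training data went down each branch.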
""" 239 | if tree.results != None: 240 | return tree.results 241 | # the observation value for the current criteria column 242 | observation_value = observation.coords[tree.col] 243 | if observation_value == None: 244 | t_results, f_results = DecisionTree.classify(observation, tree.t_node), DecisionTree.classify(observation, 245 | tree.f_node) 246 | t_count = sum(t_results.values()) 247 | f_count = sum(f_results.values()) 248 | t_prob = float(t_count) / (t_count + f_count) 249 | f_prob = float(f_count) / (t_count + f_count) 250 | result = {} 251 | for key, value in t_results.items(): result[key] = value * t_prob 252 | for key, value in f_results.items(): result[key] = value * f_prob 253 | return result 254 | else: 255 | #with branch to follow 256 | branch = None 257 | if (isinstance(observation_value, int) or isinstance(observation_value, float)): 258 | branch = tree.t_node if (observation_value >= tree.value) else tree.f_node 259 | else: 260 | branch = tree.t_node if (observation_value == tree.value) else tree.f_node 261 | return DecisionTree.classify(observation, branch) 262 | -------------------------------------------------------------------------------- /classification/knn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | 5 | class Item(object): 6 | """ 7 | Describe an item 8 | 9 | @type id: string 10 | @param id: the id of the elemet 11 | 12 | @type value: float 13 | @param value: the value of the item 14 | 15 | @type coords: list 16 | @param coords: a list representing the parameters that locates the item 17 | """ 18 | 19 | def __init__(self, id, coords, value=None): 20 | self.id = id 21 | self.coords = coords 22 | self.value = value 23 | 24 | 25 | class KNN(object): 26 | @staticmethod 27 | def euclidean(v1, v2): 28 | d = 0.0 29 | for i in range(len(v1)): 30 | d += (v1[i] - v2[i]) ** 2 31 | return math.sqrt(d) 32 | 33 | @staticmethod 34 | def all_distances(data, item): 35 | distancelist = [] 36 | for i in range(len(data)): 37 | item2 = data[i] 38 | distancelist.append((KNN.euclidean(item.coords, item2.coords), i)) 39 | distancelist.sort() 40 | return distancelist 41 | 42 | @staticmethod 43 | def inverse_weight(dist, num=1.0, const=0.1): 44 | return num / (dist + const) 45 | 46 | @staticmethod 47 | def subtract_weight(dist, const=1.0): 48 | if dist > const: 49 | return 0 50 | else: 51 | return const - dist 52 | 53 | @staticmethod 54 | def gaussian(dist, sigma=10.0): 55 | return math.e ** (-dist ** 2 / (2 * sigma ** 2)) 56 | 57 | @staticmethod 58 | def knn_estimate(data, item, k=3): 59 | # Get sorted distances 60 | all_dist = KNN.all_distances(data, item) 61 | avg = 0.0 62 | # Take the average of the top k results 63 | for i in range(k): 64 | idx = all_dist[i][1] 65 | avg += data[idx].value 66 | avg = avg / k 67 | return avg 68 | 69 | @staticmethod 70 | def weighted_knn(data, item, k=5, weight_f="gaussian"): 71 | if weight_f == "subtract_weight": 72 | weightf = KNN.subtract_weight 73 | elif weight_f == "inverse_weight": 74 | weightf = KNN.inverse_weight 75 | elif weight_f == "gaussian": 76 | weightf = KNN.gaussian 77 | # Get distances 78 | all_dist = KNN.all_distances(data, item) 79 | avg = 0.0 80 | totalweight = 0.0 81 | # Get weighted average 82 | for i in range(k): 83 | dist = all_dist[i][0] 84 | idx = all_dist[i][1] 85 | weight = weightf(dist) 86 | avg += weight * int(data[idx].value) 87 | totalweight += weight 88 | avg = avg / totalweight 89 | return avg 90 | 91 | @staticmethod 92 | def prob_guess(data, 
 93 |         if weightf is None: weightf = KNN.gaussian  # a staticmethod object isn't callable as a default argument
 94 |         all_dist = KNN.all_distances(data, item)
 95 |         nweight = tweight = 0.0
 96 |         for i in range(k):
 97 |             dist = all_dist[i][0]
 98 |             idx = all_dist[i][1]
 99 |             weight = weightf(dist)
100 |             v = data[idx].value
101 |             # Is this point in the range?
102 |             if v >= low and v <= high:
103 |                 nweight += weight
104 |             tweight += weight
105 |         if tweight == 0: return 0
106 |         # The probability is the weights in the range
107 |         # divided by all the weights
108 |         return nweight / tweight
109 | 
110 |     @staticmethod
111 |     def divide_data(data, test=0.05):
112 |         trainset = []
113 |         testset = []
114 |         for row in data:
115 |             if random.random() < test:
116 |                 testset.append(row)
117 |             else:
118 |                 trainset.append(row)
119 |         return trainset, testset
120 | 
121 |     @staticmethod
122 |     def test_algorithm(algf, trainset, testset):
123 |         error = 0.0
124 |         for item in testset:
125 |             guess = algf(trainset, item)
126 |             error += (item.value - guess) ** 2
127 |         return error / len(testset)
128 | 
129 |     @staticmethod
130 |     def cross_validate(algf, data, trials=100, test=0.05):
131 |         error = 0.0
132 |         for i in range(trials):
133 |             trainset, testset = KNN.divide_data(data, test)
134 |             error += KNN.test_algorithm(algf, trainset, testset)
135 |         return error / trials
136 | 
137 |     @staticmethod
138 |     def rescale(data, scale):
139 |         scaleddata = []
140 |         for item in data:
141 |             scaled_coords = [scale[i] * item.coords[i] for i in range(len(scale))]
142 |             scaled_item = Item(id=item.id, value=item.value, coords=scaled_coords)
143 |             scaleddata.append(scaled_item)
144 |         return scaleddata
145 | 
146 |     @staticmethod
147 |     def cost_function(algf, data):
148 |         """
149 |         this function should be used along with an optimization function to determine the best scale;
150 |         notably one could use the annealing algorithm or the genetic algorithm
151 |         """
152 | 
153 |         def costf(scale):
154 |             sdata = KNN.rescale(data, scale)
155 |             return KNN.cross_validate(algf, sdata, trials=10)
156 | 
157 |         return costf
158 | 
159 |     @staticmethod
160 |     def annealingoptimize(domain, costf, T=10000.0, cool=0.95, step=1):
161 |         # Initialize the values randomly
162 |         vec = [float(random.randint(domain[i][0], domain[i][1]))
163 |                for i in range(len(domain))]
164 | 
165 |         while T > 0.1:
166 |             # Choose one of the indices
167 |             i = random.randint(0, len(domain) - 1)
168 | 
169 |             # Choose a direction to change it
170 |             dir = random.randint(-step, step)
171 | 
172 |             # Create a new list with one of the values changed
173 |             vecb = vec[:]
174 |             vecb[i] += dir
175 |             if vecb[i] < domain[i][0]:
176 |                 vecb[i] = domain[i][0]
177 |             elif vecb[i] > domain[i][1]:
178 |                 vecb[i] = domain[i][1]
179 | 
180 |             # Calculate the current cost and the new cost
181 |             ea = costf(vec)
182 |             eb = costf(vecb)
183 |             p = pow(math.e, -(eb - ea) / T)
184 | 
185 |             # Is it better, or does it make the probability
186 |             # cutoff?
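            # (any improvement is always accepted; a worse solution is accepted
            # with probability exp(-(eb - ea) / T), which shrinks as T cools)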
187 |             if (eb < ea or random.random() < p):
188 |                 vec = vecb
189 | 
190 |             # Decrease the temperature
191 |             T = T * cool
192 |         return vec
193 | 
-------------------------------------------------------------------------------- /decorators.py: --------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------------
2 | # Name: decorators
3 | #
4 | # Author: mourad mourafiq
5 | # -------------------------------------------------------------------------------
6 | 
7 | from functools import update_wrapper
8 | import time
9 | 
10 | 
11 | def decorator(d):
12 |     "Make function d a decorator: d wraps a function fn."
13 | 
14 |     def _d(fn):
15 |         return update_wrapper(d(fn), fn)
16 | 
17 |     update_wrapper(_d, d)
18 |     return _d
19 | 
20 | 
21 | @decorator
22 | def trace(f):
23 |     """
24 |     helps debug recursive calls
25 |     """
26 |     indent = ' '
27 | 
28 |     def _f(*args):
29 |         signature = '%s(%s)' % (f.__name__, ', '.join(map(repr, args)))
30 |         print '%s--> %s' % (trace.level * indent, signature)
31 |         trace.level += 1
32 |         try:
33 |             result = f(*args)
34 |             print '%s<-- %s == %s' % ((trace.level - 1) * indent,
35 |                                       signature, result)
36 |         finally:
37 |             trace.level -= 1
38 |         return result
39 | 
40 |     trace.level = 0
41 |     return _f
42 | 
43 | 
44 | @decorator
45 | def timing(f):
46 |     """
47 |     calculates time of computation
48 |     """
49 |     def _f(*args):
50 |         t0 = time.clock()
51 |         result = f(*args)
52 |         t = time.clock()
53 |         # report the elapsed time rather than the two raw clock values
54 |         print '%s took %.6f s' % (f.__name__, t - t0)
55 |         return result
56 |     return _f
57 | 
58 | @decorator
59 | def memo(f):
60 |     """
61 |     a simple caching decorator
62 |     """
63 |     cache = {}
64 | 
65 |     def _f(*args):
66 |         try:
67 |             return cache[args]
68 |         except KeyError:
69 |             cache[args] = result = f(*args)
70 |             return result
71 |         except TypeError:
72 |             return f(*args)  # unhashable args cannot be cached
73 |     return _f
74 | 
75 | 
76 | @decorator
77 | def count_calls(f):
78 |     """
79 |     counts the number of calls to the function f
80 |     """
81 |     def _f(*args):
82 |         callcounts[_f] += 1
83 |         return f(*args)
84 |     callcounts[_f] = 0
85 |     return _f
86 | 
87 | callcounts = {}
88 | 
-------------------------------------------------------------------------------- /dijkstra.py: --------------------------------------------------------------------------------
1 | def dijkstra(graph, node):
2 |     """
3 |     Simulate the dijkstra algorithm in a graph
4 |     """
5 |     distance_to = {}
6 |     distance_to[node] = 0
7 |     distance_path = {}
8 |     while (distance_to):
9 |         # in case we have a disjoint graph
10 |         op_node = min_distance(distance_to)
11 |         distance_path[op_node] = distance_to[op_node]
12 |         del distance_to[op_node]
13 |         for x, x_len in graph[op_node].items():
14 |             if x not in distance_path:
15 |                 if x not in distance_to:
16 |                     distance_to[x] = distance_path[op_node] + x_len
17 |                 elif distance_to[x] > distance_path[op_node] + x_len:
18 |                     distance_to[x] = distance_path[op_node] + x_len
19 |     return distance_path
20 | 

# helper used by dijkstra above (also defined in graph_analysis.py in this repo)
def min_distance(distances):
    """return the node with the smallest tentative distance"""
    return min(distances, key=distances.get)
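
# a minimal usage sketch (hypothetical weighted graph, not part of the original module):
if __name__ == '__main__':
    graph = {
        'a': {'b': 1, 'c': 4},
        'b': {'a': 1, 'c': 2, 'd': 5},
        'c': {'a': 4, 'b': 2, 'd': 1},
        'd': {'b': 5, 'c': 1},
    }
    # expected shortest distances from 'a': {'a': 0, 'b': 1, 'c': 3, 'd': 4}
    print dijkstra(graph, 'a')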
-------------------------------------------------------------------------------- /filters/__init__.py: --------------------------------------------------------------------------------
1 | __author__ = 'mourad'
2 | 
-------------------------------------------------------------------------------- /filters/utils.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | 
5 | def plot_data(xs, ys, c, lw, label, linestyle, **kwargs):
6 |     if ys is not None:
7 |         plt.plot(xs, ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
8 |     else:
9 |         plt.plot(xs, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
10 | 
11 | 
12 | def plot_measurements(xs, ys=None, c='r', lw=2, label='Measurements', linestyle='--', **kwargs):
13 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
14 | 
15 | 
16 | def plot_predictions(xs, ys=None, c='b', lw=2, label='Predictions', linestyle=':', **kwargs):
17 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
18 | 
19 | 
20 | def plot_filter(xs, ys=None, c='g', lw=4, label='Filter', linestyle='-', **kwargs):
21 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
22 | 
23 | 
24 | def plot_track(xs, ys=None, c='k', lw=2, label='Track', linestyle='-', **kwargs):
25 |     plot_data(xs=xs, ys=ys, c=c, lw=lw, linestyle=linestyle, label=label, **kwargs)
26 | 
27 | 
28 | def generate_measurements(x_0, dx, num_measurements, noise, acceleration=0):
29 |     data = []
30 |     for i in xrange(num_measurements):
31 |         data.append(x_0 + dx * i + np.random.randn() * noise)
32 |         dx += acceleration
33 |     return data
34 | 
35 | 
36 | def g_h_filter(measurements, x_0, g, h, dx, dt=1.):
37 |     """
38 |     Performs g-h filter on 1 state variable with a fixed g and h.
39 |     :param measurements: sequence of noisy observations.
40 |     :param x_0: initial value.
41 |     :param g: g scale factor in g-h filter (weight given to the residual when updating the value).
42 |     :param h: h scale factor in g-h filter (weight given to the residual when updating the change rate).
43 |     :param dx: initial change rate.
44 |     :param dt: time step.
45 |     :return: (predictions, filtered_measurements) as numpy arrays.
46 |     """
47 |     x_i = x_0
48 |     predictions = []
49 |     filtered_measurements = []
50 |     for measurement in measurements:
51 |         # predict the value
52 |         x_prediction = x_i + dx * dt
53 |         predictions.append(x_prediction)
54 | 
55 |         # calculate the residual
56 |         residual = measurement - x_prediction
57 | 
58 |         # update the change rate
59 |         dx += h * residual / dt
60 |         # update the current estimate
61 |         x_i = x_prediction + g * residual
62 | 
63 |         filtered_measurements.append(x_i)
64 | 
65 |     return np.array(predictions), np.array(filtered_measurements)
66 | 
67 | 
68 | def plot_g_h_results(measurements, predictions, filtered_data, title='', z_label='Scale'):
69 |     plot_measurements(measurements, label=z_label)
70 |     plot_predictions(predictions)
71 |     plot_filter(filtered_data)
72 |     plt.legend(loc=4)
73 |     plt.title(title)
74 |     plt.gca().set_xlim(left=0, right=len(measurements))
75 |     plt.show()
76 | 
77 | 
78 | test = [
79 |     {'title': 'test', 'x_0': 160, 'dx': 1, 'num_x': 30, 'noise': 3},  # testing assumptions
80 |     {'title': 'bad initial', 'x_0': 5, 'x_0_guess': 30, 'dx': 1, 'num_x': 100, 'noise': 10},  # bad initial guess
81 |     {'title': 'extreme noise', 'x_0': 5, 'dx': 1, 'num_x': 100, 'noise': 100},  # extreme noise
82 |     {'title': 'acceleration', 'x_0': 10, 'dx': 0, 'num_x': 20, 'noise': 0, 'acceleration': 2, 'g': 0.2, 'h': 0.02},
83 |     # acceleration, shows the lag error or systemic error
84 | 
85 |     # varying g, greater g favors measurement instead of prediction
86 |     {'title': 'g = 0.1', 'x_0': 5, 'x_0_guess': 0, 'dx': 5, 'num_x': 100, 'noise': 50, 'g': 0.1},  # g 0.1
87 |     {'title': 'g = 0.5', 'x_0': 5, 'x_0_guess': 0, 'dx': 5, 'num_x': 100, 'noise': 50, 'g': 0.5},  # g 0.5
88 |     {'title': 'g = 0.9', 'x_0': 5, 'x_0_guess': 0, 'dx': 5, 'num_x': 100, 'noise': 50, 'g': 0.9},  # g 0.9
89 | 
90 |     # varying h, greater h makes the filter react rapidly to transient changes
91 |     {
92 |         'title': 'h = 0.05', 'x_0': 0, 'x_0_guess': 0, 'dx': 0, 'num_x': 50, 'noise': 50, 'h': 0.05,
93 |         'measurements': np.linspace(0, 1, 50)
94 |     },  # g 0.1
95 |     {
96 |         'title': 'h = 0.05, bad initial dx',
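        # same small h but a deliberately wrong initial dx (2, against a true
        # slope of ~0.02): a small h corrects a bad rate estimate only slowly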
'x_0': 0, 'x_0_guess': 0, 'dx': 2, 'num_x': 50, 'noise': 50, 'h': 0.05, 97 | 'measurements': np.linspace(0, 1, 50) 98 | }, # g 0.5 99 | { 100 | 'title': 'h = 0.5', 'x_0': 0, 'x_0_guess': 0, 'dx': 2, 'num_x': 50, 'noise': 50, 'h': 0.5, 101 | 'measurements': np.linspace(0, 1, 50) 102 | }, # g 0.9 103 | 104 | 105 | ] 106 | for t in test: 107 | g = t.get('g', 0.2) 108 | h = t.get('h', 0.01) 109 | x_0 = t.get('x_0_guess', t['x_0']) 110 | measurements = t.get('measurements') 111 | if measurements is None: 112 | measurements = generate_measurements(t['x_0'], t['dx'], t['num_x'], t['noise'], t.get('acceleration', 0)) 113 | plt.xlim([0, t['num_x']]) 114 | plot_track([0, t['num_x']], [measurements[0], measurements[t['num_x'] - 1]], label='Actual weight') 115 | xs = xrange(1, t['num_x']+1) 116 | line = np.poly1d(np.polyfit(xs, measurements, 1)) 117 | plot_data(xs, line(xs), label='least squares', c='y', lw=3, linestyle='-') 118 | predictions, filtered_measurements = g_h_filter(measurements=measurements, x_0=x_0, dx=t['dx'], 119 | g=g, h=h, dt=1.) 120 | plot_g_h_results(measurements, predictions, filtered_measurements, title=t['title']) 121 | 122 | measurements = [5, 6, 7, 8, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] 123 | 124 | predictions, filtered_measurements = g_h_filter(measurements=measurements, x_0=4., dx=1., dt=1., g=.302, h=0.054) 125 | plot_g_h_results(measurements, predictions, filtered_measurements, 'g = 0.302, h = 0.054') 126 | 127 | predictions, filtered_measurements = g_h_filter(measurements=measurements, x_0=4., dx=1., dt=1., g=.546, h=0.205) 128 | plot_g_h_results(measurements, predictions, filtered_measurements, 'g = 0.546, h = 0.205') -------------------------------------------------------------------------------- /frequency.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: shingling minhashing 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | 8 | from __future__ import division 9 | from itertools import combinations 10 | 11 | # exemple of baskets 12 | BASKETS = ( 13 | set(('cat', 'and', 'dog', 'bites')), 14 | set(('yahoo', 'news', 'claims', 'a', 'cat', 'mated', 'with', 'dog', 'and', 'produced', 'viable', 'offspring')), 15 | set(('cat', 'killer', 'is', 'a', 'free', 'big', 'dog')), 16 | set(('professional', 'free', 'advice', 'on', 'dog', 'training', 'puppy', 'training')), 17 | set(('cat', 'and', 'kitten', 'training', 'behavior')), 18 | set(('dog', 'cat', 'provides', 'training', 'in', 'eugene', 'oregon')), 19 | set(('dog', 'cat', 'is', 'slang', 'term', 'used', 'by', 'police', 'officers', 'for', 'malefemale', 'relationship')), 20 | set(('shop', 'for', 'your', 'show', 'dog', 'grooming', 'and', 'yet', 'pet', 'supplier')) 21 | ) 22 | 23 | 24 | def frequency(baskets, item): 25 | """ 26 | Frequency of item in baskets 27 | """ 28 | freq = 0 29 | for basket in baskets: 30 | if item <= basket: freq += 1 31 | return freq 32 | 33 | 34 | def frequent(frequency, support): 35 | """ 36 | If frequency of item is bigger than support then it is ferquent 37 | """ 38 | return True if frequency > support else False 39 | 40 | 41 | def confidence(baskets, item1, item2): 42 | """ 43 | Confidence of the rule Item1 -> item2 is the ratio freq2/freq1 44 | """ 45 | item = item1 | item2 46 | freq1 = frequency(baskets=baskets, item=item1) 47 
|     freq2 = frequency(baskets=baskets, item=item)
48 |     # confidence(item1 -> item2) = freq(item1 and item2) / freq(item1)
49 | 
50 |     return freq2 / freq1 if freq1 > 0 else 0
51 | 
52 | 
53 | def interest(baskets, item1, item2):
54 |     """
55 |     the interest of an association rule item1 -> item2 is the difference
56 |     between its confidence and the fraction of baskets that contain item2.
57 |     """
58 |     return confidence(baskets=baskets, item1=item1, item2=item2) - (
59 |         frequency(baskets=baskets, item=item2) / len(baskets))
60 | 
61 | 
62 | def frequent_items(baskets, support):
63 |     """
64 |     Determines which items are frequent
65 |     """
66 |     # items in baskets
67 |     items = set()
68 |     for basket in baskets:
69 |         items |= basket
70 | 
71 |     # first we determine which items are frequent
72 |     items_frequency = {}
73 |     for item in items:
74 |         freq = frequency(baskets, set([item]))
75 |         items_frequency[item] = (freq, frequent(freq, support))
76 |     return [(item, i_frequency) for item, (i_frequency, i_frequent) in items_frequency.items() if i_frequent]
77 | 
78 | 
79 | def frequent_pairs(baskets, support):
80 |     """
81 |     Determines which pairs are frequent
82 |     A-priori algorithm
83 |     """
84 |     singletons = [item for item, i_frequency in frequent_items(baskets, support)]
85 |     pairs_frequency = {}
86 |     for pair in combinations(singletons, 2):
87 |         if pair not in pairs_frequency:
88 |             freq = frequency(baskets, set(pair))
89 |             pairs_frequency[pair] = (freq, frequent(freq, support))
90 |     return [(pair, p_frequency) for pair, (p_frequency, p_frequent) in pairs_frequency.items() if p_frequent]
91 | 
92 | 
93 | def son_algo(baskets, support, fraction_baskets):
94 |     """
95 |     the algorithm of Savasere, Omiecinski and Navathe: divide the input into chunks
96 |     of fraction_baskets baskets, find the frequent items of each chunk for the
97 |     scaled-down support, then keep only candidates that are frequent over the whole input
98 |     """
99 |     candidates = set()
100 |     nbr_baskets = len(baskets)
101 |     for i in range(0, nbr_baskets, fraction_baskets):
102 |         candidates |= set(item for item, i_frequency in frequent_items(baskets[i:i + fraction_baskets], (fraction_baskets / nbr_baskets) * support))
103 |     return [item for item in candidates if frequent(frequency(baskets, set([item])), support)]
104 | 
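
# a minimal usage sketch on the example BASKETS above (the support threshold
# is an illustrative assumption):
if __name__ == '__main__':
    print frequent_items(BASKETS, support=3)
    print frequent_pairs(BASKETS, support=3)
    print confidence(BASKETS, set(['dog']), set(['cat']))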
47 | """ 48 | if node1 == node2: 49 | return [node1] 50 | explored = [] 51 | to_explore = [[node1]] 52 | while to_explore: 53 | path = to_explore.pop(0) 54 | s = path[-1] 55 | for successor in graph[s].keys(): 56 | if successor not in explored: 57 | explored.append(successor) 58 | path2 = path + [successor] 59 | if node2 == successor: 60 | return path2 61 | to_explore.append(path2) 62 | return [] 63 | 64 | 65 | def longest_path(graph, node=None): 66 | """ 67 | Returns the longest path in the graph if node is None 68 | I f node is not None, then it returns the longest path from node 69 | """ 70 | if node is not None: 71 | return max([shortest_path(graph, node, successor) for successor in graph.keys()], key=len) 72 | return max([shortest_path(graph, a, b) for a in graph.keys() for b in graph.keys()], key=len) 73 | 74 | 75 | def centrality(graph, node): 76 | """ 77 | Returns the centrality of node in graph 78 | """ 79 | return sum([len(shortest_path(graph, node, successor)) for successor in graph.keys()]) / len(graph.keys()) 80 | 81 | 82 | def indep_graphs(graph): 83 | """ 84 | Returns the independent graphs in the current graph 85 | """ 86 | graphs = [] 87 | 88 | def which_graph(node): 89 | for g in graphs: 90 | if node in g: return g 91 | return {} 92 | 93 | for node in graph.keys(): 94 | g = add_node(which_graph(node), graph, node) 95 | if g not in graphs: graphs.append(g) 96 | return graphs 97 | 98 | 99 | def graph_for_node(graph, node): 100 | """ 101 | Returns the independent graph containing node 102 | """ 103 | return add_node({}, graph, node) 104 | 105 | 106 | def check_pairwise_connectivity(graph, node1, node2): 107 | """ 108 | checks the connectivity between two nodes, 109 | and returns True if connected, otherwise False 110 | """ 111 | return True if node2 in graph_for_node(graph, node1) else False 112 | 113 | 114 | def clustering_coef(graph, node, verbose=False): 115 | """ 116 | calculates the clustering coef for a particular node in the graph 117 | let Dn = node degree 118 | Vn = number of links between neighbors of the node 119 | """ 120 | neighbors = graph[node].keys() 121 | Dn = len(neighbors) 122 | if Dn == 0: return Dn; 123 | Vn = 0 124 | for neighbor1 in neighbors: 125 | index1 = neighbors.index(neighbor1) 126 | for neighbor2 in neighbors: 127 | index2 = neighbors.index(neighbor2) 128 | if index1 < index2 and neighbor2 in graph[neighbor1]: Vn += 1 129 | coef = (2 * Vn) / (Dn * (Dn - 1)) 130 | if verbose: print '%s\'s degree : %s, links between neighbors : %s. 
Culestering coef : %s' % (node, Dn, Vn, coef) 131 | return coef 132 | 133 | 134 | def random_clustering_coef(graph, node, nbr_iterations=1000000): 135 | """ 136 | calculates the estimate clustering coef for a particular node in the graph 137 | """ 138 | vindex = {} 139 | d = 0 140 | for w in graph[node].keys(): 141 | vindex[d] = w 142 | d += 1 143 | 144 | total = 0 145 | for i in range(1, nbr_iterations): 146 | if d > 1: 147 | pick = random.randint(0, d - 1) 148 | v1 = vindex[pick] 149 | v2 = vindex[(pick + random.randint(1, d - 1)) % d] 150 | if v2 in graph[v1]: total += 1 151 | print i, (total + 0.0) / i 152 | 153 | 154 | def average_cluestering(graph, verbose=True): 155 | average = sum([clustering_coef(graph, node, verbose=verbose) for node in graph]) / len(graph) 156 | if verbose: print average 157 | return average 158 | 159 | 160 | def dijkstra(graph, node): 161 | """ 162 | Simulate the dijkstra algorithm in a graph 163 | """ 164 | distance_to = {} 165 | distance_to[node] = 0 166 | distance_path = {} 167 | while (distance_to): 168 | # in case we have a disjoint graph 169 | op_node = min_distance(distance_to) 170 | distance_path[op_node] = distance_to[op_node] 171 | del distance_to[op_node] 172 | for x, x_len in graph[op_node].items(): 173 | if x not in distance_path: 174 | if x not in distance_to: 175 | distance_to[x] = distance_path[op_node] + x_len 176 | elif distance_to[x] > distance_path[op_node] + x_len: 177 | distance_to[x] = distance_path[op_node] + x_len 178 | return distance_path 179 | 180 | 181 | def min_distance(distances): 182 | """ 183 | return the element with the min distance 184 | """ 185 | min = (-1, -1) 186 | for node, node_len in distances.items(): 187 | if min[1] > node_len or min[1] == -1: 188 | min = (node, node_len) 189 | return min[0] 190 | 191 | 192 | def dijkstra_heap(graph, node): 193 | """ 194 | Simulate the dijkstra algorithm in a graph 195 | """ 196 | track_distance = {} 197 | track_distance[node] = 0 198 | distance_to = [] 199 | heapq.heappush(distance_to, (0, node)) 200 | distance_path = {} 201 | while (distance_to): 202 | # in case we have a disjoint graph 203 | #op_node = min_distance(distance_to) 204 | #distance_path[op_node] = distance_to[op_node] 205 | #del distance_to[op_node] 206 | ind, op_node = heapq.heappop(distance_to) 207 | if op_node not in distance_path or ind < distance_path[op_node]: 208 | distance_path[op_node] = ind 209 | for x, x_len in graph[op_node].items(): 210 | if x not in distance_path: 211 | if x not in track_distance: 212 | track_distance[x] = distance_path[op_node] + x_len 213 | heapq.heappush(distance_to, (track_distance[x], x)) 214 | elif track_distance[x] > distance_path[op_node] + x_len: 215 | track_distance[x] = distance_path[op_node] + x_len 216 | heapq.heappush(distance_to, (track_distance[x], x)) 217 | return distance_path 218 | 219 | 220 | # heap functions 221 | 222 | def parent(i): return (i - 1) / 2 223 | 224 | 225 | def left_child(i): return 2 * i + 1 226 | 227 | 228 | def right_child(i): return 2 * i + 2 229 | 230 | 231 | def is_leaf(heap_list, i): return (left_child(i) >= len(heap_list)) and (right_child(i) >= len(heap_list)) 232 | 233 | 234 | def has_one_child(heap_list, i): return (left_child(i) < len(heap_list)) and (right_child(i) >= len(heap_list)) 235 | 236 | 237 | # Call this routine if the heap rooted at i satisfies the heap property 238 | # *except* perhaps i to its immediate children 239 | def down_heapify(heap_list, i): 240 | # If i is a leaf, heap property holds 241 | if is_leaf(heap_list, i): 242 | 
        return
243 |     # If i has one child...
244 |     if has_one_child(heap_list, i):
245 |         # check heap property
246 |         if heap_list[i] > heap_list[left_child(i)]:
247 |             # If it fails, swap, fixing i and its child (a leaf)
248 |             (heap_list[i], heap_list[left_child(i)]) = (heap_list[left_child(i)], heap_list[i])
249 |         return
250 |     # If i has two children...
251 |     # check heap property
252 |     if min(heap_list[left_child(i)], heap_list[right_child(i)]) >= heap_list[i]:
253 |         return
254 |     # If it fails, see which child is the smaller
255 |     # and swap i's value into that child
256 |     # Afterwards, recurse into that child, which might now violate the heap property
257 |     if heap_list[left_child(i)] < heap_list[right_child(i)]:
258 |         # Swap into left child
259 |         (heap_list[i], heap_list[left_child(i)]) = (heap_list[left_child(i)], heap_list[i])
260 |         down_heapify(heap_list, left_child(i))
261 |         return
262 |     else:
263 |         (heap_list[i], heap_list[right_child(i)]) = (heap_list[right_child(i)], heap_list[i])
264 |         down_heapify(heap_list, right_child(i))
265 |         return
266 | 
267 | 
268 | def build_heap(heap_list):
269 |     for i in range(len(heap_list) - 1, -1, -1):
270 |         down_heapify(heap_list, i)
271 |     return heap_list
272 | 
273 | 
274 | def remove_min_heap(heap_list):
275 |     last = heap_list.pop()
276 |     if heap_list: heap_list[0] = last; down_heapify(heap_list, 0)
277 |     return heap_list
278 | 
279 | 
280 | def sort_heap(heap_list):
281 |     sorted_heap = []
282 |     while len(heap_list) > 0:
283 |         sorted_heap.append(heap_list[0])  # the root is the current minimum
284 |         remove_min_heap(heap_list)
285 |     return sorted_heap
286 | 
287 | 
288 | class bipartite_characteristics(object):
289 |     """
290 |     Returns the characteristics of the bipartite graph based on:
291 |     the n number of first nodes
292 |     the m number of second nodes
293 |     """
294 | 
295 |     def __init__(self, graph, n_nodes, m_nodes):
296 |         self.graph = graph
297 |         self.n_nodes = n_nodes
298 |         self.m_nodes = m_nodes
299 | 
300 |     def max_edges(self): return self.m_nodes * self.n_nodes
301 | 
302 |     def min_edges(self): return (self.n_nodes + self.m_nodes) - 1
303 | 
304 |     def max_length(self): return min(2 * self.n_nodes, 2 * self.m_nodes, (self.n_nodes + self.m_nodes) - 1)
305 | 
306 |     def max_clustering_coef(self): return 0
307 | 
-------------------------------------------------------------------------------- /licence: --------------------------------------------------------------------------------
1 | Copyright (c) Mourad MOURAFIQ and individual contributors.
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 | 
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | 
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 | 
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /map_reduce/README.md: -------------------------------------------------------------------------------- 1 | map-reduce 2 | ========== 3 | 4 | Implementation of map reduce, and some examples. 5 | 6 | 7 | Contains : 8 | 9 | - Map Reduce class 10 | - Estimation of pi number 11 | - Calculation of frequency of Items from multiple files 12 | -------------------------------------------------------------------------------- /map_reduce/item_frequency.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: calculating items frequency 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | import multiprocessing 8 | import string 9 | 10 | from map_reduce import MapReduce 11 | 12 | 13 | def map_words(filename): 14 | """Read a file and return a sequence of (word, occurances) values. 15 | """ 16 | STOP_WORDS = set([ 17 | 'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in', 18 | 'is', 'it', 'of', 'or', 'py', 'rst', 'that', 'the', 'to', 'with', 19 | ]) 20 | TR = string.maketrans(string.punctuation, ' ' * len(string.punctuation)) 21 | 22 | print multiprocessing.current_process().name, 'processing ', filename 23 | output = [] 24 | 25 | with open(filename, 'rt') as f: 26 | for line in f: 27 | if line.lstrip().startswith('..'): # Skip rst comment lines 28 | continue 29 | line = line.translate(TR) # Strip punctuation 30 | for word in line.split(): 31 | word = word.lower() 32 | if word.isalpha() and word not in STOP_WORDS: 33 | output.append((word, 1)) 34 | return output 35 | 36 | 37 | def words_frequency(item): 38 | """Convert the partitioned data for a word to a 39 | tuple containing the word and the number of occurances. 
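    (This is the reduce step: `item` arrives from the partition phase as a
    (word, [1, 1, ...]) pair.)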
40 | """ 41 | word, occurances = item 42 | return (word, sum(occurances)) 43 | 44 | 45 | if __name__ == '__main__': 46 | import operator 47 | import glob 48 | 49 | input_files = glob.glob('./*.txt') # linux notation for directories 50 | 51 | mapper = MapReduce(map_words, words_frequency) 52 | word_counts = mapper(input_files) 53 | word_counts.sort(key=operator.itemgetter(1)) 54 | word_counts.reverse() 55 | 56 | print '\nTOP 20 Iems by frequency\n' 57 | top20 = word_counts[:20] 58 | longest = max(len(word) for word, count in top20) 59 | for word, count in top20: 60 | print '%-*s: %5s' % (longest + 1, word, count) 61 | -------------------------------------------------------------------------------- /map_reduce/map_reduce.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: map reduce class 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | import collections 9 | import itertools 10 | import multiprocessing 11 | 12 | 13 | class MapReduce(object): 14 | """ 15 | The map reduce object, should be initialized with: 16 | map_fn 17 | reduce_fn 18 | nbr_workers 19 | """ 20 | 21 | def __init__(self, map_fn, reduce_fn, num_workers=None): 22 | """ 23 | initiaize the mapreduce object 24 | map_fn : Function to map inputs to intermediate data, takes as 25 | input one arg and returns a tuple (key, value) 26 | reduce_fn : Function to reduce intermediate data to final result 27 | takes as arg keys as produced from the map, and the values associated with it 28 | """ 29 | self.map_fn = map_fn 30 | self.reduce_fn = reduce_fn 31 | self.pool = multiprocessing.Pool(num_workers) 32 | 33 | def partition(self, mapped_values): 34 | """ 35 | returns the mapped_values organised by their keys. (keys, associated values) 36 | """ 37 | organised_data = collections.defaultdict(list) 38 | for key, value in mapped_values: 39 | organised_data[key].append(value) 40 | return organised_data.items() 41 | 42 | def __call__(self, inputs=None, chunk_size=1): 43 | """ 44 | process the data through the map reduce functions. 45 | inputs : iterable 46 | chank_size : amount of data to hand to each worker 47 | """ 48 | mapped_data = self.pool.map(self.map_fn, inputs, chunksize=chunk_size) 49 | partioned_data = self.partition(itertools.chain(*mapped_data)) 50 | reduced_data = self.pool.map(self.reduce_fn, partioned_data) 51 | return reduced_data 52 | 53 | -------------------------------------------------------------------------------- /map_reduce/pi_estimation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: estimation of pi with map reduce 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | import multiprocessing 9 | import numpy 10 | import random 11 | from map_reduce import MapReduce 12 | 13 | NBR_POINTS = 1000000 14 | RADIUQ = numpy.sqrt(NBR_POINTS) 15 | NBR_WORKERS = 4 16 | NBR_PER_WORKER = NBR_POINTS / NBR_WORKERS 17 | 18 | 19 | def probability_calculation(item): 20 | """Read a file and return a sequence of (word, occurances) values. 
21 | """ 22 | 23 | print multiprocessing.current_process().name, 'calculating', item 24 | output = [] 25 | IN_CIRCLE = 0 26 | for i in range(int(NBR_PER_WORKER)): 27 | x = numpy.random.randint(0, RADIUQ) 28 | y = numpy.random.randint(0, RADIUQ) 29 | if (numpy.sqrt(x ** 2 + y ** 2) < RADIUQ): 30 | IN_CIRCLE += 1 31 | output.append(('pi', IN_CIRCLE)) 32 | return output 33 | 34 | 35 | def estimate_pi(item): 36 | """Convert the partitioned data for a word to a 37 | tuple containing the word and the number of occurances. 38 | """ 39 | key, occurances = item 40 | return (sum(occurances) / NBR_POINTS) * 4 41 | 42 | 43 | if __name__ == '__main__': 44 | mapper = MapReduce(probability_calculation, estimate_pi) 45 | pi = mapper([i for i in range(NBR_WORKERS)]) 46 | print pi 47 | -------------------------------------------------------------------------------- /movielens/u.item: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmourafiq/data-analysis/1df2ca020a554f1fdab7cc9e53115e249cc199ac/movielens/u.item -------------------------------------------------------------------------------- /movielens/u.user: -------------------------------------------------------------------------------- 1 | 1|24|M|technician|85711 2 | 2|53|F|other|94043 3 | 3|23|M|writer|32067 4 | 4|24|M|technician|43537 5 | 5|33|F|other|15213 6 | 6|42|M|executive|98101 7 | 7|57|M|administrator|91344 8 | 8|36|M|administrator|05201 9 | 9|29|M|student|01002 10 | 10|53|M|lawyer|90703 11 | 11|39|F|other|30329 12 | 12|28|F|other|06405 13 | 13|47|M|educator|29206 14 | 14|45|M|scientist|55106 15 | 15|49|F|educator|97301 16 | 16|21|M|entertainment|10309 17 | 17|30|M|programmer|06355 18 | 18|35|F|other|37212 19 | 19|40|M|librarian|02138 20 | 20|42|F|homemaker|95660 21 | 21|26|M|writer|30068 22 | 22|25|M|writer|40206 23 | 23|30|F|artist|48197 24 | 24|21|F|artist|94533 25 | 25|39|M|engineer|55107 26 | 26|49|M|engineer|21044 27 | 27|40|F|librarian|30030 28 | 28|32|M|writer|55369 29 | 29|41|M|programmer|94043 30 | 30|7|M|student|55436 31 | 31|24|M|artist|10003 32 | 32|28|F|student|78741 33 | 33|23|M|student|27510 34 | 34|38|F|administrator|42141 35 | 35|20|F|homemaker|42459 36 | 36|19|F|student|93117 37 | 37|23|M|student|55105 38 | 38|28|F|other|54467 39 | 39|41|M|entertainment|01040 40 | 40|38|M|scientist|27514 41 | 41|33|M|engineer|80525 42 | 42|30|M|administrator|17870 43 | 43|29|F|librarian|20854 44 | 44|26|M|technician|46260 45 | 45|29|M|programmer|50233 46 | 46|27|F|marketing|46538 47 | 47|53|M|marketing|07102 48 | 48|45|M|administrator|12550 49 | 49|23|F|student|76111 50 | 50|21|M|writer|52245 51 | 51|28|M|educator|16509 52 | 52|18|F|student|55105 53 | 53|26|M|programmer|55414 54 | 54|22|M|executive|66315 55 | 55|37|M|programmer|01331 56 | 56|25|M|librarian|46260 57 | 57|16|M|none|84010 58 | 58|27|M|programmer|52246 59 | 59|49|M|educator|08403 60 | 60|50|M|healthcare|06472 61 | 61|36|M|engineer|30040 62 | 62|27|F|administrator|97214 63 | 63|31|M|marketing|75240 64 | 64|32|M|educator|43202 65 | 65|51|F|educator|48118 66 | 66|23|M|student|80521 67 | 67|17|M|student|60402 68 | 68|19|M|student|22904 69 | 69|24|M|engineer|55337 70 | 70|27|M|engineer|60067 71 | 71|39|M|scientist|98034 72 | 72|48|F|administrator|73034 73 | 73|24|M|student|41850 74 | 74|39|M|scientist|T8H1N 75 | 75|24|M|entertainment|08816 76 | 76|20|M|student|02215 77 | 77|30|M|technician|29379 78 | 78|26|M|administrator|61801 79 | 79|39|F|administrator|03755 80 | 80|34|F|administrator|52241 81 | 81|21|M|student|21218 
82 | 82|50|M|programmer|22902 83 | 83|40|M|other|44133 84 | 84|32|M|executive|55369 85 | 85|51|M|educator|20003 86 | 86|26|M|administrator|46005 87 | 87|47|M|administrator|89503 88 | 88|49|F|librarian|11701 89 | 89|43|F|administrator|68106 90 | 90|60|M|educator|78155 91 | 91|55|M|marketing|01913 92 | 92|32|M|entertainment|80525 93 | 93|48|M|executive|23112 94 | 94|26|M|student|71457 95 | 95|31|M|administrator|10707 96 | 96|25|F|artist|75206 97 | 97|43|M|artist|98006 98 | 98|49|F|executive|90291 99 | 99|20|M|student|63129 100 | 100|36|M|executive|90254 101 | 101|15|M|student|05146 102 | 102|38|M|programmer|30220 103 | 103|26|M|student|55108 104 | 104|27|M|student|55108 105 | 105|24|M|engineer|94043 106 | 106|61|M|retired|55125 107 | 107|39|M|scientist|60466 108 | 108|44|M|educator|63130 109 | 109|29|M|other|55423 110 | 110|19|M|student|77840 111 | 111|57|M|engineer|90630 112 | 112|30|M|salesman|60613 113 | 113|47|M|executive|95032 114 | 114|27|M|programmer|75013 115 | 115|31|M|engineer|17110 116 | 116|40|M|healthcare|97232 117 | 117|20|M|student|16125 118 | 118|21|M|administrator|90210 119 | 119|32|M|programmer|67401 120 | 120|47|F|other|06260 121 | 121|54|M|librarian|99603 122 | 122|32|F|writer|22206 123 | 123|48|F|artist|20008 124 | 124|34|M|student|60615 125 | 125|30|M|lawyer|22202 126 | 126|28|F|lawyer|20015 127 | 127|33|M|none|73439 128 | 128|24|F|marketing|20009 129 | 129|36|F|marketing|07039 130 | 130|20|M|none|60115 131 | 131|59|F|administrator|15237 132 | 132|24|M|other|94612 133 | 133|53|M|engineer|78602 134 | 134|31|M|programmer|80236 135 | 135|23|M|student|38401 136 | 136|51|M|other|97365 137 | 137|50|M|educator|84408 138 | 138|46|M|doctor|53211 139 | 139|20|M|student|08904 140 | 140|30|F|student|32250 141 | 141|49|M|programmer|36117 142 | 142|13|M|other|48118 143 | 143|42|M|technician|08832 144 | 144|53|M|programmer|20910 145 | 145|31|M|entertainment|V3N4P 146 | 146|45|M|artist|83814 147 | 147|40|F|librarian|02143 148 | 148|33|M|engineer|97006 149 | 149|35|F|marketing|17325 150 | 150|20|F|artist|02139 151 | 151|38|F|administrator|48103 152 | 152|33|F|educator|68767 153 | 153|25|M|student|60641 154 | 154|25|M|student|53703 155 | 155|32|F|other|11217 156 | 156|25|M|educator|08360 157 | 157|57|M|engineer|70808 158 | 158|50|M|educator|27606 159 | 159|23|F|student|55346 160 | 160|27|M|programmer|66215 161 | 161|50|M|lawyer|55104 162 | 162|25|M|artist|15610 163 | 163|49|M|administrator|97212 164 | 164|47|M|healthcare|80123 165 | 165|20|F|other|53715 166 | 166|47|M|educator|55113 167 | 167|37|M|other|L9G2B 168 | 168|48|M|other|80127 169 | 169|52|F|other|53705 170 | 170|53|F|healthcare|30067 171 | 171|48|F|educator|78750 172 | 172|55|M|marketing|22207 173 | 173|56|M|other|22306 174 | 174|30|F|administrator|52302 175 | 175|26|F|scientist|21911 176 | 176|28|M|scientist|07030 177 | 177|20|M|programmer|19104 178 | 178|26|M|other|49512 179 | 179|15|M|entertainment|20755 180 | 180|22|F|administrator|60202 181 | 181|26|M|executive|21218 182 | 182|36|M|programmer|33884 183 | 183|33|M|scientist|27708 184 | 184|37|M|librarian|76013 185 | 185|53|F|librarian|97403 186 | 186|39|F|executive|00000 187 | 187|26|M|educator|16801 188 | 188|42|M|student|29440 189 | 189|32|M|artist|95014 190 | 190|30|M|administrator|95938 191 | 191|33|M|administrator|95161 192 | 192|42|M|educator|90840 193 | 193|29|M|student|49931 194 | 194|38|M|administrator|02154 195 | 195|42|M|scientist|93555 196 | 196|49|M|writer|55105 197 | 197|55|M|technician|75094 198 | 198|21|F|student|55414 199 | 199|30|M|writer|17604 200 | 
200|40|M|programmer|93402 201 | 201|27|M|writer|E2A4H 202 | 202|41|F|educator|60201 203 | 203|25|F|student|32301 204 | 204|52|F|librarian|10960 205 | 205|47|M|lawyer|06371 206 | 206|14|F|student|53115 207 | 207|39|M|marketing|92037 208 | 208|43|M|engineer|01720 209 | 209|33|F|educator|85710 210 | 210|39|M|engineer|03060 211 | 211|66|M|salesman|32605 212 | 212|49|F|educator|61401 213 | 213|33|M|executive|55345 214 | 214|26|F|librarian|11231 215 | 215|35|M|programmer|63033 216 | 216|22|M|engineer|02215 217 | 217|22|M|other|11727 218 | 218|37|M|administrator|06513 219 | 219|32|M|programmer|43212 220 | 220|30|M|librarian|78205 221 | 221|19|M|student|20685 222 | 222|29|M|programmer|27502 223 | 223|19|F|student|47906 224 | 224|31|F|educator|43512 225 | 225|51|F|administrator|58202 226 | 226|28|M|student|92103 227 | 227|46|M|executive|60659 228 | 228|21|F|student|22003 229 | 229|29|F|librarian|22903 230 | 230|28|F|student|14476 231 | 231|48|M|librarian|01080 232 | 232|45|M|scientist|99709 233 | 233|38|M|engineer|98682 234 | 234|60|M|retired|94702 235 | 235|37|M|educator|22973 236 | 236|44|F|writer|53214 237 | 237|49|M|administrator|63146 238 | 238|42|F|administrator|44124 239 | 239|39|M|artist|95628 240 | 240|23|F|educator|20784 241 | 241|26|F|student|20001 242 | 242|33|M|educator|31404 243 | 243|33|M|educator|60201 244 | 244|28|M|technician|80525 245 | 245|22|M|student|55109 246 | 246|19|M|student|28734 247 | 247|28|M|engineer|20770 248 | 248|25|M|student|37235 249 | 249|25|M|student|84103 250 | 250|29|M|executive|95110 251 | 251|28|M|doctor|85032 252 | 252|42|M|engineer|07733 253 | 253|26|F|librarian|22903 254 | 254|44|M|educator|42647 255 | 255|23|M|entertainment|07029 256 | 256|35|F|none|39042 257 | 257|17|M|student|77005 258 | 258|19|F|student|77801 259 | 259|21|M|student|48823 260 | 260|40|F|artist|89801 261 | 261|28|M|administrator|85202 262 | 262|19|F|student|78264 263 | 263|41|M|programmer|55346 264 | 264|36|F|writer|90064 265 | 265|26|M|executive|84601 266 | 266|62|F|administrator|78756 267 | 267|23|M|engineer|83716 268 | 268|24|M|engineer|19422 269 | 269|31|F|librarian|43201 270 | 270|18|F|student|63119 271 | 271|51|M|engineer|22932 272 | 272|33|M|scientist|53706 273 | 273|50|F|other|10016 274 | 274|20|F|student|55414 275 | 275|38|M|engineer|92064 276 | 276|21|M|student|95064 277 | 277|35|F|administrator|55406 278 | 278|37|F|librarian|30033 279 | 279|33|M|programmer|85251 280 | 280|30|F|librarian|22903 281 | 281|15|F|student|06059 282 | 282|22|M|administrator|20057 283 | 283|28|M|programmer|55305 284 | 284|40|M|executive|92629 285 | 285|25|M|programmer|53713 286 | 286|27|M|student|15217 287 | 287|21|M|salesman|31211 288 | 288|34|M|marketing|23226 289 | 289|11|M|none|94619 290 | 290|40|M|engineer|93550 291 | 291|19|M|student|44106 292 | 292|35|F|programmer|94703 293 | 293|24|M|writer|60804 294 | 294|34|M|technician|92110 295 | 295|31|M|educator|50325 296 | 296|43|F|administrator|16803 297 | 297|29|F|educator|98103 298 | 298|44|M|executive|01581 299 | 299|29|M|doctor|63108 300 | 300|26|F|programmer|55106 301 | 301|24|M|student|55439 302 | 302|42|M|educator|77904 303 | 303|19|M|student|14853 304 | 304|22|F|student|71701 305 | 305|23|M|programmer|94086 306 | 306|45|M|other|73132 307 | 307|25|M|student|55454 308 | 308|60|M|retired|95076 309 | 309|40|M|scientist|70802 310 | 310|37|M|educator|91711 311 | 311|32|M|technician|73071 312 | 312|48|M|other|02110 313 | 313|41|M|marketing|60035 314 | 314|20|F|student|08043 315 | 315|31|M|educator|18301 316 | 316|43|F|other|77009 317 | 
317|22|M|administrator|13210 318 | 318|65|M|retired|06518 319 | 319|38|M|programmer|22030 320 | 320|19|M|student|24060 321 | 321|49|F|educator|55413 322 | 322|20|M|student|50613 323 | 323|21|M|student|19149 324 | 324|21|F|student|02176 325 | 325|48|M|technician|02139 326 | 326|41|M|administrator|15235 327 | 327|22|M|student|11101 328 | 328|51|M|administrator|06779 329 | 329|48|M|educator|01720 330 | 330|35|F|educator|33884 331 | 331|33|M|entertainment|91344 332 | 332|20|M|student|40504 333 | 333|47|M|other|V0R2M 334 | 334|32|M|librarian|30002 335 | 335|45|M|executive|33775 336 | 336|23|M|salesman|42101 337 | 337|37|M|scientist|10522 338 | 338|39|F|librarian|59717 339 | 339|35|M|lawyer|37901 340 | 340|46|M|engineer|80123 341 | 341|17|F|student|44405 342 | 342|25|F|other|98006 343 | 343|43|M|engineer|30093 344 | 344|30|F|librarian|94117 345 | 345|28|F|librarian|94143 346 | 346|34|M|other|76059 347 | 347|18|M|student|90210 348 | 348|24|F|student|45660 349 | 349|68|M|retired|61455 350 | 350|32|M|student|97301 351 | 351|61|M|educator|49938 352 | 352|37|F|programmer|55105 353 | 353|25|M|scientist|28480 354 | 354|29|F|librarian|48197 355 | 355|25|M|student|60135 356 | 356|32|F|homemaker|92688 357 | 357|26|M|executive|98133 358 | 358|40|M|educator|10022 359 | 359|22|M|student|61801 360 | 360|51|M|other|98027 361 | 361|22|M|student|44074 362 | 362|35|F|homemaker|85233 363 | 363|20|M|student|87501 364 | 364|63|M|engineer|01810 365 | 365|29|M|lawyer|20009 366 | 366|20|F|student|50670 367 | 367|17|M|student|37411 368 | 368|18|M|student|92113 369 | 369|24|M|student|91335 370 | 370|52|M|writer|08534 371 | 371|36|M|engineer|99206 372 | 372|25|F|student|66046 373 | 373|24|F|other|55116 374 | 374|36|M|executive|78746 375 | 375|17|M|entertainment|37777 376 | 376|28|F|other|10010 377 | 377|22|M|student|18015 378 | 378|35|M|student|02859 379 | 379|44|M|programmer|98117 380 | 380|32|M|engineer|55117 381 | 381|33|M|artist|94608 382 | 382|45|M|engineer|01824 383 | 383|42|M|administrator|75204 384 | 384|52|M|programmer|45218 385 | 385|36|M|writer|10003 386 | 386|36|M|salesman|43221 387 | 387|33|M|entertainment|37412 388 | 388|31|M|other|36106 389 | 389|44|F|writer|83702 390 | 390|42|F|writer|85016 391 | 391|23|M|student|84604 392 | 392|52|M|writer|59801 393 | 393|19|M|student|83686 394 | 394|25|M|administrator|96819 395 | 395|43|M|other|44092 396 | 396|57|M|engineer|94551 397 | 397|17|M|student|27514 398 | 398|40|M|other|60008 399 | 399|25|M|other|92374 400 | 400|33|F|administrator|78213 401 | 401|46|F|healthcare|84107 402 | 402|30|M|engineer|95129 403 | 403|37|M|other|06811 404 | 404|29|F|programmer|55108 405 | 405|22|F|healthcare|10019 406 | 406|52|M|educator|93109 407 | 407|29|M|engineer|03261 408 | 408|23|M|student|61755 409 | 409|48|M|administrator|98225 410 | 410|30|F|artist|94025 411 | 411|34|M|educator|44691 412 | 412|25|M|educator|15222 413 | 413|55|M|educator|78212 414 | 414|24|M|programmer|38115 415 | 415|39|M|educator|85711 416 | 416|20|F|student|92626 417 | 417|27|F|other|48103 418 | 418|55|F|none|21206 419 | 419|37|M|lawyer|43215 420 | 420|53|M|educator|02140 421 | 421|38|F|programmer|55105 422 | 422|26|M|entertainment|94533 423 | 423|64|M|other|91606 424 | 424|36|F|marketing|55422 425 | 425|19|M|student|58644 426 | 426|55|M|educator|01602 427 | 427|51|M|doctor|85258 428 | 428|28|M|student|55414 429 | 429|27|M|student|29205 430 | 430|38|M|scientist|98199 431 | 431|24|M|marketing|92629 432 | 432|22|M|entertainment|50311 433 | 433|27|M|artist|11211 434 | 434|16|F|student|49705 435 | 
435|24|M|engineer|60007 436 | 436|30|F|administrator|17345 437 | 437|27|F|other|20009 438 | 438|51|F|administrator|43204 439 | 439|23|F|administrator|20817 440 | 440|30|M|other|48076 441 | 441|50|M|technician|55013 442 | 442|22|M|student|85282 443 | 443|35|M|salesman|33308 444 | 444|51|F|lawyer|53202 445 | 445|21|M|writer|92653 446 | 446|57|M|educator|60201 447 | 447|30|M|administrator|55113 448 | 448|23|M|entertainment|10021 449 | 449|23|M|librarian|55021 450 | 450|35|F|educator|11758 451 | 451|16|M|student|48446 452 | 452|35|M|administrator|28018 453 | 453|18|M|student|06333 454 | 454|57|M|other|97330 455 | 455|48|M|administrator|83709 456 | 456|24|M|technician|31820 457 | 457|33|F|salesman|30011 458 | 458|47|M|technician|Y1A6B 459 | 459|22|M|student|29201 460 | 460|44|F|other|60630 461 | 461|15|M|student|98102 462 | 462|19|F|student|02918 463 | 463|48|F|healthcare|75218 464 | 464|60|M|writer|94583 465 | 465|32|M|other|05001 466 | 466|22|M|student|90804 467 | 467|29|M|engineer|91201 468 | 468|28|M|engineer|02341 469 | 469|60|M|educator|78628 470 | 470|24|M|programmer|10021 471 | 471|10|M|student|77459 472 | 472|24|M|student|87544 473 | 473|29|M|student|94708 474 | 474|51|M|executive|93711 475 | 475|30|M|programmer|75230 476 | 476|28|M|student|60440 477 | 477|23|F|student|02125 478 | 478|29|M|other|10019 479 | 479|30|M|educator|55409 480 | 480|57|M|retired|98257 481 | 481|73|M|retired|37771 482 | 482|18|F|student|40256 483 | 483|29|M|scientist|43212 484 | 484|27|M|student|21208 485 | 485|44|F|educator|95821 486 | 486|39|M|educator|93101 487 | 487|22|M|engineer|92121 488 | 488|48|M|technician|21012 489 | 489|55|M|other|45218 490 | 490|29|F|artist|V5A2B 491 | 491|43|F|writer|53711 492 | 492|57|M|educator|94618 493 | 493|22|M|engineer|60090 494 | 494|38|F|administrator|49428 495 | 495|29|M|engineer|03052 496 | 496|21|F|student|55414 497 | 497|20|M|student|50112 498 | 498|26|M|writer|55408 499 | 499|42|M|programmer|75006 500 | 500|28|M|administrator|94305 501 | 501|22|M|student|10025 502 | 502|22|M|student|23092 503 | 503|50|F|writer|27514 504 | 504|40|F|writer|92115 505 | 505|27|F|other|20657 506 | 506|46|M|programmer|03869 507 | 507|18|F|writer|28450 508 | 508|27|M|marketing|19382 509 | 509|23|M|administrator|10011 510 | 510|34|M|other|98038 511 | 511|22|M|student|21250 512 | 512|29|M|other|20090 513 | 513|43|M|administrator|26241 514 | 514|27|M|programmer|20707 515 | 515|53|M|marketing|49508 516 | 516|53|F|librarian|10021 517 | 517|24|M|student|55454 518 | 518|49|F|writer|99709 519 | 519|22|M|other|55320 520 | 520|62|M|healthcare|12603 521 | 521|19|M|student|02146 522 | 522|36|M|engineer|55443 523 | 523|50|F|administrator|04102 524 | 524|56|M|educator|02159 525 | 525|27|F|administrator|19711 526 | 526|30|M|marketing|97124 527 | 527|33|M|librarian|12180 528 | 528|18|M|student|55104 529 | 529|47|F|administrator|44224 530 | 530|29|M|engineer|94040 531 | 531|30|F|salesman|97408 532 | 532|20|M|student|92705 533 | 533|43|M|librarian|02324 534 | 534|20|M|student|05464 535 | 535|45|F|educator|80302 536 | 536|38|M|engineer|30078 537 | 537|36|M|engineer|22902 538 | 538|31|M|scientist|21010 539 | 539|53|F|administrator|80303 540 | 540|28|M|engineer|91201 541 | 541|19|F|student|84302 542 | 542|21|M|student|60515 543 | 543|33|M|scientist|95123 544 | 544|44|F|other|29464 545 | 545|27|M|technician|08052 546 | 546|36|M|executive|22911 547 | 547|50|M|educator|14534 548 | 548|51|M|writer|95468 549 | 549|42|M|scientist|45680 550 | 550|16|F|student|95453 551 | 551|25|M|programmer|55414 552 | 
552|45|M|other|68147 553 | 553|58|M|educator|62901 554 | 554|32|M|scientist|62901 555 | 555|29|F|educator|23227 556 | 556|35|F|educator|30606 557 | 557|30|F|writer|11217 558 | 558|56|F|writer|63132 559 | 559|69|M|executive|10022 560 | 560|32|M|student|10003 561 | 561|23|M|engineer|60005 562 | 562|54|F|administrator|20879 563 | 563|39|F|librarian|32707 564 | 564|65|M|retired|94591 565 | 565|40|M|student|55422 566 | 566|20|M|student|14627 567 | 567|24|M|entertainment|10003 568 | 568|39|M|educator|01915 569 | 569|34|M|educator|91903 570 | 570|26|M|educator|14627 571 | 571|34|M|artist|01945 572 | 572|51|M|educator|20003 573 | 573|68|M|retired|48911 574 | 574|56|M|educator|53188 575 | 575|33|M|marketing|46032 576 | 576|48|M|executive|98281 577 | 577|36|F|student|77845 578 | 578|31|M|administrator|M7A1A 579 | 579|32|M|educator|48103 580 | 580|16|M|student|17961 581 | 581|37|M|other|94131 582 | 582|17|M|student|93003 583 | 583|44|M|engineer|29631 584 | 584|25|M|student|27511 585 | 585|69|M|librarian|98501 586 | 586|20|M|student|79508 587 | 587|26|M|other|14216 588 | 588|18|F|student|93063 589 | 589|21|M|lawyer|90034 590 | 590|50|M|educator|82435 591 | 591|57|F|librarian|92093 592 | 592|18|M|student|97520 593 | 593|31|F|educator|68767 594 | 594|46|M|educator|M4J2K 595 | 595|25|M|programmer|31909 596 | 596|20|M|artist|77073 597 | 597|23|M|other|84116 598 | 598|40|F|marketing|43085 599 | 599|22|F|student|R3T5K 600 | 600|34|M|programmer|02320 601 | 601|19|F|artist|99687 602 | 602|47|F|other|34656 603 | 603|21|M|programmer|47905 604 | 604|39|M|educator|11787 605 | 605|33|M|engineer|33716 606 | 606|28|M|programmer|63044 607 | 607|49|F|healthcare|02154 608 | 608|22|M|other|10003 609 | 609|13|F|student|55106 610 | 610|22|M|student|21227 611 | 611|46|M|librarian|77008 612 | 612|36|M|educator|79070 613 | 613|37|F|marketing|29678 614 | 614|54|M|educator|80227 615 | 615|38|M|educator|27705 616 | 616|55|M|scientist|50613 617 | 617|27|F|writer|11201 618 | 618|15|F|student|44212 619 | 619|17|M|student|44134 620 | 620|18|F|writer|81648 621 | 621|17|M|student|60402 622 | 622|25|M|programmer|14850 623 | 623|50|F|educator|60187 624 | 624|19|M|student|30067 625 | 625|27|M|programmer|20723 626 | 626|23|M|scientist|19807 627 | 627|24|M|engineer|08034 628 | 628|13|M|none|94306 629 | 629|46|F|other|44224 630 | 630|26|F|healthcare|55408 631 | 631|18|F|student|38866 632 | 632|18|M|student|55454 633 | 633|35|M|programmer|55414 634 | 634|39|M|engineer|T8H1N 635 | 635|22|M|other|23237 636 | 636|47|M|educator|48043 637 | 637|30|M|other|74101 638 | 638|45|M|engineer|01940 639 | 639|42|F|librarian|12065 640 | 640|20|M|student|61801 641 | 641|24|M|student|60626 642 | 642|18|F|student|95521 643 | 643|39|M|scientist|55122 644 | 644|51|M|retired|63645 645 | 645|27|M|programmer|53211 646 | 646|17|F|student|51250 647 | 647|40|M|educator|45810 648 | 648|43|M|engineer|91351 649 | 649|20|M|student|39762 650 | 650|42|M|engineer|83814 651 | 651|65|M|retired|02903 652 | 652|35|M|other|22911 653 | 653|31|M|executive|55105 654 | 654|27|F|student|78739 655 | 655|50|F|healthcare|60657 656 | 656|48|M|educator|10314 657 | 657|26|F|none|78704 658 | 658|33|M|programmer|92626 659 | 659|31|M|educator|54248 660 | 660|26|M|student|77380 661 | 661|28|M|programmer|98121 662 | 662|55|M|librarian|19102 663 | 663|26|M|other|19341 664 | 664|30|M|engineer|94115 665 | 665|25|M|administrator|55412 666 | 666|44|M|administrator|61820 667 | 667|35|M|librarian|01970 668 | 668|29|F|writer|10016 669 | 669|37|M|other|20009 670 | 670|30|M|technician|21114 671 | 
671|21|M|programmer|91919 672 | 672|54|F|administrator|90095 673 | 673|51|M|educator|22906 674 | 674|13|F|student|55337 675 | 675|34|M|other|28814 676 | 676|30|M|programmer|32712 677 | 677|20|M|other|99835 678 | 678|50|M|educator|61462 679 | 679|20|F|student|54302 680 | 680|33|M|lawyer|90405 681 | 681|44|F|marketing|97208 682 | 682|23|M|programmer|55128 683 | 683|42|M|librarian|23509 684 | 684|28|M|student|55414 685 | 685|32|F|librarian|55409 686 | 686|32|M|educator|26506 687 | 687|31|F|healthcare|27713 688 | 688|37|F|administrator|60476 689 | 689|25|M|other|45439 690 | 690|35|M|salesman|63304 691 | 691|34|M|educator|60089 692 | 692|34|M|engineer|18053 693 | 693|43|F|healthcare|85210 694 | 694|60|M|programmer|06365 695 | 695|26|M|writer|38115 696 | 696|55|M|other|94920 697 | 697|25|M|other|77042 698 | 698|28|F|programmer|06906 699 | 699|44|M|other|96754 700 | 700|17|M|student|76309 701 | 701|51|F|librarian|56321 702 | 702|37|M|other|89104 703 | 703|26|M|educator|49512 704 | 704|51|F|librarian|91105 705 | 705|21|F|student|54494 706 | 706|23|M|student|55454 707 | 707|56|F|librarian|19146 708 | 708|26|F|homemaker|96349 709 | 709|21|M|other|N4T1A 710 | 710|19|M|student|92020 711 | 711|22|F|student|15203 712 | 712|22|F|student|54901 713 | 713|42|F|other|07204 714 | 714|26|M|engineer|55343 715 | 715|21|M|technician|91206 716 | 716|36|F|administrator|44265 717 | 717|24|M|technician|84105 718 | 718|42|M|technician|64118 719 | 719|37|F|other|V0R2H 720 | 720|49|F|administrator|16506 721 | 721|24|F|entertainment|11238 722 | 722|50|F|homemaker|17331 723 | 723|26|M|executive|94403 724 | 724|31|M|executive|40243 725 | 725|21|M|student|91711 726 | 726|25|F|administrator|80538 727 | 727|25|M|student|78741 728 | 728|58|M|executive|94306 729 | 729|19|M|student|56567 730 | 730|31|F|scientist|32114 731 | 731|41|F|educator|70403 732 | 732|28|F|other|98405 733 | 733|44|F|other|60630 734 | 734|25|F|other|63108 735 | 735|29|F|healthcare|85719 736 | 736|48|F|writer|94618 737 | 737|30|M|programmer|98072 738 | 738|35|M|technician|95403 739 | 739|35|M|technician|73162 740 | 740|25|F|educator|22206 741 | 741|25|M|writer|63108 742 | 742|35|M|student|29210 743 | 743|31|M|programmer|92660 744 | 744|35|M|marketing|47024 745 | 745|42|M|writer|55113 746 | 746|25|M|engineer|19047 747 | 747|19|M|other|93612 748 | 748|28|M|administrator|94720 749 | 749|33|M|other|80919 750 | 750|28|M|administrator|32303 751 | 751|24|F|other|90034 752 | 752|60|M|retired|21201 753 | 753|56|M|salesman|91206 754 | 754|59|F|librarian|62901 755 | 755|44|F|educator|97007 756 | 756|30|F|none|90247 757 | 757|26|M|student|55104 758 | 758|27|M|student|53706 759 | 759|20|F|student|68503 760 | 760|35|F|other|14211 761 | 761|17|M|student|97302 762 | 762|32|M|administrator|95050 763 | 763|27|M|scientist|02113 764 | 764|27|F|educator|62903 765 | 765|31|M|student|33066 766 | 766|42|M|other|10960 767 | 767|70|M|engineer|00000 768 | 768|29|M|administrator|12866 769 | 769|39|M|executive|06927 770 | 770|28|M|student|14216 771 | 771|26|M|student|15232 772 | 772|50|M|writer|27105 773 | 773|20|M|student|55414 774 | 774|30|M|student|80027 775 | 775|46|M|executive|90036 776 | 776|30|M|librarian|51157 777 | 777|63|M|programmer|01810 778 | 778|34|M|student|01960 779 | 779|31|M|student|K7L5J 780 | 780|49|M|programmer|94560 781 | 781|20|M|student|48825 782 | 782|21|F|artist|33205 783 | 783|30|M|marketing|77081 784 | 784|47|M|administrator|91040 785 | 785|32|M|engineer|23322 786 | 786|36|F|engineer|01754 787 | 787|18|F|student|98620 788 | 788|51|M|administrator|05779 789 | 
789|29|M|other|55420 790 | 790|27|M|technician|80913 791 | 791|31|M|educator|20064 792 | 792|40|M|programmer|12205 793 | 793|22|M|student|85281 794 | 794|32|M|educator|57197 795 | 795|30|M|programmer|08610 796 | 796|32|F|writer|33755 797 | 797|44|F|other|62522 798 | 798|40|F|writer|64131 799 | 799|49|F|administrator|19716 800 | 800|25|M|programmer|55337 801 | 801|22|M|writer|92154 802 | 802|35|M|administrator|34105 803 | 803|70|M|administrator|78212 804 | 804|39|M|educator|61820 805 | 805|27|F|other|20009 806 | 806|27|M|marketing|11217 807 | 807|41|F|healthcare|93555 808 | 808|45|M|salesman|90016 809 | 809|50|F|marketing|30803 810 | 810|55|F|other|80526 811 | 811|40|F|educator|73013 812 | 812|22|M|technician|76234 813 | 813|14|F|student|02136 814 | 814|30|M|other|12345 815 | 815|32|M|other|28806 816 | 816|34|M|other|20755 817 | 817|19|M|student|60152 818 | 818|28|M|librarian|27514 819 | 819|59|M|administrator|40205 820 | 820|22|M|student|37725 821 | 821|37|M|engineer|77845 822 | 822|29|F|librarian|53144 823 | 823|27|M|artist|50322 824 | 824|31|M|other|15017 825 | 825|44|M|engineer|05452 826 | 826|28|M|artist|77048 827 | 827|23|F|engineer|80228 828 | 828|28|M|librarian|85282 829 | 829|48|M|writer|80209 830 | 830|46|M|programmer|53066 831 | 831|21|M|other|33765 832 | 832|24|M|technician|77042 833 | 833|34|M|writer|90019 834 | 834|26|M|other|64153 835 | 835|44|F|executive|11577 836 | 836|44|M|artist|10018 837 | 837|36|F|artist|55409 838 | 838|23|M|student|01375 839 | 839|38|F|entertainment|90814 840 | 840|39|M|artist|55406 841 | 841|45|M|doctor|47401 842 | 842|40|M|writer|93055 843 | 843|35|M|librarian|44212 844 | 844|22|M|engineer|95662 845 | 845|64|M|doctor|97405 846 | 846|27|M|lawyer|47130 847 | 847|29|M|student|55417 848 | 848|46|M|engineer|02146 849 | 849|15|F|student|25652 850 | 850|34|M|technician|78390 851 | 851|18|M|other|29646 852 | 852|46|M|administrator|94086 853 | 853|49|M|writer|40515 854 | 854|29|F|student|55408 855 | 855|53|M|librarian|04988 856 | 856|43|F|marketing|97215 857 | 857|35|F|administrator|V1G4L 858 | 858|63|M|educator|09645 859 | 859|18|F|other|06492 860 | 860|70|F|retired|48322 861 | 861|38|F|student|14085 862 | 862|25|M|executive|13820 863 | 863|17|M|student|60089 864 | 864|27|M|programmer|63021 865 | 865|25|M|artist|11231 866 | 866|45|M|other|60302 867 | 867|24|M|scientist|92507 868 | 868|21|M|programmer|55303 869 | 869|30|M|student|10025 870 | 870|22|M|student|65203 871 | 871|31|M|executive|44648 872 | 872|19|F|student|74078 873 | 873|48|F|administrator|33763 874 | 874|36|M|scientist|37076 875 | 875|24|F|student|35802 876 | 876|41|M|other|20902 877 | 877|30|M|other|77504 878 | 878|50|F|educator|98027 879 | 879|33|F|administrator|55337 880 | 880|13|M|student|83702 881 | 881|39|M|marketing|43017 882 | 882|35|M|engineer|40503 883 | 883|49|M|librarian|50266 884 | 884|44|M|engineer|55337 885 | 885|30|F|other|95316 886 | 886|20|M|student|61820 887 | 887|14|F|student|27249 888 | 888|41|M|scientist|17036 889 | 889|24|M|technician|78704 890 | 890|32|M|student|97301 891 | 891|51|F|administrator|03062 892 | 892|36|M|other|45243 893 | 893|25|M|student|95823 894 | 894|47|M|educator|74075 895 | 895|31|F|librarian|32301 896 | 896|28|M|writer|91505 897 | 897|30|M|other|33484 898 | 898|23|M|homemaker|61755 899 | 899|32|M|other|55116 900 | 900|60|M|retired|18505 901 | 901|38|M|executive|L1V3W 902 | 902|45|F|artist|97203 903 | 903|28|M|educator|20850 904 | 904|17|F|student|61073 905 | 905|27|M|other|30350 906 | 906|45|M|librarian|70124 907 | 907|25|F|other|80526 908 | 
908|44|F|librarian|68504
909|50|F|educator|53171
910|28|M|healthcare|29301
911|37|F|writer|53210
912|51|M|other|06512
913|27|M|student|76201
914|44|F|other|08105
915|50|M|entertainment|60614
916|27|M|engineer|N2L5N
917|22|F|student|20006
918|40|M|scientist|70116
919|25|M|other|14216
920|30|F|artist|90008
921|20|F|student|98801
922|29|F|administrator|21114
923|21|M|student|E2E3R
924|29|M|other|11753
925|18|F|salesman|49036
926|49|M|entertainment|01701
927|23|M|programmer|55428
928|21|M|student|55408
929|44|M|scientist|53711
930|28|F|scientist|07310
931|60|M|educator|33556
932|58|M|educator|06437
933|28|M|student|48105
934|61|M|engineer|22902
935|42|M|doctor|66221
936|24|M|other|32789
937|48|M|educator|98072
938|38|F|technician|55038
939|26|F|student|33319
940|32|M|administrator|02215
941|20|M|student|97229
942|48|F|librarian|78209
943|22|M|student|77841
--------------------------------------------------------------------------------
/page_rank/README.md:
--------------------------------------------------------------------------------
page-rank
=========

A very simple version/implementation of the page rank algorithm.

functions:

- Page rank
- Advanced version of page rank, topic sensitive
- spam farms
- spam mass
- trust rank
- Hyperlink induced topic search
- Map reduce to efficiently calculate the page rank
- Jaccard similarity to be found in data analysis repo


Implementations using plain lists, and matrices from the **numpy** library.


Calculation workflow :

1. Parse web pages for links
2. Parse links
3. Compute page rank (iterate until convergence)
4. Sort by page rank
5. Create index
--------------------------------------------------------------------------------
/page_rank/page_rank.py:
--------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
# Name: simple implementation of page rank
#
# Author: mourad mourafiq
# -------------------------------------------------------------------------------

from __future__ import division
import multiprocessing
import numpy
import random
from map_reduce import MapReduce
from jaccard_similarity import jaccard_sim

# example of a set of pages belonging to the same topic (the simple topic sensitive page rank version)
S = set(('2', '4'))
Es = [0, 1, 0, 1]


def page_rank(matrix, taxation=False, b=1, Es=[], S=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the page rank for each element based on the matrix in input.
    we should validate that the matrix is stochastic;
    if not, we use the taxation method to avoid dead ends (introducing the random surfers)
    v' = bMv + (1-b)e/n
    v : eigenvector
    The term (1-b)e/n is a vector each of whose components has value (1-b)/n and
    represents the introduction, with probability 1 - b, of a new random surfer at
    a random page.
    The mathematical formulation for the iteration that yields topic-sensitive
    PageRank is similar to the equation we used for general PageRank. The only
    difference is how we add the new surfers.
    Suppose S is a set of integers consisting
    of the row/column numbers for the pages we have identified as belonging to a
    certain topic (called the teleport set). Let eS be a vector that has 1 in the
    components in S and 0 in other components. Then the topic-sensitive PageRank
    for S is the limit of the iteration
    v' = bMv + (1 - b)eS/|S|
    Here, as usual, M is the transition matrix of the Web, and |S| is the size of
    set S.
    """
    elements_length = len(matrix[0])
    eigenvectors = [1 / elements_length] * elements_length
    if Es and taxation:
        taxation_v = [((1 - b) / len(S) * e) for e in Es]
    else:
        taxation_v = [(1 - b) / elements_length] * elements_length if taxation else [0] * elements_length

    eigenvectors_p = [0] * elements_length
    itr = 0
    # initializing map reduce
    mapper = MapReduce(page_rank_calculation, page_rank_vector)
    while eigenvectors_p != eigenvectors and itr < nbr_iterations:
        if eigenvectors_p != [0] * elements_length: eigenvectors = list(eigenvectors_p)
        for k, v in mapper([(i, eigenvectors, matrix, taxation_v, b) for i in range(elements_length)]):
            eigenvectors_p[k] = v
        itr += 1
    if verbose: print eigenvectors
    return eigenvectors


def page_rank_vector(item):
    """Reduce step: sum the partial contributions for one component of the
    eigenvector and return it as a (component, value) tuple.
    """
    key, occurances = item
    return (key, sum(occurances))


def page_rank_calculation(itemi):
    """Map step: compute one component of v' = bMv + taxation_v and return
    it as a sequence holding a single (component, value) pair.
    """
    item, eigenvectors, matrix, taxation_v, b = itemi
    elements_length = len(matrix)
    # print multiprocessing.current_process().name, 'calculating', item
    output = []
    vector_p = 0
    for j in range(elements_length):
        vector_p += eigenvectors[j] * matrix[item][j] * b
    vector_p += taxation_v[item]
    output.append((item, vector_p))
    return output


def matrix_vector_multiplication(matrix, vector, length, b, taxation_v):
    """
    calculate the multiplication of matrix by vector
    """
    vector_p = [0] * length
    for i in range(length):
        for j in range(length):
            vector_p[i] += vector[j] * matrix[i][j] * b
        vector_p[i] += taxation_v[i]
    return vector_p
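# Minimal sketch (added; not part of the original module) of one taxation
# iteration v' = bMv + (1-b)e/n done by hand, assuming the same 4-page
# transition matrix as the tests below. With b = 0.8, every component of
# the taxation vector is (1 - 0.8)/4 = 0.05.
def _demo_taxation_iteration():
    M = [[0, 1 / 2, 0, 0],
         [1 / 3, 0, 0, 1 / 2],
         [1 / 3, 0, 1, 1 / 2],
         [1 / 3, 1 / 2, 0, 0]]
    n = len(M)
    v = [1 / n] * n  # start from the uniform vector
    taxation_v = [(1 - 0.8) / n] * n  # (1-b)e/n with b = 0.8
    # one step of v' = bMv + (1-b)e/n
    print matrix_vector_multiplication(M, v, n, 0.8, taxation_v)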
def construct_web(n, b, nbr_iterations=100000, verbose=False):
    """
    Web consists of a clique (set of nodes with all possible arcs from one to another)
    of n nodes and a single additional node that is the successor of each of the n nodes
    in the clique. Determine the PageRank of each page, as a function of n
    and b.
    """
    all_nodes = 1 / (n + 1)
    all_nodes_p = 0
    last_node = 1 / (n + 1)
    last_node_p = 0
    itr = 0
    while (all_nodes != all_nodes_p or last_node != last_node_p) and itr < nbr_iterations:
        if all_nodes_p != 0: all_nodes = all_nodes_p
        if last_node_p != 0: last_node = last_node_p
        all_nodes_p = b * all_nodes * ((n - 1) / n) + (1 - b) / (1 + n)
        last_node_p = b * last_node + (1 - b) / (1 + n)
        itr += 1
    if verbose:
        print all_nodes
        print last_node
    return all_nodes


# example of sets of keywords, to be used for the advanced page rank
Sk = tuple((tuple(('0', '6', '7')), tuple(('1', '3', '4', '8')), tuple(('2', '5', '9', '10'))))


def page_rank_advanced(matrix, b=1, P=set(), S=set(), nbr_iterations=100000, verbose=False):
    """
    calculation of the topic sensitive page rank.
    S is the set of sets of topics
    P is the set of topic keywords for each page
    the algorithm we shall implement is the following:

    => calculate the jaccard similarity for P and Si
    => classify the page for a topic
    => construct Es, such that Es is the set of corresponding teleport vectors
       for each set of topics (see the sketch after this function)
    """
    elements_length = len(matrix[0])
    topics_length = len(S)
    Es = []
    # calculate the jaccard similarity for each page and topic set
    for s in S:
        Esp = [0] * elements_length
        for p in range(elements_length):
            Esp[p] = jaccard_sim(P[p], s)
        Es.append(Esp)
        print s
        print Esp
    # calculate the page rank for each topic
    for i in range(topics_length):
        page_rank(matrix, taxation=True, b=b, Es=Es[i], S=S[i], nbr_iterations=10000000, verbose=True)
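# Hedged sketch (added): how one teleport vector is built for a single topic
# set, mirroring the loop in page_rank_advanced above. The page keyword
# tuples P used here are made-up toy values, not data from the repo.
def _demo_teleport_vector():
    s = Sk[0]  # keywords for the first topic
    P = (('1', '2', '3'), ('0', '6', '7'), ('2', '5'), ('0', '9'))
    Esp = [jaccard_sim(p, s) for p in P]  # one jaccard weight per page
    print Esp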
def spam_farm(Pa, Ps, Pn, b, verbose=False):
    """
    The spam farm consists of the spammer's own pages "target page", organized in a special
    way, and some links from the accessible pages to the
    spammer's pages. Without some links from the outside, the spam farm would
    be useless, since it would not even be crawled by a typical search engine.
    Concerning the accessible pages, it might seem surprising that one can affect
    a page without owning it. However, today there are many sites, such as
    blogs or newspapers, that invite others to post their comments on the site. In
    order to get as much PageRank flowing to his own pages from outside, the
    spammer posts many comments.
    In the spam farm, there is one page, the target page, at which the spammer
    attempts to place as much PageRank as possible. There is a large number
    Ps of supporting pages, that accumulate the portion of the PageRank that is
    distributed equally to all pages.
    Pa : the number of accessible pages
    Ps : the number of supporting pages
    Pn : the total number of pages in the web
    => we are looking for PR_t : which is the page rank of the target page
    - the page rank of each supporting page is :
        b*PR_t/Ps + (1-b)/Pn
    Since the page rank of the target page comes from 3 sources:
        1. Pa from outside accessible pages
        2. b times the page rank of each of the Ps supporting pages:
            b*((b*PR_t)/Ps + (1-b)/Pn)
        3. (1-b)/Pn, the share of the fraction (1-b) of the page rank that
           belongs to PR_t (small, but kept in the formula below)

    => from (1), (2) & (3) :
        PR_t = Pa + (b*Ps)*((b*PR_t)/Ps + (1-b)/Pn) + (1-b)/Pn
        PR_t = Pa/(1-b**2) + (b/(1+b))*(Ps/Pn) + 1/(Pn*(1+b))
        PR_t = Pa*x + y*(Ps/Pn) + 1/(Pn*(1+b))
        where x = 1/(1-b**2) & y = b/(1+b)
    """
    x = 1 / (1 - b ** 2)
    y = b / (1 + b)
    PR_t = Pa * x + y * (Ps / Pn) + 1 / (Pn * (1 + b))
    if verbose:
        print 'Amplification of the external page rank contribution by %4.2f' % x
        print 'amount of PageRank that is %4.2f of the fraction Ps/n in the spam farm.' % y
        print 'page rank of target page %4.2f' % PR_t
    return PR_t


def trust_rank(matrix, b=0.8, Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    TrustRank based on some teleport set of trustworthy pages.
    Computed the same way as a topic sensitive page rank. The only difference is that the
    teleport surfers are restricted to trustworthy pages.
    Tp : trusted pages.
    Ts : trustworthy vector
    """
    return page_rank(matrix=matrix, taxation=True, b=b, Es=Ts, S=Tp, nbr_iterations=nbr_iterations, verbose=verbose)


def spam_mass(matrix, taxation=False, b=1, Es=[], S=set(), Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the spam mass of a page : (Pr - Tr) / Pr
    """
    pr = page_rank(matrix=matrix, taxation=True, b=b, Es=Es, S=S, nbr_iterations=nbr_iterations, verbose=verbose)
    tr = trust_rank(matrix=matrix, b=b, Ts=Ts, Tp=Tp, nbr_iterations=nbr_iterations, verbose=verbose)
    elements_length = len(matrix[0])
    sm = [0] * elements_length
    for i in range(elements_length):
        sm[i] = (pr[i] - tr[i]) / pr[i]
    if verbose: print sm
    return sm


def hits(L, lam, mu, nbr_iterations=100000000, verbose=True):
    """
    Hyperlink induced topic search.
    Computation of hubbiness and authority:
    Authority : page's quality that tells you best about a topic
    Hubbiness : page's quality that tells you best about other pages and how to find them
    Authority of a page is the sum of its predecessors' hubbiness
    Hubbiness of a page is the sum of its successors' authority
    L[i][j] = 1 if page_i links to page_j, otherwise 0
    """
    elements_length = len(L[0])
    L_t = transpose(L, elements_length)
    H = [1] * elements_length
    H_s = [0] * elements_length
    A = [0] * elements_length
    T = [0] * elements_length
    itr = 0
    while H != H_s and itr < nbr_iterations:
        if H_s != [0] * elements_length: H = H_s
        A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
        m = max(A)
        for i in range(elements_length):
            A[i] /= m
        H_s = matrix_vector_multiplication(L, A, elements_length, 1, T)
        m = max(H_s)
        for i in range(elements_length):
            H_s[i] /= m
        itr += 1
    A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
    m = max(A)
    for i in range(elements_length):
        A[i] /= m
    if verbose:
        print H
        print A


def transpose(matrix, elements_length, verbose=False):
    matrix_t = []
    for i in range(elements_length):
        t = [0] * elements_length
        for j in range(elements_length):
            t[j] = matrix[j][i]
        matrix_t.append(t)
    if verbose:
        print matrix
        print matrix_t
    return matrix_t
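# Quick numeric illustration (added): for Pa=10, Ps=30, Pn=500 and b=0.85,
# the amplification factor is x = 1/(1 - b**2) ~ 3.60, so the target page
# ends up with roughly 3.6 times the externally contributed PageRank.
def _demo_spam_farm_amplification():
    b = 0.85
    print 1 / (1 - b ** 2)  # ~3.60
    print spam_farm(Pa=10, Ps=30, Pn=500, b=b)  # dominated by Pa*x, ~36


def 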
test_construct():
    matrix = []
    matrix.append([0, 1 / 4, 1 / 4, 1 / 4, 0])
    matrix.append([1 / 4, 0, 1 / 4, 1 / 4, 0])
    matrix.append([1 / 4, 1 / 4, 0, 1 / 4, 0])
    matrix.append([1 / 4, 1 / 4, 1 / 4, 0, 0])
    matrix.append([1 / 4, 1 / 4, 1 / 4, 1 / 4, 0])
    page_rank(matrix, taxation=True, b=0.8, verbose=True)
    construct_web(4, 0.8, verbose=True)


def test_page_rank():
    matrix = []
    matrix.append([0, 1 / 2, 0, 0])
    matrix.append([1 / 3, 0, 0, 1 / 2])
    matrix.append([1 / 3, 0, 1, 1 / 2])
    matrix.append([1 / 3, 1 / 2, 0, 0])
    page_rank(matrix, taxation=True, b=0.85, verbose=True)
    page_rank(matrix, taxation=True, b=0.85, Es=Es, S=S, verbose=True)


def test_page_rank_advanced():
    matrix = []
    matrix.append([0, 1 / 2, 0, 0])
    matrix.append([1 / 3, 0, 0, 1 / 2])
    matrix.append([1 / 3, 0, 1, 1 / 2])
    matrix.append([1 / 3, 1 / 2, 0, 0])
    P = tuple((tuple(('1', '2', '3', '4')), tuple(('0', '6', '7', '8')), tuple(('2', '5', '9', '10')),
               tuple(('2', '5', '9', '10', '0'))))
    page_rank_advanced(matrix, b=0.85, P=P, S=Sk, nbr_iterations=100000, verbose=False)


def test_spam_farm():
    spam_farm(Pa=10, Ps=30, Pn=500, b=0.855, verbose=True)


def test_spam_mass():
    matrix = []
    matrix.append([0, 1 / 2, 0, 0])
    matrix.append([1 / 3, 0, 0, 1 / 2])
    matrix.append([1 / 3, 0, 1, 1 / 2])
    matrix.append([1 / 3, 1 / 2, 0, 0])
    spam_mass(matrix, taxation=True, b=0.85, Ts=Es, Tp=S, verbose=True)


def test_hits():
    matrix = []
    matrix.append([0, 1, 1, 1, 0])
    matrix.append([1, 0, 0, 1, 0])
    matrix.append([0, 0, 0, 0, 1])
    matrix.append([0, 1, 1, 0, 0])
    matrix.append([0, 0, 0, 0, 0])
    hits(matrix, 0, 0)


if __name__ == '__main__':
    test_construct()
    test_page_rank()
    test_page_rank_advanced()
    test_spam_farm()
    test_spam_mass()

--------------------------------------------------------------------------------
/page_rank/page_rank_numpy.py:
--------------------------------------------------------------------------------
# -------------------------------------------------------------------------------
# Name: simple implementation of page rank
#
# Author: mourad mourafiq
# -------------------------------------------------------------------------------

from __future__ import division
from data_analysis import jaccard_sim
from numpy import *

# example of a set of pages belonging to the same topic (the simple topic sensitive page rank version)
S = set(('2', '4'))
Es = matrix([[0], [1], [0], [1]])
def page_rank(matrix, taxation=False, b=1, Es=[], S=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the page rank for each element based on the matrix in input.
    we should validate that the matrix is stochastic;
    if not, we use the taxation method to avoid dead ends (introducing the random surfers)
    v' = bMv + (1-b)e/n
    v : eigenvector
    The term (1-b)e/n is a vector each of whose components has value (1-b)/n and
    represents the introduction, with probability 1 - b, of a new random surfer at
    a random page.
    The mathematical formulation for the iteration that yields topic-sensitive
    PageRank is similar to the equation we used for general PageRank. The only
    difference is how we add the new surfers. Suppose S is a set of integers consisting
    of the row/column numbers for the pages we have identified as belonging to a
    certain topic (called the teleport set). Let eS be a vector that has 1 in the
    components in S and 0 in other components. Then the topic-sensitive PageRank
    for S is the limit of the iteration
    v' = bMv + (1 - b)eS/|S|
    Here, as usual, M is the transition matrix of the Web, and |S| is the size of
    set S.
    """
    elements_length = len(matrix)
    eigenvectors = (1 / elements_length) * mat(ones((elements_length, 1)))
    if S and taxation:
        taxation_v = (1 - b) / len(S) * Es
    else:
        taxation_v = (1 - b) / elements_length * mat(ones((elements_length, 1))) if taxation else mat(
            ones((elements_length, 1))) * 0

    eigenvectors_p = mat(ones((elements_length, 1))) * 0
    itr = 0
    while (eigenvectors_p != eigenvectors).any() and itr < nbr_iterations:
        if (eigenvectors_p != (mat(ones((elements_length, 1))) * 0)).any(): eigenvectors = eigenvectors_p
        eigenvectors_p = matrix_vector_multiplication(matrix, eigenvectors, elements_length, b, taxation_v)
        itr += 1
    if verbose: print eigenvectors
    return eigenvectors


def matrix_vector_multiplication(matrix, vector, length, b, taxation_v):
    """
    calculate the multiplication of matrix by vector
    """
    return b * matrix * vector + taxation_v


def construct_web(n, b, nbr_iterations=100000, verbose=False):
    """
    Web consists of a clique (set of nodes with all possible arcs from one to another)
    of n nodes and a single additional node that is the successor of each of the n nodes
    in the clique. Determine the PageRank of each page, as a function of n
    and b.
    """
    all_nodes = 1 / (n + 1)
    all_nodes_p = 0
    last_node = 1 / (n + 1)
    last_node_p = 0
    itr = 0
    while (all_nodes != all_nodes_p or last_node != last_node_p) and itr < nbr_iterations:
        if all_nodes_p != 0: all_nodes = all_nodes_p
        if last_node_p != 0: last_node = last_node_p
        all_nodes_p = b * all_nodes * ((n - 1) / n) + (1 - b) / (1 + n)
        last_node_p = b * last_node + (1 - b) / (1 + n)
        itr += 1
    if verbose:
        print all_nodes
        print last_node
    return all_nodes

# example of sets of keywords, to be used for the advanced page rank
Sk = tuple((tuple(('0', '6', '7')), tuple(('1', '3', '4', '8')), tuple(('2', '5', '9', '10'))))
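# Sketch (added for illustration): with numpy the whole update v' = bMv + t
# collapses into a single expression; one step by hand:
def _demo_numpy_step():
    M = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5],
                [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    v = (1 / 4) * mat(ones((4, 1)))  # uniform start vector
    t = ((1 - 0.8) / 4) * mat(ones((4, 1)))  # taxation vector, b = 0.8
    print 0.8 * M * v + t  # one iteration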
def page_rank_advanced(matrix, b=1, P=set(), S=set(), nbr_iterations=100000, verbose=False):
    """
    calculation of the topic sensitive page rank.
    S is the set of sets of topics
    P is the set of topic keywords for each page
    the algorithm we shall implement is the following:

    => calculate the jaccard similarity for P and Si
    => classify the page for a topic
    => construct Es, such that Es is the set of corresponding teleport vectors
       for each set of topics
    """
    elements_length = len(matrix)
    i = 0
    for s in S:
        Esp = [0] * elements_length
        # calculate the jaccard similarity for each page and topic set
        for p in range(elements_length):
            Esp[p] = jaccard_sim(P[p], s)
        Esp = mat(Esp)
        print s
        print Esp
        # calculate the page rank for each topic
        page_rank(matrix, taxation=True, b=b, Es=Esp.getT(), S=S[i], nbr_iterations=10000000, verbose=True)
        i += 1


def spam_farm(Pa, Ps, Pn, b, verbose=False):
    """
    The spam farm consists of the spammer's own pages "target page", organized in a special
    way, and some links from the accessible pages to the
    spammer's pages. Without some links from the outside, the spam farm would
    be useless, since it would not even be crawled by a typical search engine.
    Concerning the accessible pages, it might seem surprising that one can affect
    a page without owning it. However, today there are many sites, such as
    blogs or newspapers, that invite others to post their comments on the site. In
    order to get as much PageRank flowing to his own pages from outside, the
    spammer posts many comments.
    In the spam farm, there is one page, the target page, at which the spammer
    attempts to place as much PageRank as possible. There is a large number
    Ps of supporting pages, that accumulate the portion of the PageRank that is
    distributed equally to all pages.
    Pa : the number of accessible pages
    Ps : the number of supporting pages
    Pn : the total number of pages in the web
    => we are looking for PR_t : which is the page rank of the target page
    - the page rank of each supporting page is :
        b*PR_t/Ps + (1-b)/Pn
    Since the page rank of the target page comes from 3 sources:
        1. Pa from outside accessible pages
        2. b times the page rank of each of the Ps supporting pages:
            b*((b*PR_t)/Ps + (1-b)/Pn)
        3. (1-b)/Pn, the share of the fraction (1-b) of the page rank that
           belongs to PR_t (small, but kept in the formula below)

    => from (1), (2) & (3) :
        PR_t = Pa + (b*Ps)*((b*PR_t)/Ps + (1-b)/Pn) + (1-b)/Pn
        PR_t = Pa/(1-b**2) + (b/(1+b))*(Ps/Pn) + 1/(Pn*(1+b))
        PR_t = Pa*x + y*(Ps/Pn) + 1/(Pn*(1+b))
        where x = 1/(1-b**2) & y = b/(1+b)
    """
    x = 1 / (1 - b ** 2)
    y = b / (1 + b)
    PR_t = Pa * x + y * (Ps / Pn) + 1 / (Pn * (1 + b))
    if verbose:
        print 'Amplification of the external page rank contribution by %4.2f' % x
        print 'amount of PageRank that is %4.2f of the fraction Ps/n in the spam farm.' % y
        print 'page rank of target page %4.2f' % PR_t
    return PR_t
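# Worked detail (added): with the module-level S = {'2', '4'} and b = 0.85,
# the taxation term (1-b)Es/|S| puts weight 0.15/2 = 0.075 on each page of
# the teleport set and 0 elsewhere.
def _demo_teleport_weights(b=0.85):
    print (1 - b) / len(S) * Es  # matrix([[0.], [0.075], [0.], [0.075]])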
def trust_rank(matrix, b=0.8, Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    TrustRank based on some teleport set of trustworthy pages.
    Computed the same way as a topic sensitive page rank. The only difference is that the
    teleport surfers are restricted to trustworthy pages.
    Tp : trusted pages.
    Ts : trustworthy vector
    """
    return page_rank(matrix=matrix, taxation=True, b=b, Es=Ts, S=Tp, nbr_iterations=nbr_iterations, verbose=verbose)


def spam_mass(matrix, taxation=False, b=1, Es=[], S=set(), Ts=[], Tp=set(), nbr_iterations=10000000, verbose=False):
    """
    calculate the spam mass of a page : (Pr - Tr) / Pr
    """
    pr = page_rank(matrix=matrix, taxation=True, b=b, Es=Es, S=S, nbr_iterations=nbr_iterations, verbose=verbose)
    tr = trust_rank(matrix=matrix, b=b, Ts=Ts, Tp=Tp, nbr_iterations=nbr_iterations, verbose=verbose)
    sm = (pr - tr) / pr
    if verbose: print sm
    return sm


def hits(L, lam, mu, nbr_iterations=100000000, verbose=True):
    """
    Hyperlink induced topic search.
    Computation of hubbiness and authority:
    Authority : page's quality that tells you best about a topic
    Hubbiness : page's quality that tells you best about other pages and how to find them
    Authority of a page is the sum of its predecessors' hubbiness
    Hubbiness of a page is the sum of its successors' authority
    L[i][j] = 1 if page_i links to page_j, otherwise 0
    """
    elements_length = len(L)
    L_t = L.getT()
    H = mat(ones((elements_length, 1)))
    H_s = 0 * H
    A = 0 * H
    T = 0 * H
    itr = 0
    while (H != H_s).any() and itr < nbr_iterations:
        if (H_s != 0 * mat(ones((elements_length, 1)))).any(): H = H_s
        A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
        m = A.max()
        A = A / m
        H_s = matrix_vector_multiplication(L, A, elements_length, 1, T)
        m = H_s.max()
        H_s = H_s / m
        itr += 1
    A = matrix_vector_multiplication(L_t, H, elements_length, 1, T)
    m = A.max()
    A = A / m
    if verbose:
        print H
        print A


def test_construct():
    construct_web(4, 0.8, verbose=True)


def test_page_rank():
    m = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5], [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    page_rank(m, taxation=True, b=0.85, verbose=True)
    page_rank(m, taxation=True, b=0.85, Es=Es, S=S, verbose=True)


def test_page_rank_advanced():
    m = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5], [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    P = tuple((tuple(('1', '2', '3', '4')), tuple(('0', '6', '7', '8')), tuple(('2', '5', '9', '10')),
               tuple(('2', '5', '9', '10', '0'))))
    page_rank_advanced(m, b=0.85, P=P, S=Sk, nbr_iterations=100000, verbose=True)


def test_spam_farm():
    spam_farm(Pa=10, Ps=30, Pn=500, b=0.85, verbose=True)


def test_spam_mass():
    m = matrix([[0, 0.5, 0, 0], [1 / 3, 0, 0, 0.5], [1 / 3, 0, 1, 0.5], [1 / 3, 0.5, 0, 0]])
    return spam_mass(m, taxation=True, b=0.8, Ts=Es, Tp=S, verbose=True)


if __name__ == '__main__':
    test_construct()
    test_page_rank()
    test_page_rank_advanced()
    test_spam_farm()
    test_spam_mass()

--------------------------------------------------------------------------------
/quora/datacenter_c.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Jan 01, 2013

@author: Mourad Mourafiq

About: This is an attempt to solve the Quora challenge Datacenter Cooling.
"""

GO_ROOM = 0
NOGO_ROOM = 1
ENTRY_ROOM = 2
EXIT_ROOM = 3


class Room(object):
    """
    Room

    @type _x: int
    @param _x: x coordinate

    @type _y: int
    @param _y: y coordinate

    @type _type : int
    @param _type: the type of the room (0 1 2 3)

    @type _visited : boolean
    @param _visited: track whether the room was visited

    @type _neighbours: list
    @param _neighbours: the neighbouring rooms
    """

    def __init__(self, type):
        self._type = type
        self._neighbours = []
        self._visited = False

    def add(self, neighbour):
        self._neighbours.append(neighbour)

    def init(self):
        self._neighbours = []

    def visit(self):
        self._visited = True

    def unvisit(self):
        self._visited = False

    def is_visited(self):
        return True if self._visited else False

    def is_exit(self):
        return True if self._type == EXIT_ROOM else False

    def is_entry(self):
        return True if self._type == ENTRY_ROOM else False

    def is_nogo(self):
        return True if self._type == NOGO_ROOM else False


class Cooling(object):
    """
    backtracking solution to datacenter cooling

    @type _rooms: dict
    @param _rooms: dictionary of the rooms of the datacenter

    @type _entry: tuple
    @param _entry: entry room coordinate

    @type _nbr_rooms: int
    @param _nbr_rooms: number of visitable rooms in our datacenter (NOGO rooms excluded)

    @type _nbr_rooms_visited: int
    @param _nbr_rooms_visited: number of rooms visited so far

    @type _nbr_lines: int
    @param _nbr_lines: number of lines in our datacenter

    @type _nbr_columns: int
    @param _nbr_columns: number of columns in our datacenter

    @type _nbr_ways: int
    @param _nbr_ways: number of ways (result)
    """

    def __init__(self, nbr_lines, nbr_columns):
        self._rooms = {}
        self._entry = (0, 0)
        self._nbr_rooms = nbr_lines * nbr_columns
        # nothing is visited yet; the entry room is counted once find_way visits it
        self._nbr_rooms_visited = 0
        self._nbr_lines = nbr_lines
        self._nbr_columns = nbr_columns
        self._nbr_ways = 0

    def add(self, type, line, column, look_for_entry=True):
        self._rooms[(line, column)] = Room(type)
        # NOGO rooms can never be entered, so they don't count towards a full path
        if self._rooms[(line, column)].is_nogo():
            self._nbr_rooms -= 1
        if look_for_entry:
            if self._rooms[(line, column)].is_entry():
                self._entry = (line, column)
                look_for_entry = False

    def _construct_neighbours(self, coord):
        l, c = coord
        room = self._rooms[(l, c)]
        room.init()
        if l > 0:
            if not (self._rooms[(l - 1, c)].is_nogo() or self._rooms[(l - 1, c)].is_visited()):
                room.add((l - 1, c))
        if l + 1 < self._nbr_lines:
            if not (self._rooms[(l + 1, c)].is_nogo() or self._rooms[(l + 1, c)].is_visited()):
                room.add((l + 1, c))
        if c > 0:
            if not (self._rooms[(l, c - 1)].is_nogo() or self._rooms[(l, c - 1)].is_visited()):
                room.add((l, c - 1))
        if c + 1 < self._nbr_columns:
            if not (self._rooms[(l, c + 1)].is_nogo() or self._rooms[(l, c + 1)].is_visited()):
                room.add((l, c + 1))

    def _visit(self, room):
        room.visit()
        self._nbr_rooms_visited += 1

    def _unvisit(self, room):
        room.unvisit()
        self._nbr_rooms_visited -= 1
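    # The search below is plain backtracking: starting from the entry room,
    # visit a free neighbour, recurse, then un-visit it on the way back; a
    # way only counts when the exit is reached after every visitable room
    # has been seen. Hand-checked toy case: the 2x2 datacenter
    #     2 3
    #     0 0
    # admits exactly one way, (0,0) -> (1,0) -> (1,1) -> (0,1).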
    def find_way(self, current_room_coord=None):
        if current_room_coord is None:
            current_room_coord = self._entry
            self._visit(self._rooms[current_room_coord])
        # check if exit
        elif self._rooms[current_room_coord].is_exit():
            if self._nbr_rooms_visited == self._nbr_rooms:
                self._nbr_ways += 1
                return True
            else:
                return False
        # no exit yet, try this room's neighbours
        self._construct_neighbours(current_room_coord)
        current_room = self._rooms[current_room_coord]
        for neighbour in current_room._neighbours:
            self._visit(self._rooms[neighbour])
            self.find_way(neighbour)
            self._unvisit(self._rooms[neighbour])
        # at this point we couldn't find the exit
        return False


W, H = [int(x) for x in raw_input().split()]
cool = Cooling(H, W)
for l in xrange(H):
    rooms = [int(x) for x in raw_input().split()]
    for c in xrange(W):
        cool.add(rooms[c], l, c)
cool.find_way()
print cool._nbr_ways
--------------------------------------------------------------------------------
/quora/dcc.c:
--------------------------------------------------------------------------------
/*
 * Author : mourad mourafiq (07/01/2012)
 *
 * This is an attempt to solve the datacenter cooling problem
 */
#include <stdio.h>
#include <stdlib.h>

//constants
#define VISITED_ROOM '4'
#define GO_ROOM '0'
#define NOGO_ROOM '1'
#define ENTRY_ROOM '2'
#define EXIT_ROOM '3'

//global variables
char **rooms;
int *to_check;
int nbr_rooms;
int H;
int W;
int nbr_rooms_visited=0;
int nbr_ways=0;

int way_exists(int l, int c){
    int i, j;
    int result = 0;
    int nbr_to_check = (nbr_rooms - nbr_rooms_visited)*2;
    int count_added = 0;
    if (c > 0){
        if (rooms[l][c-1] == GO_ROOM){
            rooms[l][c-1] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l;
            to_check[count_added++] = c-1;
        }
        else if (rooms[l][c-1] == EXIT_ROOM){
            result = 1;
        }
    }
    if (l > 0){
        if (rooms[l-1][c] == GO_ROOM){
            rooms[l-1][c] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l-1;
            to_check[count_added++] = c;
        }
        else if (rooms[l-1][c] == EXIT_ROOM){
            result = 1;
        }
    }
    if (c+1 < W){
        if (rooms[l][c+1] == GO_ROOM){
            rooms[l][c+1] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l;
            to_check[count_added++] = c+1;
        }
        else if (rooms[l][c+1] == EXIT_ROOM){
            result = 1;
        }
    }
    if (l+1 < H){
        if (rooms[l+1][c] == GO_ROOM){
            rooms[l+1][c] = VISITED_ROOM;
            //nbr_to_check--;
            to_check[count_added++] = l+1;
            to_check[count_added++] = c;
        }
        else if (rooms[l+1][c] == EXIT_ROOM){
            result = 1;
        }
    }
    if (result == 1){
        for (i=0; i 0){
            if (rooms[cl][cc-1] == GO_ROOM){
                rooms[cl][cc-1] = VISITED_ROOM;
                //nbr_to_check--;
                to_check[count_added++] = cl;
                to_check[count_added++] = cc-1;
            }
            else if (rooms[cl][cc-1] == EXIT_ROOM)
                exit_found = 1;
        }
        if (cl > 0){
            if (rooms[cl-1][cc] == GO_ROOM){
                rooms[cl-1][cc] = VISITED_ROOM;
                //nbr_to_check--;
                to_check[count_added++] = cl-1;
                to_check[count_added++] = cc;
            }
            else if (rooms[cl-1][cc] == EXIT_ROOM)
                exit_found = 1;
        }
        if (cc+1 < W){
            if (rooms[cl][cc+1] == GO_ROOM){
                rooms[cl][cc+1] = VISITED_ROOM;
                //nbr_to_check--;
                to_check[count_added++] = cl;
                to_check[count_added++] = cc+1;
            }
            else if (rooms[cl][cc+1] == EXIT_ROOM)
                exit_found = 1;
        }
        if
(cl+1 < H){ 119 | if (rooms[cl+1][cc] == GO_ROOM){ 120 | rooms[cl+1][cc] = VISITED_ROOM; 121 | //nbr_to_check--; 122 | to_check[count_added++] = cl+1; 123 | to_check[count_added++] = cc; 124 | } 125 | else if (rooms[cl+1][cc] == EXIT_ROOM) 126 | exit_found = 1; 127 | } 128 | if ((nbr_to_check == count_added) && (exit_found == 1)){ 129 | result = 1; 130 | break; 131 | } 132 | if (cpt >= count_added){ 133 | result = 0; 134 | break; 135 | } 136 | } 137 | for (i=0; i 0){ 148 | if (rooms[l][c-1] == GO_ROOM){ 149 | rooms[l][c-1] = VISITED_ROOM; 150 | nbr_rooms_visited++; 151 | find_ways(l, c-1); 152 | rooms[l][c-1] = GO_ROOM; 153 | nbr_rooms_visited--; 154 | } 155 | else if ((rooms[l][c-1] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 156 | nbr_ways++; 157 | return 1; 158 | } 159 | } 160 | if (l > 0){ 161 | if (rooms[l-1][c] == GO_ROOM){ 162 | rooms[l-1][c] = VISITED_ROOM; 163 | nbr_rooms_visited++; 164 | find_ways(l-1, c); 165 | rooms[l-1][c] = GO_ROOM; 166 | nbr_rooms_visited--; 167 | } 168 | else if ((rooms[l-1][c] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 169 | nbr_ways++; 170 | return 1; 171 | } 172 | } 173 | if (c+1 < W){ 174 | if (rooms[l][c+1] == GO_ROOM){ 175 | rooms[l][c+1] = VISITED_ROOM; 176 | nbr_rooms_visited++; 177 | find_ways(l, c+1); 178 | rooms[l][c+1] = GO_ROOM; 179 | nbr_rooms_visited--; 180 | } 181 | else if ((rooms[l][c+1] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 182 | nbr_ways++; 183 | return 1; 184 | } 185 | } 186 | if (l+1 < H){ 187 | if (rooms[l+1][c] == GO_ROOM){ 188 | rooms[l+1][c] = VISITED_ROOM; 189 | nbr_rooms_visited++; 190 | find_ways(l+1, c); 191 | rooms[l+1][c] = GO_ROOM; 192 | nbr_rooms_visited--; 193 | } 194 | else if ((rooms[l+1][c] == EXIT_ROOM) && (nbr_rooms_visited == nbr_rooms)){ 195 | nbr_ways++; 196 | return 1; 197 | } 198 | } 199 | return 0; 200 | } 201 | 202 | int main(int argc, char* argv[]){ 203 | int i, j, start_l, start_c; 204 | scanf("%d %d\n", &W, &H); 205 | nbr_rooms = W * H - 2; 206 | char line[sizeof(char)*(W+2)*2]; 207 | rooms = (char **) malloc(H * sizeof(char *)); 208 | for (i=0; i story._proportioned_score): 58 | return True 59 | if (self._proportioned_score < story._proportioned_score): 60 | return False 61 | if (self._id < story._id): 62 | return True 63 | return False 64 | 65 | def _better_score(self, story): 66 | if (self._score > story._score): 67 | return True 68 | if (self._score < story._score): 69 | return False 70 | if (self._id < story._id): 71 | return True 72 | return False 73 | 74 | 75 | class Solution(object): 76 | """ 77 | Potential solution for the upcoming reload 78 | 79 | @type _stories: list 80 | @param _stories: The list of potential items. 81 | 82 | @type _len_stories : int 83 | @param _len_stories: The length of the list of stories. 84 | 85 | @type _score: int 86 | @param _score: The current solution's score. 87 | 88 | @type _height: int 89 | @param _height: The current solution's height. 
    """

    def __init__(self):
        self._stories = []
        self._len_stories = 0
        self._score = 0
        self._height = 0

    def __repr__(self):
        return "%s %s %s" % (
            self._score, self._len_stories, ' '.join(sorted([str(story._id) for story in self._stories])))

    def __gt__(self, solution):
        # check whose score is better
        if self._score > solution._score:
            return True
        if self._score < solution._score:
            return False
        # same score; check who has fewer stories
        if self._len_stories < solution._len_stories:
            return True
        if self._len_stories > solution._len_stories:
            return False
        # same score, same number of stories; check who is smaller lexicographically
        if sorted([story._id for story in self._stories]) <= sorted([story._id for story in solution._stories]):
            return True
        else:
            return False

    @classmethod
    def clone(cls, solution):
        clone_solution = cls()
        clone_solution._stories = copy.copy(solution._stories)
        clone_solution._len_stories = solution._len_stories
        clone_solution._score = solution._score
        clone_solution._height = solution._height
        return clone_solution

    def add(self, story):
        """
        add story to the solution
        """
        self._stories.append(story)
        self._score += story._score
        self._height += story._height
        self._len_stories += 1

    def remove(self, story):
        """
        remove story from the solution
        """
        self._stories.remove(story)
        self._score -= story._score
        self._height -= story._height
        self._len_stories -= 1


class Optimizer(object):
    """
    Keep track of stories that can potentially make a solution.
    The stories should be sorted by time of publication.

    @type _stories: list
    @param _stories: The list of stories that can potentially make a solution.

    @type _len_stories : int
    @param _len_stories: The length of the list of stories.

    @type __height: int
    @param __height: The height of the browser.

    @type __window: int
    @param __window: The window of recent stories.

    @type _best_story: Story
    @param _best_story: The best story so far.
    """
    __height = 0
    __window = 0

    def __init__(self, window, height):
        self._stories = []
        self._len_stories = 0
        Optimizer.__window = window
        Optimizer.__height = height
        self._best_story = Story()

    def _purge_old_stories(self, current_time):
        """
        remove old stories from the current list of stories
        """
        # check if the oldest stories can still be part of the solution
        to_be_removed = []
        for old_story in self._stories:
            if (current_time - old_story._time) <= Optimizer.__window:
                break
            else:
                to_be_removed.append(old_story)
        for old_story in to_be_removed:
            self._stories.remove(old_story)
            self._len_stories -= 1
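    # Note on cost (added): _brute_force below enumerates subsets of the
    # candidate stories with itertools.combinations, i.e. on the order of
    # 2**n checks for n live stories. That is workable for small windows,
    # which is why the simulated annealing variant exists as an alternative.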
    def _brute_force(self):
        """
        check all possibilities:
            1) best solution for combination of 2 stories (if it exists).
            2) best solution for combination of 3 stories (if it exists).
            .
            .
            l-1) best solution for combination of l-1 stories (if it exists).

        l : being the length of the current stories.
        """
        best_solution = Solution()
        best_solution.add(self._best_story)
        for i in xrange(2, self._len_stories + 1):
            for tuple_stories in itertools.combinations(self._stories, i):
                if self.addable(tuple_stories):
                    current_solution = Solution()
                    for story in tuple_stories:
                        current_solution.add(story)
                    if current_solution > best_solution:
                        best_solution = current_solution
        return best_solution

    def _annealing_simulated(self, T=1000.0, cool=0.35):
        """
        perform the simulated annealing algorithm:
            1) start with a random solution.
            2) move to a neighbour solution.
            (favors better solutions, and accepts worse solutions with a certain
            probability to avoid local minima, until the temperature is totally down)
        """
        # order stories based on their proportioned score
        ordered_stories = sorted(self._stories, reverse=True)
        # produce a random solution
        current_solution, stories_in_current = self.random_solution(ordered_stories, self._len_stories)
        best_solution = Solution.clone(current_solution)
        while (T > 0.1):
            temp_solution = Solution.clone(current_solution)
            stories_in_temp = copy.copy(stories_in_current)
            stories_at_true = [i for i in xrange(self._len_stories) if stories_in_temp[i]]
            # check whether any stories are left to swap in
            if len(stories_at_true) == self._len_stories:
                break
            # choose a story and remove it
            if stories_at_true:
                indice = choice(stories_at_true)
                stories_in_temp[indice] = False
                temp_solution.remove(ordered_stories[indice])
            else:
                indice = -1
            # add any number of other stories available
            for i in xrange(indice + 1, self._len_stories):
                if stories_in_temp[i]:
                    continue
                story = ordered_stories[i]
                if self.addable((story,), temp_solution):
                    stories_in_temp[i] = True
                    temp_solution.add(story)
                elif temp_solution._height == self.__height:
                    break
            # compare temp and current solutions
            if temp_solution > current_solution:
                current_solution = temp_solution
                stories_in_current = stories_in_temp
                # also, since temp is better than current, compare it to best
                if current_solution > best_solution:
                    best_solution = Solution.clone(current_solution)
            # current solution is better than temp;
            # the algorithm states that we can still give it a try, depending on a probability
            else:
                # since the temp solution score is < the current solution score,
                # this probability will be near one at the beginning, when T is high,
                # but will get lower and lower as T cools down,
                # hence we accept fewer and fewer bad solutions
                p = pow(math.e, float(temp_solution._score - current_solution._score) / T)
                if p > random():
                    current_solution = temp_solution
                    stories_in_current = stories_in_temp
            # decrease the temperature
            T = T * cool
        return best_solution

    def add(self, story):
        # check if the story's height is within the browser's height
        if story._height <= Optimizer.__height:
            self._stories.append(story)
            self._len_stories += 1
            if (story > self._best_story):
                self._best_story = story

    def produce_solution(self, current_time, solution=BRUTE_FORCE):
        self._purge_old_stories(current_time)
        if solution == BRUTE_FORCE:
            return self._brute_force()
        elif solution == ANNEALING_SIMULATED:
            return self._annealing_simulated()
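    # How the acceptance rule in _annealing_simulated behaves (added note):
    # p = e**((temp_score - current_score)/T), so for a score drop of 50,
    # p = e**(-50/1000) ~ 0.95 while T = 1000 but only ~ 0.007 once T has
    # cooled to 10: worse solutions are accepted freely early, rarely late.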
    @classmethod
    def addable(cls, tuple_stories, solution=Solution()):
        total_height = solution._height
        for story in tuple_stories:
            total_height += story._height
        if total_height <= cls.__height:
            return True
        return False

    @classmethod
    def random_solution(cls, list_stories, length_stories):
        """
        produce a random solution
        """
        stories_in = [False] * length_stories
        solution = Solution()
        for i in xrange(length_stories):
            story = list_stories[i]
            if cls.addable((story,), solution):
                solution.add(story)
                stories_in[i] = True
            elif solution._height == cls.__height:
                break
        return solution, stories_in


N, W, H = [int(x) for x in raw_input().split()]
p = Optimizer(W, H)
while (N):
    command = raw_input().split()
    if command[0] == "S":  # story
        t, s, h = [int(x) for x in command[1:]]
        p.add(Story(t, s, h))
    elif command[0] == "R":  # Reload
        tr = int(command[1])
        print p.produce_solution(tr, solution=ANNEALING_SIMULATED)
    N -= 1
--------------------------------------------------------------------------------
/quora/nearby.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Jan 09, 2013

@author: Mourad Mourafiq

About: This is an attempt to solve the Quora challenge Nearby.
"""
import math
import heapq

THRESHOLD = 0.001
SQUARE_SIDE = 10


class Square(object):
    """
    Square is a data structure that represents a part of the plane.
    A square is divided into 4 parts.

    @type _origine_x: float
    @param _origine_x: the x coordinate of the origin of this square

    @type _origine_y: float
    @param _origine_y: the y coordinate of the origin of this square

    @type _current_distance: float
    @param _current_distance: current distance from the query coordinates

    @type _tn: int
    @param _tn: number of topics in this square

    @type _topics: list
    @param _topics: list of topics
    """

    def __init__(self, origine_x, origine_y):
        self._origine_x = origine_x
        self._origine_y = origine_y
        self._current_distance = 0
        self._tn = 0
        self._topics = []

    def __gt__(self, square):
        delta = self._current_distance - square._current_distance
        if delta < 0:
            return True
        return False

    def add(self, topic):
        self._tn += 1
        self._topics.append(topic)

    def set_current_distance(self, origin_x, origin_y):
        self._current_distance = Topic.euclidean_dis(self._origine_x, self._origine_y, origin_x, origin_y)
        for topic in self._topics:
            topic.set_current_distance(origin_x, origin_y)

    def get_topics(self, tn):
        if self._tn >= tn:
            return self._topics[:tn], 0, False
        else:
            return self._topics, tn - self._tn, True
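# Illustration (added): with SQUARE_SIDE = 10, topics are bucketed into
# 10x10 cells whose centres sit at (5, 5), (15, 5), (25, 5), ... A point
# (x, y) falls in the cell centred at (x - x % 10 + 5, y - y % 10 + 5);
# e.g. (23.4, 7.9) maps to (25.0, 5.0), as in NearbySquare.add_topic below.
def _demo_square_centre(x=23.4, y=7.9):
    return (x - x % 10 + 5, y - y % 10 + 5)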
class Topic(object):
    """
    Topic

    @type _id: int
    @param _id: the id of the topic

    @type _x: float
    @param _x: the x coordinate in the plane

    @type _y: float
    @param _y: the y coordinate in the plane

    @type _current_distance: float
    @param _current_distance: the current distance from the origin (origin being the query coordinates)

    @type _qn: int
    @param _qn: the number of questions associated with this topic

    @type _questions: list
    @param _questions: the list of the questions associated with this topic
    """

    def __init__(self, id, x, y):
        self._id = id
        self._x = x
        self._y = y
        self._current_distance = 0
        self._qn = 0
        self._questions = []

    def __gt__(self, topic):
        delta = self._current_distance - topic._current_distance
        if delta < -THRESHOLD:
            return True
        if delta > THRESHOLD:
            return False
        return True if self._id > topic._id else False

    def add(self, question):
        self._qn += 1
        self._questions.append(question)

    def get_questions(self, qn, questions):
        go_on = True
        for question in self._questions:
            if question not in questions:
                questions.append(question)
                qn -= 1
                if qn == 0:
                    go_on = False
                    break
        return sorted(questions, reverse=True), qn, go_on

    def set_current_distance(self, origin_x, origin_y):
        self._current_distance = self.euclidean_dis(self._x, self._y, origin_x, origin_y)

    @staticmethod
    def euclidean_dis(x1, y1, x2, y2):
        return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))


class Nearby(object):
    """
    Nearby solver

    @type _tn: int
    @param _tn: the number of topics created

    @type _topics: dict
    @param _topics: the dictionary of topics created
    """

    def __init__(self, tn):
        self._tn = tn
        self._topics = {}

    def add_topic(self, topic_id, x, y):
        self._topics[topic_id] = Topic(topic_id, x, y)

    def add_question(self, question, nbr_topics, topics):
        if nbr_topics <= 0:
            return
        for i in xrange(nbr_topics):
            topic_id = int(topics[i])
            self._topics[topic_id].add(question)

    def _process_query_topic(self, nbr_results, list_topics):
        if nbr_results > self._tn:
            nbr_results = self._tn
        return ' '.join([str(list_topics[i]._id) for i in xrange(nbr_results - 1, -1, -1)])

    def _process_query_question(self, nbr_results, list_topics):
        results = []
        go_on = True
        for i in xrange(self._tn - 1, -1, -1):
            results, nbr_results, go_on = list_topics[i].get_questions(nbr_results, results)
            if not go_on:
                break
        return ' '.join([str(x) for x in results])

    def process_query(self, q_type, q_nbr_results, q_x, q_y):
        list_topics = []
        for topic in self._topics.itervalues():
            topic.set_current_distance(q_x, q_y)
            heapq.heappush(list_topics, topic)
        if q_type == "t":
            return self._process_query_topic(q_nbr_results, list_topics)
        if q_type == "q":
            return self._process_query_question(q_nbr_results, list_topics)


class NearbySquare(Nearby):
    """
    Nearby solver using the square data structure

    @type _tn: int
    @param _tn: the number of topics created

    @type _ts: int
    @param _ts: the number of squares created

    @type _topics: dict
    @param _topics: the dictionary of topics created

    @type _squares: dict
    @param _squares: the dictionary of squares created
    """

    def __init__(self, tn):
        self._tn = tn
        self._ts = 0
        self._topics = {}
        self._squares = {}

    def add_topic(self, topic_id, x, y):
        topic = Topic(topic_id, x, y)
        self._topics[topic_id] = topic
        # locate which square this topic should go in
        left_x = x % 10
        left_y = y % 10
        square_x = (x - left_x) + 5
square_y = (y - left_y) + 5 209 | # check if this square exists 210 | try: 211 | square = self._squares[(square_x, square_y)] 212 | except: 213 | square = Square(square_x, square_y) 214 | self._squares[(square_x, square_y)] = square 215 | self._ts += 1 216 | square.add(topic) 217 | 218 | def _process_query_topic(self, nbr_results, list_squares): 219 | results = [] 220 | go_on = True 221 | if nbr_results > self._tn: 222 | nbr_results = self._tn 223 | for i in xrange(self._ts - 1, -1, -1): 224 | temp_results, nbr_results, go_on = list_squares[i].get_topics(nbr_results) 225 | results += temp_results 226 | if not go_on: 227 | break 228 | results = sorted(results, reverse=True) 229 | return ' '.join([str(result._id) for result in results]) 230 | 231 | def _process_query_question(self, nbr_results, list_squares): 232 | results = [] 233 | go_on = True 234 | for i in xrange(self._ts - 1, -1, -1): 235 | for topic in sorted(list_squares[i]._topics, reverse=True): 236 | results, nbr_results, go_on = topic.get_questions(nbr_results, results) 237 | if not go_on: 238 | break 239 | if not go_on: 240 | break 241 | return ' '.join([str(x) for x in results]) 242 | 243 | def process_query(self, q_type, q_nbr_results, q_x, q_y): 244 | list_squares = [] 245 | for square in self._squares.itervalues(): 246 | square.set_current_distance(q_x, q_y) 247 | heapq.heappush(list_squares, square) 248 | if q_type == "t": 249 | return self._process_query_topic(q_nbr_results, list_squares) 250 | if q_type == "q": 251 | return self._process_query_question(q_nbr_results, list_squares) 252 | 253 | 254 | T, Q, N = [int(x) for x in raw_input().split()] 255 | nearby = NearbySquare(T) 256 | while (T): # list of topics 257 | command = raw_input().split() 258 | nearby.add_topic(int(command[0]), float(command[1]), float(command[2])) 259 | T -= 1 260 | while (Q): # list of questions 261 | command = raw_input().split() 262 | nearby.add_question(int(command[0]), int(command[1]), command[2:]) 263 | Q -= 1 264 | while (N): # process queries 265 | command = raw_input().split() 266 | print nearby.process_query(command[0], int(command[1]), float(command[2]), float(command[3])) 267 | N -= 1 -------------------------------------------------------------------------------- /quora/results.txt: -------------------------------------------------------------------------------- 1 | 3rCWr +1 2 | snInN -1 3 | ibfT7 -1 4 | IcbKR +1 5 | SIXmF +1 6 | dLCdh +1 7 | ziFJ8 -1 8 | 1WtTD -1 9 | 9uIKh +1 10 | df4Mc -1 11 | 3nxpY -1 12 | aesmq +1 13 | MyTDz +1 14 | TDMhx +1 15 | Y0rW3 +1 16 | KCcKf +1 17 | 2cz5M -1 18 | kqIJj -1 19 | C1Sg2 +1 20 | VAmIt -1 21 | ku9j1 +1 22 | TjHRV +1 23 | sq8Xj -1 24 | uKzm4 +1 25 | 3at4H +1 26 | 8nXGS +1 27 | 4x8ij +1 28 | PeMnA -1 29 | UUPpU -1 30 | HQ4lZ +1 31 | lEv01 +1 32 | XCY52 -1 33 | FNoY7 +1 34 | JJbco -1 35 | PHQ7z +1 36 | Xejsj +1 37 | A4IsT -1 38 | 7cU9R -1 39 | 8mSRL -1 40 | jRMuo -1 41 | dljxu +1 42 | wZyoj -1 43 | us2ca +1 44 | EVenw -1 45 | QLseT +1 46 | lanI5 +1 47 | RF9di +1 48 | 3e6Aa -1 49 | W5mvO -1 50 | LkKbu -1 51 | gbus8 -1 52 | LN4W4 -1 53 | 9FNA4 -1 54 | rd3qM +1 55 | pV8eI -1 56 | 5wnO7 -1 57 | 03KFY -1 58 | i25BS +1 59 | AziH8 -1 60 | YyCpz -1 61 | qLfPb +1 62 | CJBeL -1 63 | EaAPx -1 64 | sQDFf -1 65 | CELfn -1 66 | Ac6Hy -1 67 | ULryN -1 68 | qCBAx -1 69 | Hoz2c -1 70 | 8kLbb +1 71 | KAWwb +1 72 | lNMTe +1 73 | MzTHU -1 74 | Sq8XV -1 75 | s8ZUG -1 76 | lDcDx +1 77 | xZoGD +1 78 | vHLAR +1 79 | Ag2kt +1 80 | baqkE +1 81 | Hsbs2 -1 82 | BlvEz -1 83 | CJqsS -1 84 | vbzII +1 85 | R16fw -1 86 | IxBjS +1 
87 | yJl2b -1 88 | RXdcX +1 89 | T7uzV +1 90 | ccZGw +1 91 | f9xFZ -1 92 | dO9iE -1 93 | 2TaN2 +1 94 | a1XoY -1 95 | 1xXiG -1 96 | a8kZK -1 97 | mXe41 +1 98 | 8NfbF +1 99 | Sqdti +1 100 | gnCsH -1 101 | YGNBE +1 102 | zOA3j +1 103 | b3Cm5 +1 104 | Wjo2X +1 105 | IcuwU +1 106 | FyieE +1 107 | 5OUK8 +1 108 | SjCvq -1 109 | kKVUa +1 110 | CEVSg +1 111 | F2MAp -1 112 | hOKPP +1 113 | X22r3 +1 114 | kDxwQ +1 115 | s42QM -1 116 | Olrdu -1 117 | P6Fag -1 118 | IHvly +1 119 | 5bWYy -1 120 | 5zeok +1 121 | 6Z4hF -1 122 | 1RjNM +1 123 | yc6uV +1 124 | JC92f +1 125 | pw6Bl +1 126 | KYPQw -1 127 | ZPFtI +1 128 | ZBvXR -1 129 | wofSH -1 130 | Q4Ika +1 131 | Y7U06 +1 132 | orSui +1 133 | BZ0Op -1 134 | 0T3oe -1 135 | A9NM2 +1 136 | nHCpf -1 137 | tieFX +1 138 | i28eq +1 139 | XftK1 -1 140 | DjFAx +1 141 | WDvPc +1 142 | UEEYY +1 143 | GKXw3 +1 144 | N5qAi +1 145 | 1DGAU -1 146 | XlcdP -1 147 | CVfA1 -1 148 | b39YD -1 149 | d1NlJ +1 150 | ue6lj -1 151 | hx3rt -1 152 | wc0Vt -1 153 | 8iD9Q -1 154 | PNt7q -1 155 | Y97G2 +1 156 | svZ34 +1 157 | sxgEq +1 158 | ZDIWx -1 159 | rD2Az -1 160 | pRsn8 +1 161 | MiByI +1 162 | vM7l5 +1 163 | kbQbh -1 164 | DhzQW -1 165 | W3cWn -1 166 | ItQ1c -1 167 | 4RG18 +1 168 | 9bZNj +1 169 | IHJ5G -1 170 | bsFcm -1 171 | LWfTR -1 172 | ZqXuD -1 173 | b3miO -1 174 | ruJ8j +1 175 | kpmxZ -1 176 | zmAo1 -1 177 | Xaref +1 178 | BhtUL -1 179 | FJefe +1 180 | EGsdK +1 181 | JMmSL +1 182 | SxZPl -1 183 | A9yVd -1 184 | YRv3l +1 185 | Np4je -1 186 | FC1TZ +1 187 | v4CRw +1 188 | DjT9c +1 189 | TEVGW +1 190 | DAO68 +1 191 | 7aT15 +1 192 | quDTm +1 193 | kUG9i -1 194 | FH95r +1 195 | dYH4a +1 196 | 1FDKf +1 197 | lVhAa -1 198 | pL58M -1 199 | UXRFO +1 200 | oNz1I +1 201 | oomHm -1 202 | BHZT7 +1 203 | Ky1yG +1 204 | Esu6G -1 205 | rucD8 -1 206 | NntCQ -1 207 | MiAWn -1 208 | a7X2g +1 209 | fH4fZ +1 210 | ew4Ra -1 211 | py7OD -1 212 | fsgxM +1 213 | bXpcf -1 214 | 9wBEn -1 215 | tgchk +1 216 | YHFHF -1 217 | VuXx7 -1 218 | rEnhQ -1 219 | APrIe +1 220 | SxoIg +1 221 | qdQFT +1 222 | v8u87 +1 223 | BTWxu +1 224 | JxQvo -1 225 | 3SvJ7 -1 226 | Gy42c -1 227 | lWPIk -1 228 | r1TCV -1 229 | LhKtL -1 230 | hfawL -1 231 | KJDMV -1 232 | BN93Q +1 233 | 4eMVe -1 234 | rqtFq -1 235 | rUI3j +1 236 | CERA7 -1 237 | S9Azu -1 238 | z6LjY +1 239 | e0tCJ +1 240 | n9CNr -1 241 | lEJzf +1 242 | gDUHm +1 243 | lWFHR +1 244 | MATqj +1 245 | 6Xt0t +1 246 | iIwLp +1 247 | 9Zm1e -1 248 | OHFov +1 249 | uan5I -1 250 | frFAd +1 251 | HkBSy +1 252 | C3a0q -1 253 | wh2WS +1 254 | ReTOY +1 255 | BT4jS -1 256 | 0xECG +1 257 | vj72F +1 258 | 7k7OU +1 259 | 3JIx0 -1 260 | UiKFt +1 261 | 1vGez -1 262 | mPsf0 -1 263 | AzO5u +1 264 | hpRrl -1 265 | 7cwj6 -1 266 | XMwNb -1 267 | kLXoo -1 268 | gJr3Q -1 269 | ZS89w +1 270 | DGunR +1 271 | 8KygV +1 272 | W7o8e +1 273 | uY40J -1 274 | b8Vdt -1 275 | NB5GM +1 276 | UW8Sd +1 277 | 8ilC4 -1 278 | Z37t7 -1 279 | TYLgu -1 280 | 28rEW +1 281 | YMh5A -1 282 | Z4KU6 -1 283 | YuXTr +1 284 | BymPK -1 285 | 0IiAP +1 286 | cqded +1 287 | NyiLN +1 288 | 6x5i5 +1 289 | 7w0vA +1 290 | 1uQnB +1 291 | oy1UM +1 292 | PKxcI -1 293 | 3CZAd +1 294 | hehRo -1 295 | a9F6V +1 296 | WhX2F +1 297 | ge4Nb -1 298 | Pe9Ds +1 299 | lZ14T +1 300 | mfglK +1 301 | 42faP +1 302 | lAque +1 303 | ntnal +1 304 | Jasel -1 305 | YYoW1 -1 306 | DHJVP +1 307 | SUIoA -1 308 | Tppy2 -1 309 | 7SLdU +1 310 | MJKb9 +1 311 | Nlg2a +1 312 | RNcmi +1 313 | SFZMz -1 314 | 9ukXM +1 315 | WMHcA +1 316 | Fifzv -1 317 | fDgUN -1 318 | SuI41 +1 319 | 6ikOI +1 320 | yvCqh -1 321 | UOpC8 -1 322 | 75C7e -1 323 | m5S2h +1 324 | gqSig 
-1 325 | BVxmn -1 326 | E3R54 +1 327 | 49kJj +1 328 | hgr96 +1 329 | ydfLT -1 330 | wjeiV +1 331 | Zb62A -1 332 | RrWdE -1 333 | 2H2wU +1 334 | HcVYf -1 335 | 5KjAs -1 336 | M2oQC +1 337 | 93kvL -1 338 | TC3y5 -1 339 | nvi5Q -1 340 | 31L05 +1 341 | uOBuO +1 342 | 7139c +1 343 | FWxzp +1 344 | RSwJ2 +1 345 | 2iRKp +1 346 | gJYDb -1 347 | NBV2w -1 348 | Xu24W +1 349 | 1B3K5 +1 350 | Hpe1Y +1 351 | 8fo4i -1 352 | mZnrj -1 353 | 4PTcN -1 354 | PwRKw +1 355 | lvuaq -1 356 | Xyz8E -1 357 | l8TsJ +1 358 | YXBaC -1 359 | 3AiLQ -1 360 | AUdZp -1 361 | OrhRn +1 362 | 0hKzy +1 363 | yVJG9 +1 364 | DwZ91 -1 365 | CiCix +1 366 | 1rURm -1 367 | uRb8x +1 368 | hNdOm -1 369 | 0mlqT +1 370 | HVfdA +1 371 | GQTwU +1 372 | FbAKm +1 373 | RkxT4 -1 374 | unLvt -1 375 | sAI1b +1 376 | m1Vzy +1 377 | vnCWO +1 378 | SheVD +1 379 | xU9p1 -1 380 | tWQIu +1 381 | D4Q0p +1 382 | J6oW5 +1 383 | JYSZN +1 384 | DSkNC -1 385 | NMX1A -1 386 | ilHXM -1 387 | gqx1j +1 388 | twJio -1 389 | AC2iU +1 390 | UjSKR -1 391 | kMJtM -1 392 | oCbBC -1 393 | 8ecRn -1 394 | eAnji +1 395 | ZON1D +1 396 | hKsn0 +1 397 | x8qVu -1 398 | xyI2l -1 399 | LQZ9d -1 400 | yCiZ7 -1 401 | z1G6d +1 402 | AEaC5 +1 403 | uwRzL -1 404 | QYyXT +1 405 | pvcTy -1 406 | pDQOT +1 407 | DexdS +1 408 | 2t6rO -1 409 | 3uMDO -1 410 | k7V2B -1 411 | sW0fL -1 412 | 3RJAN +1 413 | mB6hU +1 414 | CvxqX -1 415 | YYcs9 -1 416 | cZAn3 -1 417 | sToxl +1 418 | rZcB9 -1 419 | LiYCH -1 420 | 8Zquw -1 421 | PmZDG -1 422 | rsCuL -1 423 | dliev +1 424 | Hb30K +1 425 | LmSJT +1 426 | x8VHE +1 427 | gLbT6 -1 428 | 3ZCSD +1 429 | P0bbj +1 430 | 0SCTp -1 431 | Y9VUN +1 432 | hk8gh -1 433 | 8Jxn7 +1 434 | zyM6v -1 435 | 1UT5q -1 436 | xzTJu -1 437 | L5ty3 -1 438 | q3iR0 +1 439 | kIlr3 +1 440 | AFno7 -1 441 | Q30WV +1 442 | 9WyKp +1 443 | OrbMK -1 444 | 1crzL -1 445 | 75zq7 -1 446 | 0M9md -1 447 | 7x4uA +1 448 | g42wQ -1 449 | W6kby -1 450 | VzRvG +1 451 | wnddf -1 452 | Fw5Sc -1 453 | btL2k -1 454 | lmsyA -1 455 | QIczK -1 456 | i9PM1 +1 457 | ULDFC -1 458 | BjyGS +1 459 | chgHB +1 460 | y32d8 +1 461 | W5R4s +1 462 | lBi1Q +1 463 | GnXj3 -1 464 | 3ekNC -1 465 | wi6YM -1 466 | E29HU -1 467 | PZaNW -1 468 | ru6BY -1 469 | EIRJY +1 470 | L7OaE +1 471 | EcJrP +1 472 | vfi01 -1 473 | 1Vly7 -1 474 | ibAED -1 475 | GPqoG +1 476 | ADfzT -1 477 | FTRNE +1 478 | chBAx -1 479 | vQ6KG -1 480 | KwXO4 -1 481 | 3TzgC +1 482 | PgYPK -1 483 | M8Sqt -1 484 | CPbgW +1 485 | N2j8m +1 486 | taDRA -1 487 | ZLcrw +1 488 | H4sPY -1 489 | BkBE0 +1 490 | zLXXn -1 491 | yQjNY -1 492 | pWyKW +1 493 | hiH6G +1 494 | Ag1tS -1 495 | 7OD1M -1 496 | VBFpQ +1 497 | nP4UA +1 498 | opFHG -1 499 | siSyi -1 500 | zmgX6 -1 -------------------------------------------------------------------------------- /quora/typehead.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on Jan 04, 2013 4 | 5 | @author: Mourad Mourafiq 6 | 7 | About: This is an attempt to solve the Quora challenge Typehead. 8 | ''' 9 | import re 10 | import copy 11 | import datetime 12 | 13 | COMMANDS = "(ADD)|(DEL)|(W?QUERY)" 14 | ANY_STRING = "(\\S*.*)" 15 | SEPARATORS = "(?: |\\t)" 16 | IDS = "\\w+" 17 | TYPES = "user|topic|question|board" 18 | FLOATS = "[0-9]+(?:.[0-9]*)?" 
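# Sample commands this grammar accepts (hypothetical examples, for illustration only):
#   ADD question q1 0.5 machine learning basics
#   DEL q1
#   QUERY 10 machine
#   WQUERY 10 1 topic:9.5 machine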
INTS = "[0-9]+"
BOOSTS = "((?:" + TYPES + "|(?:" + IDS + ")):" + FLOATS + SEPARATORS + ")*"
ANY_COMMAND = "(?P<command>" + COMMANDS + ")" + SEPARATORS + "(?P<parameters>" + ANY_STRING + ")"
ADD_COMMAND = "(?P<type>" + TYPES + ")" + SEPARATORS + \
              "(?P<id>" + IDS + ")" + SEPARATORS + \
              "(?P<score>" + FLOATS + ")" + SEPARATORS + \
              "(?P<content>" + ANY_STRING + ")"
DEL_COMMAND = "(?P<id>" + IDS + ")"
QUERY_COMMAND = "(?P<nbr_results>" + INTS + ")" + SEPARATORS + \
                "(?P<query>" + ANY_STRING + ")"
WQUERY_COMMAND = "(?P<nbr_results>" + INTS + ")" + SEPARATORS + \
                 "(?P<nbr_boosts>" + INTS + ")" + SEPARATORS + \
                 "(?P<boosts>" + BOOSTS + ")" + \
                 "(?P<query>" + ANY_STRING + ")"
COMMAND_MATCHER = re.compile(ANY_COMMAND)
ADD_COMMAND_MATCHER = re.compile(ADD_COMMAND)
DEL_COMMAND_MATCHER = re.compile(DEL_COMMAND)
QUERY_COMMAND_MATCHER = re.compile(QUERY_COMMAND)
WQUERY_COMMAND_MATCHER = re.compile(WQUERY_COMMAND)

NOK = "{'':[]}"

class Prefixer():

    def __init__(self):
        self.__data = {}

    def __repr__(self):
        return 'Prefixer(%s)' % (self.__data,)

    def __eq__(self, other):
        return self.__data == other.__data

    def get_data(self):
        return self.__data

    def insert(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                break
            len_prefix = len(prefix)
            if prefix != key:
                # split key into prefix:suffix, move data
                suffix = key[len_prefix:]
                current_node = node[key]
                node[prefix] = {suffix: current_node}
                del node[key]
            word = word[len_prefix:]
            node = node[prefix]
        if word:
            # no node shares a prefix with the rest of word: start a fresh subtree
            node[word] = eval(NOK)
            node[word][''].append(item_id)
        else:
            # word was fully consumed: store the id at this node
            try:
                node[word].append(item_id)
            except KeyError:
                node[word] = []
                node[word].append(item_id)
        return True

    def remove(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            node = node.get(prefix, None)
            if not node:
                return False
            word = word[len(prefix):]
        try:
            node[''].remove(item_id)
            return True
        except (KeyError, ValueError):
            return False

    def _search_dico(self, word):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            if not key:
                return False
            if prefix != key:
                if prefix == word:
                    return node[key]
                else:
                    return False
            node = node[prefix]
            word = word[len(prefix):]
        return node

    def search(self, word):
        dico = self._search_dico(word)
        if dico != False:
            return self.traverse_dico(dico)
        return []

    @staticmethod
    def traverse_dico(dico):
        results = []
        for key, value in dico.iteritems():
            if key == '':
                results += value
            else:
                results += Prefixer.traverse_dico(value)
        return results

    @staticmethod
    def longest_prefix(word, candidates):
        """
        return the longest prefix match between word and any of the
        candidates, if any. Only one candidate will match.
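        For example (illustrative): longest_prefix('tea', ['test']) -> ('te', 'test'),
        while longest_prefix('xyz', ['test']) -> ('', None).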
135 | """ 136 | if word: 137 | wc = word[0] 138 | for c in candidates: 139 | if c.startswith(wc): 140 | for i in reversed(xrange(1, min(len(word), len(c))+1)): 141 | if c.startswith(word[:i]): 142 | return (word[:i], c) 143 | return ('', None) 144 | 145 | 146 | class TypeHead(object): 147 | """ 148 | typehead object that manages all items 149 | 150 | @type items: dict 151 | @param items: dict of {id : item} 152 | """ 153 | 154 | def __init__(self): 155 | self.items = {} 156 | self.prefixer = Prefixer() 157 | 158 | def _add(self, item): 159 | item_id = item.id 160 | item_content = item.content 161 | #add item to the dict 162 | self.items[item_id] = item 163 | tokens = re.split(SEPARATORS, item_content.lower()) 164 | #add tokens to the prefixer 165 | for token in tokens: 166 | self.prefixer.insert(token, item_id) 167 | 168 | def _delete(self, item_id): 169 | item_content = self.items[item_id].content 170 | #delete the item from the dict 171 | del self.items[item_id] 172 | tokens = re.split(SEPARATORS, item_content.lower()) 173 | #remove items from the prefixer for each token 174 | for token in tokens: 175 | self.prefixer.remove(token, item_id) 176 | 177 | def _set_items_query(self, query): 178 | items_ids = set() 179 | tokens = re.split(SEPARATORS, query.lower()) 180 | cpt = True 181 | for token in tokens: 182 | if cpt: 183 | items_ids = set(self.prefixer.search(token)) 184 | else: 185 | items_ids = items_ids.intersection(set(self.prefixer.search(token))) 186 | if items_ids == set(): 187 | return items_ids 188 | cpt = False 189 | return items_ids 190 | 191 | 192 | def _query(self, nbr_results, query): 193 | #collect potential items' ids 194 | items_ids = self._set_items_query(query) 195 | #check if items_ids is not empty 196 | if items_ids == set(): 197 | return "" 198 | #rank them according to the scoring method 199 | sorted_results = SortedItems(nbr_results) 200 | for item_id in items_ids: 201 | sorted_results.add(self.items[item_id]) 202 | return sorted_results 203 | 204 | def _wquery(self, nbr_results, nbr_boosts, boosts, query): 205 | nbr_boosts = int(nbr_boosts) 206 | #collect potential items' ids 207 | items_ids = self._set_items_query(query) 208 | #check if items_ids is not empty 209 | if items_ids == set(): 210 | return "" 211 | #check the boosts and create boosts_dict 212 | boosts_dict = {} 213 | if nbr_boosts > 0: 214 | boosts = boosts.split() 215 | for boost in boosts: 216 | type, score = boost.split(':') 217 | boosts_dict[type] = float(score) 218 | #rank them according to the scoring method 219 | sorted_results = SortedItems(nbr_results) 220 | for item_id in items_ids: 221 | item = copy.deepcopy(self.items[item_id]) 222 | #chech the boost 223 | if nbr_boosts > 0: 224 | if item.id in boosts_dict.keys(): 225 | item.score *= boosts_dict[item.id] 226 | if item.type in boosts_dict.keys(): 227 | item.score *= boosts_dict[item.type] 228 | sorted_results.add(item) 229 | return sorted_results 230 | 231 | def process_command(self, in_command): 232 | """ 233 | validate the current command and map it to the right function 234 | """ 235 | any_command = COMMAND_MATCHER.match(in_command) 236 | # 237 | if any_command: 238 | command = any_command.group("command") 239 | parameters = any_command.group("parameters") 240 | if (command == "ADD"): 241 | add_command = ADD_COMMAND_MATCHER.match(parameters) 242 | self._add(Item(add_command.group("type"), add_command.group("id"), 243 | add_command.group("score"), add_command.group("content"))) 244 | elif (command == "DEL"): 245 | del_command = 
DEL_COMMAND_MATCHER.match(parameters) 246 | self._delete(del_command.group("id")) 247 | elif (command == "QUERY"): 248 | query_command = QUERY_COMMAND_MATCHER.match(parameters) 249 | results = self._query(query_command.group("nbr_results"), query_command.group("query")) 250 | print results 251 | elif (command == "WQUERY"): 252 | wquery_command = WQUERY_COMMAND_MATCHER.match(parameters) 253 | results = self._wquery(wquery_command.group("nbr_results"), wquery_command.group("nbr_boosts"), 254 | wquery_command.group("boosts"), wquery_command.group("query")) 255 | print results 256 | 257 | 258 | class Item(object): 259 | """ 260 | either a topic, a user, a board or a question 261 | 262 | @type type: str 263 | @param type: The item's type. 264 | 265 | @type id: str 266 | @param id: The item's id. 267 | 268 | @type score: float 269 | @param score: The item's score. 270 | 271 | @type content: str 272 | @param contetn: The item's content. 273 | 274 | @type time: time 275 | @param time: The item's time of creation. 276 | """ 277 | 278 | def __init__(self, type, id, score, content): 279 | self.type = type 280 | self.id = id 281 | self.score = float(score) 282 | self.content = content 283 | self.time = datetime.datetime.now() 284 | 285 | def __repr__(self): 286 | return self.id 287 | 288 | def better_than(self, item): 289 | """ 290 | compare the current item to the input item. 291 | follows this method: 292 | . highest score goes first. 293 | . same score; time FIFO. 294 | return true if the current item is better than the input, otherwise returns false 295 | """ 296 | if (self.score > item.score): 297 | return True 298 | if (self.score < item.score): 299 | return False 300 | return True if (self.time > item.time) else False 301 | 302 | 303 | class SortedItems(object): 304 | """ 305 | Keeps a list of sorted elements depending on the scoring method. 306 | 307 | @type items: list 308 | @param items: the list sorted items 309 | 310 | @type max_size: int 311 | @param max_sier: the max size of the list (-1 means unlimited number of items) 312 | """ 313 | 314 | def __init__(self, max_size=-1): 315 | self.items = [] 316 | self.max_size = int(max_size) 317 | 318 | def __repr__(self): 319 | return " ".join([item.id for item in self.items]) 320 | 321 | def set_max_size(self, max_size): 322 | self.max_size = int(max_size) 323 | 324 | def add(self, item): 325 | """ 326 | add new item to the list of items. 327 | if the list is full, add the item only if it has better score than at least one item 328 | in the list, and pop the item with the worst score. 329 | """ 330 | items_l = len(self.items) 331 | pos = items_l 332 | for i in xrange(items_l): 333 | if (item.better_than(self.items[i])): 334 | pos = i 335 | break 336 | if (self.max_size < 0 or pos < self.max_size): 337 | temp = self.items[:pos] 338 | temp.append(item) 339 | temp += self.items[pos:] 340 | self.items = temp 341 | #now in the case of exceeding max_size 342 | if (self.max_size > 0 and (items_l+1)>self.max_size): 343 | self.items.pop() 344 | 345 | 346 | t = TypeHead() 347 | N = int(raw_input()) 348 | while(N): 349 | t.process_command(raw_input()) 350 | N -= 1 -------------------------------------------------------------------------------- /radix_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on Dec 01, 2012 4 | 5 | @author: Mourad Mourafiq 6 | 7 | About: This is an attempt to implement the radix tree algo. 
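A radix tree is a space-optimized trie: runs of single-child nodes are merged,
so each edge carries a whole substring rather than a single character. Here the
tree is a nested dict, and the reserved key '' at a node holds the ids stored
under that prefix.

Illustrative usage (of the Prefixer class below):
    p = Prefixer()
    p.insert('test', 1)
    p.insert('team', 2)
    p.search('te')      # -> ids [1, 2] (order may vary)
    p.remove('test', 1)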
Features :
-> insert
-> remove
-> search
'''
NOK = "{'':[]}"


class Prefixer():
    def __init__(self):
        self.__data = {}

    def __repr__(self):
        return 'Prefixer(%s)' % (self.__data,)

    def __eq__(self, other):
        return self.__data == other.__data

    def get_data(self):
        return self.__data

    def insert(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                break
            len_prefix = len(prefix)
            if prefix != key:
                # split key into prefix:suffix, move data
                suffix = key[len_prefix:]
                current_node = node[key]
                node[prefix] = {suffix: current_node}
                del node[key]
            word = word[len_prefix:]
            node = node[prefix]
        if word:
            # no node shares a prefix with the rest of word: start a fresh subtree
            node[word] = eval(NOK)
            node[word][''].append(item_id)
        else:
            # word was fully consumed: store the id at this node
            try:
                node[word].append(item_id)
            except KeyError:
                node[word] = []
                node[word].append(item_id)
        return True

    def remove(self, word, item_id):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            node = node.get(prefix, None)
            if not node:
                return False
            word = word[len(prefix):]
        try:
            node[''].remove(item_id)
            return True
        except (KeyError, ValueError):
            return False

    def _search_dico(self, word):
        node = self.__data
        while word:
            prefix, key = self.longest_prefix(word, node.keys())
            if not prefix:
                return False
            if not key:
                return False
            if prefix != key:
                if prefix == word:
                    return node[key]
                else:
                    return False
            node = node[prefix]
            word = word[len(prefix):]
        return node

    def search(self, word):
        dico = self._search_dico(word)
        if dico != False:
            return self.traverse_dico(dico)
        return []

    @staticmethod
    def traverse_dico(dico):
        results = []
        for key, value in dico.iteritems():
            if key == '':
                results += value
            else:
                results += Prefixer.traverse_dico(value)
        return results

    @staticmethod
    def longest_prefix(word, candidates):
        """
        return the longest prefix match between word and any of the
        candidates, if any. Only one candidate will match.
109 | """ 110 | if word: 111 | wc = word[0] 112 | for c in candidates: 113 | if c.startswith(wc): 114 | for i in reversed(xrange(1, min(len(word), len(c)) + 1)): 115 | if c.startswith(word[:i]): 116 | return (word[:i], c) 117 | return ('', None) 118 | -------------------------------------------------------------------------------- /recommendation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: Recommendations 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | from __future__ import division 8 | import multiprocessing 9 | import collections 10 | from map_reduce import MapReduce 11 | from similarities.correlation import pearson_sim 12 | from similarities.euclidean import euclidean_sim 13 | 14 | # A dictionary of movie critics and their ratings of a small 15 | # set of movies 16 | def loadMovieLens(path='movielens'): 17 | # Get movie titles 18 | movies = {} 19 | for line in open(path + '/u.item'): 20 | (id, title) = line.split('|')[0:2] 21 | movies[id] = title 22 | # Load data 23 | prefs = collections.defaultdict(dict) 24 | for line in open(path + '/u.data'): 25 | (user, movieid, rating, ts) = line.split('\t') 26 | prefs[user][movies[movieid]] = float(rating) 27 | return prefs 28 | 29 | 30 | critics = loadMovieLens() 31 | 32 | 33 | def transform_items(items): 34 | result = collections.defaultdict(dict) 35 | for x in items: 36 | for y in items[x]: 37 | # Flip item and person 38 | result[y][x] = items[x][y] 39 | return result 40 | 41 | 42 | def top_similars_map(data): 43 | """ 44 | map for top similars 45 | """ 46 | items, x, i, similarity = data 47 | l = len(items) 48 | y_items = items.keys()[i * (int(round(l / 4))):(i + 1) * (int(round(l / 4)))] 49 | print multiprocessing.current_process().name, 'processing ', x, i 50 | return [(similarity(items, x, y), y) for y in y_items if y != x] 51 | 52 | 53 | def top_similars_reduce(data): 54 | """ 55 | reduce for top similars 56 | """ 57 | sim, item = data 58 | return (sim, item) 59 | 60 | 61 | def top_similars_mapreduce(items, x, n=5, similarity=pearson_sim): 62 | """ 63 | Returns the best matches for x from the items. 64 | Number of results and similarity function are optional params. 65 | """ 66 | mapper = MapReduce(top_similars_map, top_similars_reduce) 67 | scores = mapper([(items, x, i, similarity) for i in range(4)]) 68 | # Sort the list so the highest scores appear at the top 69 | scores.sort() 70 | scores.reverse() 71 | return scores[:n] 72 | 73 | 74 | def top_similars(items, x, n=5, similarity=pearson_sim): 75 | """ 76 | Returns the best matches for x from the items. 77 | Number of results and similarity function are optional params. 
78 | """ 79 | scores = [(similarity(items, x, y, cache=True), y) for y in items.keys() if y != x] 80 | # Sort the list so the highest scores appear at the top 81 | scores.sort() 82 | scores.reverse() 83 | return scores[:n] 84 | 85 | 86 | def similar_items(items, n=5, similarity=euclidean_sim, top_similars=top_similars): 87 | """ 88 | Returns a dictionary of top n similar items for each item 89 | """ 90 | similar_items_output = collections.defaultdict(dict) 91 | cpt = 0 92 | for item in items: 93 | cpt += 1 94 | if cpt % 100 == 0: print "%d / %d" % (cpt, len(items)) 95 | similars = top_similars(items=items, x=item, n=n, similarity=similarity) 96 | similar_items_output[item] = similars 97 | return similar_items_output 98 | 99 | 100 | def get_recommendations_user_filtred_map(data): 101 | """ 102 | map for the get_recommendations_user_filter function 103 | """ 104 | items, x, i, similarity = data 105 | l = len(items) 106 | y_items = items.keys()[i * (int(round(l / 4))):(i + 1) * (int(round(l / 4)))] 107 | print multiprocessing.current_process().name, 'processing ', x, i 108 | output = [] 109 | for y in y_items: 110 | if x == y: continue 111 | sim = similarity(items, x, y, cache=True) 112 | if sim <= 0: continue 113 | for item, score in items[y].items(): 114 | if item in items[x] and items[x][item] > 0: continue # ignore items x already interacted with 115 | output.append((item, (sim, score * sim))) 116 | return output 117 | 118 | 119 | def get_recommendations_user_filtred_reduce(data): 120 | """ 121 | reduce for the get_recommendations_user_filtred function 122 | """ 123 | item, scores = data 124 | ssim = 0 125 | ssim_x_score = 0 126 | for sim, sim_x_score in scores: 127 | ssim += sim 128 | ssim_x_score += sim_x_score 129 | return (item, ssim, ssim_x_score) 130 | 131 | 132 | def get_recommendations_user_filtred_mapreduce(items, x, n=5, similarity=pearson_sim): 133 | """ 134 | Returns recommendationx for x from the items, based on items from similar users 135 | """ 136 | mapper = MapReduce(get_recommendations_user_filtred_map, get_recommendations_user_filtred_reduce) 137 | scores = mapper([(items, x, i, similarity) for i in range(4)]) 138 | # Divide each total score by total weighting to get an average 139 | rankings = [(sim_x_score / sim, item) for (item, sim, sim_x_score) in scores] 140 | rankings.sort() 141 | rankings.reverse() 142 | return rankings[:n] 143 | 144 | 145 | def get_recommendations_user_filtred(items, x, n=5, similarity=pearson_sim): 146 | """ 147 | Returns recommendationx for x from the items, based on items from similar users 148 | """ 149 | similarities_sum = collections.defaultdict(int) 150 | sum_prod_sim_score = collections.defaultdict(int) 151 | for y in items.keys(): 152 | if x == y: continue # don't compare x with itself 153 | sim = similarity(items, x, y) 154 | if sim <= 0: continue # ignore similarities belew or equal 0 155 | for item, score in items[y].items(): 156 | if item in items[x] and items[x][item] > 0: continue # ignore items x already interacted with 157 | similarities_sum[item] += sim 158 | sum_prod_sim_score[item] += score * sim 159 | # Divide each total score by total weighting to get an average 160 | rankings = [(score / similarities_sum[item], item) for item, score in sum_prod_sim_score.items()] 161 | rankings.sort() 162 | rankings.reverse() 163 | return rankings[:n] 164 | 165 | 166 | def get_recommendations_item_filtred(items, similarity_matrix, x, n=5): 167 | """ 168 | Returns recommendations for x from items, based on items similar to user's items 169 
| """ 170 | similarities_sum = collections.defaultdict(int) 171 | sum_prod_sim_score = collections.defaultdict(int) 172 | for item, score in items[x].items(): # loop over item from x 173 | for (sim, sim_item) in similarity_matrix[item]: # loop over similar items to item 174 | if sim_item in items[x]: continue 175 | # Weighted sum of scores times similarity 176 | similarities_sum[sim_item] += sim 177 | sum_prod_sim_score[sim_item] += sim * score 178 | # Divide each total score by total weighting to get an average 179 | rankings = [(score / similarities_sum[item], item) for item, score in sum_prod_sim_score.items()] 180 | rankings.sort() 181 | rankings.reverse() 182 | return rankings[:n] 183 | 184 | 185 | def test_euclidean(): 186 | # people 187 | print 'user euclidean similarity' 188 | print euclidean_sim(critics, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 189 | print 'user top similarities' 190 | print top_similars(items=critics, x='99', similarity=euclidean_sim) 191 | print 'user recommendations' 192 | print get_recommendations_user_filtred(items=critics, x='99', similarity=euclidean_sim) 193 | # movies 194 | movies = transform_items(critics) 195 | print 'movies euclidean similarity' 196 | print euclidean_sim(movies, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 197 | print 'movies top similarities' 198 | print top_similars(items=movies, x='Twelve Monkeys (1995)', similarity=euclidean_sim) 199 | print 'movies recommendations' 200 | print get_recommendations_user_filtred(items=movies, x='Twelve Monkeys (1995)', similarity=euclidean_sim) 201 | print 'similar items' 202 | similarity_matrix = similar_items(items=movies, similarity=euclidean_sim) 203 | print get_recommendations_item_filtred(items=critics, similarity_matrix=similarity_matrix, x='99') 204 | 205 | 206 | def test_pearson(): 207 | # people 208 | print 'pearson sim' 209 | print pearson_sim(critics, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 210 | print 'user top sim' 211 | print top_similars(items=critics, x='99') 212 | 213 | print 'user recommendations' 214 | print get_recommendations_user_filtred(items=critics, x='99') 215 | # movies 216 | movies = transform_items(critics) 217 | print 'movies pearson sim' 218 | print pearson_sim(movies, 'Toy Story (1995)', 'Twelve Monkeys (1995)') 219 | print 'movies top similarities' 220 | print top_similars(items=movies, x='Twelve Monkeys (1995)') 221 | print 'movies recommendations' 222 | print get_recommendations_user_filtred(items=movies, x='Twelve Monkeys (1995)') 223 | print 'similar items' 224 | similarity_matrix = similar_items(items=movies, similarity=pearson_sim) 225 | print get_recommendations_item_filtred(items=critics, similarity_matrix=similarity_matrix, x='99') 226 | -------------------------------------------------------------------------------- /shingles_minhash.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: shingling minhashing 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | 8 | from __future__ import division 9 | from math import * 10 | 11 | # example of stop words 12 | STOP_WORDS = set(('i', 'you', 'they', 'the', 'no', 'none', 'all', 'a', 'for', 'not', 'nor')) 13 | # example of hash function 14 | HASH_FCT_EX = lambda val: (2 * val + 4) % 5 15 | 16 | 17 | def k_shingles(string, k=2, use_stop_words=False, verbose=False): 18 | """ 19 | 
Return the set of k-shingles of the current text
    """
    shingles = []
    string_len = len(string)
    for i in range(0, string_len - k + 1):
        shingles.append(string[i:i + k])
    k_sh = set(shingles)
    if use_stop_words:
        k_sh = k_sh - STOP_WORDS
    if verbose:
        print "All possible shingles 27^%s = %s" % (k, 27 ** k)
        print "%s-shingles for %s : %s" % (k, string, k_sh)
    return k_sh


def charateristic_matrix(list_sets, verbose=False):
    """
    Return the characteristic matrix for the current list of sets
    """
    nbr_columns = len(list_sets)
    # constructing the elements based on the union of sets
    elements = set()
    for i in list_sets:
        elements = elements | i
    elements = sorted(list(elements))
    nbr_rows = len(elements)
    char_matrix = []
    # initialising the characteristic matrix
    for i in range(0, nbr_rows):
        char_matrix.append([0] * nbr_columns)
    # constructing the characteristic matrix
    for e in range(0, nbr_rows):
        for s in range(0, nbr_columns):
            char_matrix[e][s] = 1 if elements[e] in list_sets[s] else 0

    if verbose: print char_matrix
    return char_matrix


def signature_vector(characteristic_matrix, hash_fct, verbose=False):
    """
    Compute the minhash signature for the current characteristic matrix, with the hash_fct hash function
    """
    nbr_columns = len(characteristic_matrix[0])
    nbr_rows = len(characteristic_matrix)
    signature = [-1] * nbr_columns
    for r in range(0, nbr_rows):
        hash_value = hash_fct(r)
        for c in range(0, nbr_columns):
            if characteristic_matrix[r][c] == 1:
                # the row r has 1 in the column c, so it is potentially subject to change
                if signature[c] > hash_value or signature[c] < 0:
                    signature[c] = hash_value
    if verbose: print signature
    return signature


def and_or_construction(p, r, b, and_first=True):
    """
    Probability that a pair with similarity p becomes a candidate, for families of
    functions defined from the minhash functions (the threshold sits near the point
    of maximum slope of this S-curve):
    if and_first = True : an r-way AND construction followed by a b-way OR construction,
        i.e. 1 - (1 - p**r)**b
    else : a b-way OR construction followed by an r-way AND construction,
        i.e. (1 - (1 - p)**b)**r
    """
    return 1 - (1 - p ** r) ** b if and_first else (1 - (1 - p) ** b) ** r


def and_or_s_curve(r, b, and_first=True):
    """
    For a given r and b : generates the s-curve (p from 0.2 to 1.0 in steps of 0.1)
    """
    p = 0.1
    for i in range(0, 9):
        p += 0.1
        print and_or_construction(p, r, b, and_first)


def bloom_filter(n, m, k):
    """
    A Bloom filter consists of:
    1. An array of n bits, initially all 0s.
    2. A collection of hash functions h1, h2, . . . , hk. Each hash function maps
    "key" values to n buckets, corresponding to the n bits of the bit-array.
    3. A set S of m key values.
    The purpose of the Bloom filter is to allow through all stream elements whose
    keys are in S, while rejecting most of the stream elements whose keys are not
    in S.
    To initialize the bit array, begin with all bits 0. Take each key value in S
    and hash it using each of the k hash functions. Set to 1 each bit that is hi(K)
    for some hash function hi and some key value K in S.
    To test a key K that arrives in the stream, check that all of
    h1(K), h2(K), . . . , hk(K)
    are 1s in the bit-array. If all are 1s, then let the stream element through.
If 113 | one or more of these bits are 0, then K could not be in S, so reject the stream 114 | element. 115 | """ 116 | # probability that a bit remains 0 117 | a = long(k * m) 118 | b = float(a) / float(n) 119 | prob_bit_0 = exp(- b) 120 | #the probability of a false positive is the probability of a 1 bit 121 | prob_false_positive = (1 - prob_bit_0) ** k 122 | print prob_false_positive 123 | -------------------------------------------------------------------------------- /similarities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmourafiq/data-analysis/1df2ca020a554f1fdab7cc9e53115e249cc199ac/similarities/__init__.py -------------------------------------------------------------------------------- /similarities/correlation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: Recommendations 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | # !/usr/bin/env python 7 | from __future__ import division 8 | from math import sqrt 9 | 10 | PEARSON_SIMILARITY_CACHE = {} 11 | 12 | 13 | def get_commun_items(x, y): 14 | """ 15 | Returns the commun items between x and y 16 | """ 17 | return [i for i in x.keys() if i in y.keys()] 18 | 19 | 20 | def pearson_correlation(x, y, commun_items): 21 | """ 22 | The population correlation coefficient corr(x,y) between x and y with expected 23 | values m(x) and m(y) and standard deviations std(x) and std(y) is defined as: 24 | corr(x,y) = cov(x, y) / (std(x) * std(y)) = E((x - m(x))(y - m(y))) / (std(x) * std(y)) 25 | 26 | Returns the pearson correlation for x and y for a given list of commun items 27 | """ 28 | # Find the number of elements 29 | n = len(commun_items) 30 | # if they are no ratings in common, return 0 31 | if n == 0: return 0 32 | # Add up all the preferences 33 | sumX = sum([x[i] for i in commun_items]) 34 | sumY = sum([y[i] for i in commun_items]) 35 | # Sum up the squares 36 | sumX2 = sum([pow(x[i], 2) for i in commun_items]) 37 | sumY2 = sum([pow(y[i], 2) for i in commun_items]) 38 | # Sum up the products 39 | prodSum = sum([x[i] * y[i] for i in commun_items]) 40 | # Calculate Pearson score 41 | num = prodSum - (sumX * sumY / n) 42 | den = sqrt((sumX2 - pow(sumX, 2) / n) * (sumY2 - pow(sumY, 2) / n)) 43 | if den == 0: return 0 44 | r = num / den 45 | return r 46 | 47 | 48 | def pearson_sim(items, x, y, cache=False): 49 | """ 50 | Returns the similarity between x and y based on the pearson correaltion 51 | """ 52 | if cache: 53 | if (x, y) in PEARSON_SIMILARITY_CACHE: 54 | return PEARSON_SIMILARITY_CACHE[(x, y)] 55 | i_x = items[x] 56 | i_y = items[y] 57 | sim = pearson_correlation(i_x, i_y, get_commun_items(i_x, i_y)) 58 | PEARSON_SIMILARITY_CACHE[(x, y)] = sim 59 | PEARSON_SIMILARITY_CACHE[(y, x)] = sim 60 | return sim 61 | i_x = items[x] 62 | i_y = items[y] 63 | return pearson_correlation(i_x, i_y, get_commun_items(i_x, i_y)) 64 | 65 | 66 | def mean(input_array): 67 | for i in range(0, (len(input_array) - 1)): 68 | input_array[i] = float(input_array[i]) 69 | total_sum = 0.00 70 | for value in input_array: 71 | total_sum = total_sum + value 72 | return float(total_sum / len(input_array)) 73 | 74 | 75 | def standard_deviation(input_array): 76 | mu = mean(input_array) 77 | variance_numerator = 0.00 78 | for val in input_array: 79 | variance_numerator = 
variance_numerator + (val - mu) ** 2 # Sigma((x-mu)^2) 80 | variance = variance_numerator / len(input_array) 81 | return sqrt(variance) 82 | 83 | 84 | def covariance(x_array, y_array): 85 | if len(x_array) != len(y_array): 86 | return False 87 | x_mu = mean(x_array) 88 | y_mu = mean(y_array) 89 | covariance_numerator = 0.00 90 | for i in range(len(x_array)): 91 | covariance_numerator = covariance_numerator + (x_array[i] - x_mu) * (y_array[i] - y_mu) 92 | return covariance_numerator / len(x_array) 93 | 94 | 95 | def correlation(x_array, y_array): 96 | if covariance(x_array, y_array): 97 | return covariance(x_array, y_array) / ((standard_deviation(x_array)) * (standard_deviation(y_array))) 98 | else: 99 | return False 100 | 101 | 102 | X = [1, 2, 2, 4] 103 | Y = [2, 4, 6, 8] 104 | print correlation(X, Y) 105 | 106 | -------------------------------------------------------------------------------- /similarities/euclidean.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: Recommendations 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | from math import sqrt 9 | 10 | EUCLIDEAN_SIMILARITY_CACHE = {} 11 | 12 | 13 | def get_commun_items(x, y): 14 | """ 15 | Returns the commun items between x and y 16 | """ 17 | return [i for i in x.keys() if i in y.keys()] 18 | 19 | 20 | def euclidean_dis(x, y, commun_items): 21 | """ 22 | Returns the euclidean distance between x and y for a given list of commun items 23 | """ 24 | return sqrt(sum([pow(x[i] - y[i], 2) for i in commun_items])) 25 | 26 | 27 | def euclidean_sim(items, x, y, cache=False): 28 | """ 29 | Returns the euclidean similarity between x and y. 
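    The similarity is 1 / (1 + d), where d is the euclidean distance over the
    items x and y have in common, so identical profiles score 1.0.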
30 | """ 31 | if cache: 32 | if (x, y) in EUCLIDEAN_SIMILARITY_CACHE: 33 | return EUCLIDEAN_SIMILARITY_CACHE[(x, y)] 34 | i_x = items[x] 35 | i_y = items[y] 36 | sim = 1 / (1 + euclidean_dis(i_x, i_y, get_commun_items(i_x, i_y))) 37 | EUCLIDEAN_SIMILARITY_CACHE[(x, y)] = sim 38 | EUCLIDEAN_SIMILARITY_CACHE[(y, x)] = sim 39 | return sim 40 | i_x = items[x] 41 | i_y = items[y] 42 | return 1 / (1 + euclidean_dis(i_x, i_y, get_commun_items(i_x, i_y))) 43 | -------------------------------------------------------------------------------- /similarities/jaccard_similarity.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------- 2 | # Name: jaccard similarity 3 | # 4 | # Author: mourad mourafiq 5 | # ------------------------------------------------------------------------------- 6 | 7 | from __future__ import division 8 | 9 | EX_TUP_1 = ('a', 'a', 'a', 'b') 10 | EX_TUP_1 = ('a', 'a', 'b', 'b', 'c') 11 | 12 | 13 | def jaccard_sim(tup_1, tup_2, verbose=False): 14 | """ 15 | calculate the jaccard similiarity of 2 tuples 16 | """ 17 | sum = len(tup_1) + len(tup_2) 18 | set_1 = set(tup_1) 19 | set_2 = set(tup_2) 20 | inter = 0 21 | for i in (set_1 & set_2): 22 | count_1 = tup_1.count(i) 23 | count_2 = tup_2.count(i) 24 | inter += count_1 if count_1 < count_2 else count_2 25 | j_sim = inter / sum 26 | if verbose: print j_sim 27 | return j_sim 28 | 29 | 30 | def jaccard_distance(tup_1, tup_2): 31 | """ 32 | Calculate the jaccard distance 33 | """ 34 | return 1 - jaccard_sim(tup_1, tup_2) 35 | 36 | 37 | def jaccard_conditional_comparaison(tup, list_tups, min_jaccard_sim, verbose=False): 38 | """ 39 | Suppose that "s" is a string of length "ls", and we are looking for 40 | strings with at least "sim" Jaccard similarity. 41 | To be sure that we do not have to compare "s" with "t", we must be certain that "sim" > ("ls" ? "p")/"ls". That 42 | is, "p" must be at least [(1 ? "sim")"ls"] + 1. Of course we want "p" to be as small as 43 | possible, so we do not index string s in more buckets than we need to. Thus, 44 | we shall hereafter take "p" = [(1 ? "sim")"ls"+ 1 to be the length of the prefix that 45 | gets indexed. 46 | P.S : "p" being the prefix of potential strings to be compared to "s" 47 | Case 1: p ? q. Here, the maximum size of the intersection is 48 | Ls ? i + 1 ? (p ? q) 49 | Since Ls = i + p, we can write the above expression for the intersection size as 50 | q + 1. The minimum size of the union is Ls + j ? 1, as it was when we did not 51 | take suffix length into account. Thus, we require 52 | (q + 1) /(Ls + j ? 1) ? J whenever p ? q. 53 | Case 2: p < q. Here, the maximum size of the intersection is Ls ? i + 1, as 54 | when suffix length was not considered. However, the minimum size of the union 55 | is now Ls + j ? 1 + q ? p. If we again use the relationship Ls = i + p, we can 56 | replace Ls ? p by i and get the formula i + j ? 1 + q for the size of the union. 57 | If the Jaccard similarity is at least J, then 58 | (Ls ? i + 1) / (i + j ? 1 + q) ? J 59 | whenever p < q. 
"""
    tup_length = len(tup)
    pre = int(((1 - min_jaccard_sim) * tup_length) + 1)
    max_length = int(tup_length / min_jaccard_sim)
    min_length = tup_length - pre
    potential_tups = []
    for t in list_tups:
        t_length = len(t)
        # first we check the current tup length
        if t_length >= min_length and t_length <= max_length:
            # second we loop over all possible values for i & j
            matched = False
            for i in range(0, pre):
                if matched:
                    break
                for j in range(0, pre):
                    p = tup_length - i
                    q = t_length - j
                    if (p >= q and ((q + 1) / (tup_length + j - 1)) >= min_jaccard_sim) or (
                            p < q and ((tup_length - i + 1) / (i + j - 1 + q)) >= min_jaccard_sim):
                        # t can reach the threshold for some prefix positions: keep it once
                        potential_tups.append(t)
                        matched = True
                        break
    if verbose: print potential_tups
    return potential_tups
--------------------------------------------------------------------------------
/similarities/tanimoto.py:
--------------------------------------------------------------------------------
'''
Created on Aug 22, 2012

@author: mourad mourafiq
'''

TANIMOTO_SIMILARITY_CACHE = {}


def tanimoto_sim(items, x, y, cache=False):
    """
    Returns the similarity between x and y based on the tanimoto score:
    the number of positions where both vectors are non-zero, divided by the
    number of positions where at least one of them is non-zero.
    """
    c1, c2, shr = 0, 0, 0

    for i in range(len(x)):
        if x[i] != 0: c1 += 1  # in v1
        if y[i] != 0: c2 += 1  # in v2
        if x[i] != 0 and y[i] != 0: shr += 1  # in both

    # the tanimoto score; 1.0 minus this value would be the tanimoto distance
    return float(shr) / (c1 + c2 - shr)
--------------------------------------------------------------------------------