├── README.md ├── week1 ├── _232418a57c83ad19339c24a22a56b9b1_Programming-Assignment-1.pdf ├── non_shared_substring │ └── non_shared_substring.cpp ├── suffix_tree │ ├── suffix_tree.cpp │ └── trie.cpp ├── trie │ └── trie.cpp ├── trie_matching │ └── trie_matching.cpp └── trie_matching_extended │ └── trie_matching_extended.cpp ├── week2 ├── _332adc34d317de0b0cc790581a52df01_Programming-Assignment-2.pdf ├── bwmatching │ └── bwmatching.cpp ├── bwt │ └── bwt.cpp ├── bwtinverse │ └── bwtinverse.cpp └── suffix_array │ └── suffix_array.cpp └── week3-4 ├── _120e875d62d8c0bc18a11e21fd818465_Programming-Assignment-3.pdf ├── kmp └── kmp.cpp ├── suffix_array_long └── suffix_array_long.cpp ├── suffix_array_matching └── suffix_array_matching.cpp └── suffix_tree_from_array └── suffix_tree_from_array.cpp /README.md: -------------------------------------------------------------------------------- 1 | # algorithms-on-strings 2 | Contains my code submission for Algorithms on Strings course as offered by University of California, San Diego & National Research University Higher School of Economics on Coursera.
3 | https://www.coursera.org/learn/algorithms-on-strings

4 | Note: The purpose of this repository is to preserve my submissions for future reference and is not intended to help other students in any form of plagiarism. That being said, I'll be more than happy to help with and discuss any problem related to the course. 5 | Feel free to message me on Gitter(https://gitter.im/nishchayp). 6 | -------------------------------------------------------------------------------- /week1/_232418a57c83ad19339c24a22a56b9b1_Programming-Assignment-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nishchayp/algorithms-on-strings/3f8a3924baf277210b55612bf289f74ecbe8ad15/week1/_232418a57c83ad19339c24a22a56b9b1_Programming-Assignment-1.pdf -------------------------------------------------------------------------------- /week1/non_shared_substring/non_shared_substring.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace std; 5 | 6 | string solve (string p, string q) 7 | { 8 | string result = p; 9 | 10 | return result; 11 | } 12 | 13 | int main (void) 14 | { 15 | string p; 16 | cin >> p; 17 | string q; 18 | cin >> q; 19 | 20 | string ans = solve (p, q); 21 | 22 | cout << ans << endl; 23 | 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /week1/suffix_tree/suffix_tree.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using std::cin; 8 | using std::cout; 9 | using std::endl; 10 | using std::map; 11 | using std::string; 12 | using std::vector; 13 | using std::pair; 14 | using std::make_pair; 15 | 16 | struct Node { 17 | pair range; 18 | vector next; 19 | }; 20 | 21 | // Build a suffix tree of the string text and return a vector 22 | // with all of the labels of its edges (the corresponding 23 | // substrings of the text) in any 
order. 24 | vector ComputeSuffixTreeEdges(const string& text) { 25 | vector result; 26 | // Implement this function yourself 27 | Node trie; 28 | for (int i = text.size() - 1; i >= 0; i--) { 29 | bool found = false; 30 | Node curr = trie; 31 | int j = i; 32 | bool cont_to_next_level = true; 33 | while (cont_to_next_level) { 34 | if (curr.next.empty()) { 35 | break; 36 | } 37 | for (int k = 0; k < curr.next.size(); k++) { 38 | if (text[j] == text[curr.next[k].range.first]) { 39 | found = true; 40 | int l = curr.next[k].range.first; 41 | int r = curr.next[k].range.second; 42 | while(text[j] == text[l]) { 43 | if (l == r) { 44 | cont_to_next_level = true; 45 | curr = curr.next[k]; 46 | j++; 47 | break; 48 | } 49 | j++; 50 | l++; 51 | cont_to_next_level = false; 52 | } 53 | if (!cont_to_next_level) { 54 | curr = curr.next[k]; 55 | curr.range.second = l - 1; 56 | Node new_node; 57 | new_node.range = make_pair(j, text.size() - 1); 58 | curr.next.push_back(new_node); 59 | } 60 | break; 61 | } 62 | } 63 | } 64 | if (!found) { 65 | Node new_node; 66 | new_node.range = make_pair(i, text.size() - 1); 67 | trie.next.push_back(new_node); 68 | } 69 | } 70 | return result; 71 | } 72 | 73 | int main() { 74 | string text; 75 | cin >> text; 76 | vector edges = ComputeSuffixTreeEdges(text); 77 | for (int i = 0; i < edges.size(); ++i) { 78 | cout << edges[i] << endl; 79 | } 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /week1/suffix_tree/trie.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using std::cin; 8 | using std::cout; 9 | using std::endl; 10 | using std::map; 11 | using std::string; 12 | using std::vector; 13 | using std::pair; 14 | using std::make_pair; 15 | 16 | struct Node { 17 | pair range; 18 | vector next; 19 | }; 20 | 21 | void insert_suffix_to_trie(const string& text, Node* trie, int l_idx) { 22 | 
Node* curr_node = trie; 23 | for (int k = l_idx; k < text.size(); k++) { 24 | bool match = false; 25 | for (int i = 0; i < curr_node->next.size(); i++) { 26 | if (text[k] == text[curr_node->next[i].range.first]) { 27 | curr_node = &(curr_node->next[i]); 28 | match = true; 29 | break; 30 | } 31 | } 32 | if (!match) { 33 | Node new_node; 34 | new_node.range.first = k; 35 | new_node.range.second = k; 36 | curr_node->next.push_back(new_node); 37 | curr_node = &(curr_node->next.back()); 38 | } 39 | } 40 | } 41 | 42 | void build_trie(const string& text, Node* trie) { 43 | for (int i = text.size() - 1; i >= 0; i--) { 44 | insert_suffix_to_trie(text, trie, i); 45 | } 46 | } 47 | 48 | void compress_trie(const string& text, Node* trie) { 49 | for (int i = 0; i < trie->next.size(); i++) { 50 | compress_trie(text, &(trie->next[i])); 51 | } 52 | if (trie->next.size() == 1) { 53 | Node desc = trie->next[0]; 54 | trie->range.second = desc.range.second; 55 | trie->next.clear(); 56 | for (int i = 0; i < desc.next.size(); i++) 57 | trie->next.push_back(desc.next[i]); 58 | } 59 | } 60 | 61 | void traverse_trie(const string& text, Node* trie, vector &result) { 62 | for (int i = 0; i < trie->next.size(); i++) { 63 | traverse_trie(text, &(trie->next[i]), result); 64 | } 65 | int l = trie->range.first; 66 | int r = trie->range.second; 67 | if (l != -1) { 68 | result.push_back(text.substr(l, r - l + 1)); 69 | } 70 | } 71 | 72 | 73 | // Build a suffix tree of the string text and return a vector 74 | // with all of the labels of its edges (the corresponding 75 | // substrings of the text) in any order. 
76 | vector ComputeSuffixTreeEdges(const string& text) { 77 | vector result; 78 | // Implement this function yourself 79 | Node trie; 80 | trie.range = make_pair(-1, -1); 81 | build_trie(text, &trie); 82 | compress_trie(text, &trie); 83 | traverse_trie(text, &trie, result); 84 | return result; 85 | } 86 | 87 | int main() { 88 | string text; 89 | cin >> text; 90 | vector edges = ComputeSuffixTreeEdges(text); 91 | for (int i = 0; i < edges.size(); ++i) { 92 | cout << edges[i] << endl; 93 | } 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /week1/trie/trie.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using std::map; 8 | using std::vector; 9 | using std::string; 10 | using std::pair; 11 | 12 | typedef map edges; 13 | typedef vector trie; 14 | 15 | trie build_trie(vector & patterns) { 16 | trie t; 17 | // write your code here 18 | for (int i = 0; i < patterns.size(); i++) { 19 | int x = 0; 20 | for (int j = 0; j < patterns[i].size(); j++) { 21 | bool match_found = false; 22 | if (x < t.size()) { 23 | for (const auto & k : t[x]) { 24 | if (k.first == patterns[i][j]) { 25 | x = k.second; 26 | match_found = true; 27 | break; 28 | } 29 | } 30 | if (!match_found) { 31 | t[x].insert(pair (patterns[i][j], t.size())); 32 | x = t.size(); 33 | } 34 | } else { 35 | map m; 36 | m.insert(pair (patterns[i][j], t.size() + 1)); 37 | t.push_back(m); 38 | x = t.size(); 39 | } 40 | } 41 | map m; 42 | m.insert(pair ('$', -1)); 43 | t.push_back(m); 44 | } 45 | return t; 46 | } 47 | 48 | int main() { 49 | size_t n; 50 | std::cin >> n; 51 | vector patterns; 52 | for (size_t i = 0; i < n; i++) { 53 | string s; 54 | std::cin >> s; 55 | patterns.push_back(s); 56 | } 57 | 58 | trie t = build_trie(patterns); 59 | for (size_t i = 0; i < t.size(); ++i) { 60 | for (const auto & j : t[i]) { 61 | if (j.first != '$') { 62 | 
std::cout << i << "->" << j.second << ":" << j.first << "\n"; 63 | } 64 | } 65 | } 66 | 67 | return 0; 68 | } -------------------------------------------------------------------------------- /week1/trie_matching/trie_matching.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | int const Letters = 4; 11 | int const NA = -1; 12 | 13 | struct Node 14 | { 15 | int next [Letters]; 16 | 17 | Node () 18 | { 19 | fill (next, next + Letters, NA); 20 | } 21 | 22 | bool isLeaf () const 23 | { 24 | return (next[0] == NA && next[1] == NA && next[2] == NA && next[3] == NA); 25 | } 26 | }; 27 | 28 | int letterToIndex (char letter) 29 | { 30 | switch (letter) 31 | { 32 | case 'A': return 0; break; 33 | case 'C': return 1; break; 34 | case 'G': return 2; break; 35 | case 'T': return 3; break; 36 | default: assert (false); return -1; 37 | } 38 | } 39 | 40 | void build_trie (const vector & patterns, vector &t) 41 | { 42 | for (int i = 0; i < patterns.size(); i++) 43 | { 44 | int x = 0; 45 | for (int j = 0; j < patterns[i].size(); j++) 46 | { 47 | int index = letterToIndex(patterns[i][j]); 48 | if (x >= t.size()) 49 | { 50 | t.resize(x + 1); 51 | } 52 | if (t[x].next[index] != -1) 53 | { 54 | x = t[x].next[index]; 55 | } 56 | else 57 | { 58 | t[x].next[index] = t.size(); 59 | x = t[x].next[index]; 60 | t.resize(x + 1); 61 | } 62 | } 63 | } 64 | 65 | } 66 | 67 | vector solve (const string& text, int n, const vector & patterns) 68 | { 69 | vector result; 70 | 71 | // write your code here 72 | vector t; 73 | build_trie(patterns, t); 74 | 75 | for (int i = 0; i < text.size(); i++) 76 | { 77 | int index = letterToIndex(text[i]); 78 | int x = 0; 79 | if (t[x].next[index] != -1) 80 | { 81 | bool found = true; 82 | for (int j = i; !t[x].isLeaf() ; j++) 83 | { 84 | if (j >= text.size()) 85 | { 86 | found = false; 87 | break; 88 | } 89 | index = 
letterToIndex(text[j]); 90 | if (t[x].next[index] != -1) 91 | { 92 | x = t[x].next[index]; 93 | } 94 | else 95 | { 96 | found = false; 97 | break; 98 | } 99 | } 100 | if (found) 101 | { 102 | result.push_back(i); 103 | } 104 | } 105 | } 106 | 107 | return result; 108 | } 109 | 110 | int main (void) 111 | { 112 | string t; 113 | cin >> t; 114 | 115 | int n; 116 | cin >> n; 117 | 118 | vector patterns (n); 119 | for (int i = 0; i < n; i++) 120 | { 121 | cin >> patterns[i]; 122 | } 123 | 124 | vector ans; 125 | ans = solve (t, n, patterns); 126 | 127 | for (int i = 0; i < (int) ans.size (); i++) 128 | { 129 | cout << ans[i]; 130 | if (i + 1 < (int) ans.size ()) 131 | { 132 | cout << " "; 133 | } 134 | else 135 | { 136 | cout << endl; 137 | } 138 | } 139 | 140 | return 0; 141 | } 142 | -------------------------------------------------------------------------------- /week1/trie_matching_extended/trie_matching_extended.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | int const Letters = 4; 11 | int const NA = -1; 12 | 13 | struct Node 14 | { 15 | int next [Letters]; 16 | bool patternEnd; 17 | 18 | Node () 19 | { 20 | fill (next, next + Letters, NA); 21 | patternEnd = false; 22 | } 23 | }; 24 | 25 | int letterToIndex (char letter) 26 | { 27 | switch (letter) 28 | { 29 | case 'A': return 0; break; 30 | case 'C': return 1; break; 31 | case 'G': return 2; break; 32 | case 'T': return 3; break; 33 | default: assert (false); return -1; 34 | } 35 | } 36 | 37 | void build_trie (const vector & patterns, vector &t) 38 | { 39 | for (int i = 0; i < patterns.size(); i++) 40 | { 41 | int x = 0; 42 | for (int j = 0; j < patterns[i].size(); j++) 43 | { 44 | int index = letterToIndex(patterns[i][j]); 45 | if (x >= t.size()) 46 | { 47 | t.resize(x + 1); 48 | } 49 | if (t[x].next[index] != -1) 50 | { 51 | x = t[x].next[index]; 52 | } 53 | 
else 54 | { 55 | t[x].next[index] = t.size(); 56 | x = t[x].next[index]; 57 | t.resize(x + 1); 58 | } 59 | } 60 | t[x].patternEnd = true; 61 | } 62 | 63 | } 64 | 65 | vector solve (const string& text, int n, const vector & patterns) 66 | { 67 | vector result; 68 | 69 | // write your code here 70 | vector t; 71 | build_trie(patterns, t); 72 | 73 | for (int i = 0; i < text.size(); i++) 74 | { 75 | int index = letterToIndex(text[i]); 76 | int x = 0; 77 | if (t[x].next[index] != -1) 78 | { 79 | bool found = true; 80 | for (int j = i; !t[x].patternEnd ; j++) 81 | { 82 | if (j >= text.size()) 83 | { 84 | found = false; 85 | break; 86 | } 87 | index = letterToIndex(text[j]); 88 | if (t[x].next[index] != -1) 89 | { 90 | x = t[x].next[index]; 91 | } 92 | else 93 | { 94 | found = false; 95 | break; 96 | } 97 | } 98 | if (found) 99 | { 100 | result.push_back(i); 101 | } 102 | } 103 | } 104 | 105 | return result; 106 | } 107 | 108 | int main (void) 109 | { 110 | string t; 111 | cin >> t; 112 | 113 | int n; 114 | cin >> n; 115 | 116 | vector patterns (n); 117 | for (int i = 0; i < n; i++) 118 | { 119 | cin >> patterns[i]; 120 | } 121 | 122 | vector ans; 123 | ans = solve (t, n, patterns); 124 | 125 | for (int i = 0; i < (int) ans.size (); i++) 126 | { 127 | cout << ans[i]; 128 | if (i + 1 < (int) ans.size ()) 129 | { 130 | cout << " "; 131 | } 132 | else 133 | { 134 | cout << endl; 135 | } 136 | } 137 | 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /week2/_332adc34d317de0b0cc790581a52df01_Programming-Assignment-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nishchayp/algorithms-on-strings/3f8a3924baf277210b55612bf289f74ecbe8ad15/week2/_332adc34d317de0b0cc790581a52df01_Programming-Assignment-2.pdf -------------------------------------------------------------------------------- /week2/bwmatching/bwmatching.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using std::cin; 10 | using std::istringstream; 11 | using std::map; 12 | using std::string; 13 | using std::vector; 14 | using std::cout; 15 | using std::endl; 16 | 17 | // Preprocess the Burrows-Wheeler Transform bwt of some text 18 | // and compute as a result: 19 | // * starts - for each character C in bwt, starts[C] is the first position 20 | // of this character in the sorted array of 21 | // all characters of the text. 22 | // * occ_count_before - for each character C in bwt and each position P in bwt, 23 | // occ_count_before[C][P] is the number of occurrences of character C in bwt 24 | // from position 0 to position P inclusive. 25 | 26 | // Note: Make occ_count_before[C][P] exclusive P. 27 | // It will make the code cleaner and easier to implement. 28 | 29 | void PreprocessBWT(const string& bwt, 30 | map& starts, 31 | map >& occ_count_before) { 32 | // Implement this function yourself 33 | for (int i = 0; i < bwt.size(); i++) { 34 | starts[bwt[i]]++; 35 | occ_count_before[bwt[i]][i + 1]++; 36 | for (auto& it : occ_count_before) { 37 | if (i < bwt.size()) { 38 | it.second[i + 1] += it.second[i]; 39 | } 40 | } 41 | } 42 | 43 | int cf = 0; 44 | for (auto& it : starts) { 45 | if (it.second == 0) { 46 | it.second = -1; 47 | } else { 48 | int temp = it.second; 49 | it.second = cf; 50 | cf += temp; 51 | } 52 | } 53 | } 54 | 55 | // Compute the number of occurrences of string pattern in the text 56 | // given only Burrows-Wheeler Transform bwt of the text and additional 57 | // information we get from the preprocessing stage - starts and occ_count_before. 
58 | int CountOccurrences(const string& pattern, 59 | const string& bwt, 60 | const map& starts, 61 | const map >& occ_count_before) { 62 | // Implement this function yourself 63 | string pattern_cp = pattern; 64 | int top = 0; 65 | int bottom = bwt.size() - 1; 66 | while (top <= bottom) { 67 | if (!pattern_cp.empty()) { 68 | char symbol = pattern_cp.back(); 69 | pattern_cp.pop_back(); 70 | if (occ_count_before.find(symbol)->second[bottom + 1] > occ_count_before.find(symbol)->second[top]) { 71 | top = starts.find(symbol)->second + occ_count_before.find(symbol)->second[top]; 72 | bottom = starts.find(symbol)->second + occ_count_before.find(symbol)->second[bottom + 1] - 1; 73 | } else { 74 | return 0; 75 | } 76 | } else { 77 | return bottom - top + 1; 78 | } 79 | } 80 | 81 | return 0; 82 | } 83 | 84 | 85 | int main() { 86 | string bwt; 87 | cin >> bwt; 88 | int pattern_count; 89 | cin >> pattern_count; 90 | // Start of each character in the sorted list of characters of bwt, 91 | // see the description in the comment about function PreprocessBWT 92 | map starts = {{'$', 0}, {'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}}; 93 | // Occurrence counts for each character and each position in bwt, 94 | // see the description in the comment about function PreprocessBWT 95 | map > occ_count_before = {{'$', vector(bwt.size() + 1)}, {'A', vector(bwt.size() + 1)}, {'C', vector(bwt.size() + 1)}, {'G', vector(bwt.size() + 1)}, {'T', vector(bwt.size() + 1)}};; 96 | // Preprocess the BWT once to get starts and occ_count_before. 97 | // For each pattern, we will then use these precomputed values and 98 | // spend only O(|pattern|) to find all occurrences of the pattern 99 | // in the text instead of O(|pattern| + |text|). 
100 | PreprocessBWT(bwt, starts, occ_count_before); 101 | 102 | for (int pi = 0; pi < pattern_count; ++pi) { 103 | string pattern; 104 | cin >> pattern; 105 | int occ_count = CountOccurrences(pattern, bwt, starts, occ_count_before); 106 | printf("%d ", occ_count); 107 | } 108 | printf("\n"); 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /week2/bwt/bwt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using std::cin; 7 | using std::cout; 8 | using std::endl; 9 | using std::string; 10 | using std::vector; 11 | 12 | string BWT(const string& text) { 13 | string result = ""; 14 | 15 | // write your code here 16 | vector m(text.size()); 17 | for (int i = 0; i < m.size(); i++) { 18 | m[i] = text.substr(i) + text.substr(0, i); 19 | } 20 | 21 | sort(m.begin(), m.end()); 22 | 23 | for (int i = 0; i < m.size(); i++) { 24 | result += m[i][m.size() - 1]; 25 | } 26 | 27 | return result; 28 | } 29 | 30 | int main() { 31 | string text; 32 | cin >> text; 33 | cout << BWT(text) << endl; 34 | return 0; 35 | } -------------------------------------------------------------------------------- /week2/bwtinverse/bwtinverse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using std::cin; 8 | using std::cout; 9 | using std::endl; 10 | using std::string; 11 | using std::vector; 12 | using std::pair; 13 | using std::make_pair; 14 | 15 | string InverseBWT(const string& bwt) { 16 | string text = ""; 17 | 18 | // write your code here 19 | // string col1 = bwt; 20 | vector > col1(bwt.size()); 21 | for (int i = 0; i < bwt.size(); i++) { 22 | col1[i] = make_pair(bwt[i], i); 23 | } 24 | sort(col1.begin(), col1.end()); 25 | 26 | pair symbol = col1[0]; 27 | for (int i = 0; i < bwt.size(); i++) { 28 | symbol = col1[symbol.second]; 29 | 
text += symbol.first; 30 | } 31 | 32 | return text; 33 | } 34 | 35 | int main() { 36 | string bwt; 37 | cin >> bwt; 38 | cout << InverseBWT(bwt) << endl; 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /week2/suffix_array/suffix_array.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using std::cin; 9 | using std::cout; 10 | using std::endl; 11 | using std::make_pair; 12 | using std::pair; 13 | using std::string; 14 | using std::vector; 15 | 16 | // Build suffix array of the string text and 17 | // return a vector result of the same length as the text 18 | // such that the value result[i] is the index (0-based) 19 | // in text where the i-th lexicographically smallest 20 | // suffix of text starts. 21 | vector BuildSuffixArray(const string& text) { 22 | vector result(text.size()); 23 | // Implement this function yourself 24 | vector > suffix(text.size()); 25 | for (int i = 0; i < text.size(); i++) { 26 | suffix[i] = make_pair(text.substr(i), i); 27 | } 28 | sort(suffix.begin(), suffix.end()); 29 | for (int i = 0; i < suffix.size(); i++) { 30 | result[i] = suffix[i].second; 31 | } 32 | return result; 33 | } 34 | 35 | int main() { 36 | string text; 37 | cin >> text; 38 | vector suffix_array = BuildSuffixArray(text); 39 | for (int i = 0; i < suffix_array.size(); ++i) { 40 | cout << suffix_array[i] << ' '; 41 | } 42 | cout << endl; 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /week3-4/_120e875d62d8c0bc18a11e21fd818465_Programming-Assignment-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nishchayp/algorithms-on-strings/3f8a3924baf277210b55612bf289f74ecbe8ad15/week3-4/_120e875d62d8c0bc18a11e21fd818465_Programming-Assignment-3.pdf 
-------------------------------------------------------------------------------- /week3-4/kmp/kmp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using std::cin; 7 | using std::string; 8 | using std::vector; 9 | 10 | void compute_prefix(string& p_t, vector& s) { 11 | int border = 0; 12 | for (int i = 1; i < p_t.size(); i++) { 13 | while (border > 0 && p_t[i] != p_t[border]) { 14 | border = s[border - 1]; 15 | } 16 | if (p_t[i] == p_t[border]) { 17 | border++; 18 | s[i] = border; 19 | } 20 | if (border == 0) { 21 | s[i] = 0; 22 | } 23 | } 24 | } 25 | 26 | // Find all occurrences of the pattern in the text and return a 27 | // vector with all positions in the text (starting from 0) where 28 | // the pattern starts in the text. 29 | vector find_pattern(const string& pattern, const string& text) { 30 | vector result; 31 | // Implement this function yourself 32 | string p_t = pattern + '$' + text; 33 | vector s(p_t.size()); 34 | compute_prefix(p_t, s); 35 | for (int i = pattern.size() + 1; i < p_t.size(); i++) { 36 | if (s[i] == pattern.size()) { 37 | result.push_back(i - 2 * pattern.size()); 38 | } 39 | } 40 | return result; 41 | } 42 | 43 | int main() { 44 | string pattern, text; 45 | cin >> pattern; 46 | cin >> text; 47 | vector result = find_pattern(pattern, text); 48 | for (int i = 0; i < result.size(); ++i) { 49 | printf("%d ", result[i]); 50 | } 51 | printf("\n"); 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /week3-4/suffix_array_long/suffix_array_long.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using std::cin; 8 | using std::cout; 9 | using std::endl; 10 | using std::make_pair; 11 | using std::pair; 12 | using std::string; 13 | using std::vector; 14 | 15 | void print_int_v(vector& v) { 16 | 
for (int i = 0; i < v.size(); i++) { 17 | cout << v[i] << " "; 18 | } 19 | cout << endl; 20 | } 21 | 22 | int char_to_idx(char x) { 23 | switch (x) { 24 | case '$': 25 | return 0; 26 | case 'A': 27 | return 1; 28 | case 'C': 29 | return 2; 30 | case 'G': 31 | return 3; 32 | case 'T': 33 | return 4; 34 | } 35 | } 36 | 37 | void counting_sort_char(const string& text, vector& order) { 38 | vector c_f(5, 0); 39 | for (int i = 0; i < text.size(); i++) { 40 | c_f[char_to_idx(text[i])]++; 41 | } 42 | for (int i = 1; i < c_f.size(); i++) { 43 | c_f[i] += c_f[i - 1]; 44 | } 45 | for (int i = text.size() - 1; i >= 0; i--) { 46 | order[--c_f[char_to_idx(text[i])]] = i; 47 | } 48 | } 49 | 50 | void compute_class(const string& text, vector& order, vector& sort_class) { 51 | int count = 0; 52 | sort_class[order[0]] = count; 53 | for (int i = 1; i < order.size(); i++) { 54 | if (text[order[i]] == text[order[i - 1]]) 55 | sort_class[order[i]] = count; 56 | else 57 | sort_class[order[i]] = ++count; 58 | } 59 | } 60 | 61 | vector sort_double(const string& text, int l, vector& order, vector& sort_class) { 62 | vector new_order(text.size()); 63 | vector c_f(text.size(), 0); 64 | 65 | for (int i = 0; i < order.size(); i++) { 66 | c_f[sort_class[i]]++; 67 | } 68 | for (int i = 1; i < c_f.size(); i++) { 69 | c_f[i] += c_f[i - 1]; 70 | } 71 | for (int i = order.size() - 1; i >= 0; i--) { 72 | int start = (order[i] - l + order.size()) % order.size(); 73 | new_order[--c_f[sort_class[start]]] = start; 74 | } 75 | order.clear(); 76 | return new_order; 77 | } 78 | 79 | vector update_class(vector& order, vector& sort_class, int l) { 80 | vector new_sort_class(order.size()); 81 | int count = 0; 82 | new_sort_class[order[0]] = count; 83 | for (int i = 1; i < order.size(); i++) { 84 | int cur = order[i]; 85 | int cur_mid = (order[i] + l) % order.size(); 86 | int prev = order[i - 1]; 87 | int prev_mid = (order[i - 1] + l) % order.size(); 88 | if (sort_class[cur]!= sort_class[prev] || 
sort_class[cur_mid] != sort_class[prev_mid]) { 89 | new_sort_class[order[i]] = ++count; 90 | } else { 91 | new_sort_class[order[i]] = count; 92 | } 93 | } 94 | sort_class.clear(); 95 | return new_sort_class; 96 | } 97 | 98 | // Build suffix array of the string text and 99 | // return a vector result of the same length as the text 100 | // such that the value result[i] is the index (0-based) 101 | // in text where the i-th lexicographically smallest 102 | // suffix of text starts. 103 | vector BuildSuffixArray(const string& text) { 104 | vector result; 105 | // Implement this function yourself 106 | vector order(text.size()); 107 | counting_sort_char(text, order); 108 | 109 | vector sort_class(text.size()); 110 | compute_class(text, order, sort_class); 111 | 112 | for (int l = 1; l <= text.size(); l *= 2) { 113 | order = sort_double(text, l, order, sort_class); 114 | sort_class = update_class(order, sort_class, l); 115 | } 116 | 117 | return order; 118 | } 119 | 120 | int main() { 121 | string text; 122 | cin >> text; 123 | vector suffix_array = BuildSuffixArray(text); 124 | for (int i = 0; i < suffix_array.size(); ++i) { 125 | cout << suffix_array[i] << ' '; 126 | } 127 | cout << endl; 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /week3-4/suffix_array_matching/suffix_array_matching.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | int char_to_idx(char x) { 11 | switch (x) { 12 | case '$': 13 | return 0; 14 | case 'A': 15 | return 1; 16 | case 'C': 17 | return 2; 18 | case 'G': 19 | return 3; 20 | case 'T': 21 | return 4; 22 | } 23 | } 24 | 25 | void counting_sort_char(const string& text, vector& order) { 26 | vector c_f(5, 0); 27 | for (int i = 0; i < text.size(); i++) { 28 | c_f[char_to_idx(text[i])]++; 29 | } 30 | for (int i = 1; i < c_f.size(); 
i++) { 31 | c_f[i] += c_f[i - 1]; 32 | } 33 | for (int i = text.size() - 1; i >= 0; i--) { 34 | order[--c_f[char_to_idx(text[i])]] = i; 35 | } 36 | } 37 | 38 | void compute_class(const string& text, vector& order, vector& sort_class) { 39 | int count = 0; 40 | sort_class[order[0]] = count; 41 | for (int i = 1; i < order.size(); i++) { 42 | if (text[order[i]] == text[order[i - 1]]) 43 | sort_class[order[i]] = count; 44 | else 45 | sort_class[order[i]] = ++count; 46 | } 47 | } 48 | 49 | vector sort_double(const string& text, int l, vector& order, vector& sort_class) { 50 | vector new_order(text.size()); 51 | vector c_f(text.size(), 0); 52 | 53 | for (int i = 0; i < order.size(); i++) { 54 | c_f[sort_class[i]]++; 55 | } 56 | for (int i = 1; i < c_f.size(); i++) { 57 | c_f[i] += c_f[i - 1]; 58 | } 59 | for (int i = order.size() - 1; i >= 0; i--) { 60 | int start = (order[i] - l + order.size()) % order.size(); 61 | new_order[--c_f[sort_class[start]]] = start; 62 | } 63 | order.clear(); 64 | return new_order; 65 | } 66 | 67 | vector update_class(vector& order, vector& sort_class, int l) { 68 | vector new_sort_class(order.size()); 69 | int count = 0; 70 | new_sort_class[order[0]] = count; 71 | for (int i = 1; i < order.size(); i++) { 72 | int cur = order[i]; 73 | int cur_mid = (order[i] + l) % order.size(); 74 | int prev = order[i - 1]; 75 | int prev_mid = (order[i - 1] + l) % order.size(); 76 | if (sort_class[cur]!= sort_class[prev] || sort_class[cur_mid] != sort_class[prev_mid]) { 77 | new_sort_class[order[i]] = ++count; 78 | } else { 79 | new_sort_class[order[i]] = count; 80 | } 81 | } 82 | sort_class.clear(); 83 | return new_sort_class; 84 | } 85 | 86 | // Build suffix array of the string text and 87 | // return a vector result of the same length as the text 88 | // such that the value result[i] is the index (0-based) 89 | // in text where the i-th lexicographically smallest 90 | // suffix of text starts. 
91 | vector BuildSuffixArray(const string& text) { 92 | vector result; 93 | // Implement this function yourself 94 | vector order(text.size()); 95 | counting_sort_char(text, order); 96 | 97 | vector sort_class(text.size()); 98 | compute_class(text, order, sort_class); 99 | 100 | for (int l = 1; l <= text.size(); l *= 2) { 101 | order = sort_double(text, l, order, sort_class); 102 | sort_class = update_class(order, sort_class, l); 103 | } 104 | 105 | return order; 106 | } 107 | 108 | string BWT_fast(const string& text, vector& suffix_array) { 109 | string result = ""; 110 | for (int i = 0; i < suffix_array.size(); i++) { 111 | result += text[(suffix_array[i] - 1 + text.size()) % text.size()]; 112 | } 113 | return result; 114 | } 115 | 116 | string BWT(const string& text) { 117 | string result = ""; 118 | 119 | // write your code here 120 | vector m(text.size()); 121 | for (int i = 0; i < m.size(); i++) { 122 | m[i] = text.substr(i) + text.substr(0, i); 123 | } 124 | 125 | sort(m.begin(), m.end()); 126 | 127 | for (int i = 0; i < m.size(); i++) { 128 | result += m[i][m.size() - 1]; 129 | } 130 | 131 | return result; 132 | } 133 | 134 | // Preprocess the Burrows-Wheeler Transform bwt of some text 135 | // and compute as a result: 136 | // * starts - for each character C in bwt, starts[C] is the first position 137 | // of this character in the sorted array of 138 | // all characters of the text. 139 | // * occ_count_before - for each character C in bwt and each position P in bwt, 140 | // occ_count_before[C][P] is the number of occurrences of character C in bwt 141 | // from position 0 to position P inclusive. 142 | 143 | // Note: Make occ_count_before[C][P] exclusive P. 144 | // It will make the code cleaner and easier to implement. 
145 | 146 | void PreprocessBWT(const string& bwt, 147 | map& starts, 148 | map >& occ_count_before) { 149 | // Implement this function yourself 150 | for (int i = 0; i < bwt.size(); i++) { 151 | starts[bwt[i]]++; 152 | occ_count_before[bwt[i]][i + 1]++; 153 | for (auto& it : occ_count_before) { 154 | if (i < bwt.size()) { 155 | it.second[i + 1] += it.second[i]; 156 | } 157 | } 158 | } 159 | 160 | int cf = 0; 161 | for (auto& it : starts) { 162 | if (it.second == 0) { 163 | it.second = -1; 164 | } else { 165 | int temp = it.second; 166 | it.second = cf; 167 | cf += temp; 168 | } 169 | } 170 | } 171 | 172 | 173 | vector FindOccurrences(const string& pattern, 174 | const string& text, 175 | const vector& suffix_array, 176 | const string& bwt, 177 | const map& starts, 178 | const map >& occ_count_before) { 179 | vector result; 180 | 181 | string pattern_cp = pattern; 182 | int top = 0; 183 | int bottom = bwt.size() - 1; 184 | while (top <= bottom) { 185 | if (!pattern_cp.empty()) { 186 | char symbol = pattern_cp.back(); 187 | pattern_cp.pop_back(); 188 | if (occ_count_before.find(symbol)->second[bottom + 1] > occ_count_before.find(symbol)->second[top]) { 189 | top = starts.find(symbol)->second + occ_count_before.find(symbol)->second[top]; 190 | bottom = starts.find(symbol)->second + occ_count_before.find(symbol)->second[bottom + 1] - 1; 191 | } else { 192 | return result; 193 | } 194 | } else { 195 | for (int i = top; i <= bottom; i++) { 196 | result.push_back(suffix_array[i]); 197 | } 198 | return result; 199 | } 200 | } 201 | 202 | return result; 203 | } 204 | 205 | int main() { 206 | char buffer[100001]; 207 | scanf("%s", buffer); 208 | string text = buffer; 209 | text += '$'; 210 | vector suffix_array = BuildSuffixArray(text); 211 | 212 | string bwt = BWT_fast(text, suffix_array); 213 | 214 | // Start of each character in the sorted list of characters of bwt, 215 | // see the description in the comment about function PreprocessBWT 216 | map starts = {{'$', 0}, 
{'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}}; 217 | // Occurrence counts for each character and each position in bwt, 218 | // see the description in the comment about function PreprocessBWT 219 | map > occ_count_before = {{'$', vector(bwt.size() + 1)}, {'A', vector(bwt.size() + 1)}, {'C', vector(bwt.size() + 1)}, {'G', vector(bwt.size() + 1)}, {'T', vector(bwt.size() + 1)}};; 220 | // Preprocess the BWT once to get starts and occ_count_before. 221 | // For each pattern, we will then use these precomputed values and 222 | // spend only O(|pattern|) to find all occurrences of the pattern 223 | // in the text instead of O(|pattern| + |text|). 224 | PreprocessBWT(bwt, starts, occ_count_before); 225 | 226 | int pattern_count; 227 | scanf("%d", &pattern_count); 228 | vector occurs(text.length(), false); 229 | for (int pattern_index = 0; pattern_index < pattern_count; ++pattern_index) { 230 | scanf("%s", buffer); 231 | string pattern = buffer; 232 | vector occurrences = FindOccurrences(pattern, text, suffix_array, bwt, starts, occ_count_before); 233 | for (int j = 0; j < occurrences.size(); ++j) { 234 | occurs[occurrences[j]] = true; 235 | } 236 | } 237 | for (int i = 0; i < occurs.size(); ++i) { 238 | if (occurs[i]) { 239 | printf("%d ", i); 240 | } 241 | } 242 | printf("\n"); 243 | return 0; 244 | } 245 | -------------------------------------------------------------------------------- /week3-4/suffix_tree_from_array/suffix_tree_from_array.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using std::make_pair; 9 | using std::map; 10 | using std::pair; 11 | using std::string; 12 | using std::vector; 13 | 14 | // Data structure to store edges of a suffix tree. 15 | struct Edge { 16 | // The ending node of this edge. 17 | int node; 18 | // Starting position of the substring of the text 19 | // corresponding to the label of this edge. 
20 | int start; 21 | // Position right after the end of the substring of the text 22 | // corresponding to the label of this edge. 23 | int end; 24 | 25 | Edge(int node_, int start_, int end_) : node(node_), start(start_), end(end_) {} 26 | Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {} 27 | }; 28 | 29 | // Build suffix tree of the string text given its suffix array suffix_array 30 | // and LCP array lcp_array. Return the tree as a mapping from a node ID 31 | // to the vector of all outgoing edges of the corresponding node. The edges in the 32 | // vector must be sorted in the ascending order by the first character of the edge label. 33 | // Root must have node ID = 0, and all other node IDs must be different 34 | // nonnegative integers. 35 | // 36 | // For example, if text = "ACACAA$", an edge with label "$" from root to a node with ID 1 37 | // must be represented by Edge(1, 6, 7). This edge must be present in the vector tree[0] 38 | // (corresponding to the root node), and it should be the first edge in the vector 39 | // (because it has the smallest first character of all edges outgoing from the root). 40 | map > SuffixTreeFromSuffixArray( 41 | const vector& suffix_array, 42 | const vector& lcp_array, 43 | const string& text) { 44 | map > tree; 45 | // Implement this function yourself 46 | return tree; 47 | } 48 | 49 | 50 | int main() { 51 | char buffer[200001]; 52 | scanf("%s", buffer); 53 | string text = buffer; 54 | vector suffix_array(text.length()); 55 | for (int i = 0; i < text.length(); ++i) { 56 | scanf("%d", &suffix_array[i]); 57 | } 58 | vector lcp_array(text.length() - 1); 59 | for (int i = 0; i + 1 < text.length(); ++i) { 60 | scanf("%d", &lcp_array[i]); 61 | } 62 | // Build the suffix tree and get a mapping from 63 | // suffix tree node ID to the list of outgoing Edges. 
64 | map > tree = SuffixTreeFromSuffixArray(suffix_array, lcp_array, text); 65 | printf("%s\n", buffer); 66 | // Output the edges of the suffix tree in the required order. 67 | // Note that we use here the contract that the root of the tree 68 | // will have node ID = 0 and that each vector of outgoing edges 69 | // will be sorted by the first character of the corresponding edge label. 70 | // 71 | // The following code avoids recursion to avoid stack overflow issues. 72 | // It uses a stack to convert recursive function to a while loop. 73 | // The stack stores pairs (node, edge_index). 74 | // This code is an equivalent of 75 | // 76 | // OutputEdges(tree, 0); 77 | // 78 | // for the following _recursive_ function OutputEdges: 79 | // 80 | // void OutputEdges(map > tree, int node_id) { 81 | // const vector& edges = tree[node_id]; 82 | // for (int edge_index = 0; edge_index < edges.size(); ++edge_index) { 83 | // printf("%d %d\n", edges[edge_index].start, edges[edge_index].end); 84 | // OutputEdges(tree, edges[edge_index].node); 85 | // } 86 | // } 87 | // 88 | vector > stack(1, make_pair(0, 0)); 89 | while (!stack.empty()) { 90 | pair p = stack.back(); 91 | stack.pop_back(); 92 | int node = p.first; 93 | int edge_index = p.second; 94 | if (!tree.count(node)) { 95 | continue; 96 | } 97 | const vector& edges = tree[node]; 98 | if (edge_index + 1 < edges.size()) { 99 | stack.push_back(make_pair(node, edge_index + 1)); 100 | } 101 | printf("%d %d\n", edges[edge_index].start, edges[edge_index].end); 102 | stack.push_back(make_pair(edges[edge_index].node, 0)); 103 | } 104 | return 0; 105 | } 106 | --------------------------------------------------------------------------------