├── Slides.pptx
├── Project Report.pdf
├── README.md
├── LICENSE
├── palindrome.ipynb
├── Project.ipynb
├── LPS.ipynb
├── LPS3.ipynb
├── new.ipynb
└── Code.ipynb


/Slides.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jayasurya-Marasani/Suffix-Arrays-in-Genome-Assembly/HEAD/Slides.pptx


--------------------------------------------------------------------------------
/Project Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jayasurya-Marasani/Suffix-Arrays-in-Genome-Assembly/HEAD/Project Report.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Suffix-Arrays-in-Genome-Assembly
2 | This repository demonstrates the use of suffix arrays in genome assembly.
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Jayasurya Marasani
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/palindrome.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 7,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "def longestPalSubstr(str):\n",
10 |     "    \"\"\"Return (longest palindromic substring of str, its length); ('', 0) for ''.\n",
11 |     "\n",
12 |     "    Brute force over every (start, end) span; the leftmost longest palindrome wins.\n",
13 |     "    Prints a header line as a side effect, as the original implementation did.\n",
14 |     "    \"\"\"\n",
15 |     "    n = len(str)\n",
16 |     "    # min(1, n): one character is always a palindrome, but '' has none to index\n",
17 |     "    start, maxLength = 0, min(1, n)\n",
18 |     "    for i in range(n):\n",
19 |     "        # only spans longer than the current best can replace it\n",
20 |     "        for j in range(i + maxLength, n):\n",
21 |     "            span = str[i:j + 1]\n",
22 |     "            if span == span[::-1]:\n",
23 |     "                start, maxLength = i, j - i + 1\n",
24 |     "    print(\"Longest palindrome subString is: \")\n",
25 |     "    s1 = str[start:start + maxLength]\n",
26 |     "    return s1, len(s1)"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "code",
31 |    "execution_count": 8,
32 |    "metadata": {},
33 |    "outputs": [
34 |     {
35 |      "name": "stdout",
36 |      "output_type": "stream",
37 |      "text": [
38 |       "Longest palindrome subString is: \n",
39 |       "('anana', 5)\n"
40 |      ]
41 |     }
42 |    ],
43 |    "source": [
44 |     "s = input('Enter the string: ')  # text to search for its longest palindromic substring\n",
45 |     "print(longestPalSubstr(s))  # prints the (substring, length) tuple returned by the function"
46 |    ]
47 |   }
48 |  ],
49 |  "metadata": {
50 |   "interpreter": {
51 |    "hash": "3dc8e47f82edb7463ca464588a7358a3d93aa0e55ce1d99dfe2c7a888c7afeb3"
52 |   },
53 |   "kernelspec": {
54 |    "display_name": "Python 3.10.1 64-bit",
55 |    "language": "python",
56 |    "name": "python3"
57 |   },
58 |   "language_info": {
59 |    "codemirror_mode": {
60 |     "name": "ipython",
61 |     "version": 3
62 |    },
63 |    "file_extension": ".py",
64 |    "mimetype": "text/x-python",
65 |    "name": "python",
66 |    "nbconvert_exporter": "python",
67 |    "pygments_lexer": "ipython3",
68 |    "version": "3.10.1"
69 |   },
70 |   "orig_nbformat": 4
71 |  },
72 |  "nbformat": 4,
73 |  "nbformat_minor": 2
74 | }
75 | 


--------------------------------------------------------------------------------
/Project.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "a48ea6b0",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Finding the longest repeated substring"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 31,
 14 |    "id": "fc4abb7b",
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "def Construct_LCP_array(suffix_array, input_string):\n",
 19 |     "    \"\"\"Return lcp where lcp[k] is the common-prefix length of sorted suffixes k-1 and k (lcp[0] = 0).\"\"\"\n",
 20 |     "    # NOTE: a later cell in this notebook re-defines this name with a naive version,\n",
 21 |     "    # so after Run All it is that later definition which main() calls.\n",
 22 |     "    size = len(input_string)\n",
 23 |     "    lcp = [0] * size  # zero-filled: empty input returns [] instead of failing on lcp[0]\n",
 24 |     "    rank = [0] * size  # rank[p] = row of the suffix starting at p in suffix_array\n",
 25 |     "    for row, pos in enumerate(suffix_array):\n",
 26 |     "        rank[pos] = row\n",
 27 |     "    h = 0  # matched length carried over from the previous text position\n",
 28 |     "    for i in range(size):\n",
 29 |     "        if rank[i] > 0:\n",
 30 |     "            j = suffix_array[rank[i] - 1]  # start of the suffix sorted just before suffix i\n",
 31 |     "            while i + h < size and j + h < size and input_string[i + h] == input_string[j + h]:\n",
 32 |     "                h += 1\n",
 33 |     "            lcp[rank[i]] = h\n",
 34 |     "            h = max(h - 1, 0)\n",
 35 |     "    return lcp"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 32,
 41 |    "id": "c2cbdb1d",
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "def longestCommonPrefix(strs):\n",
 46 |     "    \"\"\"Return the length of the longest prefix shared by every string in strs.\n",
 47 |     "\n",
 48 |     "    An empty strs yields 0. The original returned the string '' on that path,\n",
 49 |     "    so callers received a str in one case and an int in every other.\n",
 50 |     "    \"\"\"\n",
 51 |     "    if not strs:\n",
 52 |     "        return 0\n",
 53 |     "    # the shared prefix cannot be longer than the shortest string\n",
 54 |     "    limit = min(len(item) for item in strs)\n",
 55 |     "    for k in range(limit):\n",
 56 |     "        # stop at the first column where any string disagrees with the first one\n",
 57 |     "        if any(item[k] != strs[0][k] for item in strs):\n",
 58 |     "            return k\n",
 59 |     "    return limit"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 33,
 65 |    "id": "587fe403",
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "def Construct_LCP_array(suffix_array, input_string):\n",
 70 |     "    \"\"\"Naive LCP array built by comparing each sorted suffix with the one before it.\n",
 71 |     "\n",
 72 |     "    Prints the list of sorted suffixes, then returns a list whose entry k is\n",
 73 |     "    longestCommonPrefix of suffixes k and k-1; the first entry is 0.\n",
 74 |     "    \"\"\"\n",
 75 |     "    suffixes = [input_string[start:] for start in suffix_array]\n",
 76 |     "    print(suffixes)\n",
 77 |     "    lcp_values = [0] * len(suffixes)\n",
 78 |     "    for row in range(1, len(suffixes)):\n",
 79 |     "        # same argument order as before: current suffix first, previous second\n",
 80 |     "        pair = [suffixes[row], suffixes[row - 1]]\n",
 81 |     "        lcp_values[row] = longestCommonPrefix(pair)\n",
 82 |     "    return lcp_values"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 34,
 88 |    "id": "1d2f2ea7",
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "def Construct_SuffixArray(input_str, len_str):\n",
 93 |     "    \"\"\"Build the suffix array of input_str by sorting the suffixes input_str[i:len_str].\n",
 94 |     "\n",
 95 |     "    Returns (Suffix_array, Suffix_Dict): the suffix start positions in sorted-suffix\n",
 96 |     "    order, and a dict mapping every start position to its suffix text.\n",
 97 |     "    \"\"\"\n",
 98 |     "    # start position -> suffix text\n",
 99 |     "    Suffix_Dict = {start: input_str[start:len_str] for start in range(len_str)}\n",
100 |     "\n",
101 |     "    # suffix text -> start position, used to map the sorted texts back to positions\n",
102 |     "    start_of = {text: start for start, text in Suffix_Dict.items()}\n",
103 |     "\n",
104 |     "    ordered_texts = sorted(Suffix_Dict.values())\n",
105 |     "    Suffix_array = [start_of[text] for text in ordered_texts]\n",
106 |     "\n",
107 |     "    return Suffix_array, Suffix_Dict"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 35,
113 |    "id": "fc479fdb",
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "def main():\n",
118 |     "    \"\"\"Read a string, print its suffix and LCP arrays, then its longest repeated substring.\"\"\"\n",
119 |     "    input_str = input(\"Enter the Input String : \")\n",
120 |     "    Suffix_array, suffix_dict = Construct_SuffixArray(input_str, len(input_str))\n",
121 |     "    print(Suffix_array)\n",
122 |     "    lcp = Construct_LCP_array(Suffix_array, input_str)\n",
123 |     "    print(lcp)\n",
124 |     "    # default=0 and the `if lcp` guard: an empty input used to raise ValueError in max([])\n",
125 |     "    longest = max(lcp, default=0)\n",
126 |     "    result = suffix_dict[Suffix_array[lcp.index(longest)]][:longest] if lcp else \"\"\n",
127 |     "    print(\"Input string: \" + input_str)\n",
128 |     "    print(\"Longest repeated substring: \" + result)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 36,
134 |    "id": "be97b6b4",
135 |    "metadata": {},
136 |    "outputs": [
137 |     {
138 |      "name": "stdout",
139 |      "output_type": "stream",
140 |      "text": [
141 |       "[10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]\n",
142 |       "['A', 'ABRA', 'ABRACADABRA', 'ACADABRA', 'ADABRA', 'BRA', 'BRACADABRA', 'CADABRA', 'DABRA', 'RA', 'RACADABRA']\n",
143 |       "[0, 1, 4, 1, 1, 0, 3, 0, 0, 0, 2]\n",
144 |       "Input string: ABRACADABRA\n",
145 |       "Longest repeated substring: ABRA\n"
146 |      ]
147 |     }
148 |    ],
149 |    "source": [
150 |     "if __name__ == \"__main__\":      # the recorded run below used the input ABRACADABRA\n",
151 |     "    main()"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "id": "5ddf0e55",
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": []
161 |   }
162 |  ],
163 |  "metadata": {
164 |   "kernelspec": {
165 |    "display_name": "Python 3",
166 |    "language": "python",
167 |    "name": "python3"
168 |   },
169 |   "language_info": {
170 |    "codemirror_mode": {
171 |     "name": "ipython",
172 |     "version": 3
173 |    },
174 |    "file_extension": ".py",
175 |    "mimetype": "text/x-python",
176 |    "name": "python",
177 |    "nbconvert_exporter": "python",
178 |    "pygments_lexer": "ipython3",
179 |    "version": "3.10.1"
180 |   }
181 |  },
182 |  "nbformat": 4,
183 |  "nbformat_minor": 5
184 | }
185 | 


--------------------------------------------------------------------------------
/LPS.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 18,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "banana\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "s = input('Enter the String:\\n')  # text whose longest palindromic substring is wanted\n",
 18 |     "print(s)\n"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 19,
 24 |    "metadata": {},
 25 |    "outputs": [
 26 |     {
 27 |      "data": {
 28 |       "text/plain": [
 29 |        "'banana#ananab'"
 30 |       ]
 31 |      },
 32 |      "execution_count": 19,
 33 |      "metadata": {},
 34 |      "output_type": "execute_result"
 35 |     }
 36 |    ],
 37 |    "source": [
 38 |     "s_new = s + '#' + s[::-1]  # s, a '#' separator, then s reversed (assumes '#' never occurs in s -- TODO confirm)\n",
 39 |     "s_new"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 20,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "def suffix_array_alternative_naive(s):  # suffix start positions of s, in sorted-suffix order\n",
 49 |     "    return sorted(range(len(s)), key=lambda start: s[start:])"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 21,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "def LCP(sa , s_new):\n",
 59 |     "    \"\"\"Return the LCP array of s_new for suffix array sa (entry 0 is 0); prints the rank array on the way.\"\"\"\n",
 60 |     "    size = len(s_new)\n",
 61 |     "    rank = [None] * size  # rank[p] = row of the suffix starting at p in sa\n",
 62 |     "    for row in range(size):\n",
 63 |     "        rank[sa[row]] = row\n",
 64 |     "    print(rank)\n",
 65 |     "    lcp = [None] * size\n",
 66 |     "    matched = 0  # prefix length carried over from the previous text position\n",
 67 |     "    for pos in range(size):\n",
 68 |     "        if rank[pos] > 0:\n",
 69 |     "            prev = sa[rank[pos] - 1]  # suffix sorted immediately before this one\n",
 70 |     "            while pos + matched != size and prev + matched != size and s_new[pos + matched] == s_new[prev + matched]:\n",
 71 |     "                matched += 1\n",
 72 |     "            lcp[rank[pos]] = matched\n",
 73 |     "            matched = max(matched - 1, 0)\n",
 74 |     "    if size > 0:\n",
 75 |     "        lcp[0] = 0\n",
 76 |     "    return lcp"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 26,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "[6, 5, 11, 3, 9, 1, 7, 12, 0, 4, 10, 2, 8]\n",
 89 |       "#ananab\n",
 90 |       "a#ananab\n",
 91 |       "ab\n",
 92 |       "ana#ananab\n",
 93 |       "anab\n",
 94 |       "anana#ananab\n",
 95 |       "ananab\n",
 96 |       "b\n",
 97 |       "banana#ananab\n",
 98 |       "na#ananab\n",
 99 |       "nab\n",
100 |       "nana#ananab\n",
101 |       "nanab\n"
102 |      ]
103 |     }
104 |    ],
105 |    "source": [
106 |     "sa = suffix_array_alternative_naive(s_new)  # suffix array of the combined string\n",
107 |     "print(sa)\n",
108 |     "for i in range (len(s_new)):  # list the suffixes in sorted order, one per line\n",
109 |     "    print(s_new[sa[i]:])"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 23,
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "name": "stdout",
119 |      "output_type": "stream",
120 |      "text": [
121 |       "[8, 5, 11, 3, 9, 1, 0, 6, 12, 4, 10, 2, 7]\n"
122 |      ]
123 |     },
124 |     {
125 |      "data": {
126 |       "text/plain": [
127 |        "[0, 0, 1, 1, 3, 3, 5, 0, 1, 0, 2, 2, 4]"
128 |       ]
129 |      },
130 |      "execution_count": 23,
131 |      "metadata": {},
132 |      "output_type": "execute_result"
133 |     }
134 |    ],
135 |    "source": [
136 |     "lcp = LCP(sa,s_new)  # LCP() also prints its rank array, which is the first line of this cell's output\n",
137 |     "lcp"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 24,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "def longestCommonPrefix(strs):\n",
147 |     "    \"\"\"Return the longest string that is a prefix of every entry of strs.\n",
148 |     "\n",
149 |     "    An empty strs gives ''; a single entry comes back as it is.\n",
150 |     "    \"\"\"\n",
151 |     "    if len(strs) == 0:\n",
152 |     "        return \"\"\n",
153 |     "    first = strs[0]\n",
154 |     "    # the shared prefix can never outgrow the shortest entry\n",
155 |     "    limit = min(len(item) for item in strs)\n",
156 |     "    matched = 0\n",
157 |     "    # advance while every entry agrees with the first one at this column\n",
158 |     "    while matched < limit and all(item[matched] == first[matched] for item in strs):\n",
159 |     "        matched += 1\n",
160 |     "    return first[:matched]"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 25,
166 |    "metadata": {},
167 |    "outputs": [
168 |     {
169 |      "name": "stdout",
170 |      "output_type": "stream",
171 |      "text": [
172 |       "calculating longest prefixes between a#ananab and ab\n",
173 |       "longest prefix between them is \"a\"\n",
174 |       "The length is = 1\n",
175 |       "Position =  11\n",
176 |       "calculating longest prefixes between ana#ananab and anab\n",
177 |       "longest prefix between them is \"ana\"\n",
178 |       "The length is = 3\n",
179 |       "Position =  9\n",
180 |       "calculating longest prefixes between anana#ananab and ananab\n",
181 |       "longest prefix between them is \"anana\"\n",
182 |       "The length is = 5\n",
183 |       "Position =  7\n",
184 |       "Length of Longest Palindrome is =  5\n",
185 |       "Longest Palindrome is =  anana\n"
186 |      ]
187 |     }
188 |    ],
189 |    "source": [
190 |     "# Scan adjacent sorted suffixes that start on opposite sides of '#'; keep the longest palindromic shared prefix.\n",
191 |     "longest_length = 0\n",
192 |     "Len = len(s_new)   # Length of updated string\n",
193 |     "actual_len = len(s) # Length of original string\n",
194 |     "Position = 0\n",
195 |     "strs = [None]*2\n",
196 |     "for i in range(1,Len):\n",
197 |     "    if lcp[i]>longest_length:\n",
198 |     "        if(sa[i-1]<actual_len and sa[i]>actual_len) or (sa[i]<actual_len and sa[i-1]>actual_len):\n",
199 |     "            strs[0], strs[1] = s_new[sa[i-1]:], s_new[sa[i]:]\n",
200 |     "            candidate = longestCommonPrefix(strs)\n",
201 |     "            # a substring shared by s and reversed(s) need not be a palindrome (e.g. 'abacd' for s = 'abacdfgdcaba')\n",
202 |     "            if candidate != candidate[::-1]:\n",
203 |     "                continue\n",
204 |     "            print('calculating longest prefixes between {a} and {b}'.format(a = strs[0], b = strs[1]))\n",
205 |     "            longest_length = lcp[i]\n",
206 |     "            print('longest prefix between them is \"{}\"'.format(candidate))\n",
207 |     "            print('The length is = {}'.format(longest_length))\n",
208 |     "            Position = sa[i]\n",
209 |     "            print(\"Position = \",Position)\n",
210 |     "length_of_longest_palindrome = longest_length\n",
211 |     "longest_palindrome = s_new[Position:Position+longest_length]\n",
212 |     "print('Length of Longest Palindrome is = ',length_of_longest_palindrome)\n",
213 |     "print('Longest Palindrome is = ',longest_palindrome)"
214 |    ]
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "interpreter": {
219 |    "hash": "3dc8e47f82edb7463ca464588a7358a3d93aa0e55ce1d99dfe2c7a888c7afeb3"
220 |   },
221 |   "kernelspec": {
222 |    "display_name": "Python 3.10.1 64-bit",
223 |    "language": "python",
224 |    "name": "python3"
225 |   },
226 |   "language_info": {
227 |    "codemirror_mode": {
228 |     "name": "ipython",
229 |     "version": 3
230 |    },
231 |    "file_extension": ".py",
232 |    "mimetype": "text/x-python",
233 |    "name": "python",
234 |    "nbconvert_exporter": "python",
235 |    "pygments_lexer": "ipython3",
236 |    "version": "3.10.1"
237 |   },
238 |   "orig_nbformat": 4
239 |  },
240 |  "nbformat": 4,
241 |  "nbformat_minor": 2
242 | }
243 | 


--------------------------------------------------------------------------------
/LPS3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 9,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "The inverted suffix array is:\n",
 13 |       "\n",
 14 |       "[1, 10, '#', 0]\n",
 15 |       "[2, 10, '$', 0]\n",
 16 |       "[2, 4, 'AAATGC$', 2]\n",
 17 |       "[2, 5, 'AATGC$', 1]\n",
 18 |       "[2, 2, 'ATAAATGC$', 2]\n",
 19 |       "[1, 6, 'ATGC#', 4]\n",
 20 |       "[2, 6, 'ATGC$', 2]\n",
 21 |       "[1, 2, 'ATTTATGC#', 0]\n",
 22 |       "[1, 9, 'C#', 1]\n",
 23 |       "[2, 9, 'C$', 1]\n",
 24 |       "[2, 1, 'CATAAATGC$', 3]\n",
 25 |       "[1, 1, 'CATTTATGC#', 0]\n",
 26 |       "[1, 8, 'GC#', 2]\n",
 27 |       "[2, 8, 'GC$', 2]\n",
 28 |       "[2, 0, 'GCATAAATGC$', 4]\n",
 29 |       "[1, 0, 'GCATTTATGC#', 0]\n",
 30 |       "[2, 3, 'TAAATGC$', 2]\n",
 31 |       "[1, 5, 'TATGC#', 1]\n",
 32 |       "[1, 7, 'TGC#', 3]\n",
 33 |       "[2, 7, 'TGC$', 1]\n",
 34 |       "[1, 4, 'TTATGC#', 2]\n",
 35 |       "[1, 3, 'TTTATGC#', 0]\n"
 36 |      ]
 37 |     }
 38 |    ],
 39 |    "source": [
 40 |     "def complement(seq):\n",
 41 |     "    \"\"\"Return (seq, reverse complement of seq); bases outside A/C/G/T raise KeyError.\"\"\"\n",
 42 |     "    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}\n",
 43 |     "    # complement every base first, then reverse the whole strand\n",
 44 |     "    swapped = ''.join([pairing[str(base)] for base in seq])\n",
 45 |     "    reverse_compliment = swapped[::-1]\n",
 46 |     "    return seq, reverse_compliment\n",
 47 |     "\n",
 48 |     "def add_charectors(seq,reverse_compliment):\n",
 49 |     "    \"\"\"Return (seq + '#', reverse_compliment + '$'): each string gets its own end marker.\"\"\"\n",
 50 |     "    marked_seq, marked_rev = seq + '#', reverse_compliment + '$'\n",
 51 |     "    return marked_seq, marked_rev\n",
 52 |     "\n",
 53 |     "\n",
 54 |     "\n",
 55 |     "def Construct_SuffixArray(str_1, str_2):\n",
 56 |     "    \"\"\"Build the merged, sorted suffix table of two strings.\n",
 57 |     "\n",
 58 |     "    Returns a list of rows [string_id, start, suffix, lcp], sorted by suffix text:\n",
 59 |     "    string_id is 1 for str_1 and 2 for str_2, start is the suffix's offset in its own\n",
 60 |     "    string, and lcp is longestCommonPrefix with the next row (0 for the last row).\n",
 61 |     "    \"\"\"\n",
 62 |     "    # Carry (string_id, start) alongside each suffix. The original recovered them from\n",
 63 |     "    # dicts keyed by suffix text, so a suffix present in both strings kept only one\n",
 64 |     "    # id/start, and str_2 was sliced using len(str_1) even when the lengths differ.\n",
 65 |     "    rows = []\n",
 66 |     "    for i in range(max(len(str_1), len(str_2))):\n",
 67 |     "        if i < len(str_1):\n",
 68 |     "            rows.append([1, i, str_1[i:]])\n",
 69 |     "        if i < len(str_2):\n",
 70 |     "            rows.append([2, i, str_2[i:]])\n",
 71 |     "\n",
 72 |     "    # list.sort is stable, so equal suffixes keep the insertion order above\n",
 73 |     "    rows.sort(key=lambda row: row[2])\n",
 74 |     "\n",
 75 |     "    last = len(rows) - 1\n",
 76 |     "    for k, row in enumerate(rows):\n",
 77 |     "        if k == last:\n",
 78 |     "            row.append(0)\n",
 79 |     "        else:\n",
 80 |     "            # rows[k + 1][2] is still the suffix text: the lcp is appended after it\n",
 81 |     "            row.append(longestCommonPrefix([row[2], rows[k + 1][2]]))\n",
 82 |     "    return rows\n",
 83 |     "\n",
 84 |     "def longestCommonPrefix(strs):\n",
 85 |     "    \"\"\"Return the length of the longest prefix shared by every string in strs.\n",
 86 |     "\n",
 87 |     "    An empty strs yields 0. The original returned the string '' on that path,\n",
 88 |     "    so callers received a str in one case and an int in every other.\n",
 89 |     "    \"\"\"\n",
 90 |     "    if not strs:\n",
 91 |     "        return 0\n",
 92 |     "    # the shared prefix cannot be longer than the shortest string\n",
 93 |     "    limit = min(len(item) for item in strs)\n",
 94 |     "    for k in range(limit):\n",
 95 |     "        # stop at the first column where any string disagrees with the first one\n",
 96 |     "        if any(item[k] != strs[0][k] for item in strs):\n",
 97 |     "            return k\n",
 98 |     "    return limit\n",
 99 |     "\n",
100 |     "\n",
101 |     "\n",
102 |     "seq,rev=complement(input('Enter the DNA sequence:')) # sample inputs: GCATTTATGC , CGCTGTAGCG\n",
103 |     "seq1,rev1=add_charectors(seq,rev)  # append the '#' / '$' end markers\n",
104 |     "inverted_sa= Construct_SuffixArray(seq1,rev1)  # rows of [string_id, start, suffix, lcp with next row]\n",
105 |     "print('The inverted suffix array is:\\n')\n",
106 |     "for i in inverted_sa:\n",
107 |     "    print(i)\n"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {},
113 |    "source": [
114 |     "## ALGORITHM TO DETECT LONG ARMED GAPPED PALINDROMES"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "Finding the suffix start indexes whose LCP equals the maximum LCP length"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 10,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "name": "stdout",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "[6, 0]\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "# largest LCP value stored in any row (column 3 of the table)\n",
139 |     "max_LCP = max(row[3] for row in inverted_sa)\n",
140 |     "# start offsets (column 1) of every row from string 1 or string 2 that reaches that maximum\n",
141 |     "max_LCP_indexes = [\n",
142 |     "    row[1] for row in inverted_sa\n",
143 |     "    if row[3] == max_LCP and row[0] in (1, 2)\n",
144 |     "]\n",
145 |     "print(max_LCP_indexes)\n"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "Verifying Length Constraints"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 11,
158 |    "metadata": {},
159 |    "outputs": [
160 |     {
161 |      "name": "stdout",
162 |      "output_type": "stream",
163 |      "text": [
164 |       "Complementary\n"
165 |      ]
166 |     }
167 |    ],
168 |    "source": [
169 |     "from Bio.Seq import Seq\n",
170 |     "\n",
171 |     "i = min(max_LCP_indexes)  # left scan position: smallest start offset with the maximum LCP\n",
172 |     "j = max(max_LCP_indexes)  # right scan position: largest such offset\n",
173 |     "wi=''\n",
174 |     "wj=''\n",
175 |     "while(i<j):  # move i rightwards until the prefix seq[0:i] equals rev[i-max_LCP:i]\n",
176 |     "    a = i-max_LCP\n",
177 |     "    b = i-1\n",
178 |     "    if a>=0 and b>=0 and b>a:\n",
179 |     "        wi =''\n",
180 |     "        wi = seq[0:i]\n",
181 |     "        if(wi==rev[a:b+1]):\n",
182 |     "            wi = wi[::-1]  # the matched arm is kept reversed for the complement comparison below\n",
183 |     "            break\n",
184 |     "    i = i+1\n",
185 |     "while i<j:  # move j leftwards until the suffix seq[j:] equals rev[j:j+max_LCP]\n",
186 |     "    a = j+max_LCP-1\n",
187 |     "    if a<len(seq):\n",
188 |     "        wj=''\n",
189 |     "        wj=seq[j:]\n",
190 |     "        if wj==rev[j:a+1]:\n",
191 |     "            break\n",
192 |     "    j = j-1\n",
193 |     "wj = Seq(wj)\n",
194 |     "flag = -1  # stays -1 unless the comparison below succeeds, in which case it becomes 1\n",
195 |     "if wi == wj.complement():  # NOTE(review): compares a str with a Bio Seq -- confirm Seq equality accepts str\n",
196 |     "    print('Complementary')\n",
197 |     "    flag = 1"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "Verifying Spacer Length Constraints"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 12,
210 |    "metadata": {},
211 |    "outputs": [
212 |     {
213 |      "name": "stdout",
214 |      "output_type": "stream",
215 |      "text": [
216 |       "The sequence GCATTTATGC is long armed gapped palindrome\n",
217 |       "spacer length =  2\n",
218 |       "palindrome arm length =  4\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "\n",
224 |     "if j-i<= max_LCP and flag ==1:  # report only when the spacer j-i is at most max_LCP and the arms were complementary\n",
225 |     "    print('The sequence {} is long armed gapped palindrome'.format(seq))\n",
226 |     "    print('spacer length = ',j-i)\n",
227 |     "    print('palindrome arm length = ',len(wi))"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "## ALGORITHM TO DETECT LENGTH CONSTRAINED GAPPED PALINDROMES"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 13,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "# These are constants that are predefined\n",
244 |     "minarm = 3  # LCP value the next cell selects rows by, and the slice length used in the arm scan\n",
245 |     "maxgap = 5  # upper bound on the spacer length j-i tested in the final cell\n",
246 |     "mingap = 3  # lower bound on the spacer length j-i tested in the final cell"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 14,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "name": "stdout",
256 |      "output_type": "stream",
257 |      "text": [
258 |       "[1, 7]\n"
259 |      ]
260 |     }
261 |    ],
262 |    "source": [
263 |     "# start offsets (column 1) of the rows whose LCP (column 3) equals minarm exactly\n",
264 |     "minarm_indexes = [\n",
265 |     "    row[1] for row in inverted_sa if row[3] == minarm\n",
266 |     "]\n",
267 |     "print(minarm_indexes)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "markdown",
272 |    "metadata": {},
273 |    "source": [
274 |     "Verify Palindrome Arm Constraints"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 15,
280 |    "metadata": {},
281 |    "outputs": [
282 |     {
283 |      "name": "stdout",
284 |      "output_type": "stream",
285 |      "text": [
286 |       "Complementary\n"
287 |      ]
288 |     }
289 |    ],
290 |    "source": [
291 |     "i = min(minarm_indexes)  # left scan position\n",
292 |     "j = max(minarm_indexes)  # right scan position\n",
293 |     "wi=''\n",
294 |     "wj=''\n",
295 |     "while(i<j):  # move i rightwards until the prefix seq[0:i] equals rev[i-minarm:i]\n",
296 |     "    a = i-minarm\n",
297 |     "    b = i-1\n",
298 |     "    if a>=0 and b>=0 and b>a:\n",
299 |     "        wi = ''\n",
300 |     "        wi = seq[0:i]\n",
301 |     "        if(wi==rev[a:b+1]):\n",
302 |     "            wi = wi[::-1]\n",
303 |     "            break\n",
304 |     "    i = i+1\n",
305 |     "\n",
306 |     "while i<j:  # move j leftwards until the suffix seq[j:] equals rev[j:j+minarm]\n",
307 |     "    a = j+minarm-1\n",
308 |     "    if a<len(seq):\n",
309 |     "        wj=''\n",
310 |     "        wj=seq[j:]\n",
311 |     "        if wj==rev[j:a+1]:\n",
312 |     "            break\n",
313 |     "    j = j-1\n",
314 |     "wj = Seq(wj)  # Seq is imported from Bio.Seq in an earlier cell of this notebook\n",
315 |     "flag = -1  # was misspelled `falg`, so flag kept the value set by the long-armed check above\n",
316 |     "if wi == wj.complement():\n",
317 |     "    print('Complementary')\n",
318 |     "    flag = 1"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "Verifying Spacer Length Constraints"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 16,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "name": "stdout",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "The sequence GCATTTATGC is length constrained gapped palindrome\n",
338 |       "spacer length =  4\n",
339 |       "palindrome arm length =  3\n"
340 |      ]
341 |     }
342 |    ],
343 |    "source": [
344 |     "if mingap <= j-i and j-i <=maxgap and flag ==1:  # spacer j-i within [mingap, maxgap] and arms complementary\n",
345 |     "    print('The sequence {} is length constrained gapped palindrome'.format(seq))\n",
346 |     "    print('spacer length = ',j-i)\n",
347 |     "    print('palindrome arm length = ',len(wi))"
348 |    ]
349 |   }
350 |  ],
351 |  "metadata": {
352 |   "interpreter": {
353 |    "hash": "3dc8e47f82edb7463ca464588a7358a3d93aa0e55ce1d99dfe2c7a888c7afeb3"
354 |   },
355 |   "kernelspec": {
356 |    "display_name": "Python 3.10.1 64-bit",
357 |    "language": "python",
358 |    "name": "python3"
359 |   },
360 |   "language_info": {
361 |    "codemirror_mode": {
362 |     "name": "ipython",
363 |     "version": 3
364 |    },
365 |    "file_extension": ".py",
366 |    "mimetype": "text/x-python",
367 |    "name": "python",
368 |    "nbconvert_exporter": "python",
369 |    "pygments_lexer": "ipython3",
370 |    "version": "3.10.1"
371 |   },
372 |   "orig_nbformat": 4
373 |  },
374 |  "nbformat": 4,
375 |  "nbformat_minor": 2
376 | }
377 | 


--------------------------------------------------------------------------------
/new.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 6,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "The entered String is =  TAGCCGATT\n",
 13 |       "The Updated String is  TAGCCGATT#TTAGCCGAT\n",
 14 |       "Suffix Array is =  [9, 12, 1, 17, 6, 14, 3, 15, 4, 16, 5, 13, 2, 18, 8, 11, 0, 7, 10]\n",
 15 |       "LCP:\n",
 16 |       " [0, 0, 7, 1, 2, 0, 5, 1, 4, 0, 3, 1, 6, 0, 1, 1, 8, 1, 2]\n",
 17 |       "calculating longest prefixes between AGCCGAT and AGCCGATT#TTAGCCGAT\n",
 18 |       "longest prefix between them is \"AGCCGAT\"\n",
 19 |       "The length is = 7\n",
 20 |       "Position =  1\n",
 21 |       "calculating longest prefixes between TAGCCGAT and TAGCCGATT#TTAGCCGAT\n",
 22 |       "longest prefix between them is \"TAGCCGAT\"\n",
 23 |       "The length is = 8\n",
 24 |       "Position =  0\n",
 25 |       "Length of Longest Palindrome is =  8\n",
 26 |       "Longest Palindrome is =  TAGCCGAT\n"
 27 |      ]
 28 |     }
 29 |    ],
 30 |    "source": [
 31 |     "def suffix_array_alternative_naive(s):  # suffix start positions of s, in sorted-suffix order\n",
 32 |     "    return sorted(range(len(s)), key=lambda start: s[start:])\n",
 33 |     "\n",
 34 |     "def LCP(sa , s_new):\n",
 35 |     "    \"\"\"Return the LCP array of s_new for suffix array sa: entry k compares sorted suffixes k-1 and k, entry 0 is 0.\"\"\"\n",
 36 |     "    size = len(s_new)\n",
 37 |     "    rank = [None] * size  # rank[p] = row of the suffix starting at p in sa\n",
 38 |     "    for row, start in enumerate(sa):\n",
 39 |     "        rank[start] = row\n",
 40 |     "    lcp = [None] * size\n",
 41 |     "    matched = 0  # prefix length carried over from the previous text position\n",
 42 |     "    for pos in range(size):\n",
 43 |     "        if rank[pos] > 0:\n",
 44 |     "            prev = sa[rank[pos] - 1]  # suffix sorted immediately before this one\n",
 45 |     "            while pos + matched != size and prev + matched != size and s_new[pos + matched] == s_new[prev + matched]:\n",
 46 |     "                matched += 1\n",
 47 |     "            lcp[rank[pos]] = matched\n",
 48 |     "            if matched > 0:\n",
 49 |     "                matched -= 1\n",
 50 |     "    if size > 0:\n",
 51 |     "        lcp[0] = 0\n",
 52 |     "    return lcp\n",
 53 |     "\n",
 54 |     "def longestCommonPrefix(strs):\n",
 55 |     "    \"\"\"Return the longest prefix shared by all strings in strs (an empty string if strs is empty).\"\"\"\n",
 56 |     "    if len(strs) == 0:\n",
 57 |     "        return \"\"\n",
 58 |     "    current = strs[0]\n",
 59 |     "    for other in strs[1:]:\n",
 60 |     "        if len(current) == 0:\n",
 61 |     "            break\n",
 62 |     "        matched = 0\n",
 63 |     "        for left, right in zip(current, other):\n",
 64 |     "            if left != right:\n",
 65 |     "                break\n",
 66 |     "            matched += 1\n",
 67 |     "        current = current[:matched]\n",
 68 |     "    return current\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "s = input('Enter the String:\\n')\n",
 72 |     "print('The entered String is = ',s)\n",
 73 |     "s_new = s + '#' + s[::-1]  # the input, a '#' separator, then the reversed input\n",
 74 |     "print('The Updated String is ',s_new)\n",
 75 |     "sa = suffix_array_alternative_naive(s_new)\n",
 76 |     "print('Suffix Array is = ',sa)\n",
 77 |     "lcp = LCP(sa,s_new)\n",
 78 |     "print('LCP:\\n',lcp)\n",
 79 |     "length_of_longest_palindrome = 0\n",
 80 |     "longest_length = 0\n",
 81 |     "Len = len(s_new)   # Length of updated string\n",
 82 |     "actual_len = len(s) # Length of original string\n",
 83 |     "Position = 0\n",
 84 |     "strs = [None]*2\n",
 85 |     "for i in range(1,Len):\n",
 86 |     "    # Only a strictly longer LCP between rows i-1 and i can replace the current best.\n",
 87 |     "    if lcp[i]>longest_length:\n",
 88 |     "        if(sa[i-1]<actual_len and sa[i]>actual_len) or (sa[i]<actual_len and sa[i-1]>actual_len):  # one suffix starts before the '#', the other after it\n",
 89 |     "            print('calculating longest prefixes between {a} and {b}'.format(a = s_new[sa[i-1]:], b = s_new[sa[i]:]))\n",
 90 |     "            strs[0] = s_new[sa[i-1]:]\n",
 91 |     "            strs[1] = s_new[sa[i]:]\n",
 92 |     "            longest_length = lcp[i]\n",
 93 |     "            print('longest prefix between them is \"{}\"'.format(longestCommonPrefix(strs)))\n",
 94 |     "            print('The length is = {}'.format(longest_length))\n",
 95 |     "            Position = sa[i]\n",
 96 |     "            print(\"Position = \",Position)\n",
 97 |     "# NOTE(review): the two offsets are never checked to mirror each other, so the reported text may not be a palindrome for every input - confirm.\n",
 98 |     "length_of_longest_palindrome = longest_length\n",
 99 |     "longest_palindrome = s_new[Position:Position+longest_length]\n",
100 |     "print('Length of Longest Palindrome is = ',length_of_longest_palindrome)\n",
101 |     "print('Longest Palindrome is = ',longest_palindrome)\n",
102 |     "            \n"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 4,
108 |    "metadata": {},
109 |    "outputs": [
110 |     {
111 |      "name": "stdout",
112 |      "output_type": "stream",
113 |      "text": [
114 |       "The Sequence is:  GCATTTATGC\n",
115 |       "The reversed sequence is:  GCATAAATGC\n",
116 |       "The Updated Sequence is:  GCATTTATGC#\n",
117 |       "The updated reversed sequence is:  GCATAAATGC$\n",
118 |       "The inverted suffix array is:\n",
119 |       "[1, 10, '#', 0]\n",
120 |       "[2, 10, '$', 0]\n",
121 |       "[2, 4, 'AAATGC$', 2]\n",
122 |       "[2, 5, 'AATGC$', 1]\n",
123 |       "[2, 2, 'ATAAATGC$', 2]\n",
124 |       "[1, 6, 'ATGC#', 4]\n",
125 |       "[2, 6, 'ATGC$', 2]\n",
126 |       "[1, 2, 'ATTTATGC#', 0]\n",
127 |       "[1, 9, 'C#', 1]\n",
128 |       "[2, 9, 'C$', 1]\n",
129 |       "[2, 1, 'CATAAATGC$', 3]\n",
130 |       "[1, 1, 'CATTTATGC#', 0]\n",
131 |       "[1, 8, 'GC#', 2]\n",
132 |       "[2, 8, 'GC$', 2]\n",
133 |       "[2, 0, 'GCATAAATGC$', 4]\n",
134 |       "[1, 0, 'GCATTTATGC#', 0]\n",
135 |       "[2, 3, 'TAAATGC$', 2]\n",
136 |       "[1, 5, 'TATGC#', 1]\n",
137 |       "[1, 7, 'TGC#', 3]\n",
138 |       "[2, 7, 'TGC$', 1]\n",
139 |       "[1, 4, 'TTATGC#', 2]\n",
140 |       "[1, 3, 'TTTATGC#', 0]\n",
141 |       "The Maximum LCP Indexes are:  [6, 0]\n",
142 |       "The two Arms are Complementary\n",
143 |       "The sequence GCATTTATGC is long armed gapped palindrome\n",
144 |       "spacer length =  2\n",
145 |       "palindrome arm length =  4\n"
146 |      ]
147 |     }
148 |    ],
149 |    "source": [
150 |     "def complement(seq):\n",
151 |     "    \"\"\"Return (seq, reverse complement of seq); any symbol other than A, C, G, T raises KeyError.\"\"\"\n",
152 |     "    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}\n",
153 |     "    # Complement every base first, then reverse the complemented strand.\n",
154 |     "    complemented = ''.join(pairing[str(base)] for base in seq)\n",
155 |     "    reverse_compliment = complemented[::-1]\n",
156 |     "    return seq, reverse_compliment\n",
157 |     "\n",
158 |     "def add_charectors(seq,reverse_compliment):\n",
159 |     "    \"\"\"Return seq with '#' appended and reverse_compliment with '$' appended.\"\"\"\n",
160 |     "    terminated_seq, terminated_rev = seq + '#', reverse_compliment + '$'\n",
161 |     "    return terminated_seq, terminated_rev\n",
162 |     "\n",
163 |     "\n",
164 |     "\n",
165 |     "def Construct_SuffixArray(str_1, str_2):\n",
166 |     "    \"\"\"Build one sorted suffix table for two strings.\n",
167 |     "\n",
168 |     "    Every row is [string id (1 or 2), start offset, suffix text, LCP with the next row];\n",
169 |     "    the last row gets an LCP of 0. Both strings are sliced up to len(str_1).\n",
170 |     "    \"\"\"\n",
171 |     "    len_str = len(str_1)\n",
172 |     "    suffixes = []\n",
173 |     "    origin = {}   # suffix text -> (string id, start offset); a later duplicate overwrites\n",
174 |     "    for start in range(len_str):\n",
175 |     "        for string_id, text in ((1, str_1), (2, str_2)):\n",
176 |     "            suffix = text[start:len_str]\n",
177 |     "            suffixes.append(suffix)\n",
178 |     "            origin[suffix] = (string_id, start)\n",
179 |     "\n",
180 |     "    ordered_list = sorted(suffixes)\n",
181 |     "    last_row = len(ordered_list) - 1\n",
182 |     "\n",
183 |     "    table = []\n",
184 |     "    for row, suffix in enumerate(ordered_list):\n",
185 |     "        string_id, start = origin[suffix]\n",
186 |     "        if row == last_row:\n",
187 |     "            lcp_next = 0\n",
188 |     "        else:\n",
189 |     "            lcp_next = longestCommonPrefix([suffix, ordered_list[row + 1]])\n",
190 |     "        table.append([string_id, start, suffix, lcp_next])\n",
191 |     "\n",
192 |     "    return table\n",
193 |     "\n",
194 |     "def longestCommonPrefix(strs):\n",
195 |     "    \"\"\"Return the LENGTH of the longest prefix shared by all strings in strs (0 if strs is empty).\"\"\"\n",
196 |     "    if len(strs) == 0:\n",
197 |     "        return 0  # an empty list used to yield '' (a str) while every other path returns an int\n",
198 |     "    current = strs[0]\n",
199 |     "    for other in strs[1:]:\n",
200 |     "        if len(current) == 0:\n",
201 |     "            break\n",
202 |     "        matched = 0\n",
203 |     "        for left, right in zip(current, other):\n",
204 |     "            if left != right:\n",
205 |     "                break\n",
206 |     "            matched += 1\n",
207 |     "        current = current[:matched]\n",
208 |     "    return len(current)\n",
209 |     "\n",
210 |     "\n",
211 |     "\n",
212 |     "seq,rev=complement(input('Enter the DNA sequence:')) # GCATTTATGC , CGCTGTAGCG, \n",
213 |     "print('The Sequence is: ',seq)\n",
214 |     "print('The reversed sequence is: ',rev)\n",
215 |     "seq1,rev1=add_charectors(seq,rev)\n",
216 |     "print('The Updated Sequence is: ',seq1)\n",
217 |     "print('The updated reversed sequence is: ',rev1)\n",
218 |     "inverted_sa= Construct_SuffixArray(seq1,rev1)\n",
219 |     "print('The inverted suffix array is:')\n",
220 |     "for i in inverted_sa:\n",
221 |     "    print(i)\n",
222 |     "# Largest LCP value stored in column 3 of the table.\n",
223 |     "max_LCP = max(inverted_sa,key = lambda x: x[3])[3]\n",
224 |     "max_LCP_indexes = []  # start offsets (column 1) of every row that reaches max_LCP\n",
225 |     "for i in inverted_sa:\n",
226 |     "    if i[3]==max_LCP and i[0]==1:\n",
227 |     "        max_LCP_indexes.append(i[1])\n",
228 |     "    elif i[3]==max_LCP and i[0]==2:  # same action as the branch above: rows of both strings are collected\n",
229 |     "        max_LCP_indexes.append(i[1])\n",
230 |     "print('The Maximum LCP Indexes are: ',max_LCP_indexes)\n",
231 |     "from Bio.Seq import Seq  # Biopython; used below to complement the right arm\n",
232 |     "# Left arm: advance i until seq[0:i] equals rev[a:b+1] or i reaches j.\n",
233 |     "i = min(max_LCP_indexes)\n",
234 |     "j = max(max_LCP_indexes)\n",
235 |     "wi=''\n",
236 |     "wj=''\n",
237 |     "while(i<j):\n",
238 |     "    a = i-max_LCP\n",
239 |     "    b = i-1\n",
240 |     "    if a>=0 and b>=0 and b>a:\n",
241 |     "        wi =''\n",
242 |     "        wi = seq[0:i]\n",
243 |     "        if(wi==rev[a:b+1]):\n",
244 |     "            wi = wi[::-1]  # the left arm is kept reversed for the comparison below\n",
245 |     "            break\n",
246 |     "    i = i+1\n",
247 |     "while i<j:  # right arm: move j down until seq[j:] equals rev[j:a+1] or j reaches i\n",
248 |     "    a = j+max_LCP-1\n",
249 |     "    if a<len(seq):\n",
250 |     "        wj=''\n",
251 |     "        wj=seq[j:]\n",
252 |     "        if wj==rev[j:a+1]:\n",
253 |     "            break\n",
254 |     "    j = j-1\n",
255 |     "wj = Seq(wj)\n",
256 |     "flag = -1\n",
257 |     "if wi == wj.complement():\n",
258 |     "    print('The two Arms are Complementary')\n",
259 |     "    flag = 1\n",
260 |     "# flag stays -1 unless the reversed left arm equals the complement of the right arm.\n",
261 |     "\n",
262 |     "if j-i<= max_LCP and flag ==1:\n",
263 |     "    print('The sequence {} is long armed gapped palindrome'.format(seq))\n",
264 |     "    print('spacer length = ',j-i)\n",
265 |     "    print('palindrome arm length = ',len(wi))"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 5,
271 |    "metadata": {},
272 |    "outputs": [
273 |     {
274 |      "name": "stdout",
275 |      "output_type": "stream",
276 |      "text": [
277 |       "The Sequence is:  GCATTTATGC\n",
278 |       "The reversed sequence is:  GCATAAATGC\n",
279 |       "The Updated Sequence is:  GCATTTATGC#\n",
280 |       "The updated reversed sequence is:  GCATAAATGC$\n",
281 |       "The inverted suffix array is:\n",
282 |       "[1, 10, '#', 0]\n",
283 |       "[2, 10, '$', 0]\n",
284 |       "[2, 4, 'AAATGC$', 2]\n",
285 |       "[2, 5, 'AATGC$', 1]\n",
286 |       "[2, 2, 'ATAAATGC$', 2]\n",
287 |       "[1, 6, 'ATGC#', 4]\n",
288 |       "[2, 6, 'ATGC$', 2]\n",
289 |       "[1, 2, 'ATTTATGC#', 0]\n",
290 |       "[1, 9, 'C#', 1]\n",
291 |       "[2, 9, 'C$', 1]\n",
292 |       "[2, 1, 'CATAAATGC$', 3]\n",
293 |       "[1, 1, 'CATTTATGC#', 0]\n",
294 |       "[1, 8, 'GC#', 2]\n",
295 |       "[2, 8, 'GC$', 2]\n",
296 |       "[2, 0, 'GCATAAATGC$', 4]\n",
297 |       "[1, 0, 'GCATTTATGC#', 0]\n",
298 |       "[2, 3, 'TAAATGC$', 2]\n",
299 |       "[1, 5, 'TATGC#', 1]\n",
300 |       "[1, 7, 'TGC#', 3]\n",
301 |       "[2, 7, 'TGC$', 1]\n",
302 |       "[1, 4, 'TTATGC#', 2]\n",
303 |       "[1, 3, 'TTTATGC#', 0]\n",
304 |       "[1, 7]\n",
305 |       "The palindromic arms are Complementary\n",
306 |       "The sequence GCATTTATGC is length constrained gapped palindrome\n",
307 |       "spacer length =  4\n",
308 |       "palindrome arm length =  3\n"
309 |      ]
310 |     }
311 |    ],
312 |    "source": [
313 |     "def complement(seq):\n",
314 |     "    \"\"\"Return (seq, reverse complement of seq); any symbol other than A, C, G, T raises KeyError.\"\"\"\n",
315 |     "    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}\n",
316 |     "    # Complement every base first, then reverse the complemented strand.\n",
317 |     "    complemented = ''.join(pairing[str(base)] for base in seq)\n",
318 |     "    reverse_compliment = complemented[::-1]\n",
319 |     "    return seq, reverse_compliment\n",
320 |     "\n",
321 |     "def add_charectors(seq,reverse_compliment):\n",
322 |     "    \"\"\"Return seq with '#' appended and reverse_compliment with '$' appended.\"\"\"\n",
323 |     "    terminated_seq, terminated_rev = seq + '#', reverse_compliment + '$'\n",
324 |     "    return terminated_seq, terminated_rev\n",
325 |     "\n",
326 |     "\n",
327 |     "\n",
328 |     "def Construct_SuffixArray(str_1, str_2):\n",
329 |     "    \"\"\"Build one sorted suffix table for two strings.\n",
330 |     "\n",
331 |     "    Every row is [string id (1 or 2), start offset, suffix text, LCP with the next row];\n",
332 |     "    the last row gets an LCP of 0. Both strings are sliced up to len(str_1).\n",
333 |     "    \"\"\"\n",
334 |     "    len_str = len(str_1)\n",
335 |     "    suffixes = []\n",
336 |     "    origin = {}   # suffix text -> (string id, start offset); a later duplicate overwrites\n",
337 |     "    for start in range(len_str):\n",
338 |     "        for string_id, text in ((1, str_1), (2, str_2)):\n",
339 |     "            suffix = text[start:len_str]\n",
340 |     "            suffixes.append(suffix)\n",
341 |     "            origin[suffix] = (string_id, start)\n",
342 |     "\n",
343 |     "    ordered_list = sorted(suffixes)\n",
344 |     "    last_row = len(ordered_list) - 1\n",
345 |     "\n",
346 |     "    table = []\n",
347 |     "    for row, suffix in enumerate(ordered_list):\n",
348 |     "        string_id, start = origin[suffix]\n",
349 |     "        if row == last_row:\n",
350 |     "            lcp_next = 0\n",
351 |     "        else:\n",
352 |     "            lcp_next = longestCommonPrefix([suffix, ordered_list[row + 1]])\n",
353 |     "        table.append([string_id, start, suffix, lcp_next])\n",
354 |     "\n",
355 |     "    return table\n",
356 |     "\n",
357 |     "def longestCommonPrefix(strs):\n",
358 |     "    \"\"\"Return the LENGTH of the longest prefix shared by all strings in strs (0 if strs is empty).\"\"\"\n",
359 |     "    if len(strs) == 0:\n",
360 |     "        return 0  # an empty list used to yield '' (a str) while every other path returns an int\n",
361 |     "    current = strs[0]\n",
362 |     "    for other in strs[1:]:\n",
363 |     "        if len(current) == 0:\n",
364 |     "            break\n",
365 |     "        matched = 0\n",
366 |     "        for left, right in zip(current, other):\n",
367 |     "            if left != right:\n",
368 |     "                break\n",
369 |     "            matched += 1\n",
370 |     "        current = current[:matched]\n",
371 |     "    return len(current)\n",
372 |     "\n",
373 |     "\n",
374 |     "\n",
375 |     "seq,rev=complement(input('Enter the DNA sequence:')) # GCATTTATGC , CGCTGTAGCG, \n",
376 |     "print('The Sequence is: ',seq)\n",
377 |     "print('The reversed sequence is: ',rev)\n",
378 |     "seq1,rev1=add_charectors(seq,rev)\n",
379 |     "print('The Updated Sequence is: ',seq1)\n",
380 |     "print('The updated reversed sequence is: ',rev1)\n",
381 |     "inverted_sa= Construct_SuffixArray(seq1,rev1)\n",
382 |     "print('The inverted suffix array is:')\n",
383 |     "for i in inverted_sa:\n",
384 |     "    print(i)\n",
385 |     "\n",
386 |     "# These are constants that are predefined\n",
387 |     "minarm = 3\n",
388 |     "maxgap = 5\n",
389 |     "mingap = 3   \n",
390 |     "minarm_indexes = []  # start offsets (column 1) of rows whose LCP (column 3) is exactly minarm\n",
391 |     "for k in inverted_sa:\n",
392 |     "    if k[3]==minarm:\n",
393 |     "        minarm_indexes.append(k[1])\n",
394 |     "print(minarm_indexes)\n",
395 |     "i = min(minarm_indexes)  # NOTE(review): min()/max() raise ValueError when no row has an LCP equal to minarm - confirm intended\n",
396 |     "j = max(minarm_indexes)\n",
397 |     "wi=''\n",
398 |     "wj=''\n",
399 |     "while(i<j):  # left arm: advance i until seq[0:i] equals rev[a:b+1] or i reaches j\n",
400 |     "    a = i-minarm\n",
401 |     "    b = i-1\n",
402 |     "    if a>=0 and b>=0 and b>a:\n",
403 |     "        wi = ''\n",
404 |     "        wi = seq[0:i]\n",
405 |     "        if(wi==rev[a:b+1]):\n",
406 |     "            wi = wi[::-1]  # the left arm is kept reversed for the later comparison\n",
407 |     "            break\n",
408 |     "    i = i+1\n",
409 |     "# Right arm: move j down until seq[j:] equals rev[j:a+1] or j reaches i.\n",
410 |     "while i<j:\n",
411 |     "    a = j+minarm-1\n",
412 |     "    if a<len(seq):\n",
413 |     "        wj=''\n",
414 |     "        wj=seq[j:]\n",
415 |     "        if wj==rev[j:a+1]:\n",
416 |     "            break\n",
417 |     "    j = j-1\n",
418 |     "# Pure-Python complement of the right arm: this cell never imports Bio.Seq, so Seq(wj) only worked through leftover kernel state.\n",
419 |     "wj_complement = ''.join({'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}[base] for base in wj)\n",
420 |     "flag = 1 if wi == wj_complement else -1   # the original typo 'falg =-1' never initialised flag\n",
421 |     "if flag == 1:\n",
422 |     "    print('The palindromic arms are Complementary')\n",
423 |     "if mingap <= j-i <= maxgap and flag == 1:\n",
424 |     "    print('The sequence {} is length constrained gapped palindrome'.format(seq))\n",
425 |     "    print('spacer length = ',j-i)\n",
426 |     "    print('palindrome arm length = ',len(wi))"
427 |    ]
428 |   }
429 |  ],
430 |  "metadata": {
431 |   "interpreter": {
432 |    "hash": "3dc8e47f82edb7463ca464588a7358a3d93aa0e55ce1d99dfe2c7a888c7afeb3"
433 |   },
434 |   "kernelspec": {
435 |    "display_name": "Python 3.10.1 64-bit",
436 |    "language": "python",
437 |    "name": "python3"
438 |   },
439 |   "language_info": {
440 |    "codemirror_mode": {
441 |     "name": "ipython",
442 |     "version": 3
443 |    },
444 |    "file_extension": ".py",
445 |    "mimetype": "text/x-python",
446 |    "name": "python",
447 |    "nbconvert_exporter": "python",
448 |    "pygments_lexer": "ipython3",
449 |    "version": "3.10.1"
450 |   },
451 |   "orig_nbformat": 4
452 |  },
453 |  "nbformat": 4,
454 |  "nbformat_minor": 2
455 | }
456 | 


--------------------------------------------------------------------------------
/Code.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Group 6 "
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     " \n",
  15 |     " "
  16 |    ]
  17 |   },
  18 |   {
  19 |    "cell_type": "markdown",
  20 |    "metadata": {},
  21 |    "source": [
  22 |     "# Abhinandhu       AM.EN.U4AIE20002"
  23 |    ]
  24 |   },
  25 |   {
  26 |    "cell_type": "markdown",
  27 |    "metadata": {},
  28 |    "source": [
  29 |     "Suffix Array Construction"
  30 |    ]
  31 |   },
  32 |   {
  33 |    "cell_type": "code",
  34 |    "execution_count": 1,
  35 |    "metadata": {},
  36 |    "outputs": [
  37 |     {
  38 |      "name": "stdout",
  39 |      "output_type": "stream",
  40 |      "text": [
  41 |       "Enter the Input String : ATGCATA\n",
  42 |       "[6, 4, 0, 3, 2, 5, 1]\n"
  43 |      ]
  44 |     }
  45 |    ],
  46 |    "source": [
  47 |     "def Construct_SuffixArray(Input_str,len_str):\n",
  48 |     "    \"\"\"Return the start offsets of the suffixes of Input_str, in sorted suffix order.\n",
  49 |     "\n",
  50 |     "    Input_str -- text to index\n",
  51 |     "    len_str   -- number of start offsets to index; every suffix is the slice\n",
  52 |     "                 Input_str[start:len_str]\n",
  53 |     "    \"\"\"\n",
  54 |     "    # One suffix per start offset.\n",
  55 |     "    suffixes = [Input_str[start:len_str] for start in range(len_str)]\n",
  56 |     "\n",
  57 |     "    # Suffix text -> start offset (the last offset wins if two slices are equal).\n",
  58 |     "    start_of = {}\n",
  59 |     "    for start, suffix in enumerate(suffixes):\n",
  60 |     "        start_of[suffix] = start\n",
  61 |     "\n",
  62 |     "    # Sort the texts and translate each one back to its offset.\n",
  63 |     "    Suffix_array = [start_of[suffix] for suffix in sorted(suffixes)]\n",
  64 |     "    return Suffix_array\n",
  65 |     "\n",
  66 |     "def main():\n",
  67 |     "    \"\"\"Prompt for a string and print its suffix array.\"\"\"\n",
  68 |     "    text = input(\"Enter the Input String : \")  # e.g. ATGCATA\n",
  69 |     "    suffix_array = Construct_SuffixArray(text, len(text))\n",
  70 |     "    print(suffix_array)\n",
  71 |     "if __name__ == \"__main__\":\n",
  72 |     "    main()\n",
  73 |     "    "
  74 |    ]
  75 |   },
  76 |   {
  77 |    "cell_type": "markdown",
  78 |    "metadata": {},
  79 |    "source": [
  80 |     "Pattern Match\n"
  81 |    ]
  82 |   },
  83 |   {
  84 |    "cell_type": "code",
  85 |    "execution_count": 5,
  86 |    "metadata": {},
  87 |    "outputs": [
  88 |     {
  89 |      "name": "stdout",
  90 |      "output_type": "stream",
  91 |      "text": [
  92 |       "Enter the Input String : ATGCCGCAATG\n",
  93 |       "Enter the pattern to Search : ATG\n",
  94 |       "[8, 0]\n"
  95 |      ]
  96 |     }
  97 |    ],
  98 |    "source": [
  99 |     "def Construct_SuffixArray(Input_str,len_str):\n",
 100 |     "    \"\"\"Return the start offsets of the suffixes of Input_str, in sorted suffix order.\n",
 101 |     "\n",
 102 |     "    Input_str -- text to index\n",
 103 |     "    len_str   -- number of start offsets to index; every suffix is the slice\n",
 104 |     "                 Input_str[start:len_str]\n",
 105 |     "    \"\"\"\n",
 106 |     "    # One suffix per start offset.\n",
 107 |     "    suffixes = [Input_str[start:len_str] for start in range(len_str)]\n",
 108 |     "\n",
 109 |     "    # Suffix text -> start offset (the last offset wins if two slices are equal).\n",
 110 |     "    start_of = {}\n",
 111 |     "    for start, suffix in enumerate(suffixes):\n",
 112 |     "        start_of[suffix] = start\n",
 113 |     "\n",
 114 |     "    # Sort the texts and translate each one back to its offset.\n",
 115 |     "    Suffix_array = []\n",
 116 |     "    for suffix in sorted(suffixes):\n",
 117 |     "        Suffix_array.append(start_of[suffix])\n",
 118 |     "    return Suffix_array\n",
 119 |     "\n",
 120 |     "def Search_pattern(pattern ,Input_str,Suffix_array):\n",
 121 |     "    \"\"\"Return the start offsets of every occurrence of pattern in Input_str.\n",
 122 |     "\n",
 123 |     "    Suffix_array must be the suffix array of Input_str. The offsets come back in\n",
 124 |     "    suffix-array order, not sorted by position.\n",
 125 |     "\n",
 126 |     "    Raises IndexError('NO MATCH FOUND') when the pattern does not occur, which\n",
 127 |     "    includes an empty Suffix_array.\n",
 128 |     "    \"\"\"\n",
 129 |     "    def match_at(i):\n",
 130 |     "        return Input_str[i: i + len(pattern)] == pattern\n",
 131 |     "\n",
 132 |     "    # Lower-bound binary search: first suffix that is not smaller than pattern.\n",
 133 |     "    l = 0\n",
 134 |     "    r = len(Suffix_array) - 1\n",
 135 |     "    while l < r:\n",
 136 |     "        mid_value = (l + r) // 2\n",
 137 |     "        if Input_str[Suffix_array[mid_value]:] < pattern:\n",
 138 |     "            l = mid_value + 1\n",
 139 |     "        else:\n",
 140 |     "            r = mid_value\n",
 141 |     "    if not Suffix_array or not match_at(Suffix_array[l]):\n",
 142 |     "        raise IndexError('NO MATCH FOUND')\n",
 143 |     "    # Matching suffixes sit next to each other in the suffix array: widen both ways.\n",
 144 |     "    first = l\n",
 145 |     "    while first > 0 and match_at(Suffix_array[first - 1]):\n",
 146 |     "        first -= 1   # the leftover debug print of the match flag was dropped here\n",
 147 |     "    last = l\n",
 148 |     "    while last < len(Suffix_array) and match_at(Suffix_array[last]):\n",
 149 |     "        last += 1\n",
 150 |     "    return Suffix_array[first:last]\n",
 151 |     "\n",
 152 |     "\n",
 153 |     "        \n",
 154 |     "def main():\n",
 155 |     "    \"\"\"Prompt for a text and a pattern, then print the offsets where the pattern occurs.\"\"\"\n",
 156 |     "    text = input(\"Enter the Input String : \")  # e.g. ATGCATCATG\n",
 157 |     "    pattern = input(\"Enter the pattern to Search : \")  # e.g. ATG\n",
 158 |     "    suffix_array = Construct_SuffixArray(text, len(text))\n",
 159 |     "    matches = Search_pattern(pattern, text, suffix_array)\n",
 160 |     "    print(matches)\n",
 161 |     "\n",
 162 |     "if __name__ == \"__main__\":\n",
 163 |     "    main()"
 164 |    ]
 165 |   },
 166 |   {
 167 |    "cell_type": "markdown",
 168 |    "metadata": {},
 169 |    "source": [
 170 |     " "
 171 |    ]
 172 |   },
 173 |   {
 174 |    "cell_type": "markdown",
 175 |    "metadata": {},
 176 |    "source": [
 177 |     "# Hariprasad S   AM.EN.U4AIE20035"
 178 |    ]
 179 |   },
 180 |   {
 181 |    "cell_type": "markdown",
 182 |    "metadata": {},
 183 |    "source": [
 184 |     "Finding the Longest Repeated Substring"
 185 |    ]
 186 |   },
 187 |   {
 188 |    "cell_type": "code",
 189 |    "execution_count": 6,
 190 |    "metadata": {},
 191 |    "outputs": [],
 192 |    "source": [
 193 |     "def longestCommonPrefix(strs):\n",
 194 |     "    \"\"\"Return the LENGTH of the longest prefix shared by all strings in strs (0 if strs is empty).\"\"\"\n",
 195 |     "    if len(strs) == 0:\n",
 196 |     "        return 0  # an empty list used to yield '' (a str) while every other path returns an int\n",
 197 |     "    current = strs[0]\n",
 198 |     "    for other in strs[1:]:\n",
 199 |     "        if len(current) == 0:\n",
 200 |     "            break\n",
 201 |     "        matched = 0\n",
 202 |     "        for left, right in zip(current, other):\n",
 203 |     "            if left != right:\n",
 204 |     "                break\n",
 205 |     "            matched += 1\n",
 206 |     "        current = current[:matched]\n",
 207 |     "    return len(current)"
 208 |    ]
 209 |   },
 210 |   {
 211 |    "cell_type": "code",
 212 |    "execution_count": 7,
 213 |    "metadata": {},
 214 |    "outputs": [],
 215 |    "source": [
 216 |     "def Construct_LCP_array(suffix_array, input_string):\n",
 217 |     "    \"\"\"Return the LCP array that belongs to suffix_array.\n",
 218 |     "\n",
 219 |     "    Entry k is the length of the prefix shared by the suffixes at rows k-1 and k\n",
 220 |     "    of suffix_array; entry 0 is always 0.\n",
 221 |     "    \"\"\"\n",
 222 |     "    ordered_list = [input_string[start:] for start in suffix_array]\n",
 223 |     "    lcp = [0] * len(ordered_list)\n",
 224 |     "    for row in range(1, len(ordered_list)):\n",
 225 |     "        # compare each suffix with the one sorted directly before it\n",
 226 |     "        lcp[row] = longestCommonPrefix([ordered_list[row], ordered_list[row - 1]])\n",
 227 |     "    return lcp"
 228 |    ]
 229 |   },
 230 |   {
 231 |    "cell_type": "code",
 232 |    "execution_count": 8,
 233 |    "metadata": {},
 234 |    "outputs": [],
 235 |    "source": [
 236 |     "def Construct_SuffixArray(input_str, len_str):\n",
 237 |     "    \"\"\"Return (suffix array, {start offset: suffix text}) for input_str.\n",
 238 |     "\n",
 239 |     "    Every suffix is the slice input_str[start:len_str] for start in range(len_str).\n",
 240 |     "    \"\"\"\n",
 241 |     "    Suffix_Dict = {start: input_str[start:len_str] for start in range(len_str)}\n",
 242 |     "\n",
 243 |     "    # Invert the mapping so each sorted suffix can be traced back to its offset.\n",
 244 |     "    start_of = {}\n",
 245 |     "    for start, suffix in Suffix_Dict.items():\n",
 246 |     "        start_of[suffix] = start\n",
 247 |     "\n",
 248 |     "    Suffix_array = []\n",
 249 |     "    for suffix in sorted(Suffix_Dict.values()):\n",
 250 |     "        Suffix_array.append(start_of[suffix])\n",
 251 |     "    return Suffix_array, Suffix_Dict"
 252 |    ]
 253 |   },
 254 |   {
 255 |    "cell_type": "code",
 256 |    "execution_count": 10,
 257 |    "metadata": {},
 258 |    "outputs": [],
 259 |    "source": [
 260 |     "def main():\n",
 261 |     "    \"\"\"Prompt for a string and print its longest repeated substring.\"\"\"\n",
 262 |     "    input_str = input(\"Enter the Input String : \")\n",
 263 |     "    Suffix_array, suffix_dict = Construct_SuffixArray(input_str, len(input_str))\n",
 264 |     "    lcp = Construct_LCP_array(Suffix_array, input_str)\n",
 265 |     "    # An empty input has an empty LCP array and max([]) raises ValueError, so report '' for it.\n",
 266 |     "    longest = max(lcp) if lcp else 0\n",
 267 |     "    result = suffix_dict[Suffix_array[lcp.index(longest)]][:longest] if lcp else \"\"\n",
 268 |     "    print(\"Input string: \" + input_str)\n",
 269 |     "    print(\"Longest repeated substring: \" + result)"
 270 |    ]
 271 |   },
 272 |   {
 273 |    "cell_type": "code",
 274 |    "execution_count": 11,
 275 |    "metadata": {},
 276 |    "outputs": [
 277 |     {
 278 |      "name": "stdout",
 279 |      "output_type": "stream",
 280 |      "text": [
 281 |       "Enter the Input String : ABRACADABRA\n",
 282 |       "Input string: ABRACADABRA\n",
 283 |       "Longest repeated substring: ABRA\n"
 284 |      ]
 285 |     }
 286 |    ],
 287 |    "source": [
 288 |     "if __name__ == \"__main__\":  # entry point: main() runs only when __name__ is '__main__'\n",
 289 |     "    main()"
 290 |    ]
 291 |   },
 292 |   {
 293 |    "cell_type": "markdown",
 294 |    "metadata": {},
 295 |    "source": [
 296 |     "| "
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "markdown",
 301 |    "metadata": {},
 302 |    "source": [
 303 |     "# M Mahadev   AM.EN.U4AIE20045"
 304 |    ]
 305 |   },
 306 |   {
 307 |    "cell_type": "markdown",
 308 |    "metadata": {},
 309 |    "source": [
 310 |     "Longest Common Prefix"
 311 |    ]
 312 |   },
 313 |   {
 314 |    "cell_type": "code",
 315 |    "execution_count": 12,
 316 |    "metadata": {},
 317 |    "outputs": [],
 318 |    "source": [
 319 |     "\n",
 320 |     "def Construct_SuffixArray(Input_str, len_str):\n",
 321 |     "    \"\"\"Return the start offsets of the suffixes Input_str[start:len_str] in sorted suffix order.\"\"\"\n",
 322 |     "    # One suffix per start offset.\n",
 323 |     "    suffixes = [Input_str[start:len_str] for start in range(len_str)]\n",
 324 |     "\n",
 325 |     "    # Suffix text -> start offset (the last offset wins if two slices are equal).\n",
 326 |     "    start_of = {}\n",
 327 |     "    for start, suffix in enumerate(suffixes):\n",
 328 |     "        start_of[suffix] = start\n",
 329 |     "\n",
 330 |     "    # Sort the texts and translate each one back to its offset.\n",
 331 |     "    Suffix_array = [start_of[suffix] for suffix in sorted(suffixes)]\n",
 332 |     "    return Suffix_array"
 333 |    ]
 334 |   },
 335 |   {
 336 |    "cell_type": "code",
 337 |    "execution_count": 13,
 338 |    "metadata": {},
 339 |    "outputs": [],
 340 |    "source": [
 341 |     "def inverse_array(l):\n",
 342 |     "    \"\"\"Return ans with ans[l[i]] = i for every index i of l.\"\"\"\n",
 343 |     "    ans = [0] * len(l)\n",
 344 |     "    for position, value in enumerate(l):\n",
 345 |     "        ans[value] = position\n",
 346 |     "    return ans\n",
 347 |     "# Rank of every suffix of s (the inverse of its suffix array); works on the argument s instead of the global 'text'.\n",
 348 |     "def suffix_array_naive(s):\n",
 349 |     "    return inverse_array(Construct_SuffixArray(s,len(s)))\n"
 350 |    ]
 351 |   },
 352 |   {
 353 |    "cell_type": "code",
 354 |    "execution_count": 14,
 355 |    "metadata": {},
 356 |    "outputs": [],
 357 |    "source": [
 358 |     "def longestCommonPrefix(strs):\n",
 359 |     "    \"\"\"Return the LENGTH of the longest prefix shared by all strings in strs (0 if strs is empty).\"\"\"\n",
 360 |     "    if len(strs) == 0:\n",
 361 |     "        return 0  # an empty list used to yield '' (a str) while every other path returns an int\n",
 362 |     "    current = strs[0]\n",
 363 |     "    for other in strs[1:]:\n",
 364 |     "        if len(current) == 0:\n",
 365 |     "            break\n",
 366 |     "        matched = 0\n",
 367 |     "        for left, right in zip(current, other):\n",
 368 |     "            if left != right:\n",
 369 |     "                break\n",
 370 |     "            matched += 1\n",
 371 |     "        current = current[:matched]\n",
 372 |     "    return len(current)"
 373 |    ]
 374 |   },
 375 |   {
 376 |    "cell_type": "code",
 377 |    "execution_count": 15,
 378 |    "metadata": {},
 379 |    "outputs": [],
 380 |    "source": [
 381 |     "def Construct_LCP_array(suffix_array, input_string):\n",
 382 |     "    \"\"\"Return the LCP array that belongs to suffix_array.\n",
 383 |     "\n",
 384 |     "    Entry k is the length of the prefix shared by the suffixes at rows k-1 and k\n",
 385 |     "    of suffix_array; entry 0 is always 0.\n",
 386 |     "    \"\"\"\n",
 387 |     "    ordered_list = [input_string[start:] for start in suffix_array]\n",
 388 |     "\n",
 389 |     "    lcp = [0] * len(ordered_list)\n",
 390 |     "    for row in range(1, len(ordered_list)):\n",
 391 |     "        # compare each suffix with the one sorted directly before it\n",
 392 |     "        lcp[row] = longestCommonPrefix([ordered_list[row], ordered_list[row - 1]])\n",
 393 |     "    return lcp"
 394 |    ]
 395 |   },
 396 |   {
 397 |    "cell_type": "code",
 398 |    "execution_count": 16,
 399 |    "metadata": {},
 400 |    "outputs": [],
 401 |    "source": [
 402 |     "def longest_common_substring(text):\n",
 403 |     "    \"\"\"Map every substring with the maximal LCP in text to the sorted offsets where it starts ({} for '').\"\"\"\n",
 404 |     "    sa = Construct_SuffixArray(text, len(text))\n",
 405 |     "    lcp = Construct_LCP_array(sa, text)\n",
 406 |     "    if not lcp:\n",
 407 |     "        return {}  # max() of an empty LCP array would raise ValueError\n",
 408 |     "    maxlen = max(lcp)\n",
 409 |     "    result = {}\n",
 410 |     "    for i in range(1, len(text)):\n",
 411 |     "        if lcp[i] == maxlen:\n",
 412 |     "            j1, j2 = sa[i - 1], sa[i]\n",
 413 |     "            assert text[j1:j1 + maxlen] == text[j2:j2 + maxlen],\"There is an assertion error\"\n",
 414 |     "            # setdefault seeds a new entry with the left neighbour; every matching row then adds its own offset\n",
 415 |     "            result.setdefault(text[j1:j1 + maxlen], [j1]).append(j2)\n",
 416 |     "    return dict((k, sorted(v)) for k, v in result.items())\n",
 417 |     "\n",
 418 |     "\n",
 419 |     "\n",
 420 |     "\n"
 421 |    ]
 422 |   },
 423 |   {
 424 |    "cell_type": "code",
 425 |    "execution_count": 17,
 426 |    "metadata": {},
 427 |    "outputs": [
 428 |     {
 429 |      "name": "stdout",
 430 |      "output_type": "stream",
 431 |      "text": [
 432 |       "Enter the string: banana\n",
 433 |       "Longest common substrings in \"banana \" are:\n",
 434 |       "{'ana': [1, 3]}\n"
 435 |      ]
 436 |     }
 437 |    ],
 438 |    "source": [
 439 |     "\n",
 440 |     "if __name__ == '__main__':\n",
 441 |     "    text = input(\"Enter the string: \")\n",
 442 |     "    result = longest_common_substring(text)\n",
 443 |     "    # Only the first 20 characters of the input are echoed in the heading.\n",
 444 |     "    print('Longest common substrings in \"{0} \" are:\\n{1}'.format(text[:20], result))"
 445 |    ]
 446 |   },
 447 |   {
 448 |    "cell_type": "markdown",
 449 |    "metadata": {},
 450 |    "source": [
 451 |     " "
 452 |    ]
 453 |   },
 454 |   {
 455 |    "cell_type": "markdown",
 456 |    "metadata": {},
 457 |    "source": [
 458 |     "# Marasani Jayasurya  AM.EN.U4AIE20048"
 459 |    ]
 460 |   },
 461 |   {
 462 |    "cell_type": "markdown",
 463 |    "metadata": {},
 464 |    "source": [
 465 |     "## Finding the longest palindromic substring"
 466 |    ]
 467 |   },
 468 |   {
 469 |    "cell_type": "markdown",
 470 |    "metadata": {},
 471 |    "source": [
 472 |     "Method 1 : Brute Force method"
 473 |    ]
 474 |   },
 475 |   {
 476 |    "cell_type": "code",
 477 |    "execution_count": 22,
 478 |    "metadata": {},
 479 |    "outputs": [],
 480 |    "source": [
 481 |     "def longestPalSubstr(str):\n",
 482 |     "    \"\"\"Return (palindrome, length) for the longest palindromic substring of str.\n",
 483 |     "\n",
 484 |     "    Every window str[i:j+1] is tested by brute force; the earliest longest\n",
 485 |     "    window wins. An empty input yields ('', 0) instead of raising IndexError.\n",
 486 |     "    \"\"\"\n",
 487 |     "    print(\"Longest palindrome subString is: \")\n",
 488 |     "    n = len(str)\n",
 489 |     "    # A non-empty string always holds a palindrome of length 1; an empty one holds none.\n",
 490 |     "    start, maxLength = 0, min(n, 1)\n",
 491 |     "    for i in range(n):\n",
 492 |     "        # Only windows strictly longer than the current best can improve it.\n",
 493 |     "        for j in range(i + maxLength, n):\n",
 494 |     "            window = str[i:j + 1]\n",
 495 |     "            if window == window[::-1]:\n",
 496 |     "                start, maxLength = i, j - i + 1\n",
 497 |     "    best = str[start:start + maxLength]\n",
 498 |     "    return best, len(best)"
 499 |    ]
 500 |   },
 501 |   {
 502 |    "cell_type": "code",
 503 |    "execution_count": 23,
 504 |    "metadata": {},
 505 |    "outputs": [
 506 |     {
 507 |      "name": "stdout",
 508 |      "output_type": "stream",
 509 |      "text": [
 510 |       "Enter the string: banana\n",
 511 |       "Longest palindrome subString is: \n",
 512 |       "('anana', 5)\n"
 513 |      ]
 514 |     }
 515 |    ],
 516 |    "source": [
 517 |     "s = input('Enter the string: ')\n",
 518 |     "print(longestPalSubstr(s))"
 519 |    ]
 520 |   },
 521 |   {
 522 |    "cell_type": "markdown",
 523 |    "metadata": {},
 524 |    "source": [
 525 |     "Method 2 : Using Suffix Array"
 526 |    ]
 527 |   },
 528 |   {
 529 |    "cell_type": "code",
 530 |    "execution_count": 24,
 531 |    "metadata": {},
 532 |    "outputs": [
 533 |     {
 534 |      "name": "stdout",
 535 |      "output_type": "stream",
 536 |      "text": [
 537 |       "Enter the String:\n",
 538 |       "banana\n",
 539 |       "banana\n"
 540 |      ]
 541 |     }
 542 |    ],
 543 |    "source": [
 544 |     "s = input('Enter the String:\\n')\n",
 545 |     "print(s)\n"
 546 |    ]
 547 |   },
 548 |   {
 549 |    "cell_type": "code",
 550 |    "execution_count": 25,
 551 |    "metadata": {},
 552 |    "outputs": [
 553 |     {
 554 |      "data": {
 555 |       "text/plain": [
 556 |        "'banana#ananab'"
 557 |       ]
 558 |      },
 559 |      "execution_count": 25,
 560 |      "metadata": {},
 561 |      "output_type": "execute_result"
 562 |     }
 563 |    ],
 564 |    "source": [
 565 |     "s_new = s + '#' + s[::-1]\n",
 566 |     "s_new"
 567 |    ]
 568 |   },
 569 |   {
 570 |    "cell_type": "code",
 571 |    "execution_count": 26,
 572 |    "metadata": {},
 573 |    "outputs": [],
 574 |    "source": [
 575 |     "def suffix_array_alternative_naive(s):\n",
 576 |     "    return sorted(range(len(s)), key=lambda start: s[start:])  # suffix start offsets, ordered by suffix text"
 577 |    ]
 578 |   },
 579 |   {
 580 |    "cell_type": "code",
 581 |    "execution_count": 30,
 582 |    "metadata": {},
 583 |    "outputs": [],
 584 |    "source": [
 585 |     "def LCP(sa , s_new):\n",
 586 |     "    \"\"\"Return lcp where lcp[k] is the length of the prefix shared by the suffixes\n",
 587 |     "    of s_new that start at sa[k-1] and sa[k]; lcp[0] is 0, and empty input gives [].\"\"\"\n",
 588 |     "    total = len(s_new)\n",
 589 |     "    rank = [None] * total\n",
 590 |     "    for position, start in enumerate(sa):\n",
 591 |     "        rank[start] = position\n",
 592 |     "    lcp, matched = [None] * total, 0\n",
 593 |     "    for start in range(total):\n",
 594 |     "        if rank[start] == 0:\n",
 595 |     "            continue  # the first suffix in sa has no predecessor to compare with\n",
 596 |     "        prev = sa[rank[start] - 1]\n",
 597 |     "        while start + matched != total and prev + matched != total and s_new[start + matched] == s_new[prev + matched]:\n",
 598 |     "            matched += 1\n",
 599 |     "        lcp[rank[start]] = matched\n",
 600 |     "        matched = max(matched - 1, 0)  # carried over as the starting point for the next suffix\n",
 601 |     "    if total > 0:\n",
 602 |     "        lcp[0] = 0\n",
 603 |     "    return lcp\n"
 604 |    ]
 605 |   },
 606 |   {
 607 |    "cell_type": "code",
 608 |    "execution_count": 31,
 609 |    "metadata": {},
 610 |    "outputs": [
 611 |     {
 612 |      "name": "stdout",
 613 |      "output_type": "stream",
 614 |      "text": [
 615 |       "[6, 5, 11, 3, 9, 1, 7, 12, 0, 4, 10, 2, 8]\n",
 616 |       "#ananab\n",
 617 |       "a#ananab\n",
 618 |       "ab\n",
 619 |       "ana#ananab\n",
 620 |       "anab\n",
 621 |       "anana#ananab\n",
 622 |       "ananab\n",
 623 |       "b\n",
 624 |       "banana#ananab\n",
 625 |       "na#ananab\n",
 626 |       "nab\n",
 627 |       "nana#ananab\n",
 628 |       "nanab\n"
 629 |      ]
 630 |     }
 631 |    ],
 632 |    "source": [
 633 |     "sa = suffix_array_alternative_naive(s_new)\n",
 634 |     "print(sa)\n",
 635 |     "for i in range (len(s_new)):\n",
 636 |     "    print(s_new[sa[i]:])"
 637 |    ]
 638 |   },
 639 |   {
 640 |    "cell_type": "code",
 641 |    "execution_count": 32,
 642 |    "metadata": {},
 643 |    "outputs": [
 644 |     {
 645 |      "data": {
 646 |       "text/plain": [
 647 |        "[0, 0, 1, 1, 3, 3, 5, 0, 1, 0, 2, 2, 4]"
 648 |       ]
 649 |      },
 650 |      "execution_count": 32,
 651 |      "metadata": {},
 652 |      "output_type": "execute_result"
 653 |     }
 654 |    ],
 655 |    "source": [
 656 |     "lcp = LCP(sa,s_new)\n",
 657 |     "lcp"
 658 |    ]
 659 |   },
 660 |   {
 661 |    "cell_type": "code",
 662 |    "execution_count": 33,
 663 |    "metadata": {},
 664 |    "outputs": [],
 665 |    "source": [
 666 |     "def longestCommonPrefix(strs):\n",
 667 |     "    \"\"\"Return the longest prefix shared by every string in strs (empty string for an empty list).\"\"\"\n",
 668 |     "    if not strs:\n",
 669 |     "        return \"\"\n",
 670 |     "    prefix = strs[0]\n",
 671 |     "    for other in strs[1:]:\n",
 672 |     "        if not prefix:\n",
 673 |     "            break  # nothing left to shrink\n",
 674 |     "        shared = 0\n",
 675 |     "        for left, right in zip(prefix, other):\n",
 676 |     "            if left != right:\n",
 677 |     "                break\n",
 678 |     "            shared += 1\n",
 679 |     "        prefix = prefix[:shared]\n",
 680 |     "    return prefix"
 681 |    ]
 682 |   },
 683 |   {
 684 |    "cell_type": "code",
 685 |    "execution_count": 34,
 686 |    "metadata": {},
 687 |    "outputs": [
 688 |     {
 689 |      "name": "stdout",
 690 |      "output_type": "stream",
 691 |      "text": [
 692 |       "calculating longest prefixes between a#ananab and ab\n",
 693 |       "longest prefix between them is \"a\"\n",
 694 |       "The length is = 1\n",
 695 |       "Position =  11\n",
 696 |       "calculating longest prefixes between ana#ananab and anab\n",
 697 |       "longest prefix between them is \"ana\"\n",
 698 |       "The length is = 3\n",
 699 |       "Position =  9\n",
 700 |       "calculating longest prefixes between anana#ananab and ananab\n",
 701 |       "longest prefix between them is \"anana\"\n",
 702 |       "The length is = 5\n",
 703 |       "Position =  7\n",
 704 |       "Length of Longest Palindrome is =  5\n",
 705 |       "Longest Palindrome is =  anana\n"
 706 |      ]
 707 |     }
 708 |    ],
 709 |    "source": [
 710 |     "# Scan adjacent suffix pairs: one suffix must start in s, the other in reversed(s).\n",
 711 |     "# A shared prefix of length h at positions p < q is a palindrome of s only when\n",
 712 |     "# p + q + h == len(s_new); otherwise it is merely a substring common to s and its\n",
 713 |     "# reverse (e.g. 'abcxycba' shares 'abc' with its reverse, which is no palindrome).\n",
 714 |     "Len = len(s_new)   # Length of updated string\n",
 715 |     "actual_len = len(s) # Length of original string\n",
 716 |     "longest_length = min(actual_len, 1)   # any single character is a palindrome\n",
 717 |     "Position = 0\n",
 718 |     "strs = [None]*2\n",
 719 |     "for i in range(1,Len):\n",
 720 |     "    first, second = sorted((sa[i-1], sa[i]))\n",
 721 |     "    if lcp[i]>longest_length and first<actual_len<second and first+second+lcp[i]==Len:\n",
 722 |     "        strs[0] = s_new[sa[i-1]:]\n",
 723 |     "        strs[1] = s_new[sa[i]:]\n",
 724 |     "        print('calculating longest prefixes between {a} and {b}'.format(a = strs[0], b = strs[1]))\n",
 725 |     "        longest_length = lcp[i]\n",
 726 |     "        print('longest prefix between them is \"{}\"'.format(s_new[first:first+longest_length]))\n",
 727 |     "        print('The length is = {}'.format(longest_length))\n",
 728 |     "        Position = sa[i]\n",
 729 |     "        print(\"Position = \",Position)\n",
 730 |     "length_of_longest_palindrome = longest_length\n",
 731 |     "longest_palindrome = s_new[Position:Position+longest_length]\n",
 732 |     "print('Length of Longest Palindrome is = ',length_of_longest_palindrome)\n",
 733 |     "print('Longest Palindrome is = ',longest_palindrome)"
 734 |    ]
 735 |   },
 736 |   {
 737 |    "cell_type": "markdown",
 738 |    "metadata": {},
 739 |    "source": [
 740 |     " "
 741 |    ]
 742 |   },
 743 |   {
 744 |    "cell_type": "markdown",
 745 |    "metadata": {},
 746 |    "source": [
 747 |     "## Algorithm to detect long-armed gapped palindromes"
 748 |    ]
 749 |   },
 750 |   {
 751 |    "cell_type": "code",
 752 |    "execution_count": 35,
 753 |    "metadata": {},
 754 |    "outputs": [
 755 |     {
 756 |      "name": "stdout",
 757 |      "output_type": "stream",
 758 |      "text": [
 759 |       "Enter the DNA sequence:GCATTTATGC\n",
 760 |       "The inverted suffix array is:\n",
 761 |       "\n",
 762 |       "[1, 10, '#', 0]\n",
 763 |       "[2, 10, '$', 0]\n",
 764 |       "[2, 4, 'AAATGC$', 2]\n",
 765 |       "[2, 5, 'AATGC$', 1]\n",
 766 |       "[2, 2, 'ATAAATGC$', 2]\n",
 767 |       "[1, 6, 'ATGC#', 4]\n",
 768 |       "[2, 6, 'ATGC$', 2]\n",
 769 |       "[1, 2, 'ATTTATGC#', 0]\n",
 770 |       "[1, 9, 'C#', 1]\n",
 771 |       "[2, 9, 'C$', 1]\n",
 772 |       "[2, 1, 'CATAAATGC$', 3]\n",
 773 |       "[1, 1, 'CATTTATGC#', 0]\n",
 774 |       "[1, 8, 'GC#', 2]\n",
 775 |       "[2, 8, 'GC$', 2]\n",
 776 |       "[2, 0, 'GCATAAATGC$', 4]\n",
 777 |       "[1, 0, 'GCATTTATGC#', 0]\n",
 778 |       "[2, 3, 'TAAATGC$', 2]\n",
 779 |       "[1, 5, 'TATGC#', 1]\n",
 780 |       "[1, 7, 'TGC#', 3]\n",
 781 |       "[2, 7, 'TGC$', 1]\n",
 782 |       "[1, 4, 'TTATGC#', 2]\n",
 783 |       "[1, 3, 'TTTATGC#', 0]\n"
 784 |      ]
 785 |     }
 786 |    ],
 787 |    "source": [
 788 |     "def complement(seq):\n",
 789 |     "    \"\"\"Return (seq, reverse complement of seq) for an upper-case A/C/G/T sequence.\n",
 790 |     "\n",
 791 |     "    Any other symbol raises KeyError.\n",
 792 |     "    \"\"\"\n",
 793 |     "    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}\n",
 794 |     "    return seq, ''.join(pairing[str(base)] for base in seq)[::-1]\n",
 795 |     "\n",
 796 |     "def add_charectors(seq,reverse_compliment):\n",
 797 |     "    # '#' terminates the sequence and '$' its reverse complement, so a suffix\n",
 798 |     "    # of one string can never equal a suffix of the other.\n",
 799 |     "    return seq + '#', reverse_compliment + '$'\n",
 800 |     "\n",
 801 |     "\n",
 802 |     "\n",
 803 |     "def Construct_SuffixArray(str_1, str_2):\n",
 804 |     "    \"\"\"Build the merged, sorted suffix list of two terminated strings.\n",
 805 |     "\n",
 806 |     "    Returns one [string_id, offset, suffix, lcp] row per suffix in\n",
 807 |     "    lexicographic order: string_id is 1 for str_1 and 2 for str_2, offset is\n",
 808 |     "    the suffix start inside its own string, and lcp is the number of leading\n",
 809 |     "    characters shared with the NEXT row (0 for the last row).\n",
 810 |     "\n",
 811 |     "    The prefix length is computed locally because the notebook binds the name\n",
 812 |     "    longestCommonPrefix twice with different return types (str and int).\n",
 813 |     "    \"\"\"\n",
 814 |     "    def shared_prefix_length(left, right):\n",
 815 |     "        count = 0\n",
 816 |     "        for a, b in zip(left, right):\n",
 817 |     "            if a != b:\n",
 818 |     "                break\n",
 819 |     "            count += 1\n",
 820 |     "        return count\n",
 821 |     "    # Keep id and offset next to each suffix instead of looking them up by\n",
 822 |     "    # suffix text, so equal suffixes or unequal lengths cannot mix them up.\n",
 823 |     "    entries = [(text[start:], string_id, start)\n",
 824 |     "               for string_id, text in ((1, str_1), (2, str_2))\n",
 825 |     "               for start in range(len(text))]\n",
 826 |     "    entries.sort(key=lambda entry: entry[0])\n",
 827 |     "    rows = [[string_id, start, suffix] for suffix, string_id, start in entries]\n",
 828 |     "    for row, following in zip(rows, rows[1:] + [None]):\n",
 829 |     "        row.append(0 if following is None else shared_prefix_length(row[2], following[2]))\n",
 830 |     "    return rows\n",
 831 |     "\n",
 832 |     "def longestCommonPrefix(strs):\n",
 833 |     "    \"\"\"Return the LENGTH of the longest prefix shared by every string in strs.\n",
 834 |     "\n",
 835 |     "    An empty list gives 0 (it used to give an empty string, mixing str and int).\n",
 836 |     "    NOTE: an earlier cell defines a function of the same name that returns the\n",
 837 |     "    prefix text itself; running this cell rebinds the name to this version.\n",
 838 |     "    \"\"\"\n",
 839 |     "    if not strs:\n",
 840 |     "        return 0\n",
 841 |     "    length = 0\n",
 842 |     "    for column in zip(*strs):\n",
 843 |     "        if any(char != column[0] for char in column):\n",
 844 |     "            break\n",
 845 |     "        length += 1\n",
 846 |     "    return length\n",
 847 |     "\n",
 848 |     "\n",
 849 |     "\n",
 850 |     "seq,rev=complement(input('Enter the DNA sequence:')) # GCATTTATGC , CGCTGTAGCG, \n",
 851 |     "seq1,rev1=add_charectors(seq,rev)\n",
 852 |     "inverted_sa= Construct_SuffixArray(seq1,rev1)\n",
 853 |     "print('The inverted suffix array is:\\n')\n",
 854 |     "for i in inverted_sa:\n",
 855 |     "    print(i)\n"
 856 |    ]
 857 |   },
 858 |   {
 859 |    "cell_type": "markdown",
 860 |    "metadata": {},
 861 |    "source": [
 862 |     "Finding the suffix offsets whose LCP value equals the maximum LCP length"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "code",
 867 |    "execution_count": 36,
 868 |    "metadata": {},
 869 |    "outputs": [
 870 |     {
 871 |      "name": "stdout",
 872 |      "output_type": "stream",
 873 |      "text": [
 874 |       "[6, 0]\n"
 875 |      ]
 876 |     }
 877 |    ],
 878 |    "source": [
 879 |     "# Largest 'LCP with the next row' value found anywhere in the suffix table.\n",
 880 |     "max_LCP = max(row[3] for row in inverted_sa)\n",
 881 |     "# Offsets (inside their own string) of every row that reaches that maximum;\n",
 882 |     "# rows from the sequence (id 1) and from its reverse complement (id 2) are\n",
 883 |     "# collected alike.\n",
 884 |     "max_LCP_indexes = [row[1] for row in inverted_sa\n",
 885 |     "                   if row[3] == max_LCP and row[0] in (1, 2)]\n",
 886 |     "print(max_LCP_indexes)\n"
 887 |    ]
 888 |   },
 889 |   {
 890 |    "cell_type": "code",
 891 |    "execution_count": 37,
 892 |    "metadata": {},
 893 |    "outputs": [
 894 |     {
 895 |      "name": "stdout",
 896 |      "output_type": "stream",
 897 |      "text": [
 898 |       "Complementary\n"
 899 |      ]
 900 |     }
 901 |    ],
 902 |    "source": [
 903 |     "from Bio.Seq import Seq\n",
 904 |     "\n",
 905 |     "# Walk i right from the smallest candidate offset until seq[0:i] equals\n",
 906 |     "# rev[i-max_LCP:i] (left arm); then walk j left from the largest offset until\n",
 907 |     "# seq[j:] equals rev[j:j+max_LCP] (right arm).\n",
 908 |     "i, j = min(max_LCP_indexes), max(max_LCP_indexes)\n",
 909 |     "wi = wj = ''\n",
 910 |     "while i < j:\n",
 911 |     "    a, b = i - max_LCP, i - 1\n",
 912 |     "    if a >= 0 and b > a:\n",
 913 |     "        wi = seq[0:i]\n",
 914 |     "        if wi == rev[a:b+1]:\n",
 915 |     "            wi = wi[::-1]   # stored reversed for the complement comparison below\n",
 916 |     "            break\n",
 917 |     "    i += 1\n",
 918 |     "\n",
 919 |     "while i < j:\n",
 920 |     "    a = j + max_LCP - 1\n",
 921 |     "    if a < len(seq):\n",
 922 |     "        wj = seq[j:]\n",
 923 |     "        if wj == rev[j:a+1]:\n",
 924 |     "            break\n",
 925 |     "    j -= 1\n",
 926 |     "# Wrap the right arm in Seq so that complement() is available for the comparison.\n",
 927 |     "wj = Seq(wj)\n",
 928 |     "flag = -1   # stays -1 unless the two arms turn out to be complementary\n",
 929 |     "if wi == wj.complement():\n",
 930 |     "    print('Complementary')\n",
 931 |     "    flag = 1"
 932 |    ]
 933 |   },
 934 |   {
 935 |    "cell_type": "markdown",
 936 |    "metadata": {},
 937 |    "source": [
 938 |     "Verifying Spacer Length Constraints"
 939 |    ]
 940 |   },
 941 |   {
 942 |    "cell_type": "code",
 943 |    "execution_count": 38,
 944 |    "metadata": {},
 945 |    "outputs": [
 946 |     {
 947 |      "name": "stdout",
 948 |      "output_type": "stream",
 949 |      "text": [
 950 |       "The sequence GCATTTATGC is long armed gapped palindrome\n",
 951 |       "spacer length =  2\n",
 952 |       "palindrome arm length =  4\n"
 953 |      ]
 954 |     }
 955 |    ],
 956 |    "source": [
 957 |     "\n",
 958 |     "if j-i<= max_LCP and flag ==1:\n",
 959 |     "    print('The sequence {} is long armed gapped palindrome'.format(seq))\n",
 960 |     "    print('spacer length = ',j-i)\n",
 961 |     "    print('palindrome arm length = ',len(wi))"
 962 |    ]
 963 |   },
 964 |   {
 965 |    "cell_type": "markdown",
 966 |    "metadata": {},
 967 |    "source": [
 968 |     " "
 969 |    ]
 970 |   },
 971 |   {
 972 |    "cell_type": "markdown",
 973 |    "metadata": {},
 974 |    "source": [
 975 |     "## Algorithm to detect length-constrained gapped palindromes"
 976 |    ]
 977 |   },
 978 |   {
 979 |    "cell_type": "code",
 980 |    "execution_count": 40,
 981 |    "metadata": {},
 982 |    "outputs": [
 983 |     {
 984 |      "name": "stdout",
 985 |      "output_type": "stream",
 986 |      "text": [
 987 |       "Enter the DNA sequence:GCATTTATGC\n",
 988 |       "The inverted suffix array is:\n",
 989 |       "\n",
 990 |       "[1, 10, '#', 0]\n",
 991 |       "[2, 10, '$', 0]\n",
 992 |       "[2, 4, 'AAATGC$', 2]\n",
 993 |       "[2, 5, 'AATGC$', 1]\n",
 994 |       "[2, 2, 'ATAAATGC$', 2]\n",
 995 |       "[1, 6, 'ATGC#', 4]\n",
 996 |       "[2, 6, 'ATGC$', 2]\n",
 997 |       "[1, 2, 'ATTTATGC#', 0]\n",
 998 |       "[1, 9, 'C#', 1]\n",
 999 |       "[2, 9, 'C$', 1]\n",
1000 |       "[2, 1, 'CATAAATGC$', 3]\n",
1001 |       "[1, 1, 'CATTTATGC#', 0]\n",
1002 |       "[1, 8, 'GC#', 2]\n",
1003 |       "[2, 8, 'GC$', 2]\n",
1004 |       "[2, 0, 'GCATAAATGC$', 4]\n",
1005 |       "[1, 0, 'GCATTTATGC#', 0]\n",
1006 |       "[2, 3, 'TAAATGC$', 2]\n",
1007 |       "[1, 5, 'TATGC#', 1]\n",
1008 |       "[1, 7, 'TGC#', 3]\n",
1009 |       "[2, 7, 'TGC$', 1]\n",
1010 |       "[1, 4, 'TTATGC#', 2]\n",
1011 |       "[1, 3, 'TTTATGC#', 0]\n"
1012 |      ]
1013 |     }
1014 |    ],
1015 |    "source": [
1016 |     "# complement, add_charectors and Construct_SuffixArray are already defined in\n",
1017 |     "# the long-armed gapped palindrome section above; this section only re-reads the\n",
1018 |     "# input and rebuilds the suffix table instead of redefining the same helpers.\n",
1019 |     "seq,rev=complement(input('Enter the DNA sequence:')) # GCATTTATGC , CGCTGTAGCG, \n",
1020 |     "seq1,rev1=add_charectors(seq,rev)\n",
1021 |     "inverted_sa= Construct_SuffixArray(seq1,rev1)\n",
1022 |     "print('The inverted suffix array is:\\n')\n",
1023 |     "for i in inverted_sa:\n",
1024 |     "    print(i)\n"
1084 |    ]
1085 |   },
1086 |   {
1087 |    "cell_type": "code",
1088 |    "execution_count": 41,
1089 |    "metadata": {},
1090 |    "outputs": [],
1091 |    "source": [
1092 |     "# These are constants that are predefined\n",
1093 |     "minarm = 3\n",
1094 |     "maxgap = 5\n",
1095 |     "mingap = 3         "
1096 |    ]
1097 |   },
1098 |   {
1099 |    "cell_type": "code",
1100 |    "execution_count": 42,
1101 |    "metadata": {},
1102 |    "outputs": [
1103 |     {
1104 |      "name": "stdout",
1105 |      "output_type": "stream",
1106 |      "text": [
1107 |       "[1, 7]\n"
1108 |      ]
1109 |     }
1110 |    ],
1111 |    "source": [
1112 |     "# Offsets of every suffix-table row whose LCP with the next row is exactly\n",
1113 |     "# minarm (longer shared prefixes are not selected by this equality test).\n",
1114 |     "minarm_indexes = [row[1] for row in inverted_sa\n",
1115 |     "                  if row[3] == minarm]\n",
1116 |     "print(minarm_indexes)"
1117 |    ]
1118 |   },
1119 |   {
1120 |    "cell_type": "markdown",
1121 |    "metadata": {},
1122 |    "source": [
1123 |     "Verifying Palindrome Arm Constraints"
1124 |    ]
1125 |   },
1126 |   {
1127 |    "cell_type": "code",
1128 |    "execution_count": 43,
1129 |    "metadata": {},
1130 |    "outputs": [
1131 |     {
1132 |      "name": "stdout",
1133 |      "output_type": "stream",
1134 |      "text": [
1135 |       "Complementary\n"
1136 |      ]
1137 |     }
1138 |    ],
1139 |    "source": [
1140 |     "# Walk i right from the smallest candidate offset until seq[0:i] equals\n",
1141 |     "# rev[i-minarm:i] (left arm); then walk j left from the largest offset until\n",
1142 |     "# seq[j:] equals rev[j:j+minarm] (right arm).\n",
1143 |     "i, j = min(minarm_indexes), max(minarm_indexes)\n",
1144 |     "wi = wj = ''\n",
1145 |     "while i < j:\n",
1146 |     "    a, b = i - minarm, i - 1\n",
1147 |     "    if a >= 0 and b > a:\n",
1148 |     "        wi = seq[0:i]\n",
1149 |     "        if wi == rev[a:b+1]:\n",
1150 |     "            wi = wi[::-1]\n",
1151 |     "            break\n",
1152 |     "    i += 1\n",
1153 |     "\n",
1154 |     "while i < j:\n",
1155 |     "    a = j + minarm - 1\n",
1156 |     "    if a < len(seq):\n",
1157 |     "        wj = seq[j:]\n",
1158 |     "        if wj == rev[j:a+1]:\n",
1159 |     "            break\n",
1160 |     "    j -= 1\n",
1161 |     "wj = Seq(wj)\n",
1162 |     "# Reset flag on every run: the misspelled 'falg' left a stale value from the\n",
1163 |     "# earlier long-armed check (or no value at all on a fresh kernel).\n",
1164 |     "flag = -1\n",
1165 |     "if wi == wj.complement():\n",
1166 |     "    print('Complementary')\n",
1167 |     "    flag = 1"
1168 |    ]
1169 |   },
1170 |   {
1171 |    "cell_type": "markdown",
1172 |    "metadata": {},
1173 |    "source": [
1174 |     "Verifying Spacer Length Constraints"
1175 |    ]
1176 |   },
1177 |   {
1178 |    "cell_type": "code",
1179 |    "execution_count": 44,
1180 |    "metadata": {},
1181 |    "outputs": [
1182 |     {
1183 |      "name": "stdout",
1184 |      "output_type": "stream",
1185 |      "text": [
1186 |       "The sequence GCATTTATGC is length constrained gapped palindrome\n",
1187 |       "spacer length =  4\n",
1188 |       "palindrome arm length =  3\n"
1189 |      ]
1190 |     }
1191 |    ],
1192 |    "source": [
1193 |     "if mingap <= j-i and j-i <=maxgap and flag ==1:\n",
1194 |     "    print('The sequence {} is length constrained gapped palindrome'.format(seq))\n",
1195 |     "    print('spacer length = ',j-i)\n",
1196 |     "    print('palindrome arm length = ',len(wi))"
1197 |    ]
1198 |   },
1199 |   {
1200 |    "cell_type": "code",
1201 |    "execution_count": null,
1202 |    "metadata": {},
1203 |    "outputs": [],
1204 |    "source": []
1205 |   }
1206 |  ],
1207 |  "metadata": {
1208 |   "kernelspec": {
1209 |    "display_name": "Python 3",
1210 |    "language": "python",
1211 |    "name": "python3"
1212 |   },
1213 |   "language_info": {
1214 |    "codemirror_mode": {
1215 |     "name": "ipython",
1216 |     "version": 3
1217 |    },
1218 |    "file_extension": ".py",
1219 |    "mimetype": "text/x-python",
1220 |    "name": "python",
1221 |    "nbconvert_exporter": "python",
1222 |    "pygments_lexer": "ipython3",
1223 |    "version": "3.10.1"
1224 |   }
1225 |  },
1226 |  "nbformat": 4,
1227 |  "nbformat_minor": 4
1228 | }
1229 | 


--------------------------------------------------------------------------------