├── GUI ├── mainform.lfm ├── mainform.pas ├── optionlists.txt ├── pycoevolgui.ico ├── pycoevolgui.lpi ├── pycoevolgui.lpr └── pycoevolgui.res ├── Matrix ├── BLOSUM62 ├── CLM ├── CPVN ├── MCLACHLAN ├── PAM250 └── VOL ├── Parameters.py ├── Params.config ├── Pycoevol.py ├── Pycoevol_paper.pdf ├── Pycoevol_userguide.pdf ├── README.md ├── Results └── output_results ├── SIFTS └── Database version ├── refseq_protein.pal └── src ├── ALIGN.py ├── BLAST.py ├── COEVOL.py ├── INFO.py ├── MAIN.py ├── ORGANISM.py ├── SASA.py ├── SEQ.py ├── UTILS.py ├── __init__.py └── tools ├── blast+ ├── db │ └── refseq_protein.pal └── psiblast_here ├── clustalw └── clustalw_here ├── mafft └── mafft_here └── muscle └── muscle_here /GUI/mainform.lfm: -------------------------------------------------------------------------------- 1 | object Form1: TForm1 2 | Left = 628 3 | Height = 604 4 | Top = 203 5 | Width = 843 6 | Caption = 'Pycoevol' 7 | ClientHeight = 604 8 | ClientWidth = 843 9 | OnActivate = FormActivate 10 | OnClose = FormClose 11 | OnCreate = FormCreate 12 | LCLVersion = '0.9.30.4' 13 | object RunPyCoBt: TButton 14 | Left = 640 15 | Height = 25 16 | Top = 168 17 | Width = 192 18 | Caption = 'Run Pycoevol' 19 | OnClick = RunPyCoBtClick 20 | TabOrder = 0 21 | end 22 | object PycoFolderEd: TEdit 23 | Left = 112 24 | Height = 28 25 | Top = 9 26 | Width = 672 27 | TabOrder = 1 28 | end 29 | object Label1: TLabel 30 | Left = 8 31 | Height = 21 32 | Top = 16 33 | Width = 104 34 | Caption = 'Pycoevol folder' 35 | ParentColor = False 36 | OnClick = Label1Click 37 | end 38 | object PycoFolderBrowseBt: TButton 39 | Left = 791 40 | Height = 25 41 | Top = 7 42 | Width = 32 43 | Caption = '...' 
44 | OnClick = PycoFolderBrowseBtClick 45 | TabOrder = 2 46 | end 47 | object ParamFileEd: TEdit 48 | Left = 112 49 | Height = 28 50 | Top = 37 51 | Width = 672 52 | TabOrder = 3 53 | end 54 | object Label2: TLabel 55 | Left = 8 56 | Height = 21 57 | Top = 44 58 | Width = 100 59 | Caption = 'Parameters file' 60 | ParentColor = False 61 | end 62 | object ParamFileBrowseBt: TButton 63 | Left = 791 64 | Height = 25 65 | Top = 35 66 | Width = 32 67 | Caption = '...' 68 | OnClick = ParamFileBrowseBtClick 69 | TabOrder = 4 70 | end 71 | object PsiblastCb: TComboBox 72 | Left = 8 73 | Height = 28 74 | Top = 194 75 | Width = 140 76 | ItemHeight = 20 77 | TabOrder = 5 78 | Text = 'PsiblastCb' 79 | end 80 | object AlignmentCb: TComboBox 81 | Left = 160 82 | Height = 28 83 | Top = 194 84 | Width = 140 85 | ItemHeight = 20 86 | TabOrder = 6 87 | Text = 'PsiblastCb' 88 | end 89 | object CoevolutionCb: TComboBox 90 | Left = 312 91 | Height = 28 92 | Top = 194 93 | Width = 140 94 | ItemHeight = 20 95 | TabOrder = 7 96 | Text = 'PsiblastCb' 97 | end 98 | object Label3: TLabel 99 | Left = 8 100 | Height = 21 101 | Top = 172 102 | Width = 99 103 | Caption = 'Psiblast option' 104 | ParentColor = False 105 | end 106 | object Label4: TLabel 107 | Left = 160 108 | Height = 21 109 | Top = 172 110 | Width = 118 111 | Caption = 'Alignment option' 112 | ParentColor = False 113 | end 114 | object Label5: TLabel 115 | Left = 312 116 | Height = 21 117 | Top = 172 118 | Width = 141 119 | Caption = 'Coevolution measure' 120 | ParentColor = False 121 | end 122 | object File1Ed: TEdit 123 | Left = 112 124 | Height = 28 125 | Top = 98 126 | Width = 488 127 | TabOrder = 8 128 | end 129 | object Label6: TLabel 130 | Left = 8 131 | Height = 21 132 | Top = 100 133 | Width = 60 134 | Caption = 'Protein 1' 135 | ParentColor = False 136 | end 137 | object File1BrowseBt: TButton 138 | Left = 600 139 | Height = 25 140 | Top = 96 141 | Width = 32 142 | Caption = '...' 
143 | OnClick = File1BrowseBtClick 144 | TabOrder = 9 145 | end 146 | object File2Ed: TEdit 147 | Left = 112 148 | Height = 28 149 | Top = 126 150 | Width = 488 151 | TabOrder = 10 152 | end 153 | object Label7: TLabel 154 | Left = 8 155 | Height = 21 156 | Top = 128 157 | Width = 60 158 | Caption = 'Protein 2' 159 | ParentColor = False 160 | end 161 | object File2BrowseBt: TButton 162 | Left = 600 163 | Height = 25 164 | Top = 124 165 | Width = 32 166 | Caption = '...' 167 | OnClick = File2BrowseBtClick 168 | TabOrder = 11 169 | end 170 | object Label8: TLabel 171 | Left = 352 172 | Height = 21 173 | Top = 80 174 | Width = 24 175 | Caption = 'File' 176 | ParentColor = False 177 | end 178 | object Chain1Ed: TEdit 179 | Left = 640 180 | Height = 28 181 | Top = 98 182 | Width = 48 183 | TabOrder = 12 184 | end 185 | object Chain2Ed: TEdit 186 | Left = 640 187 | Height = 28 188 | Top = 126 189 | Width = 48 190 | TabOrder = 13 191 | end 192 | object Label9: TLabel 193 | Left = 648 194 | Height = 21 195 | Top = 80 196 | Width = 38 197 | Caption = 'Chain' 198 | ParentColor = False 199 | end 200 | object Id1Ed: TEdit 201 | Left = 688 202 | Height = 28 203 | Top = 98 204 | Width = 144 205 | TabOrder = 14 206 | end 207 | object Id2Ed: TEdit 208 | Left = 688 209 | Height = 28 210 | Top = 126 211 | Width = 144 212 | TabOrder = 15 213 | end 214 | object Label10: TLabel 215 | Left = 736 216 | Height = 21 217 | Top = 80 218 | Width = 37 219 | Caption = 'Label' 220 | ParentColor = False 221 | end 222 | object PycoMm: TMemo 223 | Left = 10 224 | Height = 360 225 | Top = 240 226 | Width = 822 227 | ReadOnly = True 228 | ScrollBars = ssVertical 229 | TabOrder = 16 230 | end 231 | object PythonClEd: TEdit 232 | Left = 480 233 | Height = 28 234 | Top = 194 235 | Width = 128 236 | TabOrder = 17 237 | Text = 'python' 238 | end 239 | object Label11: TLabel 240 | Left = 480 241 | Height = 21 242 | Top = 172 243 | Width = 120 244 | Caption = 'Python interpreter' 245 | ParentColor = False 
{*******************************************************************************
This file is part of Pycoevol.
This work is public domain. Enjoy.
********************************************************************************
Author: Ludwig Krippahl
Date: 21.4.2012
Purpose:
  Pycoevol GUI. Collects the run options from the form, builds the command
  line for Pycoevol.py, launches it with the chosen Python interpreter and
  shows the captured output in a memo.
Requirements:
Revisions:
To do:
*******************************************************************************}

unit mainform;

{$mode objfpc}{$H+}

interface

uses
  Classes, SysUtils, FileUtil, Forms, Controls, Graphics, Dialogs, StdCtrls,
  Grids, Process, Pipes, INIFiles;

type

  { TForm1 }

  TForm1 = class(TForm)
    StopPycoBt: TButton;
    PythonClEd: TEdit;
    Label10: TLabel;
    Label11: TLabel;
    Label6: TLabel;
    Label7: TLabel;
    Label8: TLabel;
    Label9: TLabel;
    PycoMm: TMemo;
    OpenDialog1: TOpenDialog;
    Label3: TLabel;
    Label4: TLabel;
    Label5: TLabel;
    File2BrowseBt: TButton;
    File2Ed: TEdit;
    Chain2Ed: TEdit;
    Id2Ed: TEdit;
    PsiblastCb: TComboBox;
    Label2: TLabel;
    AlignmentCb: TComboBox;
    CoevolutionCb: TComboBox;
    PycoFolderBrowseBt: TButton;
    ParamFileBrowseBt: TButton;
    File1BrowseBt: TButton;
    PycoFolderEd: TEdit;
    Label1: TLabel;
    ParamFileEd: TEdit;
    File1Ed: TEdit;
    Chain1Ed: TEdit;
    Id1Ed: TEdit;
    RunPyCoBt: TButton;
    SelectDirectoryDialog1: TSelectDirectoryDialog;
    procedure File1BrowseBtClick(Sender: TObject);
    procedure File2BrowseBtClick(Sender: TObject);
    procedure FormActivate(Sender: TObject);
    procedure FormClose(Sender: TObject; var CloseAction: TCloseAction);
    procedure FormCreate(Sender: TObject);
    procedure Label1Click(Sender: TObject);
    procedure ParamFileBrowseBtClick(Sender: TObject);
    procedure ParamFileEditClick(Sender: TObject);
    procedure PycoFolderBrowseBtClick(Sender: TObject);
    procedure RunPyCoBtClick(Sender: TObject);
    procedure StopPycoBtClick(Sender: TObject);
  private
    { private declarations }
    FTerminatePyCo:Boolean;   // set to request that the running child process is stopped
    Init:Boolean;             // True until the configuration was loaded on first activation
    function GetCommandLine:string;
    procedure LoadLists;
    procedure SaveConfiguration;
    procedure LoadConfiguration;
    procedure RunPycoevol;
  public
    { public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.lfm}

{ TForm1 }

// Fills the option combo boxes and arms the one-time configuration load.
procedure TForm1.FormCreate(Sender: TObject);
begin
  LoadLists;
  Init:=True;
end;

// Empty handler; kept because the form resource (Label1.OnClick) refers to it.
procedure TForm1.Label1Click(Sender: TObject);
begin

end;

// Restores the saved configuration the first time the form is activated.
procedure TForm1.FormActivate(Sender: TObject);
begin
  if Init then
  begin
    LoadConfiguration;
    Init:=False;
  end;
end;

// Lets the user pick the file for protein 2.
procedure TForm1.File2BrowseBtClick(Sender: TObject);
begin
  OpenDialog1.Filter:='PDB file|*.pdb|Sequence|*.fasta|Any|*.*';
  if OpenDialog1.Execute then
    File2Ed.Text:=OpenDialog1.FileName;
end;

// Lets the user pick the file for protein 1.
procedure TForm1.File1BrowseBtClick(Sender: TObject);
begin
  OpenDialog1.Filter:='PDB file|*.pdb|Sequence|*.fasta|Any|*.*';
  if OpenDialog1.Execute then
    File1Ed.Text:=OpenDialog1.FileName;
end;

// Saves the form state on close. The terminate flag is raised as well so that
// a polling loop still active in RunPycoevol stops the child process instead
// of running on after the window is gone.
procedure TForm1.FormClose(Sender: TObject; var CloseAction: TCloseAction);
begin
  FTerminatePyCo:=True;
  SaveConfiguration;
end;

// Lets the user pick the parameters (.config) file.
procedure TForm1.ParamFileBrowseBtClick(Sender: TObject);
begin
  OpenDialog1.Filter:='Parameter file|*.config|Any|*.*';
  if OpenDialog1.Execute then
    ParamFileEd.Text:=OpenDialog1.FileName;
end;

// Launches the parameters file path as a command line. Does nothing when no
// file is set; the process object is released even if Execute raises.
procedure TForm1.ParamFileEditClick(Sender: TObject);

var proc:TProcess;

begin
  if ParamFileEd.Text='' then Exit;
  proc:=TProcess.Create(nil);
  try
    proc.CommandLine:='"'+ParamFileEd.Text+'"';
    proc.Execute;
  finally
    proc.Free;
  end;
end;

// Lets the user pick the folder that contains Pycoevol.py.
procedure TForm1.PycoFolderBrowseBtClick(Sender: TObject);
begin
  if SelectDirectoryDialog1.Execute then
    PycoFolderEd.Text:=SelectDirectoryDialog1.FileName;
end;

// Runs Pycoevol. While the run is active the Run button is disabled and shows
// 'Busy...', and the Stop button is enabled; both are restored afterwards,
// also when the run raises.
procedure TForm1.RunPyCoBtClick(Sender: TObject);

var oc:string;

begin
  RunPyCoBt.Enabled:=False;
  StopPyCoBt.Enabled:=True;
  FTerminatePyco:=False;
  oc:=RunPyCoBt.Caption;
  RunPyCoBt.Caption:='Busy...';
  Application.ProcessMessages;
  try
    RunPycoevol;
  finally
    RunPyCoBt.Caption:=oc;
    RunPyCoBt.Enabled:=True;
    // Nothing is left to stop once the run has ended.
    StopPyCoBt.Enabled:=False;
  end;
end;

// Requests termination; the polling loop in RunPycoevol acts on the flag.
procedure TForm1.StopPycoBtClick(Sender: TObject);
begin
  StopPyCoBt.Enabled:=False;
  FTerminatePyco:=True;
end;

// Builds the command line from the form fields:
//   <python> "<folder>/Pycoevol.py" ["file1" "file2"] [-x<c1> -x<c2>]
//   [-i<id1> -i<id2>] -b<psiblast> -a<alignment> -c<coevolution> [-p"<params>"]
// The file, chain and label pairs are only added when both members are set.
function TForm1.GetCommandLine: string;
begin

  //TODO: Check for spaces in parameters??

  // IncludeTrailingPathDelimiter avoids a doubled delimiter when the folder
  // already ends in one.
  Result:=PythonClEd.Text+' "'+IncludeTrailingPathDelimiter(PycoFolderEd.Text)+'Pycoevol.py"';
  if (File1Ed.Text<>'') and (File2Ed.Text<>'') then
    Result:=Result+' "'+File1Ed.Text+'" "'+File2Ed.Text+'"';
  if (Chain1Ed.Text<>'') and (Chain2Ed.Text<>'') then
    Result:=Result+' -x'+Chain1Ed.Text+' -x'+Chain2Ed.Text;
  if (Id1Ed.Text<>'') and (Id2Ed.Text<>'') then
    Result:=Result+' -i'+Id1Ed.Text+' -i'+Id2Ed.Text;
  Result:=Result+' -b'+PsiblastCb.Text+' -a'+AlignmentCb.Text+' -c'+CoevolutionCb.Text;
  if (ParamFileEd.text<>'') then
    Result:=Result+' -p"'+ParamFileEd.text+'"';
end;

// Fills the three option combo boxes from optionlists.txt. A line starting a
// section ('**coevolution', '**alignment', '**psiblast') selects the target
// box; every other non-blank line is added to the currently selected box.
// The file is looked up in the current directory first and next to the
// executable otherwise; a missing file still raises from LoadFromFile.
procedure TForm1.LoadLists;

const
  ListFile = 'optionlists.txt';

var
  sl:TStringList;
  f:Integer;
  s,listname:string;
  currbox:TComboBox;

  // Selects the first entry, if the box has any.
  procedure SelectFirst(box:TComboBox);
  begin
    if box.Items.Count>0 then box.ItemIndex:=0;
  end;

begin
  listname:=ListFile;
  if not FileExists(listname) then
    listname:=ExtractFilePath(Application.ExeName)+ListFile;
  // Entries that appear before the first section header have no target box
  // and are skipped instead of being written through an uninitialised variable.
  currbox:=nil;
  sl:=TStringList.Create;
  try
    sl.LoadFromFile(listname);
    for f:=0 to sl.Count-1 do
      begin
      s:=Trim(sl.Strings[f]);
      if s='' then Continue;
      if s='**coevolution' then currbox:=CoevolutionCB
      else if s='**alignment' then currbox:=AlignmentCB
      else if s='**psiblast' then currbox:=PsiblastCB
      else if currbox<>nil then
        currbox.Items.Add(s);
      end;
  finally
    sl.Free;
  end;
  SelectFirst(CoevolutionCb);
  SelectFirst(AlignmentCb);
  SelectFirst(PsiblastCb);
end;

// Writes the folder, file, option and interpreter fields to the application
// configuration file, creating its directory when needed.
procedure TForm1.SaveConfiguration;

var
  cfg,cfgdir:string;
  ini:TIniFile;

begin
  cfg := GetAppConfigFile(False);
  cfgdir := ExtractFileDir(cfg);
  // ForceDirectories also creates missing parent directories.
  if (cfgdir<>'') and not DirectoryExists(cfgdir) then
    ForceDirectories(cfgdir);
  ini:=TiniFile.Create(cfg);
  try
    ini.WriteString('Form','PycoevolFolder',PycoFolderEd.Text);
    ini.WriteString('Form','ParametersFile',ParamFileEd.Text);
    ini.WriteString('Form','File1',File1Ed.Text);
    ini.WriteString('Form','File2',File2Ed.Text);
    ini.WriteString('Form','Psiblast',PsiblastCb.Text);
    ini.WriteString('Form','Alignment',AlignmentCB.Text);
    ini.WriteString('Form','Coevolution',CoevolutionCB.Text);
    ini.WriteString('Form','Python',PythonClEd.Text);
    ini.UpdateFile;
  finally
    ini.Free;
  end;
end;

// Restores the fields written by SaveConfiguration. Unknown or missing option
// values fall back to the first list entry; a missing interpreter entry keeps
// the value already shown in the form.
procedure TForm1.LoadConfiguration;

var
  cfg:string;
  ini:TIniFile;

  // Selects the stored value in the box, or the first entry when it is unknown.
  procedure SelectStored(box:TComboBox; const key:string);
  begin
    box.ItemIndex:=box.Items.IndexOf(ini.ReadString('Form',key,''));
    if (box.ItemIndex<0) and (box.Items.Count>0) then box.ItemIndex:=0;
  end;

begin
  cfg := GetAppConfigFile(False);
  if FileExists(cfg) then
    begin
    ini:=TIniFile.Create(cfg);
    try
      PycoFolderEd.Text:=ini.ReadString('Form','PycoevolFolder','');
      ParamFileEd.Text:=ini.ReadString('Form','ParametersFile','');
      File1Ed.Text:=ini.ReadString('Form','File1','');
      File2Ed.Text:=ini.ReadString('Form','File2','');
      PythonClEd.Text:=ini.ReadString('Form','Python',PythonClEd.Text);

      SelectStored(PsiblastCb,'Psiblast');
      SelectStored(AlignmentCB,'Alignment');
      SelectStored(CoevolutionCB,'Coevolution');
    finally
      ini.Free;
    end;
    end;
end;

// Runs Pycoevol.py as a child process from the Pycoevol folder and polls its
// stdout/stderr pipes every 500 ms, appending what was read to the memo.
// The child is terminated when FTerminatePyCo is set.
procedure TForm1.RunPycoevol;


var
  proc:TProcess;
  cl:string;

  // Moves the caret to the end of the memo so the newest output is visible.
  procedure ScrollToEnd;
  begin
    PycoMm.SelStart := Length(PycoMm.Lines.Text)-1;
    PycoMm.SelLength:=0;
    // SetFocus raises when the memo cannot take focus (e.g. form hidden).
    if PycoMm.CanFocus then PycoMm.SetFocus;
  end;

  // Appends everything currently available on the pipe to the memo, preceded
  // by the banner line (if any) when there is something to read.
  procedure Drain(pipe:TInputPipeStream; const banner:string);

  var
    chunk:string;
    avail,got:LongInt;

  begin
    avail := pipe.NumBytesAvailable;
    if (avail>0) and (banner<>'') then PycoMm.Lines.Add(banner);
    while avail > 0 do
      begin
      SetLength(chunk,avail);
      // Read may deliver fewer bytes than requested; keep only what arrived.
      got := pipe.Read(chunk[1], avail);
      if got<=0 then Break;
      SetLength(chunk,got);
      PycoMm.Lines.Add(chunk);
      avail := pipe.NumBytesAvailable;
      end;
  end;

  procedure RefreshOutput;
  begin
    Drain(proc.Output,'');
    Drain(proc.Stderr,'*** ERROR ***');
    ScrollToEnd;
    Application.ProcessMessages;
    if FTerminatePyCo and proc.Running then proc.Terminate(0);
  end;

begin
  if not SetCurrentDir(PyCoFolderEd.Text) then
    PycoMm.Lines.Add('*** WARNING *** Could not change to folder "'+PyCoFolderEd.Text+'"');
  cl:=GetCommandLine;
  proc:=TProcess.Create(nil);
  try
    proc.CommandLine := cl;
    proc.Options := [poUsePipes];
    PycoMm.Lines.Add(cl);
    Application.ProcessMessages;
    proc.Execute;
    while proc.Running do
      begin
      Application.ProcessMessages;
      RefreshOutput;
      Sleep(500);
      end;
    // Collect what was written between the last poll and process exit.
    RefreshOutput;
  finally
    proc.Free;
  end;
  if FTerminatePyCo then PycoMm.Lines.Add('Terminated by user')
  else PycoMm.Lines.Add('Done');
  ScrollToEnd;

end;

end.
{ Program entry point of the Pycoevol GUI: initialises the application,
  creates the main form (TForm1 from unit mainform) and runs the application. }
program pycoevolgui;

{$mode objfpc}{$H+}

uses
  // cthreads is only pulled in on UNIX targets when UseCThreads is defined.
  {$IFDEF UNIX}{$IFDEF UseCThreads}
  cthreads,
  {$ENDIF}{$ENDIF}
  Interfaces, // this includes the LCL widgetset
  Forms, mainform
  { you can add units after this };

{$R *.res}

begin
  Application.Initialize;
  // Instantiates TForm1 into the global Form1 variable.
  Application.CreateForm(TForm1, Form1);
  Application.Run;
end.
20 | 21 | -------------------------------------------------------------------------------- /GUI/pycoevolgui.res: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/GUI/pycoevolgui.res -------------------------------------------------------------------------------- /Matrix/BLOSUM62: -------------------------------------------------------------------------------- 1 | 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 2 | -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 3 | -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 4 | -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 5 | 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 6 | -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 7 | -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 8 | 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 9 | -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 10 | -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 11 | -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 12 | -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 13 | -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 14 | -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 15 | -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 16 | 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 17 | 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 18 | -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 1 2 -3 -4 -3 -2 -4 19 | -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 20 | 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 21 | -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 22 | -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 
-1 0 -1 -3 -2 -2 1 4 -1 -4 23 | 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 24 | -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -1 -------------------------------------------------------------------------------- /Matrix/CLM: -------------------------------------------------------------------------------- 1 | 0.50 0.90 0.30 0.30 1.40 0.30 0.60 1.30 0.30 1.30 1.10 0.40 0.50 0.50 0.50 0.40 0.60 1.10 1.40 1.30 2 | 0.90 9.60 0.60 0.50 2.60 0.70 1.80 1.80 0.60 1.70 1.50 0.70 1.00 0.80 0.80 0.80 0.80 1.70 2.70 2.20 3 | 0.30 0.60 0.50 0.30 0.50 0.40 1.10 0.50 1.00 0.40 0.50 0.70 0.40 0.50 1.50 0.60 0.50 0.40 1.10 1.20 4 | 0.30 0.40 0.40 0.40 0.80 0.30 1.00 0.60 1.10 0.60 0.70 0.60 0.50 0.60 1.50 0.50 0.60 0.50 1.20 1.30 5 | 1.40 2.40 0.60 0.70 4.30 0.90 1.40 3.30 0.70 3.20 2.80 0.80 1.30 1.10 1.40 0.90 1.10 2.60 4.10 3.10 6 | 0.40 0.60 0.40 0.30 1.00 0.40 0.60 0.60 0.50 0.60 0.70 0.60 0.40 0.50 0.70 0.40 0.50 0.60 1.10 1.00 7 | 0.70 1.50 1.40 1.10 1.20 0.70 2.20 1.10 0.70 1.20 1.60 0.80 0.60 0.90 1.10 1.00 1.00 0.90 2.30 1.90 8 | 1.40 1.70 0.50 0.60 3.40 0.60 1.10 3.50 0.60 3.20 2.60 0.60 0.90 0.80 1.00 0.70 1.10 2.70 3.20 2.80 9 | 0.30 0.60 1.00 1.10 0.80 0.40 0.60 0.60 0.30 0.60 0.60 0.60 0.40 0.70 0.50 0.50 0.50 0.50 0.90 1.10 10 | 1.20 1.60 0.40 0.60 3.70 0.60 1.10 3.30 0.60 3.30 2.70 0.60 0.90 0.90 1.00 0.60 1.00 2.60 3.40 2.50 11 | 1.10 1.60 0.50 0.50 3.40 0.60 1.40 2.40 0.50 2.50 2.90 0.80 1.00 1.00 1.00 0.70 0.80 2.20 3.60 2.80 12 | 0.40 0.70 0.70 0.60 0.80 0.60 0.90 0.60 0.60 0.60 0.90 0.90 0.70 0.90 1.00 0.60 0.80 0.60 1.30 1.20 13 | 0.50 1.00 0.50 0.50 1.40 0.30 1.00 0.80 0.40 0.90 1.10 0.70 0.70 0.70 0.90 0.50 0.60 0.80 2.20 1.70 14 | 0.40 0.80 0.60 0.50 1.10 0.60 1.00 0.80 0.70 0.80 0.90 0.80 0.70 0.80 1.60 0.50 0.80 0.70 1.50 1.20 15 | 0.50 0.90 1.70 1.60 1.50 0.70 1.40 1.10 0.50 1.10 1.10 1.00 0.90 1.00 1.00 0.90 0.90 1.00 1.90 1.90 16 | 0.30 0.60 0.50 0.50 0.80 0.30 0.90 0.60 0.50 0.60 0.60 0.60 0.40 0.70 0.70 
0.40 0.60 0.60 1.20 0.90 17 | 0.50 0.90 0.60 0.70 1.10 0.40 1.00 1.10 0.60 1.00 1.00 0.60 0.60 0.80 0.90 0.70 0.80 0.90 1.40 1.20 18 | 1.10 1.40 0.40 0.60 3.00 0.50 1.00 2.60 0.60 2.80 2.20 0.60 0.80 0.80 0.80 0.70 0.90 2.40 2.30 2.00 19 | 1.30 2.40 1.20 1.00 3.60 1.10 2.30 3.00 1.20 3.70 3.70 1.00 1.90 1.60 1.80 1.10 1.10 2.70 4.20 3.40 20 | 1.20 1.80 1.10 1.20 3.30 0.90 1.90 2.20 1.20 2.20 2.60 1.00 1.50 1.30 1.60 1.00 1.10 2.00 3.00 2.50 -------------------------------------------------------------------------------- /Matrix/CPVN: -------------------------------------------------------------------------------- 1 | 3.89 4.91 4.59 5.33 1.76 5.25 2.84 0.77 3.05 1.00 6.24 5.61 3.27 3.38 3.20 3.60 2.30 1.59 3.23 3.80 2 | 4.91 3.74 4.20 4.69 2.89 4.37 2.57 -0.41 2.83 1.42 2.92 3.95 2.90 3.21 3.22 3.22 1.93 1.36 4.45 4.18 3 | 4.59 4.20 4.03 4.86 2.93 5.32 2.77 -0.37 2.07 1.41 5.77 4.19 2.50 4.88 3.12 3.46 1.40 2.31 3.15 4.99 4 | 5.33 4.69 4.86 5.34 3.68 5.28 3.00 0.14 3.34 1.75 5.83 5.83 4.25 3.47 2.87 4.25 0.99 3.11 3.57 4.49 5 | 1.76 2.89 2.93 3.68 7.65 1.84 1.46 -0.25 1.03 2.48 2.14 2.47 2.74 4.12 2.51 1.33 0.24 -0.42 2.05 2.81 6 | 5.25 4.37 5.32 5.28 1.84 6.02 2.30 0.91 2.09 1.61 4.89 4.81 3.38 4.65 3.88 4.18 0.36 2.30 3.93 3.62 7 | 2.84 2.57 2.77 3.00 1.46 2.30 -0.52 -1.77 1.21 0.39 3.37 2.47 1.22 2.59 1.71 1.72 1.13 1.69 2.13 1.90 8 | 0.77 -0.41 -0.37 0.14 -0.25 0.91 -1.77 -4.40 0.21 -1.53 1.42 1.25 -0.51 1.08 -0.89 0.70 -0.08 -0.54 1.33 1.59 9 | 3.05 2.83 2.07 3.34 1.03 2.09 1.21 0.21 1.27 1.91 5.12 3.14 2.65 2.71 2.88 1.82 3.88 2.52 3.67 3.77 10 | 1.00 1.42 1.41 1.75 2.48 1.61 0.39 -1.53 1.91 -0.09 2.87 2.30 1.33 0.80 2.60 2.00 2.94 1.77 2.74 2.82 11 | 6.24 2.92 5.77 5.83 2.14 4.89 3.37 1.42 5.12 2.87 5.85 6.19 7.87 6.46 1.20 1.37 2.62 3.54 5.76 8.57 12 | 5.61 3.95 4.19 5.83 2.47 4.81 2.47 1.25 3.14 2.30 6.19 5.93 4.22 6.05 4.54 2.05 1.76 3.66 5.26 5.28 13 | 3.27 2.90 2.50 4.25 2.74 3.38 1.22 -0.51 2.65 1.33 7.87 4.22 0.60 2.89 3.17 3.50 1.46 3.09 3.75 3.99 14 
| 3.38 3.21 4.88 3.47 4.12 4.65 2.59 1.08 2.71 0.80 6.46 6.05 2.89 5.37 2.30 4.00 5.20 2.38 2.72 4.90 15 | 3.20 3.22 3.12 2.87 2.51 3.88 1.71 -0.89 2.88 2.60 1.20 4.54 3.17 2.30 1.65 1.95 0.08 2.68 5.32 5.75 16 | 3.60 3.22 3.46 4.25 1.33 4.18 1.72 0.70 1.82 2.00 1.37 2.05 3.50 4.00 1.95 2.83 3.26 3.45 3.50 4.50 17 | 2.30 1.93 1.40 0.99 0.24 0.36 1.13 -0.08 3.88 2.94 2.62 1.76 1.46 5.20 0.08 3.26 0.13 3.85 3.90 4.94 18 | 1.59 1.36 2.31 3.11 -0.42 2.30 1.69 -0.54 2.52 1.77 3.54 3.66 3.09 2.38 2.68 3.45 3.85 2.92 3.17 3.85 19 | 3.23 4.45 3.15 3.57 2.05 3.93 2.13 1.33 3.67 2.74 5.76 5.26 3.75 2.72 5.32 3.50 3.90 3.17 3.24 2.29 20 | 3.80 4.18 4.99 4.49 2.81 3.62 1.90 1.59 3.77 2.82 8.57 5.28 3.99 4.90 5.75 4.50 4.94 3.85 2.29 2.87 -------------------------------------------------------------------------------- /Matrix/MCLACHLAN: -------------------------------------------------------------------------------- 1 | 8.0 2.0 3.0 3.0 1.0 3.0 4.0 3.0 3.0 2.0 2.0 3.0 3.0 1.0 4.0 4.0 3.0 1.0 1.0 3.0 2 | 2.0 8.0 3.0 1.0 1.0 5.0 3.0 3.0 5.0 1.0 2.0 5.0 1.0 1.0 3.0 4.0 3.0 3.0 2.0 2.0 3 | 3.0 3.0 8.0 5.0 1.0 4.0 4.0 3.0 4.0 1.0 1.0 4.0 2.0 0.0 1.0 5.0 3.0 0.0 2.0 1.0 4 | 3.0 1.0 5.0 8.0 1.0 4.0 5.0 3.0 4.0 0.0 1.0 3.0 2.0 1.0 3.0 3.0 3.0 0.0 1.0 1.0 5 | 1.0 1.0 1.0 1.0 9.0 0.0 0.0 1.0 3.0 1.0 0.0 0.0 3.0 0.0 0.0 2.0 2.0 2.0 1.0 1.0 6 | 3.0 5.0 4.0 4.0 0.0 8.0 5.0 2.0 4.0 0.0 3.0 4.0 3.0 0.0 3.0 4.0 3.0 2.0 1.0 2.0 7 | 4.0 3.0 4.0 5.0 0.0 5.0 8.0 3.0 2.0 1.0 1.0 4.0 1.0 0.0 4.0 4.0 4.0 1.0 2.0 2.0 8 | 3.0 3.0 3.0 3.0 1.0 2.0 3.0 8.0 2.0 1.0 1.0 3.0 1.0 0.0 3.0 3.0 2.0 1.0 0.0 2.0 9 | 3.0 5.0 4.0 4.0 3.0 4.0 2.0 2.0 8.0 2.0 2.0 4.0 3.0 4.0 3.0 3.0 4.0 3.0 4.0 2.0 10 | 2.0 1.0 1.0 0.0 1.0 0.0 1.0 1.0 2.0 8.0 5.0 1.0 5.0 3.0 1.0 2.0 3.0 3.0 3.0 5.0 11 | 2.0 2.0 1.0 1.0 0.0 3.0 1.0 1.0 2.0 5.0 8.0 2.0 6.0 5.0 1.0 2.0 3.0 3.0 3.0 5.0 12 | 3.0 5.0 4.0 3.0 0.0 4.0 4.0 3.0 4.0 1.0 2.0 8.0 1.0 0.0 3.0 3.0 3.0 1.0 1.0 2.0 13 | 3.0 1.0 2.0 2.0 3.0 3.0 1.0 1.0 3.0 5.0 6.0 1.0 8.0 5.0 1.0 2.0 
3.0 1.0 2.0 4.0 14 | 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 4.0 3.0 5.0 0.0 5.0 9.0 1.0 2.0 1.0 6.0 6.0 3.0 15 | 4.0 3.0 1.0 3.0 0.0 3.0 4.0 3.0 3.0 1.0 1.0 3.0 1.0 1.0 8.0 3.0 3.0 0.0 0.0 2.0 16 | 4.0 4.0 5.0 3.0 2.0 4.0 4.0 3.0 3.0 2.0 2.0 3.0 2.0 2.0 3.0 8.0 5.0 3.0 3.0 2.0 17 | 3.0 3.0 3.0 3.0 2.0 3.0 4.0 2.0 4.0 3.0 3.0 3.0 3.0 1.0 3.0 5.0 8.0 2.0 1.0 3.0 18 | 1.0 3.0 0.0 0.0 2.0 2.0 1.0 1.0 3.0 3.0 3.0 1.0 1.0 6.0 0.0 3.0 2.0 9.0 6.0 2.0 19 | 1.0 2.0 2.0 1.0 1.0 1.0 2.0 0.0 4.0 3.0 3.0 1.0 2.0 6.0 0.0 3.0 1.0 6.0 9.0 3.0 20 | 3.0 2.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 5.0 5.0 2.0 4.0 3.0 2.0 2.0 3.0 2.0 3.0 8.0 -------------------------------------------------------------------------------- /Matrix/PAM250: -------------------------------------------------------------------------------- 1 | 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 2 | -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 3 | 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8 4 | 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1 -8 5 | -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3 -8 6 | 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1 -8 7 | 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1 -8 8 | 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1 -8 9 | -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1 -8 10 | -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1 -8 11 | -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1 -8 12 | -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1 -8 13 | -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1 -8 14 | -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2 -8 15 | 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1 -8 16 | 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0 -8 17 | 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0 -8 18 | -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4 -8 19 | 
-3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2 -8 20 | 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8 21 | 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8 22 | 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8 23 | 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8 24 | -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 -------------------------------------------------------------------------------- /Matrix/VOL: -------------------------------------------------------------------------------- 1 | 2.20 2.04 2.21 2.44 1.89 2.25 1.71 1.55 1.92 1.76 2.68 2.48 1.95 2.21 2.04 2.13 1.89 1.97 2.24 2.42 2 | 2.04 1.88 2.04 2.28 1.73 2.09 1.55 1.39 1.76 1.60 2.52 2.31 1.79 2.05 1.88 1.97 1.73 1.81 2.08 2.25 3 | 2.21 2.04 2.21 2.44 1.90 2.26 1.72 1.56 1.93 1.77 2.69 2.48 1.96 2.22 2.05 2.14 1.90 1.98 2.25 2.42 4 | 2.44 2.28 2.44 2.68 2.13 2.49 1.95 1.79 2.16 2.00 2.92 2.72 2.19 2.45 2.28 2.37 2.13 2.21 2.48 2.65 5 | 1.89 1.73 1.90 2.13 1.58 1.94 1.40 1.24 1.62 1.46 2.37 2.17 1.64 1.90 1.73 1.82 1.58 1.66 1.93 2.11 6 | 2.25 2.09 2.26 2.49 1.94 2.30 1.76 1.60 1.97 1.82 2.73 2.53 2.00 2.26 2.09 2.18 1.94 2.02 2.29 2.47 7 | 1.71 1.55 1.72 1.95 1.40 1.76 1.23 1.07 1.44 1.28 2.20 1.99 1.47 1.72 1.55 1.65 1.41 1.49 1.76 1.93 8 | 1.55 1.39 1.56 1.79 1.24 1.60 1.07 0.91 1.28 1.12 2.04 1.83 1.31 1.56 1.39 1.49 1.25 1.33 1.60 1.77 9 | 1.92 1.76 1.93 2.16 1.62 1.97 1.44 1.28 1.65 1.49 2.41 2.20 1.68 1.94 1.76 1.86 1.62 1.70 1.97 2.14 10 | 1.76 1.60 1.77 2.00 1.46 1.82 1.28 1.12 1.49 1.33 2.25 2.04 1.52 1.78 1.61 1.70 1.46 1.54 1.81 1.98 11 | 2.68 2.52 2.69 2.92 2.37 2.73 2.20 2.04 2.41 2.25 3.17 2.96 2.44 2.69 2.52 2.62 2.37 2.46 2.73 2.90 12 | 2.48 2.31 2.48 2.72 2.17 2.53 1.99 1.83 2.20 2.04 2.96 2.75 2.23 2.49 2.32 2.41 2.17 2.25 2.52 2.69 13 | 1.95 1.79 1.96 2.19 1.64 2.00 1.47 1.31 1.68 1.52 2.44 2.23 1.70 1.96 1.79 1.88 1.64 1.72 2.00 2.17 14 | 2.21 2.05 2.22 2.45 1.90 2.26 1.72 1.56 1.94 
1.78 2.69 2.49 1.96 2.22 2.05 2.14 1.90 1.98 2.25 2.43 15 | 2.04 1.88 2.05 2.28 1.73 2.09 1.55 1.39 1.76 1.61 2.52 2.32 1.79 2.05 1.88 1.97 1.73 1.81 2.08 2.26 16 | 2.13 1.97 2.14 2.37 1.82 2.18 1.65 1.49 1.86 1.70 2.62 2.41 1.88 2.14 1.97 2.06 1.82 1.91 2.18 2.35 17 | 1.89 1.73 1.90 2.13 1.58 1.94 1.41 1.25 1.62 1.46 2.37 2.17 1.64 1.90 1.73 1.82 1.58 1.66 1.94 2.11 18 | 1.97 1.81 1.98 2.21 1.66 2.02 1.49 1.33 1.70 1.54 2.46 2.25 1.72 1.98 1.81 1.91 1.66 1.75 2.02 2.19 19 | 2.24 2.08 2.25 2.48 1.93 2.29 1.76 1.60 1.97 1.81 2.73 2.52 2.00 2.25 2.08 2.18 1.94 2.02 2.29 2.46 20 | 2.42 2.25 2.42 2.65 2.11 2.47 1.93 1.77 2.14 1.98 2.90 2.69 2.17 2.43 2.26 2.35 2.11 2.19 2.46 2.63 21 | -------------------------------------------------------------------------------- /Parameters.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Encoding utf-8 # 3 | # F. Madeira and L. Krippahl, 2012 # 4 | # This code is part of Pycoevol distribution. # 5 | # This work is public domain. 
def SaveParameters(filename):
    """Save the default parameters to *filename* in INI format.

    The values are taken from the module-level defaults and written to the
    sections 'Global', 'Psiblast', 'Clustalw', 'Muscle', 'Mafft' and
    'Results', using the option names that LoadParameters reads back.

    filename -- path of the configuration file to create or overwrite.

    Raises IOError if the file cannot be written.
    """
    # SafeConfigParser.set() only accepts string values (it raises TypeError
    # otherwise), so every value is normalised to its numeric type first and
    # then converted to text. repr() keeps the full precision of floats.
    settings = (
        ('Global', 'SurfaceThreshold', repr(float(surface_threshold))),
        ('Psiblast', 'Evalue', repr(float(psiblast_evalue))),
        ('Psiblast', 'Identity', str(int(psiblast_identity))),
        ('Psiblast', 'Coverage', str(int(psiblast_coverage))),
        ('Psiblast', 'Threading', str(psiblast_threading)),
        ('Global', 'PairwiseDistance', str(pairwise_distance)),
        ('Clustalw', 'GapOpening', repr(float(clustalw_gap_opening))),
        ('Clustalw', 'GapExtension', repr(float(clustalw_gap_extension))),
        ('Clustalw', 'Matrix', str(clustalw_distance_matrix)),
        ('Global', 'Matrix', str(alignscore_matrix)),
        ('Global', 'TheilSenCutoff', repr(float(theilsen_cutoff))),
        ('Muscle', 'MaxIteration', str(int(muscle_max_iteration))),
        ('Mafft', 'Configuration', str(mafft_configuration)),
        ('Mafft', 'Threading', str(mafft_threading)),
        ('Global', 'AlphabetReduction', str(alphabet_reduction)),
        ('Global', 'AlignmentScore', str(alignment_score)),
        ('Results', 'Best', str(int(best_results))),
        ('Results', 'Histogram', str(results_histogram)),
        ('Results', 'Heatmap', str(results_heatmap)),
        ('Results', 'Structure', str(results_structure)),
        ('Results', 'Sifts', str(results_sifts)),
    )

    parser = SafeConfigParser()
    for section in ('Global', 'Psiblast', 'Clustalw', 'Muscle', 'Mafft',
                    'Results'):
        parser.add_section(section)
    for section, option, value in settings:
        parser.set(section, option, value)

    # The with-block closes the file even if writing fails.
    with open(filename, 'w') as fil:
        parser.write(fil)
pairwise_distance = parser.get('Global', 'PairwiseDistance') 94 | return pairwise_distance 95 | elif option == "clustalw_gap_opening": 96 | clustalw_gap_opening = parser.getfloat('Clustalw', 'GapOpening') 97 | return clustalw_gap_opening 98 | elif option == "clustalw_gap_extension": 99 | clustalw_gap_extension = parser.getfloat('Clustalw', 'GapExtension') 100 | return clustalw_gap_extension 101 | elif option == "clustalw_distance_matrix": 102 | clustalw_distance_matrix = parser.get('Clustalw', 'Matrix') 103 | return clustalw_distance_matrix 104 | elif option == "alignscore_matrix": 105 | alignscore_matrix = parser.get('Global', 'Matrix') 106 | return alignscore_matrix 107 | elif option == "theilsen_cutoff": 108 | theilsen_cutoff = parser.getfloat('Global', 'TheilSenCutoff') 109 | return theilsen_cutoff 110 | elif option == "muscle_max_iteration": 111 | muscle_max_iteration = parser.getint('Muscle', 'MaxIteration') 112 | return muscle_max_iteration 113 | elif option == "mafft_configuration": 114 | mafft_configuration = parser.get('Mafft', 'Configuration') 115 | return mafft_configuration 116 | elif option == "mafft_threading": 117 | mafft_threading = parser.get('Mafft', 'Threading') 118 | return mafft_threading 119 | elif option == "alphabet_reduction": 120 | alphabet_reduction = parser.get('Global', 'AlphabetReduction') 121 | return alphabet_reduction 122 | elif option == "alignment_score": 123 | alignment_score = parser.get('Global', 'AlignmentScore') 124 | return alignment_score 125 | elif option == "best_results": 126 | best_results = parser.getint('Results', 'Best') 127 | return best_results 128 | elif option == "results_histogram": 129 | results_histogram = parser.getboolean('Results', 'Histogram') 130 | return results_histogram 131 | elif option == "results_heatmap": 132 | results_heatmap = parser.getboolean('Results', 'Heatmap') 133 | return results_heatmap 134 | elif option == "results_structure": 135 | results_structure = parser.get('Results', 'Structure') 
136 | return results_structure 137 | elif option == "results_sifts": 138 | results_sifts = parser.getboolean('Results', 'Sifts') 139 | return results_sifts 140 | elif option == "test": 141 | parser.getint('Results', 'Best') 142 | print "Parameters... OK" 143 | return 144 | else: 145 | raise StandardError, "ERROR: Invalid option" 146 | except: 147 | raise StandardError, "ERROR: Invalid Parameters File" 148 | 149 | -------------------------------------------------------------------------------- /Params.config: -------------------------------------------------------------------------------- 1 | [Global] 2 | surfacethreshold = 7 3 | pairwisedistance = clustalw 4 | matrix = BLOSUM62 5 | theilsencutoff = 0.7 6 | alignmentscore = False 7 | alphabetreduction = False 8 | 9 | [Psiblast] 10 | evalue = 10 11 | identity = 0 12 | coverage = 0 13 | threading = False 14 | 15 | [Clustalw] 16 | gapopening = 10 17 | gapextension = 0.2 18 | matrix = GONNET 19 | 20 | [Muscle] 21 | maxiteration = 16 22 | 23 | [Mafft] 24 | configuration = linsi 25 | threading = False 26 | 27 | [Results] 28 | best = 20 29 | histogram = True 30 | heatmap = True 31 | structure = pymol 32 | sifts = False 33 | 34 | -------------------------------------------------------------------------------- /Pycoevol.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Encoding utf-8 # 3 | # F. Madeira and L. Krippahl, 2012 # 4 | # This code is part of Pycoevol distribution. # 5 | # This work is public domain. 
# 6 | ############################################################################### 7 | #TODO: 8 | # Interaction maps 9 | 10 | import os 11 | import sys 12 | from src import MAIN 13 | from Parameters import LoadParameters as LP 14 | from optparse import OptionParser 15 | from Bio.Align.Applications import ClustalwCommandline 16 | 17 | def printUsage(): 18 | """Prints the usage - DEPRECATED""" 19 | __version__ = "beta" 20 | 21 | Usage = \ 22 | """ 23 | Pycoevol_%s (c) 2012, F. Madeira 24 | 25 | Pycoevol: A Python workflow to study protein-protein coevolution 26 | and interaction. 27 | 28 | Pycoevol.py input1 input2 29 | 30 | input1 seq1.fasta (-seqID1), pdb1.pdb:A (-PDBID1:A) 31 | or align1.fasta (where A is the chain designator) 32 | input2 seq2.fasta (seqID2), -pdb2.pdb:B (-PDBID2:B) 33 | or -align2.fasta (where B is the chain designator) 34 | -p --psiblast 35 | internet, local or custom (NCBI's PSIBLAST and 36 | local database are optional) 37 | -a --alignment 38 | clustalw, muscle, mafft or custom (MUSCLE and 39 | MAFFT are optional) 40 | -c --coevolution 41 | mi, mie, rcwmi, cpvnmie, cpvn, clm, vol 42 | omes, pearson, spearman, mcbasc, quartets, 43 | sca or elsc 44 | -x --chain 45 | chain identifier (in same order as input file). Default A 46 | -i --id 47 | identifier for each protein, in same order as input files. 48 | -h --help 49 | 50 | Check the README.md for further details. 51 | """ % __version__ 52 | print Usage 53 | 54 | def pycoevolRun(): 55 | "Routine which chooses the proper scripts given the input commands" 56 | main = MAIN.main(file1, file2, id1, id2, chain1, chain2, parameterfile, 57 | psiblast, alignment, coevolution, dirname) 58 | 59 | if psiblast == "custom" and alignment == "custom": 60 | print 'Coevolution scripts...' 61 | sys.stdout.flush() 62 | main.coevolutionSripts() 63 | print '... OK' 64 | else: 65 | print 'Sequence scripts...' 66 | sys.stdout.flush() 67 | main.sequenceSripts() 68 | print '... OK' 69 | 70 | print 'BLAST scripts...' 
71 | sys.stdout.flush() 72 | main.psiblastSripts() 73 | print '... OK' 74 | 75 | print 'Organism scripts...' 76 | sys.stdout.flush() 77 | main.organismSripts() 78 | print '... OK' 79 | 80 | print 'Alignment scripts...' 81 | sys.stdout.flush() 82 | main.alignmentSripts() 83 | print '... OK' 84 | 85 | print 'Coevolution scripts...' 86 | sys.stdout.flush() 87 | main.coevolutionSripts() 88 | print '... OK' 89 | 90 | print 'Info scripts...' 91 | sys.stdout.flush() 92 | main.infoScripts(SIFTS) 93 | print '... OK' 94 | return 95 | 96 | def checkArguments(): 97 | "Checks if the input commands are valid" 98 | try: 99 | input = str("./Data/" + file1) 100 | file = open(input, "r") 101 | file.close() 102 | except: 103 | #raise StandardError, "ERROR: File no.1 is not acessible" 104 | pass 105 | 106 | try: 107 | input = str("./Data/" + file2) 108 | file = open(input, "r") 109 | file.close() 110 | except: 111 | #raise StandardError, "ERROR: File no.2 is not acessible" 112 | pass 113 | 114 | if len(chain1) <= 2 and len(chain2) <= 2: 115 | pass 116 | else: 117 | raise StandardError, "ERROR: Chains' length must be = 1" 118 | 119 | if psiblast != 'internet' and psiblast != 'local' and psiblast != 'custom': 120 | raise StandardError, "ERROR: PSI-Blast: Type 'internet', 'local'\ 121 | or 'custom'" 122 | 123 | if alignment != "clustalw" and alignment != "muscle" and \ 124 | alignment != "mafft" and alignment != 'custom': 125 | raise StandardError, "ERROR: Alignment Tools: Type '-clustalw', \ 126 | '-muscle', '-mafft' or 'custom'" 127 | 128 | if coevolution != 'mi' and coevolution != 'mie' and \ 129 | coevolution != 'rcwmi' and coevolution != 'cpvnmie' and \ 130 | coevolution != 'cpvn' and coevolution != 'clm' and \ 131 | coevolution != 'vol' and coevolution != 'omes' and \ 132 | coevolution != 'pearson' and coevolution != 'spearman' and \ 133 | coevolution != 'mcbasc' and coevolution != 'quartets' and \ 134 | coevolution != 'sca' and coevolution != 'elsc': 135 | raise StandardError, 
"ERROR: Coevolution Measure: Type '–mi', '–mie', \ 136 | '–rcwmi', '–cpvnmie', '–cpvn', '–clm', '–vol', '-omes', '-pearson', \ 137 | '-spearman', '-mcbasc', '-quartets', '-sca' or '-elsc'" 138 | 139 | def checkDependencies(): 140 | "Checks the import of mandatory python modules and clustalw" 141 | try: 142 | import Bio 143 | del Bio 144 | except ImportError: 145 | raise ImportError, "ERROR: Unable to import Biopython" 146 | 147 | try: 148 | import numpy 149 | del numpy 150 | except ImportError: 151 | raise ImportError, "ERROR: Unable to import Numpy" 152 | 153 | try: 154 | import matplotlib 155 | del matplotlib 156 | except ImportError: 157 | raise ImportError, "ERROR: Unable to import Matplotlib" 158 | 159 | try: 160 | try: 161 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe") 162 | input = "./src/tools/clustalw/test/test.fasta" 163 | clustalw = ClustalwCommandline(cmd, infile=input) 164 | clustalw() 165 | os.remove("./src/tools/clustalw/test/test.aln") 166 | os.remove("./src/tools/clustalw/test/test.dnd") 167 | except: 168 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw") 169 | input = "./src/tools/clustalw/test/test.fasta" 170 | clustalw = ClustalwCommandline(cmd, infile=input) 171 | clustalw() 172 | os.remove("./src/tools/clustalw/test/test.aln") 173 | os.remove("./src/tools/clustalw/test/test.dnd") 174 | except: 175 | raise StandardError, "ERROR: Unable to run ClustalW" 176 | 177 | 178 | def checkSIFTS(): 179 | "Checks the availability of SIFTS files" 180 | global SIFTS 181 | try: 182 | input = str("./SIFTS/pdb_chain_scop_uniprot.lst") 183 | file = open(input, "r") 184 | file.close() 185 | input = str("./SIFTS/pdb_chain_cath_uniprot.lst") 186 | file = open(input, "r") 187 | file.close() 188 | input = str("./SIFTS/pdb_chain_enzyme.lst") 189 | file = open(input, "r") 190 | file.close() 191 | input = str("./SIFTS/pdb_chain_interpro.lst") 192 | file = open(input, "r") 193 | file.close() 194 | input = str("./SIFTS/pdb_chain_pfam.lst") 195 | file 
= open(input, "r") 196 | file.close() 197 | input = str("./SIFTS/pdb_chain_taxonomy.lst") 198 | file = open(input, "r") 199 | file.close() 200 | input = str("./SIFTS/pdb_pubmed.lst") 201 | file = open(input, "r") 202 | file.close() 203 | SIFTS = True 204 | print "SIFTS... OK" 205 | except: 206 | SIFTS = False 207 | print "SIFTS... NOT OK" 208 | 209 | def addtoPATH(): 210 | sys.path.append("./src/tools/") 211 | sys.path.append("./src/tools/blast+/") 212 | sys.path.append("./src/tools/blast+/db") 213 | sys.path.append("./src/tools/clustalw/") 214 | sys.path.append("./src/tools/mafft/") 215 | sys.path.append("./src/tools/muscle/") 216 | 217 | def ParseArguments(): 218 | global file1 219 | global id1 220 | global chain1 221 | global file2 222 | global id2 223 | global chain2 224 | global parameterfile 225 | global psiblast 226 | global alignment 227 | global coevolution 228 | global dirname 229 | 230 | # defaults 231 | pathcwd = os.getcwd() 232 | dirname = os.getcwd() + "/Results/" 233 | parameterfile = '' 234 | file1 = '' 235 | file2 = '' 236 | chain1 = '' 237 | chain2 = '' 238 | 239 | parser = OptionParser(usage='Pycoevol.py input1 input2 [options]') 240 | parser.add_option('-b', '--psiblast', type='string', 241 | dest='psiblast', default='internet', 242 | help='internet, local or custom') 243 | parser.add_option('-a', '--alignment', type='string', 244 | dest='alignment', default='clustalw', 245 | help='clustalw, muscle, mafft or custom') 246 | parser.add_option('-c', '--coevolution', type='string', 247 | dest='coevolution', default='mi', 248 | help='mi, mie, rcwmi, cpvn, clm, vol, omes, pearson, spearman, mcbasc, quartets, sca or elsc') 249 | parser.add_option('-i', '--id', action='append', type='string', 250 | dest='ids', default=[]) 251 | parser.add_option('-x', '--chain', action='append', type='string', 252 | dest='chains', default=[]) 253 | parser.add_option('-p', '--parameters', 254 | dest='parameterfile', default=parameterfile) 255 | 256 | (options, args) = 
parser.parse_args() 257 | if len(args) == 0 and len(options.ids) == 0: 258 | parser.print_help() 259 | sys.exit() 260 | 261 | if len(args) == 2: 262 | input1 = args[0] 263 | input2 = args[1] 264 | dirname = os.path.dirname(input1) + "/" 265 | file1 = os.path.basename(input1) 266 | file2 = os.path.basename(input2) 267 | id1 = file1.split(".")[0] 268 | id2 = file2.split(".")[0] 269 | if len(options.chains) == 2: 270 | chain1 = options.chains[0] 271 | chain2 = options.chains[1] 272 | if len(options.ids) == 2: 273 | id1 = options.ids[0] 274 | id2 = options.ids[1] 275 | if chain1 == '' and chain2 == '': 276 | file1 = id1 + ".fasta" 277 | file2 = id2 + ".fasta" 278 | else: 279 | file1 = id1 + ".pdb" 280 | file2 = id2 + ".pdb" 281 | if options.parameterfile != '': 282 | parameterfile = options.parameterfile.strip('"') 283 | LP(parameterfile, "test") 284 | else: 285 | parameterfile = pathcwd + "/Params.config" 286 | parameterfile = parameterfile.strip('"') 287 | LP(parameterfile, "test") 288 | psiblast = options.psiblast 289 | alignment = options.alignment 290 | coevolution = options.coevolution 291 | 292 | def main(): 293 | ParseArguments() 294 | checkArguments() 295 | print 'Arguments... OK' 296 | addtoPATH() 297 | checkDependencies() 298 | print 'Dependencies... OK' 299 | checkSIFTS() 300 | pycoevolRun() 301 | print 'Analysis Complete !!' 
302 | return 303 | 304 | if __name__ == "__main__": 305 | main() 306 | 307 | 308 | -------------------------------------------------------------------------------- /Pycoevol_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/Pycoevol_paper.pdf -------------------------------------------------------------------------------- /Pycoevol_userguide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/Pycoevol_userguide.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PYCOEVOL 2 | ======== 3 | A Python workflow to study protein-protein coevolution and interaction 4 | 5 | Pycoevol is an integrated system for studying inter-protein coevolution and interaction. 6 | It automates the identification of contact points between protein partners, extending the 7 | general coevolution workflow consisting of: homologous sequence search; multiple sequence 8 | alignment computation; and coevolution analysis; with an improved selection of organisms 9 | and contact prediction. 10 | 11 | It generates friendly output results: matrix of scores; histograms; 12 | heat-maps; PyMOL scripts and interaction maps. Additional information for common web-services 13 | can be retrieved from SIFTS. 14 | 15 | ## Disclaimer 16 | 17 | This software is provided "as is", with no explicit or implied warranties. 18 | Use this software at your own risk. 19 | 20 | ## Copyright 21 | 22 | This software is public domain, and everyone has the right to copy, 23 | distribute, reuse, modify, improve and debug it. 
24 | 25 | If you want to cite this piece of software/workflow use the following: 26 | 27 | Fábio Madeira and Ludwig Krippahl. 2012. PYCOEVOL: A Python workflow to study 28 | protein-protein coevolution. Proceedings of the International Conference on 29 | Bioinformatics Models, Methods and Algorithms - BIOINFORMATICS 2012, pp.143-9. 30 | 31 | This work was partially supported by Portuguese National funds through Fundação 32 | para a Ciência e Tecnologia (FCT) under project CREMA PTDC/EIA-CCO/115999/2009. 33 | 34 | ## Dependencies 35 | 36 | [Python 2.7.2](http://python.org/), 37 | [Biopython 1.58](http://biopython.org/), 38 | [Numpy 1.6.1](http://numpy.scipy.org/), 39 | [Matplotlib 1.1.0](http://matplotlib.sourceforge.net/) and 40 | [ClustalW](http://www.clustal.org/) 41 | 42 | **Optional:** 43 | [NCBI Blast+](http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download), 44 | [NCBI's "refseq_protein" database](ftp://ftp.ncbi.nlm.nih.gov/blast/db/), 45 | [MUSCLE](http://www.drive5.com/muscle/), 46 | [MAFFT](http://mafft.cbrc.jp/alignment/software/) and 47 | [SIFTS lst files](http://www.ebi.ac.uk/pdbe/docs/sifts/quick.html) 48 | 49 | 50 | ## Usage 51 | 52 | _python Pycoevol.py input1 input2 [options]_ 53 | 54 | 55 | 56 | -h, --help show this help message and exit 57 | 58 | -b PSIBLAST, --psiblast=PSIBLAST 59 | 60 | internet, local or custom 61 | 62 | -a ALIGNMENT, --alignment=ALIGNMENT 63 | 64 | clustalw, muscle, mafft or custom 65 | 66 | -c COEVOLUTION, --coevolution=COEVOLUTION 67 | 68 | mi, mie, rcwmi, cpvn, clm, vol, omes, pearson, spearman, mcbasc, quartets, sca or elsc 69 | 70 | -i IDS, --id=IDS 71 | 72 | -x CHAINS, --chain=CHAINS 73 | 74 | -p PARAMETERFILE, --parameters=PARAMETERFILE 75 | 76 | For a detailed overview on how to install and use Pycoevol, please refer to the User Guide.
77 | 78 | 79 | **Coevolution measures:** 80 | 81 | * Mutual Information (mi) [Gloor et al, 2005] 82 | * MI by pair Entropy (mie) [Martin et al, 2005] 83 | * Row and Column Weighted MI (rcwmi) [Gouveia-Oliveira et al, 2007] 84 | * Contact Preferences, Volume Normalized (cpvn) [Glaser et al, 2001] 85 | * Contact PDB-derived Likelihood Matrix (clm) [Singer et al, 2002] 86 | * Residue-residue Volume Normalized (vol) [based on Esque et al, 2010] 87 | * Observed Minus Expected Squared (omes) [Kass and Horovitz, 2002] 88 | * Pearson’s correlation (pearson) [Göbel et al, 1994] 89 | * Spearman’s rank correlation (spearman) [Pazos et al, 1997] 90 | * McLachlan Based Substitution Correlation (mcbasc) [Fodor and Aldrich, 2004] 91 | * Quartets (quartets) [Galitsky, 2002] 92 | * Statistical Coupling Analysis (sca) [Lockless and Ranganathan, 1999] 93 | * Explicit Likelihood of Subset Covariation (elsc) [Dekker et al, 2004] 94 | 95 | **Pairwise distance measures:** 96 | 97 | * ClustalW distance [Chenna et al, 2003] 98 | * p-distance [Jukes and Cantor, 1969] 99 | * Jukes-Cantor [Jukes and Cantor, 1969] 100 | * Kimura distance [Kimura, 1983] 101 | * Pairwise score using Dayhoff or Henikoff matrices [Dayhoff et al, 1978; 102 | Henikoff and Henikoff, 1992] 103 | 104 | 105 | *Fábio Madeira and Ludwig Krippahl, 2012* 106 | 107 | This work was partially supported by Portuguese National 108 | funds through Fundação para a Ciência e Tecnologia (FCT) 109 | under project CREMA PTDC/EIA-CCO/115999/2009.
110 | -------------------------------------------------------------------------------- /Results/output_results: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/Results/output_results -------------------------------------------------------------------------------- /SIFTS/Database version: -------------------------------------------------------------------------------- 1 | # Last update on 16.12.2011 -------------------------------------------------------------------------------- /refseq_protein.pal: -------------------------------------------------------------------------------- 1 | # 2 | # Alias file created: Jun 26, 2011 8:38 PM 3 | # 4 | # Edit this file to reflet the location of your database 5 | # Get the database at ftp://ftp.ncbi.nih.gov/blast/db/ 6 | # 7 | TITLE NCBI Protein Reference Sequences 8 | DBLIST ./Pycoevol/src/tools/Blast+/db/refseq_protein.00 ./Pycoevol/src/tools/Blast+/db/refseq_protein.01 ./Pycoevol/src/tools/Blast+/db/refseq_protein.02 ./Pycoevol/src/tools/Blast+/db/refseq_protein.03 9 | -------------------------------------------------------------------------------- /src/ALIGN.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Encoding utf-8 # 3 | # F. Madeira and L. Krippahl, 2012 # 4 | # This code is part of Pycoevol distribution. # 5 | # This work is public domain. 
# 6 | ############################################################################### 7 | 8 | import os 9 | from Parameters import LoadParameters as LP 10 | from src.UTILS import charge, charge_his, polarity, hydropathy 11 | from os import remove, system 12 | from shutil import copyfile 13 | from itertools import combinations 14 | from Bio import AlignIO, SeqIO 15 | from Bio.Alphabet import IUPAC 16 | from Bio.Align.Applications import ClustalwCommandline 17 | from Bio.Align.Applications import MuscleCommandline 18 | 19 | class alignment: 20 | """ 21 | Main code for multiple sequence alignment and scoring. 22 | 23 | Methods for computing MSAs: 24 | Clustalw - Chenna et al, 2003 25 | Muscle - Edgar, 2004 26 | Mafft - Katoh et al, 2002 27 | 28 | Methods for scoring MSAs: 29 | Sum-of-Pairs - Murata et al, 1985 30 | TODO: Circular Sum - Gonnet et al, 2000 31 | """ 32 | def __init__(self, id1, id2, alignment, parameterfile, dirname): 33 | self.id1 = id1 34 | self.id2 = id2 35 | self.alignment = alignment 36 | self.parameterfile = parameterfile 37 | self.dirname = dirname 38 | 39 | def __call__(self, id1, id2, alignment, parameterfile, dirname): 40 | self.id1 = id1 41 | self.id2 = id2 42 | self.alignment = alignment 43 | self.parameterfile = parameterfile 44 | self.dirname = dirname 45 | 46 | 47 | def computeAlignment(self, id, alignment): 48 | "Computes multiple sequence alignment with inputed method" 49 | 50 | if alignment == "clustalw": 51 | gop = LP(self.parameterfile, "clustalw_gap_opening") 52 | gep = LP(self.parameterfile, "clustalw_gap_extension") 53 | d_matrix = LP(self.parameterfile, "clustalw_distance_matrix") 54 | 55 | input_sequences = self.dirname + id + ".fasta" 56 | output_align = self.dirname + id + ".aln" 57 | output_fasta = self.dirname + id + "_clustalw.fasta" 58 | output_tree = self.dirname + id + ".dnd" 59 | try: 60 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe") 61 | clustalw = ClustalwCommandline(cmd, infile=input_sequences, 62 | 
outfile=output_align, 63 | newtree=output_tree, 64 | align="input", 65 | seqnos="ON", 66 | outorder="input", 67 | type="PROTEIN", 68 | pwmatrix=d_matrix, 69 | gapopen=gop, 70 | gapext=gep) 71 | clustalw() 72 | except: 73 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw") 74 | clustalw = ClustalwCommandline(cmd, infile=input_sequences, 75 | outfile=output_align, 76 | newtree=output_tree, 77 | align="input", 78 | seqnos="ON", 79 | outorder="input", 80 | type="PROTEIN", 81 | pwmatrix=d_matrix, 82 | gapopen=gop, 83 | gapext=gep) 84 | clustalw() 85 | AlignIO.convert(output_align, "clustal", output_fasta, "fasta") 86 | try: 87 | remove(output_align) 88 | remove(output_tree) 89 | except: 90 | pass 91 | 92 | elif alignment == "muscle": 93 | iteration = LP(self.parameterfile, "muscle_max_iteration") 94 | 95 | input_sequences = self.dirname + id + ".fasta" 96 | output_align = self.dirname + id + "_muscle.aln" 97 | output_fasta = self.dirname + id + "_muscle.fasta" 98 | 99 | muscle = MuscleCommandline(input=input_sequences, 100 | out=output_align, 101 | clwstrict=True, 102 | maxiters=iteration) 103 | muscle() 104 | AlignIO.convert(output_align, "clustal", output_fasta, "fasta") 105 | try: 106 | remove(output_align) 107 | except: 108 | pass 109 | 110 | organism_order = [] 111 | input_sequences = self.dirname + id + ".fasta" 112 | align = SeqIO.parse(input_sequences, "fasta", IUPAC.protein) 113 | for record in align: 114 | org = record.description 115 | organism_order.append(org) 116 | 117 | rec = dict() 118 | output_fasta = self.dirname + id + "_muscle.fasta" 119 | align = SeqIO.parse(output_fasta, "fasta", IUPAC.protein) 120 | for record in align: 121 | org = str(record.description) 122 | seq = str(record.seq) 123 | rec[org] = seq 124 | 125 | fasta = open(output_fasta, "w") 126 | fasta.close() 127 | fasta = open(output_fasta, "a") 128 | for org in (organism_order): 129 | seq = rec[org] 130 | fasta.write(">" + org + "\n" + seq + "\n") 131 | fasta.close() 132 | 133 | 
else: 134 | configuration = LP(self.parameterfile, "mafft_configuration") 135 | threads = LP(self.parameterfile, "mafft_threading") 136 | input_sequences = self.dirname + id + ".fasta" 137 | output_fasta = self.dirname + id + "_mafft.fasta" 138 | 139 | if configuration == "fftnsi": 140 | if threads == False: 141 | fftnsi = "mafft --retree 2 --maxiterate 1000 --inputorder " 142 | mafft = system(fftnsi + input_sequences + ">" + output_fasta) 143 | mafft 144 | else: 145 | try: 146 | threads = int(threads) 147 | fftnsi = "mafft --retree 2 --maxiterate 1000\ 148 | --inputorder --threads %i " % (threads) 149 | mafft = system(fftnsi + input_sequences + ">" + output_fasta) 150 | mafft 151 | except: 152 | fftnsi = "mafft --retree 2 --maxiterate 1000 --inputorder " 153 | mafft = system(fftnsi + input_sequences + ">" + output_fasta) 154 | mafft 155 | else: 156 | if threads == False: 157 | linsi = "mafft --localpair --maxiterate 1000 --inputorder " 158 | mafft = system(linsi + input_sequences + ">" + output_fasta) 159 | mafft 160 | else: 161 | try: 162 | threads = int(threads) 163 | linsi = "mafft --localpair --maxiterate 1000\ 164 | --inputorder --threads %i " % (threads) 165 | mafft = system(linsi + input_sequences + ">" + output_fasta) 166 | mafft 167 | except: 168 | linsi = "mafft --localpair --maxiterate 1000 --inputorder " 169 | mafft = system(linsi + input_sequences + ">" + output_fasta) 170 | mafft 171 | 172 | def cutAlignment(self, file, id, alignment): 173 | "Selects MSA columns of interest (Query_id != '-')" 174 | 175 | description = [] 176 | align = [] 177 | columns = [] 178 | positions = [] 179 | blocks = [] 180 | new_align = [] 181 | new_align_ord = [] 182 | new_align_concate = [] 183 | self.cut_alignment = [] 184 | aa_red = LP(self.parameterfile, "alphabet_reduction") 185 | 186 | if alignment != "custom": 187 | input = self.dirname + id + "_" + alignment + ".fasta" 188 | alignment = AlignIO.read(input, "fasta") 189 | for record in alignment: 190 | key = 
record.id 191 | description.append(key) 192 | 193 | k = int(-1) 194 | for s in description: 195 | k += 1 196 | key = s.find("Query_id") 197 | if key != -1: 198 | break 199 | 200 | align_length = alignment.get_alignment_length() 201 | for position in range(0, align_length): 202 | column = alignment[:, position] 203 | align.append(column) 204 | if column[k] != "-": 205 | columns.append(column) 206 | positions.append(position) 207 | 208 | for i in range(0, len(positions), 1): 209 | beg = int(positions[i]) 210 | end = int(positions[i] + 1) 211 | block = alignment[:, beg:end] 212 | blocks.append(block) 213 | 214 | for block in blocks: 215 | for record in block: 216 | seq = str(record.seq) 217 | new_align.append(seq) 218 | 219 | numb_blocks = len(new_align) / len(columns[0]) 220 | for i in range(0, len(columns[0])): 221 | for j in range(0, len(new_align), len(columns[0])): 222 | new_align_ord.append(new_align[i + j]) 223 | 224 | for i in range(0, len(new_align_ord), numb_blocks): 225 | pseudolist = new_align_ord[i:i + numb_blocks] 226 | list = "" 227 | for j in pseudolist: 228 | list += j 229 | new_align_concate.append(list) 230 | 231 | for seq in new_align_concate: 232 | if aa_red != False: 233 | red = [AR(e, aa_red) for e in seq] 234 | new_seq = "" 235 | for i in red: 236 | new_seq += str(i) 237 | self.cut_alignment.append(new_seq) 238 | else: 239 | self.cut_alignment.append(seq) 240 | 241 | return self.cut_alignment 242 | 243 | else: 244 | output = self.dirname + id + "_" + alignment + ".fasta" 245 | copyfile(self.dirname + file, output) 246 | input = self.dirname + id + "_" + alignment + ".fasta" 247 | 248 | alignment = AlignIO.read(input, "fasta") 249 | for record in alignment: 250 | key = record.id 251 | description.append(key) 252 | 253 | align_length = alignment.get_alignment_length() 254 | for position in range(0, align_length): 255 | column = alignment[:, position] 256 | align.append(column) 257 | if column[0] != "-": 258 | columns.append(column) 259 | 
positions.append(position) 260 | 261 | for i in range(0, len(positions), 1): 262 | beg = int(positions[i]) 263 | end = int(positions[i] + 1) 264 | block = alignment[:, beg:end] 265 | blocks.append(block) 266 | 267 | for block in blocks: 268 | for record in block: 269 | seq = str(record.seq) 270 | new_align.append(seq) 271 | 272 | numb_blocks = len(new_align) / len(columns[0]) 273 | for i in range(0, len(columns[0])): 274 | for j in range(0, len(new_align), len(columns[0])): 275 | new_align_ord.append(new_align[i + j]) 276 | 277 | for i in range(0, len(new_align_ord), numb_blocks): 278 | pseudolist = new_align_ord[i:i + numb_blocks] 279 | list = "" 280 | for j in pseudolist: 281 | list += j 282 | new_align_concate.append(list) 283 | 284 | for seq in new_align_concate: 285 | if aa_red != False: 286 | red = [AR(e, aa_red) for e in seq] 287 | new_seq = "" 288 | for i in red: 289 | new_seq += str(i) 290 | self.cut_alignment.append(new_seq) 291 | else: 292 | self.cut_alignment.append(seq) 293 | 294 | return self.cut_alignment 295 | 296 | 297 | def alignScore(self, id, alignment): 298 | """ 299 | Computes a score for the MSA inputed. 300 | 301 | Methods implemented: 302 | Sum-of-pairs (SP) score - Murata et al, 1985 303 | as explained in Gonnet et al, 2000. 304 | SP is the sum of all possible combinations of 305 | pairwise scores. 306 | 307 | !!Disclaimer: alignmentScore is terribly slow!! 
308 | 309 | (To Do - Circular Sum by Gonnet et al, 2000) 310 | """ 311 | score = LP(self.parameterfile, "alignment_score") 312 | 313 | if score == "sumofpairs": 314 | input = self.dirname + id + "_" + alignment + ".fasta" 315 | sequences = [] 316 | input_sequences = SeqIO.parse(input, "fasta", IUPAC.protein) 317 | for record in input_sequences: 318 | seq = str(record.seq) 319 | sequences.append(seq) 320 | 321 | SumOfPairs = 0 322 | for pair in combinations(sequences, 2): 323 | SumOfPairs += pairwiseScore(pair[0], pair[1]) 324 | 325 | print SumOfPairs 326 | else: pass 327 | 328 | def pairwiseScore(seq1, seq2): 329 | """ 330 | s(x,y) = { matchScore(x,y) if x!='-' and y!='-'; 331 | 0 if x=='-' and y=='-', because the delection as caused earlier 332 | gap penalty, depending on the gap length gap + length * increment} 333 | 334 | gap - depends on the scoring matrix (PAM, BLOSUM, etc) 335 | length - length of the gap 336 | increment - incremental penalty that depends on the scoring matrix 337 | 338 | BLOSUM62, gap = -4, increment = 1 -> increment = length 339 | """ 340 | 341 | gap = -4.0 342 | incr_top = 0 343 | incr_bottom = 0 344 | pairwise_score = 0 345 | for i, j in zip(range(len(seq1)), range(len(seq2))): 346 | aa1 = seq1[i] 347 | aa2 = seq2[j] 348 | if aa1 == "-" and aa2 == "-" : 349 | pairwise_score += 0 350 | elif aa1 != "-" and aa2 != "-": 351 | pairwise_score += float(matchScore(aa1, aa2, "BLOSUM62")) 352 | elif aa1 == "-" and aa2 != "-": 353 | try: 354 | aa11 = seq1[i + 1] 355 | aa22 = seq2[j + 1] 356 | if aa11 == "-" and aa22 != "-": 357 | incr_top += 1 358 | else: 359 | pairwise_score += gap + incr_top * incr_top 360 | incr_top = 0 361 | except: 362 | pairwise_score += gap 363 | pass 364 | elif aa1 != "-" and aa2 == "-": 365 | try: 366 | aa11 = seq1[i + 1] 367 | aa22 = seq2[j + 1] 368 | if aa11 != "-" and aa22 == "-": 369 | incr_bottom += 1 370 | else: 371 | pairwise_score += gap + incr_bottom * incr_bottom 372 | incr_bottom = 0 373 | except: 374 | 
def mapMatrix(matrix):
    """Maps a matrix of floats.

    matrix -- name of the matrix file; it is upper-cased and looked up as
              './Matrix/<NAME>' relative to the current working directory.

    Returns a list with one list of floats per line of the file, in file
    order (a blank line yields an empty row).

    Raises IOError when the file cannot be opened and ValueError when a
    whitespace-separated token is not a valid float.
    """
    path = './Matrix/' + matrix.upper()

    # "with" releases the handle even when float() raises on a malformed
    # token half-way through the file.
    with open(path, 'r') as input_matrix:
        # Build real lists instead of relying on map(): on Python 3 map()
        # is a one-shot iterator, and callers index rows as [x][y].
        return [[float(token) for token in line.split()]
                for line in input_matrix]
428 | Alphabets: charge, charge_his, polarity, hydropathy 429 | """ 430 | 431 | if method == "charge": 432 | return charge[aminoacid] 433 | elif method == "charge_his": 434 | return charge_his[aminoacid] 435 | elif method == "polarity": 436 | return polarity[aminoacid] 437 | elif method == "hydropathy": 438 | return hydropathy[aminoacid] 439 | else: 440 | return aminoacid 441 | 442 | 443 | -------------------------------------------------------------------------------- /src/BLAST.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Encoding utf-8 # 3 | # F. Madeira and L. Krippahl, 2012 # 4 | # This code is part of Pycoevol distribution. # 5 | # This work is public domain. # 6 | ############################################################################### 7 | 8 | from Parameters import LoadParameters as LP 9 | from os import remove 10 | from shutil import move 11 | from Bio import SeqIO, Entrez 12 | from Bio.Alphabet import IUPAC 13 | from Bio.Blast import NCBIXML, NCBIWWW 14 | from Bio.Blast.Applications import NcbipsiblastCommandline 15 | Entrez.email = "entrez@mail.com" 16 | 17 | class psiblast: 18 | """ 19 | Main code for psiblast search over internet or at local database. 
20 | 21 | Method for searching homologous sequences: 22 | PSI-Blast - Altschul et al, 1997 23 | """ 24 | def __init__(self, id1, id2, psiblast, parameterfile, dirname): 25 | self.id1 = id1 26 | self.id2 = id2 27 | self.psiblast = psiblast 28 | self.parameterfile = parameterfile 29 | self.dirname = dirname 30 | 31 | def __call__(self, id1, id2, psiblast, parameterfile, dirname): 32 | self.id1 = id1 33 | self.id2 = id2 34 | self.psiblast = psiblast 35 | self.parameterfile = parameterfile 36 | self.dirname = dirname 37 | 38 | def searchPSIBLAST(self, id, psiblast): 39 | "Psi-Blast over a local database or over the internet" 40 | 41 | if psiblast == "local": 42 | threads = LP(self.parameterfile, "psiblast_threading") 43 | evalue = LP(self.parameterfile, "psiblast_evalue") 44 | reference_protein = "refseq_protein" 45 | 46 | in_sequence = self.dirname + id + ".fa" 47 | 48 | output = self.dirname + id + ".xml" 49 | if threads == False: 50 | psiblast = NcbipsiblastCommandline(query=in_sequence, 51 | db=reference_protein, 52 | outfmt=5, 53 | threshold=evalue, 54 | out=output) 55 | psiblast() 56 | else: 57 | try: 58 | threads = int(threads) 59 | psiblast = NcbipsiblastCommandline(query=in_sequence, 60 | db=reference_protein, 61 | outfmt=5, 62 | threshold=evalue, 63 | out=output, 64 | num_threads=threads) 65 | psiblast() 66 | except: 67 | psiblast = NcbipsiblastCommandline(query=in_sequence, 68 | db=reference_protein, 69 | outfmt=5, 70 | threshold=evalue, 71 | out=output) 72 | psiblast() 73 | 74 | try: 75 | open(self.dirname + id + ".fasta") 76 | open.close() 77 | remove(self.dirname + id + ".fa") 78 | except: 79 | move(self.dirname + id + ".fa", self.dirname + id + ".fasta") 80 | else: 81 | evalue = LP(self.parameterfile, "psiblast_evalue") 82 | reference_protein = "refseq_protein" 83 | 84 | in_sequence = self.dirname + id + ".fa" 85 | 86 | for seq_record in SeqIO.parse(in_sequence, 87 | "fasta", IUPAC.protein): 88 | sequence = seq_record.seq 89 | 90 | psiblast = 
NCBIWWW.qblast("blastp", 91 | reference_protein, 92 | sequence, 93 | service="psi", 94 | expect=evalue, 95 | hitlist_size=500) 96 | psiblast 97 | 98 | try: 99 | open(self.dirname + id + ".fasta") 100 | open.close() 101 | remove(self.dirname + id + ".fa") 102 | except: 103 | move(self.dirname + id + ".fa", self.dirname + id + ".fasta") 104 | 105 | output = self.dirname + id + ".xml" 106 | saveblast = open(output, "w") 107 | saveblast.write(psiblast.read()) 108 | saveblast.close() 109 | psiblast.close() 110 | 111 | def validXML(self, id): 112 | "Checks if the input file is a valid XML" 113 | 114 | try: 115 | input = self.dirname + id + ".xml" 116 | input_xml = open(input, "r") 117 | xml = input_xml.readline() 118 | input_xml.close() 119 | if xml[0:5] == " thresh_coverage and identity > thresh_identity: 150 | hits.append(hit_id) 151 | input_xml.close() 152 | 153 | if hits == []: 154 | raise StandardError, "%s - No Hits found in PSI-BLAST search" % (input) 155 | 156 | for hit_id in hits: 157 | gi = hit_id[hit_id.find("id|") + 4:hit_id.find("|ref")] 158 | try: 159 | efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta") 160 | except: 161 | try: 162 | efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta") 163 | except: 164 | efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta") 165 | efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta") 166 | for values in efetch: 167 | description = values 168 | break 169 | sequence = "" 170 | for values in efetch: 171 | sequence += values.rstrip("\n") 172 | try: 173 | organism = description[description.find("[") + 1:description.find("]")] 174 | organism = organism.split() 175 | if len(organism) != 1: 176 | species = str(organism[0] + "_" + organism[1]) 177 | else: 178 | species = str(organism[0] + "_" + "sp.") 179 | output = self.dirname + id + ".blast" 180 | blast = open(output, "a") 181 | blast.write("\n" + ">" + species + "\n" + sequence + "\n") 182 | blast.close() 183 | except: 184 | raise StandardError, 
"%s - No Hits found in PSI-BLAST search" % (input) 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/COEVOL.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Encoding utf-8 # 3 | # F. Madeira and L. Krippahl, 2012 # 4 | # This code is part of Pycoevol distribution. # 5 | # This work is public domain. # 6 | ############################################################################### 7 | 8 | from src.SEQ import sequence as class_sequence 9 | from src.ALIGN import alignment as class_alignment 10 | from Parameters import LoadParameters as LP 11 | from src.UTILS import aa, Flash 12 | from math import log, e, factorial 13 | from numpy import mean, std, zeros, sqrt 14 | from matplotlib import pyplot 15 | #from shutil import copyfile 16 | 17 | class coevolution: 18 | """ 19 | Main code for coevolution analysis. 20 | Note: All the coevolution measures are normalized [0:1] 21 | Matrix-based Methods: 22 | * Residue Contact Preferences, Volume Normalized - Glaser et al, 2001. 23 | * Contact PDB-derived Likelihood Matrix - Singer et al, 2002. 24 | * Residue-residue volume normalized - based on Esque et al, 2010. 25 | 26 | Mutual Information based methods: 27 | * Mutual Information - Gloor el al, 2005. 28 | * MI by pair entropy - Martin el al, 2005. 29 | * Row and column weighed MI - Gouveia-Oliveira et al, 2007. 30 | * Contact preferences, volume normalized MIE - F. Madeira, 2012. 31 | (unpublished) 32 | 33 | Correlation-based methods: 34 | * OMES (Observed Minus Expected Squared) - Kass and Horovitz, 2002. 35 | * Pearson's correlation - Gobel et al, 1994. (slow) 36 | * Spearman's rank correlation - Pazos et al, 1997. (slow) 37 | * McBASC (McLachlan Based Substitution Correlation) - Fodor and 38 | Aldrich, 2004. (slow) 39 | * Quartets - Galitsky, 2002. 
40 | 41 | Perturbation-based methods: 42 | * SCA (Statistical Coupling analysis) - Lockless and Ranganathan, 1999. 43 | As on Halperin et al, 2006. 44 | * ELSC (Explicit Likelihood of Subset Covariation) - Dekker et al, 2004. 45 | """ 46 | 47 | def __init__(self, file1, file2, id1, id2, chain1, chain2, 48 | alignment, coevolution, parameterfile, dirname): 49 | self.file1 = file1 50 | self.file2 = file2 51 | self.id1 = id1 52 | self.id2 = id2 53 | self.chain1 = chain1 54 | self.chain2 = chain2 55 | self.alignment = alignment 56 | self.coevolution = coevolution 57 | self.parameterfile = parameterfile 58 | self.dirname = dirname 59 | 60 | def __call__(self, file1, file2, id1, id2, chain1, chain2, 61 | alignment, coevolution, parameterfile, dirname): 62 | self.file1 = file1 63 | self.file2 = file2 64 | self.id1 = id1 65 | self.id2 = id2 66 | self.chain1 = chain1 67 | self.chain2 = chain2 68 | self.alignment = alignment 69 | self.coevolution = coevolution 70 | self.parameterfile = parameterfile 71 | self.dirname = dirname 72 | 73 | def coevolAnalysis(self, file1, file2, id1, id2, 74 | chain1, chain2, alignment, coevolution): 75 | "Returns a matrix of coevolution scores" 76 | 77 | seq = class_sequence(self.file1, self.file2, self.id1, self.id2, 78 | self.chain1, self.chain2, self.parameterfile, 79 | self.dirname) 80 | aln = class_alignment(self.id1, self.id2, self.alignment, 81 | self.parameterfile, self.dirname) 82 | 83 | alignment1 = aln.cutAlignment(file1, id1, alignment) 84 | alignment2 = aln.cutAlignment(file2, id2, alignment) 85 | 86 | try: 87 | assert len(alignment1) == len(alignment2) 88 | except: 89 | raise StandardError, "Alignments must have the same number of sequences" 90 | 91 | protein1 = [] 92 | protein2 = [] 93 | try: 94 | protein1 = seq.matchResiduePosition(id1, chain1) 95 | protein2 = seq.matchResiduePosition(id2, chain2) 96 | except: 97 | pass 98 | 99 | info = dict() 100 | alignment1 = [e for e in alignment1] 101 | columns1 = transpose(alignment1) 102 
| 103 | alignment2 = [e for e in alignment2] 104 | columns2 = transpose(alignment2) 105 | 106 | if coevolution == "mi": 107 | Flash('Mutual Information') 108 | mi = dict() 109 | pD1 = probabilityDict(columns1) 110 | pD2 = probabilityDict(columns2) 111 | 112 | for i in range(len(columns1)): 113 | Flash('Column ' + str(i)) 114 | for j in range(len(columns2)): 115 | mi[(i, j)] = mutualInformation(i, j, columns1, columns2, pD1, pD2) 116 | 117 | max_pos = [] 118 | for i in range(len(columns1)): 119 | for j in range(len(columns2)): 120 | max_pos.append(mi[(i, j)]) 121 | max_val = max(max_pos) 122 | 123 | for i in range(len(columns1)): 124 | for j in range(len(columns2)): 125 | if mi[(i, j)] != 0.0: 126 | info[(i, j)] = mi[(i, j)] * 1.0 / max_val 127 | else: 128 | info[(i, j)] = 0.0 129 | 130 | elif coevolution == "mie": 131 | Flash('Mutual Information by Pair Entropy') 132 | mie = dict() 133 | pD1 = probabilityDict(columns1) 134 | pD2 = probabilityDict(columns2) 135 | 136 | for i in range(len(columns1)): 137 | Flash('Column ' + str(i)) 138 | for j in range(len(columns2)): 139 | mie[(i, j)] = miEntropy(i, j, columns1, columns2, pD1, pD2) 140 | 141 | max_pos = [] 142 | for i in range(len(columns1)): 143 | for j in range(len(columns2)): 144 | max_pos.append(mie[(i, j)]) 145 | max_val = max(max_pos) 146 | 147 | for i in range(len(columns1)): 148 | for j in range(len(columns2)): 149 | if mie[(i, j)] != 0.0: 150 | info[(i, j)] = mie[(i, j)] * 1.0 / max_val 151 | else: 152 | info[(i, j)] = 0.0 153 | 154 | elif coevolution == "rcwmi": 155 | Flash('Row and Column Weighed Mutual Information') 156 | rcwmi = dict() 157 | pD1 = probabilityDict(columns1) 158 | pD2 = probabilityDict(columns2) 159 | 160 | i_all = dict() 161 | all_j = dict() 162 | for i in range(len(columns1)): 163 | v_i = 0 164 | for j in range(len(columns2)): 165 | v_i += mutualInformation(i, j, columns1, columns2, 166 | pD1, pD2) 167 | i_all[i] = v_i 168 | 169 | for j in range(len(columns2)): 170 | v_j = 0 171 | for i 
in range(len(columns1)): 172 | v_j += mutualInformation(i, j, columns1, columns2, 173 | pD1, pD2) 174 | all_j[j] = v_j 175 | 176 | column = columns1[0] 177 | n = len(column) 178 | for i in range(len(columns1)): 179 | Flash('Column ' + str(i)) 180 | for j in range(len(columns2)): 181 | mi = mutualInformation(i, j, columns1, columns2, 182 | pD1, pD2) 183 | rcwmi[(i, j)] = rowColumnWeighed(mi, 184 | i_all[i], all_j[j], n) 185 | max_pos = [] 186 | for i in range(len(columns1)): 187 | for j in range(len(columns2)): 188 | max_pos.append(rcwmi[(i, j)]) 189 | max_val = max(max_pos) 190 | 191 | for i in range(len(columns1)): 192 | for j in range(len(columns2)): 193 | if rcwmi[(i, j)] != 0.0: 194 | info[(i, j)] = rcwmi[(i, j)] * 1.0 / max_val 195 | else: 196 | info[(i, j)] = 0.0 197 | 198 | elif coevolution == "cpvn": 199 | Flash('Contact Preferences, Volume Normalized') 200 | cpvn = dict() 201 | score_matrix = mapMatrix("CPVN") 202 | for i in range(len(columns1)): 203 | Flash('Column ' + str(i)) 204 | for j in range(len(columns2)): 205 | res1 = str(alignment1[0][i]) 206 | res2 = str(alignment2[0][j]) 207 | average = [] 208 | for a, b in zip(columns1[i], columns2[j]): 209 | if a in aa and b in aa: 210 | average.append(float(matchScore(res1, res2, score_matrix))) 211 | cpvn[(i, j)] = mean(average) 212 | 213 | max_pos = [] 214 | for i in range(len(columns1)): 215 | for j in range(len(columns2)): 216 | max_pos.append(cpvn[(i, j)]) 217 | max_val = max(max_pos) 218 | 219 | for i in range(len(columns1)): 220 | for j in range(len(columns2)): 221 | if cpvn[(i, j)] != 0.0: 222 | info[(i, j)] = cpvn[(i, j)] * 1.0 / max_val 223 | else: 224 | info[(i, j)] = 0.0 225 | 226 | elif coevolution == "clm": 227 | Flash('Contact PDB-derived Likelihood Matrix') 228 | clm = dict() 229 | score_matrix = mapMatrix("CLM") 230 | for i in range(len(alignment1[0])): 231 | Flash('Column ' + str(i)) 232 | for j in range(len(alignment2[0])): 233 | res1 = str(alignment1[0][i]) 234 | res2 = 
str(alignment2[0][j]) 235 | average = [] 236 | for a, b in zip(columns1[i], columns2[j]): 237 | if a in aa and b in aa: 238 | average.append(float(matchScore(res1, res2, score_matrix))) 239 | clm[(i, j)] = mean(average) 240 | 241 | max_pos = [] 242 | for i in range(len(columns1)): 243 | for j in range(len(columns2)): 244 | max_pos.append(clm[(i, j)]) 245 | max_val = max(max_pos) 246 | 247 | for i in range(len(columns1)): 248 | for j in range(len(columns2)): 249 | if clm[(i, j)] != 0.0: 250 | info[(i, j)] = clm[(i, j)] * 1.0 / max_val 251 | else: 252 | info[(i, j)] = 0.0 253 | 254 | elif coevolution == "vol": 255 | Flash('Residue-residue Volume Normalized') 256 | vol = dict() 257 | score_matrix = mapMatrix("VOL") 258 | for i in range(len(alignment1[0])): 259 | Flash('Column ' + str(i)) 260 | for j in range(len(alignment2[0])): 261 | res1 = str(alignment1[0][i]) 262 | res2 = str(alignment2[0][j]) 263 | average = [] 264 | for a, b in zip(columns1[i], columns2[j]): 265 | if a in aa and b in aa: 266 | average.append(float(matchScore(res1, res2, score_matrix))) 267 | vol[(i, j)] = mean(average) 268 | 269 | max_pos = [] 270 | for i in range(len(columns1)): 271 | for j in range(len(columns2)): 272 | max_pos.append(vol[(i, j)]) 273 | max_val = max(max_pos) 274 | 275 | for i in range(len(columns1)): 276 | for j in range(len(columns2)): 277 | if vol[(i, j)] != 0.0: 278 | info[(i, j)] = vol[(i, j)] * 1.0 / max_val 279 | else: 280 | info[(i, j)] = 0.0 281 | 282 | elif coevolution == "omes": 283 | Flash('Observed Minus Expected Squared') 284 | omes = dict() 285 | for i in range(len(columns1)): 286 | Flash('Column ' + str(i)) 287 | for j in range(len(columns2)): 288 | omes[(i, j)] = covarianceOMES(columns1[i], columns2[j]) 289 | 290 | max_pos = [] 291 | for i in range(len(columns1)): 292 | for j in range(len(columns2)): 293 | max_pos.append(omes[(i, j)]) 294 | max_val = max(max_pos) 295 | 296 | for i in range(len(columns1)): 297 | for j in range(len(columns2)): 298 | if omes[(i, 
j)] != 0.0: 299 | info[(i, j)] = omes[(i, j)] * 1.0 / max_val 300 | else: 301 | info[(i, j)] = 0.0 302 | 303 | elif coevolution == "pearson": 304 | Flash("Pearson's correlation") 305 | pearson = dict() 306 | score_matrix = mapMatrix("MCLACHLAN") 307 | N = len(columns1[0]) 308 | for i in range(len(columns1)): 309 | Flash('Column ' + str(i)) 310 | for j in range(len(columns2)): 311 | d_matrix1 = twoDimensionalMatrix(columns1[i], score_matrix) 312 | d_matrix2 = twoDimensionalMatrix(columns2[j], score_matrix) 313 | pearson[(i, j)] = pearsonsCorrelation(d_matrix1, d_matrix2, N) 314 | 315 | max_pos = [] 316 | for i in range(len(columns1)): 317 | for j in range(len(columns2)): 318 | max_pos.append(pearson[(i, j)]) 319 | max_val = max(max_pos) 320 | 321 | for i in range(len(columns1)): 322 | for j in range(len(columns2)): 323 | if pearson[(i, j)] != 0.0: 324 | info[(i, j)] = pearson[(i, j)] * 1.0 / max_val 325 | else: 326 | info[(i, j)] = 0.0 327 | 328 | elif coevolution == "spearman": 329 | Flash("Spearman's rank correlation") 330 | score_matrix = mapMatrix("MCLACHLAN") 331 | spearman = dict() 332 | N = len(columns1[0]) 333 | for i in range(len(columns1)): 334 | Flash('Column ' + str(i)) 335 | for j in range(len(columns2)): 336 | d_matrix1 = twoDimensionalMatrix(columns1[i], score_matrix) 337 | d_matrix2 = twoDimensionalMatrix(columns2[j], score_matrix) 338 | spearman[(i, j)] = spearmansCorrelation(d_matrix1, d_matrix2, N) 339 | 340 | max_pos = [] 341 | for i in range(len(columns1)): 342 | for j in range(len(columns2)): 343 | max_pos.append(spearman[(i, j)]) 344 | max_val = max(max_pos) 345 | 346 | for i in range(len(columns1)): 347 | for j in range(len(columns2)): 348 | if spearman[(i, j)] != 0.0: 349 | info[(i, j)] = spearman[(i, j)] * 1.0 / max_val 350 | else: 351 | info[(i, j)] = 0.0 352 | 353 | elif coevolution == "mcbasc": 354 | Flash('McLachlan Based Substitution Correlation') 355 | mcbasc = dict() 356 | score_matrix = mapMatrix("MCLACHLAN") 357 | N = 
len(columns1[0]) 358 | for i in range(len(columns1)): 359 | Flash('Column ' + str(i)) 360 | for j in range(len(columns2)): 361 | d_matrix1 = twoDimensionalMatrix(columns1[i], score_matrix) 362 | d_matrix2 = twoDimensionalMatrix(columns2[j], score_matrix) 363 | mcbasc[(i, j)] = mcbascCorrelation(d_matrix1, d_matrix2, N) 364 | 365 | max_pos = [] 366 | for i in range(len(columns1)): 367 | for j in range(len(columns2)): 368 | max_pos.append(mcbasc[(i, j)]) 369 | max_val = max(max_pos) 370 | 371 | for i in range(len(columns1)): 372 | for j in range(len(columns2)): 373 | if mcbasc[(i, j)] != 0.0: 374 | info[(i, j)] = mcbasc[(i, j)] * 1.0 / max_val 375 | else: 376 | info[(i, j)] = 0.0 377 | 378 | elif coevolution == "quartets": 379 | Flash('Quartets') 380 | quartets = dict() 381 | for i in range(len(columns1)): 382 | Flash('Column ' + str(i)) 383 | for j in range(len(columns2)): 384 | quartets[(i, j)] = quartetsCorrelation(columns1[i], columns2[j]) 385 | 386 | max_pos = [] 387 | for i in range(len(columns1)): 388 | for j in range(len(columns2)): 389 | max_pos.append(quartets[(i, j)]) 390 | max_val = max(max_pos) 391 | 392 | for i in range(len(columns1)): 393 | for j in range(len(columns2)): 394 | if quartets[(i, j)] != 0.0: 395 | info[(i, j)] = quartets[(i, j)] * 1.0 / max_val 396 | else: 397 | info[(i, j)] = 0.0 398 | 399 | elif coevolution == "sca": 400 | Flash('Statistical Coupling Analysis') 401 | sca = dict() 402 | for i in range(len(columns1)): 403 | Flash('Column ' + str(i)) 404 | for j in range(len(columns2)): 405 | sca[(i, j)] = perturbationSCA(columns1[i], columns2[j], \ 406 | j, columns2) 407 | max_pos = [] 408 | for i in range(len(columns1)): 409 | for j in range(len(columns2)): 410 | max_pos.append(sca[(i, j)]) 411 | max_val = max(max_pos) 412 | 413 | for i in range(len(columns1)): 414 | for j in range(len(columns2)): 415 | if sca[(i, j)] != 0.0: 416 | info[(i, j)] = sca[(i, j)] * 1.0 / max_val 417 | else: 418 | info[(i, j)] = 0.0 419 | 420 | elif coevolution 
== "elsc": 421 | Flash('Explicit Likelihood of Subset Covariation') 422 | elsc = dict() 423 | for i in range(len(columns1)): 424 | Flash('Column ' + str(i)) 425 | for j in range(len(columns2)): 426 | elsc[(i, j)] = perturbationELSC(columns1[i], columns2[j], \ 427 | j, columns2) 428 | max_pos = [] 429 | for i in range(len(columns1)): 430 | for j in range(len(columns2)): 431 | max_pos.append(elsc[(i, j)]) 432 | max_val = max(max_pos) 433 | 434 | for i in range(len(columns1)): 435 | for j in range(len(columns2)): 436 | if elsc[(i, j)] != 0.0: 437 | info[(i, j)] = elsc[(i, j)] * 1.0 / max_val 438 | else: 439 | info[(i, j)] = 0.0 440 | else: pass 441 | 442 | output = self.dirname + alignment + "_" + coevolution + ".txt" 443 | results = open(output, "w") 444 | for i, j in sorted(info.keys()): 445 | if protein1 != [] and protein2 != []: 446 | print >> results, protein1[i], protein2[j], \ 447 | round((info[(i, j)]), 4) 448 | elif protein1 != [] and protein2 == []: 449 | print >> results, protein1[i], protein1[j], \ 450 | round((info[(i, j)]), 4) 451 | else: 452 | print >> results, str(i + 1), str(j + 1), \ 453 | round((info[(i, j)]), 4) 454 | results.close() 455 | 456 | def bestInfo(self, id1, id2, alignment, coevolution): 457 | "Points out the best coevolution scores" 458 | 459 | seq = class_sequence(self.file1, self.file2, self.id1, self.id2, 460 | self.chain1, self.chain2, self.parameterfile, 461 | self.dirname) 462 | 463 | histogram = LP(self.parameterfile, "results_histogram") 464 | heatmap = LP(self.parameterfile, "results_heatmap") 465 | best_info = LP(self.parameterfile, "best_results") 466 | 467 | surface1 = [] 468 | surface2 = [] 469 | interface = [] 470 | try: 471 | surface1 = seq.parseSurfacePDB(id1) 472 | surface2 = seq.parseSurfacePDB(id2) 473 | except: 474 | pass 475 | 476 | try: 477 | interface = seq.parseInterfacePDB(id1) 478 | except: 479 | pass 480 | 481 | input = self.dirname + alignment + "_" + coevolution + ".txt" 482 | output = self.dirname + 
alignment + "_" + coevolution + "_best.txt" 483 | bestResults(input, output, best_info, surface1, surface2, interface) 484 | 485 | if histogram == True: 486 | input = self.dirname + alignment + "_" + coevolution + ".txt" 487 | output = self.dirname + alignment + "_" + coevolution + "_hg.png" 488 | drawHistogram(input, output) 489 | 490 | if heatmap == True: 491 | input = self.dirname + alignment + "_" + coevolution + ".txt" 492 | output = self.dirname + alignment + "_" + coevolution + "_hm.png" 493 | drawHeatmap(id1, id2, input, output) 494 | 495 | 496 | def structureSingle(self, id1, id2, chain1, chain2, alignment, coevolution): 497 | "Structure based results for proteins with single chain" 498 | 499 | structure = LP(self.parameterfile, "results_structure") 500 | best_info = LP(self.parameterfile, "best_results") 501 | 502 | input = self.dirname + alignment + "_" + coevolution + "_best.txt" 503 | input_results = open(input, "r") 504 | results = input_results.readlines() 505 | input_results.close() 506 | 507 | positions1 = [] 508 | positions2 = [] 509 | for line in results: 510 | l = line.rstrip("\n") 511 | l = l.split() 512 | res1 = int(l[0]) 513 | res2 = int(l[1]) 514 | positions1.append(res1) 515 | positions2.append(res2) 516 | 517 | if structure == "pymol": 518 | output1 = self.dirname + id1 + ".pml" 519 | out_struct1 = open(output1, "w") 520 | print >> out_struct1, "load %s" % (id1 + ".pdb") 521 | print >> out_struct1, "hide lines" 522 | print >> out_struct1, "hide nonbonded" 523 | print >> out_struct1, "bg_color black" 524 | print >> out_struct1, "color grey20" 525 | print >> out_struct1, "show cartoon" 526 | print >> out_struct1, "select hitmol, chain %s" % (chain1.lower()) 527 | print >> out_struct1, "color red, (hitmol and resid *)" 528 | for pos in positions1: 529 | if len(positions1) <= 20: 530 | print >> out_struct1, "color yellow, (hitmol and resid %s)" \ 531 | % (str(pos + 1)) 532 | print >> out_struct1, "show spheres, (hitmol and resid %s)" \ 533 | % 
(str(pos + 1)) 534 | else: 535 | print >> out_struct1, "color yellow, (hitmol and resid %s)" \ 536 | % (str(pos + 1)) 537 | print >> out_struct1, "show sticks, (hitmol and resid %s)" \ 538 | % (str(pos + 1)) 539 | out_struct1.close() 540 | 541 | output2 = self.dirname + id2 + ".pml" 542 | out_struct2 = open(output2, "w") 543 | print >> out_struct2, "load %s" % (id2 + ".pdb") 544 | print >> out_struct2, "hide lines" 545 | print >> out_struct2, "hide nonbonded" 546 | print >> out_struct2, "bg_color black" 547 | print >> out_struct2, "color grey20" 548 | print >> out_struct2, "show cartoon" 549 | print >> out_struct2, "select hitmol, chain %s" % (chain2.lower()) 550 | print >> out_struct2, "color blue, (hitmol and resid *)" 551 | for pos in positions2: 552 | if best_info <= 20: 553 | print >> out_struct2, "color green, (hitmol and resid %s)" \ 554 | % (str(pos + 1)) 555 | print >> out_struct2, "show spheres, (hitmol and resid %s)" \ 556 | % (str(pos + 1)) 557 | else: 558 | print >> out_struct2, "color green, (hitmol and resid %s)" \ 559 | % (str(pos + 1)) 560 | print >> out_struct2, "show sticks, (hitmol and resid %s)" \ 561 | % (str(pos + 1)) 562 | out_struct2.close() 563 | else: pass 564 | 565 | #copyfile(self.dirname + id1 + ".pdb", self.dirname + id1 + ".pdb") 566 | #copyfile(self.dirname + id2 + ".pdb", self.dirname + id2 + ".pdb") 567 | 568 | def structurePair(self, id1, id2, chain1, chain2, alignment, coevolution): 569 | "Structure based results for a protein with two chains" 570 | 571 | structure = LP(self.parameterfile, "results_structure") 572 | best_info = LP(self.parameterfile, "best_results") 573 | 574 | input = self.dirname + alignment + "_" + coevolution + "_best.txt" 575 | input_results = open(input, "r") 576 | results = input_results.readlines() 577 | input_results.close() 578 | 579 | positions1 = [] 580 | positions2 = [] 581 | for line in results: 582 | l = line.rstrip("\n") 583 | l = l.split() 584 | res1 = int(l[0]) 585 | res2 = int(l[1]) 586 | 
positions1.append(res1) 587 | positions2.append(res2) 588 | 589 | if structure == "pymol": 590 | output = self.dirname + id1 + ".pml" 591 | 592 | out_struct = open(output, "w") 593 | print >> out_struct, "load %s" % (id1 + ".pdb") 594 | print >> out_struct, "hide lines" 595 | print >> out_struct, "hide nonbonded" 596 | print >> out_struct, "bg_color black" 597 | print >> out_struct, "color grey20" 598 | print >> out_struct, "show cartoon" 599 | print >> out_struct, "select hitmol1, chain %s" % (chain1.lower()) 600 | print >> out_struct, "select hitmol2, chain %s" % (chain2.lower()) 601 | print >> out_struct, "color red, (hitmol1)" 602 | print >> out_struct, "color blue, (hitmol2)" + "\n" 603 | for pos in positions1: 604 | if best_info <= 20: 605 | print >> out_struct, "color yellow, (hitmol1 and resid %s)" \ 606 | % (str(pos + 1)) 607 | print >> out_struct, "show spheres, (hitmol1 and resid %s)" \ 608 | % (str(pos + 1)) 609 | else: 610 | print >> out_struct, "color yellow, (hitmol1 and resid %s)" \ 611 | % (str(pos + 1)) 612 | print >> out_struct, "show sticks, (hitmol1 and resid %s)" \ 613 | % (str(pos + 1)) 614 | 615 | for pos in positions2: 616 | if best_info <= 20: 617 | print >> out_struct, "color green, (hitmol2 and resid %s)" \ 618 | % (str(pos + 1)) 619 | print >> out_struct, "show spheres, (hitmol2 and resid %s)" \ 620 | % (str(pos + 1)) 621 | else: 622 | print >> out_struct, "color green, (hitmol2 and resid %s)" \ 623 | % (str(pos + 1)) 624 | print >> out_struct, "show sticks, (hitmol2 and resid %s)" \ 625 | % (str(pos + 1)) 626 | out_struct.close() 627 | else: 628 | pass 629 | 630 | #copyfile(self.dirname + id1 + ".pdb", self.dirname + id1 + ".pdb") 631 | 632 | def matchScore(alpha, beta, score_matrix): 633 | "Matches scores from a matrix" 634 | 635 | alphabet = {} 636 | alphabet["I"] = 0 637 | alphabet["V"] = 1 638 | alphabet["L"] = 2 639 | alphabet["F"] = 3 640 | alphabet["C"] = 4 641 | alphabet["M"] = 5 642 | alphabet["A"] = 6 643 | alphabet["G"] = 7 
# Row/column index of each residue for matrices laid out in the
# A R N D C Q E G H I L K M F P S T W Y V order.  Built once at import
# time: twoDimensionalMatrix calls matchScore2 for every ordered pair of
# residues in a column, so rebuilding the table per call was pure overhead.
_MATCHSCORE2_INDEX = dict(
    (residue, index) for index, residue in enumerate("ARNDCQEGHILKMFPSTWYV"))


def matchScore2(alpha, beta, score_matrix):
    """Matches scores from a matrix - different residue order.

    alpha, beta  -- one-letter residue codes (one of the 20 letters in
                    "ARNDCQEGHILKMFPSTWYV").
    score_matrix -- two-dimensional, row-indexable matrix whose rows and
                    columns follow that same residue order.

    Returns score_matrix[row of alpha][column of beta].

    Raises KeyError when alpha or beta is not one of the 20 residues
    (gaps and ambiguity codes are not part of this alphabet).
    """
    lut_x = _MATCHSCORE2_INDEX[alpha]
    lut_y = _MATCHSCORE2_INDEX[beta]

    return score_matrix[lut_x][lut_y]
def probabilityDict(columns):
    """Caches character probabilities for each column.

    columns -- sequence of strings, one per alignment column; every
               character of a column is counted (nothing is filtered out).

    Returns a list with one dict per column, mapping each character that
    occurs in the column to count / n, where n is the length of the first
    column.  An empty 'columns' gives an empty list.
    """
    from collections import Counter

    # Nothing to cache; indexing columns[0] below would raise IndexError.
    if not columns:
        return []

    n = len(columns[0])
    pD = list()
    for col in columns:
        # One pass per column instead of one str.count() scan per distinct
        # character.
        counts = Counter(col)
        pD.append(dict((char, count * 1.0 / n)
                       for char, count in counts.items()))
    return pD
773 | MI(X,Y) = (H(X) + H(Y) - H(X,Y)) / H(X,Y) 774 | MI(X,Y) = (SUMSUM P(x,y).log20(P(x,y)/P(x).P(y))) / 775 | -(SUMSUM P(x,y).log20(P(x,y))) 776 | """ 777 | 778 | col1, col2 = cols1[i], cols2[j] 779 | assert len(col1) == len(col2) 780 | n = len(col1) 781 | mi = 0 782 | entropy = 0 783 | pairs = [col1[k] + col2[k] for k in range(n)] 784 | pL = sorted(list(set(pairs))) 785 | for p in pL: 786 | pXY = pairs.count(p) * 1.0 / n 787 | pX = pD1[i][p[0]] 788 | pY = pD2[j][p[1]] 789 | inside = (pXY * 1.0) / (pX * pY) 790 | outside = pXY * log21(inside) 791 | mi += outside 792 | for p in pL: 793 | pXY = pairs.count(p) * 1.0 / n 794 | inside = pXY 795 | outside = pXY * log21(inside) 796 | entropy += outside 797 | entropy = -entropy 798 | if entropy == 0.0: 799 | mi_entropy = 0.0 800 | else: mi_entropy = mi / entropy 801 | return mi_entropy 802 | 803 | def rowColumnWeighed(mi, i_all, all_j, n): 804 | """ 805 | Row and Column weighed Mutual Information - Gouveia- 806 | Oliveira et al, 2007. 807 | RCW(X,Y) = MI(X,Y) / 808 | (((MI(X,all) + MI(all,Y) - 2MI(X,Y))/(n-1)) 809 | """ 810 | 811 | bottom = (i_all + all_j - 2.0 * mi) / (n - 1) 812 | if bottom == 0.0: 813 | rcwmi = 0.0 814 | else: rcwmi = mi / bottom 815 | 816 | return rcwmi 817 | 818 | def covarianceOMES(column1, column2): 819 | """ 820 | Normalized Covariance analysis; OMES - Observed Minus Expected Squared 821 | derived from the covariance method of Kass and Horovitz, 2002 822 | """ 823 | 824 | assert len(column1) == len(column2) 825 | 826 | L = [] 827 | Nvalid = [] 828 | Cxi = [] 829 | Cyj = [] 830 | for i, j in zip(column1, column2): 831 | if i in aa and j in aa: 832 | value = [i, j] 833 | Nvalid.append(value) 834 | Cxi.append(i) 835 | Cyj.append(j) 836 | if value not in L: 837 | L.append(value) 838 | 839 | len_Nvalid = len(Nvalid) 840 | omes = 0.0 841 | for value in L: 842 | Nobs = Nvalid.count(value) 843 | i = value[0] 844 | j = value[1] 845 | Ci = Cxi.count(i) 846 | Cj = Cyj.count(j) 847 | Nex = Ci * Cj / 
def pearsonsCorrelation(d_matrix1, d_matrix2, N):
    """
    Pearson's Correlation (Gobel method) - Gobel et al, 1994.

    d_matrix1, d_matrix2 -- equal-length sequences of substitution scores
    N                    -- normalisation size; the result is scaled by 1/N^2

    Returns 0.0 for empty input or when either series has zero variance.
    """
    assert len(d_matrix1) == len(d_matrix2)

    length = len(d_matrix1)
    if length == 0:
        return 0.0

    # Weight: fraction of positions where the two series differ.
    no_match = 0.0
    for k, l in zip(d_matrix1, d_matrix2):
        if k != l:
            no_match += 1.0
    Wkl = no_match * 1.0 / length

    av_Si = mean(d_matrix1)
    # The second series is centred on its own mean; the original reused
    # mean(d_matrix1) here.
    av_Sj = mean(d_matrix2)

    top = 0.0
    for i, j in zip(d_matrix1, d_matrix2):
        top += float((i - av_Si) * (j - av_Sj) * Wkl)

    bottom = std(d_matrix1) * std(d_matrix2)
    if bottom == 0.0:
        return 0.0
    return (1.0 / N ** 2) * (top / bottom)
def _frequencyRanks(values):
    "Relative frequency of each distinct value, in order of first appearance"
    from collections import Counter

    counts = Counter(values)
    total = len(values)
    seen = set()
    ranks = []
    for v in values:
        if v not in seen:
            seen.add(v)
            ranks.append(counts[v] * 1.0 / total)
    return ranks


def spearmansCorrelation(d_matrix1, d_matrix2, N):
    """
    Spearman's rank Correlation - Pazos et al, 1997.

    The "ranks" used here are the relative frequencies of the distinct
    values of each series (in order of first appearance), centred on the
    mean of the corresponding raw series.
    NOTE(review): this is not a textbook rank transform; kept as
    implemented - confirm against the reference before changing it.

    Returns 0.0 for empty input or when either series has zero variance.
    """
    assert len(d_matrix1) == len(d_matrix2)

    length = len(d_matrix1)
    if length == 0:
        return 0.0

    rank_matrix1 = _frequencyRanks(d_matrix1)
    rank_matrix2 = _frequencyRanks(d_matrix2)

    # Weight: fraction of positions where the two series differ.
    no_match = 0.0
    for k, l in zip(d_matrix1, d_matrix2):
        if k != l:
            no_match += 1.0
    Wkl = no_match * 1.0 / length

    av_Si = mean(d_matrix1)
    # Centre the second series on its own mean; the original used
    # mean(d_matrix1), which shifts every Sj term.
    av_Sj = mean(d_matrix2)

    top = 0.0
    # zip stops at the shorter rank list, as in the original.
    for ri, rj in zip(rank_matrix1, rank_matrix2):
        top += float((ri - av_Si) * (rj - av_Sj) * Wkl)

    bottom = std(d_matrix1) * std(d_matrix2)
    if bottom == 0.0:
        return 0.0
    return (1.0 / N ** 2) * (top / bottom)
def mcbascCorrelation(d_matrix1, d_matrix2, N):
    """
    McBASC - McLachlan Based Substitution Correlation.
    Fodor and Aldrich, 2004.

    Returns the absolute correlation scaled by 1/N^2, or 0.0 for empty
    input or when either series has zero variance.
    """
    assert len(d_matrix1) == len(d_matrix2)

    if len(d_matrix1) == 0:
        return 0.0

    av_Si = mean(d_matrix1)
    # Each series is centred on its own mean (the original reused
    # mean(d_matrix1) for the second one).
    av_Sj = mean(d_matrix2)

    top = 0.0
    for i, j in zip(d_matrix1, d_matrix2):
        top += float((i - av_Si) * (j - av_Sj))

    bottom = std(d_matrix1) * std(d_matrix2)
    if bottom == 0.0:
        return 0.0
    return abs((1.0 / N ** 2) * (top / bottom))


def quartetsCorrelation(column1, column2, alphabet=None):
    """
    Normalized Quartets correlation method by Galitsky, 2002.

    Counts the rows (i, j) whose residue frequencies satisfy the quartet
    conditions below.  Rows containing a symbol outside `alphabet`
    (default: the module-level `aa`) are ignored.
    """
    from collections import Counter

    assert len(column1) == len(column2)
    if alphabet is None:
        alphabet = aa

    pairs = list(zip(column1, column2))
    total = len(pairs)
    # Pre-count once; the original called .count() five times per row.
    x_counts = Counter(column1)
    y_counts = Counter(column2)
    pair_counts = Counter(pairs)

    quartets = 0.0
    for i, j in pairs:
        if i not in alphabet or j not in alphabet:
            continue
        Pix = x_counts[i]
        Piy = y_counts[i]
        Pjx = x_counts[j]
        Pjy = y_counts[j]
        Dmin = pair_counts[(i, j)]
        Dif = 1.0 * (total - Dmin)
        if Dif != 0.0:
            DQmin = Dmin * 1.0 / Dif
        else:
            DQmin = 0.0

        direct = Pix * Pjy      # always >= 1: i occurs in column1, j in column2
        crossed = Piy * Pjx
        skewed = ((direct > crossed and (Pix > Dmin or Pjy > Dmin)) or
                  (direct < crossed and (Piy > Dmin or Pjx > Dmin)))
        if not skewed:
            continue
        if crossed == 0:
            # The original hit a ZeroDivisionError here and swallowed it
            # with a bare except, i.e. such rows were never counted.
            continue
        if (direct * 1.0 / crossed > DQmin) or (crossed * 1.0 / direct > DQmin):
            quartets += 1.0
    return quartets
def perturbationSCA(column1, column2, j, columns2, alphabet=None):
    """
    Normalized SCA - Statistical Coupling analysis, Lockless and
    Ranganathan, 1999. As on Halperin et al, 2006.

    column2 is perturbed by keeping only the sequences that carry its most
    frequent residue (see subAlignment); column j of that sub-alignment is
    then compared with column1.  Returns 0.0 when the sub-alignment is
    empty.  `alphabet` defaults to the module-level `aa`.
    """
    from collections import Counter

    assert len(column1) == len(column2)
    if alphabet is None:
        alphabet = aa

    x = column1
    y = subAlignment(column2, columns2, alphabet)[j]
    if len(x) == 0 or len(y) == 0:
        return 0.0

    x_counts = Counter(x)
    y_counts = Counter(y)
    len_x = len(x)
    len_y = len(y)

    inside = 0.0
    for i in x:
        if i in alphabet:
            Pix = x_counts[i] * 1.0 / len_x
            Pixj = y_counts[i] * 1.0 / len_y
            if Pixj != 0.0:
                # natural log, equivalent to the module's ln()
                inside += (log(Pixj) - Pix) ** 2

    return sqrt(inside)


def perturbationELSC(column1, column2, j, columns2):
    """
    Normalized ELSC - Explicit Likelihood of Subset Covariation,
    Dekker et al, 2004.

    Returns -ln of the product of binomial-coefficient ratios, or 0.0 when
    that product is zero.
    """
    assert len(column1) == len(column2)

    x = column1
    y1 = column2
    y2 = subAlignment2(column1, column2, columns2)[j]
    Nall = len(y1)
    nall = len(y2)

    product = 1.0
    for i in x:
        if i in aa:
            Nxj = y1.count(i)
            nxj = y2.count(i)
            mxj = int(round((Nxj * 1.0 / Nall) * nall))
            top = factorial(Nxj)
            # Binomial coefficients are exact integers: use // so the result
            # does not depend on the interpreter's "/" semantics.
            comb_x = top // (factorial(nxj) * factorial(Nxj - nxj))
            comb_all = top // (factorial(mxj) * factorial(Nxj - mxj))
            product *= (comb_x * 1.0 / comb_all)

    if product != 0.0:
        return -log(product)
    return 0.0


def subAlignment(column, columns, alphabet=None):
    """
    Creates a sub_alignment based on the most frequent AA in column.

    Keeps, for every column in `columns`, only the rows where `column`
    holds its most frequent valid residue (first one in the column wins a
    tie).  Returns empty sub-columns when `column` has no valid residue.
    `alphabet` defaults to the module-level `aa`.
    """
    from collections import Counter

    if alphabet is None:
        alphabet = aa

    counts = Counter(res for res in column if res in alphabet)
    if not counts:
        return [[] for _ in columns]

    # The original sorted ascending by frequency and took the first entry,
    # i.e. the LEAST frequent residue, contradicting its own docstring.
    best = max(counts.values())
    aa_j = next(res for res in column
                if res in alphabet and counts[res] == best)

    col_positions = [pos for pos, res in enumerate(column) if res == aa_j]
    return [[col[pos] for pos in col_positions] for col in columns]
def subAlignment2(column1, column2, columns, alphabet=None):
    """
    Creates a sub_alignment based on AA identity of column1.

    Keeps, for every column in `columns`, the rows where column2 holds a
    residue that also occurs (as a valid residue) in column1.
    `alphabet` defaults to the module-level `aa`.
    """
    if alphabet is None:
        alphabet = aa

    wanted = set(res for res in column1 if res in alphabet)
    col_positions = [pos for pos, res in enumerate(column2) if res in wanted]
    return [[col[pos] for pos in col_positions] for col in columns]


def bestResults(input, output, best_info, surface1, surface2, interface):
    """
    Creates a new list of best coevolution scores.

    input      -- results file, one "res1 res2 score" triple per line
    output     -- file receiving at most `best_info` top-scoring pairs,
                  written in input order; pairs listed in `interface`
                  are tagged "Interface contact"
    best_info  -- number of pairs to keep; values larger than the number
                  of pairs keep everything, values <= 0 keep nothing
    surface1, surface2 -- accepted for compatibility; the original code
                  appended every pair regardless of these lists, and that
                  behaviour is kept
    """
    rows = []
    with open(input, "r") as input_results:
        for line in input_results:
            fields = line.split()
            if not fields:      # blank or whitespace-only line
                continue
            rows.append([int(fields[0]), int(fields[1]), float(fields[2])])

    with open(output, "w") as out_best:
        if not rows or best_info <= 0:
            return

        ranked = sorted(row[2] for row in rows)
        # Clamp the index: a negative value used to wrap around and pick
        # a wrong threshold (or raise IndexError).
        threshold = ranked[max(len(ranked) - best_info, 0)]

        count = 0
        for res1, res2, mi in rows:
            if mi < threshold:
                continue
            count += 1
            if count > best_info:
                break
            if [res1, res2] in interface:
                out_best.write("%s %s %s Interface contact\n" % (res1, res2, mi))
            else:
                out_best.write("%s %s %s\n" % (res1, res2, mi))
def drawHistogram(input, output):
    """
    Creates a histogram of coevolution scores.

    input  -- results file, one "res1 res2 score" triple per line
    output -- image file written with pyplot.savefig
    Nothing is drawn when the file holds no scores.
    """
    scores = []
    with open(input, "r") as input_results:
        for line in input_results:
            fields = line.split()
            if not fields:
                continue
            scores.append(float(fields[2]))

    if not scores:
        return

    maxi = max(scores)
    # hist() needs an integer bin count; keep ~50 bins per score unit.
    bins = max(int(maxi * 50), 1)

    pyplot.figure()
    pyplot.hist(scores, bins=bins)
    ax = pyplot.gca()   # current axes; axes() may create a new one
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.set_xlim(0, maxi)
    pyplot.savefig(output)
    pyplot.close()      # do not accumulate open figures across calls


def drawHeatmap(id1, id2, input, output):
    """
    Creates a heatmap of coevolution scores.

    id1, id2 -- axis labels (x and y)
    input    -- results file, one "res1 res2 score" triple per line
    output   -- image file written with pyplot.savefig
    Nothing is drawn when the file holds no scores.
    """
    data = []
    with open(input, "r") as input_results:
        for line in input_results:
            fields = line.split()
            if not fields:
                continue
            data.append((int(fields[0]), int(fields[1]), float(fields[2])))

    if not data:
        return

    # Axis limits come from the first and last record, as before.
    startX, startY = data[0][0], data[0][1]
    endX, endY = data[-1][0], data[-1][1]

    # The matrix is indexed by residue NUMBER, so size it by the largest
    # number; sizing by the count of distinct residues overflowed whenever
    # numbering did not run 1..n without gaps.
    maxX = max(row[0] for row in data)
    maxY = max(row[1] for row in data)
    heatmap = zeros((maxY + 1, maxX + 1))
    for X, Y, XY in data:
        heatmap[Y][X] = XY

    pyplot.figure()
    pyplot.pcolormesh(heatmap)
    pyplot.colorbar()
    ax = pyplot.gca()
    ax.set_xlabel(id1)
    ax.set_ylabel(id2)
    ax.set_xlim(startX, endX)
    ax.set_ylim(startY, endY)
    pyplot.savefig(output)
    pyplot.close()
class information:
    """
    Main code for generating extended results.
    """
    def __init__(self, id1, id2, chain1, chain2, dirname):
        self.id1 = id1
        self.id2 = id2
        self.chain1 = chain1
        self.chain2 = chain2
        self.dirname = dirname

    def __call__(self, id1, id2, chain1, chain2, dirname):
        "Re-initialises the instance with new values"
        self.__init__(id1, id2, chain1, chain2, dirname)

    def getInfo(self, id):
        """
        Appends a summary row for `id` to <dirname>results.txt: length of
        the first sequence in <id>.fasta, number of records in <id>.blast
        and number of records in <id>.fasta.
        """
        fasta = self.dirname + id + ".fasta"
        length = 0          # stays 0 if the FASTA file holds no record
        organisms = 0
        for record in SeqIO.parse(fasta, "fasta"):
            if organisms == 0:
                length = len(record.seq)
            organisms += 1

        blast = self.dirname + id + ".blast"
        hit = sum(1 for _ in SeqIO.parse(blast, "fasta"))

        with open(self.dirname + "results.txt", "a") as out:
            out.write("ID" + "\t" + "LengSeq" + "\t" + "NHits" + "\t" +
                      "NOrganisms" + "\n")
            # The row is followed by one empty line, as before.
            out.write(str(id) + "\t" + str(length) + "\t" +
                      str(hit) + "\t" + str(organisms) + "\n" + "\n")

    def _siftsLookup(self, filename, pdb_id, chain, columns):
        """
        Scans ./SIFTS/<filename> (tab separated) for lines starting with
        pdb_id and, unless chain is None, whose second field equals chain.
        Returns the requested columns of the LAST matching line, or
        "Not_found" for each column when nothing matches.
        """
        found = ["Not_found"] * len(columns)
        with open("./SIFTS/" + filename, "r") as sifts:
            for line in sifts:
                if line[0:4] != pdb_id:
                    continue
                fields = line.rstrip("\n").split("\t")
                if chain is not None and (len(fields) < 2 or
                                          fields[1] != chain):
                    continue
                if len(fields) <= max(columns):
                    continue    # truncated line: ignore instead of crashing
                found = [str(fields[c]) for c in columns]
        return found

    def getSIFTS(self, id, chain):
        """
        Web_Services based on SIFTS @
        http://www.ebi.ac.uk/pdbe/docs/sifts/

        Looks `id`/`chain` up in the local ./SIFTS/*.lst tables and appends
        one annotated row to <dirname>bioresults.txt.
        """
        id = id.lower()
        # Drop the "_1"/"_2" copy suffix.  str.rstrip("_1") strips a SET of
        # characters, which turned e.g. "1a11_1" into "1a".
        for suffix in ("_1", "_2"):
            if id.endswith(suffix):
                id = id[:-len(suffix)]
                break
        chain = str(chain)

        unip, scop = self._siftsLookup("pdb_chain_scop_uniprot.lst",
                                       id, chain, [2, 5])
        cath, = self._siftsLookup("pdb_chain_cath_uniprot.lst", id, chain, [4])
        enz, = self._siftsLookup("pdb_chain_enzyme.lst", id, chain, [4])
        inter, = self._siftsLookup("pdb_chain_interpro.lst", id, chain, [2])
        pfam, = self._siftsLookup("pdb_chain_pfam.lst", id, chain, [4])
        taxid, taxnm = self._siftsLookup("pdb_chain_taxonomy.lst",
                                         id, chain, [2, 7])
        # The PubMed table is per entry, not per chain.
        pubm, = self._siftsLookup("pdb_pubmed.lst", id, None, [2])

        header = ["Protein_ID", "Uniprot", "SCOP", "CATH", "Enzyme_EC",
                  "Interpro", "Pfam", "Taxonomy_id", "Taxonomy_name",
                  "Pubmed"]
        row = [str(id), unip, scop, cath, enz, inter, pfam, taxid, taxnm, pubm]

        with open(self.dirname + "bioresults.txt", "a") as out:
            out.write("\t".join(header) + "\n")
            # The row is followed by one empty line, as before.
            out.write("\t".join(row) + "\n" + "\n")
class main:
    """
    Main script caller: drives the sequence, PSI-BLAST, organism,
    alignment, coevolution and info stages for one pair of proteins.

    When both ids are equal the two partners are labelled "<id>_1" and
    "<id>_2" in the later stages.
    """
    def __init__(self, file1, file2, id1, id2, chain1, chain2, parameterfile,
                 psiblast, alignment, coevolution, dirname):
        self.file1 = str(file1)
        self.file2 = str(file2)
        self.id1 = str(id1)
        self.id2 = str(id2)
        self.chain1 = str(chain1)
        self.chain2 = str(chain2)
        self.parameterfile = str(parameterfile)
        self.psiblast = str(psiblast)
        self.alignment = str(alignment)
        self.coevolution = str(coevolution)
        self.dirname = str(dirname)

    def __call__(self, file1, file2, id1, id2, chain1, chain2, parameterfile,
                 psiblast, alignment, coevolution, dirname):
        "Re-initialises the instance with new values"
        self.__init__(file1, file2, id1, id2, chain1, chain2, parameterfile,
                      psiblast, alignment, coevolution, dirname)

    def _noChains(self):
        "True when neither protein has a chain (FASTA input)"
        return self.chain1 == "" and self.chain2 == ""

    def _pairIds(self):
        "Labels of the two partners used from the PSI-BLAST stage onwards"
        if self.id1 != self.id2:
            return self.id1, self.id2
        return self.id1 + "_1", self.id1 + "_2"

    def sequenceSripts(self):
        "Validates the inputs and extracts the query sequences"
        seq = sequence(self.file1, self.file2, self.id1, self.id2,
                       self.chain1, self.chain2, self.parameterfile,
                       self.dirname)
        distinct = self.id1 != self.id2

        if self._noChains():
            jobs = [(self.file1, self.id1)]
            if distinct:
                jobs.append((self.file2, self.id2))
            for path, ident in jobs:
                seq.validFASTA(path, ident)
                seq.queryFASTA(path, ident)
            return

        # (file, id to validate, label for the outputs, chain)
        if distinct:
            jobs = [(self.file1, self.id1, self.id1, self.chain1),
                    (self.file2, self.id2, self.id2, self.chain2)]
        elif self.chain1 != self.chain2:
            jobs = [(self.file1, self.id1, self.id1 + "_1", self.chain1),
                    (self.file1, self.id1, self.id1 + "_2", self.chain2)]
        else:
            jobs = [(self.file1, self.id1, self.id1, self.chain1)]
        for path, ident, label, chain in jobs:
            seq.validPDB(path, ident, chain)
            seq.sequencePDB(path, label, chain)
            seq.surfacePDB(path, label, chain)
        return

    def psiblastSripts(self):
        "Runs PSI-BLAST for both partners and extracts the hit sequences"
        seq = sequence(self.file1, self.file2, self.id1, self.id2,
                       self.chain1, self.chain2, self.parameterfile,
                       self.dirname)
        blast = psiblast(self.id1, self.id2, self.psiblast,
                         self.parameterfile, self.dirname)
        # Same id and same chain (including "no chain at all"): only one
        # sequence exists, so it is duplicated first.
        if self.id1 == self.id2 and self.chain1 == self.chain2:
            seq.copySequence(self.id1)

        partners = self._pairIds()
        for ident in partners:
            blast.searchPSIBLAST(ident, self.psiblast)
        for ident in partners:
            blast.validXML(ident)
        for ident in partners:
            blast.sequencesXML(ident, self.psiblast)
        return

    def organismSripts(self):
        "Matches organisms between both partners and filters the sequences"
        org = organism(self.id1, self.id2, self.psiblast,
                       self.parameterfile, self.dirname)
        first, second = self._pairIds()
        org.uniqueOrganism(first, second)
        org.pairwiseDistance(first, second)
        org.getsCorrelation()
        org.removeSequences(first, second)
        return

    def alignmentSripts(self):
        "Computes the multiple sequence alignment of each partner"
        aln = alignment(self.id1, self.id2, self.alignment,
                        self.parameterfile, self.dirname)
        for ident in self._pairIds():
            aln.computeAlignment(ident, self.alignment)
        return

    def coevolutionSripts(self):
        "Runs the coevolution analysis and the structure mapping"
        coevol = coevolution(self.file1, self.file2, self.id1, self.id2,
                             self.chain1, self.chain2, self.alignment,
                             self.coevolution, self.parameterfile,
                             self.dirname)
        first, second = self._pairIds()
        if self.id1 != self.id2:
            coevol.coevolAnalysis(self.file1, self.file2, first, second,
                                  self.chain1, self.chain2,
                                  self.alignment, self.coevolution)
            coevol.bestInfo(first, second, self.alignment, self.coevolution)
            if not self._noChains():
                coevol.structureSingle(self.id1, self.id2,
                                       self.chain1, self.chain2,
                                       self.alignment, self.coevolution)
        else:
            coevol.coevolAnalysis(self.file1, self.file1, first, second,
                                  self.chain1, self.chain2,
                                  self.alignment, self.coevolution)
            coevol.bestInfo(first, second, self.alignment, self.coevolution)
            # Differing chains implies at least one chain is set.
            if self.chain1 != self.chain2:
                coevol.structurePair(self.id1, self.id1,
                                     self.chain1, self.chain2,
                                     self.alignment, self.coevolution)
        return

    def infoScripts(self, SIFTS):
        "Writes the summary rows and, when enabled, the SIFTS annotations"
        info = information(self.id1, self.id2, self.chain1, self.chain2,
                           self.dirname)

        results_sifts = LP(self.parameterfile, "results_sifts")
        with_sifts = results_sifts == True and SIFTS == True

        if self.id1 != self.id2:
            targets = [(self.id1, self.chain1), (self.id2, self.chain2)]
        elif self.chain1 == self.chain2:
            targets = [(self.id1 + "_1", self.chain1)]
        else:
            targets = [(self.id1 + "_1", self.chain1),
                       (self.id1 + "_2", self.chain2)]

        for ident, chain in targets:
            info.getInfo(ident)
        # SIFTS annotations only exist for PDB (chain) input.
        if with_sifts and not self._noChains():
            for ident, chain in targets:
                info.getSIFTS(ident, chain)
        return
# Methods of class organism.
def __init__(self, id1, id2, psiblast, parameterfile, dirname):
    self.id1 = id1
    self.id2 = id2
    self.psiblast = psiblast
    self.parameterfile = parameterfile
    self.dirname = dirname

def __call__(self, id1, id2, psiblast, parameterfile, dirname):
    "Re-initialises the instance with new values"
    self.__init__(id1, id2, psiblast, parameterfile, dirname)

def uniqueOrganism(self, id1, id2):
    """
    Removes unmatched organisms and concatenates sequences.

    Reads <id1>.blast and <id2>.blast, keeps the organisms present in
    both, orders them by the sum of their positions in the two hit lists
    and stores, per organism, all its sequences joined with ":" in
    self.ord_sequences1 / self.ord_sequences2 (lists of [organism, seqs]).

    Raises StandardError when fewer than 15 organisms match.
    """
    input1 = self.dirname + id1 + ".blast"
    input2 = self.dirname + id2 + ".blast"

    ord_dict1 = orderedDict(SeqIO.parse(input1, "fasta", IUPAC.protein),
                            key_function=checkOrganism)
    ord_dict2 = orderedDict(SeqIO.parse(input2, "fasta", IUPAC.protein),
                            key_function=checkOrganism)

    org1 = [ord_dict1[key].description
            for key in ord_dict1.keys() if key in ord_dict2]
    org2 = [ord_dict2[key].description
            for key in ord_dict2.keys() if key in ord_dict1]

    if org1 == [] or org2 == []:
        raise StandardError("There is no matching organisms")
    elif len(org1) < 15 or len(org2) < 15:
        raise StandardError("Number of matching organisms <15")

    # Position of the first occurrence of each description, looked up in
    # O(1) instead of repeated list.index() scans.
    first1 = {}
    for pos, org in enumerate(org1):
        first1.setdefault(org, pos)
    first2 = {}
    for pos, org in enumerate(org2):
        first2.setdefault(org, pos)

    ranked = sorted([first1[org] + first2[org], first1[org], first2[org], org]
                    for org in org1 if org in first2)
    organisms = [entry[3] for entry in ranked]

    self.ord_sequences1 = self._groupSequences(input1, organisms)
    self.ord_sequences2 = self._groupSequences(input2, organisms)

    return self.ord_sequences1, self.ord_sequences2

def _groupSequences(self, path, organisms):
    "Returns [organism, 'seq1:seq2:...:'] for each organism, in order"
    wanted = set(organisms)
    grouped = {}
    for record in SeqIO.parse(path, "fasta", IUPAC.protein):
        org = str(record.description)
        if org in wanted:
            grouped.setdefault(org, []).append(str(record.seq))
    # Every sequence keeps its trailing ":" separator, as before.
    return [[org, "".join(seq + ":" for seq in grouped.get(org, []))]
            for org in organisms]
85 | org = index[3] 86 | organism.append(org) 87 | 88 | input_sequences1 = SeqIO.parse(input1, "fasta", IUPAC.protein) 89 | sequences1 = [] 90 | for record in input_sequences1: 91 | org = str(record.description) 92 | seq = str(record.seq) 93 | if org in org1: 94 | value = [org, seq] 95 | sequences1.append(value) 96 | 97 | input_sequences2 = SeqIO.parse(input2, "fasta", IUPAC.protein) 98 | sequences2 = [] 99 | for record in input_sequences2: 100 | org = str(record.description) 101 | seq = str(record.seq) 102 | if org in org2: 103 | value = [org, seq] 104 | sequences2.append(value) 105 | 106 | self.ord_sequences1 = [] 107 | self.ord_sequences2 = [] 108 | for org in organism: 109 | seq = "" 110 | for o in sequences1: 111 | organ = o[0] 112 | seque = o[1] 113 | if org == organ: 114 | seq += seque + ":" 115 | value = [org, seq] 116 | self.ord_sequences1.append(value) 117 | seq = "" 118 | for o in sequences2: 119 | organ = o[0] 120 | seque = o[1] 121 | if org == organ: 122 | seq += seque + ":" 123 | value = [org, seq] 124 | self.ord_sequences2.append(value) 125 | 126 | return self.ord_sequences1, self.ord_sequences2 127 | 128 | 129 | def pairwiseDistance(self, id1, id2, method=None): 130 | """ 131 | Calculates distance between each pair by diferent methods: 132 | ClustalW distance, p-distance, Jukes-Cantor and Alignment score, 133 | with BLOSUM62 or PAM250 matrix. 
134 | (edit Parameters.py) 135 | """ 136 | 137 | method = LP(self.parameterfile, "pairwise_distance") 138 | align_matrix = LP(self.parameterfile, "alignscore_matrix") 139 | distances1 = [] 140 | distances2 = [] 141 | 142 | input = self.dirname + id1 + ".fasta" 143 | input_query = SeqIO.parse(input, "fasta", IUPAC.protein) 144 | for record in input_query: 145 | q_desc = str(record.description) 146 | q_seq = str(record.seq) 147 | break 148 | 149 | for entry in self.ord_sequences1: 150 | p_desc = str(entry[0]) 151 | p_seq = str(entry[1]) 152 | p_seq = p_seq.rstrip(":") 153 | p_seq = p_seq.split(":") 154 | new_rec = [] 155 | for seq in p_seq: 156 | p_new_seq = seq 157 | pair = self.dirname + id1 + ".pair" 158 | out_pair = open(pair, "w") 159 | 160 | sequence1 = str("\n" + ">" + q_desc + "\n" + q_seq + "\n") 161 | sequence2 = str("\n" + ">" + p_desc + "\n" + p_new_seq + "\n") 162 | out_pair.write(sequence1 + sequence2) 163 | out_pair.close() 164 | 165 | output_align = self.dirname + id1 + ".aln" 166 | output_tree = self.dirname + id1 + ".dnd" 167 | distance = self.dirname + id1 + ".distance" 168 | 169 | try: 170 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe") 171 | clustalw = system(cmd + " " + pair + " > " + distance) 172 | clustalw 173 | except: 174 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw") 175 | clustalw = system(cmd + " " + pair + " > " + distance) 176 | clustalw 177 | 178 | output_fasta = self.dirname + id1 + "_pair.fasta" 179 | AlignIO.convert(output_align, "clustal", output_fasta, "fasta") 180 | 181 | 182 | input_align = SeqIO.parse(output_fasta, "fasta", IUPAC.protein) 183 | msa = [] 184 | for record in input_align: 185 | seq = str(record.seq) 186 | msa.append(seq) 187 | sequence1 = msa[0] 188 | sequence2 = msa[1] 189 | 190 | pair_score = getDistance(sequence1, sequence2, 191 | method, align_matrix, distance) 192 | value = [pair_score, p_new_seq] 193 | new_rec.append(value) 194 | 195 | sort = sorted(new_rec, key=lambda new_rec: 
new_rec[0]) 196 | new_dist = sort[0][0] 197 | new_seq = sort[0][1] 198 | distances1.append(new_dist) 199 | output = self.dirname + id1 + ".fasta" 200 | out_fasta = open(output, "a") 201 | out_fasta.write("\n" + ">" + p_desc + "\n" + new_seq + "\n") 202 | out_fasta.close() 203 | 204 | try: 205 | remove(pair) 206 | remove(output_align) 207 | remove(output_tree) 208 | remove(output_fasta) 209 | remove(distance) 210 | except: 211 | pass 212 | 213 | input = self.dirname + id2 + ".fasta" 214 | input_query = SeqIO.parse(input, "fasta", IUPAC.protein) 215 | for record in input_query: 216 | q_desc = str(record.description) 217 | q_seq = str(record.seq) 218 | break 219 | 220 | for entry in self.ord_sequences2: 221 | p_desc = str(entry[0]) 222 | p_seq = str(entry[1]) 223 | p_seq = p_seq.rstrip(":") 224 | p_seq = p_seq.split(":") 225 | new_rec = [] 226 | for seq in p_seq: 227 | p_new_seq = seq 228 | pair = self.dirname + id2 + ".pair" 229 | out_pair = open(pair, "w") 230 | 231 | sequence1 = str("\n" + ">" + q_desc + "\n" + q_seq + "\n") 232 | sequence2 = str("\n" + ">" + p_desc + "\n" + p_new_seq + "\n") 233 | out_pair.write(sequence1 + sequence2) 234 | out_pair.close() 235 | 236 | output_align = self.dirname + id2 + ".aln" 237 | output_tree = self.dirname + id2 + ".dnd" 238 | distance = self.dirname + id2 + ".distance" 239 | 240 | try: 241 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe") 242 | clustalw = system(cmd + " " + pair + " > " + distance) 243 | clustalw 244 | except: 245 | cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw") 246 | clustalw = system(cmd + " " + pair + " > " + distance) 247 | clustalw 248 | 249 | output_fasta = self.dirname + id2 + "_pair.fasta" 250 | AlignIO.convert(output_align, "clustal", output_fasta, "fasta") 251 | 252 | input_align = SeqIO.parse(output_fasta, "fasta", IUPAC.protein) 253 | msa = [] 254 | for record in input_align: 255 | seq = str(record.seq) 256 | msa.append(seq) 257 | sequence1 = msa[0] 258 | sequence2 = msa[1] 
def getsCorrelation(self, method=None):
    """
    Python implementation of the Theil-Sen Estimator.
    Calculates the correlation, a distance between each point P(x,y)
    and the line fitted by theilsenEstimator(). Distance of
    P(m,n) to Ax+By+C=0 is d=Abs(Am+Bn+C)/Sqrt(A^2+B^2); the fitted
    line is taken through the origin, so C=0.

    Reads "matrix.txt" from self.dirname (whitespace separated; the 3rd
    and 4th columns are used as X and Y) and writes "correlation.txt":
    a "Slope:" header followed by one tab-separated
    "<sequence number> <distance>" row per input row, numbered from 2.

    `method` is accepted for interface compatibility and is not used.
    Returns None. Does nothing when "matrix.txt" cannot be opened.
    """
    try:
        with open(self.dirname + "matrix.txt", "r") as input_matrix:
            # Blank rows carry no data and would break the column lookup.
            rows = [line.split() for line in input_matrix if line.strip()]
    except IOError:
        # No distance matrix was produced: nothing to correlate.
        return

    Xs = [float(row[2]) for row in rows]
    Ys = [float(row[3]) for row in rows]
    slope = theilsenEstimator(Xs, Ys)

    # Line y = slope * x rewritten as m*x + y = 0 with m = -slope.
    m = -slope
    divisor = sqrt(1 + m ** 2)
    distance = [abs(m * x + y) / divisor for x, y in zip(Xs, Ys)]

    with open(self.dirname + "correlation.txt", "w") as out_correlation:
        out_correlation.write("Slope: %s\n" % (str(slope)))
        # Sequence 1 is the query, so the homologs are numbered from 2.
        for number, d in enumerate(distance, 2):
            out_correlation.write(str(number) + "\t" + str(d) + "\n")
def removeSequences(self, id1, id2):
    """
    Removes sequences that do not correlate, as pointed out by the
    Theil-Sen estimator. It implements a simple algorithm to remove
    distant sequences.

    Reads the per-sequence distances from "correlation.txt" and marks a
    sequence for removal when its distance lies outside
    [median - (median - min) * cutoff, median + (max - median) * cutoff],
    where cutoff is the "theilsen_cutoff" parameter. Both
    <id1>.fasta and <id2>.fasta are then rewritten without the marked
    records (the files are rewritten even when nothing is removed, as
    before).

    Returns None. Does nothing when "correlation.txt" cannot be opened.
    """
    try:
        with open(self.dirname + "correlation.txt", "r") as input_correlation:
            correlation = input_correlation.readlines()
    except IOError:
        return

    value = []
    for line in correlation:
        # The "Slope: ..." header is the only line containing a colon.
        if ":" in line or not line.strip():
            continue
        fields = line.rstrip("\n").split("\t")
        if int(fields[0]) != 0:
            value.append(float(fields[1]))

    removed = set()
    if value:
        threshold = LP(self.parameterfile, "theilsen_cutoff")
        maximum = max(value)
        minimum = min(value)
        median_all = median(value)
        median_min = median_all - ((median_all - minimum) * 1.0 * threshold)
        median_max = median_all + ((maximum - median_all) * 1.0 * threshold)
        # enumerate() keeps the true position of every value; looking the
        # position up with value.index(v) returned the first occurrence
        # only, so repeated distances pointed at the wrong sequence.
        for position, v in enumerate(value):
            if v < median_min or v > median_max:
                # +1: record 0 of each FASTA file is the query itself.
                removed.add(position + 1)

    for identifier in (id1, id2):
        fasta = self.dirname + identifier + ".fasta"
        records = [(str(record.description), str(record.seq))
                   for record in SeqIO.parse(fasta, "fasta", IUPAC.protein)]
        with open(fasta, "w") as output_fasta:
            for i, (desc, seq) in enumerate(records):
                if i not in removed:
                    output_fasta.write(">" + desc + "\n" + seq + "\n" + "\n")
"w") 412 | for i in range(len(sequences2)): 413 | if i not in removed: 414 | desc = str(sequences2[i][0]) 415 | seq = str(sequences2[i][1]) 416 | output_fasta.write(">" + desc + "\n" + seq + "\n" + "\n") 417 | else: 418 | pass 419 | output_fasta.close() 420 | else: pass 421 | 422 | 423 | def theilsenEstimator(Xs, Ys): 424 | """ 425 | The Theil-Sen estimator calculates the median slope 426 | among all lines through pairs of two-dimensional 427 | sample points. 428 | """ 429 | assert len(Xs) == len(Ys) 430 | slopes = [] 431 | for f in range(0, len(Xs) - 1): 432 | x1 = Xs[f] 433 | y1 = Ys[f] 434 | for g in range(1, len(Ys)): 435 | x2 = Xs[g] 436 | y2 = Ys[g] 437 | if x1 != x2: 438 | slope = (y2 - y1) / (x2 - x1) 439 | slopes.append(slope) 440 | 441 | slope = mean(slopes) 442 | return slope 443 | 444 | def matchScore(alpha, beta, score_matrix): 445 | "Matches scores from a matrix" 446 | 447 | alphabet = {} 448 | alphabet["A"] = 0 449 | alphabet["R"] = 1 450 | alphabet["N"] = 2 451 | alphabet["D"] = 3 452 | alphabet["C"] = 4 453 | alphabet["Q"] = 5 454 | alphabet["E"] = 6 455 | alphabet["G"] = 7 456 | alphabet["H"] = 8 457 | alphabet["I"] = 9 458 | alphabet["L"] = 10 459 | alphabet["K"] = 11 460 | alphabet["M"] = 12 461 | alphabet["F"] = 13 462 | alphabet["P"] = 14 463 | alphabet["S"] = 15 464 | alphabet["T"] = 16 465 | alphabet["W"] = 17 466 | alphabet["Y"] = 18 467 | alphabet["V"] = 19 468 | alphabet["B"] = 20 469 | alphabet["Z"] = 21 470 | alphabet["X"] = 22 471 | alphabet["-"] = 22 472 | lut_x = alphabet[alpha] 473 | lut_y = alphabet[beta] 474 | 475 | return score_matrix[lut_x][lut_y] 476 | 477 | def mapMatrix(align_matrix): 478 | "Maps a matrix of floats" 479 | matrix = align_matrix.upper() 480 | 481 | score_matrix = [] 482 | input = './Matrix/' + matrix 483 | input_matrix = open(input, 'r') 484 | for line in input_matrix.readlines(): 485 | score_matrix.append(map(float, line.split())) 486 | input_matrix.close() 487 | 488 | return score_matrix 489 | 490 | def 
def checkOrganism(record):
    "Defines organism keys for a dictionary"
    return record.description.rstrip("\n")


def orderedDict(sequences, key_function=None):
    """
    Defines an ordered dictionary.

    Maps key_function(record) to record for every record in sequences.
    Keys keep the position of their first occurrence; when a key occurs
    more than once the last record wins.
    Raises TypeError when no key_function is given.
    """
    if key_function is None:
        raise TypeError("orderedDict() needs a key_function")
    d = OrderedDict()
    for record in sequences:
        d[key_function(record)] = record
    return d


def ln(n):
    "Natural logarithm of n, computed as log(n) / log(e)"
    return log(n) * 1.0 / log(e)


def getDistance(sequence1, sequence2, method, align_matrix, distance):
    """
    Returns the distance between the sequences.

    sequence1, sequence2: aligned sequences of equal length.
    method: "clustalw", "pdistance", "jukescantor", "kimura" or
        "alignscore".
    align_matrix: matrix name handed to mapMatrix() ("alignscore" only).
    distance: path of the captured ClustalW output ("clustalw" only).
    Raises ValueError for any other method.
    """
    if method == "clustalw":
        return clustalwDistance(distance)
    if method == "pdistance":
        return pDistance(sequence1, sequence2)
    if method == "jukescantor":
        return jukesCantor(sequence1, sequence2)
    if method == "kimura":
        return kimuraDistance(sequence1, sequence2)
    if method == "alignscore":
        score_matrix = mapMatrix(align_matrix)
        return alignmentScore(sequence1, sequence2, score_matrix)
    # ValueError derives from StandardError on Python 2, so callers that
    # catch StandardError keep working.
    raise ValueError("%s - Invalid method for distance calculation" % (method))


def clustalwDistance(distance):
    """
    Gets the distance from clustalw scores.

    distance: path of a file holding the captured ClustalW output.
    Returns 0.01 * the integer that ends the last line containing
    "Sequences (1:2) Aligned. Score:".
    Raises ValueError when the file has no such line.
    """
    state = "Sequences (1:2) Aligned. Score:"

    score = None
    with open(distance, "r") as clustalw_output:
        for line in clustalw_output:
            if state in line:
                score = 0.01 * int(line.split()[-1])

    if score is None:
        raise ValueError("%s - No ClustalW pairwise score found" % (distance))
    return score
def pDistance(sequence1, sequence2):
    """
    Proportion of sites at which the two sequences are different.
    p is close to 1 for poorly related sequences, and p is close
    to 0 for similar sequences. d = p

    Both sequences must have the same length (aligned). Returns a
    float; two empty sequences have distance 0.0.
    """
    assert len(sequence1) == len(sequence2)

    length = len(sequence1)
    if length == 0:
        return 0.0
    mismatches = sum(1 for a, b in zip(sequence1, sequence2) if a != b)
    return mismatches * 1.0 / length


def jukesCantor(sequence1, sequence2):
    """
    Maximum likelihood estimate of the number of substitutions
    between two sequences. p is described with the method
    p-distance. d = -19/20 log(1 - p * 20/19)

    Returns a float, like the other distance functions, so that callers
    can sort the scores numerically.
    """
    exterior = -19 * 1.0 / 20
    interior = 1 - pDistance(sequence1, sequence2) * 20 * 1.0 / 19
    if interior < 0:
        # NOTE(review): the logarithm is undefined here. The previous code
        # turned the resulting "nan" into 0.0; that value is kept, although
        # it ranks saturated pairs as identical - confirm this is wanted.
        return 0.0
    if interior == 0:
        return float("inf")
    # + 0.0 turns the -0.0 of identical sequences into 0.0.
    return exterior * log(interior) + 0.0


def kimuraDistance(sequence1, sequence2):
    """
    Kimura's distance. This is a rough-and-ready distance formula
    for approximating PAM distance by simply measuring the fraction
    of amino acids, p, that differs between two sequences and
    computing the distance as (Kimura, 1983).
    d = - log_e (1 - p - 0.2 p^2 ).

    Returns a float, like the other distance functions, so that callers
    can sort the scores numerically.
    """
    p_distance = pDistance(sequence1, sequence2)
    interior = (1 - p_distance - 0.2 * p_distance ** 2)
    if interior < 0:
        # NOTE(review): same legacy convention as in jukesCantor().
        return 0.0
    if interior == 0:
        return float("inf")
    # Same base conversion as ln(): log(x) / log(e).
    return -(log(interior) * 1.0 / log(e)) + 0.0
def alignmentScore(sequence1, sequence2, score_matrix):
    """
    Distance (d) between two sequences (1, 2) is computed from
    the pairwise alignment score between the two sequences (score12),
    and the pairwise alignment score between each sequence and itself
    (score11, score22). This metric ignores gaps.
    d = (1-score12/score11)* (1-score12/score22)

    The sequences must be aligned (equal length). score12 sums the
    matrix score of every aligned column in which both symbols are
    standard residues (members of `aa`); score11 and score22 sum the
    score of every standard residue against itself. Scoring column by
    column is linear in the alignment length; scoring every residue
    against every other residue is not an alignment score and was the
    reason this function was so slow.

    Raises ZeroDivisionError when a self score is zero, e.g. for a
    sequence without any standard residue.
    """
    assert len(sequence1) == len(sequence2)

    score12 = 0.0
    for i, j in zip(sequence1, sequence2):
        if i in aa and j in aa:
            score12 += float(matchScore(i, j, score_matrix))

    score11 = 0.0
    for i in sequence1:
        if i in aa:
            score11 += float(matchScore(i, i, score_matrix))

    score22 = 0.0
    for j in sequence2:
        if j in aa:
            score22 += float(matchScore(j, j, score_matrix))

    part1 = (1 - score12 * 1.0 / score11)
    part2 = (1 - score12 * 1.0 / score22)

    return part1 * part2
def SASA(input, output, probe=1.4, n_sphere=960):
    """
    Computes the solvent accessible surface area of every atom of a PDB
    file and writes the result to `output`.

    input: path of the PDB file to read.
    output: path of the file written by Molecule.write_pdb(); the area
        of each atom is stored in its bfactor field.
    probe: probe radius (1.4 by default).
    n_sphere: number of test points per atom (960 by default).
    """
    mol = Molecule(input)
    atoms = mol.atoms()
    add_radii(atoms)

    asas = calculateSASA(atoms, probe, n_sphere)

    for asa, atom in zip(asas, atoms):
        atom.bfactor = asa
    mol.write_pdb(output)
    return


def generateSpherePoints(n):
    """
    Returns list of 3d coordinates of points on a sphere using the
    Golden Section Spiral algorithm.

    n: number of points; an empty list is returned when n is not
    positive.
    """
    count = int(n)
    if count <= 0:
        return []

    points = []
    inc = pi * (3 - sqrt(5))
    offset = 2 / float(n)
    for k in range(count):
        y = k * offset - 1 + (offset / 2)
        r = sqrt(1 - y * y)
        phi = k * inc
        points.append([cos(phi) * r, y, sin(phi) * r])
    return points


def findNeighborIndices(atoms, probe, k):
    """
    Returns list of indices of atoms within probe distance to atom k.

    Atom i is a neighbor when the distance between the two centres is
    smaller than radius_k + radius_i + 2 * probe. Atom k itself is
    never part of the result.
    """
    neighbor_indices = []
    atom_k = atoms[k]
    radius = atom_k.radius + probe + probe
    for i, atom_i in enumerate(atoms):
        if i == k:
            continue
        dist = pos_distance(atom_k.pos, atom_i.pos)
        if dist < radius + atom_i.radius:
            neighbor_indices.append(i)
    return neighbor_indices
def calculateSASA(atoms, probe, n_sphere_point=960):
    """
    Returns list of accessible surface areas of the atoms, using the probe
    and atom radius to define the surface.

    For every atom, n_sphere_point test points are placed on the sphere
    of radius (atom radius + probe); a point is accessible when it lies
    outside the (radius + probe) sphere of every neighboring atom. The
    area is the accessible fraction of that sphere.
    """
    sphere_points = generateSpherePoints(n_sphere_point)

    const = 4.0 * pi / len(sphere_points)
    test_point = Vector3d()
    areas = []
    for i, atom_i in enumerate(atoms):
        neighbor_indices = findNeighborIndices(atoms, probe, i)
        n_neighbor = len(neighbor_indices)
        j_closest_neighbor = 0
        radius = probe + atom_i.radius

        n_accessible_point = 0
        for point in sphere_points:
            is_accessible = True

            test_point.x = point[0] * radius + atom_i.pos.x
            test_point.y = point[1] * radius + atom_i.pos.y
            test_point.z = point[2] * radius + atom_i.pos.z

            # Start with the neighbor that buried the previous point: it
            # is the most likely one to bury this point as well.
            cycled_indices = list(range(j_closest_neighbor, n_neighbor))
            cycled_indices.extend(range(j_closest_neighbor))

            for j in cycled_indices:
                atom_j = atoms[neighbor_indices[j]]
                r = atom_j.radius + probe
                diff_sq = pos_distance_sq(atom_j.pos, test_point)
                if diff_sq < r * r:
                    j_closest_neighbor = j
                    is_accessible = False
                    break
            if is_accessible:
                n_accessible_point += 1

        area = const * n_accessible_point * radius * radius
        areas.append(area)
    return areas


def add_radii(atoms):
    "Sets atom.radius from the radii table; unknown elements use radii['.']"
    for atom in atoms:
        if atom.element in radii:
            atom.radius = radii[atom.element]
        else:
            atom.radius = radii['.']


def pos_distance_sq(p1, p2):
    "Squared distance between two points that expose x, y and z"
    x = p1.x - p2.x
    y = p1.y - p2.y
    z = p1.z - p2.z
    return x * x + y * y + z * z


def pos_distance(p1, p2):
    "Distance between two points that expose x, y and z"
    return sqrt(pos_distance_sq(p2, p1))


class Molecule:
    """
    Ordered collection of atoms, optionally read from a PDB file.
    """
    def __init__(self, pdb=""):
        self.id = ''
        self._atoms = []
        if pdb:
            self.read_pdb(pdb)

    def n_atom(self):
        return len(self._atoms)

    def atoms(self):
        return self._atoms

    def atom(self, i):
        return self._atoms[i]

    def clear(self):
        "Removes every atom; the list object itself is kept"
        del self._atoms[:]

    def transform(self, matrix):
        for atom in self._atoms:
            atom.pos.transform(matrix)

    def insert_atom(self, atom):
        self._atoms.append(atom)

    def erase_atom(self, atom_type):
        "Removes the first atom whose type equals atom_type, if any"
        for atom in self._atoms:
            if atom.type == atom_type:
                self._atoms.remove(atom)
                return

    def read_pdb(self, fname):
        """
        Replaces the atoms with the ATOM/HETATM records of the file, up
        to the first ENDMDL record. self.id is set to the chain
        identifier of the first atom read.
        """
        self.clear()
        with open(fname, 'r') as pdb_file:
            for line in pdb_file:
                if line.startswith(("ATOM", "HETATM")):
                    atom = AtomFromPdbLine(line)
                    if not self._atoms:
                        self.id = atom.chain_id
                    self.insert_atom(atom)
                if line.startswith("ENDMDL"):
                    return

    def write_pdb(self, pdb):
        """
        Writes one atom.pdb_str() line per atom, ordered by atom number.
        The atoms are renumbered from 1 in that order.
        """
        with open(pdb, 'w') as f:
            # A sort key gives a well defined (and stable) order.
            ordered = sorted(self._atoms, key=lambda atom: atom.num)
            for n_atom, atom in enumerate(ordered, 1):
                atom.num = n_atom
                f.write(atom.pdb_str() + '\n')


def AtomFromPdbLine(line):
    """Returns an Atom object from an atom line in a pdb file."""
    atom = Atom()
    atom.is_hetatm = line.startswith('HETATM')
    atom.num = int(line[6:11])
    atom.type = line[12:16].strip(" ")
    # The element is guessed from the letters of the atom name.
    element = ''
    for c in line[12:15]:
        if not c.isdigit() and c != " ":
            element += c
    if element[:2] in two_char_elements:
        atom.element = element[:2]
    else:
        # element[:1] is "" for a blank atom name; add_radii() then
        # falls back to the default radius.
        atom.element = element[:1]
    atom.res_type = line[17:20]
    atom.chain_id = line[21]
    atom.res_num = int(line[22:26])
    atom.res_insert = line[26]
    if atom.res_insert == " ":
        atom.res_insert = ""
    x = float(line[30:38])
    y = float(line[38:46])
    z = float(line[46:54])
    atom.pos.set(x, y, z)
    # Occupancy and B-factor are optional at the end of the record.
    try:
        atom.occupancy = float(line[54:60])
    except ValueError:
        atom.occupancy = 100.0
    try:
        atom.bfactor = float(line[60:66])
    except ValueError:
        atom.bfactor = 0.0
    return atom
def cmp_atom(a1, a2):
    "Three-way comparison of two atoms by atom number (-1, 0 or 1)"
    if a1.num < a2.num:
        return -1
    if a1.num > a2.num:
        return 1
    return 0


def pad_atom_type(in_atom_type):
    """
    Pads an atom name to the four columns of the PDB atom-name field.
    Names of one or two characters start in the second column; a name
    of three characters starts in the first column when it begins with
    a digit. Other lengths are returned unchanged.
    """
    atom_type = in_atom_type
    if len(atom_type) == 1:
        atom_type = " %s  " % atom_type
    elif len(atom_type) == 2:
        atom_type = " %s " % atom_type
    elif len(atom_type) == 3:
        if atom_type[0].isdigit():
            atom_type = "%s " % atom_type
        else:
            atom_type = " %s" % atom_type
    return atom_type


class Atom:
    """
    One atom of a structure; the fields are filled by AtomFromPdbLine().
    """
    def __init__(self):
        self.is_hetatm = False
        self.pos = Vector3d()
        self.vel = Vector3d()
        self.mass = 0.0
        self.type = ""
        self.element = ""
        self.chain_id = " "
        self.res_type = ""
        self.res_num = ""
        self.res_insert = ""
        self.bfactor = 0.0
        self.occupancy = 0.0
        self.num = 0

    def pdb_str(self):
        "Tab separated: chain, residue type, residue number, bfactor"
        return str(self.chain_id) + "\t" + str(self.res_type) + "\t" + \
               str(self.res_num) + "\t" + str(self.bfactor)

    def __str__(self):
        return "%s%s-%s (% .1f % .1f % .1f)" \
                % (self.res_type, self.res_num,
                   self.type, self.pos.x,
                   self.pos.y, self.pos.z)


class Vector3d:
    """
    Mutable three-dimensional vector.
    """
    def __init__(self, x=0.0, y=0.0, z=0.0):
        self.x = x
        self.y = y
        self.z = z

    def __add__(self, rhs):
        return Vector3d(rhs.x + self.x, rhs.y + self.y, rhs.z + self.z)

    def __sub__(self, rhs):
        return Vector3d(self.x - rhs.x, self.y - rhs.y, self.z - rhs.z)

    def __neg__(self):
        return Vector3d(-self.x, -self.y, -self.z)

    def __pos__(self):
        return Vector3d(self.x, self.y, self.z)

    def __eq__(self, rhs):
        "True when every component differs by less than SMALL"
        return (is_near_zero(self.x - rhs.x) and \
                is_near_zero(self.y - rhs.y) and \
                is_near_zero(self.z - rhs.z))

    def __ne__(self, rhs):
        # Python 2 does not derive != from ==.
        return not self.__eq__(rhs)

    def __str__(self):
        return "(% .2f, % .2f, % .2f)" % (self.x, self.y, self.z)

    def __repr__(self):
        return "Vector3d(%f, %f, %f)" % (self.x, self.y, self.z)

    def set(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

    def copy(self):
        return Vector3d(self.x, self.y, self.z)

    def length_sq(self):
        return self.x * self.x + self.y * self.y + self.z * self.z

    def length(self):
        return sqrt(self.x * self.x + self.y * self.y + self.z * self.z)

    def scale(self, scale):
        "Multiplies the vector in place"
        self.x *= scale
        self.y *= scale
        self.z *= scale

    def normalize(self):
        "Scales the vector in place to length 1"
        self.scale(1.0 / self.length())

    def scaled_vec(self, scale):
        "Returns a scaled copy"
        v = self.copy()
        v.scale(scale)
        return v

    def normal_vec(self):
        "Returns a copy of length 1"
        return self.scaled_vec(1.0 / self.length())

    def parallel_vec(self, axis):
        """
        Returns the component of this vector along axis. For an axis of
        (nearly) zero length the vector itself is returned, not a copy.
        """
        axis_len = axis.length()
        if is_near_zero(axis_len):
            result = self
        else:
            result = axis.scaled_vec(dot(self, axis)
                     / axis.length() / axis.length())
        return result

    def perpendicular_vec(self, axis):
        "Returns the component of this vector perpendicular to axis"
        return self - self.parallel_vec(axis)

    def transform(self, matrix):
        "Applies a matrix exposing elem00 ... elem32, in place"
        x = matrix.elem00 * self.x + \
            matrix.elem10 * self.y + \
            matrix.elem20 * self.z + \
            matrix.elem30
        y = matrix.elem01 * self.x + \
            matrix.elem11 * self.y + \
            matrix.elem21 * self.z + \
            matrix.elem31
        z = matrix.elem02 * self.x + \
            matrix.elem12 * self.y + \
            matrix.elem22 * self.z + \
            matrix.elem32
        self.x, self.y, self.z = x, y, z


def is_near_zero(a):
    "True when the magnitude of a is below SMALL"
    # abs(): a large negative number is not near zero.
    return abs(a) < SMALL


def dot(a, b):
    "Dot product of two vectors"
    return a.x * b.x + a.y * b.y + a.z * b.z
class sequence:
    """
    Main code for handling sequences and structures.
    """
    def __init__(self, file1, file2, id1, id2, chain1, chain2, parameterfile,
                 dirname):
        self.file1 = file1
        self.file2 = file2
        self.chain1 = chain1
        self.chain2 = chain2
        self.id1 = id1
        self.id2 = id2
        self.parameterfile = parameterfile
        self.dirname = dirname

    def __call__(self, file1, file2, id1, id2, chain1, chain2, parameterfile,
                 dirname):
        "Replaces the inputs of an existing instance"
        self.file1 = file1
        self.file2 = file2
        self.chain1 = chain1
        self.chain2 = chain2
        self.id1 = id1
        self.id2 = id2
        self.parameterfile = parameterfile
        self.dirname = dirname

    def validFASTA(self, file, id):
        """
        Checks if the input file is a valid FASTA file.

        When <dirname><file> cannot be read as a single FASTA record,
        the sequence is fetched from Entrez by `id` and stored in that
        file under the header "Query_id".
        Raises ValueError when neither the file nor the identifier
        yields a sequence.
        """
        path = str(self.dirname + file)
        try:
            SeqIO.read(path, "fasta", IUPAC.protein)
            return
        except Exception:
            # Missing or unreadable file: try the identifier instead.
            pass

        try:
            # Fetches a sequence according to GI identifier or UniProt ID
            fetch = Entrez.efetch(db="protein", id=id, rettype="fasta")
            with open(path, "w") as out:
                out.write(fetch.read())

            sequence = None
            for record in SeqIO.parse(path, "fasta", IUPAC.protein):
                sequence = str(record.seq)
            if sequence is None:
                raise ValueError("no sequence record was returned")

            with open(path, "w") as out:
                out.write(">Query_id" + "\n" + sequence + "\n" + "\n")
        except Exception:
            # ValueError derives from StandardError on Python 2.
            raise ValueError("%s - Invalid sequence identifier or sequence file" % (id))

    def queryFASTA(self, file, id):
        """
        Changes FASTA original header to 'Query_id'.

        Writes the first record of <dirname><file> to <dirname><id>.fa
        and removes the original file.
        Raises ValueError when the file holds no record.
        """
        path = str(self.dirname + file)
        sequence = None
        for record in SeqIO.parse(path, "fasta", IUPAC.protein):
            sequence = str(record.seq)
            break
        if sequence is None:
            raise ValueError("%s - No sequence found" % (file))

        with open(str(self.dirname + id + ".fa"), "w") as out:
            out.write(">Query_id" + "\n" + sequence + "\n" + "\n")

        remove(path)

    def validPDB(self, file, id, chain):
        """
        Checks if the input file is a valid PDB file.

        When <dirname><file> cannot be parsed (or holds no model), the
        entry `id` is downloaded from the RCSB Protein Databank into
        that file. The chain is checked afterwards, so an invalid chain
        in a valid local file is reported as such and no longer causes
        the local file to be replaced by a download.
        Raises ValueError for an invalid PDB ID/file or an invalid
        chain.
        """
        path = str(self.dirname + file)
        try:
            model = PDBParser().get_structure(id, path)[0]
        except Exception:
            model = None

        if model is None:
            try:
                # Fetches a PDB file from the RCSB Protein Databank
                url = 'http://www.rcsb.org/pdb/files/%s.pdb' % id
                read = urlopen(url).read()
                with open(path, "w") as pdb:
                    pdb.write(read)
                model = PDBParser().get_structure(id, path)[0]
            except Exception:
                raise ValueError("%s - Invalid PDB ID or PDB file" % (id))

        try:
            model[chain]
        except Exception:
            raise ValueError("%s - Invalid chain" % (chain))

    def sequencePDB(self, file, id, chain):
        """
        Extracts a sequence from the ATOM lines of a PDB file.

        The one-letter sequence of the C-alpha atoms of `chain` is
        written to <dirname><id>.fasta and copied to <dirname><id>.fa,
        both under the header "Query_id".
        """
        path = str(self.dirname + file)
        with open(path, "r") as input_structure:
            structure = input_structure.readlines()

        # sequence from atom lines
        residues = []
        for line in structure:
            if line[0:4] == "ATOM" and line[21:22] == str(chain):
                res = line[17:20]
                if line[13:16] == "CA " and res in aa_list:
                    residues.append(aa_symbols[res])
        sequence = "".join(residues)

        fasta = str(self.dirname + id + ".fasta")
        with open(fasta, "w") as out:
            out.write(">Query_id" + "\n" + sequence + "\n" + "\n")

        # NOTE(review): the previous code also tried to download the full
        # sequence named on the DBREF lines. That branch could never
        # succeed (it compared the chain with column 22 of the DBREF line,
        # which belongs to the residue range, and called SeqIO.parse()
        # without a format), so <id>.fa was always this copy. The copy is
        # kept; enabling the download would change the query sequence and
        # possibly its numbering against the ATOM records - confirm before
        # restoring it.
        copyfile(fasta, str(self.dirname + id + ".fa"))

    def surfacePDB(self, file, id, chain):
        """
        Points out surface residues in a PDB file (ASA > 7% (A^2))*
        *De et al.,2005. http://www.biomedcentral.com/1472-6807/5/15

        The ATOM lines of `chain` are copied to a scratch file in the
        working directory, their accessible surface is computed with
        sasa.SASA(), and the areas are summed per residue. Residues
        whose area exceeds "surface_threshold" percent of the largest
        residue area are written to <dirname><id>.surface as
        "<residue type> <residue number><tab><area>".
        """
        path = str(self.dirname + file)
        with open(path, "r") as input_structure:
            structure = input_structure.readlines()

        scratch_pdb = str("./" + file)
        scratch_asa = str("./" + file + ".txt")
        with open(scratch_pdb, "w") as out:
            for line in structure:
                if line[0:4] == "ATOM" and line[21:22] == str(chain):
                    if line[17:20].strip() in aa_list:
                        out.write(line.rstrip("\n") + "\n")

        sasa.SASA(scratch_pdb, scratch_asa)

        # One [residue type, residue number, area] entry per residue.
        # Consecutive atoms belong together when both the type and the
        # number agree; comparing the type alone merged neighbouring
        # residues of the same kind, and the last residue was never
        # stored.
        surface = []
        with open(scratch_asa) as atom_areas:
            for line in atom_areas:
                fields = line.rstrip().split()
                if not fields or fields[0] != chain:
                    continue
                amino = str(fields[1])
                res = int(fields[2])
                area = float(fields[3])
                if amino not in aa_list:
                    continue
                if surface and surface[-1][0] == amino and surface[-1][1] == res:
                    surface[-1][2] += area
                else:
                    surface.append([amino, res, area])

        threshold = LP(self.parameterfile, "surface_threshold")

        with open(str(self.dirname + id + ".surface"), "w") as out:
            if surface:
                asa_max = int(round(float(max(entry[2] for entry in surface))))
                thrd = threshold * asa_max * 1.0 / 100
                for amino, res, area in surface:
                    if area > thrd:
                        out.write(str(amino) + " " + str(res) + "\t" +
                                  str(area) + "\n")

        # NOTE(review): the pause presumably gave the operating system time
        # to release the scratch files - confirm whether it is still needed
        # now that every handle is closed explicitly.
        time.sleep(2)
        for scratch in (scratch_pdb, scratch_asa):
            try:
                remove(scratch)
            except OSError:
                # Best effort: a leftover scratch file is harmless.
                pass

    def parseSurfacePDB(self, id):
        """
        Parses residues at the surface level.

        Returns the residue numbers (second column) of
        <dirname><id>.surface as a list of integers.
        """
        with open(str(self.dirname + id + ".surface"), "r") as input_surface:
            surface = input_surface.readlines()

        surface_points = []
        for line in surface:
            l = line.split()
            surface_points.append(int(l[1]))

        return surface_points

    def matchResiduePosition(self, id, chain):
        """
        Gets residue positions for use in coevolution analysis.

        Returns the residue numbers of the C-alpha atoms of `chain` in
        <dirname><id>.pdb, in file order, for the residue types listed
        in aa_list.
        """
        with open(str(self.dirname + id + ".pdb"), "r") as input_structure:
            structure = input_structure.readlines()

        protein = []
        for line in structure:
            if line[0:4] == "ATOM" and line[21:22] == str(chain):
                if line[13:16] == "CA " and line[17:20].strip() in aa_list:
                    protein.append(int(line[22:26]))
        return protein

    def copySequence(self, id):
        "Doubles the sequence files"

        copyfile(self.dirname + id + ".fa", self.dirname + id + "_1.fa")
        copyfile(self.dirname + id + ".fa", self.dirname + id + "_2.fa")
        remove(self.dirname + id + ".fa")
        return
def Flash(message):
    """
    Prints message followed by a newline and flushes standard output at
    once, so that progress is visible while the pipeline is running.
    """
    sys.stdout.write(str(message) + "\n")
    sys.stdout.flush()
doi:10.1186/1471-2148-8-327 45 | # {'A', 'D', 'K'} 46 | charge = {'A':'A','C':'A','D':'D', 47 | 'E':'D','F':'A','G':'A', 48 | 'H':'A','K':'K','I':'A', 49 | 'L':'A','M':'A','N':'A', 50 | 'P':'A','Q':'A','R':'K', 51 | 'S':'A','T':'A','V':'A', 52 | 'Y':'A','W':'A','-':'-'} 53 | 54 | # {'A','D','K'} 55 | charge_his = {'A':'A','C':'A','D':'D', 56 | 'E':'D','F':'A','G':'A', 57 | 'H':'K','K':'K','I':'A', 58 | 'L':'A','M':'A','N':'A', 59 | 'P':'A','Q':'A','R':'K', 60 | 'S':'A','T':'A','V':'A', 61 | 'Y':'A','W':'A','-':'-'} 62 | 63 | # {'A','D','G','K'} 64 | polarity = {'A':'A','C':'G','D':'D', 65 | 'E':'D','F':'A','G':'G', 66 | 'H':'K','K':'K','I':'A', 67 | 'L':'A','M':'A','N':'G', 68 | 'P':'A','Q':'G','R':'K', 69 | 'S':'G','T':'G','V':'A', 70 | 'Y':'G','W':'A','-':'-'} 71 | 72 | # {'A','D','G'} 73 | hydropathy = {'A':'A','C':'A','D':'D', 74 | 'E':'D','F':'A','G':'G', 75 | 'H':'D','K':'D','I':'A', 76 | 'L':'A','M':'A','N':'D', 77 | 'P':'A','Q':'D','R':'D', 78 | 'S':'G','T':'G','V':'A', 79 | 'Y':'G','W':'G','-':'-'} 80 | 81 | # Hydrophobicity scale: 82 | # Kyte J and Doolittle RF: A simple method for displaying the 83 | # hydropathic character of a protein. J Mol Biol 157:105, 1982. 84 | kyte_doolittle = {'A':1.8,'C':2.5,'D':-3.5, 85 | 'E':-3.5,'F':2.8,'G':-0.4, 86 | 'H':-3.2,'K':-3.9,'I':4.5, 87 | 'L':3.8,'M':1.9,'N':-3.5, 88 | 'P':-1.6,'Q':-3.5,'R':-4.5, 89 | 'S':-0.8,'T':-0.7,'V':4.2, 90 | 'Y':-1.3,'W':-0.9} 91 | 92 | # Hoop TP and Woods KR: Prediction of protein antigenic determinants 93 | # from amino acid sequences. Proc Natl Acad Sci USA 78:3824, 1981. 94 | hopp_woods = {'A':-0.5,'C':-1.0,'D':3.0, 95 | 'E':3.0,'F':-2.5,'G':0.0, 96 | 'H':-0.5,'K':3.0,'I':-1.8, 97 | 'L':-1.8,'M':-1.3,'N':0.2, 98 | 'P':0.0,'Q':0.2,'R':3.0, 99 | 'S':0.3,'T':-0.4,'V':-1.5, 100 | 'Y':-2.3,'W':-3.4} 101 | 102 | # D. Eisenberg; R. M. Weiss & T. C. Terwilliger: 103 | # The hydrophobic moment detects periodicity in protein hydrophobicity. 
# Proc Natl Acad Sci U S A, 81, 140-144
# Keys follow this file's residue order (..., H, K, I, ... V, Y, W), which
# is NOT alphabetical. Each value is paired with its own residue:
# I = 1.38, K = -1.5, W = 0.81, Y = 0.26.
eisenberg = {
    'A': 0.62, 'C': 0.29, 'D': -0.9, 'E': -0.74, 'F': 1.19,
    'G': 0.48, 'H': -0.4, 'K': -1.5, 'I': 1.38, 'L': 1.06,
    'M': 0.64, 'N': -0.78, 'P': 0.12, 'Q': -0.85, 'R': -2.53,
    'S': -0.18, 'T': -0.05, 'V': 1.08, 'Y': 0.26, 'W': 0.81,
}

# D. M. Engelman; T. A. Steitz & A. Goldman:
# Identifying nonpolar transbilayer helices in amino acid sequences of
# membrane proteins. Annu Rev Biophys Biophys Chem, 15, 321-353
# Same key order as above: I = 3.1, K = -8.8, W = 1.9, Y = -0.7.
# Glutamate is -8.2 (between Q = -4.1 and D = -9.2).
engelman = {
    'A': 1.6, 'C': 2.0, 'D': -9.2, 'E': -8.2, 'F': 3.7,
    'G': 1.0, 'H': -3.0, 'K': -8.8, 'I': 3.1, 'L': 2.8,
    'M': 3.4, 'N': -4.8, 'P': -0.2, 'Q': -4.1, 'R': -12.3,
    'S': 0.6, 'T': 1.2, 'V': 2.6, 'Y': -0.7, 'W': 1.9,
}

# J. L. Cornette; K. B. Cease; H. Margalit; J. L. Spouge; J. A. Berzofsky & C. DeLisi:
# Hydrophobicity scales and computational techniques for detecting amphipathic
# structures in proteins. J Mol Biol, 195, 659-685
# Same key order as above: I = 4.8, K = -3.1, W = 1.0, Y = 3.2.
cornette = {
    'A': 0.2, 'C': 4.1, 'D': -3.1, 'E': -1.8, 'F': 4.4,
    'G': 0.0, 'H': 0.5, 'K': -3.1, 'I': 4.8, 'L': 5.7,
    'M': 4.2, 'N': -0.5, 'P': -2.2, 'Q': -2.8, 'R': 1.4,
    'S': -0.5, 'T': -1.9, 'V': 4.7, 'Y': 3.2, 'W': 1.0,
}


# Amino acid's volume:
# Laguerre method with water.
Esque et al, 2010 138 | volume = {'N' : 125.2, 'P': 122.1, 'Q': 148.1, 139 | 'A': 88.2, 'R': 188.8, 'S': 95.5, 140 | 'C': 113.3,'T': 118.4, 'D': 113.4, 141 | 'E': 134.8,'V': 134.5, 'F': 192.0, 142 | 'W': 227.3,'G': 65.3, 'H': 159.2, 143 | 'Y': 197.6,'I': 157.7, 'K': 164.2, 144 | 'L': 158.7,'M': 164.9} 145 | 146 | # atomic radius 147 | radii = {'H': 1.20, 'N': 1.55, 'NA': 2.27, 148 | 'CU': 1.40, 'CL': 1.75, 'C': 1.70, 149 | 'O': 1.52, 'I': 1.98, 'P': 1.80, 150 | 'B': 1.85, 'BR': 1.85, 'S': 1.80, 151 | 'SE': 1.90, 'F': 1.47, 'FE': 1.80, 152 | 'K': 2.75, 'MN': 1.73, 'MG': 1.73, 153 | 'ZN': 1.39, 'HG': 1.8, 'XE': 1.8, 154 | 'AU': 1.8, 'LI': 1.8, '.': 1.8} 155 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Encoding utf-8 # 3 | # F. Madeira and L. Krippahl, 2012 # 4 | # This code is part of Pycoevol distribution. # 5 | # This work is public domain. 
# 6 | ############################################################################### -------------------------------------------------------------------------------- /src/tools/blast+/db/refseq_protein.pal: -------------------------------------------------------------------------------- 1 | # 2 | # Alias file created: Jun 26, 2011 8:38 PM 3 | # 4 | # Edit this file to reflect the location of your database 5 | # Get the database at ftp://ftp.ncbi.nih.gov/blast/db/ 6 | # 7 | TITLE NCBI Protein Reference Sequences 8 | DBLIST ./Pycoevol/src/tools/Blast+/db/refseq_protein.00 ./Pycoevol/src/tools/Blast+/db/refseq_protein.01 ./Pycoevol/src/tools/Blast+/db/refseq_protein.02 ./Pycoevol/src/tools/Blast+/db/refseq_protein.03 9 | -------------------------------------------------------------------------------- /src/tools/blast+/psiblast_here: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/blast+/psiblast_here -------------------------------------------------------------------------------- /src/tools/clustalw/clustalw_here: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/clustalw/clustalw_here -------------------------------------------------------------------------------- /src/tools/mafft/mafft_here: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/mafft/mafft_here -------------------------------------------------------------------------------- /src/tools/muscle/muscle_here: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/muscle/muscle_here --------------------------------------------------------------------------------