├── GUI
    ├── mainform.lfm
    ├── mainform.pas
    ├── optionlists.txt
    ├── pycoevolgui.ico
    ├── pycoevolgui.lpi
    ├── pycoevolgui.lpr
    └── pycoevolgui.res
├── Matrix
    ├── BLOSUM62
    ├── CLM
    ├── CPVN
    ├── MCLACHLAN
    ├── PAM250
    └── VOL
├── Parameters.py
├── Params.config
├── Pycoevol.py
├── Pycoevol_paper.pdf
├── Pycoevol_userguide.pdf
├── README.md
├── Results
    └── output_results
├── SIFTS
    └── Database version
├── refseq_protein.pal
└── src
    ├── ALIGN.py
    ├── BLAST.py
    ├── COEVOL.py
    ├── INFO.py
    ├── MAIN.py
    ├── ORGANISM.py
    ├── SASA.py
    ├── SEQ.py
    ├── UTILS.py
    ├── __init__.py
    └── tools
        ├── blast+
            ├── db
            │   └── refseq_protein.pal
            └── psiblast_here
        ├── clustalw
            └── clustalw_here
        ├── mafft
            └── mafft_here
        └── muscle
            └── muscle_here


/GUI/mainform.lfm:
--------------------------------------------------------------------------------
  1 | object Form1: TForm1
  2 |   Left = 628
  3 |   Height = 604
  4 |   Top = 203
  5 |   Width = 843
  6 |   Caption = 'Pycoevol'
  7 |   ClientHeight = 604
  8 |   ClientWidth = 843
  9 |   OnActivate = FormActivate
 10 |   OnClose = FormClose
 11 |   OnCreate = FormCreate
 12 |   LCLVersion = '0.9.30.4'
 13 |   object RunPyCoBt: TButton
 14 |     Left = 640
 15 |     Height = 25
 16 |     Top = 168
 17 |     Width = 192
 18 |     Caption = 'Run Pycoevol'
 19 |     OnClick = RunPyCoBtClick
 20 |     TabOrder = 0
 21 |   end
 22 |   object PycoFolderEd: TEdit
 23 |     Left = 112
 24 |     Height = 28
 25 |     Top = 9
 26 |     Width = 672
 27 |     TabOrder = 1
 28 |   end
 29 |   object Label1: TLabel
 30 |     Left = 8
 31 |     Height = 21
 32 |     Top = 16
 33 |     Width = 104
 34 |     Caption = 'Pycoevol folder'
 35 |     ParentColor = False
 36 |     OnClick = Label1Click
 37 |   end
 38 |   object PycoFolderBrowseBt: TButton
 39 |     Left = 791
 40 |     Height = 25
 41 |     Top = 7
 42 |     Width = 32
 43 |     Caption = '...'
 44 |     OnClick = PycoFolderBrowseBtClick
 45 |     TabOrder = 2
 46 |   end
 47 |   object ParamFileEd: TEdit
 48 |     Left = 112
 49 |     Height = 28
 50 |     Top = 37
 51 |     Width = 672
 52 |     TabOrder = 3
 53 |   end
 54 |   object Label2: TLabel
 55 |     Left = 8
 56 |     Height = 21
 57 |     Top = 44
 58 |     Width = 100
 59 |     Caption = 'Parameters file'
 60 |     ParentColor = False
 61 |   end
 62 |   object ParamFileBrowseBt: TButton
 63 |     Left = 791
 64 |     Height = 25
 65 |     Top = 35
 66 |     Width = 32
 67 |     Caption = '...'
 68 |     OnClick = ParamFileBrowseBtClick
 69 |     TabOrder = 4
 70 |   end
 71 |   object PsiblastCb: TComboBox
 72 |     Left = 8
 73 |     Height = 28
 74 |     Top = 194
 75 |     Width = 140
 76 |     ItemHeight = 20
 77 |     TabOrder = 5
 78 |     Text = 'PsiblastCb'
 79 |   end
 80 |   object AlignmentCb: TComboBox
 81 |     Left = 160
 82 |     Height = 28
 83 |     Top = 194
 84 |     Width = 140
 85 |     ItemHeight = 20
 86 |     TabOrder = 6
 87 |     Text = 'PsiblastCb'
 88 |   end
 89 |   object CoevolutionCb: TComboBox
 90 |     Left = 312
 91 |     Height = 28
 92 |     Top = 194
 93 |     Width = 140
 94 |     ItemHeight = 20
 95 |     TabOrder = 7
 96 |     Text = 'PsiblastCb'
 97 |   end
 98 |   object Label3: TLabel
 99 |     Left = 8
100 |     Height = 21
101 |     Top = 172
102 |     Width = 99
103 |     Caption = 'Psiblast option'
104 |     ParentColor = False
105 |   end
106 |   object Label4: TLabel
107 |     Left = 160
108 |     Height = 21
109 |     Top = 172
110 |     Width = 118
111 |     Caption = 'Alignment option'
112 |     ParentColor = False
113 |   end
114 |   object Label5: TLabel
115 |     Left = 312
116 |     Height = 21
117 |     Top = 172
118 |     Width = 141
119 |     Caption = 'Coevolution measure'
120 |     ParentColor = False
121 |   end
122 |   object File1Ed: TEdit
123 |     Left = 112
124 |     Height = 28
125 |     Top = 98
126 |     Width = 488
127 |     TabOrder = 8
128 |   end
129 |   object Label6: TLabel
130 |     Left = 8
131 |     Height = 21
132 |     Top = 100
133 |     Width = 60
134 |     Caption = 'Protein 1'
135 |     ParentColor = False
136 |   end
137 |   object File1BrowseBt: TButton
138 |     Left = 600
139 |     Height = 25
140 |     Top = 96
141 |     Width = 32
142 |     Caption = '...'
143 |     OnClick = File1BrowseBtClick
144 |     TabOrder = 9
145 |   end
146 |   object File2Ed: TEdit
147 |     Left = 112
148 |     Height = 28
149 |     Top = 126
150 |     Width = 488
151 |     TabOrder = 10
152 |   end
153 |   object Label7: TLabel
154 |     Left = 8
155 |     Height = 21
156 |     Top = 128
157 |     Width = 60
158 |     Caption = 'Protein 2'
159 |     ParentColor = False
160 |   end
161 |   object File2BrowseBt: TButton
162 |     Left = 600
163 |     Height = 25
164 |     Top = 124
165 |     Width = 32
166 |     Caption = '...'
167 |     OnClick = File2BrowseBtClick
168 |     TabOrder = 11
169 |   end
170 |   object Label8: TLabel
171 |     Left = 352
172 |     Height = 21
173 |     Top = 80
174 |     Width = 24
175 |     Caption = 'File'
176 |     ParentColor = False
177 |   end
178 |   object Chain1Ed: TEdit
179 |     Left = 640
180 |     Height = 28
181 |     Top = 98
182 |     Width = 48
183 |     TabOrder = 12
184 |   end
185 |   object Chain2Ed: TEdit
186 |     Left = 640
187 |     Height = 28
188 |     Top = 126
189 |     Width = 48
190 |     TabOrder = 13
191 |   end
192 |   object Label9: TLabel
193 |     Left = 648
194 |     Height = 21
195 |     Top = 80
196 |     Width = 38
197 |     Caption = 'Chain'
198 |     ParentColor = False
199 |   end
200 |   object Id1Ed: TEdit
201 |     Left = 688
202 |     Height = 28
203 |     Top = 98
204 |     Width = 144
205 |     TabOrder = 14
206 |   end
207 |   object Id2Ed: TEdit
208 |     Left = 688
209 |     Height = 28
210 |     Top = 126
211 |     Width = 144
212 |     TabOrder = 15
213 |   end
214 |   object Label10: TLabel
215 |     Left = 736
216 |     Height = 21
217 |     Top = 80
218 |     Width = 37
219 |     Caption = 'Label'
220 |     ParentColor = False
221 |   end
222 |   object PycoMm: TMemo
223 |     Left = 10
224 |     Height = 360
225 |     Top = 240
226 |     Width = 822
227 |     ReadOnly = True
228 |     ScrollBars = ssVertical
229 |     TabOrder = 16
230 |   end
231 |   object PythonClEd: TEdit
232 |     Left = 480
233 |     Height = 28
234 |     Top = 194
235 |     Width = 128
236 |     TabOrder = 17
237 |     Text = 'python'
238 |   end
239 |   object Label11: TLabel
240 |     Left = 480
241 |     Height = 21
242 |     Top = 172
243 |     Width = 120
244 |     Caption = 'Python interpreter'
245 |     ParentColor = False
246 |   end
247 |   object StopPycoBt: TButton
248 |     Left = 640
249 |     Height = 25
250 |     Top = 200
251 |     Width = 192
252 |     Caption = 'Stop Pycoevol'
253 |     Enabled = False
254 |     OnClick = StopPycoBtClick
255 |     TabOrder = 18
256 |   end
257 |   object OpenDialog1: TOpenDialog
258 |     left = 64
259 |     top = 284
260 |   end
261 |   object SelectDirectoryDialog1: TSelectDirectoryDialog
262 |     left = 69
263 |     top = 272
264 |   end
265 | end
266 | 


--------------------------------------------------------------------------------
/GUI/mainform.pas:
--------------------------------------------------------------------------------
  1 | {*******************************************************************************
  2 | This file is part of the Pycoevol.
  3 | This work is public domain. Enjoy.
  4 | ********************************************************************************
  5 | Author: Ludwig Krippahl
  6 | Date: 21.4.2012
  7 | Purpose:
  8 | Pycoevol GUI
  9 | Requirements:
 10 | Revisions:
 11 | To do:
 12 | *******************************************************************************}
 13 | 
 14 | unit mainform;
 15 | 
 16 | {$mode objfpc}{$H+}
 17 | 
 18 | interface
 19 | 
 20 | uses
 21 |   Classes, SysUtils, FileUtil, Forms, Controls, Graphics, Dialogs, StdCtrls,
 22 |   Grids,Process, INIFiles;
 23 | 
 24 | type
 25 | 
 26 |   { TForm1 }
 27 | 
 28 |   TForm1 = class(TForm)
 29 |     StopPycoBt: TButton;
 30 |     PythonClEd: TEdit;
 31 |     Label10: TLabel;
 32 |     Label11: TLabel;
 33 |     Label6: TLabel;
 34 |     Label7: TLabel;
 35 |     Label8: TLabel;
 36 |     Label9: TLabel;
 37 |     PycoMm: TMemo;
 38 |     OpenDialog1: TOpenDialog;
 39 |     Label3: TLabel;
 40 |     Label4: TLabel;
 41 |     Label5: TLabel;
 42 |     File2BrowseBt: TButton;
 43 |     File2Ed: TEdit;
 44 |     Chain2Ed: TEdit;
 45 |     Id2Ed: TEdit;
 46 |     PsiblastCb: TComboBox;
 47 |     Label2: TLabel;
 48 |     AlignmentCb: TComboBox;
 49 |     CoevolutionCb: TComboBox;
 50 |     PycoFolderBrowseBt: TButton;
 51 |     ParamFileBrowseBt: TButton;
 52 |     File1BrowseBt: TButton;
 53 |     PycoFolderEd: TEdit;
 54 |     Label1: TLabel;
 55 |     ParamFileEd: TEdit;
 56 |     File1Ed: TEdit;
 57 |     Chain1Ed: TEdit;
 58 |     Id1Ed: TEdit;
 59 |     RunPyCoBt: TButton;
 60 |     SelectDirectoryDialog1: TSelectDirectoryDialog;
 61 |     procedure File1BrowseBtClick(Sender: TObject);
 62 |     procedure File2BrowseBtClick(Sender: TObject);
 63 |     procedure FormActivate(Sender: TObject);
 64 |     procedure FormClose(Sender: TObject; var CloseAction: TCloseAction);
 65 |     procedure FormCreate(Sender: TObject);
 66 |     procedure Label1Click(Sender: TObject);
 67 |     procedure ParamFileBrowseBtClick(Sender: TObject);
 68 |     procedure ParamFileEditClick(Sender: TObject);
 69 |     procedure PycoFolderBrowseBtClick(Sender: TObject);
 70 |     procedure RunPyCoBtClick(Sender: TObject);
 71 |     procedure StopPycoBtClick(Sender: TObject);
 72 |   private
 73 |     { private declarations }
 74 |     FTerminatePyCo:Boolean;
 75 |     Init:Boolean;
 76 |     function GetCommandLine:string;
 77 |     procedure LoadLists;
 78 |     procedure SaveConfiguration;
 79 |     procedure LoadConfiguration;
 80 |     procedure RunPycoevol;
 81 |   public
 82 |     { public declarations }
 83 |   end; 
 84 | 
 85 | var
 86 |   Form1: TForm1; 
 87 | 
 88 | implementation
 89 | 
 90 | {$R *.lfm}
 91 | 
 92 | { TForm1 }
 93 | 
 94 | procedure TForm1.FormCreate(Sender: TObject);
 95 | begin
 96 |   LoadLists;
 97 |   Init:=True;
 98 | end;
 99 | 
{ Click handler referenced by Label1.OnClick in mainform.lfm.
  Intentionally does nothing. }
procedure TForm1.Label1Click(Sender: TObject);
begin
  // No action.
end;
104 | 
{ Activation handler: loads the saved configuration the first time the form
  is activated after FormCreate, and does nothing on later activations. }
procedure TForm1.FormActivate(Sender: TObject);
begin
  // Init is only True between FormCreate and the first successful load.
  if not Init then
    Exit;
  LoadConfiguration;
  Init := False;
end;
113 | 
{ Lets the user pick the file for protein 2 and copies the chosen path into
  File2Ed. The edit box is left untouched when the dialog is cancelled. }
procedure TForm1.File2BrowseBtClick(Sender: TObject);
begin
  OpenDialog1.Filter := 'PDB file|*.pdb|Sequence|*.fasta|Any|*.*';
  if not OpenDialog1.Execute then
    Exit;
  File2Ed.Text := OpenDialog1.FileName;
end;
120 | 
{ Lets the user pick the file for protein 1 and copies the chosen path into
  File1Ed. The edit box is left untouched when the dialog is cancelled. }
procedure TForm1.File1BrowseBtClick(Sender: TObject);
begin
  OpenDialog1.Filter := 'PDB file|*.pdb|Sequence|*.fasta|Any|*.*';
  if not OpenDialog1.Execute then
    Exit;
  File1Ed.Text := OpenDialog1.FileName;
end;
127 | 
{ Close handler: persists the current form fields before the window closes.
  CloseAction is not modified. }
procedure TForm1.FormClose(Sender: TObject; var CloseAction: TCloseAction);
begin
  SaveConfiguration;
end;
132 | 
{ Lets the user pick a parameters file and copies the chosen path into
  ParamFileEd. The edit box is left untouched when the dialog is cancelled. }
procedure TForm1.ParamFileBrowseBtClick(Sender: TObject);
begin
  OpenDialog1.Filter := 'Parameter file|*.config|Any|*.*';
  if not OpenDialog1.Execute then
    Exit;
  ParamFileEd.Text := OpenDialog1.FileName;
end;
139 | 
{ Launches the path typed in ParamFileEd as a child process.

  NOTE(review): this starts the parameters file itself as the command; whether
  that opens an editor depends on the platform - confirm the intended
  behaviour. The handler is not referenced by mainform.lfm. }
procedure TForm1.ParamFileEditClick(Sender: TObject);

var
  proc: TProcess;

begin
  // Nothing to launch when no parameters file has been entered.
  if Trim(ParamFileEd.Text) = '' then
    Exit;
  proc := TProcess.Create(nil);
  try
    // Quoted so that paths containing spaces stay a single argument.
    proc.CommandLine := '"' + ParamFileEd.Text + '"';
    proc.Execute;
  finally
    // Release the process object even when Execute raises.
    proc.Free;
  end;
end;
150 | 
{ Lets the user pick the Pycoevol installation folder and copies the chosen
  directory into PycoFolderEd. Cancelling the dialog changes nothing. }
procedure TForm1.PycoFolderBrowseBtClick(Sender: TObject);
begin
  if not SelectDirectoryDialog1.Execute then
    Exit;
  PycoFolderEd.Text := SelectDirectoryDialog1.FileName;
end;
156 | 
{ Runs Pycoevol with the current form settings.

  While the child process runs, the Run button is disabled and shows
  'Busy...', and the Stop button is enabled. Both buttons are restored when
  the run ends, whether it finished, was stopped, or raised an exception. }
procedure TForm1.RunPyCoBtClick(Sender: TObject);

var
  oc: string;   // original caption of the Run button

begin
  oc := RunPyCoBt.Caption;
  RunPyCoBt.Enabled := False;
  StopPycoBt.Enabled := True;
  FTerminatePyCo := False;
  RunPyCoBt.Caption := 'Busy...';
  // Let the UI repaint before the blocking run loop starts.
  Application.ProcessMessages;
  try
    RunPycoevol;
  finally
    RunPyCoBt.Caption := oc;
    RunPyCoBt.Enabled := True;
    // Nothing is left to stop once the run has ended.
    StopPycoBt.Enabled := False;
  end;
end;
175 | 
{ Requests termination of the running Pycoevol process. The flag is polled
  by RunPycoevol; the button disables itself so the request is sent once. }
procedure TForm1.StopPycoBtClick(Sender: TObject);
begin
  FTerminatePyCo := True;
  StopPycoBt.Enabled := False;
end;
181 | 
{ Builds the command line that starts Pycoevol.py from the form fields.

  Layout: <python> "<folder>/Pycoevol.py" ["file1" "file2"] [-x.. -x..]
  [-i.. -i..] -b<psiblast> -a<alignment> -c<coevolution> [-p"<params>"].
  The paired file, chain and id arguments are only emitted when both members
  of the pair are filled in. }
function TForm1.GetCommandLine: string;

var
  cmd: string;

begin

  //TODO: Check for spaces in parameters??

  // Interpreter followed by the quoted script path.
  cmd := PythonClEd.Text + ' "' + PycoFolderEd.Text + PathDelim + 'Pycoevol.py"';

  // Input files, quoted.
  if not ((File1Ed.Text = '') or (File2Ed.Text = '')) then
    cmd := cmd + ' "' + File1Ed.Text + '" "' + File2Ed.Text + '"';

  // Chain arguments.
  if not ((Chain1Ed.Text = '') or (Chain2Ed.Text = '')) then
    cmd := cmd + ' -x' + Chain1Ed.Text + ' -x' + Chain2Ed.Text;

  // Id arguments.
  if not ((Id1Ed.Text = '') or (Id2Ed.Text = '')) then
    cmd := cmd + ' -i' + Id1Ed.Text + ' -i' + Id2Ed.Text;

  // The three option selectors are always passed.
  cmd := cmd + ' -b' + PsiblastCb.Text;
  cmd := cmd + ' -a' + AlignmentCb.Text;
  cmd := cmd + ' -c' + CoevolutionCb.Text;

  // Optional parameters file, quoted.
  if ParamFileEd.Text <> '' then
    cmd := cmd + ' -p"' + ParamFileEd.Text + '"';

  Result := cmd;
end;
198 | 
{ Fills the three option combo boxes from 'optionlists.txt'.

  The file is read from the current directory. A line equal to
  '**coevolution', '**alignment' or '**psiblast' selects the combo box that
  receives the following lines. Empty lines, and lines that appear before the
  first header, are ignored. Afterwards the first entry of every non-empty
  combo box is selected.

  Exceptions raised by LoadFromFile (for example when the file is missing)
  propagate to the caller. }
procedure TForm1.LoadLists;

var
  sl: TStringList;
  f: Integer;
  s: string;
  currbox: TComboBox;

begin
  // No target until a header line has been seen.
  currbox := nil;
  sl := TStringList.Create;
  try
    sl.LoadFromFile('optionlists.txt');
    for f := 0 to sl.Count - 1 do
      begin
      s := sl.Strings[f];
      if s = '**coevolution' then currbox := CoevolutionCb
      else if s = '**alignment' then currbox := AlignmentCb
      else if s = '**psiblast' then currbox := PsiblastCb
      else if (currbox <> nil) and (s <> '') then
        currbox.Items.Add(s);
      end;
  finally
    // Release the list even when the file cannot be read.
    sl.Free;
  end;

  // Select the first option only where there is one to select.
  if CoevolutionCb.Items.Count > 0 then CoevolutionCb.ItemIndex := 0;
  if AlignmentCb.Items.Count > 0 then AlignmentCb.ItemIndex := 0;
  if PsiblastCb.Items.Count > 0 then PsiblastCb.ItemIndex := 0;
end;
225 | 
{ Writes the form fields to section 'Form' of the per-user application INI
  file returned by GetAppConfigFile(False), creating its directory first when
  it does not exist yet. }
procedure TForm1.SaveConfiguration;

var
  cfg: string;
  cfgdir: string;
  ini: TIniFile;

begin
  cfg := GetAppConfigFile(False);
  cfgdir := ExtractFileDir(cfg);
  // ForceDirectories also creates missing parent directories, which
  // CreateDir cannot do.
  if (cfgdir <> '') and not DirectoryExists(cfgdir) then
    ForceDirectories(cfgdir);
  ini := TIniFile.Create(cfg);
  try
    ini.WriteString('Form', 'PycoevolFolder', PycoFolderEd.Text);
    ini.WriteString('Form', 'ParametersFile', ParamFileEd.Text);
    ini.WriteString('Form', 'File1', File1Ed.Text);
    ini.WriteString('Form', 'File2', File2Ed.Text);
    ini.WriteString('Form', 'Psiblast', PsiblastCb.Text);
    ini.WriteString('Form', 'Alignment', AlignmentCb.Text);
    ini.WriteString('Form', 'Coevolution', CoevolutionCb.Text);
    ini.WriteString('Form', 'Python', PythonClEd.Text);
    ini.UpdateFile;
  finally
    // Release the INI object even when writing fails.
    ini.Free;
  end;
end;
248 | 
{ Restores the form fields from section 'Form' of the per-user application
  INI file, if that file exists.

  Path fields default to an empty string. The Python interpreter keeps the
  value already shown in the form when the key is missing. Each combo box
  selects the saved option, or its first entry when the saved option is not
  in the list. }
procedure TForm1.LoadConfiguration;

var
  cfg: string;
  ini: TIniFile;

  // Selects in 'box' the option stored under 'key', falling back to index 0.
  procedure SelectSaved(box: TComboBox; const key: string);
  var
    idx: Integer;
  begin
    idx := box.Items.IndexOf(ini.ReadString('Form', key, ''));
    if idx < 0 then
      idx := 0;
    box.ItemIndex := idx;
  end;

begin
  cfg := GetAppConfigFile(False);
  if not FileExists(cfg) then
    Exit;

  ini := TIniFile.Create(cfg);
  try
    PycoFolderEd.Text := ini.ReadString('Form', 'PycoevolFolder', '');
    ParamFileEd.Text := ini.ReadString('Form', 'ParametersFile', '');
    File1Ed.Text := ini.ReadString('Form', 'File1', '');
    File2Ed.Text := ini.ReadString('Form', 'File2', '');
    // Keep the form's default interpreter when the key was never saved.
    PythonClEd.Text := ini.ReadString('Form', 'Python', PythonClEd.Text);

    SelectSaved(PsiblastCb, 'Psiblast');
    SelectSaved(AlignmentCb, 'Alignment');
    SelectSaved(CoevolutionCb, 'Coevolution');
  finally
    // Release the INI object even when reading fails.
    ini.Free;
  end;
end;
276 | 
{ Runs Pycoevol as a child process and streams its output into PycoMm.

  The current directory is switched to the Pycoevol folder, the command line
  from GetCommandLine is logged and executed with piped stdout/stderr, and
  the pipes are polled every 500 ms while the process runs. When
  FTerminatePyCo becomes True the process is terminated. A final line reports
  'Done' or 'Terminated by user'.

  Exceptions raised while starting the process propagate to the caller; the
  process object is released in every case. }
procedure TForm1.RunPycoevol;


var
  ebytes, nbytes: LongInt;
  proc: TProcess;
  cl: string;

  // Moves the caret to the end of the memo so the newest output is visible.
  procedure ScrollToEnd;
  begin
    PycoMm.SelStart := Length(PycoMm.Lines.Text) - 1;
    PycoMm.SelLength := 0;
    PycoMm.SetFocus;
  end;

  // Copies everything currently available on stdout and stderr to the memo,
  // pumps the message queue and honours a pending stop request.
  procedure RefreshOutput;

  var
    s: string;
    got: LongInt;

  begin
    nbytes := proc.Output.NumBytesAvailable;
    while nbytes > 0 do
      begin
      SetLength(s, nbytes);
      got := proc.Output.Read(s[1], nbytes);
      if got <= 0 then Break;
      // Keep only the bytes actually read.
      SetLength(s, got);
      PycoMm.Lines.Add(s);
      nbytes := proc.Output.NumBytesAvailable;
      end;

    ebytes := proc.Stderr.NumBytesAvailable;
    if ebytes > 0 then PycoMm.Lines.Add('*** ERROR ***');
    while ebytes > 0 do
      begin
      SetLength(s, ebytes);
      got := proc.Stderr.Read(s[1], ebytes);
      if got <= 0 then Break;
      SetLength(s, got);
      PycoMm.Lines.Add(s);
      ebytes := proc.Stderr.NumBytesAvailable;
      end;

    ScrollToEnd;
    // Gives the Stop button a chance to set FTerminatePyCo.
    Application.ProcessMessages;
    if FTerminatePyCo then proc.Active := False;
  end;

begin
  // Pycoevol is started from its own folder; report a failure but carry on,
  // as the original code did.
  if not SetCurrentDir(PycoFolderEd.Text) then
    PycoMm.Lines.Add('Warning: could not change to folder "' + PycoFolderEd.Text + '"');

  cl := GetCommandLine;
  proc := TProcess.Create(nil);
  try
    proc.CommandLine := cl;
    proc.Options := [poUsePipes];
    PycoMm.Lines.Add(cl);
    Application.ProcessMessages;
    proc.Execute;
    while proc.Running do
    begin
      Application.ProcessMessages;
      RefreshOutput;
      Sleep(500);
    end;
    // Collect whatever was written between the last poll and process exit.
    RefreshOutput;
  finally
    // Release the process object even when Execute or the polling raises.
    proc.Free;
  end;

  if FTerminatePyCo then PycoMm.Lines.Add('Terminated by user')
  else PycoMm.Lines.Add('Done');
  ScrollToEnd;

end;
339 | 
340 | end.
341 | 
342 | 


--------------------------------------------------------------------------------
/GUI/optionlists.txt:
--------------------------------------------------------------------------------
 1 | **coevolution
 2 | mi
 3 | mie
 4 | rcwmi
 5 | cpvn
 6 | clm
 7 | vol
 8 | omes
 9 | pearson
10 | spearman
11 | mcbasc
12 | quartets
13 | sca
14 | elsc
15 | **psiblast
16 | internet
17 | local
18 | custom
19 | **alignment
20 | clustalw
21 | muscle
22 | mafft
23 | custom


--------------------------------------------------------------------------------
/GUI/pycoevolgui.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/GUI/pycoevolgui.ico


--------------------------------------------------------------------------------
/GUI/pycoevolgui.lpi:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0"?>
  2 | <CONFIG>
  3 |   <ProjectOptions>
  4 |     <Version Value="9"/>
  5 |     <General>
  6 |       <MainUnit Value="0"/>
  7 |       <ResourceType Value="res"/>
  8 |       <UseXPManifest Value="True"/>
  9 |       <Icon Value="0"/>
 10 |       <ActiveWindowIndexAtStart Value="0"/>
 11 |     </General>
 12 |     <i18n>
 13 |       <EnableI18N LFM="False"/>
 14 |     </i18n>
 15 |     <VersionInfo>
 16 |       <StringTable ProductVersion=""/>
 17 |     </VersionInfo>
 18 |     <BuildModes Count="1" Active="Default">
 19 |       <Item1 Name="Default" Default="True"/>
 20 |     </BuildModes>
 21 |     <PublishOptions>
 22 |       <Version Value="2"/>
 23 |       <IncludeFileFilter Value="*.(pas|pp|inc|lfm|lpr|lrs|lpi|lpk|sh|xml)"/>
 24 |       <ExcludeFileFilter Value="*.(bak|ppu|o|so);*~;backup"/>
 25 |     </PublishOptions>
 26 |     <RunParams>
 27 |       <local>
 28 |         <FormatVersion Value="1"/>
 29 |         <LaunchingApplication PathPlusParams="/usr/bin/xterm -T 'Lazarus Run Output' -e $(LazarusDir)/tools/runwait.sh $(TargetCmdLine)"/>
 30 |       </local>
 31 |     </RunParams>
 32 |     <RequiredPackages Count="1">
 33 |       <Item1>
 34 |         <PackageName Value="LCL"/>
 35 |       </Item1>
 36 |     </RequiredPackages>
 37 |     <Units Count="4">
 38 |       <Unit0>
 39 |         <Filename Value="pycoevolgui.lpr"/>
 40 |         <IsPartOfProject Value="True"/>
 41 |         <UnitName Value="pycoevolgui"/>
 42 |         <WindowIndex Value="0"/>
 43 |         <TopLine Value="1"/>
 44 |         <CursorPos X="1" Y="1"/>
 45 |         <UsageCount Value="50"/>
 46 |       </Unit0>
 47 |       <Unit1>
 48 |         <Filename Value="mainform.pas"/>
 49 |         <IsPartOfProject Value="True"/>
 50 |         <ComponentName Value="Form1"/>
 51 |         <ResourceBaseClass Value="Form"/>
 52 |         <UnitName Value="mainform"/>
 53 |         <IsVisibleTab Value="True"/>
 54 |         <EditorIndex Value="0"/>
 55 |         <WindowIndex Value="0"/>
 56 |         <TopLine Value="100"/>
 57 |         <CursorPos X="3" Y="102"/>
 58 |         <UsageCount Value="50"/>
 59 |         <Loaded Value="True"/>
 60 |         <LoadedDesigner Value="True"/>
 61 |       </Unit1>
 62 |       <Unit2>
 63 |         <Filename Value="/usr/lib/lazarus/0.9.30/lcl/stdctrls.pp"/>
 64 |         <UnitName Value="StdCtrls"/>
 65 |         <WindowIndex Value="0"/>
 66 |         <TopLine Value="377"/>
 67 |         <CursorPos X="14" Y="395"/>
 68 |         <UsageCount Value="9"/>
 69 |       </Unit2>
 70 |       <Unit3>
 71 |         <Filename Value="/usr/share/fpcsrc/2.4.4/packages/fcl-process/src/pipes.pp"/>
 72 |         <UnitName Value="Pipes"/>
 73 |         <WindowIndex Value="0"/>
 74 |         <TopLine Value="21"/>
 75 |         <CursorPos X="16" Y="38"/>
 76 |         <UsageCount Value="9"/>
 77 |       </Unit3>
 78 |     </Units>
 79 |     <JumpHistory Count="30" HistoryIndex="29">
 80 |       <Position1>
 81 |         <Filename Value="mainform.pas"/>
 82 |         <Caret Line="228" Column="38" TopLine="194"/>
 83 |       </Position1>
 84 |       <Position2>
 85 |         <Filename Value="mainform.pas"/>
 86 |         <Caret Line="216" Column="27" TopLine="199"/>
 87 |       </Position2>
 88 |       <Position3>
 89 |         <Filename Value="mainform.pas"/>
 90 |         <Caret Line="232" Column="53" TopLine="222"/>
 91 |       </Position3>
 92 |       <Position4>
 93 |         <Filename Value="mainform.pas"/>
 94 |         <Caret Line="226" Column="22" TopLine="200"/>
 95 |       </Position4>
 96 |       <Position5>
 97 |         <Filename Value="mainform.pas"/>
 98 |         <Caret Line="228" Column="19" TopLine="205"/>
 99 |       </Position5>
100 |       <Position6>
101 |         <Filename Value="mainform.pas"/>
102 |         <Caret Line="206" Column="17" TopLine="189"/>
103 |       </Position6>
104 |       <Position7>
105 |         <Filename Value="mainform.pas"/>
106 |         <Caret Line="226" Column="48" TopLine="203"/>
107 |       </Position7>
108 |       <Position8>
109 |         <Filename Value="mainform.pas"/>
110 |         <Caret Line="217" Column="32" TopLine="209"/>
111 |       </Position8>
112 |       <Position9>
113 |         <Filename Value="mainform.pas"/>
114 |         <Caret Line="241" Column="32" TopLine="228"/>
115 |       </Position9>
116 |       <Position10>
117 |         <Filename Value="mainform.pas"/>
118 |         <Caret Line="129" Column="29" TopLine="117"/>
119 |       </Position10>
120 |       <Position11>
121 |         <Filename Value="mainform.pas"/>
122 |         <Caret Line="212" Column="5" TopLine="187"/>
123 |       </Position11>
124 |       <Position12>
125 |         <Filename Value="mainform.pas"/>
126 |         <Caret Line="216" Column="97" TopLine="195"/>
127 |       </Position12>
128 |       <Position13>
129 |         <Filename Value="mainform.pas"/>
130 |         <Caret Line="101" Column="3" TopLine="99"/>
131 |       </Position13>
132 |       <Position14>
133 |         <Filename Value="mainform.pas"/>
134 |         <Caret Line="94" Column="11" TopLine="90"/>
135 |       </Position14>
136 |       <Position15>
137 |         <Filename Value="mainform.pas"/>
138 |         <Caret Line="101" Column="51" TopLine="97"/>
139 |       </Position15>
140 |       <Position16>
141 |         <Filename Value="mainform.pas"/>
142 |         <Caret Line="143" Column="11" TopLine="133"/>
143 |       </Position16>
144 |       <Position17>
145 |         <Filename Value="mainform.pas"/>
146 |         <Caret Line="287" Column="5" TopLine="269"/>
147 |       </Position17>
148 |       <Position18>
149 |         <Filename Value="mainform.pas"/>
150 |         <Caret Line="259" Column="69" TopLine="239"/>
151 |       </Position18>
152 |       <Position19>
153 |         <Filename Value="mainform.pas"/>
154 |         <Caret Line="273" Column="1" TopLine="241"/>
155 |       </Position19>
156 |       <Position20>
157 |         <Filename Value="mainform.pas"/>
158 |         <Caret Line="263" Column="24" TopLine="206"/>
159 |       </Position20>
160 |       <Position21>
161 |         <Filename Value="mainform.pas"/>
162 |         <Caret Line="264" Column="24" TopLine="241"/>
163 |       </Position21>
164 |       <Position22>
165 |         <Filename Value="mainform.pas"/>
166 |         <Caret Line="203" Column="16" TopLine="191"/>
167 |       </Position22>
168 |       <Position23>
169 |         <Filename Value="mainform.pas"/>
170 |         <Caret Line="145" Column="9" TopLine="135"/>
171 |       </Position23>
172 |       <Position24>
173 |         <Filename Value="mainform.pas"/>
174 |         <Caret Line="168" Column="69" TopLine="140"/>
175 |       </Position24>
176 |       <Position25>
177 |         <Filename Value="mainform.pas"/>
178 |         <Caret Line="169" Column="69" TopLine="142"/>
179 |       </Position25>
180 |       <Position26>
181 |         <Filename Value="mainform.pas"/>
182 |         <Caret Line="145" Column="4" TopLine="127"/>
183 |       </Position26>
184 |       <Position27>
185 |         <Filename Value="mainform.pas"/>
186 |         <Caret Line="291" Column="46" TopLine="273"/>
187 |       </Position27>
188 |       <Position28>
189 |         <Filename Value="mainform.pas"/>
190 |         <Caret Line="170" Column="25" TopLine="149"/>
191 |       </Position28>
192 |       <Position29>
193 |         <Filename Value="mainform.pas"/>
194 |         <Caret Line="135" Column="47" TopLine="111"/>
195 |       </Position29>
196 |       <Position30>
197 |         <Filename Value="mainform.pas"/>
198 |         <Caret Line="117" Column="25" TopLine="100"/>
199 |       </Position30>
200 |     </JumpHistory>
201 |   </ProjectOptions>
202 |   <CompilerOptions>
203 |     <Version Value="9"/>
204 |     <Target>
205 |       <Filename Value="pycoevolgui"/>
206 |     </Target>
207 |     <SearchPaths>
208 |       <IncludeFiles Value="$(ProjOutDir)"/>
209 |       <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
210 |     </SearchPaths>
211 |     <Other>
212 |       <CompilerMessages>
213 |         <UseMsgFile Value="True"/>
214 |       </CompilerMessages>
215 |       <CompilerPath Value="$(CompPath)"/>
216 |     </Other>
217 |   </CompilerOptions>
218 |   <Debugging>
219 |     <Exceptions Count="3">
220 |       <Item1>
221 |         <Name Value="EAbort"/>
222 |       </Item1>
223 |       <Item2>
224 |         <Name Value="ECodetoolError"/>
225 |       </Item2>
226 |       <Item3>
227 |         <Name Value="EFOpenError"/>
228 |       </Item3>
229 |     </Exceptions>
230 |   </Debugging>
231 | </CONFIG>
232 | 


--------------------------------------------------------------------------------
/GUI/pycoevolgui.lpr:
--------------------------------------------------------------------------------
 1 | program pycoevolgui;
 2 | 
 3 | {$mode objfpc}{$H+}
 4 | 
 5 | uses
 6 |   {$IFDEF UNIX}{$IFDEF UseCThreads}
 7 |   cthreads,
 8 |   {$ENDIF}{$ENDIF}
 9 |   Interfaces, // this includes the LCL widgetset
10 |   Forms, mainform
11 |   { you can add units after this };
12 | 
13 | {$R *.res}
14 | 
begin
  // Standard LCL start-up sequence: initialise the application object,
  // create the main form, then enter the message loop.
  Application.Initialize;
  Application.CreateForm(TForm1, Form1);
  Application.Run;
end.
20 | 
21 | 


--------------------------------------------------------------------------------
/GUI/pycoevolgui.res:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/GUI/pycoevolgui.res


--------------------------------------------------------------------------------
/Matrix/BLOSUM62:
--------------------------------------------------------------------------------
 1 |   4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
 2 |  -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
 3 |  -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
 4 |  -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
 5 |   0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
 6 |  -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
 7 |  -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
 8 |   0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
 9 |  -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
10 |  -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
11 |  -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
12 |  -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
13 |  -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
14 |  -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
15 |  -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
16 |   1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
17 |   0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
 18 |  -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
19 |  -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
20 |   0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
21 |  -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
22 |  -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
23 |   0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
24 |  -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -1


--------------------------------------------------------------------------------
/Matrix/CLM:
--------------------------------------------------------------------------------
 1 | 0.50  0.90  0.30  0.30  1.40  0.30  0.60  1.30  0.30  1.30  1.10  0.40  0.50  0.50  0.50  0.40  0.60  1.10  1.40  1.30
 2 | 0.90  9.60  0.60  0.50  2.60  0.70  1.80  1.80  0.60  1.70  1.50  0.70  1.00  0.80  0.80  0.80  0.80  1.70  2.70  2.20
 3 | 0.30  0.60  0.50  0.30  0.50  0.40  1.10  0.50  1.00  0.40  0.50  0.70  0.40  0.50  1.50  0.60  0.50  0.40  1.10  1.20
 4 | 0.30  0.40  0.40  0.40  0.80  0.30  1.00  0.60  1.10  0.60  0.70  0.60  0.50  0.60  1.50  0.50  0.60  0.50  1.20  1.30
 5 | 1.40  2.40  0.60  0.70  4.30  0.90  1.40  3.30  0.70  3.20  2.80  0.80  1.30  1.10  1.40  0.90  1.10  2.60  4.10  3.10
 6 | 0.40  0.60  0.40  0.30  1.00  0.40  0.60  0.60  0.50  0.60  0.70  0.60  0.40  0.50  0.70  0.40  0.50  0.60  1.10  1.00
 7 | 0.70  1.50  1.40  1.10  1.20  0.70  2.20  1.10  0.70  1.20  1.60  0.80  0.60  0.90  1.10  1.00  1.00  0.90  2.30  1.90
 8 | 1.40  1.70  0.50  0.60  3.40  0.60  1.10  3.50  0.60  3.20  2.60  0.60  0.90  0.80  1.00  0.70  1.10  2.70  3.20  2.80
 9 | 0.30  0.60  1.00  1.10  0.80  0.40  0.60  0.60  0.30  0.60  0.60  0.60  0.40  0.70  0.50  0.50  0.50  0.50  0.90  1.10
10 | 1.20  1.60  0.40  0.60  3.70  0.60  1.10  3.30  0.60  3.30  2.70  0.60  0.90  0.90  1.00  0.60  1.00  2.60  3.40  2.50
11 | 1.10  1.60  0.50  0.50  3.40  0.60  1.40  2.40  0.50  2.50  2.90  0.80  1.00  1.00  1.00  0.70  0.80  2.20  3.60  2.80
12 | 0.40  0.70  0.70  0.60  0.80  0.60  0.90  0.60  0.60  0.60  0.90  0.90  0.70  0.90  1.00  0.60  0.80  0.60  1.30  1.20
13 | 0.50  1.00  0.50  0.50  1.40  0.30  1.00  0.80  0.40  0.90  1.10  0.70  0.70  0.70  0.90  0.50  0.60  0.80  2.20  1.70
14 | 0.40  0.80  0.60  0.50  1.10  0.60  1.00  0.80  0.70  0.80  0.90  0.80  0.70  0.80  1.60  0.50  0.80  0.70  1.50  1.20
15 | 0.50  0.90  1.70  1.60  1.50  0.70  1.40  1.10  0.50  1.10  1.10  1.00  0.90  1.00  1.00  0.90  0.90  1.00  1.90  1.90
16 | 0.30  0.60  0.50  0.50  0.80  0.30  0.90  0.60  0.50  0.60  0.60  0.60  0.40  0.70  0.70  0.40  0.60  0.60  1.20  0.90
17 | 0.50  0.90  0.60  0.70  1.10  0.40  1.00  1.10  0.60  1.00  1.00  0.60  0.60  0.80  0.90  0.70  0.80  0.90  1.40  1.20
18 | 1.10  1.40  0.40  0.60  3.00  0.50  1.00  2.60  0.60  2.80  2.20  0.60  0.80  0.80  0.80  0.70  0.90  2.40  2.30  2.00
19 | 1.30  2.40  1.20  1.00  3.60  1.10  2.30  3.00  1.20  3.70  3.70  1.00  1.90  1.60  1.80  1.10  1.10  2.70  4.20  3.40
20 | 1.20  1.80  1.10  1.20  3.30  0.90  1.90  2.20  1.20  2.20  2.60  1.00  1.50  1.30  1.60  1.00  1.10  2.00  3.00  2.50


--------------------------------------------------------------------------------
/Matrix/CPVN:
--------------------------------------------------------------------------------
 1 |  3.89  4.91  4.59 5.33  1.76 5.25  2.84  0.77 3.05  1.00 6.24 5.61  3.27 3.38  3.20 3.60  2.30  1.59 3.23 3.80
 2 |  4.91  3.74  4.20 4.69  2.89 4.37  2.57 -0.41 2.83  1.42 2.92 3.95  2.90 3.21  3.22 3.22  1.93  1.36 4.45 4.18
 3 |  4.59  4.20  4.03 4.86  2.93 5.32  2.77 -0.37 2.07  1.41 5.77 4.19  2.50 4.88  3.12 3.46  1.40  2.31 3.15 4.99
 4 |  5.33  4.69  4.86 5.34  3.68 5.28  3.00  0.14 3.34  1.75 5.83 5.83  4.25 3.47  2.87 4.25  0.99  3.11 3.57 4.49
 5 |  1.76  2.89  2.93 3.68  7.65 1.84  1.46 -0.25 1.03  2.48 2.14 2.47  2.74 4.12  2.51 1.33  0.24 -0.42 2.05 2.81
 6 |  5.25  4.37  5.32 5.28  1.84 6.02  2.30  0.91 2.09  1.61 4.89 4.81  3.38 4.65  3.88 4.18  0.36  2.30 3.93 3.62
 7 |  2.84  2.57  2.77 3.00  1.46 2.30 -0.52 -1.77 1.21  0.39 3.37 2.47  1.22 2.59  1.71 1.72  1.13  1.69 2.13 1.90
 8 |  0.77 -0.41 -0.37 0.14 -0.25 0.91 -1.77 -4.40 0.21 -1.53 1.42 1.25 -0.51 1.08 -0.89 0.70 -0.08 -0.54 1.33 1.59
 9 |  3.05  2.83  2.07 3.34  1.03 2.09  1.21  0.21 1.27  1.91 5.12 3.14  2.65 2.71  2.88 1.82  3.88  2.52 3.67 3.77
10 |  1.00  1.42  1.41 1.75  2.48 1.61  0.39 -1.53 1.91 -0.09 2.87 2.30  1.33 0.80  2.60 2.00  2.94  1.77 2.74 2.82
11 |  6.24  2.92  5.77 5.83  2.14 4.89  3.37  1.42 5.12  2.87 5.85 6.19  7.87 6.46  1.20 1.37  2.62  3.54 5.76 8.57
12 |  5.61  3.95  4.19 5.83  2.47 4.81  2.47  1.25 3.14  2.30 6.19 5.93  4.22 6.05  4.54 2.05  1.76  3.66 5.26 5.28
13 |  3.27  2.90  2.50 4.25  2.74 3.38  1.22 -0.51 2.65  1.33 7.87 4.22  0.60 2.89  3.17 3.50  1.46  3.09 3.75 3.99
14 |  3.38  3.21  4.88 3.47  4.12 4.65  2.59  1.08 2.71  0.80 6.46 6.05  2.89 5.37  2.30 4.00  5.20  2.38 2.72 4.90
15 |  3.20  3.22  3.12 2.87  2.51 3.88  1.71 -0.89 2.88  2.60 1.20 4.54  3.17 2.30  1.65 1.95  0.08  2.68 5.32 5.75
16 |  3.60  3.22  3.46 4.25  1.33 4.18  1.72  0.70 1.82  2.00 1.37 2.05  3.50 4.00  1.95 2.83  3.26  3.45 3.50 4.50
17 |  2.30  1.93  1.40 0.99  0.24 0.36  1.13 -0.08 3.88  2.94 2.62 1.76  1.46 5.20  0.08 3.26  0.13  3.85 3.90 4.94
18 |  1.59  1.36  2.31 3.11 -0.42 2.30  1.69 -0.54 2.52  1.77 3.54 3.66  3.09 2.38  2.68 3.45  3.85  2.92 3.17 3.85
19 |  3.23  4.45  3.15 3.57  2.05 3.93  2.13  1.33 3.67  2.74 5.76 5.26  3.75 2.72  5.32 3.50  3.90  3.17 3.24 2.29
20 |  3.80  4.18  4.99 4.49  2.81 3.62  1.90  1.59 3.77  2.82 8.57 5.28  3.99 4.90  5.75 4.50  4.94  3.85 2.29 2.87


--------------------------------------------------------------------------------
/Matrix/MCLACHLAN:
--------------------------------------------------------------------------------
 1 |  8.0 2.0 3.0 3.0 1.0 3.0 4.0 3.0 3.0 2.0 2.0 3.0 3.0 1.0 4.0 4.0 3.0 1.0 1.0 3.0
 2 |  2.0 8.0 3.0 1.0 1.0 5.0 3.0 3.0 5.0 1.0 2.0 5.0 1.0 1.0 3.0 4.0 3.0 3.0 2.0 2.0
 3 |  3.0 3.0 8.0 5.0 1.0 4.0 4.0 3.0 4.0 1.0 1.0 4.0 2.0 0.0 1.0 5.0 3.0 0.0 2.0 1.0
 4 |  3.0 1.0 5.0 8.0 1.0 4.0 5.0 3.0 4.0 0.0 1.0 3.0 2.0 1.0 3.0 3.0 3.0 0.0 1.0 1.0
 5 |  1.0 1.0 1.0 1.0 9.0 0.0 0.0 1.0 3.0 1.0 0.0 0.0 3.0 0.0 0.0 2.0 2.0 2.0 1.0 1.0
 6 |  3.0 5.0 4.0 4.0 0.0 8.0 5.0 2.0 4.0 0.0 3.0 4.0 3.0 0.0 3.0 4.0 3.0 2.0 1.0 2.0
 7 |  4.0 3.0 4.0 5.0 0.0 5.0 8.0 3.0 2.0 1.0 1.0 4.0 1.0 0.0 4.0 4.0 4.0 1.0 2.0 2.0
 8 |  3.0 3.0 3.0 3.0 1.0 2.0 3.0 8.0 2.0 1.0 1.0 3.0 1.0 0.0 3.0 3.0 2.0 1.0 0.0 2.0
 9 |  3.0 5.0 4.0 4.0 3.0 4.0 2.0 2.0 8.0 2.0 2.0 4.0 3.0 4.0 3.0 3.0 4.0 3.0 4.0 2.0
10 |  2.0 1.0 1.0 0.0 1.0 0.0 1.0 1.0 2.0 8.0 5.0 1.0 5.0 3.0 1.0 2.0 3.0 3.0 3.0 5.0
11 |  2.0 2.0 1.0 1.0 0.0 3.0 1.0 1.0 2.0 5.0 8.0 2.0 6.0 5.0 1.0 2.0 3.0 3.0 3.0 5.0
12 |  3.0 5.0 4.0 3.0 0.0 4.0 4.0 3.0 4.0 1.0 2.0 8.0 1.0 0.0 3.0 3.0 3.0 1.0 1.0 2.0
13 |  3.0 1.0 2.0 2.0 3.0 3.0 1.0 1.0 3.0 5.0 6.0 1.0 8.0 5.0 1.0 2.0 3.0 1.0 2.0 4.0
14 |  1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 4.0 3.0 5.0 0.0 5.0 9.0 1.0 2.0 1.0 6.0 6.0 3.0
15 |  4.0 3.0 1.0 3.0 0.0 3.0 4.0 3.0 3.0 1.0 1.0 3.0 1.0 1.0 8.0 3.0 3.0 0.0 0.0 2.0
16 |  4.0 4.0 5.0 3.0 2.0 4.0 4.0 3.0 3.0 2.0 2.0 3.0 2.0 2.0 3.0 8.0 5.0 3.0 3.0 2.0
17 |  3.0 3.0 3.0 3.0 2.0 3.0 4.0 2.0 4.0 3.0 3.0 3.0 3.0 1.0 3.0 5.0 8.0 2.0 1.0 3.0
18 |  1.0 3.0 0.0 0.0 2.0 2.0 1.0 1.0 3.0 3.0 3.0 1.0 1.0 6.0 0.0 3.0 2.0 9.0 6.0 2.0
19 |  1.0 2.0 2.0 1.0 1.0 1.0 2.0 0.0 4.0 3.0 3.0 1.0 2.0 6.0 0.0 3.0 1.0 6.0 9.0 3.0
20 |  3.0 2.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 5.0 5.0 2.0 4.0 3.0 2.0 2.0 3.0 2.0 3.0 8.0


--------------------------------------------------------------------------------
/Matrix/PAM250:
--------------------------------------------------------------------------------
 1 |   2 -2  0  0 -2  0  0  1 -1 -1 -2 -1 -1 -3  1  1  1 -6 -3  0  0  0  0 -8
 2 |  -2  6  0 -1 -4  1 -1 -3  2 -2 -3  3  0 -4  0  0 -1  2 -4 -2 -1  0 -1 -8
 3 |   0  0  2  2 -4  1  1  0  2 -2 -3  1 -2 -3  0  1  0 -4 -2 -2  2  1  0 -8
 4 |   0 -1  2  4 -5  2  3  1  1 -2 -4  0 -3 -6 -1  0  0 -7 -4 -2  3  3 -1 -8
 5 |  -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3  0 -2 -8  0 -2 -4 -5 -3 -8
 6 |   0  1  1  2 -5  4  2 -1  3 -2 -2  1 -1 -5  0 -1 -1 -5 -4 -2  1  3 -1 -8
 7 |   0 -1  1  3 -5  2  4  0  1 -2 -3  0 -2 -5 -1  0  0 -7 -4 -2  3  3 -1 -8
 8 |   1 -3  0  1 -3 -1  0  5 -2 -3 -4 -2 -3 -5  0  1  0 -7 -5 -1  0  0 -1 -8
 9 |  -1  2  2  1 -3  3  1 -2  6 -2 -2  0 -2 -2  0 -1 -1 -3  0 -2  1  2 -1 -8
10 |  -1 -2 -2 -2 -2 -2 -2 -3 -2  5  2 -2  2  1 -2 -1  0 -5 -1  4 -2 -2 -1 -8
11 |  -2 -3 -3 -4 -6 -2 -3 -4 -2  2  6 -3  4  2 -3 -3 -2 -2 -1  2 -3 -3 -1 -8
12 |  -1  3  1  0 -5  1  0 -2  0 -2 -3  5  0 -5 -1  0  0 -3 -4 -2  1  0 -1 -8
13 |  -1  0 -2 -3 -5 -1 -2 -3 -2  2  4  0  6  0 -2 -2 -1 -4 -2  2 -2 -2 -1 -8
14 |  -3 -4 -3 -6 -4 -5 -5 -5 -2  1  2 -5  0  9 -5 -3 -3  0  7 -1 -4 -5 -2 -8
15 |   1  0  0 -1 -3  0 -1  0  0 -2 -3 -1 -2 -5  6  1  0 -6 -5 -1 -1  0 -1 -8
16 |   1  0  1  0  0 -1  0  1 -1 -1 -3  0 -2 -3  1  2  1 -2 -3 -1  0  0  0 -8
17 |   1 -1  0  0 -2 -1  0  0 -1  0 -2  0 -1 -3  0  1  3 -5 -3  0  0 -1  0 -8
18 |  -6  2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4  0 -6 -2 -5 17  0 -6 -5 -6 -4 -8
19 |  -3 -4 -2 -4  0 -4 -4 -5  0 -1 -1 -4 -2  7 -5 -3 -3  0 10 -2 -3 -4 -2 -8
20 |   0 -2 -2 -2 -2 -2 -2 -1 -2  4  2 -2  2 -1 -1 -1  0 -6 -2  4 -2 -2 -1 -8
21 |   0 -1  2  3 -4  1  3  0  1 -2 -3  1 -2 -4 -1  0  0 -5 -3 -2  3  2 -1 -8
22 |   0  0  1  3 -5  3  3  0  2 -2 -3  0 -2 -5  0  0 -1 -6 -4 -2  2  3 -1 -8
23 |   0 -1  0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1  0  0 -4 -2 -1 -1 -1 -1 -8
24 |  -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8  1


--------------------------------------------------------------------------------
/Matrix/VOL:
--------------------------------------------------------------------------------
 1 | 2.20 2.04 2.21 2.44 1.89 2.25 1.71 1.55 1.92 1.76 2.68 2.48 1.95 2.21 2.04 2.13 1.89 1.97 2.24 2.42
 2 | 2.04 1.88 2.04 2.28 1.73 2.09 1.55 1.39 1.76 1.60 2.52 2.31 1.79 2.05 1.88 1.97 1.73 1.81 2.08 2.25
 3 | 2.21 2.04 2.21 2.44 1.90 2.26 1.72 1.56 1.93 1.77 2.69 2.48 1.96 2.22 2.05 2.14 1.90 1.98 2.25 2.42
 4 | 2.44 2.28 2.44 2.68 2.13 2.49 1.95 1.79 2.16 2.00 2.92 2.72 2.19 2.45 2.28 2.37 2.13 2.21 2.48 2.65
 5 | 1.89 1.73 1.90 2.13 1.58 1.94 1.40 1.24 1.62 1.46 2.37 2.17 1.64 1.90 1.73 1.82 1.58 1.66 1.93 2.11
 6 | 2.25 2.09 2.26 2.49 1.94 2.30 1.76 1.60 1.97 1.82 2.73 2.53 2.00 2.26 2.09 2.18 1.94 2.02 2.29 2.47
 7 | 1.71 1.55 1.72 1.95 1.40 1.76 1.23 1.07 1.44 1.28 2.20 1.99 1.47 1.72 1.55 1.65 1.41 1.49 1.76 1.93
 8 | 1.55 1.39 1.56 1.79 1.24 1.60 1.07 0.91 1.28 1.12 2.04 1.83 1.31 1.56 1.39 1.49 1.25 1.33 1.60 1.77
 9 | 1.92 1.76 1.93 2.16 1.62 1.97 1.44 1.28 1.65 1.49 2.41 2.20 1.68 1.94 1.76 1.86 1.62 1.70 1.97 2.14
10 | 1.76 1.60 1.77 2.00 1.46 1.82 1.28 1.12 1.49 1.33 2.25 2.04 1.52 1.78 1.61 1.70 1.46 1.54 1.81 1.98
11 | 2.68 2.52 2.69 2.92 2.37 2.73 2.20 2.04 2.41 2.25 3.17 2.96 2.44 2.69 2.52 2.62 2.37 2.46 2.73 2.90
12 | 2.48 2.31 2.48 2.72 2.17 2.53 1.99 1.83 2.20 2.04 2.96 2.75 2.23 2.49 2.32 2.41 2.17 2.25 2.52 2.69
13 | 1.95 1.79 1.96 2.19 1.64 2.00 1.47 1.31 1.68 1.52 2.44 2.23 1.70 1.96 1.79 1.88 1.64 1.72 2.00 2.17
14 | 2.21 2.05 2.22 2.45 1.90 2.26 1.72 1.56 1.94 1.78 2.69 2.49 1.96 2.22 2.05 2.14 1.90 1.98 2.25 2.43
15 | 2.04 1.88 2.05 2.28 1.73 2.09 1.55 1.39 1.76 1.61 2.52 2.32 1.79 2.05 1.88 1.97 1.73 1.81 2.08 2.26
16 | 2.13 1.97 2.14 2.37 1.82 2.18 1.65 1.49 1.86 1.70 2.62 2.41 1.88 2.14 1.97 2.06 1.82 1.91 2.18 2.35
17 | 1.89 1.73 1.90 2.13 1.58 1.94 1.41 1.25 1.62 1.46 2.37 2.17 1.64 1.90 1.73 1.82 1.58 1.66 1.94 2.11
18 | 1.97 1.81 1.98 2.21 1.66 2.02 1.49 1.33 1.70 1.54 2.46 2.25 1.72 1.98 1.81 1.91 1.66 1.75 2.02 2.19
19 | 2.24 2.08 2.25 2.48 1.93 2.29 1.76 1.60 1.97 1.81 2.73 2.52 2.00 2.25 2.08 2.18 1.94 2.02 2.29 2.46
20 | 2.42 2.25 2.42 2.65 2.11 2.47 1.93 1.77 2.14 1.98 2.90 2.69 2.17 2.43 2.26 2.35 2.11 2.19 2.46 2.63
21 | 


--------------------------------------------------------------------------------
/Parameters.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | """Parameters Loader"""
  9 | 
 10 | from ConfigParser import SafeConfigParser
 11 | 
 12 | surface_threshold = 7                   # 7% [0, max(surface)[ 
 13 | psiblast_evalue = 0.00001               # [0.00000001:10]
 14 | psiblast_identity = 30                  # [0:100] (%)
 15 | psiblast_coverage = 60                  # [0:100] (%)
 16 | psiblast_threading = False              # Number of cores/servers or False
 17 | pairwise_distance = "clustalw"          # "clustalw", "pdistance", "Kimura"
 18 |                                         # "jukescantor" or "alignscore"
 19 | alignscore_matrix = "BLOSUM62"          # "BLOSUM62" or "PAM250"
 20 | theilsen_cutoff = 0.7                   # [0.25:1.0(all sequences)]
 21 | clustalw_gap_opening = 10               # [0:100]
 22 | clustalw_gap_extension = 0.2            # [0:10]
 23 | clustalw_distance_matrix = "GONNET"     # "GONNET", "BLOSUM" or "PAM"
 24 | muscle_max_iteration = 16               # [2:16]
 25 | mafft_configuration = "linsi"           # "fftnsi" or "linsi"
 26 | mafft_threading = False                 # Number of cores/servers or False
 27 | alphabet_reduction = False              # False or "charge", "charge_his", "polarity" 
 28 |                                         # or "hydropathy"
 29 | alignment_score = False                 # "sumofpairs" or False
 30 | best_results = 20                       # [1:max(scores)]
 31 | results_histogram = True                # True or False
 32 | results_heatmap = True                  # True or False
 33 | results_structure = "pymol"             # "pymol" or False
 34 | results_sifts = False                   # True or False
 35 | 
 36 | def SaveParameters(filename):
 37 |     "Saves default parameters"
 38 |     
 39 |     parser = SafeConfigParser()
 40 |     parser.add_section('Global')
 41 |     parser.add_section('Psiblast')
 42 |     parser.add_section('Clustalw')
 43 |     parser.add_section('Muscle')
 44 |     parser.add_section('Mafft')
 45 |     parser.add_section('Results')
 46 |     parser.set('Global', 'SurfaceThreshold', float(surface_threshold))
 47 |     parser.set('Psiblast', 'Evalue', float(psiblast_evalue))
 48 |     parser.set('Psiblast', 'Identity', int(psiblast_identity))
 49 |     parser.set('Psiblast', 'Coverage', int(psiblast_coverage))
 50 |     parser.set('Psiblast', 'Threading', str(psiblast_threading))
 51 |     parser.set('Global', 'PairwiseDistance', str(pairwise_distance))
 52 |     parser.set('Clustalw', 'GapOpening', float(clustalw_gap_opening))
 53 |     parser.set('Clustalw', 'GapExtension', float(clustalw_gap_extension))
 54 |     parser.set('Clustalw', 'Matrix', str(clustalw_distance_matrix))
 55 |     parser.set('Global', 'Matrix', str(alignscore_matrix))
 56 |     parser.set('Global', 'TheilSenCutoff', float(theilsen_cutoff))
 57 |     parser.set('Muscle', 'MaxIteration', int(muscle_max_iteration))
 58 |     parser.set('Mafft', 'Configuration', str(mafft_configuration))
 59 |     parser.set('Mafft', 'Threading', str(mafft_threading))
 60 |     parser.set('Global', 'AlphabetReduction', str(alphabet_reduction))
 61 |     parser.set('Global', 'AlignmentScore', str(alignment_score))
 62 |     parser.set('Results', 'Best', int(best_results))
 63 |     parser.set('Results', 'Histogram', str(results_histogram))
 64 |     parser.set('Results', 'Heatmap', str(results_heatmap))
 65 |     parser.set('Results', 'Structure', str(results_structure))
 66 |     parser.set('Results', 'Sifts', str(results_sifts))
 67 |     fil = open(filename, 'w')
 68 |     parser.write(fil)
 69 |     fil.close()
 70 | 
 71 | def LoadParameters(filename, option):
 72 |     "Loads and tests input parameters"
 73 |     
 74 |     parser = SafeConfigParser()
 75 |     try:
 76 |         parser.read(filename)
 77 |         if option == "surface_threshold":
 78 |             surface_threshold = parser.getfloat('Global', 'SurfaceThreshold') 
 79 |             return surface_threshold
 80 |         elif option == "psiblast_evalue":
 81 |             psiblast_evalue = parser.getfloat('Psiblast', 'Evalue')
 82 |             return psiblast_evalue
 83 |         elif option == "psiblast_identity":
 84 |             psiblast_identity = parser.getint('Psiblast', 'Identity')
 85 |             return psiblast_identity
 86 |         elif option == "psiblast_coverage":
 87 |             psiblast_coverage = parser.getint('Psiblast', 'Coverage')
 88 |             return psiblast_coverage
 89 |         elif option == "psiblast_threading":
 90 |             psiblast_threading = parser.get('Psiblast', 'Threading')
 91 |             return psiblast_threading
 92 |         elif option == "pairwise_distance":
 93 |             pairwise_distance = parser.get('Global', 'PairwiseDistance')
 94 |             return pairwise_distance
 95 |         elif option == "clustalw_gap_opening":
 96 |             clustalw_gap_opening = parser.getfloat('Clustalw', 'GapOpening')
 97 |             return clustalw_gap_opening
 98 |         elif option == "clustalw_gap_extension":
 99 |             clustalw_gap_extension = parser.getfloat('Clustalw', 'GapExtension')
100 |             return clustalw_gap_extension
101 |         elif option == "clustalw_distance_matrix":
102 |             clustalw_distance_matrix = parser.get('Clustalw', 'Matrix')
103 |             return clustalw_distance_matrix
104 |         elif option == "alignscore_matrix":
105 |             alignscore_matrix = parser.get('Global', 'Matrix')
106 |             return alignscore_matrix
107 |         elif option == "theilsen_cutoff":
108 |             theilsen_cutoff = parser.getfloat('Global', 'TheilSenCutoff')
109 |             return theilsen_cutoff
110 |         elif option == "muscle_max_iteration":
111 |             muscle_max_iteration = parser.getint('Muscle', 'MaxIteration')
112 |             return muscle_max_iteration
113 |         elif option == "mafft_configuration":
114 |             mafft_configuration = parser.get('Mafft', 'Configuration')
115 |             return mafft_configuration
116 |         elif option == "mafft_threading":
117 |             mafft_threading = parser.get('Mafft', 'Threading')
118 |             return mafft_threading
119 |         elif option == "alphabet_reduction":
120 |             alphabet_reduction = parser.get('Global', 'AlphabetReduction')
121 |             return alphabet_reduction
122 |         elif option == "alignment_score":
123 |             alignment_score = parser.get('Global', 'AlignmentScore')
124 |             return alignment_score
125 |         elif option == "best_results":
126 |             best_results = parser.getint('Results', 'Best')
127 |             return best_results
128 |         elif option == "results_histogram":
129 |             results_histogram = parser.getboolean('Results', 'Histogram')
130 |             return results_histogram
131 |         elif option == "results_heatmap":
132 |             results_heatmap = parser.getboolean('Results', 'Heatmap')
133 |             return results_heatmap
134 |         elif option == "results_structure":
135 |             results_structure = parser.get('Results', 'Structure')
136 |             return results_structure
137 |         elif option == "results_sifts":
138 |             results_sifts = parser.getboolean('Results', 'Sifts')
139 |             return results_sifts
140 |         elif option == "test":
141 |             parser.getint('Results', 'Best')
142 |             print "Parameters... OK"
143 |             return
144 |         else:
145 |             raise StandardError, "ERROR: Invalid option"
146 |     except:
147 |         raise StandardError, "ERROR: Invalid Parameters File"
148 | 
149 | 


--------------------------------------------------------------------------------
/Params.config:
--------------------------------------------------------------------------------
 1 | [Global]
 2 | surfacethreshold = 7
 3 | pairwisedistance = clustalw
 4 | matrix = BLOSUM62
 5 | theilsencutoff = 0.7
 6 | alignmentscore = False
 7 | alphabetreduction = False
 8 | 
 9 | [Psiblast]
10 | evalue = 10
11 | identity = 0
12 | coverage = 0
13 | threading = False
14 | 
15 | [Clustalw]
16 | gapopening = 10
17 | gapextension = 0.2
18 | matrix = GONNET
19 | 
20 | [Muscle]
21 | maxiteration = 16
22 | 
23 | [Mafft]
24 | configuration = linsi
25 | threading = False
26 | 
27 | [Results]
28 | best = 20
29 | histogram = True
30 | heatmap = True
31 | structure = pymol
32 | sifts = False
33 | 
34 | 


--------------------------------------------------------------------------------
/Pycoevol.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | #TODO: 
  8 | # Interaction maps
  9 | 
 10 | import os
 11 | import sys
 12 | from src import MAIN
 13 | from Parameters import LoadParameters as LP
 14 | from optparse import OptionParser
 15 | from Bio.Align.Applications import ClustalwCommandline
 16 | 
 17 | def printUsage():
 18 |     """Prints the usage - DEPRECATED"""
 19 |     __version__ = "beta"
 20 |     
 21 |     Usage = \
 22 |     """
 23 |     Pycoevol_%s (c) 2012, F. Madeira
 24 |   
 25 |     Pycoevol: A Python workflow to study protein-protein coevolution 
 26 |     and interaction.
 27 | 
 28 |     Pycoevol.py   input1 input2 
 29 |        
 30 |     input1        seq1.fasta (-seqID1), pdb1.pdb:A (-PDBID1:A)   
 31 |                   or align1.fasta (where A is the chain designator)                
 32 |     input2        seq2.fasta (seqID2), -pdb2.pdb:B (-PDBID2:B)  
 33 |                   or -align2.fasta (where B is the chain designator) 
 34 |     -p --psiblast
 35 |                   internet, local or custom (NCBI's PSIBLAST and 
 36 |                   local database are optional) 
 37 |     -a --alignment
 38 |                   clustalw, muscle, mafft or custom (MUSCLE and  
 39 |                   MAFFT are optional) 
 40 |     -c --coevolution
 41 |                   mi, mie, rcwmi, cpvnmie, cpvn, clm, vol
 42 |                   omes, pearson, spearman, mcbasc, quartets,
 43 |                   sca or elsc
 44 |     -x --chain
 45 |                   chain identifier (in same order as input file). Default A
 46 |     -i --id
 47 |                   identifier for each protein, in same order as input files.
 48 |     -h --help
 49 |                  
 50 |     Check the README.md for further details.
 51 |     """ % __version__
 52 |     print Usage
 53 |         
 54 | def pycoevolRun():
 55 |     "Routine which chooses the proper scripts given the input commands"
 56 |     main = MAIN.main(file1, file2, id1, id2, chain1, chain2, parameterfile,
 57 |                      psiblast, alignment, coevolution, dirname)
 58 |     
 59 |     if psiblast == "custom" and alignment == "custom":
 60 |         print 'Coevolution scripts...'
 61 |         sys.stdout.flush()
 62 |         main.coevolutionSripts()
 63 |         print '... OK'
 64 |     else:
 65 |         print 'Sequence scripts...'
 66 |         sys.stdout.flush()
 67 |         main.sequenceSripts()
 68 |         print '... OK'
 69 |         
 70 |         print 'BLAST scripts...'
 71 |         sys.stdout.flush()
 72 |         main.psiblastSripts()
 73 |         print '... OK'
 74 |         
 75 |         print 'Organism scripts...'
 76 |         sys.stdout.flush()
 77 |         main.organismSripts()
 78 |         print '... OK'
 79 |         
 80 |         print 'Alignment scripts...'
 81 |         sys.stdout.flush()
 82 |         main.alignmentSripts()
 83 |         print '... OK'
 84 |         
 85 |         print 'Coevolution scripts...'
 86 |         sys.stdout.flush()
 87 |         main.coevolutionSripts()
 88 |         print '... OK'
 89 |         
 90 |         print 'Info scripts...'
 91 |         sys.stdout.flush()
 92 |         main.infoScripts(SIFTS)
 93 |         print '... OK'
 94 |     return
 95 |    
 96 | def checkArguments():
 97 |     "Checks if the input commands are valid"
 98 |     try:
 99 |         input = str("./Data/" + file1)
100 |         file = open(input, "r")
101 |         file.close()
102 |     except:
103 |         #raise StandardError, "ERROR: File no.1 is not acessible"
104 |         pass
105 |     
106 |     try:
107 |         input = str("./Data/" + file2)
108 |         file = open(input, "r")
109 |         file.close() 
110 |     except:
111 |         #raise StandardError, "ERROR: File no.2 is not acessible"
112 |         pass
113 |     
114 |     if len(chain1) <= 2 and len(chain2) <= 2:
115 |         pass
116 |     else:
117 |         raise StandardError, "ERROR: Chains' length must be = 1"
118 |     
119 |     if psiblast != 'internet' and psiblast != 'local' and psiblast != 'custom':
120 |         raise StandardError, "ERROR: PSI-Blast: Type 'internet', 'local'\
121 |                             or 'custom'"
122 |     
123 |     if alignment != "clustalw" and alignment != "muscle" and \
124 |     alignment != "mafft" and alignment != 'custom':
125 |         raise StandardError, "ERROR: Alignment Tools: Type '-clustalw', \
126 |         '-muscle', '-mafft' or 'custom'"
127 |     
128 |     if coevolution != 'mi' and coevolution != 'mie' and \
129 |     coevolution != 'rcwmi' and coevolution != 'cpvnmie' and \
130 |     coevolution != 'cpvn' and coevolution != 'clm' and \
131 |     coevolution != 'vol' and coevolution != 'omes' and \
132 |     coevolution != 'pearson' and coevolution != 'spearman' and \
133 |     coevolution != 'mcbasc' and coevolution != 'quartets' and \
134 |     coevolution != 'sca' and coevolution != 'elsc':
135 |         raise StandardError, "ERROR: Coevolution Measure: Type '–mi', '–mie', \
136 |         '–rcwmi', '–cpvnmie', '–cpvn', '–clm', '–vol', '-omes', '-pearson', \
137 |         '-spearman', '-mcbasc', '-quartets', '-sca' or '-elsc'"
138 | 
def checkDependencies():
    """Check that the mandatory Python modules and ClustalW are usable.

    Raises ImportError when Biopython, Numpy or Matplotlib cannot be
    imported, and StandardError when neither bundled ClustalW executable
    ("clustalw.exe", then "clustalw") can align the bundled test file.
    """
    for module_name, label in (("Bio", "Biopython"),
                               ("numpy", "Numpy"),
                               ("matplotlib", "Matplotlib")):
        try:
            __import__(module_name)
        except ImportError:
            raise ImportError("ERROR: Unable to import " + label)

    tool_dir = os.getcwd() + "/src/tools/clustalw/"
    test_fasta = "./src/tools/clustalw/test/test.fasta"
    # Try the Windows binary name first, then the plain one.
    for binary in ("clustalw.exe", "clustalw"):
        try:
            clustalw = ClustalwCommandline(str(tool_dir + binary),
                                           infile=test_fasta)
            clustalw()
            # Remove the output files that the test run is expected to
            # leave behind.
            os.remove("./src/tools/clustalw/test/test.aln")
            os.remove("./src/tools/clustalw/test/test.dnd")
            return
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still aborts
            # the check instead of being treated as a failed attempt.
            continue
    raise StandardError("ERROR: Unable to run ClustalW")
176 |         
177 | 
def checkSIFTS():
    """Record in the global SIFTS whether all SIFTS listings are readable.

    Tries to open each of the seven ".lst" files under "./SIFTS/" for
    reading. Sets SIFTS to True and prints "SIFTS... OK" when all of them
    open; sets it to False and prints "SIFTS... NOT OK" as soon as one
    cannot be opened.
    """
    global SIFTS
    listings = (
        "pdb_chain_scop_uniprot.lst",
        "pdb_chain_cath_uniprot.lst",
        "pdb_chain_enzyme.lst",
        "pdb_chain_interpro.lst",
        "pdb_chain_pfam.lst",
        "pdb_chain_taxonomy.lst",
        "pdb_pubmed.lst",
    )
    try:
        for listing in listings:
            # Opening is the whole test; the content is not inspected.
            with open("./SIFTS/" + listing, "r"):
                pass
    except (IOError, OSError):
        # Only file-access failures mean "not available"; anything else
        # (for example KeyboardInterrupt) is allowed to propagate.
        SIFTS = False
        print("SIFTS... NOT OK")
    else:
        SIFTS = True
        print("SIFTS... OK")
208 |         
def addtoPATH():
    """Append the bundled tool directories to ``sys.path``.

    The six relative directories are added in a fixed order. sys.path is
    Python's module search path; the PATH environment variable is left
    untouched.
    NOTE(review): if the intent was to make the tool executables
    discoverable, os.environ["PATH"] would be the place -- confirm.
    """
    sys.path.extend([
        "./src/tools/",
        "./src/tools/blast+/",
        "./src/tools/blast+/db",
        "./src/tools/clustalw/",
        "./src/tools/mafft/",
        "./src/tools/muscle/",
    ])
216 |     
def ParseArguments():
    """Parse sys.argv and publish the run configuration as module globals.

    Sets file1, file2, id1, id2, chain1, chain2, parameterfile, psiblast,
    alignment, coevolution and dirname.

    * Two positional inputs give file1/file2 (base names), id1/id2 (base
      name up to the first ".") and dirname (directory of the first input).
    * Two -x values give chain1/chain2.
    * Two -i values override id1/id2 and derive file1/file2 from them:
      "<id>.fasta" when no chains were given, "<id>.pdb" otherwise.
    * -p names the parameters file; without it "Params.config" in the
      current working directory is used. The file is validated with
      LP(parameterfile, "test").

    Prints the help and exits when neither inputs nor ids are given, and
    exits with a usage error when the ids cannot be determined (neither
    two inputs nor two -i values).
    """
    global file1, id1, chain1
    global file2, id2, chain2
    global parameterfile, psiblast, alignment, coevolution, dirname

    # defaults
    pathcwd = os.getcwd()
    dirname = pathcwd + "/Results/"
    parameterfile = ''
    file1 = ''
    file2 = ''
    chain1 = ''
    chain2 = ''

    parser = OptionParser(usage='Pycoevol.py input1 input2 [options]')
    parser.add_option('-b', '--psiblast', type='string',
                      dest='psiblast', default='internet',
                      help='internet, local or custom')
    parser.add_option('-a', '--alignment', type='string',
                      dest='alignment', default='clustalw',
                      help='clustalw, muscle, mafft or custom')
    # The help lists every measure accepted by checkArguments().
    parser.add_option('-c', '--coevolution', type='string',
                      dest='coevolution', default='mi',
                      help='mi, mie, rcwmi, cpvnmie, cpvn, clm, vol, omes, '
                           'pearson, spearman, mcbasc, quartets, sca or elsc')
    parser.add_option('-i', '--id', action='append', type='string',
                      dest='ids', default=[],
                      help='identifier for each protein, in the same order '
                           'as the input files (give it twice)')
    parser.add_option('-x', '--chain', action='append', type='string',
                      dest='chains', default=[],
                      help='chain identifier, in the same order as the '
                           'input files (give it twice)')
    parser.add_option('-p', '--parameters',
                      dest='parameterfile', default=parameterfile,
                      help='parameters file (default: Params.config in the '
                           'current working directory)')

    (options, args) = parser.parse_args()
    if len(args) == 0 and len(options.ids) == 0:
        parser.print_help()
        sys.exit()

    # id1/id2 are only assigned from two inputs or two -i values; stop here
    # with a usage error instead of failing later on an undefined name.
    if len(args) != 2 and len(options.ids) != 2:
        parser.error("expected two input files or two identifiers "
                     "(-i ID1 -i ID2)")

    if len(args) == 2:
        input1, input2 = args
        # A bare file name has an empty dirname; fall back to "." so the
        # result directory does not become the filesystem root "/".
        dirname = (os.path.dirname(input1) or ".") + "/"
        file1 = os.path.basename(input1)
        file2 = os.path.basename(input2)
        id1 = file1.split(".")[0]
        id2 = file2.split(".")[0]
    if len(options.chains) == 2:
        chain1, chain2 = options.chains
    if len(options.ids) == 2:
        id1, id2 = options.ids
        extension = ".fasta" if (chain1 == '' and chain2 == '') else ".pdb"
        file1 = id1 + extension
        file2 = id2 + extension

    if options.parameterfile != '':
        parameterfile = options.parameterfile.strip('"')
    else:
        parameterfile = (pathcwd + "/Params.config").strip('"')
    LP(parameterfile, "test")

    psiblast = options.psiblast
    alignment = options.alignment
    coevolution = options.coevolution
291 | 
def main():
    """Run the complete command-line workflow.

    Executes the setup steps in order (argument parsing and validation,
    tool paths, dependency check, SIFTS check), printing a confirmation
    after the ones that report one, then runs the analysis itself.
    """
    # (step, confirmation printed after the step, or None)
    sequence = (
        (ParseArguments, None),
        (checkArguments, 'Arguments... OK'),
        (addtoPATH, None),
        (checkDependencies, 'Dependencies... OK'),
        (checkSIFTS, None),
        (pycoevolRun, 'Analysis Complete !!'),
    )
    for step, confirmation in sequence:
        step()
        if confirmation is not None:
            print(confirmation)


if __name__ == "__main__":
    main()
306 |     
307 | 
308 | 


--------------------------------------------------------------------------------
/Pycoevol_paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/Pycoevol_paper.pdf


--------------------------------------------------------------------------------
/Pycoevol_userguide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/Pycoevol_userguide.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ﻿PYCOEVOL
  2 | ========
  3 | A Python workflow to study protein-protein coevolution and interaction
  4 |  
  5 | Pycoevol is an integrated system for studying inter-protein coevolution and interaction.
  6 | It automates the identification of contact points between protein partners, extending the 
  7 | general coevolution workflow consisting of: homologous sequence search; multiple sequence 
  8 | alignment computation; and coevolution analysis; with an improved selection of organisms 
  9 | and contact prediction. 
 10 | 
 11 | It generates user-friendly output: a matrix of scores, histograms,
 12 | heat maps, PyMOL scripts and interaction maps. Additional information for common web services
 13 | can be retrieved from SIFTS. 
 14 | 
 15 | ## Disclaimer 
 16 | 
 17 | This software is provided "as is", with no explicit or implied warranties. 
 18 | Use this software at your own risk.
 19 | 
 20 | ## Copyright
 21 | 
 22 | This software is in the public domain, and everyone has the right to copy, 
 23 | distribute, reuse, modify, improve and debug it.
 24 | 
 25 | If you want to cite this piece of software/workflow, use the following:
 26 | 
 27 | Fábio Madeira and Ludwig Krippahl. 2012. PYCOEVOL: A Python workflow to study 
 28 | protein-protein coevolution. Proceedings of the International conference on 
 29 | Bioinformatics Models, Methods and Algorithms - BIOINFORMATICS 2012, pp.143-9. 
 30 | 
 31 | This work was partially supported by Portuguese National funds through Fundação 
 32 | para a Ciência e Tecnologia (FCT) under project CREMA PTDC/EIA-CCO/115999/2009.
 33 | 
 34 | ## Dependencies
 35 | 
 36 | [Python 2.7.2](http://python.org/),
 37 | [Biopython 1.58](http://biopython.org/),
 38 | [Numpy 1.6.1](http://numpy.scipy.org/),
 39 | [Matplotlib 1.1.0](http://matplotlib.sourceforge.net/) and
 40 | [ClustalW](http://www.clustal.org/)
 41 | 
 42 | **Optional:**
 43 | [NCBI Blast+](http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download),
 44 | [NCBI's "refseq_protein" database](ftp://ftp.ncbi.nlm.nih.gov/blast/db/),
 45 | [MUSCLE](http://www.drive5.com/muscle/),
 46 | [MAFFT](http://mafft.cbrc.jp/alignment/software/) and
 47 | [SIFTS lst files](http://www.ebi.ac.uk/pdbe/docs/sifts/quick.html)
 48 | 
 49 | 
 50 | ## Usage
 51 |  
 52 | _python Pycoevol.py input1 input2 [options]_
 53 | 
 54 | 
 55 |  
 56 | 	-h, --help		show this help message and exit
 57 |      
 58 | 	-b PSIBLAST, --psiblast=PSIBLAST
 59 |  
 60 | 					internet, local or custom
 61 |      
 62 | 	-a ALIGNMENT, --alignment=ALIGNMENT
 63 |  
 64 | 					clustalw, muscle, mafft or custom
 65 |      
 66 | 	-c COEVOLUTION, --coevolution=COEVOLUTION
 67 |  
 68 | 					mi, mie, rcwmi, cpvn, clm, vol, omes, pearson, spearman, mcbasc, quartets, sca or elsc
 69 |      
 70 | 	-i IDS, --id=IDS
 71 |  
 72 | 	-x CHAINS, --chain=CHAINS
 73 |  
 74 | 	-p PARAMETERFILE, --parameters=PARAMETERFILE
 75 | 
 76 | For a detailed overview of how to install and use Pycoevol, please refer to the User Guide.
 77 | 
 78 | 
 79 | **Coevolution measures:**
 80 | 
 81 | * Mutual Information (mi) [Gloor et al, 2005]
 82 | * MI by pair Entropy (mie) [Martin et al, 2005]
 83 | * Row and Column Weighted MI (rcwmi) [Gouveia-Oliveira et al, 2007]
 84 | * Contact Preferences, Volume Normalized (cpvn) [Glaser et al, 2001]
 85 | * Contact PDB-derived Likelihood Matrix (clm) [Singer et al, 2002]
 86 | * Residue-residue Volume Normalized (vol) [based on Esque et al, 2010]
 87 | * Observed Minus Expected Squared  (omes) [Kass and Horovitz, 2002]
 88 | * Pearson’s correlation (pearson) [Göbel et al, 1994]
 89 | * Spearman’s rank correlation (spearman) [Pazos et al, 1997]
 90 | * McLachlan Based Substitution Correlation (mcbasc) [Fodor and Aldrich, 2004]
 91 | * Quartets (quartets) [Galitsky, 2002]
 92 | * Statistical Coupling Analysis (sca) [Lockless and Ranganathan, 1999]
 93 | * Explicit Likelihood of Subset Covariation (elsc) [Dekker et al, 2004]
 94 | 
 95 | **Pairwise distance measures:**
 96 | 
 97 | * ClustalW distance [Chenna et al, 2003]
 98 | * p-distance [Jukes and Cantor, 1969]
 99 | * Jukes-Cantor [Jukes and Cantor, 1969]
100 | * Kimura distance [Kimura, 1983]
101 | * Pairwise score using Dayhoff or Henikoff matrices [Dayhoff et al, 1978; 
102 | Henikoff and Henikoff, 1992]
103 | 
104 | 
105 | *Fábio Madeira and Ludwig Krippahl, 2012*
106 | 
107 | This work was partially supported by Portuguese National
108 | funds through Fundação para a Ciência e Tecnologia (FCT)
109 | under project CREMA PTDC/EIA-CCO/115999/2009.
110 | 


--------------------------------------------------------------------------------
/Results/output_results:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/Results/output_results


--------------------------------------------------------------------------------
/SIFTS/Database version:
--------------------------------------------------------------------------------
1 | # Last update on 16.12.2011


--------------------------------------------------------------------------------
/refseq_protein.pal:
--------------------------------------------------------------------------------
1 | #
2 | # Alias file created: Jun 26, 2011  8:38 PM
3 | #
4 | # Edit this file to reflect the location of your database
5 | # Get the database at ftp://ftp.ncbi.nih.gov/blast/db/
6 | #
7 | TITLE  NCBI Protein Reference Sequences
8 | DBLIST ./Pycoevol/src/tools/Blast+/db/refseq_protein.00 ./Pycoevol/src/tools/Blast+/db/refseq_protein.01 ./Pycoevol/src/tools/Blast+/db/refseq_protein.02 ./Pycoevol/src/tools/Blast+/db/refseq_protein.03
9 | 


--------------------------------------------------------------------------------
/src/ALIGN.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | import os
  9 | from Parameters import LoadParameters as LP
 10 | from src.UTILS import charge, charge_his, polarity, hydropathy
 11 | from os import remove, system
 12 | from shutil import copyfile
 13 | from itertools import combinations
 14 | from Bio import AlignIO, SeqIO
 15 | from Bio.Alphabet import IUPAC
 16 | from Bio.Align.Applications import ClustalwCommandline
 17 | from Bio.Align.Applications import MuscleCommandline
 18 | 
 19 | class alignment:
 20 |     """
 21 |     Main code for multiple sequence alignment and scoring.
 22 |     
 23 |     Methods for computing MSAs:
 24 |     Clustalw - Chenna et al, 2003
 25 |     Muscle - Edgar, 2004
 26 |     Mafft - Katoh et al, 2002
 27 |     
 28 |     Methods for scoring MSAs:
 29 |     Sum-of-Pairs - Murata et al, 1985
 30 |     TODO: Circular Sum - Gonnet et al, 2000
 31 |     """
 32 |     def __init__(self, id1, id2, alignment, parameterfile, dirname):
 33 |         self.id1 = id1
 34 |         self.id2 = id2
 35 |         self.alignment = alignment
 36 |         self.parameterfile = parameterfile
 37 |         self.dirname = dirname
 38 |         
 39 |     def __call__(self, id1, id2, alignment, parameterfile, dirname):
 40 |         self.id1 = id1
 41 |         self.id2 = id2
 42 |         self.alignment = alignment
 43 |         self.parameterfile = parameterfile
 44 |         self.dirname = dirname
 45 |         
 46 | 
 47 |     def computeAlignment(self, id, alignment):
 48 |         "Computes multiple sequence alignment with inputed method"
 49 |         
 50 |         if alignment == "clustalw":
 51 |             gop = LP(self.parameterfile, "clustalw_gap_opening")
 52 |             gep = LP(self.parameterfile, "clustalw_gap_extension")
 53 |             d_matrix = LP(self.parameterfile, "clustalw_distance_matrix")
 54 |             
 55 |             input_sequences = self.dirname + id + ".fasta"
 56 |             output_align = self.dirname + id + ".aln"
 57 |             output_fasta = self.dirname + id + "_clustalw.fasta"
 58 |             output_tree = self.dirname + id + ".dnd"
 59 |             try:
 60 |                 cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe")
 61 |                 clustalw = ClustalwCommandline(cmd, infile=input_sequences,
 62 |                                                outfile=output_align,
 63 |                                                newtree=output_tree,
 64 |                                                align="input",
 65 |                                                seqnos="ON",
 66 |                                                outorder="input",
 67 |                                                type="PROTEIN",
 68 |                                                pwmatrix=d_matrix,
 69 |                                                gapopen=gop,
 70 |                                                gapext=gep) 
 71 |                 clustalw()
 72 |             except:
 73 |                 cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw")
 74 |                 clustalw = ClustalwCommandline(cmd, infile=input_sequences,
 75 |                                                outfile=output_align,
 76 |                                                newtree=output_tree,
 77 |                                                align="input",
 78 |                                                seqnos="ON",
 79 |                                                outorder="input",
 80 |                                                type="PROTEIN",
 81 |                                                pwmatrix=d_matrix,
 82 |                                                gapopen=gop,
 83 |                                                gapext=gep) 
 84 |                 clustalw()
 85 |             AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
 86 |             try:
 87 |                 remove(output_align)
 88 |                 remove(output_tree)
 89 |             except:
 90 |                 pass
 91 |             
 92 |         elif alignment == "muscle":
 93 |             iteration = LP(self.parameterfile, "muscle_max_iteration")
 94 |             
 95 |             input_sequences = self.dirname + id + ".fasta"
 96 |             output_align = self.dirname + id + "_muscle.aln"
 97 |             output_fasta = self.dirname + id + "_muscle.fasta"
 98 |             
 99 |             muscle = MuscleCommandline(input=input_sequences,
100 |                                        out=output_align,
101 |                                        clwstrict=True,
102 |                                        maxiters=iteration)
103 |             muscle()
104 |             AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
105 |             try:
106 |                 remove(output_align)
107 |             except:
108 |                 pass
109 |             
110 |             organism_order = []
111 |             input_sequences = self.dirname + id + ".fasta"
112 |             align = SeqIO.parse(input_sequences, "fasta", IUPAC.protein)
113 |             for record in align:
114 |                 org = record.description
115 |                 organism_order.append(org)
116 |                 
117 |             rec = dict()
118 |             output_fasta = self.dirname + id + "_muscle.fasta"
119 |             align = SeqIO.parse(output_fasta, "fasta", IUPAC.protein)
120 |             for record in align:
121 |                 org = str(record.description)
122 |                 seq = str(record.seq)
123 |                 rec[org] = seq
124 |             
125 |             fasta = open(output_fasta, "w")
126 |             fasta.close()
127 |             fasta = open(output_fasta, "a")
128 |             for org in (organism_order):
129 |                 seq = rec[org]
130 |                 fasta.write(">" + org + "\n" + seq + "\n")
131 |             fasta.close()
132 |             
133 |         else:
134 |             configuration = LP(self.parameterfile, "mafft_configuration")
135 |             threads = LP(self.parameterfile, "mafft_threading")
136 |             input_sequences = self.dirname + id + ".fasta"
137 |             output_fasta = self.dirname + id + "_mafft.fasta"
138 |             
139 |             if configuration == "fftnsi":
140 |                 if threads == False:
141 |                     fftnsi = "mafft --retree 2 --maxiterate 1000 --inputorder "
142 |                     mafft = system(fftnsi + input_sequences + ">" + output_fasta)
143 |                     mafft
144 |                 else:
145 |                     try:
146 |                         threads = int(threads)
147 |                         fftnsi = "mafft --retree 2 --maxiterate 1000\
148 |                          --inputorder --threads %i " % (threads)
149 |                         mafft = system(fftnsi + input_sequences + ">" + output_fasta)
150 |                         mafft
151 |                     except:
152 |                         fftnsi = "mafft --retree 2 --maxiterate 1000 --inputorder "
153 |                         mafft = system(fftnsi + input_sequences + ">" + output_fasta)
154 |                         mafft
155 |             else:
156 |                 if threads == False:
157 |                     linsi = "mafft --localpair --maxiterate 1000 --inputorder "
158 |                     mafft = system(linsi + input_sequences + ">" + output_fasta)
159 |                     mafft
160 |                 else:
161 |                     try:
162 |                         threads = int(threads)
163 |                         linsi = "mafft --localpair --maxiterate 1000\
164 |                          --inputorder --threads %i " % (threads)
165 |                         mafft = system(linsi + input_sequences + ">" + output_fasta)
166 |                         mafft
167 |                     except:
168 |                         linsi = "mafft --localpair --maxiterate 1000 --inputorder "
169 |                         mafft = system(linsi + input_sequences + ">" + output_fasta)
170 |                         mafft    
171 |         
172 |     def cutAlignment(self, file, id, alignment):
173 |         "Selects MSA columns of interest (Query_id != '-')"
174 |                
175 |         description = []
176 |         align = []
177 |         columns = []
178 |         positions = []
179 |         blocks = []
180 |         new_align = []
181 |         new_align_ord = []
182 |         new_align_concate = []
183 |         self.cut_alignment = []
184 |         aa_red = LP(self.parameterfile, "alphabet_reduction")
185 |         
186 |         if alignment != "custom":
187 |             input = self.dirname + id + "_" + alignment + ".fasta"
188 |             alignment = AlignIO.read(input, "fasta")
189 |             for record in alignment:
190 |                 key = record.id
191 |                 description.append(key)
192 |             
193 |             k = int(-1)
194 |             for s in description:
195 |                 k += 1
196 |                 key = s.find("Query_id")
197 |                 if key != -1:
198 |                     break
199 |             
200 |             align_length = alignment.get_alignment_length()
201 |             for position in range(0, align_length):
202 |                 column = alignment[:, position]
203 |                 align.append(column)
204 |                 if column[k] != "-":
205 |                     columns.append(column)
206 |                     positions.append(position)
207 |                     
208 |             for i in range(0, len(positions), 1):
209 |                 beg = int(positions[i])
210 |                 end = int(positions[i] + 1)
211 |                 block = alignment[:, beg:end]         
212 |                 blocks.append(block)
213 |             
214 |             for block in blocks:
215 |                 for record in block:
216 |                     seq = str(record.seq)
217 |                     new_align.append(seq)
218 |             
219 |             numb_blocks = len(new_align) / len(columns[0])
220 |             for i in range(0, len(columns[0])):
221 |                 for j in range(0, len(new_align), len(columns[0])):
222 |                     new_align_ord.append(new_align[i + j])
223 |         
224 |             for i in range(0, len(new_align_ord), numb_blocks):
225 |                 pseudolist = new_align_ord[i:i + numb_blocks]
226 |                 list = ""
227 |                 for j in pseudolist:
228 |                     list += j
229 |                 new_align_concate.append(list)
230 |     
231 |             for seq in new_align_concate:
232 |                 if aa_red != False:
233 |                     red = [AR(e, aa_red) for e in seq]
234 |                     new_seq = ""
235 |                     for i in red:
236 |                         new_seq += str(i)
237 |                     self.cut_alignment.append(new_seq)
238 |                 else:
239 |                     self.cut_alignment.append(seq) 
240 |                 
241 |             return self.cut_alignment
242 |         
243 |         else:
244 |             output = self.dirname + id + "_" + alignment + ".fasta"
245 |             copyfile(self.dirname + file, output)
246 |             input = self.dirname + id + "_" + alignment + ".fasta"
247 |            
248 |             alignment = AlignIO.read(input, "fasta")
249 |             for record in alignment:
250 |                 key = record.id
251 |                 description.append(key)
252 |             
253 |             align_length = alignment.get_alignment_length()
254 |             for position in range(0, align_length):
255 |                 column = alignment[:, position]
256 |                 align.append(column)
257 |                 if column[0] != "-":
258 |                     columns.append(column)
259 |                     positions.append(position)
260 |                     
261 |             for i in range(0, len(positions), 1):
262 |                 beg = int(positions[i])
263 |                 end = int(positions[i] + 1)
264 |                 block = alignment[:, beg:end]         
265 |                 blocks.append(block)
266 |             
267 |             for block in blocks:
268 |                 for record in block:
269 |                     seq = str(record.seq)
270 |                     new_align.append(seq)
271 |             
272 |             numb_blocks = len(new_align) / len(columns[0])
273 |             for i in range(0, len(columns[0])):
274 |                 for j in range(0, len(new_align), len(columns[0])):
275 |                     new_align_ord.append(new_align[i + j])
276 |         
277 |             for i in range(0, len(new_align_ord), numb_blocks):
278 |                 pseudolist = new_align_ord[i:i + numb_blocks]
279 |                 list = ""
280 |                 for j in pseudolist:
281 |                     list += j
282 |                 new_align_concate.append(list)
283 |     
284 |             for seq in new_align_concate:
285 |                 if aa_red != False:
286 |                     red = [AR(e, aa_red) for e in seq]
287 |                     new_seq = ""
288 |                     for i in red:
289 |                         new_seq += str(i)
290 |                     self.cut_alignment.append(new_seq)
291 |                 else:
292 |                     self.cut_alignment.append(seq)      
293 |                
294 |             return self.cut_alignment
295 |             
296 |      
297 |     def alignScore(self, id, alignment):
298 |         """
299 |         Computes a score for the MSA inputed.
300 |         
301 |         Methods implemented:
302 |         Sum-of-pairs (SP) score -  Murata et al, 1985
303 |         as explained in Gonnet et al, 2000. 
304 |         SP is the sum of all possible combinations of
305 |         pairwise scores. 
306 |         
307 |         !!Disclaimer: alignmentScore is terribly slow!!
308 |         
309 |         (To Do - Circular Sum by Gonnet et al, 2000)
310 |         """
311 |         score = LP(self.parameterfile, "alignment_score")
312 |         
313 |         if score == "sumofpairs":
314 |             input = self.dirname + id + "_" + alignment + ".fasta"
315 |             sequences = []
316 |             input_sequences = SeqIO.parse(input, "fasta", IUPAC.protein)
317 |             for record in input_sequences:
318 |                 seq = str(record.seq)
319 |                 sequences.append(seq) 
320 |             
321 |             SumOfPairs = 0
322 |             for pair in combinations(sequences, 2): 
323 |                 SumOfPairs += pairwiseScore(pair[0], pair[1])
324 |             
325 |             print SumOfPairs
326 |         else: pass
327 | 
def pairwiseScore(seq1, seq2):
    """
    Scores two aligned sequences position by position (up to the length of
    the shorter one) and returns the accumulated score.

    s(x,y) = { matchScore(x,y) if x!='-' and y!='-';
               0 if x=='-' and y=='-', because the deletion was caused earlier;
               gap penalty, depending on the gap length  gap + length * increment}

    gap - depends on the scoring matrix (PAM, BLOSUM, etc)
    length - length of the gap
    increment - incremental penalty that depends on the scoring matrix

    BLOSUM62, gap = -4, increment = 1 -> increment = length

    As implemented, a gap is charged once, at the position where the
    look-ahead shows that the pattern changes: gap + n * n, n being the
    number of positions already counted for that gap. A gap at the very last
    compared position is charged the plain gap value and the counter is
    ignored.
    NOTE(review): the n * n term is added to a negative penalty, so long
    gaps are penalised less than short ones - confirm this is intended.
    """
    gap = -4.0
    incr_top = 0
    incr_bottom = 0
    pairwise_score = 0
    compared = min(len(seq1), len(seq2))

    for i in range(compared):
        aa1 = seq1[i]
        aa2 = seq2[i]
        top_gap = aa1 == "-"
        bottom_gap = aa2 == "-"

        if top_gap and bottom_gap:
            # Gap against gap contributes nothing.
            continue
        if not top_gap and not bottom_gap:
            pairwise_score += float(matchScore(aa1, aa2, "BLOSUM62"))
            continue

        # Exactly one side is a gap. An explicit bounds check replaces the
        # former bare "except:" around the look-ahead.
        if i + 1 >= compared:
            pairwise_score += gap
            continue

        next_top_gap = seq1[i + 1] == "-"
        next_bottom_gap = seq2[i + 1] == "-"
        if top_gap:
            if next_top_gap and not next_bottom_gap:
                incr_top += 1
            else:
                pairwise_score += gap + incr_top * incr_top
                incr_top = 0
        else:
            if next_bottom_gap and not next_top_gap:
                incr_bottom += 1
            else:
                pairwise_score += gap + incr_bottom * incr_bottom
                incr_bottom = 0

    return pairwise_score
379 |          
def matchScore(alpha, beta, matrix):
    """Looks up the score of the residue pair (alpha, beta).

    alpha, beta - one-letter residue codes; "-" is accepted as well
    matrix      - matrix name handed over to mapMatrix()

    Raises KeyError for a residue code that is not listed below.
    """
    # Index of every residue code; presumably the row/column order of the
    # matrix files. "-" shares index 22 with "X".
    alphabet = dict((residue, index) for index, residue
                    in enumerate("ARNDCQEGHILKMFPSTWYVBZX"))
    alphabet["-"] = 22

    row = alphabet[alpha]
    column = alphabet[beta]
    scores = mapMatrix(matrix)
    return scores[row][column]
412 |     
def mapMatrix(matrix):
    """Maps a matrix of floats

    matrix - file name (case-insensitive) inside ./Matrix/, resolved
             against the current working directory

    Returns a list of rows, each row being the list of floats found on one
    line of the file.

    The parsed matrix is cached per absolute path: matchScore() calls this
    function once per residue pair, and re-reading the file every time made
    the sum-of-pairs score extremely slow. The cached list is shared between
    callers and must be treated as read-only.
    """
    matrix = matrix.upper()
    path = os.path.abspath('./Matrix/' + matrix)

    cache = mapMatrix.__dict__.setdefault("_cache", {})
    if path not in cache:
        # "with" closes the file even when a value fails to parse.
        with open(path, 'r') as input_matrix:
            # Real lists (not map objects) so rows stay indexable everywhere.
            cache[path] = [[float(value) for value in line.split()]
                           for line in input_matrix]

    return cache[path]
425 |     
def AR(aminoacid, method):
    """Performs alphabet reduction.
    Alphabets: charge, charge_his, polarity, hydropathy

    Returns the entry of the chosen alphabet table for the given residue;
    with any other method the residue is handed back unchanged.
    """
    # Pick the lookup table; unknown methods leave the residue as it is.
    if method == "hydropathy":
        table = hydropathy
    elif method == "polarity":
        table = polarity
    elif method == "charge_his":
        table = charge_his
    elif method == "charge":
        table = charge
    else:
        return aminoacid

    return table[aminoacid]
441 |     
442 |         
443 | 


--------------------------------------------------------------------------------
/src/BLAST.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | from Parameters import LoadParameters as LP
  9 | from os import remove
 10 | from shutil import move
 11 | from Bio import SeqIO, Entrez
 12 | from Bio.Alphabet import IUPAC
 13 | from Bio.Blast import NCBIXML, NCBIWWW
 14 | from Bio.Blast.Applications import NcbipsiblastCommandline
 15 | Entrez.email = "entrez@mail.com"
 16 | 
 17 | class psiblast:
 18 |     """
 19 |     Main code for psiblast search over internet or at local database.
 20 |     
 21 |     Method for searching homologous sequences:
 22 |     PSI-Blast - Altschul et al, 1997
 23 |     """
 24 |     def __init__(self, id1, id2, psiblast, parameterfile, dirname):
 25 |         self.id1 = id1
 26 |         self.id2 = id2
 27 |         self.psiblast = psiblast
 28 |         self.parameterfile = parameterfile
 29 |         self.dirname = dirname
 30 |         
 31 |     def __call__(self, id1, id2, psiblast, parameterfile, dirname):
 32 |         self.id1 = id1
 33 |         self.id2 = id2
 34 |         self.psiblast = psiblast
 35 |         self.parameterfile = parameterfile
 36 |         self.dirname = dirname
 37 | 
 38 |     def searchPSIBLAST(self, id, psiblast):
 39 |         "Psi-Blast over a local database or over the internet"
 40 |         
 41 |         if psiblast == "local":
 42 |             threads = LP(self.parameterfile, "psiblast_threading")
 43 |             evalue = LP(self.parameterfile, "psiblast_evalue")
 44 |             reference_protein = "refseq_protein"
 45 |         
 46 |             in_sequence = self.dirname + id + ".fa"
 47 |             
 48 |             output = self.dirname + id + ".xml"
 49 |             if threads == False:
 50 |                 psiblast = NcbipsiblastCommandline(query=in_sequence,
 51 | 										 db=reference_protein,
 52 | 										 outfmt=5,
 53 | 										 threshold=evalue,
 54 | 										 out=output) 
 55 |                 psiblast()
 56 |             else:
 57 |                 try:
 58 |                     threads = int(threads)
 59 |                     psiblast = NcbipsiblastCommandline(query=in_sequence,
 60 |                                          db=reference_protein,
 61 |                                          outfmt=5,
 62 |                                          threshold=evalue,
 63 |                                          out=output,
 64 |                                          num_threads=threads) 
 65 |                     psiblast()
 66 |                 except: 
 67 |                     psiblast = NcbipsiblastCommandline(query=in_sequence,
 68 |                                          db=reference_protein,
 69 |                                          outfmt=5,
 70 |                                          threshold=evalue,
 71 |                                          out=output) 
 72 |                     psiblast()
 73 |             
 74 |             try:
 75 |                 open(self.dirname + id + ".fasta")
 76 |                 open.close()
 77 |                 remove(self.dirname + id + ".fa")
 78 |             except: 
 79 |                 move(self.dirname + id + ".fa", self.dirname + id + ".fasta")
 80 |         else:
 81 |             evalue = LP(self.parameterfile, "psiblast_evalue")
 82 |             reference_protein = "refseq_protein"
 83 |             
 84 |             in_sequence = self.dirname + id + ".fa"
 85 |                 
 86 |             for seq_record in SeqIO.parse(in_sequence,
 87 |                                           "fasta", IUPAC.protein):
 88 |                 sequence = seq_record.seq
 89 |         
 90 |                 psiblast = NCBIWWW.qblast("blastp",
 91 | 								    reference_protein,
 92 | 								    sequence,
 93 | 								    service="psi",
 94 | 								    expect=evalue,
 95 | 								    hitlist_size=500)
 96 |                 psiblast
 97 |                 
 98 |             try:
 99 |                 open(self.dirname + id + ".fasta")
100 |                 open.close()
101 |                 remove(self.dirname + id + ".fa")
102 |             except: 
103 |                 move(self.dirname + id + ".fa", self.dirname + id + ".fasta")
104 | 
105 |             output = self.dirname + id + ".xml"
106 |             saveblast = open(output, "w")
107 |             saveblast.write(psiblast.read())
108 |             saveblast.close()
109 |             psiblast.close()
110 | 
111 |     def validXML(self, id):
112 |         "Checks if the input file is a valid XML"
113 |         
114 |         try:
115 |             input = self.dirname + id + ".xml"
116 |             input_xml = open(input, "r")
117 |             xml = input_xml.readline()
118 |             input_xml.close()
119 |             if xml[0:5] == "<?xml":
120 |                 pass
121 |             else:                             
122 |                 raise StandardError, "%s - Invalid xml" % (input)
123 |         except:
124 |             raise StandardError, "%s - Invalid xml or not found" % (input)
125 | 
126 |     def sequencesXML(self, id, psiblast):
127 |         "Extracts records from xml and writes FASTA (full-length) sequences"
128 |         
129 |         thresh_identity = LP(self.parameterfile, "psiblast_identity")
130 |         thresh_coverage = LP(self.parameterfile, "psiblast_coverage")
131 |         
132 |         input = self.dirname + id + ".xml"
133 |         input_xml = open(input, "r")
134 |         
135 |         hits = []
136 |         for record in NCBIXML.parse(input_xml):            
137 |             for align in record.alignments:
138 |                 hit_id = align.hit_id
139 |                 for hsp in align.hsps:    
140 |                     positives = int(hsp.positives)
141 |                     identities = int(hsp.identities)
142 |                     q_start = int(hsp.query_start)
143 |                     q_end = int(hsp.query_end)
144 |                     query = (q_end - q_start) * 1.0
145 |                     sbjct1 = positives * 1.0
146 |                     coverage = sbjct1 / query * 100
147 |                     sbjct2 = identities * 1.0
148 |                     identity = sbjct2 / query * 100
149 |                     if coverage > thresh_coverage and identity > thresh_identity:
150 |                         hits.append(hit_id)
151 |         input_xml.close()
152 |         
153 |         if hits == []:
154 |             raise StandardError, "%s - No Hits found in PSI-BLAST search" % (input) 
155 |                  
156 |         for hit_id in hits:
157 |             gi = hit_id[hit_id.find("id|") + 4:hit_id.find("|ref")]
158 |             try:
159 |                 efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta")
160 |             except:
161 |                 try:
162 |                     efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta")
163 |                 except:
164 |                     efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta")
165 |                 efetch = Entrez.efetch(db="protein", id=gi, rettype="fasta")
166 |             for values in efetch:
167 |                 description = values
168 |                 break
169 |             sequence = ""
170 |             for values in efetch:
171 |                 sequence += values.rstrip("\n")
172 |             try: 
173 |                 organism = description[description.find("[") + 1:description.find("]")]
174 |                 organism = organism.split()
175 |                 if len(organism) != 1:
176 |                     species = str(organism[0] + "_" + organism[1])
177 |                 else:
178 |                     species = str(organism[0] + "_" + "sp.")
179 |                 output = self.dirname + id + ".blast"
180 |                 blast = open(output, "a")
181 |                 blast.write("\n" + ">" + species + "\n" + sequence + "\n")
182 |                 blast.close()
183 |             except: 
184 |                 raise StandardError, "%s - No Hits found in PSI-BLAST search" % (input)    
185 |             
186 |  
187 |         
188 | 
189 | 


--------------------------------------------------------------------------------
/src/COEVOL.py:
--------------------------------------------------------------------------------
   1 | ﻿###############################################################################
   2 | # Encoding utf-8                                                              #
   3 | # F. Madeira and L. Krippahl, 2012                                            #
   4 | # This code is part of Pycoevol distribution.                                 #
   5 | # This work is public domain.                                                 #
   6 | ###############################################################################
   7 | 
   8 | from src.SEQ import sequence as class_sequence
   9 | from src.ALIGN import alignment as class_alignment
  10 | from Parameters import LoadParameters as LP
  11 | from src.UTILS import aa, Flash
  12 | from math import log, e, factorial
  13 | from numpy import mean, std, zeros, sqrt
  14 | from matplotlib import pyplot
  15 | #from shutil import copyfile
  16 | 
  17 | class coevolution:
  18 |     """
  19 |     Main code for coevolution analysis.
  20 |     Note: All the coevolution measures are normalized [0:1]
  21 |     Matrix-based Methods:
  22 |     * Residue Contact Preferences, Volume Normalized - Glaser et al, 2001.
  23 |     * Contact PDB-derived Likelihood Matrix  - Singer et al, 2002.
  24 |     * Residue-residue volume normalized  - based on Esque et al, 2010.
  25 |     
  26 |     Mutual Information based methods:
  27 |     * Mutual Information - Gloor et al, 2005.
  28 |     * MI by pair entropy - Martin et al, 2005.
  29 |     * Row and column weighed MI - Gouveia-Oliveira et al, 2007.
  30 |     * Contact preferences, volume normalized MIE - F. Madeira, 2012.
  31 |     (unpublished)
  32 |     
  33 |     Correlation-based methods:
  34 |     * OMES (Observed Minus Expected Squared) - Kass and Horovitz, 2002.
  35 |     * Pearson's correlation - Gobel et al, 1994. (slow)
  36 |     * Spearman's rank correlation - Pazos et al, 1997. (slow)
  37 |     * McBASC (McLachlan Based Substitution Correlation) - Fodor and 
  38 |     Aldrich, 2004. (slow)
  39 |     * Quartets - Galitsky, 2002. 
  40 |     
  41 |     Perturbation-based methods:
  42 |     * SCA (Statistical Coupling analysis) - Lockless and Ranganathan, 1999.
  43 |     As on Halperin et al, 2006.
  44 |     * ELSC (Explicit Likelihood of Subset Covariation) - Dekker et al, 2004.
  45 |     """
  46 |     
  47 |     def __init__(self, file1, file2, id1, id2, chain1, chain2,
  48 |                  alignment, coevolution, parameterfile, dirname):
  49 |         self.file1 = file1
  50 |         self.file2 = file2
  51 |         self.id1 = id1
  52 |         self.id2 = id2
  53 |         self.chain1 = chain1
  54 |         self.chain2 = chain2
  55 |         self.alignment = alignment
  56 |         self.coevolution = coevolution
  57 |         self.parameterfile = parameterfile
  58 |         self.dirname = dirname
  59 |         
  60 |     def __call__(self, file1, file2, id1, id2, chain1, chain2,
  61 |                  alignment, coevolution, parameterfile, dirname):
  62 |         self.file1 = file1
  63 |         self.file2 = file2
  64 |         self.id1 = id1
  65 |         self.id2 = id2
  66 |         self.chain1 = chain1
  67 |         self.chain2 = chain2
  68 |         self.alignment = alignment
  69 |         self.coevolution = coevolution
  70 |         self.parameterfile = parameterfile
  71 |         self.dirname = dirname
  72 | 
  73 |     def coevolAnalysis(self, file1, file2, id1, id2,
  74 |                        chain1, chain2, alignment, coevolution):
  75 |         "Returns a matrix of coevolution scores"
  76 |         
  77 |         seq = class_sequence(self.file1, self.file2, self.id1, self.id2,
  78 |                        self.chain1, self.chain2, self.parameterfile,
  79 |                        self.dirname)
  80 |         aln = class_alignment(self.id1, self.id2, self.alignment,
  81 |                               self.parameterfile, self.dirname)
  82 |         
  83 |         alignment1 = aln.cutAlignment(file1, id1, alignment)
  84 |         alignment2 = aln.cutAlignment(file2, id2, alignment)
  85 | 
  86 |         try:
  87 |             assert len(alignment1) == len(alignment2)
  88 |         except:
  89 |             raise StandardError, "Alignments must have the same number of sequences"
  90 |             
  91 |         protein1 = []
  92 |         protein2 = []
  93 |         try:
  94 |             protein1 = seq.matchResiduePosition(id1, chain1)
  95 |             protein2 = seq.matchResiduePosition(id2, chain2)
  96 |         except:
  97 |             pass
  98 | 
  99 |         info = dict()
 100 |         alignment1 = [e for e in alignment1]
 101 |         columns1 = transpose(alignment1)
 102 | 
 103 |         alignment2 = [e for e in alignment2]
 104 |         columns2 = transpose(alignment2)
 105 |             
 106 |         if coevolution == "mi":
 107 |             Flash('Mutual Information')
 108 |             mi = dict()
 109 |             pD1 = probabilityDict(columns1)
 110 |             pD2 = probabilityDict(columns2)
 111 |          
 112 |             for i in range(len(columns1)):
 113 |                 Flash('Column ' + str(i))
 114 |                 for j in range(len(columns2)):
 115 |                     mi[(i, j)] = mutualInformation(i, j, columns1, columns2, pD1, pD2)
 116 |             
 117 |             max_pos = []
 118 |             for i in range(len(columns1)):
 119 |                 for j in range(len(columns2)):
 120 |                     max_pos.append(mi[(i, j)])
 121 |             max_val = max(max_pos)
 122 |                     
 123 |             for i in range(len(columns1)):
 124 |                 for j in range(len(columns2)):
 125 |                     if mi[(i, j)] != 0.0:
 126 |                         info[(i, j)] = mi[(i, j)] * 1.0 / max_val
 127 |                     else:
 128 |                         info[(i, j)] = 0.0
 129 |         
 130 |         elif coevolution == "mie":
 131 |             Flash('Mutual Information by Pair Entropy')
 132 |             mie = dict()
 133 |             pD1 = probabilityDict(columns1)
 134 |             pD2 = probabilityDict(columns2)
 135 |          
 136 |             for i in range(len(columns1)):
 137 |                 Flash('Column ' + str(i))
 138 |                 for j in range(len(columns2)):
 139 |                     mie[(i, j)] = miEntropy(i, j, columns1, columns2, pD1, pD2)
 140 |             
 141 |             max_pos = []
 142 |             for i in range(len(columns1)):
 143 |                 for j in range(len(columns2)):
 144 |                     max_pos.append(mie[(i, j)])
 145 |             max_val = max(max_pos)
 146 |                     
 147 |             for i in range(len(columns1)):
 148 |                 for j in range(len(columns2)):
 149 |                     if mie[(i, j)] != 0.0:
 150 |                         info[(i, j)] = mie[(i, j)] * 1.0 / max_val
 151 |                     else:
 152 |                         info[(i, j)] = 0.0
 153 |                     
 154 |         elif coevolution == "rcwmi":
 155 |             Flash('Row and Column Weighed Mutual Information')
 156 |             rcwmi = dict()
 157 |             pD1 = probabilityDict(columns1)
 158 |             pD2 = probabilityDict(columns2)
 159 |          
 160 |             i_all = dict()
 161 |             all_j = dict()
 162 |             for i in range(len(columns1)):
 163 |                 v_i = 0
 164 |                 for j in range(len(columns2)):
 165 |                     v_i += mutualInformation(i, j, columns1, columns2,
 166 |                                              pD1, pD2)
 167 |                     i_all[i] = v_i
 168 | 
 169 |             for j in range(len(columns2)):
 170 |                 v_j = 0
 171 |                 for i in range(len(columns1)):
 172 |                     v_j += mutualInformation(i, j, columns1, columns2,
 173 |                                              pD1, pD2)
 174 |                     all_j[j] = v_j
 175 |             
 176 |             column = columns1[0]
 177 |             n = len(column)
 178 |             for i in range(len(columns1)):
 179 |                 Flash('Column ' + str(i))
 180 |                 for j in range(len(columns2)):
 181 |                     mi = mutualInformation(i, j, columns1, columns2,
 182 |                                            pD1, pD2)    
 183 |                     rcwmi[(i, j)] = rowColumnWeighed(mi,
 184 |                                                    i_all[i], all_j[j], n)
 185 |             max_pos = []
 186 |             for i in range(len(columns1)):
 187 |                 for j in range(len(columns2)):
 188 |                     max_pos.append(rcwmi[(i, j)])
 189 |             max_val = max(max_pos)
 190 |                     
 191 |             for i in range(len(columns1)):
 192 |                 for j in range(len(columns2)):
 193 |                     if rcwmi[(i, j)] != 0.0:
 194 |                         info[(i, j)] = rcwmi[(i, j)] * 1.0 / max_val
 195 |                     else:
 196 |                         info[(i, j)] = 0.0
 197 |                     
 198 |         elif coevolution == "cpvn":
 199 |             Flash('Contact Preferences, Volume Normalized')
 200 |             cpvn = dict()
 201 |             score_matrix = mapMatrix("CPVN")
 202 |             for i in range(len(columns1)):
 203 |                 Flash('Column ' + str(i))
 204 |                 for j in range(len(columns2)):
 205 |                     res1 = str(alignment1[0][i])
 206 |                     res2 = str(alignment2[0][j])
 207 |                     average = []
 208 |                     for a, b in zip(columns1[i], columns2[j]):
 209 |                         if a in aa and b in aa:
 210 |                             average.append(float(matchScore(res1, res2, score_matrix)))
 211 |                     cpvn[(i, j)] = mean(average)
 212 |             
 213 |             max_pos = []
 214 |             for i in range(len(columns1)):
 215 |                 for j in range(len(columns2)):
 216 |                     max_pos.append(cpvn[(i, j)])
 217 |             max_val = max(max_pos)
 218 |                     
 219 |             for i in range(len(columns1)):
 220 |                 for j in range(len(columns2)):
 221 |                     if cpvn[(i, j)] != 0.0:
 222 |                         info[(i, j)] = cpvn[(i, j)] * 1.0 / max_val
 223 |                     else:
 224 |                         info[(i, j)] = 0.0
 225 | 
 226 |         elif coevolution == "clm":
 227 |             Flash('Contact PDB-derived Likelihood Matrix')
 228 |             clm = dict()
 229 |             score_matrix = mapMatrix("CLM")
 230 |             for i in range(len(alignment1[0])):
 231 |                 Flash('Column ' + str(i))
 232 |                 for j in range(len(alignment2[0])):
 233 |                     res1 = str(alignment1[0][i])
 234 |                     res2 = str(alignment2[0][j])
 235 |                     average = []
 236 |                     for a, b in zip(columns1[i], columns2[j]):
 237 |                         if a in aa and b in aa:
 238 |                             average.append(float(matchScore(res1, res2, score_matrix)))
 239 |                     clm[(i, j)] = mean(average)
 240 |             
 241 |             max_pos = []
 242 |             for i in range(len(columns1)):
 243 |                 for j in range(len(columns2)):
 244 |                     max_pos.append(clm[(i, j)])
 245 |             max_val = max(max_pos)
 246 |                     
 247 |             for i in range(len(columns1)):
 248 |                 for j in range(len(columns2)):
 249 |                     if clm[(i, j)] != 0.0:
 250 |                         info[(i, j)] = clm[(i, j)] * 1.0 / max_val
 251 |                     else:
 252 |                         info[(i, j)] = 0.0
 253 |                     
 254 |         elif coevolution == "vol":
 255 |             Flash('Residue-residue Volume Normalized')
 256 |             vol = dict()
 257 |             score_matrix = mapMatrix("VOL")
 258 |             for i in range(len(alignment1[0])):
 259 |                 Flash('Column ' + str(i))
 260 |                 for j in range(len(alignment2[0])):
 261 |                     res1 = str(alignment1[0][i])
 262 |                     res2 = str(alignment2[0][j])
 263 |                     average = []
 264 |                     for a, b in zip(columns1[i], columns2[j]):
 265 |                         if a in aa and b in aa:
 266 |                             average.append(float(matchScore(res1, res2, score_matrix)))
 267 |                     vol[(i, j)] = mean(average)
 268 |             
 269 |             max_pos = []
 270 |             for i in range(len(columns1)):
 271 |                 for j in range(len(columns2)):
 272 |                     max_pos.append(vol[(i, j)])
 273 |             max_val = max(max_pos)
 274 |                     
 275 |             for i in range(len(columns1)):
 276 |                 for j in range(len(columns2)):
 277 |                     if vol[(i, j)] != 0.0:
 278 |                         info[(i, j)] = vol[(i, j)] * 1.0 / max_val
 279 |                     else:
 280 |                         info[(i, j)] = 0.0
 281 |                     
 282 |         elif coevolution == "omes":
 283 |             Flash('Observed Minus Expected Squared')
 284 |             omes = dict()
 285 |             for i in range(len(columns1)):
 286 |                 Flash('Column ' + str(i))
 287 |                 for j in range(len(columns2)):
 288 |                     omes[(i, j)] = covarianceOMES(columns1[i], columns2[j])
 289 |                     
 290 |             max_pos = []
 291 |             for i in range(len(columns1)):
 292 |                 for j in range(len(columns2)):
 293 |                     max_pos.append(omes[(i, j)])
 294 |             max_val = max(max_pos)
 295 |                     
 296 |             for i in range(len(columns1)):
 297 |                 for j in range(len(columns2)):
 298 |                     if omes[(i, j)] != 0.0:
 299 |                         info[(i, j)] = omes[(i, j)] * 1.0 / max_val
 300 |                     else:
 301 |                         info[(i, j)] = 0.0
 302 |                     
 303 |         elif coevolution == "pearson":
 304 |             Flash("Pearson's correlation")
 305 |             pearson = dict()
 306 |             score_matrix = mapMatrix("MCLACHLAN")
 307 |             N = len(columns1[0])
 308 |             for i in range(len(columns1)):
 309 |                 Flash('Column ' + str(i))
 310 |                 for j in range(len(columns2)):
 311 |                     d_matrix1 = twoDimensionalMatrix(columns1[i], score_matrix)
 312 |                     d_matrix2 = twoDimensionalMatrix(columns2[j], score_matrix)
 313 |                     pearson[(i, j)] = pearsonsCorrelation(d_matrix1, d_matrix2, N)
 314 |                     
 315 |             max_pos = []
 316 |             for i in range(len(columns1)):
 317 |                 for j in range(len(columns2)):
 318 |                     max_pos.append(pearson[(i, j)])
 319 |             max_val = max(max_pos)
 320 |                     
 321 |             for i in range(len(columns1)):
 322 |                 for j in range(len(columns2)):
 323 |                     if pearson[(i, j)] != 0.0:
 324 |                         info[(i, j)] = pearson[(i, j)] * 1.0 / max_val
 325 |                     else:
 326 |                         info[(i, j)] = 0.0
 327 |                     
 328 |         elif coevolution == "spearman":
 329 |             Flash("Spearman's rank correlation")
 330 |             score_matrix = mapMatrix("MCLACHLAN")
 331 |             spearman = dict()
 332 |             N = len(columns1[0])
 333 |             for i in range(len(columns1)):
 334 |                 Flash('Column ' + str(i))
 335 |                 for j in range(len(columns2)):
 336 |                     d_matrix1 = twoDimensionalMatrix(columns1[i], score_matrix)
 337 |                     d_matrix2 = twoDimensionalMatrix(columns2[j], score_matrix)
 338 |                     spearman[(i, j)] = spearmansCorrelation(d_matrix1, d_matrix2, N)
 339 |             
 340 |             max_pos = []
 341 |             for i in range(len(columns1)):
 342 |                 for j in range(len(columns2)):
 343 |                     max_pos.append(spearman[(i, j)])
 344 |             max_val = max(max_pos)
 345 |                     
 346 |             for i in range(len(columns1)):
 347 |                 for j in range(len(columns2)):
 348 |                     if spearman[(i, j)] != 0.0:
 349 |                         info[(i, j)] = spearman[(i, j)] * 1.0 / max_val
 350 |                     else:
 351 |                         info[(i, j)] = 0.0
 352 |                     
 353 |         elif coevolution == "mcbasc":
 354 |             Flash('McLachlan Based Substitution Correlation')
 355 |             mcbasc = dict()
 356 |             score_matrix = mapMatrix("MCLACHLAN")
 357 |             N = len(columns1[0])
 358 |             for i in range(len(columns1)):
 359 |                 Flash('Column ' + str(i))
 360 |                 for j in range(len(columns2)):
 361 |                     d_matrix1 = twoDimensionalMatrix(columns1[i], score_matrix)
 362 |                     d_matrix2 = twoDimensionalMatrix(columns2[j], score_matrix)
 363 |                     mcbasc[(i, j)] = mcbascCorrelation(d_matrix1, d_matrix2, N)
 364 |             
 365 |             max_pos = []
 366 |             for i in range(len(columns1)):
 367 |                 for j in range(len(columns2)):
 368 |                     max_pos.append(mcbasc[(i, j)])
 369 |             max_val = max(max_pos)
 370 |                     
 371 |             for i in range(len(columns1)):
 372 |                 for j in range(len(columns2)):
 373 |                     if mcbasc[(i, j)] != 0.0:
 374 |                         info[(i, j)] = mcbasc[(i, j)] * 1.0 / max_val
 375 |                     else:
 376 |                         info[(i, j)] = 0.0 
 377 |         
 378 |         elif coevolution == "quartets":
 379 |             Flash('Quartets')
 380 |             quartets = dict()
 381 |             for i in range(len(columns1)):
 382 |                 Flash('Column ' + str(i))
 383 |                 for j in range(len(columns2)):
 384 |                     quartets[(i, j)] = quartetsCorrelation(columns1[i], columns2[j])
 385 |             
 386 |             max_pos = []
 387 |             for i in range(len(columns1)):
 388 |                 for j in range(len(columns2)):
 389 |                     max_pos.append(quartets[(i, j)])
 390 |             max_val = max(max_pos)
 391 |                     
 392 |             for i in range(len(columns1)):
 393 |                 for j in range(len(columns2)):
 394 |                     if quartets[(i, j)] != 0.0:
 395 |                         info[(i, j)] = quartets[(i, j)] * 1.0 / max_val
 396 |                     else:
 397 |                         info[(i, j)] = 0.0
 398 |                         
 399 |         elif coevolution == "sca":
 400 |             Flash('Statistical Coupling Analysis')
 401 |             sca = dict()   
 402 |             for i in range(len(columns1)):
 403 |                 Flash('Column ' + str(i))
 404 |                 for j in range(len(columns2)):
 405 |                     sca[(i, j)] = perturbationSCA(columns1[i], columns2[j], \
 406 |                                                   j, columns2)
 407 |             max_pos = []
 408 |             for i in range(len(columns1)):
 409 |                 for j in range(len(columns2)):
 410 |                     max_pos.append(sca[(i, j)])
 411 |             max_val = max(max_pos)
 412 |                     
 413 |             for i in range(len(columns1)):
 414 |                 for j in range(len(columns2)):
 415 |                     if sca[(i, j)] != 0.0:
 416 |                         info[(i, j)] = sca[(i, j)] * 1.0 / max_val
 417 |                     else:
 418 |                         info[(i, j)] = 0.0
 419 |                     
 420 |         elif coevolution == "elsc":
 421 |             Flash('Explicit Likelihood of Subset Covariation') 
 422 |             elsc = dict()  
 423 |             for i in range(len(columns1)):
 424 |                 Flash('Column ' + str(i))
 425 |                 for j in range(len(columns2)):
 426 |                     elsc[(i, j)] = perturbationELSC(columns1[i], columns2[j], \
 427 |                                                    j, columns2)       
 428 |             max_pos = []
 429 |             for i in range(len(columns1)):
 430 |                 for j in range(len(columns2)):
 431 |                     max_pos.append(elsc[(i, j)])
 432 |             max_val = max(max_pos)
 433 |                     
 434 |             for i in range(len(columns1)):
 435 |                 for j in range(len(columns2)):
 436 |                     if elsc[(i, j)] != 0.0:
 437 |                         info[(i, j)] = elsc[(i, j)] * 1.0 / max_val
 438 |                     else:
 439 |                         info[(i, j)] = 0.0               
 440 |         else: pass
 441 |         
 442 |         output = self.dirname + alignment + "_" + coevolution + ".txt"
 443 |         results = open(output, "w")
 444 |         for i, j in sorted(info.keys()):
 445 |             if protein1 != [] and protein2 != []:
 446 |                 print >> results, protein1[i], protein2[j], \
 447 |                 round((info[(i, j)]), 4)
 448 |             elif protein1 != [] and protein2 == []:
 449 |                 print >> results, protein1[i], protein1[j], \
 450 |                 round((info[(i, j)]), 4)
 451 |             else:
 452 |                 print >> results, str(i + 1), str(j + 1), \
 453 |                 round((info[(i, j)]), 4)
 454 |         results.close()
 455 |     
 456 |     def bestInfo(self, id1, id2, alignment, coevolution):
 457 |         "Points out the best coevolution scores"
 458 |         
 459 |         seq = class_sequence(self.file1, self.file2, self.id1, self.id2,
 460 |                        self.chain1, self.chain2, self.parameterfile, 
 461 |                        self.dirname)
 462 |         
 463 |         histogram = LP(self.parameterfile, "results_histogram")
 464 |         heatmap = LP(self.parameterfile, "results_heatmap")
 465 |         best_info = LP(self.parameterfile, "best_results")
 466 |         
 467 |         surface1 = []
 468 |         surface2 = []
 469 |         interface = []
 470 |         try:
 471 |             surface1 = seq.parseSurfacePDB(id1)
 472 |             surface2 = seq.parseSurfacePDB(id2)
 473 |         except:
 474 |             pass
 475 |         
 476 |         try:
 477 |             interface = seq.parseInterfacePDB(id1)
 478 |         except:
 479 |             pass
 480 |         
 481 |         input = self.dirname + alignment + "_" + coevolution + ".txt"
 482 |         output = self.dirname + alignment + "_" + coevolution + "_best.txt"
 483 |         bestResults(input, output, best_info, surface1, surface2, interface)
 484 |         
 485 |         if histogram == True:
 486 |             input = self.dirname + alignment + "_" + coevolution + ".txt"
 487 |             output = self.dirname + alignment + "_" + coevolution + "_hg.png"
 488 |             drawHistogram(input, output)
 489 |             
 490 |         if heatmap == True:
 491 |             input = self.dirname + alignment + "_" + coevolution + ".txt"
 492 |             output = self.dirname + alignment + "_" + coevolution + "_hm.png"
 493 |             drawHeatmap(id1, id2, input, output)
 494 |         
 495 |         
 496 |     def structureSingle(self, id1, id2, chain1, chain2, alignment, coevolution):
 497 |         "Structure based results for proteins with single chain"
 498 |         
 499 |         structure = LP(self.parameterfile, "results_structure")
 500 |         best_info = LP(self.parameterfile, "best_results")
 501 |         
 502 |         input = self.dirname + alignment + "_" + coevolution + "_best.txt"
 503 |         input_results = open(input, "r")
 504 |         results = input_results.readlines()
 505 |         input_results.close()
 506 |             
 507 |         positions1 = []
 508 |         positions2 = []
 509 |         for line in results:
 510 |             l = line.rstrip("\n")
 511 |             l = l.split()
 512 |             res1 = int(l[0])
 513 |             res2 = int(l[1])
 514 |             positions1.append(res1)
 515 |             positions2.append(res2)
 516 |             
 517 |         if structure == "pymol":
 518 |             output1 = self.dirname + id1 + ".pml"
 519 |             out_struct1 = open(output1, "w")
 520 |             print >> out_struct1, "load %s" % (id1 + ".pdb")
 521 |             print >> out_struct1, "hide lines"
 522 |             print >> out_struct1, "hide nonbonded"
 523 |             print >> out_struct1, "bg_color black"
 524 |             print >> out_struct1, "color grey20"
 525 |             print >> out_struct1, "show cartoon"
 526 |             print >> out_struct1, "select hitmol, chain %s" % (chain1.lower())
 527 |             print >> out_struct1, "color red, (hitmol and resid *)"
 528 |             for pos in positions1:
 529 |                 if len(positions1) <= 20:
 530 |                     print >> out_struct1, "color yellow, (hitmol and resid %s)" \
 531 |                     % (str(pos + 1))
 532 |                     print >> out_struct1, "show spheres, (hitmol and resid %s)" \
 533 |                     % (str(pos + 1))     
 534 |                 else:
 535 |                     print >> out_struct1, "color yellow, (hitmol and resid %s)" \
 536 |                     % (str(pos + 1))
 537 |                     print >> out_struct1, "show sticks, (hitmol and resid %s)" \
 538 |                     % (str(pos + 1))
 539 |             out_struct1.close()
 540 |             
 541 |             output2 = self.dirname + id2 + ".pml"
 542 |             out_struct2 = open(output2, "w")
 543 |             print >> out_struct2, "load %s" % (id2 + ".pdb")
 544 |             print >> out_struct2, "hide lines"
 545 |             print >> out_struct2, "hide nonbonded"
 546 |             print >> out_struct2, "bg_color black"
 547 |             print >> out_struct2, "color grey20"
 548 |             print >> out_struct2, "show cartoon"
 549 |             print >> out_struct2, "select hitmol, chain %s" % (chain2.lower())
 550 |             print >> out_struct2, "color blue, (hitmol and resid *)"
 551 |             for pos in positions2:
 552 |                 if best_info <= 20:
 553 |                     print >> out_struct2, "color green, (hitmol and resid %s)" \
 554 |                     % (str(pos + 1))
 555 |                     print >> out_struct2, "show spheres, (hitmol and resid %s)" \
 556 |                     % (str(pos + 1))     
 557 |                 else:
 558 |                     print >> out_struct2, "color green, (hitmol and resid %s)" \
 559 |                     % (str(pos + 1))
 560 |                     print >> out_struct2, "show sticks, (hitmol and resid %s)" \
 561 |                     % (str(pos + 1))
 562 |             out_struct2.close()
 563 |         else: pass
 564 |         
 565 |         #copyfile(self.dirname + id1 + ".pdb", self.dirname + id1 + ".pdb")
 566 |         #copyfile(self.dirname + id2 + ".pdb", self.dirname + id2 + ".pdb")
 567 |         
 568 |     def structurePair(self, id1, id2, chain1, chain2, alignment, coevolution):
 569 |         "Structure based results for a protein with two chains"
 570 |         
 571 |         structure = LP(self.parameterfile, "results_structure")
 572 |         best_info = LP(self.parameterfile, "best_results")
 573 |         
 574 |         input = self.dirname + alignment + "_" + coevolution + "_best.txt"
 575 |         input_results = open(input, "r")
 576 |         results = input_results.readlines()
 577 |         input_results.close()
 578 |             
 579 |         positions1 = []
 580 |         positions2 = []
 581 |         for line in results:
 582 |             l = line.rstrip("\n")
 583 |             l = l.split()
 584 |             res1 = int(l[0])
 585 |             res2 = int(l[1])
 586 |             positions1.append(res1)
 587 |             positions2.append(res2)
 588 |             
 589 |         if structure == "pymol":
 590 |             output = self.dirname + id1 + ".pml"
 591 |             
 592 |             out_struct = open(output, "w")
 593 |             print >> out_struct, "load %s" % (id1 + ".pdb")
 594 |             print >> out_struct, "hide lines"
 595 |             print >> out_struct, "hide nonbonded"
 596 |             print >> out_struct, "bg_color black"
 597 |             print >> out_struct, "color grey20"
 598 |             print >> out_struct, "show cartoon"
 599 |             print >> out_struct, "select hitmol1, chain %s" % (chain1.lower())
 600 |             print >> out_struct, "select hitmol2, chain %s" % (chain2.lower())
 601 |             print >> out_struct, "color red, (hitmol1)"
 602 |             print >> out_struct, "color blue, (hitmol2)" + "\n"
 603 |             for pos in positions1:
 604 |                 if best_info <= 20:
 605 |                     print >> out_struct, "color yellow, (hitmol1 and resid %s)" \
 606 |                     % (str(pos + 1))
 607 |                     print >> out_struct, "show spheres, (hitmol1 and resid %s)" \
 608 |                     % (str(pos + 1))     
 609 |                 else:
 610 |                     print >> out_struct, "color yellow, (hitmol1 and resid %s)" \
 611 |                     % (str(pos + 1))
 612 |                     print >> out_struct, "show sticks, (hitmol1 and resid %s)" \
 613 |                     % (str(pos + 1))
 614 |                     
 615 |             for pos in positions2:
 616 |                 if best_info <= 20:
 617 |                     print >> out_struct, "color green, (hitmol2 and resid %s)" \
 618 |                     % (str(pos + 1))
 619 |                     print >> out_struct, "show spheres, (hitmol2 and resid %s)" \
 620 |                     % (str(pos + 1))     
 621 |                 else:
 622 |                     print >> out_struct, "color green, (hitmol2 and resid %s)" \
 623 |                     % (str(pos + 1))
 624 |                     print >> out_struct, "show sticks, (hitmol2 and resid %s)" \
 625 |                     % (str(pos + 1))    
 626 |             out_struct.close()
 627 |         else: 
 628 |             pass
 629 |              
 630 |         #copyfile(self.dirname + id1 + ".pdb", self.dirname + id1 + ".pdb")
 631 |         
 632 | def matchScore(alpha, beta, score_matrix):
 633 |     "Matches scores from a matrix"
 634 |         
 635 |     alphabet = {}    
 636 |     alphabet["I"] = 0
 637 |     alphabet["V"] = 1
 638 |     alphabet["L"] = 2
 639 |     alphabet["F"] = 3
 640 |     alphabet["C"] = 4
 641 |     alphabet["M"] = 5
 642 |     alphabet["A"] = 6
 643 |     alphabet["G"] = 7
 644 |     alphabet["T"] = 8
 645 |     alphabet["S"] = 9
 646 |     alphabet["W"] = 10
 647 |     alphabet["Y"] = 11
 648 |     alphabet["P"] = 12
 649 |     alphabet["H"] = 13
 650 |     alphabet["E"] = 14
 651 |     alphabet["Q"] = 15
 652 |     alphabet["D"] = 16
 653 |     alphabet["N"] = 17
 654 |     alphabet["K"] = 18
 655 |     alphabet["R"] = 19
 656 |     lut_x = alphabet[alpha]
 657 |     lut_y = alphabet[beta]
 658 |     
 659 |     return score_matrix[lut_x][lut_y]
 660 | 
 661 | def matchScore2(alpha, beta, score_matrix):
 662 |     "Matches scores from a matrix - different residue order"
 663 |     
 664 |     alphabet = {}    
 665 |     alphabet["A"] = 0
 666 |     alphabet["R"] = 1
 667 |     alphabet["N"] = 2
 668 |     alphabet["D"] = 3
 669 |     alphabet["C"] = 4
 670 |     alphabet["Q"] = 5
 671 |     alphabet["E"] = 6
 672 |     alphabet["G"] = 7
 673 |     alphabet["H"] = 8
 674 |     alphabet["I"] = 9
 675 |     alphabet["L"] = 10
 676 |     alphabet["K"] = 11
 677 |     alphabet["M"] = 12
 678 |     alphabet["F"] = 13
 679 |     alphabet["P"] = 14
 680 |     alphabet["S"] = 15
 681 |     alphabet["T"] = 16
 682 |     alphabet["W"] = 17
 683 |     alphabet["Y"] = 18
 684 |     alphabet["V"] = 19
 685 |     lut_x = alphabet[alpha]
 686 |     lut_y = alphabet[beta]
 687 |     
 688 |     return score_matrix[lut_x][lut_y]
 689 |     
 690 | def mapMatrix(matrix):
 691 |     "Maps a matrix of floats"
 692 |     matrix = matrix.upper()
 693 |     
 694 |     score_matrix = []
 695 |     input = './Matrix/' + matrix
 696 |     input_matrix = open(input, 'r')
 697 |     for line in input_matrix.readlines():
 698 |         score_matrix.append(map(float, line.split()))
 699 |     input_matrix.close()
 700 |     
 701 |     return score_matrix
 702 |    
 703 | def twoDimensionalMatrix(column, score_matrix):
 704 |     "For each column in the alignment constructs a two-dimensional matrix"
 705 |     
 706 |     two_d = []
 707 |     for i in range(len(column)):
 708 |         for j in range(len(column)):
 709 |             if i != j:
 710 |                 res1 = column[i]
 711 |                 res2 = column[j]
 712 |                 if res1 in aa and res2 in aa:
 713 |                     s = float(matchScore2(res1, res2, score_matrix))
 714 |                     two_d.append(s)
 715 |                 else:
 716 |                     s = 0.0
 717 |                     two_d.append(s)
 718 |                 
 719 |     return two_d
 720 |         
 721 | def log21(n):  
 722 |     return log(n) * 1.0 / log(21)
 723 | 
 724 | def ln(n): 
 725 |     return log(n) * 1.0 / log(e)
 726 | 
 727 | def transpose(L):
 728 |     R = range(len(L[0]))
 729 |     rL = list()
 730 |     for i in R:
 731 |         rL.append(''.join([item[i] for item in L]))
 732 |     return rL
 733 |     
 734 | 
 735 | def probabilityDict(columns):
 736 |     "Caches character probabilities for each column"
 737 |     
 738 |     n = len(columns[0])
 739 |     pD = list()
 740 |     for col in columns:
 741 |         aa = list(set(col))
 742 |         values = [col.count(k) * 1.0 / n for k in aa]
 743 |         pD.append(dict(zip(aa, values)))
 744 |     return pD
 745 | 
 746 | 
 747 | def mutualInformation(i, j, cols1, cols2, pD1, pD2):
 748 |     """
 749 |     Mutual informaton for protein coevolution as by
 750 |     Gloor et al, 2005. MI(X,Y) = H(X) + H(Y) - H(X,Y)
 751 |     MI(X,Y) = SUMSUM P(x,y).log20(P(x,y)/P(x).P(y))
 752 |     Treates gaps as signal.
 753 |     """
 754 |     
 755 |     col1, col2 = cols1[i], cols2[j]
 756 |     n = len(col1)
 757 |     assert n == len(col2)
 758 |     mi = 0
 759 |     pairs = [col1[k] + col2[k] for k in range(n)]
 760 |     pL = sorted(list(set(pairs)))
 761 |     for p in pL: 
 762 |         pXY = pairs.count(p) * 1.0 / n
 763 |         pX = pD1[i][p[0]]
 764 |         pY = pD2[j][p[1]]
 765 |         inside = (pXY * 1.0) / (pX * pY)
 766 |         outside = pXY * log21(inside)
 767 |         mi += outside
 768 |     return mi
 769 | 
 770 | def miEntropy(i, j, cols1, cols2, pD1, pD2):
 771 |     """
 772 |     Mutual informaton by pair entropy - Martin et al, 2005.
 773 |     MI(X,Y) = (H(X) + H(Y) - H(X,Y)) / H(X,Y)
 774 |     MI(X,Y) = (SUMSUM P(x,y).log20(P(x,y)/P(x).P(y))) / 
 775 |                -(SUMSUM P(x,y).log20(P(x,y)))  
 776 |     """
 777 | 
 778 |     col1, col2 = cols1[i], cols2[j]
 779 |     assert len(col1) == len(col2)
 780 |     n = len(col1)
 781 |     mi = 0
 782 |     entropy = 0
 783 |     pairs = [col1[k] + col2[k] for k in range(n)]
 784 |     pL = sorted(list(set(pairs)))
 785 |     for p in pL: 
 786 |         pXY = pairs.count(p) * 1.0 / n
 787 |         pX = pD1[i][p[0]]
 788 |         pY = pD2[j][p[1]]
 789 |         inside = (pXY * 1.0) / (pX * pY)
 790 |         outside = pXY * log21(inside)
 791 |         mi += outside
 792 |     for p in pL: 
 793 |         pXY = pairs.count(p) * 1.0 / n
 794 |         inside = pXY
 795 |         outside = pXY * log21(inside)
 796 |         entropy += outside
 797 |     entropy = -entropy
 798 |     if entropy == 0.0:
 799 |         mi_entropy = 0.0
 800 |     else: mi_entropy = mi / entropy
 801 |     return mi_entropy
 802 |   
 803 | def rowColumnWeighed(mi, i_all, all_j, n):
 804 |     """
 805 |     Row and Column weighed Mutual Information - Gouveia-
 806 |     Oliveira et al, 2007. 
 807 |     RCW(X,Y) = MI(X,Y) / 
 808 |             (((MI(X,all) + MI(all,Y) - 2MI(X,Y))/(n-1))
 809 |     """
 810 |     
 811 |     bottom = (i_all + all_j - 2.0 * mi) / (n - 1)
 812 |     if bottom == 0.0:
 813 |         rcwmi = 0.0
 814 |     else: rcwmi = mi / bottom
 815 |     
 816 |     return rcwmi
 817 | 
 818 | def covarianceOMES(column1, column2):
 819 |     """
 820 |     Normalized Covariance analysis; OMES - Observed Minus Expected Squared
 821 |     derived from the covariance method of Kass and Horovitz, 2002
 822 |     """
 823 |  
 824 |     assert len(column1) == len(column2)
 825 |     
 826 |     L = []
 827 |     Nvalid = []
 828 |     Cxi = []
 829 |     Cyj = []
 830 |     for i, j in zip(column1, column2):
 831 |         if i in aa and j in aa:
 832 |             value = [i, j]
 833 |             Nvalid.append(value)
 834 |             Cxi.append(i)
 835 |             Cyj.append(j)
 836 |             if value not in L:
 837 |                 L.append(value)
 838 | 
 839 |     len_Nvalid = len(Nvalid)
 840 |     omes = 0.0
 841 |     for value in L:
 842 |         Nobs = Nvalid.count(value)
 843 |         i = value[0]
 844 |         j = value[1]
 845 |         Ci = Cxi.count(i)
 846 |         Cj = Cyj.count(j)
 847 |         Nex = Ci * Cj / len_Nvalid    
 848 |         top = (Nobs - Nex) ** 2
 849 |         omes += top * 1.0 / len_Nvalid
 850 |     
 851 |     return omes
 852 | 
 853 | def pearsonsCorrelation(d_matrix1, d_matrix2, N):
 854 |     """
 855 |     Pearson's Correlation (Gobel method) - Gobel et al, 1994.
 856 |     """
 857 |     
 858 |     assert len(d_matrix1) == len(d_matrix2)
 859 |     
 860 |     no_match = 0.0
 861 |     for k, l in zip(d_matrix1, d_matrix2):
 862 |         if k != l:
 863 |             no_match += 1.0
 864 |     length = len(d_matrix1)
 865 |     Wkl = no_match * 1.0 / length
 866 |     
 867 |     sigma_i = std(d_matrix1)
 868 |     Si = []
 869 |     av_Si = mean(d_matrix1)
 870 |     for i in (d_matrix1):
 871 |         Si.append(i - av_Si)
 872 |     
 873 |     sigma_j = std(d_matrix2)
 874 |     Sj = []
 875 |     av_Sj = mean(d_matrix1)
 876 |     for j in (d_matrix2):
 877 |         Sj.append(j - av_Sj)
 878 |     
 879 |     top = 0.0
 880 |     for i, j in zip(Si, Sj):
 881 |         top += float(i * j * Wkl)
 882 | 
 883 |     bottom = sigma_i * sigma_j
 884 |     if bottom == 0.0:
 885 |         pearson = 0.0
 886 |     else:
 887 |         pearson = (1.0 / N ** 2) * (top / bottom)
 888 |     
 889 |     return pearson
 890 | 
 891 | def spearmansCorrelation(d_matrix1, d_matrix2, N):
 892 |     """
 893 |     Spearman's rank Correlation - Pazos et al, 1997. 
 894 |     """
 895 |     
 896 |     assert len(d_matrix1) == len(d_matrix2)
 897 |     
 898 |     rank_matrix1 = []
 899 |     rank_matrix2 = []
 900 |     rank_temp1 = []
 901 |     rank_temp2 = []
 902 |     for k, l in zip(d_matrix1, d_matrix2):
 903 |         if k not in rank_temp1:
 904 |             rank_temp1.append(k)
 905 |             cnt = d_matrix1.count(k)
 906 |             rank = cnt * 1.0 / len(d_matrix1)
 907 |             rank_matrix1.append(rank)
 908 |         if l not in rank_temp2:
 909 |             rank_temp2.append(l)
 910 |             cnt = d_matrix2.count(l)
 911 |             rank = cnt * 1.0 / len(d_matrix2)
 912 |             rank_matrix2.append(rank)
 913 |     
 914 |     no_match = 0.0
 915 |     for k, l in zip(d_matrix1, d_matrix2):
 916 |         if k != l:
 917 |             no_match += 1.0
 918 |     length = len(d_matrix1)
 919 |     Wkl = no_match * 1.0 / length
 920 |     
 921 |     sigma_i = std(d_matrix1)
 922 |     Si = []
 923 |     av_Si = mean(d_matrix1)
 924 |     for i in (rank_matrix1):
 925 |         Si.append(i - av_Si)
 926 |     
 927 |     sigma_j = std(d_matrix2)
 928 |     Sj = []
 929 |     av_Sj = mean(d_matrix1)
 930 |     for j in (rank_matrix2):
 931 |         Sj.append(j - av_Sj)
 932 |     
 933 |     top = 0.0
 934 |     for i, j in zip(Si, Sj):
 935 |         top += float(i * j * Wkl)
 936 | 
 937 |     bottom = sigma_i * sigma_j
 938 |     if bottom == 0.0:
 939 |         spearman = 0.0
 940 |     else:
 941 |         spearman = (1.0 / N ** 2) * (top / bottom)
 942 |     
 943 |     return spearman
 944 | 
 945 | def mcbascCorrelation(d_matrix1, d_matrix2, N):
 946 |     """
 947 |     McBASC - McLachlan Based Substitution Correlation.
 948 |     Fodor and Aldrich, 2004.
 949 |     """
 950 |     
 951 |     assert len(d_matrix1) == len(d_matrix2)
 952 |     
 953 |     sigma_i = std(d_matrix1)
 954 |     Si = []
 955 |     av_Si = mean(d_matrix1)
 956 |     for i in (d_matrix1):
 957 |         Si.append(i - av_Si)
 958 |     
 959 |     sigma_j = std(d_matrix2)
 960 |     Sj = []
 961 |     av_Sj = mean(d_matrix1)
 962 |     for j in (d_matrix2):
 963 |         Sj.append(j - av_Sj)
 964 |     
 965 |     top = 0.0
 966 |     for i, j in zip(Si, Sj):
 967 |         top += float(i * j)
 968 | 
 969 |     bottom = sigma_i * sigma_j
 970 |     if bottom == 0.0:
 971 |         mcbasc = 0.0
 972 |     else:
 973 |         mcbasc = abs((1.0 / N ** 2) * (top / bottom))
 974 |     
 975 |     return mcbasc
 976 | 
 977 | 
 978 | def quartetsCorrelation(column1, column2):
 979 |     """
 980 |     Normalized Quartets correlation method by Galitsky, 2002.
 981 |     """
 982 |  
 983 |     assert len(column1) == len(column2)
 984 |     
 985 |     quartets = 0.0
 986 |     x = column1
 987 |     y = column2
 988 |     pairs = []
 989 |     for i, j in zip(x, y):
 990 |         value = [i, j]
 991 |         pairs.append(value)
 992 |         
 993 |     for i, j in zip(x, y):
 994 |         if i in aa and j in aa:
 995 |             Pix = x.count(i)
 996 |             Piy = y.count(i) 
 997 |             Pjx = x.count(j) 
 998 |             Pjy = y.count(j)
 999 |             val = [i, j]
1000 |             Dmin = pairs.count(val)
1001 |             Dif = 1.0 * (len(pairs) - Dmin)
1002 |             if Dif != 0.0:
1003 |                 DQmin = Dmin * 1.0 / Dif
1004 |             else:
1005 |                 DQmin = 0.0
1006 | 
1007 |             try :
1008 |                 if ((Pix * Pjy > Piy * Pjx) and ((Pix > Dmin) or (Pjy > Dmin)) or\
1009 |                     (Pix * Pjy < Piy * Pjx) and ((Piy > Dmin) or (Pjx > Dmin)))\
1010 |                     and\
1011 |                    (((Pix * Pjy) * 1.0 / (Piy * Pjx) > DQmin) or\
1012 |                     ((Piy * Pjx) * 1.0 / (Pix * Pjy) > DQmin)):
1013 |                     quartets += 1.0
1014 |             except:
1015 |                 quartets += 0
1016 |     return quartets
1017 | 
def perturbationSCA(column1, column2, j, columns2):
    """
    Normalized SCA - Statistical Coupling analysis, Lockless and
    Ranganathan, 1999. As on Halperin et al, 2006.

    The sub-alignment of columns2 is selected by subAlignment(column2,
    columns2) and its column j is compared with column1: for every
    residue of column1 that is in aa, (ln(Pixj) - Pix) ** 2 is summed
    when Pixj is non-zero, and the square root of the sum is returned.
    """

    assert len(column1) == len(column2)

    perturbed = subAlignment(column2, columns2)[j]

    total = 0.0
    for residue in column1:
        if residue not in aa:
            continue
        Pix = column1.count(residue) * 1.0 / len(column1)
        Pixj = perturbed.count(residue) * 1.0 / len(perturbed)
        if Pixj != 0.0:
            total += (ln(Pixj) - Pix) ** 2

    return sqrt(total)
1040 | 
def perturbationELSC(column1, column2, j, columns2):
    """
    Normalized ELSC - Explicit Likelihood of Subset Covariation,
    Dekker et al, 2004.

    Column j of subAlignment2(column1, column2, columns2) is the subset.
    For every residue of column1 that is in aa, the number of ways to
    pick the observed subset count out of the full count is divided by
    the number of ways to pick the proportionally expected count; the
    negative natural logarithm of the product of these ratios is
    returned (0.0 when the product is zero).
    """

    assert len(column1) == len(column2)

    subset = subAlignment2(column1, column2, columns2)[j]
    Nall = len(column2)
    nall = len(subset)

    product = 1.0
    for residue in column1:
        if residue not in aa:
            continue
        Nxj = column2.count(residue)
        nxj = subset.count(residue)
        mxj = int(round((Nxj * 1.0 / Nall) * nall))
        top = long(factorial(Nxj))
        observed = top / (factorial(nxj) * factorial(Nxj - nxj))
        expected = top / (factorial(mxj) * factorial(Nxj - mxj))
        product *= (observed * 1.0 / expected)

    if product != 0.0:
        return -ln(product)
    return 0.0
1080 | 
def subAlignment (column, columns):
    """
    Creates a sub_alignment based on the most frequent AA in column.

    Finds the residue of column (restricted to those in aa) with the
    highest count, collects the row positions where column holds that
    residue, and returns every entry of columns reduced to those
    positions (a list of lists). Ties go to the residue that appears
    first in column.

    Raises IndexError when column holds no residue that is in aa.
    """

    counts = {}
    for residue in column:
        if residue in aa:
            counts[residue] = counts.get(residue, 0) + 1
    if not counts:
        raise IndexError("column holds no valid amino acid")

    # Select the highest count: an ascending sort followed by taking the
    # first element would return the rarest residue instead
    aa_j = None
    highest = 0
    for residue in column:
        if counts.get(residue, 0) > highest:
            highest = counts[residue]
            aa_j = residue

    col_positions = [pos for pos, residue in enumerate(column)
                     if residue == aa_j]

    return [[col[pos] for pos in col_positions] for col in columns]
1110 | 
def subAlignment2 (column1, column2, columns):
    """
    Creates a sub_alignment based on AA identity of column1.

    Keeps the row positions where column2 holds a residue that occurs
    in column1 (and is in aa), and returns every entry of columns
    reduced to those positions (a list of lists).
    """

    allowed = []
    for residue in column1:
        if residue in aa and residue not in allowed:
            allowed.append(residue)

    col_positions = [pos for pos, residue in enumerate(column2)
                     if residue in allowed]

    return [[col[pos] for pos in col_positions] for col in columns]
1137 |   
def bestResults(input, output, best_info, surface1, surface2, interface):
    """
    Creates a new list of best coevolution scores.

    input is a text file with one "res1 res2 score" record per line;
    blank lines are ignored. The best_info highest scores define a
    threshold, and records reaching it are written to output in input
    order, at most best_info of them. A record whose [res1, res2] pair
    is in interface gets the suffix "Interface contact".

    surface1 and surface2 are accepted for signature compatibility: the
    previous surface checks appended every record whatever their
    outcome, so they never filtered anything.

    When there are fewer records than best_info all of them are
    written; with no records or best_info <= 0 the output file is
    created empty.
    """

    with open(input, "r") as input_results:
        results = input_results.readlines()

    scored = []
    for line in results:
        fields = line.split()
        if not fields:
            continue
        scored.append([int(fields[0]), int(fields[1]), float(fields[2])])

    with open(output, "w") as out_best:
        if not scored or best_info <= 0:
            return

        ranked = sorted(scored, key=lambda entry: entry[2])
        # Clamp at zero: a negative index would wrap around and pick a
        # threshold from the top of the ranking
        position = max(len(ranked) - best_info, 0)
        threshold = ranked[position][2]

        count = 0
        for res1, res2, mi in scored:
            if mi >= threshold:
                count += 1
                if count > best_info:
                    # Ties at the threshold can exceed the quota
                    break
                if [res1, res2] in interface:
                    out_best.write("%s %s %s Interface contact\n"
                                   % (res1, res2, mi))
                else:
                    out_best.write("%s %s %s\n" % (res1, res2, mi))
1185 | 
def drawHistogram(input, output):
    """
    Creates a histogram of coevolution scores.

    input is a text file with one "res1 res2 score" record per line
    (blank lines are ignored); the scores are binned with 50 bins per
    score unit up to the highest score and the plot is saved to output.

    Raises ValueError when the file holds no record.
    """

    with open(input, "r") as input_results:
        results = input_results.readlines()

    scores = []
    for line in results:
        fields = line.split()
        if not fields:
            continue
        scores.append(float(fields[2]))

    maxi = max(scores)
    # The bin count must be a whole number and at least one
    bins = max(int(maxi * 50), 1)

    # Start from a fresh figure so earlier plots are not drawn over
    pyplot.figure()
    pyplot.hist(scores, bins=bins)
    # gca() returns the axes that hist() just drew on
    ax = pyplot.gca()
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.set_xlim(0, maxi)
    pyplot.savefig(output)
1214 |                 
def drawHeatmap(id1, id2, input, output):
    """
    Creates a heatmap of coevolution scores.

    input is a text file with one "res1 res2 score" record per line
    (blank lines are ignored). Scores are stored at [res2][res1] of a
    zero-filled matrix, drawn with pcolormesh and saved to output. The
    x axis is labelled id1 and the y axis id2; the axis limits run from
    the residues of the first record to those of the last record.

    Raises IndexError when the file holds no record.
    """

    with open(input, "r") as input_results:
        results = input_results.readlines()

    data = []
    for line in results:
        fields = line.split()
        if not fields:
            continue
        data.append([int(fields[0]), int(fields[1]), float(fields[2])])

    startX, startY = data[0][0], data[0][1]
    endX, endY = data[-1][0], data[-1][1]

    residue1 = set(entry[0] for entry in data)
    residue2 = set(entry[1] for entry in data)
    # Keep the "distinct residues + 1" size, but never smaller than the
    # largest index so sparse residue numbers stay inside the matrix
    lenX = max(len(residue1), max(residue1)) + 1
    lenY = max(len(residue2), max(residue2)) + 1

    heatmap = zeros((lenY, lenX))
    for X, Y, XY in data:
        heatmap[Y][X] = XY

    pyplot.figure()
    pyplot.pcolormesh(heatmap)
    pyplot.colorbar()
    # Fetch the plot axes once instead of calling pyplot.axes() per setting
    ax = pyplot.gca()
    ax.set_xlabel(id1)
    ax.set_ylabel(id2)
    ax.set_xlim(startX, endX)
    ax.set_ylim(startY, endY)
    pyplot.savefig(output)
1261 | 
1262 | 
1263 | 


--------------------------------------------------------------------------------
/src/INFO.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | from Bio import SeqIO
  9 | 
 10 | class information:
 11 |     """
 12 |     Main code for generating extended results.
 13 |     """
 14 |     def __init__(self, id1, id2, chain1, chain2, dirname):
 15 |         self.id1 = id1
 16 |         self.id2 = id2
 17 |         self.chain1 = chain1
 18 |         self.chain2 = chain2
 19 |         self.dirname = dirname
 20 |                 
 21 |     def __call__(self, id1, id2, chain1, chain2, dirname):
 22 |         self.id1 = id1
 23 |         self.id2 = id2
 24 |         self.chain1 = chain1
 25 |         self.chain2 = chain2
 26 |         self.dirname = dirname
 27 |     
 28 |     def getInfo(self, id):
 29 |         "Creates info about the sequences, psiblast, organisms, etc"
 30 |     
 31 |         input = self.dirname + id + ".fasta"
 32 |         sequence = SeqIO.parse(input, "fasta")
 33 |         for seq_record in sequence:
 34 |             seq = seq_record.seq
 35 |             length = len(seq)
 36 |             break
 37 |     
 38 |         input = self.dirname + id + ".blast"
 39 |         hit = 0
 40 |         sequences = SeqIO.parse(input, "fasta")
 41 |         for record in sequences:
 42 |             sequence = record.seq              
 43 |             hit += 1      
 44 |         
 45 |         input = self.dirname + id + ".fasta"
 46 |         sequences = SeqIO.parse(input, "fasta")
 47 |         organisms = 0
 48 |         for record in sequences:
 49 |             sequence = record.seq
 50 |             organisms += 1
 51 | 
 52 |         
 53 |         output = self.dirname + "results.txt"
 54 |         out = open(output, "a")
 55 |         print >> out, "ID" + "\t" + "LengSeq" + "\t" + "NHits" + "\t" + \
 56 |         "NOrganisms"
 57 |         print >> out, str(id) + "\t" + str(length) + "\t" + \
 58 |         str(hit) + "\t" + str(organisms) + "\n"
 59 |         out.close()
 60 |         
 61 |     def getSIFTS(self, id, chain):
 62 |         """
 63 |         Web_Services based on SIFTS @ 
 64 |         http://www.ebi.ac.uk/pdbe/docs/sifts/
 65 |         """
 66 |         
 67 |         id = id.lower()
 68 |         try:
 69 |             id = id.rstrip("_1")
 70 |         except:
 71 |             pass
 72 |         try:
 73 |             id = id.rstrip("_2")
 74 |         except:
 75 |             pass
 76 |         
 77 |         # Uniprot ID and SCOP
 78 |         input = "./SIFTS/pdb_chain_scop_uniprot.lst"
 79 |         sifts = open(input, "r")
 80 |         read = sifts.readlines()
 81 |         sifts.close()
 82 |         
 83 |         unip = "Not_found"
 84 |         scop = "Not_found"
 85 |         for line in read:
 86 |             if line[0:4] == str(id):
 87 |                 l = line.rstrip("\n")
 88 |                 l = l.split("\t")
 89 |                 if l[1] == str(chain):
 90 |                     unip = str(l[2])
 91 |                     scop = str(l[5])
 92 |         
 93 |         # CATH
 94 |         input = "./SIFTS/pdb_chain_cath_uniprot.lst"
 95 |         sifts = open(input, "r")
 96 |         read = sifts.readlines()
 97 |         sifts.close()
 98 | 
 99 |         cath = "Not_found"
100 |         for line in read:
101 |             if line[0:4] == str(id):
102 |                 l = line.rstrip("\n")
103 |                 l = l.split("\t")
104 |                 if l[1] == str(chain):
105 |                     cath = str(l[4])
106 | 
107 |         
108 |         # EC (enzyme)
109 |         input = "./SIFTS/pdb_chain_enzyme.lst"
110 |         sifts = open(input, "r")
111 |         read = sifts.readlines()
112 |         sifts.close()
113 |         
114 |         enz = "Not_found"
115 |         for line in read:
116 |             if line[0:4] == str(id):
117 |                 l = line.rstrip("\n")
118 |                 l = l.split("\t")
119 |                 if l[1] == str(chain):
120 |                     enz = str(l[4])
121 |         
122 |         # Interpro
123 |         input = "./SIFTS/pdb_chain_interpro.lst"
124 |         sifts = open(input, "r")
125 |         read = sifts.readlines()
126 |         sifts.close()
127 |         
128 |         inter = "Not_found"
129 |         for line in read:
130 |             if line[0:4] == str(id):
131 |                 l = line.rstrip("\n")
132 |                 l = l.split("\t")
133 |                 if l[1] == str(chain):
134 |                     inter = str(l[2])
135 |         
136 |         # Pfam
137 |         input = "./SIFTS/pdb_chain_pfam.lst"
138 |         sifts = open(input, "r")
139 |         read = sifts.readlines()
140 |         sifts.close()
141 |         
142 |         pfam = "Not_found"
143 |         for line in read:
144 |             if line[0:4] == str(id):
145 |                 l = line.rstrip("\n")
146 |                 l = l.split("\t")
147 |                 if l[1] == str(chain):
148 |                     pfam = str(l[4])
149 |  
150 |         # Taxonomy
151 |         input = "./SIFTS/pdb_chain_taxonomy.lst"
152 |         sifts = open(input, "r")
153 |         read = sifts.readlines()
154 |         sifts.close()
155 | 
156 |         taxid = "Not_found"
157 |         taxnm = "Not_found"
158 |         for line in read:
159 |             if line[0:4] == str(id):
160 |                 l = line.rstrip("\n")
161 |                 l = l.split("\t")
162 |                 if l[1] == str(chain):
163 |                     taxid = str(l[2])
164 |                     taxnm = str(l[7])
165 | 
166 |         
167 |         # Pubmed
168 |         input = "./SIFTS/pdb_pubmed.lst"
169 |         sifts = open(input, "r")
170 |         read = sifts.readlines()
171 |         sifts.close()
172 | 
173 |         pubm = "Not_found"
174 |         for line in read:
175 |             if line[0:4] == str(id):
176 |                 l = line.rstrip("\n")
177 |                 l = l.split("\t")
178 |                 pubm = str(l[2])
179 |         
180 |         output = self.dirname + "bioresults.txt"
181 |         out = open(output, "a")
182 |         print >> out, "Protein_ID" + "\t" + "Uniprot" + "\t" + "SCOP" + "\t" + \
183 |         "CATH" + "\t" + "Enzyme_EC" + "\t" + "Interpro" + "\t" + "Pfam" + "\t" + \
184 |         "Taxonomy_id" + "\t" + "Taxonomy_name" + "\t" + "Pubmed" 
185 |         
186 |         print >> out, str(id) + "\t" + str(unip) + "\t" + \
187 |         str(scop) + "\t" + str(cath) + "\t" + str(enz) + "\t" + \
188 |         str(inter) + "\t" + str(pfam) + "\t" + str(taxid) + "\t" + \
189 |         str(taxnm) + "\t" + str(pubm) + "\n"
190 |         out.close()
191 |         
192 |         
193 | 


--------------------------------------------------------------------------------
/src/MAIN.py:
--------------------------------------------------------------------------------
  1 | ###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | from src.SEQ import sequence
  9 | from src.BLAST import psiblast
 10 | from src.ORGANISM import organism
 11 | from src.ALIGN import alignment
 12 | from src.COEVOL import coevolution
 13 | from src.INFO import information
 14 | from Parameters import LoadParameters as LP
 15 | 
 16 | class main:
 17 |     """
 18 |     Main script caller.
 19 |     """
 20 |     def __init__(self, file1, file2, id1, id2, chain1, chain2, parameterfile, 
 21 |                  psiblast, alignment, coevolution, dirname):
 22 |         self.file1 = str(file1)
 23 |         self.file2 = str(file2)
 24 |         self.id1 = str(id1)
 25 |         self.id2 = str(id2)
 26 |         self.chain1 = str(chain1)
 27 |         self.chain2 = str(chain2)
 28 |         self.parameterfile= str(parameterfile)
 29 |         self.psiblast = str(psiblast)
 30 |         self.alignment = str(alignment)
 31 |         self.coevolution = str(coevolution)
 32 |         self.dirname = str(dirname)
 33 |         
 34 |     def __call__(self, file1, file2, id1, id2, chain1, chain2, parameterfile, 
 35 |                  psiblast, alignment, coevolution, dirname):
 36 |         self.file1 = str(file1)
 37 |         self.file2 = str(file2)
 38 |         self.id1 = str(id1)
 39 |         self.id2 = str(id2)
 40 |         self.chain1 = str(chain1)
 41 |         self.chain2 = str(chain2)
 42 |         self.parameterfile= str(parameterfile)
 43 |         self.psiblast = str(psiblast)
 44 |         self.alignment = str(alignment)
 45 |         self.coevolution = str(coevolution)
 46 |         self.dirname = str(dirname)
 47 |     
 48 |     def sequenceSripts(self):
 49 |         seq = sequence(self.file1, self.file2, self.id1, self.id2, 
 50 |                        self.chain1, self.chain2, self.parameterfile, 
 51 |                        self.dirname)
 52 |         if self.id1 != self.id2:
 53 |             if self.chain1 == "" and self.chain2 == "":
 54 |                 seq.validFASTA(self.file1, self.id1)
 55 |                 seq.queryFASTA(self.file1, self.id1)
 56 |                 seq.validFASTA(self.file2, self.id2)
 57 |                 seq.queryFASTA(self.file2, self.id2)
 58 |             else:
 59 |                 seq.validPDB(self.file1, self.id1, self.chain1)
 60 |                 seq.sequencePDB(self.file1, self.id1, self.chain1)
 61 |                 seq.surfacePDB(self.file1, self.id1, self.chain1)
 62 |                 seq.validPDB(self.file2, self.id2, self.chain2)
 63 |                 seq.sequencePDB(self.file2, self.id2, self.chain2)
 64 |                 seq.surfacePDB(self.file2, self.id2, self.chain2)
 65 |         else:
 66 |             if self.chain1 == "" and self.chain2 == "":
 67 |                 seq.validFASTA(self.file1, self.id1)
 68 |                 seq.queryFASTA(self.file1, self.id1)
 69 |             else:
 70 |                 if self.chain1 != self.chain2:
 71 |                     seq.validPDB(self.file1, self.id1, self.chain1)
 72 |                     seq.sequencePDB(self.file1, self.id1 + "_1", self.chain1)
 73 |                     seq.surfacePDB(self.file1, self.id1 + "_1", self.chain1)
 74 |                     seq.validPDB(self.file1, self.id1, self.chain2)
 75 |                     seq.sequencePDB(self.file1, self.id1 + "_2", self.chain2)
 76 |                     seq.surfacePDB(self.file1, self.id1 + "_2", self.chain2)
 77 |                 else:
 78 |                     seq.validPDB(self.file1, self.id1, self.chain1)
 79 |                     seq.sequencePDB(self.file1, self.id1, self.chain1)
 80 |                     seq.surfacePDB(self.file1, self.id1, self.chain1)
 81 |         return
 82 |     
 83 |     def psiblastSripts(self):
 84 |         seq = sequence(self.file1, self.file2, self.id1, self.id2, 
 85 |                        self.chain1, self.chain2, self.parameterfile,
 86 |                        self.dirname)
 87 |         blast = psiblast(self.id1, self.id2, self.psiblast,
 88 |                          self.parameterfile, self.dirname)
 89 |         if self.id1 != self.id2:
 90 |             blast.searchPSIBLAST(self.id1,self.psiblast)
 91 |             blast.searchPSIBLAST(self.id2,self.psiblast)
 92 |             blast.validXML(self.id1)
 93 |             blast.validXML(self.id2)
 94 |             blast.sequencesXML(self.id1,self.psiblast)
 95 |             blast.sequencesXML(self.id2,self.psiblast)
 96 |         else:
 97 |             if self.chain1 == "" and self.chain2 == "":
 98 |                 seq.copySequence(self.id1)
 99 |                 blast.searchPSIBLAST(self.id1 + "_1",self.psiblast)
100 |                 blast.searchPSIBLAST(self.id1 + "_2",self.psiblast)
101 |                 blast.validXML(self.id1 + "_1")
102 |                 blast.validXML(self.id1 + "_2")
103 |                 blast.sequencesXML(self.id1 + "_1",self.psiblast)
104 |                 blast.sequencesXML(self.id1 + "_2",self.psiblast)
105 |             else:
106 |                 if self.chain1 != self.chain2:
107 |                     blast.searchPSIBLAST(self.id1 + "_1",self.psiblast)
108 |                     blast.searchPSIBLAST(self.id1 + "_2",self.psiblast)
109 |                     blast.validXML(self.id1 + "_1")
110 |                     blast.validXML(self.id1 + "_2")
111 |                     blast.sequencesXML(self.id1 + "_1",self.psiblast)
112 |                     blast.sequencesXML(self.id1 + "_2",self.psiblast)
113 |                 else:
114 |                     seq.copySequence(self.id1)
115 |                     blast.searchPSIBLAST(self.id1 + "_1",self.psiblast)
116 |                     blast.searchPSIBLAST(self.id1 + "_2",self.psiblast)
117 |                     blast.validXML(self.id1 + "_1")
118 |                     blast.validXML(self.id1 + "_2")
119 |                     blast.sequencesXML(self.id1 + "_1",self.psiblast)
120 |                     blast.sequencesXML(self.id1 + "_2",self.psiblast)
121 |         return
122 |     
123 |     def organismSripts(self):
124 |         org = organism(self.id1, self.id2, self.psiblast,
125 |                        self.parameterfile, self.dirname)
126 |         if self.id1 != self.id2:
127 |             org.uniqueOrganism(self.id1, self.id2)
128 |             org.pairwiseDistance(self.id1, self.id2)
129 |             org.getsCorrelation()
130 |             org.removeSequences(self.id1, self.id2)
131 |         else:
132 |             org.uniqueOrganism(self.id1 + "_1", self.id1 + "_2")
133 |             org.pairwiseDistance(self.id1 + "_1", self.id1 + "_2")
134 |             org.getsCorrelation()
135 |             org.removeSequences(self.id1 + "_1", self.id1 + "_2")                               
136 |         return
137 |     
138 |     def alignmentSripts(self): 
139 |         aln = alignment(self.id1, self.id2, self.alignment, 
140 |                         self.parameterfile, self.dirname)
141 |         if self.id1 != self.id2:
142 |             aln.computeAlignment(self.id1, self.alignment)
143 |             aln.computeAlignment(self.id2, self.alignment)
144 |             #aln.alignScore(self.id1, self.alignment)
145 |             #aln.alignScore(self.id2, self.alignment)
146 |         else:
147 |             aln.computeAlignment(self.id1 + "_1", self.alignment)
148 |             aln.computeAlignment(self.id1 + "_2", self.alignment)
149 |             #aln.alignScore(self.id1 + "_1", self.alignment)
150 |             #aln.alignScore(self.id1 + "_2", self.alignment)                              
151 |         return
152 |     
153 |     def coevolutionSripts(self):
154 |         coevol = coevolution(self.file1, self.file2, self.id1, self.id2, 
155 |                              self.chain1, self.chain2, self.alignment, 
156 |                              self.coevolution, self.parameterfile, 
157 |                              self.dirname)
158 |         if self.id1 != self.id2:
159 |             coevol.coevolAnalysis(self.file1, self.file2,
160 |                                   self.id1, self.id2, 
161 |                                   self.chain1, self.chain2, 
162 |                                   self.alignment, self.coevolution)
163 |             coevol.bestInfo(self.id1, self.id2,  
164 |                                   self.alignment, self.coevolution)
165 |             if self.chain1 == "" and self.chain2 == "":
166 |                 pass
167 |             else:
168 |                 coevol.structureSingle(self.id1, self.id2, 
169 |                                             self.chain1, self.chain2, 
170 |                                             self.alignment, self.coevolution)
171 |                 
172 |         else:
173 |             coevol.coevolAnalysis(self.file1, self.file1,
174 |                                   self.id1 + "_1", self.id1 + "_2", 
175 |                                   self.chain1, self.chain2, 
176 |                                   self.alignment, self.coevolution)
177 |             coevol.bestInfo(self.id1 + "_1", self.id1 + "_2",  
178 |                                   self.alignment, self.coevolution)
179 |             if self.chain1 == "" and self.chain2 == "":
180 |                 pass
181 |             else:
182 |                 if self.chain1 != self.chain2:
183 |                     coevol.structurePair(self.id1, self.id1, 
184 |                                             self.chain1, self.chain2, 
185 |                                             self.alignment, self.coevolution)
186 |         return
187 |     
188 |     def infoScripts(self, SIFTS):
189 |         info = information(self.id1, self.id2,self.chain1, self.chain2, 
190 |                            self.dirname)
191 |         
192 |         results_sifts = LP(self.parameterfile, "results_sifts")
193 |         
194 |         if self.id1 != self.id2:
195 |             if self.chain1 == "" and self.chain2 == "":
196 |                 info.getInfo(self.id1)
197 |                 info.getInfo(self.id2)
198 |             else:
199 |                 info.getInfo(self.id1)
200 |                 info.getInfo(self.id2)
201 |                 if results_sifts == True and SIFTS==True:
202 |                     info.getSIFTS(self.id1, self.chain1)
203 |                     info.getSIFTS(self.id2, self.chain2)
204 |                 else: pass
205 |         else:
206 |             if self.chain1 == "" and self.chain2 == "":
207 |                 info.getInfo(self.id1 + "_1")
208 |             else:
209 |                 if self.chain1 != self.chain2:
210 |                     info.getInfo(self.id1 + "_1")
211 |                     info.getInfo(self.id1 + "_2")
212 |                     if results_sifts == True and SIFTS==True:
213 |                         info.getSIFTS(self.id1 + "_1", self.chain1)
214 |                         info.getSIFTS(self.id1 + "_2", self.chain2)
215 |                     else: pass
216 |                 else:
217 |                     info.getInfo(self.id1 + "_1")
218 |                     if results_sifts == True and SIFTS==True:
219 |                         info.getSIFTS(self.id1 + "_1", self.chain1)
220 |                     else: pass
221 |         return
222 | 
223 |     
224 |     
225 | 


--------------------------------------------------------------------------------
/src/ORGANISM.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | import os
  9 | from src.UTILS import aa
 10 | from Parameters import LoadParameters as LP
 11 | from os import remove, system
 12 | from numpy import mean, sqrt, log, median
 13 | from math import e
 14 | from collections import OrderedDict
 15 | from Bio import SeqIO, AlignIO
 16 | from Bio.Alphabet import IUPAC
 17 |     
 18 | class organism:
 19 |     """
 20 |     Main code for sort and selection of organisms. 
 21 |     
 22 |     Methods for calculate distance between pairwise alignments:
 23 |     ClustalW pairwise distance - Chenna et al, 2003 
 24 |     p-distance - Jukes and Cantor, 1969
 25 |     Jukes-Cantor - Jukes and Cantor, 1969
 26 |     Kimura Distance - Kimura, 1983
 27 |     Alignment score using PAM250 or BLOSUM62 -Dayhoff et al, 1978;
 28 |     Henikoff and Henikoff, 1992
 29 |     """
 30 |     def __init__(self, id1, id2, psiblast, parameterfile, dirname):
 31 |         self.id1 = id1
 32 |         self.id2 = id2
 33 |         self.psiblast = psiblast
 34 |         self.parameterfile = parameterfile
 35 |         self.dirname = dirname
 36 |         
 37 |     def __call__(self, id1, id2, psiblast, parameterfile, dirname):
 38 |         self.id1 = id1
 39 |         self.id2 = id2
 40 |         self.psiblast = psiblast
 41 |         self.parameterfile = parameterfile
 42 |         self.dirname = dirname
 43 |         
 44 |     def uniqueOrganism(self, id1, id2):
 45 |         "Removes unmatched organisms and concatenates sequences"
 46 |         
 47 |         input1 = self.dirname + id1 + ".blast"        
 48 |         input2 = self.dirname + id2 + ".blast"
 49 |         
 50 | 
 51 |         ord_dict1 = orderedDict(SeqIO.parse(input1, "fasta", IUPAC.protein),
 52 |                                 key_function=checkOrganism)
 53 |         
 54 |         ord_dict2 = orderedDict(SeqIO.parse(input2, "fasta", IUPAC.protein),
 55 |                                 key_function=checkOrganism)
 56 |         
 57 |         org1 = []
 58 |         for keys1 in ord_dict1.keys():
 59 |             if keys1 in ord_dict2.keys():
 60 |                 organism = ord_dict1[keys1].description
 61 |                 org1.append(organism)
 62 |         
 63 |         org2 = []
 64 |         for keys2 in ord_dict2.keys():
 65 |             if keys2 in ord_dict1.keys():
 66 |                 organism = ord_dict2[keys2].description
 67 |                 org2.append(organism)
 68 |                 
 69 | 
 70 |         if org1 == [] or org2 == []:
 71 |             raise StandardError, "There is no matching organisms"
 72 |         elif len(org1) < 15 or len(org2) < 15:
 73 |             raise StandardError, "Number of matching organisms <15"
 74 |         else: pass
 75 | 
 76 |         organism = []
 77 |         list = []
 78 |         for org in org1:
 79 |             if org in org2:
 80 |                 value = [org1.index(org) + org2.index(org),
 81 |                          org1.index(org), org2.index(org), org]
 82 |                 list.append(value)
 83 |         sort = sorted(list)
 84 |         for index in sort:
 85 |             org = index[3]
 86 |             organism.append(org)
 87 |             
 88 |         input_sequences1 = SeqIO.parse(input1, "fasta", IUPAC.protein)
 89 |         sequences1 = []
 90 |         for record in input_sequences1:
 91 |             org = str(record.description)
 92 |             seq = str(record.seq)
 93 |             if org in org1:
 94 |                 value = [org, seq]
 95 |                 sequences1.append(value)
 96 |                 
 97 |         input_sequences2 = SeqIO.parse(input2, "fasta", IUPAC.protein) 
 98 |         sequences2 = []
 99 |         for record in input_sequences2:
100 |             org = str(record.description)
101 |             seq = str(record.seq)
102 |             if org in org2:
103 |                 value = [org, seq]
104 |                 sequences2.append(value)    
105 |         
106 |         self.ord_sequences1 = []
107 |         self.ord_sequences2 = []
108 |         for org in organism:
109 |             seq = ""
110 |             for o in sequences1:
111 |                 organ = o[0]
112 |                 seque = o[1]
113 |                 if org == organ:
114 |                     seq += seque + ":"
115 |             value = [org, seq]
116 |             self.ord_sequences1.append(value)
117 |             seq = ""
118 |             for o in sequences2:
119 |                 organ = o[0]
120 |                 seque = o[1]
121 |                 if org == organ:
122 |                     seq += seque + ":"
123 |             value = [org, seq]
124 |             self.ord_sequences2.append(value)
125 |         
126 |         return self.ord_sequences1, self.ord_sequences2
127 |         
128 |         
129 |     def pairwiseDistance(self, id1, id2, method=None):
130 |         """
131 |         Calculates distance between each pair by diferent methods:
132 |         ClustalW distance, p-distance, Jukes-Cantor and Alignment score, 
133 |         with BLOSUM62 or PAM250 matrix.
134 |         (edit Parameters.py)
135 |         """
136 |         
137 |         method = LP(self.parameterfile, "pairwise_distance")
138 |         align_matrix = LP(self.parameterfile, "alignscore_matrix")
139 |         distances1 = []
140 |         distances2 = []
141 |         
142 |         input = self.dirname + id1 + ".fasta"
143 |         input_query = SeqIO.parse(input, "fasta", IUPAC.protein)
144 |         for record in input_query:
145 |             q_desc = str(record.description)
146 |             q_seq = str(record.seq)
147 |             break
148 |            
149 |         for entry in self.ord_sequences1:
150 |             p_desc = str(entry[0])
151 |             p_seq = str(entry[1])
152 |             p_seq = p_seq.rstrip(":")
153 |             p_seq = p_seq.split(":")
154 |             new_rec = []
155 |             for seq in p_seq:
156 |                 p_new_seq = seq   
157 |                 pair = self.dirname + id1 + ".pair"
158 |                 out_pair = open(pair, "w")
159 |             
160 |                 sequence1 = str("\n" + ">" + q_desc + "\n" + q_seq + "\n")
161 |                 sequence2 = str("\n" + ">" + p_desc + "\n" + p_new_seq + "\n")
162 |                 out_pair.write(sequence1 + sequence2)
163 |                 out_pair.close()    
164 |             
165 |                 output_align = self.dirname + id1 + ".aln"
166 |                 output_tree = self.dirname + id1 + ".dnd"
167 |                 distance = self.dirname + id1 + ".distance"
168 |                 
169 |                 try:
170 |                     cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe")
171 |                     clustalw = system(cmd + " " + pair + " > " + distance)
172 |                     clustalw
173 |                 except:
174 |                     cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw")
175 |                     clustalw = system(cmd + " " + pair + " > " + distance)
176 |                     clustalw
177 |                 
178 |                 output_fasta = self.dirname + id1 + "_pair.fasta"
179 |                 AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
180 |                 
181 | 
182 |                 input_align = SeqIO.parse(output_fasta, "fasta", IUPAC.protein)
183 |                 msa = []
184 |                 for record in input_align:
185 |                     seq = str(record.seq)
186 |                     msa.append(seq)
187 |                 sequence1 = msa[0]
188 |                 sequence2 = msa[1]
189 |             
190 |                 pair_score = getDistance(sequence1, sequence2,
191 |                                          method, align_matrix, distance)
192 |                 value = [pair_score, p_new_seq]
193 |                 new_rec.append(value) 
194 |             
195 |             sort = sorted(new_rec, key=lambda new_rec: new_rec[0])
196 |             new_dist = sort[0][0]
197 |             new_seq = sort[0][1]
198 |             distances1.append(new_dist)    
199 |             output = self.dirname + id1 + ".fasta"
200 |             out_fasta = open(output, "a")
201 |             out_fasta.write("\n" + ">" + p_desc + "\n" + new_seq + "\n")
202 |             out_fasta.close()
203 |                 
204 |         try:
205 |             remove(pair)
206 |             remove(output_align)
207 |             remove(output_tree)
208 |             remove(output_fasta)
209 |             remove(distance)
210 |         except:
211 |             pass
212 |         
213 |         input = self.dirname + id2 + ".fasta"
214 |         input_query = SeqIO.parse(input, "fasta", IUPAC.protein)
215 |         for record in input_query:
216 |             q_desc = str(record.description)
217 |             q_seq = str(record.seq)
218 |             break
219 |         
220 |         for entry in self.ord_sequences2:
221 |             p_desc = str(entry[0])
222 |             p_seq = str(entry[1])
223 |             p_seq = p_seq.rstrip(":")
224 |             p_seq = p_seq.split(":")
225 |             new_rec = []
226 |             for seq in p_seq:
227 |                 p_new_seq = seq           
228 |                 pair = self.dirname + id2 + ".pair"
229 |                 out_pair = open(pair, "w")
230 |             
231 |                 sequence1 = str("\n" + ">" + q_desc + "\n" + q_seq + "\n")
232 |                 sequence2 = str("\n" + ">" + p_desc + "\n" + p_new_seq + "\n")
233 |                 out_pair.write(sequence1 + sequence2)
234 |                 out_pair.close()    
235 |             
236 |                 output_align = self.dirname + id2 + ".aln"
237 |                 output_tree = self.dirname + id2 + ".dnd"
238 |                 distance = self.dirname + id2 + ".distance"
239 |                 
240 |                 try:
241 |                     cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw.exe")
242 |                     clustalw = system(cmd + " " + pair + " > " + distance)
243 |                     clustalw
244 |                 except:
245 |                     cmd = str(os.getcwd() + "/src/tools/clustalw/clustalw")
246 |                     clustalw = system(cmd + " " + pair + " > " + distance)
247 |                     clustalw 
248 |                 
249 |                 output_fasta = self.dirname + id2 + "_pair.fasta"
250 |                 AlignIO.convert(output_align, "clustal", output_fasta, "fasta")
251 |             
252 |                 input_align = SeqIO.parse(output_fasta, "fasta", IUPAC.protein)
253 |                 msa = []
254 |                 for record in input_align:
255 |                     seq = str(record.seq)
256 |                     msa.append(seq)
257 |                 sequence1 = msa[0]
258 |                 sequence2 = msa[1]
259 |             
260 |                 pair_score = getDistance(sequence1, sequence2,
261 |                                          method, align_matrix, distance)
262 |                 value = [pair_score, p_new_seq]
263 |                 new_rec.append(value) 
264 |             
265 |             sort = sorted(new_rec, key=lambda new_rec: new_rec[0])
266 |             new_dist = sort[0][0]
267 |             new_seq = sort[0][1]
268 |             distances2.append(new_dist)    
269 |             output = self.dirname + id2 + ".fasta"
270 |             out_fasta = open(output, "a")
271 |             out_fasta.write("\n" + ">" + p_desc + "\n" + new_seq + "\n")
272 |             out_fasta.close()
273 |                 
274 |         try:
275 |             remove(pair)
276 |             remove(output_align)
277 |             remove(output_tree)
278 |             remove(output_fasta)
279 |             remove(distance)
280 |         except:
281 |             pass
282 |         
283 |         
284 |         output = self.dirname + "matrix.txt" 
285 |         out_distance = open(output, "w")
286 |         for i in range(len(distances1)):
287 |             print >> out_distance, "1" + "\t" + str(i + 2) + "\t" + \
288 |                                             str(distances1[i]) + "\t" + \
289 |                                             str(distances2[i])
290 |         out_distance.close()
291 |     
292 |     def getsCorrelation(self, method=None):
293 |         """
294 |         Python implementation of the Theil-Sen Estimator.
295 |         Calculates the correlation, a distance between 
296 |         each point P(x,y) to the mean slope. Distance of
297 |         P(m,n) to Ax+By+C=0 is d=Abs(Am+Bn+C)/Sqrt(A^2+B^2)
298 |         """
299 |         try:
300 |             input = str(self.dirname + "matrix.txt")
301 |             file = open(input, "r")
302 |             file.close() 
303 |         except:    
304 |             return
305 |         
306 |         input = self.dirname + "matrix.txt"
307 |         input_matrix = open(input, "r")
308 |         matrix = input_matrix.readlines()
309 |         input_matrix.close()
310 |         
311 |         Xs = []
312 |         Ys = []
313 |         for line in matrix:
314 |             l = line.rstrip("\n")
315 |             l = l.split()
316 |             X = float(l[2])
317 |             Y = float(l[3])
318 |             Xs.append(X)
319 |             Ys.append(Y)
320 |         slope = theilsenEstimator(Xs, Ys)
321 |         
322 |         m = -slope
323 |         divisor = sqrt(1 + m ** 2)
324 |         distance = []
325 |         for f in range(len(Xs)):
326 |             d = abs(m * Xs[f] + Ys[f]) / divisor
327 |             distance.append(d)
328 |         
329 |         output = self.dirname + "correlation.txt"
330 |         out_correlation = open(output, "w")
331 |         print >> out_correlation, "Slope: %s" % (str(slope))
332 |         for d in range(len(distance)):
333 |             print >> out_correlation, str(d + 2) + "\t" + str(distance[d])
334 |         out_correlation.close()
335 |                  
336 | 
337 |     def removeSequences(self, id1, id2):
338 |         """
339 |         Removes sequences that not correlate and are point out by the
340 |         Theil-Sen estimator. It implements an easy algorithm to remove 
341 |         distante sequences.
342 |         """
343 |         
344 |         try:
345 |             input = str(self.dirname + "correlation.txt")
346 |             file = open(input, "r")
347 |             file.close() 
348 |         except:    
349 |             return
350 |         
351 |         input = self.dirname + "correlation.txt"
352 |         input_correlation = open(input, "r")
353 |         correlation = input_correlation.readlines()
354 |         input_correlation.close()
355 |         
356 |         value = []
357 |         for line in correlation:
358 |             if ":" in line:
359 |                 pass
360 |             else:
361 |                 l = line.rstrip("\n")
362 |                 l = l.split("\t")
363 |                 seq = int(l[0])
364 |                 d = float(l[1])
365 |                 if seq != 0:
366 |                     value.append(d)    
367 |         
368 |         removed = []
369 |         threshold = LP(self.parameterfile, "theilsen_cutoff")
370 |         maximum = max(value)
371 |         minimum = min(value)
372 |         median_all = median(value)
373 |         median_min = median_all - ((median_all - minimum) * 1.0 * threshold)
374 |         median_max = median_all + ((maximum - median_all) * 1.0 * threshold)
375 |         for v in value:
376 |             if v < median_min or v > median_max:
377 |                 position = value.index(v)    
378 |                 removed.append(position + 1)
379 |             else: pass       
380 |         
381 |         if removed != 0:
382 |             sequences1 = []
383 |             input = self.dirname + id1 + ".fasta"
384 |             input_sequences = SeqIO.parse(input, "fasta", IUPAC.protein)
385 |             for record in input_sequences:
386 |                 desc = record.description
387 |                 seq = record.seq
388 |                 value = [str(desc), str(seq)]
389 |                 sequences1.append(value)
390 |                
391 |             output_fasta = open(input, "w") 
392 |             for i in range(len(sequences1)):
393 |                 if i not in removed:
394 |                     desc = str(sequences1[i][0])
395 |                     seq = str(sequences1[i][1])
396 |                     output_fasta.write(">" + desc + "\n" + seq + "\n" + "\n")
397 |                 else: 
398 |                     pass
399 |             output_fasta.close()
400 |             
401 |             
402 |             sequences2 = []
403 |             input = self.dirname + id2 + ".fasta"
404 |             input_sequences = SeqIO.parse(input, "fasta", IUPAC.protein)
405 |             for record in input_sequences:
406 |                 desc = record.description
407 |                 seq = record.seq
408 |                 value = [str(desc), str(seq)]
409 |                 sequences2.append(value)
410 |             
411 |             output_fasta = open(input, "w")    
412 |             for i in range(len(sequences2)):
413 |                 if i not in removed:
414 |                     desc = str(sequences2[i][0])
415 |                     seq = str(sequences2[i][1])
416 |                     output_fasta.write(">" + desc + "\n" + seq + "\n" + "\n")
417 |                 else: 
418 |                     pass
419 |             output_fasta.close()
420 |         else: pass
421 |             
422 |  
def theilsenEstimator(Xs, Ys):
    """
    The Theil-Sen estimator calculates the median slope 
    among all lines through pairs of two-dimensional 
    sample points.
    
    Xs and Ys are equally long sequences of coordinates. Every unordered
    pair of points is used exactly once; pairs with equal x (vertical
    lines) are skipped. Returns the median of the pairwise slopes (nan
    when no pair has distinct x values).
    """
    assert len(Xs) == len(Ys)
    slopes = []
    for f in range(len(Xs) - 1):
        x1 = Xs[f]
        y1 = Ys[f]
        # Start after f so that no pair is visited twice or with itself.
        for g in range(f + 1, len(Xs)):
            x2 = Xs[g]
            y2 = Ys[g]
            if x1 != x2:
                # * 1.0 keeps the division exact for integer inputs.
                slopes.append((y2 - y1) * 1.0 / (x2 - x1))
    
    # Theil-Sen is defined by the median, which is what makes it robust to
    # outliers; the mean used before is not.
    return median(slopes)
443 |         
def matchScore(alpha, beta, score_matrix):
    """
    Matches scores from a matrix.
    
    alpha and beta are one-letter residue codes; the result is
    score_matrix[row of alpha][column of beta]. Rows/columns follow the
    order A R N D C Q E G H I L K M F P S T W Y V B Z X, and the gap
    symbol "-" shares index 22 with "X". Unknown symbols raise KeyError.
    """
    residues = "ARNDCQEGHILKMFPSTWYVBZX"
    alphabet = dict((symbol, index) for index, symbol in enumerate(residues))
    alphabet["-"] = alphabet["X"]
    
    row = alphabet[alpha]
    column = alphabet[beta]
    return score_matrix[row][column]
476 |     
def mapMatrix(align_matrix):
    """
    Maps a matrix of floats.
    
    Reads ./Matrix/<ALIGN_MATRIX> (the name is upper-cased, the path is
    relative to the current working directory) and returns one list of
    floats per line, split on whitespace. Raises IOError when the file
    cannot be opened and ValueError when a field is not a number.
    """
    matrix = align_matrix.upper()
    input = './Matrix/' + matrix
    
    # "with" closes the file even if a field fails to parse; real lists
    # (not map objects) keep the rows indexable on every Python version.
    with open(input, 'r') as input_matrix:
        score_matrix = [[float(field) for field in line.split()]
                        for line in input_matrix]
    
    return score_matrix
489 |     
def checkOrganism(record):
    "Dictionary key of a record: its description without trailing newlines"
    description = record.description
    return description.rstrip("\n")
494 |     
def orderedDict(sequences, key_function=None):
    """
    Defines an ordered dictionary.
    
    Every record of sequences is stored under key_function(record). Keys
    keep the position of their first appearance; when a key repeats, the
    later record replaces the earlier one.
    """
    records = OrderedDict()
    for entry in sequences:
        records[key_function(entry)] = entry
    return records
504 | 
def ln(n): 
    "Natural logarithm of n, computed as log(n) / log(e)"
    numerator = log(n) * 1.0
    denominator = log(e)
    return numerator / denominator
507 |      
def getDistance(sequence1, sequence2, method, align_matrix, distance):
    """
    Returns the distance between the sequences.
    
    method selects the calculation: "clustalw" (read from the file named
    by distance), "pdistance", "jukescantor", "kimura" or "alignscore"
    (scored with the matrix named by align_matrix). Any other value
    raises StandardError.
    """
    if method == "clustalw":
        return clustalwDistance(distance)
    if method == "pdistance":
        return pDistance(sequence1, sequence2)
    if method == "jukescantor":
        return jukesCantor(sequence1, sequence2)
    if method == "kimura":
        return kimuraDistance(sequence1, sequence2)
    if method == "alignscore":
        score_matrix = mapMatrix(align_matrix)
        return alignmentScore(sequence1, sequence2, score_matrix)
    raise StandardError("%s - Invalid method for distance calculation" % (method))
524 | 
def clustalwDistance(distance):
    """
    Gets the distance from clustalw scores.

    distance is the path of a text file. Lines containing
    "Sequences (1:2) Aligned. Score:" are looked up; the last
    whitespace-separated field of such a line is read as an integer and
    multiplied by 0.01. When several lines match, the last one wins.

    Raises ValueError when no line carries the marker (this used to fail
    with an unbound local variable) or when the field is not an integer.
    """
    state = "Sequences (1:2) Aligned. Score:"

    score = None
    # 'with' releases the handle even if int() rejects the score field
    with open(distance, "r") as report:
        for line in report:
            if state in line:
                score = 0.01 * int(line.split()[-1])

    if score is None:
        raise ValueError("%s - No '%s' line found" % (distance, state))
    return score
542 | 
def pDistance(sequence1, sequence2):
    """
    Proportion of sites at which the two sequences are different.
    p is close to 1 for poorly related sequences, and p is close
    to 0 for similar sequences. d = p

    The sequences are compared position by position, so they must have
    the same length; every symbol takes part in the comparison.

    Returns a float between 0 and 1; two empty sequences give 0.0.
    Raises AssertionError when the lengths differ.
    """
    length = len(sequence1)
    # explicit raise: a bare assert is stripped under "python -O", after
    # which zip() would silently truncate the longer sequence
    if length != len(sequence2):
        raise AssertionError("sequences must have the same length")

    # nothing to compare: avoid dividing by zero
    if length == 0:
        return 0.0

    mismatches = sum(1 for a, b in zip(sequence1, sequence2) if a != b)
    return mismatches * 1.0 / length
562 | 
563 | 
def jukesCantor(sequence1, sequence2):
    """
    Maximum likelihood estimate of the number of substitutions
    between two sequences. p is described with the method
    p-distance. d = -19/20 log(1 - p * 20/19)

    Returns the distance as a string (str() of the float). When the
    logarithm is undefined, i.e. 1 - p * 20/19 is not positive, "0.0" is
    returned, exactly as for a "nan" result.
    """
    exterior = -19 * 1.0 / 20
    interior = 1 - pDistance(sequence1, sequence2) * 20 * 1.0 / 19
    try:
        score = exterior * log(interior)
    except ValueError:
        # a log() that rejects non-positive arguments (math.log does)
        # raises instead of returning nan; route it to the nan fallback
        score = float("nan")

    score = str(score)
    if score == "nan":
        score = str(0.0)

    return score
580 | 
def kimuraDistance(sequence1, sequence2):
    """
    Kimura's distance. This is a rough-and-ready distance formula
    for approximating PAM distance by simply measuring the fraction
    of amino acids, p, that differs between two sequences and
    computing the distance as (Kimura, 1983).
    d = - log_e (1 - p - 0.2 p^2 ).

    Returns the distance as a string (str() of the float). When the
    logarithm is undefined, i.e. 1 - p - 0.2 p^2 is not positive, "0.0"
    is returned, exactly as for a "nan" result.
    """
    p_distance = pDistance(sequence1, sequence2)
    interior = (1 - p_distance - 0.2 * p_distance ** 2)
    try:
        score = -ln(interior)
    except ValueError:
        # a logarithm that rejects non-positive arguments raises instead
        # of returning nan; route it to the nan fallback below
        score = float("nan")

    score = str(score)
    if score == "nan":
        score = str(0.0)

    return score
600 | 
def alignmentScore(sequence1, sequence2, score_matrix):
    """
    Distance (d) between two sequences (1, 2) is computed from
    the pairwise alignment score between the two sequences (score12),
    and the pairwise alignment score between each sequence and itself
    (score11, score22). This metric ignores gaps.
    d = (1-score12/score11)* (1-score12/score22)

    The sequences must be aligned (same length). Scores are summed over
    the alignment columns:
      * score12 adds matchScore(a, b) for every column whose two symbols
        are both listed in aa;
      * score11 / score22 add matchScore(x, x) for every symbol of the
        respective sequence that is not a gap ("-").
    The previous implementation summed over every combination of a
    position in one sequence with a position in the other, which is
    quadratic in the length and is not an alignment score.

    Raises AssertionError when the lengths differ and ZeroDivisionError
    when a self score is zero (e.g. a sequence made of gaps only).
    """
    if len(sequence1) != len(sequence2):
        raise AssertionError("sequences must have the same length")

    score12 = 0.0
    score11 = 0.0
    score22 = 0.0
    for a, b in zip(sequence1, sequence2):
        if a in aa and b in aa:
            score12 += float(matchScore(a, b, score_matrix))
        if a != "-":
            score11 += float(matchScore(a, a, score_matrix))
        if b != "-":
            score22 += float(matchScore(b, b, score_matrix))

    part1 = (1 - score12 * 1.0 / score11)
    part2 = (1 - score12 * 1.0 / score22)

    return part1 * part2
639 | 
640 | 


--------------------------------------------------------------------------------
/src/SASA.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | """Adapted from Surface Area (ASA) - (C) Bosco Ho 
  9 | http://boscoh.com/protein/calculating-the-solvent-accessible-surface-area-asa
 10 | 
 11 | Calculates the Solvent Accessible Surface Area (SASA) using the classic
 12 | 'rolling ball' algorithm - A. Shrake & J. A. Rupley.
 13 | Environment and Exposure to Solvent of Protein Atoms. Lysozyme and Insulin.
 14 | J Mol Biol. 79 (1973) 351-371.
 15 | """
 16 | 
 17 | from src.UTILS import radii
 18 | from math import pi, sqrt, cos, sin
 19 | 
 20 | SMALL = 1E-6
 21 | two_char_elements = [el for el, r in radii.items() if len(el) == 2]
 22 | 
 23 | def SASA(input, output):
 24 |     mol = Molecule(input)
 25 |     atoms = mol.atoms()
 26 |     add_radii(atoms)
 27 |     
 28 |     n_sphere = 960
 29 |     asas = calculateSASA(atoms, 1.4, n_sphere)
 30 |     
 31 |     for asa, atom in zip(asas, atoms):
 32 |         atom.bfactor = asa
 33 |     mol.write_pdb(output)
 34 |     return
 35 | 
 36 | def generateSpherePoints(n):
 37 |     """
 38 |     Returns list of 3d coordinates of points on a sphere using the
 39 |     Golden Section Spiral algorithm.
 40 |     """
 41 |     points = []
 42 |     inc = pi * (3 - sqrt(5))
 43 |     offset = 2 / float(n)
 44 |     for k in range(int(n)):
 45 |         y = k * offset - 1 + (offset / 2)
 46 |         r = sqrt(1 - y * y)
 47 |         phi = k * inc
 48 |         points.append([cos(phi) * r, y, sin(phi) * r])
 49 |     return points
 50 | 
 51 | 
 52 | def findNeighborIndices(atoms, probe, k):
 53 |     """
 54 |     Returns list of indices of atoms within probe distance to atom k. 
 55 |     """
 56 |     neighbor_indices = []
 57 |     atom_k = atoms[k]
 58 |     radius = atom_k.radius + probe + probe
 59 |     indices = range(k)
 60 |     indices.extend(range(k + 1, len(atoms)))
 61 |     for i in indices:
 62 |         atom_i = atoms[i]
 63 |         dist = pos_distance(atom_k.pos, atom_i.pos)
 64 |         if dist < radius + atom_i.radius:
 65 |             neighbor_indices.append(i)
 66 |     return neighbor_indices
 67 | 
 68 | 
 69 | def calculateSASA(atoms, probe, n_sphere_point=960):
 70 |     """
 71 |     Returns list of accessible surface areas of the atoms, using the probe
 72 |     and atom radius to define the surface.
 73 |     """
 74 |     sphere_points = generateSpherePoints(n_sphere_point)
 75 | 
 76 |     const = 4.0 * pi / len(sphere_points)
 77 |     test_point = Vector3d()
 78 |     areas = []
 79 |     for i, atom_i in enumerate(atoms):
 80 |         neighbor_indices = findNeighborIndices(atoms, probe, i)
 81 |         n_neighbor = len(neighbor_indices)
 82 |         j_closest_neighbor = 0
 83 |         radius = probe + atom_i.radius
 84 | 
 85 |         n_accessible_point = 0
 86 |         for point in sphere_points:
 87 |             is_accessible = True
 88 | 
 89 |             test_point.x = point[0] * radius + atom_i.pos.x
 90 |             test_point.y = point[1] * radius + atom_i.pos.y
 91 |             test_point.z = point[2] * radius + atom_i.pos.z
 92 | 
 93 |             cycled_indices = range(j_closest_neighbor, n_neighbor)
 94 |             cycled_indices.extend(range(j_closest_neighbor))
 95 | 
 96 |             for j in cycled_indices:
 97 |                 atom_j = atoms[neighbor_indices[j]]
 98 |                 r = atom_j.radius + probe
 99 |                 diff_sq = pos_distance_sq(atom_j.pos, test_point)
100 |                 if diff_sq < r * r:
101 |                     j_closest_neighbor = j
102 |                     is_accessible = False
103 |                     break
104 |             if is_accessible:
105 |                 n_accessible_point += 1
106 | 
107 |         area = const * n_accessible_point * radius * radius 
108 |         areas.append(area)
109 |     return areas
110 | 
111 | 
def add_radii(atoms):
    """
    Sets the radius attribute of every atom from the radii table, using
    the entry stored under '.' for elements the table does not list.
    """
    for atom in atoms:
        key = atom.element if atom.element in radii else '.'
        atom.radius = radii[key]
118 | 
def pos_distance_sq(p1, p2):
    """Squared distance between two points given by x, y, z attributes."""
    dx, dy, dz = p1.x - p2.x, p1.y - p2.y, p1.z - p2.z
    return dx * dx + dy * dy + dz * dz
124 | 
def pos_distance(p1, p2):
    """Distance between two points: square root of pos_distance_sq."""
    squared = pos_distance_sq(p2, p1)
    return sqrt(squared)
127 | 
class Molecule:
    """
    Ordered collection of atoms read from a PDB file.

    id holds the chain identifier of the first atom read by read_pdb
    (an empty string until then).
    """
    def __init__(self, pdb=""):
        "Creates an empty molecule, or reads it from the file pdb"
        self.id = ''
        self._atoms = []
        if pdb:
            self.read_pdb(pdb)

    def n_atom(self):
        "Number of atoms"
        return len(self._atoms)

    def atoms(self):
        "The internal list of atoms (not a copy)"
        return self._atoms

    def atom(self, i):
        "The atom at list position i"
        return self._atoms[i]

    def clear(self):
        "Removes every atom; the list object itself is kept"
        del self._atoms[:]

    def transform(self, matrix):
        "Applies matrix to the position of every atom, in place"
        for atom in self._atoms:
            atom.pos.transform(matrix)

    def insert_atom(self, atom):
        "Appends an atom"
        self._atoms.append(atom)

    def erase_atom(self, atom_type):
        "Removes the first atom whose type equals atom_type, if any"
        for index, atom in enumerate(self._atoms):
            if atom.type == atom_type:
                del self._atoms[index]
                return

    def read_pdb(self, fname):
        """
        Replaces the atoms with those of the ATOM/HETATM lines of fname.
        Reading stops at the first ENDMDL line.
        """
        self.clear()
        # 'with' closes the file on every exit path, including the
        # early return at ENDMDL
        with open(fname, 'r') as pdb_file:
            for line in pdb_file:
                if line.startswith("ATOM") or line.startswith("HETATM"):
                    atom = AtomFromPdbLine(line)
                    # the id comes from the first atom (the former
                    # "len == 1" test picked the second one and left the
                    # id empty for single-atom files)
                    if not self._atoms:
                        self.id = atom.chain_id
                    self.insert_atom(atom)
                if line.startswith("ENDMDL"):
                    return

    def write_pdb(self, pdb):
        """
        Writes one pdb_str() line per atom to the file pdb. Atoms are
        written in ascending order of their num attribute and renumbered
        from 1 as they are written.
        """
        with open(pdb, 'w') as out:
            # key-based stable sort: same order the cmp-based sort
            # produced, without needing a comparison function
            ordered = sorted(self._atoms, key=lambda atom: atom.num)
            for number, atom in enumerate(ordered, 1):
                atom.num = number
                out.write(atom.pdb_str() + '\n')
182 | 
def AtomFromPdbLine(line):
    """
    Returns an Atom object from an atom line in a pdb file.

    Fields are taken from fixed character positions of the line:
    serial number [6:11], atom name [12:16], residue name [17:20],
    chain [21], residue number [22:26], insertion code [26],
    coordinates [30:38], [38:46], [46:54], occupancy [54:60] and
    B-factor [60:66].

    The element is guessed from the non-digit, non-blank characters of
    [12:15]: their first two characters when those are listed in
    two_char_elements, otherwise the first one (an empty string when
    there is none). A missing or unreadable occupancy defaults to 100.0
    and a missing or unreadable B-factor to 0.0.

    Raises ValueError when the serial number, the residue number or a
    coordinate cannot be parsed.
    """
    atom = Atom()
    atom.is_hetatm = line.startswith('HETATM')
    atom.num = int(line[6:11])
    atom.type = line[12:16].strip(" ")

    element = ''.join(c for c in line[12:15]
                      if not c.isdigit() and c != " ")
    if element[:2] in two_char_elements:
        atom.element = element[:2]
    else:
        # slicing instead of element[0]: a blank name gives '' rather
        # than an IndexError
        atom.element = element[:1]

    atom.res_type = line[17:20]
    atom.chain_id = line[21]
    atom.res_num = int(line[22:26])
    atom.res_insert = line[26]
    if atom.res_insert == " ":
        atom.res_insert = ""

    x = float(line[30:38])
    y = float(line[38:46])
    z = float(line[46:54])
    atom.pos.set(x, y, z)

    # only a failed number conversion triggers the defaults
    try:
        atom.occupancy = float(line[54:60])
    except ValueError:
        atom.occupancy = 100.0
    try:
        atom.bfactor = float(line[60:66])
    except ValueError:
        atom.bfactor = 0.0
    return atom
219 |     
220 |     
def cmp_atom(a1, a2):
    """
    Three-way comparison of two atoms by their num attribute: -1 when a1
    comes first, 1 when a2 comes first and 0 when the numbers are equal.
    (Returning 0 for "greater", as before, reported unequal atoms as
    equal.)
    """
    if a1.num < a2.num:
        return -1
    if a1.num > a2.num:
        return 1
    return 0
226 | 
def pad_atom_type(in_atom_type):
    """
    Pads an atom name with blanks according to its length. Names of one,
    two or three characters are padded; a three-character name that
    starts with a digit is padded on the right, any other on the left.
    Names of any other length are returned unchanged.
    """
    size = len(in_atom_type)
    if size == 1:
        return " %s    " % in_atom_type
    if size == 2:
        return " %s " % in_atom_type
    if size == 3:
        if in_atom_type[0].isdigit():
            return "%s " % in_atom_type
        return " %s" % in_atom_type
    return in_atom_type
239 | 
class Atom:
    """
    Plain record of one atom: position and velocity vectors plus the
    naming, numbering, occupancy and B-factor fields of a PDB atom line.
    """
    def __init__(self):
        # vectors
        self.pos = Vector3d()
        self.vel = Vector3d()
        # identification
        self.num = 0
        self.type = ""
        self.element = ""
        self.is_hetatm = False
        self.chain_id = " "
        self.res_type = ""
        self.res_num = ""
        self.res_insert = ""
        # numeric properties
        self.mass = 0.0
        self.occupancy = 0.0
        self.bfactor = 0.0

    def pdb_str(self):
        "Tab-separated chain, residue type, residue number and B-factor"
        columns = (self.chain_id, self.res_type, self.res_num, self.bfactor)
        return "\t".join([str(column) for column in columns])

    def __str__(self):
        "Residue, atom type and position, e.g. for debugging output"
        values = (self.res_type, self.res_num, self.type,
                  self.pos.x, self.pos.y, self.pos.z)
        return "%s%s-%s (% .1f % .1f % .1f)" % values
265 |         
class Vector3d:
    """Mutable three-component vector with x, y and z attributes."""
    def __init__(self, x=0.0, y=0.0, z=0.0):
        self.x, self.y, self.z = x, y, z

    def __add__(self, rhs):
        "Component-wise sum, as a new vector"
        return Vector3d(rhs.x + self.x, rhs.y + self.y, rhs.z + self.z)

    def __sub__(self, rhs):
        "Component-wise difference, as a new vector"
        return Vector3d(self.x - rhs.x, self.y - rhs.y, self.z - rhs.z)

    def __neg__(self):
        "New vector with every component negated"
        return Vector3d(-self.x, -self.y, -self.z)

    def __pos__(self):
        "New vector with the same components"
        return Vector3d(self.x, self.y, self.z)

    def __eq__(self, rhs):
        "True when is_near_zero accepts the difference of every component"
        for axis in ("x", "y", "z"):
            if not is_near_zero(getattr(self, axis) - getattr(rhs, axis)):
                return False
        return True

    def __str__(self):
        components = (self.x, self.y, self.z)
        return "(% .2f, % .2f, % .2f)" % components

    def __repr__(self):
        components = (self.x, self.y, self.z)
        return "Vector3d(%f, %f, %f)" % components

    def set(self, x, y, z):
        "Overwrites the three components"
        self.x, self.y, self.z = x, y, z

    def copy(self):
        "New vector with the same components"
        return Vector3d(self.x, self.y, self.z)

    def length_sq(self):
        "Squared length"
        return self.x * self.x + self.y * self.y + self.z * self.z

    def length(self):
        "Length"
        return sqrt(self.x * self.x + self.y * self.y + self.z * self.z)

    def scale(self, scale):
        "Multiplies every component by scale, in place"
        self.x *= scale
        self.y *= scale
        self.z *= scale

    def normalize(self):
        "Scales the vector to unit length, in place"
        factor = 1.0 / self.length()
        self.scale(factor)

    def scaled_vec(self, scale):
        "Scaled copy; the vector itself is left untouched"
        scaled = self.copy()
        scaled.scale(scale)
        return scaled

    def normal_vec(self):
        "Unit-length copy"
        factor = 1.0 / self.length()
        return self.scaled_vec(factor)

    def parallel_vec(self, axis):
        """
        Component of this vector along axis, as a new vector. When the
        length of axis passes is_near_zero the vector itself is returned.
        """
        axis_len = axis.length()
        if is_near_zero(axis_len):
            return self
        return axis.scaled_vec(dot(self, axis)
                               / axis.length() / axis.length())

    def perpendicular_vec(self, axis):
        "This vector minus its component along axis"
        return self - self.parallel_vec(axis)

    def transform(self, matrix):
        """
        Applies matrix in place, reading its elemRC attributes: the rows
        0-2 are combined with x, y, z and row 3 is added as an offset.
        """
        px, py, pz = self.x, self.y, self.z
        self.x = (matrix.elem00 * px + matrix.elem10 * py
                  + matrix.elem20 * pz + matrix.elem30)
        self.y = (matrix.elem01 * px + matrix.elem11 * py
                  + matrix.elem21 * pz + matrix.elem31)
        self.z = (matrix.elem02 * px + matrix.elem12 * py
                  + matrix.elem22 * pz + matrix.elem32)
351 | 
def is_near_zero(a):
    """
    True when the magnitude of a is below SMALL. The absolute value is
    taken so that large negative numbers are no longer reported as zero.
    """
    return abs(a) < SMALL
354 |   
def dot(a, b):
    """Scalar product of two objects with x, y and z attributes."""
    along_x = a.x * b.x
    along_y = a.y * b.y
    along_z = a.z * b.z
    return along_x + along_y + along_z
357 | 
358 | 


--------------------------------------------------------------------------------
/src/SEQ.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | import src.SASA as sasa
  9 | from src.UTILS import aa_list, aa_symbols
 10 | from Parameters import LoadParameters as LP
 11 | import time
 12 | from os import remove
 13 | from shutil import copyfile
 14 | from urllib import urlopen
 15 | from Bio import SeqIO, Entrez
 16 | from Bio.Alphabet import IUPAC
 17 | from Bio.PDB.PDBParser import PDBParser
 18 | Entrez.email = "entrez@mail.com"
 19 | 
 20 | class sequence:
 21 |     """
 22 |     Main code for handling sequences and structures.
 23 |     """
 24 |     def __init__(self, file1, file2, id1, id2, chain1, chain2, parameterfile, 
 25 |                  dirname):
 26 |         self.file1 = file1
 27 |         self.file2 = file2
 28 |         self.chain1 = chain1
 29 |         self.chain2 = chain2
 30 |         self.id1 = id1
 31 |         self.id2 = id2
 32 |         self.parameterfile = parameterfile
 33 |         self.dirname = dirname
 34 |         
 35 |     def __call__(self, file1, file2, id1, id2, chain1, chain2, parameterfile,
 36 |                  dirname):
 37 |         self.file1 = file1
 38 |         self.file2 = file2
 39 |         self.chain1 = chain1
 40 |         self.chain2 = chain2
 41 |         self.id1 = id1
 42 |         self.id2 = id2
 43 |         self.parameterfile = parameterfile
 44 |         self.dirname = dirname
 45 |     
 46 |     def validFASTA(self, file, id):
 47 |         "Checks if the input file is a valid FASTA file"
 48 |         
 49 |         try:
 50 |             input = str(self.dirname + file)
 51 |             SeqIO.read(input, "fasta", IUPAC.protein)
 52 |         except:
 53 |             try:
 54 |                 "Fetches a sequence according to GI identifier or UniProt ID"
 55 |                 fetch = Entrez.efetch(db="protein", id=id, rettype="fasta")
 56 |                               
 57 |                 output = str(self.dirname + file)
 58 |                 out = open(output, "w")
 59 |                 out.write(fetch.read())
 60 |                 out.close()
 61 |                 read = SeqIO.parse(output, "fasta", IUPAC.protein)
 62 |                 for record in read:
 63 |                     sequence = str(record.seq)
 64 |                 out = open(output, "w")
 65 |                 print >> out, ">Query_id" + "\n" + sequence + "\n"
 66 |                 out.close()
 67 |                 
 68 |             except:                           
 69 |                 raise StandardError, "%s - Invalid sequence identifier or sequence file" % (id)                          
 70 |     
 71 |     def queryFASTA(self, file, id):
 72 |         "Changes FASTA original header to 'Query_id'"
 73 |         
 74 |         input = str(self.dirname + file)
 75 |         input_sequence = SeqIO.parse(input, "fasta", IUPAC.protein)
 76 |         for record in input_sequence:
 77 |             sequence = str(record.seq)
 78 |             break
 79 |             
 80 |         output = str(self.dirname + id + ".fa")
 81 |         out = open(output, "w")
 82 |         print >> out, ">Query_id" + "\n" + sequence + "\n"
 83 |         out.close()
 84 |         input_sequence.close()
 85 |         
 86 |         remove(input)
 87 |         
 88 |     def validPDB(self, file, id, chain):
 89 |         "Checks if the input file is a valid PDB file"
 90 |     
 91 |         try:
 92 |             input = str(self.dirname + file)
 93 |             PDBParser().get_structure(id, input)
 94 |             try:
 95 |                 test_structure = PDBParser().get_structure(id, input)
 96 |                 test_model = test_structure[0]
 97 |                 test_model[chain]
 98 |             except: 
 99 |                 raise StandardError, "%s - Invalid chain" % (chain)
100 |         except: 
101 |             try:
102 |                 "Fetches a PDB file from the RCSB Protein Databank"
103 |                 url = 'http://www.rcsb.org/pdb/files/%s.pdb' % id
104 |                 read = urlopen(url).read()
105 |                 pdb = open(self.dirname + file, "w")
106 |                 pdb.write(read)
107 |                 pdb.close()
108 |                 input = str(self.dirname + file)
109 |                 PDBParser().get_structure(id, input)
110 |                 try:
111 |                     test_structure = PDBParser().get_structure(id, input)
112 |                     test_model = test_structure[0]
113 |                     test_model[chain]
114 |                 except: 
115 |                     raise StandardError, "%s - Invalid chain" % (chain)
116 |             except:                             
117 |                 raise StandardError, "%s - Invalid PDB ID or PDB file" % (id)                            
118 |         
119 |     def sequencePDB(self, file, id, chain):
120 |         "Extracts a sequence from the ATOM lines of a PDB file"
121 |         
122 |         # sequence from atom lines
123 |         input = str(self.dirname + file)
124 |         input_structure = open(input, "r")
125 |         structure = input_structure.readlines()
126 |         input_structure.close()
127 |         string = ""
128 |         for line in structure:
129 |             if line[0:4] == "ATOM":
130 |                 if line[21] == str(chain):
131 |                     CA = line[13:16]
132 |                     res = line[17:20]
133 |                     if CA == "CA ":
134 |                         if res in aa_list:
135 |                             string += aa_symbols[res]
136 |                         else: pass
137 |         sequence = string
138 |                         
139 |         output = str(self.dirname + id + ".fasta")
140 |         out = open(output, "w")
141 |         print >> out, ">Query_id" + "\n" + sequence + "\n"
142 |         out.close()
143 |         
144 |         # full sequence from acession number on DBREF lines
145 |         input = str(self.dirname + file)
146 |         input_structure = open(input, "r")
147 |         structure = input_structure.readlines()
148 |         input_structure.close()
149 |         
150 |         for line in structure:
151 |             if line[0:5] == "DBREF":
152 |                 if line[21] == str(chain):
153 |                     data = line.split()
154 |                     ch = data[2]
155 |                     if ch == chain:
156 |                         ac_number = data[6]
157 |         try:
158 |             fetch = Entrez.efetch(db="protein", id=ac_number, rettype="fasta")
159 |                           
160 |             output = str(self.dirname + id + ".fa")
161 |             out = open(output, "w")
162 |             out.write(fetch.read())
163 |             out.close()
164 |             read = SeqIO.parse(output)
165 |             for record in read:
166 |                 sequence = str(record.seq)
167 |             out = open(output, "w")
168 |             print >> out, ">Query_id" + "\n" + sequence + "\n"
169 |             out.close()
170 |         except:
171 |             copyfile(self.dirname + id + ".fasta", self.dirname + id + ".fa")
172 | 
173 |     def surfacePDB(self, file, id, chain):
174 |         """"
175 |         Points out surface residues in a PDB file (ASA > 7% (A^2))*
176 |         *De et al.,2005. http://www.biomedcentral.com/1472-6807/5/15
177 |         """
178 |         
179 |         input = str(self.dirname + file)
180 |         input_structure = open(input, "r")
181 |         structure = input_structure.readlines()
182 |         input_structure.close()
183 |         
184 |         input_final = str("./" + file)
185 |         out = open(input_final, "w")
186 |         
187 |         for line in structure:
188 |             if line[0:4] == "ATOM":
189 |                 if line[21] == str(chain):
190 |                     res = line[17:20]
191 |                     res = res.rstrip()
192 |                     res = res.lstrip()
193 |                     if str(res) in aa_list:
194 |                         print >> out, line.rstrip("\n")
195 |         out.close()
196 |         
197 |         output = str("./" + file + ".txt")
198 |         sasa.SASA(input_final, output)
199 |         
200 |         list = []
201 |         input = str("./" + file + ".txt")
202 |         op = open(input)
203 |         read = op.readlines()
204 |         for line in read:
205 |             line = line.rstrip()
206 |             line = line.split()
207 |             if line[0] == chain:
208 |                 amino = str(line[1])
209 |                 res = int(line[2])
210 |                 area = float(line[3])
211 |                 if amino in aa_list:
212 |                     info = [amino, res, area]
213 |                     list.append(info)
214 |         
215 |         
216 |         threshold = LP(self.parameterfile, "surface_threshold")
217 |         surface = []
218 |         asa_list = []
219 |         total = 0
220 |         for i in range(0, (len(list) - 1), 1):
221 |             if list[i][0] == list[i + 1][0]:
222 |                 total += list[i][2]
223 |             else:
224 |                 amino = str(list[i][0])
225 |                 res = list[i][1]
226 |                 area = total + list[i][2]
227 |                 value = [amino, res, area]
228 |                 asa_list.append(area)
229 |                 surface.append(value)
230 |                 total = 0
231 |                 pass
232 |         
233 |         output = str(self.dirname + id + ".surface")
234 |         out = open(output, "w")
235 |         
236 |         asa_max = int(round(float(max(asa_list))))
237 |         thrd = threshold * asa_max * 1.0 / 100
238 |         for i in range(len(surface)):
239 |             amino = str(surface[i][0])
240 |             res = str(surface[i][1])
241 |             area = float(surface[i][2])
242 |             if area > thrd:
243 |                 print >> out, amino, res + "\t" + str(area)
244 |         out.close()
245 |         
246 |         time.sleep(2)        
247 |         try:
248 |             remove("./" + file)
249 |         except: pass
250 |         try:
251 |             remove("./" + file + ".txt")
252 |         except: pass
253 |         
254 |     def parseSurfacePDB(self, id):
255 |         "Parses residues at the surface level" 
256 |         
257 |         input = str(self.dirname + id + ".surface")
258 |         input_surface = open(input, "r")
259 |         surface = input_surface.readlines()
260 |         input_surface.close()
261 |         
262 |         surface_points = []
263 |         for line in surface:
264 |             l = line.split()
265 |             res_nb = int(l[1])
266 |             surface_points.append(res_nb)
267 |             
268 |         return surface_points
269 |         
270 |     
271 |     def matchResiduePosition(self, id, chain):
272 |         "Gets residue positions for use in coevolution analysis"
273 |         
274 |         input = str(self.dirname + id + ".pdb")
275 |         input_structure = open(input, "r")
276 |         structure = input_structure.readlines()
277 |         input_structure.close()
278 |                 
279 |         protein = []
280 |         for line in structure:
281 |             if line[0:4] == "ATOM":
282 |                 if line[21] == str(chain):
283 |                     CA = line[13:16]
284 |                     res_nb = line[22:26]
285 |                     if CA == "CA ":
286 |                         res_nb = line[22:26]
287 |                         res = line[17:20]
288 |                         res = res.rstrip()
289 |                         res = res.lstrip()
290 |                         if str(res) in aa_list:
291 |                             protein.append(int(res_nb))
292 |         return protein
293 |     
294 |     def copySequence(self, id):
295 |         "Doubles the sequence files"
296 |         
297 |         copyfile(self.dirname + id + ".fa", self.dirname + id + "_1.fa")
298 |         copyfile(self.dirname + id + ".fa", self.dirname + id + "_2.fa")
299 |         remove(self.dirname + id + ".fa")
300 |         return
301 | 
302 | 


--------------------------------------------------------------------------------
/src/UTILS.py:
--------------------------------------------------------------------------------
  1 | ﻿###############################################################################
  2 | # Encoding utf-8                                                              #
  3 | # F. Madeira and L. Krippahl, 2012                                            #
  4 | # This code is part of Pycoevol distribution.                                 #
  5 | # This work is public domain.                                                 #
  6 | ###############################################################################
  7 | 
  8 | """
  9 | Utilities used in some routines.
 10 | """
 11 | 
 12 | import sys
 13 | 
 14 | def Flash(message):
 15 |     print message
 16 |     sys.stdout.flush()
 17 | 
 18 | aa = ['A','C','D','E','F','G','H','K','I','L','M','N','P','Q','R','S','T','V','Y','W']
 19 | 
 20 | aa_list = ['ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'LYS', 'ILE', 'LEU',
 21 |            'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TYR', 'TRP']
 22 | 
 23 | aa_symbols =  {'ALA':'A','CYS':'C','ASP':'D',
 24 |          'GLU':'E','PHE':'F','GLY':'G',
 25 |          'HIS':'H','LYS':'K','ILE':'I',
 26 |          'LEU':'L','MET':'M','ASN':'N',
 27 |          'PRO':'P','GLN':'Q','ARG':'R',
 28 |          'SER':'S','THR':'T','VAL':'V',
 29 |          'TYR':'Y','TRP':'W','XXX':'X'}      
 30 |   
 31 | # amino acid properties      
 32 | aa_hydrofobic = ['A','F','G','I','L','M','P','V','W']
 33 | aa_hydrofile = ['C','N','Q','S','T','Y']
 34 | aa_basic = ['H','K','R']
 35 | aa_acid = ['D','E']
 36 | aa_polar = ['S','T','Q','C','E','Y','D','K','H','R','N']
 37 | aa_non_polar = ['A','V','L','I','G','W','F','P','M']
 38 | aa_charged = ['H','R','K','D','E']
 39 | 
 40 | # amino acid reduction alphabets
 41 | # Caporaso, J. G., Smit, S., Easton, B. C., Hunter, L., Huttley, G. a, 
 42 | # Knight, R. (2008). Detecting coevolution without phylogenetic trees? 
 43 | # Tree-ignorant metrics of coevolution perform as well as tree-aware 
 44 | # metrics. BMC evolutionary biology, 8, 327. doi:10.1186/1471-2148-8-327
 45 | # {'A', 'D', 'K'} 
 46 | charge = {'A':'A','C':'A','D':'D',
 47 |           'E':'D','F':'A','G':'A',
 48 |           'H':'A','K':'K','I':'A',
 49 |           'L':'A','M':'A','N':'A',
 50 |           'P':'A','Q':'A','R':'K',
 51 |           'S':'A','T':'A','V':'A',
 52 |           'Y':'A','W':'A','-':'-'}
 53 | 
 54 | # {'A','D','K'}
 55 | charge_his = {'A':'A','C':'A','D':'D',
 56 |               'E':'D','F':'A','G':'A',
 57 |               'H':'K','K':'K','I':'A',
 58 |               'L':'A','M':'A','N':'A',
 59 |               'P':'A','Q':'A','R':'K',
 60 |               'S':'A','T':'A','V':'A',
 61 |               'Y':'A','W':'A','-':'-'}
 62 | 
 63 | # {'A','D','G','K'}
 64 | polarity = {'A':'A','C':'G','D':'D',
 65 |               'E':'D','F':'A','G':'G',
 66 |               'H':'K','K':'K','I':'A',
 67 |               'L':'A','M':'A','N':'G',
 68 |               'P':'A','Q':'G','R':'K',
 69 |               'S':'G','T':'G','V':'A',
 70 |               'Y':'G','W':'A','-':'-'}
 71 | 
 72 | # {'A','D','G'} 
 73 | hydropathy = {'A':'A','C':'A','D':'D',
 74 |               'E':'D','F':'A','G':'G',
 75 |               'H':'D','K':'D','I':'A',
 76 |               'L':'A','M':'A','N':'D',
 77 |               'P':'A','Q':'D','R':'D',
 78 |               'S':'G','T':'G','V':'A',
 79 |               'Y':'G','W':'G','-':'-'}
 80 | 
 81 | # Hydrophobicity scale:
 82 | # Kyte J and Doolittle RF: A simple method for displaying the 
 83 | # hydropathic character of a protein. J Mol Biol 157:105, 1982.
 84 | kyte_doolittle = {'A':1.8,'C':2.5,'D':-3.5,
 85 |                   'E':-3.5,'F':2.8,'G':-0.4,
 86 |                   'H':-3.2,'K':-3.9,'I':4.5,
 87 |                   'L':3.8,'M':1.9,'N':-3.5,
 88 |                   'P':-1.6,'Q':-3.5,'R':-4.5,
 89 |                   'S':-0.8,'T':-0.7,'V':4.2,
 90 |                   'Y':-1.3,'W':-0.9}
 91 | 
 92 | # Hoop TP and Woods KR: Prediction of protein antigenic determinants 
 93 | # from amino acid sequences. Proc Natl Acad Sci USA 78:3824, 1981. 
 94 | hopp_woods = {'A':-0.5,'C':-1.0,'D':3.0,
 95 |               'E':3.0,'F':-2.5,'G':0.0,
 96 |               'H':-0.5,'K':3.0,'I':-1.8,
 97 |               'L':-1.8,'M':-1.3,'N':0.2,
 98 |               'P':0.0,'Q':0.2,'R':3.0,
 99 |               'S':0.3,'T':-0.4,'V':-1.5,
100 |               'Y':-2.3,'W':-3.4}  
101 | 
# D. Eisenberg; R. M. Weiss & T. C. Terwilliger:
# The hydrophobic moment detects periodicity in protein hydrophobicity.
# Proc Natl Acad Sci U S A, 81, 140-144
# Maps one-letter residue code -> scale value.
# NOTE: keys in this file run ...,'H','K','I',... and ...,'Y','W', which is
# NOT alphabetical. The values for K/I and for Y/W used to be transposed
# (copied from an alphabetically ordered table); the published values are
# Ile = 1.38, Lys = -1.50, Trp = 0.81, Tyr = 0.26.
eisenberg = {'A':0.62,'C':0.29,'D':-0.9,
            'E':-0.74,'F':1.19,'G':0.48,
            'H':-0.4,'K':-1.5,'I':1.38,
            'L':1.06,'M':0.64,'N':-0.78,
            'P':0.12,'Q':-0.85,'R':-2.53,
            'S':-0.18,'T':-0.05,'V':1.08,
            'Y':0.26,'W':0.81}
112 | 
# D. M. Engelman; T. A. Steitz & A. Goldman:
# Identifying nonpolar transbilayer helices in amino acid sequences of
# membrane proteins. Annu Rev Biophys Biophys Chem, 15, 321-353
# Maps one-letter residue code -> scale value.
# NOTE: keys in this file run ...,'H','K','I',... and ...,'Y','W', which is
# NOT alphabetical. Three defects were corrected against the published scale:
#   * 'E' was -82 (dropped decimal point); the published value is -8.2.
#   * K/I were transposed: Ile = 3.1, Lys = -8.8.
#   * Y/W were transposed: Trp = 1.9, Tyr = -0.7.
engelman = {'A':1.6,'C':2.0,'D':-9.2,
            'E':-8.2,'F':3.7,'G':1.0,
            'H':-3.0,'K':-8.8,'I':3.1,
            'L':2.8,'M':3.4,'N':-4.8,
            'P':-0.2,'Q':-4.1,'R':-12.3,
            'S':0.6,'T':1.2,'V':2.6,
            'Y':-0.7,'W':1.9}
123 | 
# J. L. Cornette; K. B. Cease; H. Margalit; J. L. Spouge; J. A. Berzofsky & C. DeLisi:
# Hydrophobicity scales and computational techniques for detecting amphipathic
# structures in proteins. J Mol Biol, 195, 659-685
# Maps one-letter residue code -> scale value.
# NOTE: keys in this file run ...,'H','K','I',... and ...,'Y','W', which is
# NOT alphabetical. The values for K/I and for Y/W used to be transposed
# (copied from an alphabetically ordered table); the published values are
# Ile = 4.8, Lys = -3.1, Trp = 1.0, Tyr = 3.2.
cornette = {'A':0.2,'C':4.1,'D':-3.1,
            'E':-1.8,'F':4.4,'G':0.0,
            'H':0.5,'K':-3.1,'I':4.8,
            'L':5.7,'M':4.2,'N':-0.5,
            'P':-2.2,'Q':-2.8,'R':1.4,
            'S':-0.5,'T':-1.9,'V':4.7,
            'Y':3.2,'W':1.0}
134 | 
135 | 
# Amino acid volumes (one-letter residue code -> volume).
# Laguerre method with water. Esque et al, 2010
# NOTE(review): the unit is not stated here -- confirm against the paper.
volume = {
    'N': 125.2,   # Asn
    'P': 122.1,   # Pro
    'Q': 148.1,   # Gln
    'A': 88.2,    # Ala
    'R': 188.8,   # Arg
    'S': 95.5,    # Ser
    'C': 113.3,   # Cys
    'T': 118.4,   # Thr
    'D': 113.4,   # Asp
    'E': 134.8,   # Glu
    'V': 134.5,   # Val
    'F': 192.0,   # Phe
    'W': 227.3,   # Trp
    'G': 65.3,    # Gly
    'H': 159.2,   # His
    'Y': 197.6,   # Tyr
    'I': 157.7,   # Ile
    'K': 164.2,   # Lys
    'L': 158.7,   # Leu
    'M': 164.9,   # Met
}
145 | 
# Atomic radii, keyed by upper-case element symbol.
# NOTE(review): the '.' entry looks like a fallback for unknown elements and
# the values look like van der Waals radii in Angstrom -- confirm against the
# code that reads this table.
radii = {
    'H': 1.20,
    'N': 1.55,
    'NA': 2.27,
    'CU': 1.40,
    'CL': 1.75,
    'C': 1.70,
    'O': 1.52,
    'I': 1.98,
    'P': 1.80,
    'B': 1.85,
    'BR': 1.85,
    'S': 1.80,
    'SE': 1.90,
    'F': 1.47,
    'FE': 1.80,
    'K': 2.75,
    'MN': 1.73,
    'MG': 1.73,
    'ZN': 1.39,
    'HG': 1.8,
    'XE': 1.8,
    'AU': 1.8,
    'LI': 1.8,
    '.': 1.8,
}
155 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | ﻿###############################################################################
2 | # Encoding utf-8                                                              #
3 | # F. Madeira and L. Krippahl, 2012                                            #
4 | # This code is part of Pycoevol distribution.                                 #
5 | # This work is public domain.                                                 #
6 | ###############################################################################


--------------------------------------------------------------------------------
/src/tools/blast+/db/refseq_protein.pal:
--------------------------------------------------------------------------------
1 | #
2 | # Alias file created: Jun 26, 2011  8:38 PM
3 | #
4 | # Edit this file to reflect the location of your database
5 | # Get the database at ftp://ftp.ncbi.nih.gov/blast/db/
6 | #
7 | TITLE  NCBI Protein Reference Sequences
8 | DBLIST ./Pycoevol/src/tools/Blast+/db/refseq_protein.00 ./Pycoevol/src/tools/Blast+/db/refseq_protein.01 ./Pycoevol/src/tools/Blast+/db/refseq_protein.02 ./Pycoevol/src/tools/Blast+/db/refseq_protein.03
9 | 


--------------------------------------------------------------------------------
/src/tools/blast+/psiblast_here:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/blast+/psiblast_here


--------------------------------------------------------------------------------
/src/tools/clustalw/clustalw_here:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/clustalw/clustalw_here


--------------------------------------------------------------------------------
/src/tools/mafft/mafft_here:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/mafft/mafft_here


--------------------------------------------------------------------------------
/src/tools/muscle/muscle_here:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biomadeira/pycoevol/8c9ef916abccc29656e4b6c8be9ae920aa06a119/src/tools/muscle/muscle_here


--------------------------------------------------------------------------------