├── .gitignore ├── .travis.yml ├── AGEpy ├── AGEpy.py ├── __init__.py ├── bed.py ├── biom.py ├── blast.py ├── cytoscape.py ├── david.py ├── fasta.py ├── go.py ├── gtf.py ├── homology.py ├── kegg.py ├── meme.py ├── plots.py ├── rbiom.py └── sam.py ├── Dockerfile ├── LICENSE ├── README.md ├── README.rst ├── bin ├── QC_plots ├── aDiff ├── abed ├── blasto ├── david └── obo2tsv ├── conf.py ├── docs ├── .DS_Store ├── cookbook.md ├── executables │ ├── abed.md │ ├── adiff.md │ ├── blasto.md │ ├── david.md │ └── obo2tsv.md ├── index.md └── modules │ ├── MA1.png │ ├── MA2.png │ ├── MA3.png │ ├── bed.md │ ├── biom.md │ ├── blast.md │ ├── cellplot.CellPlot.png │ ├── cytoscape.md │ ├── david.md │ ├── fasta.md │ ├── go.md │ ├── gtf.md │ ├── homology.md │ ├── kegg.md │ ├── meme.md │ ├── p53.png │ ├── plots.md │ ├── sam.md │ └── symplot.SymPlot.png ├── index.html ├── mkdocs.yml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | /build/ 7 | site 8 | 9 | # Python egg metadata, regenerated from source files by setuptools. 10 | /*.egg-info 11 | 12 | # Other 13 | *.swp 14 | .DS_Store 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.7 3 | dist: xenial 4 | sudo: true 5 | 6 | install: 7 | - pip3 install . 8 | 9 | script: 10 | - echo "TODO" 11 | -------------------------------------------------------------------------------- /AGEpy/AGEpy.py: -------------------------------------------------------------------------------- 1 | """Bioinformatics tools developed at the Max Planck Institute for Biology of Ageing""" 2 | from .bed import * 3 | from .biom import * 4 | from .david import * 5 | from .fasta import * 6 | from .go import * 7 | from .gtf import * 8 | from .homology import * 9 | from .kegg import * 10 | from .meme import * 11 | from .plots import * 12 | from .rbiom import * 13 | from .sam import * 14 | from .cytoscape import * 15 | from .blast import * 16 | -------------------------------------------------------------------------------- /AGEpy/__init__.py: -------------------------------------------------------------------------------- 1 | """Bioinformatics tools developed at the Max Planck Institute for Biology of Ageing""" 2 | from .bed import * 3 | from .biom import * 4 | from .david import * 5 | from .fasta import * 6 | from .go import * 7 | from .gtf import * 8 | from .homology import * 9 | from .kegg import * 10 | from .meme import * 11 | from .plots import * 12 | from .rbiom import * 13 | from .sam import * 14 | from .cytoscape import * 15 | from .blast import * 16 | -------------------------------------------------------------------------------- /AGEpy/bed.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | #from urllib import urlopen # python2 6 | #import urllib2 # python2 7 | import urllib.request as urllib2 8 | #import StringIO python2 9 | from io import StringIO 10 | import gzip 11 | try: 12 | import pybedtools 13 | from pybedtools import BedTool 14 | except: 15 | print("pybedtools could not be imported") 16 | sys.stdout.flush() 17 | from .gtf import GTFtoBED 18 | from .gtf import readGTF 19 | from .gtf import retrieve_GTF_field 20 | 21 | 22 | def 
writeBED(inBED, file_path):
23 | """
24 | Writes a bed dataframe into a bed file.
25 | Bed format: 'chrom','chromStart','chromEnd','name','score','strand'
26 | 
27 | :param inBED: bed dataframe to be written.
28 | :param file_path: /path/to/file.bed
29 | 
30 | :returns: nothing
31 | 
32 | """
33 | inBED.to_csv(file_path,index=None,sep="\t",header=None)
34 | 
35 | def GetBEDnarrowPeakgz(URL_or_PATH_TO_file):
36 | """
37 | Reads a gz compressed BED narrow peak file from a web address or local file
38 | 
39 | :param URL_or_PATH_TO_file: web address or path to a local file
40 | 
41 | :returns: a Pandas dataframe
42 | """
43 | 
44 | if os.path.isfile(URL_or_PATH_TO_file):
45 | # local file: read the gzip-compressed bytes directly
46 | response=open(URL_or_PATH_TO_file, "rb")
47 | compressed=response.read()
48 | else:
49 | response = urllib2.urlopen(URL_or_PATH_TO_file)
50 | compressed=response.read()
51 | out=gzip.decompress(compressed).decode("utf-8").split("\n")
52 | out=[ s.split("\t") for s in out]
53 | out=pd.DataFrame(out)
54 | out.columns=["chrom","chromStart","chromEnd","name","score","strand","signalValue","-log10(pValue)","-log10(qvalue)","peak"]
55 | out["name"]=out.index.tolist()
56 | out["name"]="Peak_"+out["name"].astype(str)
57 | out=out[:-1]
58 | return out
59 | 
60 | def dfTObedtool(df):
61 | """
62 | Transforms a pandas dataframe into a bedtool
63 | 
64 | :param df: Pandas dataframe
65 | 
66 | :returns: a bedtool
67 | """
68 | 
69 | df=df.astype(str)
70 | df=df.drop_duplicates()
71 | df=df.values.tolist()
72 | df=["\t".join(s) for s in df ]
73 | df="\n".join(df)
74 | df=BedTool(df, from_string=True)
75 | return df
76 | 
77 | def GetPeaksExons(bed,parsedGTF):
78 | """
79 | Annotates a BED narrow peak dataframe with exon, transcript and gene information
80 | 
81 | :param bed: a pandas dataframe in bed format
82 | :param parsedGTF: a parsed GTF dataframe as output by parseGTF(), containing at least the 'feature', 'exon_id', 'transcript_id' and 'gene_id' columns
83 | 
84 | :returns: a Pandas dataframe
85 | """
86 | 
87 | bedtool_AB=dfTObedtool(bed)
88 | 
89 | exonsGTF=parsedGTF[parsedGTF["feature"]=="exon"]
90 | exonsGTF.reset_index(inplace=True, drop=True)
91 | 
92 | exonsBED=GTFtoBED(exonsGTF, "exon_id")
93 | exonsBED.columns=['chrom', 'chromStart', 'chromEnd', 'exon_id', 'score', 'strand']
94 | exonsBEDcols=exonsBED.columns.tolist()
95 | 
96 | bedcols=bed.columns.tolist()
97 | exonsBEDcols_=[]
98 | for c in exonsBEDcols:
99 | if c in bedcols:
100 | exonsBEDcols_.append(c+"_exon")
101 | else:
102 | exonsBEDcols_.append(c)
103 | 
104 | cols=[bedcols,exonsBEDcols_,["overlap"] ]
105 | cols=[item for sublist in cols for item in sublist]
106 | 
107 | bedtool_exons=dfTObedtool(exonsBED)
108 | 
109 | bedtool_target_exons=bedtool_AB.intersect(bedtool_exons, wo=True, s=True)
110 | dfTargetE=pd.read_table(bedtool_target_exons.fn, names=cols)
111 | ExonsTransGenes=parsedGTF[["exon_id","transcript_id","gene_id"]].drop_duplicates()
112 | dfTargets=pd.merge(dfTargetE,ExonsTransGenes,on=["exon_id"],how="left")
113 | dfTargets["count"]=1
114 | 
115 | def getCounts(df,field):
116 | """
117 | For each field in a bed narrow peak returns the number of times that field is present,\
118 | the normalized mean of the '-log10(pValue)' and normalized mean of the signal value.
119 | 
120 | :param df: a Pandas dataframe of a bed narrow peak
121 | :param field: field to analyse, e.g. exons or transcripts
122 | 
123 | :returns: a Pandas dataframe
124 | """
125 | 
126 | tmp=df[[field,'name',"count"]].drop_duplicates()
127 | tmp=tmp.drop(["name"],axis=1)
128 | tmp["count"]=tmp["count"].astype(int)
129 | tmp.columns=[field,"%s_count" %str(field)]
130 | tmp=tmp.groupby(field, as_index=False).sum()
131 | df=pd.merge(df,tmp,on=field,how="left")
132 | 
133 | tmp=df[[field,'name',"-log10(pValue)"]].drop_duplicates()
134 | tmp=tmp.drop(["name"],axis=1)
135 | tmp["-log10(pValue)"]=tmp["-log10(pValue)"].astype(float)
136 | tmp=tmp.groupby(field).apply(lambda l: l["-log10(pValue)"].prod() )
137 | tmp=pd.DataFrame(tmp)
138 | tmp.reset_index(inplace=True,drop=False)
139 | tmp.columns=[field,"%s norm. mean -log10(pValue)" %str(field)]
140 | df=pd.merge(df,tmp,on=field,how="left")
141 | 
142 | tmp=df[[field,'name',"signalValue"]].drop_duplicates()
143 | tmp=tmp.drop(["name"],axis=1)
144 | tmp["signalValue"]=tmp["signalValue"].astype(float)
145 | tmp=tmp.groupby(field).apply(lambda l: l["signalValue"].prod() )
146 | tmp=pd.DataFrame(tmp)
147 | tmp.reset_index(inplace=True,drop=False)
148 | tmp.columns=[field,"%s signalValue" %str(field)]
149 | df=pd.merge(df,tmp,on=field,how="left")
150 | 
151 | return df
152 | 
153 | for f in ["exon_id","transcript_id"]:
154 | dfTargets=getCounts(dfTargets,f)
155 | 
156 | def getCounts_GeneIDs(df):
157 | """
158 | For each gene id in a bed narrow peak returns the number of times that gene id is present,\
159 | the normalized mean of the '-log10(pValue)' and normalized mean of the signal value.
160 | 
161 | :param df: a Pandas dataframe of a bed narrow peak
162 | 
163 | :returns: a Pandas dataframe
164 | """
165 | 
166 | field="gene_id"
167 | 
168 | tmp=df[[field,"transcript_id","transcript_id_count"]].drop_duplicates()
169 | tmp=tmp.drop(["transcript_id"],axis=1)
170 | tmp["transcript_id_count"]=tmp["transcript_id_count"].astype(int)
171 | tmp.columns=[field,"%s_count" %str(field)]
172 | tmp=tmp.groupby(field, as_index=False).sum()
173 | df=pd.merge(df,tmp,on=field,how="left")
174 | 
175 | tmp=df[[field,'transcript_id',"transcript_id norm. mean -log10(pValue)"]].drop_duplicates()
176 | tmp=tmp.drop(["transcript_id"],axis=1)
177 | tmp["transcript_id norm. mean -log10(pValue)"]=tmp["transcript_id norm. mean -log10(pValue)"].astype(float)
178 | tmp.columns=[field,"%s norm. mean -log10(pValue)" %str(field)]
179 | tmp=tmp.groupby(field, as_index=False).sum()
180 | df=pd.merge(df,tmp,on=field,how="left")
181 | 
182 | 
183 | 
184 | tmp=df[[field,'transcript_id',"transcript_id signalValue"]].drop_duplicates()
185 | tmp=tmp.drop(["transcript_id"],axis=1)
186 | tmp["transcript_id signalValue"]=tmp["transcript_id signalValue"].astype(float)
187 | tmp.columns=[field,"%s signalValue" %str(field)]
188 | tmp=tmp.groupby(field, as_index=False).sum()
189 | df=pd.merge(df,tmp,on=field,how="left")
190 | 
191 | return df
192 | 
193 | dfTargets=getCounts_GeneIDs(dfTargets)
194 | 
195 | 
196 | dfTargets=dfTargets.drop(["count"],axis=1)
197 | return dfTargets
198 | 
199 | def AnnotateBED(bed, GTF, genome_file, bedcols=None, promoter=[1000,200]):
200 | """
201 | Annotates a bed file.
202 | 
203 | :param bed: either a /path/to/file.bed or a Pandas dataframe in bed format. /path/to/file.bed implies bedcols.
204 | :param GTF: /path/to/file.gtf
205 | :param genome_file: /path/to/file.genome - a tab separated file with chromosome name and size information
206 | :param bedcols: a comma separated string of column headers to use when reading in a bed file.
eg: "chr,start,end,name" 207 | :param promoter: a list containing the upstream start of the promoter region from the TSS and the downstream end of the promoter region from the TSS. 208 | 209 | :returns: a Pandas dataframe with the annotated bed file. exons and promoters will be reported as well in the annotated_gene_features column. 210 | """ 211 | if type(bed) == type("string"): 212 | bed=pd.read_table(bed,header=None) 213 | bed.columns=bedcols.split(",") 214 | 215 | print("Reading GTF file.") 216 | sys.stdout.flush() 217 | 218 | GTF=readGTF(GTF) 219 | GTF["gene_name"]=retrieve_GTF_field("gene_name", GTF) 220 | GTF["gene_id"]=retrieve_GTF_field("gene_id", GTF) 221 | GTF["gene_name"]=GTF["gene_name"]+"/"+GTF["gene_id"] 222 | GTF=GTF.drop(["gene_id"],axis=1) 223 | 224 | print("Generating promoters annotation.") 225 | sys.stdout.flush() 226 | 227 | promoters=GTF[GTF["feature"]=="transcript"] 228 | promoters_plus=promoters[promoters["strand"]=="+"] 229 | promoters_minus=promoters[promoters["strand"]=="-"] 230 | 231 | upstream=promoter[0] 232 | downstream=promoter[1] 233 | 234 | promoters_plus.loc[:,"promoter_start"]=promoters_plus.loc[:,"start"].astype(int)-upstream 235 | promoters_plus.loc[:,"promoter_end"]=promoters_plus.loc[:,"start"].astype(int)+downstream 236 | 237 | promoters_minus.loc[:,"promoter_start"]=promoters_minus["end"].astype(int)-downstream 238 | promoters_minus.loc[:,"promoter_end"]=promoters_minus["end"].astype(int)+upstream 239 | 240 | promoters=pd.concat([promoters_plus,promoters_minus]) 241 | 242 | promoters=promoters[["seqname","feature","promoter_start","promoter_end","gene_name"]] 243 | promoters.columns=["seqname","feature","start","end","gene_name"] 244 | 245 | promoters.loc[:,"feature"]="promoter" 246 | promoters.drop_duplicates(inplace=True) 247 | promoters.reset_index(inplace=True, drop=True) 248 | 249 | chr_sizes=pd.read_table(genome_file,header=None) 250 | chr_sizes.columns=["seqname","size"] 251 | chr_sizes.loc[:,"seqname"]=chr_sizes["seqname"].astype(str) 252 | promoters.loc[:,"seqname"]=promoters["seqname"].astype(str) 253 | 254 | promoters=pd.merge(promoters,chr_sizes,how="left",on=["seqname"]) 255 | def CorrectStart(df): 256 | s=df["start"] 257 | if s < 0: 258 | s=0 259 | return s 260 | 261 | def CorrectEnd(df): 262 | s=df["end"] 263 | e=df["size"] 264 | if s > e: 265 | s=e 266 | return s 267 | 268 | promoters.loc[:,"start"]=promoters.apply(CorrectStart,axis=1) 269 | promoters.loc[:,"end"]=promoters.apply(CorrectEnd,axis=1) 270 | 271 | promoters.drop(["size"],axis=1, inplace=True) 272 | 273 | GTFs=GTF[["seqname","feature","start","end","gene_name"]] 274 | GTFs=GTFs[ GTFs["feature"]!= "gene"] 275 | 276 | GTFs.drop_duplicates(inplace=True) 277 | GTFs.reset_index(inplace=True, drop=True) 278 | 279 | GTFs=pd.concat([GTFs,promoters]) 280 | 281 | def NewName(df): 282 | name=df["gene_name"] 283 | feature=df["feature"] 284 | if feature == "transcript": 285 | res=name 286 | else: 287 | res=name+":"+feature 288 | return res 289 | 290 | GTFs.loc[:,"gene_name"]=GTFs.apply(NewName, axis=1) 291 | GTFs=GTFs[["seqname","start","end","gene_name"]] 292 | 293 | print( "Intersecting annotation tables and bed." 
) 294 | sys.stdout.flush() 295 | 296 | refGTF=dfTObedtool(GTFs) 297 | pos=dfTObedtool(bed) 298 | 299 | colsGTF=GTFs.columns.tolist() 300 | newCols=bed.columns.tolist() 301 | 302 | for f in colsGTF: 303 | newCols.append(f+"_") 304 | newCols_=[ s for s in newCols if s not in ["seqname_","start_", "end_"]] 305 | 306 | pos=pos.intersect(refGTF, loj=True) 307 | pos=pd.read_table(pos.fn , names=newCols) 308 | pos=pos[newCols_] 309 | 310 | print("Merging features.") 311 | sys.stdout.flush() 312 | 313 | def GetFeature(x): 314 | if ":" in x: 315 | res=x.split(":")[1] 316 | else: 317 | res=np.nan 318 | return res 319 | 320 | def GetName(x): 321 | if ":" in x: 322 | res=x.split(":")[0] 323 | elif type(x) == type("string"): 324 | if x != ".": 325 | res=x 326 | else: 327 | res=np.nan 328 | else: 329 | res=np.nan 330 | return res 331 | 332 | pos["gene_feature_"]=pos["gene_name_"].apply( lambda x: GetFeature(x) ) 333 | pos["gene_name_"]=pos["gene_name_"].apply( lambda x: GetName(x) ) 334 | 335 | refcol=pos.columns.tolist() 336 | refcol=[ s for s in refcol if s != "gene_feature_" ] 337 | 338 | def CombineAnn(df): 339 | def JOIN(x): 340 | return ', '.join([ str(s) for s in list(set(df[x])) if str(s) != "nan" ] ) 341 | return pd.Series(dict( gene_feature_ = JOIN("gene_feature_") ) ) 342 | 343 | pos_=pos.groupby(refcol).apply(CombineAnn) 344 | pos_.reset_index(inplace=True, drop=False) 345 | 346 | def MergeNameFeatures(df): 347 | name=df["gene_name_"] 348 | feature=df["gene_feature_"] 349 | if (type(name) == type("string")) & (name != ".") : 350 | if type(feature) == type("string"): 351 | if len(feature) > 0: 352 | res=name+": "+feature 353 | else: 354 | res=name 355 | else: 356 | res=name 357 | else: 358 | res=np.nan 359 | return res 360 | 361 | pos_["annotated_gene_features"]=pos_.apply(MergeNameFeatures,axis=1) 362 | 363 | pos_=pos_.drop(["gene_name_","gene_feature_"],axis=1) 364 | 365 | def CombineAnn(df): 366 | def JOIN(x): 367 | return '; '.join([ str(s) for s in list(set(df[x])) if str(s) != "nan" ] ) 368 | return pd.Series(dict( annotated_gene_features = JOIN("annotated_gene_features") ) ) 369 | 370 | refcol=[ s for s in refcol if s != "gene_name_" ] 371 | pos_=pos_.groupby(refcol).apply(CombineAnn) 372 | pos_.reset_index(inplace=True, drop=False) 373 | 374 | return pos_ 375 | -------------------------------------------------------------------------------- /AGEpy/blast.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import itertools 3 | import pandas as pd 4 | import sys 5 | 6 | def variablename(var): 7 | """ 8 | Returns the string of a variable name. 9 | """ 10 | s=[tpl[0] for tpl in itertools.ifilter(lambda x: var is x[1], globals().items())] 11 | s=s[0].upper() 12 | return s 13 | 14 | def BLASTquery(query,database,program,filter=None,\ 15 | format_type=None, expect=None,\ 16 | nucl_reward=None, nucl_penalty=None,\ 17 | gapcosts=None, matrix=None,\ 18 | hitlist_size=None, descriptions=None,\ 19 | alignments=None,\ 20 | ncbi_gi=None, threshold=None,\ 21 | word_size=None, composition_based_statistics=None,\ 22 | organism=None, others=None,\ 23 | num_threads=None, baseURL="http://blast.ncbi.nlm.nih.gov",\ 24 | verbose=False): 25 | """ 26 | Performs a blast query online. 27 | 28 | As in https://ncbi.github.io/blast-cloud/ 29 | 30 | :param query: Search query. Allowed values: Accession, GI, or FASTA. 31 | :param database: BLAST database. 
Allowed values: nt, nr, refseq_rna, refseq_protein, swissprot, pdbaa, pdbnt 32 | :param program: BLAST program. Allowed values: blastn, megablast, blastp, blastx, tblastn, tblastx 33 | :param filter: Low complexity filtering. Allowed values: F to disable. T or L to enable. Prepend "m" for mask at lookup (e.g., mL) 34 | :param format_type: Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. HTML is the default. 35 | :param expect: Expect value. Allowed values: Number greater than zero. 36 | :param nucl_reward: Reward for matching bases (BLASTN and megaBLAST). Allowed values: Integer greater than zero. 37 | :param nucl_penalty: Cost for mismatched bases (BLASTN and megaBLAST). Allowed values: Integer less than zero. 38 | :param gapcosts: Gap existence and extension costs. Allowed values: Pair of positive integers separated by a space such as "11 1". 39 | :param matrix: Scoring matrix name. Allowed values: One of BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, PAM250, PAM30 or PAM70. Default: BLOSUM62 for all applicable programs. 40 | :param hitlist_size: Number of databases sequences to keep. Allowed values: Integer greater than zero. 41 | :param descriptions: Number of descriptions to print (applies to HTML and Text). Allowed values: Integer greater than zero. 42 | :param alignments: Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 43 | :param ncbi_gi: Show NCBI GIs in report. Allowed values: T or F. 44 | :param threshold: Neighboring score for initial words. Allowed values: Positive integer (BLASTP default is 11). Does not apply to BLASTN or MegaBLAST). 45 | :param word_size: Size of word for initial matches. Allowed values: Positive integer. 46 | :param composition_based_statistics: Composition based statistics algorithm to use. Allowed values: One of 0, 1, 2, or 3. See comp_based_stats command line option in the BLAST+ user manual for details. 47 | :param organism: an organism as in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 48 | :param others: here you can add other parameters as seen in a blast bookmarked page. Define you query in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 49 | Once your query is defined click on "Bookmark" on right upper side of the page. You can copy fragments of the URL 50 | which define the query. Eg. For organism "Homo sapiens (taxid:9606)" you will see the string "EQ_MENU=Homo%20sapiens%20%28taxid%3A9606%29" - this is 51 | the string you can use here in others. 52 | :param num_threads: Number of virtual CPUs to use. Allowed values: Integer greater than zero (default is 1). Supported only on the cloud. 53 | :param verbose: print more 54 | 55 | :returns: BLAST search request identifier 56 | """ 57 | 58 | if organism: 59 | organism=organism.replace(" ", "%20").replace("(", "%28").replace(")", "%29").replace(":", "%3A") 60 | EQ_MENU=organism 61 | else: 62 | EQ_MENU=None 63 | 64 | URL=baseURL+"/Blast.cgi?" 
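# The query is submitted by appending CGI parameters to the Blast.cgi endpoint and finishing with CMD=Put;
# a built submission URL looks roughly like (illustrative values only):
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?QUERY=<accession or FASTA>&DATABASE=nt&PROGRAM=blastn&CMD=Put
# Optional arguments below are only appended when they are set.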
65 | URL=URL+"QUERY="+str(query)+"&DATABASE="+str(database)+"&PROGRAM="+str(program) 66 | for o,varname in zip([filter, format_type, expect, nucl_reward, nucl_penalty,\ 67 | gapcosts, matrix, hitlist_size, descriptions, alignments,\ 68 | ncbi_gi, threshold, word_size, composition_based_statistics,\ 69 | EQ_MENU, num_threads],\ 70 | ['FILTER' , 'FORMAT_TYPE', 'EXPECT', 'NUCL_REWARD', 'NUCL_PENALTY',\ 71 | 'GAPCOSTS', 'MATRIX', 'HITLIST_SIZE', 'DESCRIPTIONS', 'ALIGNMENTS',\ 72 | 'NCBI_GI', 'THRESHOLD', 'WORD_SIZE', 'COMPOSITION_BASED_STATISTICS',\ 73 | 'EQ_MENU', 'NUM_THREADS']): 74 | if o: 75 | URL=URL+"&"+ varname +"="+str(o) 76 | 77 | if others: 78 | URL=URL+"&"+others 79 | 80 | URL=URL+"&CMD=Put" 81 | 82 | if verbose: 83 | print(URL) 84 | sys.stdout.flush() 85 | 86 | response=requests.get(url = URL) 87 | r=response.content.split("\n") 88 | RID=[ s for s in r if "RID = " in s ] 89 | if len(RID) > 0: 90 | RID=RID[0].split(" ")[-1] 91 | else: 92 | print("Could not return an RID for this query.") 93 | RID=None 94 | return RID 95 | 96 | def BLASTcheck(rid,baseURL="http://blast.ncbi.nlm.nih.gov"): 97 | """ 98 | Checks the status of a query. 99 | 100 | :param rid: BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 101 | :param baseURL: server url. Default=http://blast.ncbi.nlm.nih.gov 102 | 103 | :returns status: status for the query. 104 | :returns therearehist: yes or no for existing hits on a finished query. 105 | """ 106 | 107 | URL=baseURL+"/Blast.cgi?" 108 | URL=URL+"FORMAT_OBJECT=SearchInfo&RID="+rid+"&CMD=Get" 109 | response=requests.get(url = URL) 110 | r=response.content.split("\n") 111 | try: 112 | status=[ s for s in r if "Status=" in s ][0].split("=")[-1] 113 | ThereAreHits=[ s for s in r if "ThereAreHits=" in s ][0].split("=")[-1] 114 | except: 115 | status=None 116 | ThereAreHits=None 117 | 118 | print(rid, status, ThereAreHits) 119 | sys.stdout.flush() 120 | 121 | return status, ThereAreHits 122 | 123 | def BLASTresults(rid, format_type="Tabular", \ 124 | hitlist_size= None, alignments=None, \ 125 | ncbi_gi = None, format_object=None,\ 126 | baseURL="http://blast.ncbi.nlm.nih.gov"): 127 | """ 128 | Retrieves results for an RID. 129 | 130 | :param rid: BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 131 | :param format_type: Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. Tabular is the default. 132 | :param hitlist_size: Number of databases sequences to keep. Allowed values: Integer greater than zero. 133 | :param alignments: Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 134 | :param ncbi_gi: Show NCBI GIs in report. Allowed values: T or F. 135 | :param format_object: Object type. Allowed values: SearchInfo (status check) or Alignment (report formatting). 136 | :param baseURL: server url. Default=http://blast.ncbi.nlm.nih.gov 137 | 138 | :returns: the result of a BLAST query. If format_type="Tabular" it will parse the content into a Pandas dataframe. 139 | """ 140 | 141 | URL=baseURL+"/Blast.cgi?" 
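# Results are fetched from the same Blast.cgi endpoint using the RID returned at submission,
# the requested FORMAT_TYPE and CMD=Get; the retrieval URL looks roughly like (illustrative values only):
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?RID=<request id>&FORMAT_TYPE=Tabular&CMD=Get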
142 | URL=URL+"RID="+str(rid)+"&FORMAT_TYPE="+str(format_type) 143 | for o in [ hitlist_size, alignments,\ 144 | ncbi_gi, format_object]: 145 | if o: 146 | URL=URL+"&"+ variablename(var) +"="+str(o) 147 | URL=URL+"&CMD=Get" 148 | response=requests.get(url = URL) 149 | response=response.content 150 | 151 | if format_type=="Tabular": 152 | result=response.split("\n") 153 | result=[ s.split("\t") for s in result][6:] 154 | header=result[:7] 155 | content=result[7:] 156 | fields=header[5][0].strip("# Fields: ").split(", ") 157 | result=pd.DataFrame(content,columns=fields) 158 | response=result[:int(header[-1][0].split(" ")[1])] 159 | 160 | return response 161 | -------------------------------------------------------------------------------- /AGEpy/david.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | # from suds.client import Client as sudsclient 4 | from zeep import Client as zeepclient 5 | import logging 6 | import ssl 7 | from .plots import * 8 | 9 | david_categories = [ 10 | 'GOTERM_BP_FAT', 'GOTERM_CC_FAT', 'GOTERM_MF_FAT', 'KEGG_PATHWAY', 11 | 'BIOCARTA', 'PFAM', 'PROSITE' ] 12 | 13 | david_fields = [ 14 | 'categoryName', 'termName', 'listHits', 'percent', 15 | 'ease', 'geneIds', 'listTotals', 'popHits', 'popTotals', 16 | 'foldEnrichment', 'bonferroni', 'benjamini', 'afdr'] 17 | # include: 18 | # 'fisher' 19 | # 'termName' to 'term' and 'term_name' 20 | 21 | def DAVIDenrich(database, categories, user, ids, ids_bg = None, name = '', name_bg = '', verbose = False, p = 0.1, n = 2): 22 | # Modified from https://david.ncifcrf.gov/content.jsp?file=WS.html 23 | # by courtesy of HuangYi @ 20110424 24 | 25 | """ 26 | Queries the DAVID database for an enrichment analysis 27 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 28 | 29 | :param database: A string for the database to query, e.g. 
'WORMBASE_GENE_ID' 30 | :param categories: A comma separated string with databases 31 | :param user: A user ID registered at DAVID for querying 32 | :param ids: A list with identifiers 33 | :param name: A string with the name for the query set 34 | :param ids_bg: A list with the background identifiers to enrich against, 35 | 'None' for whole set 36 | :param name_bg: A string with the name for the background set 37 | :param p: Maximum p value for enrichment of a term 38 | :param n: Minimum number of genes within a term 39 | 40 | :returns: None if no ids match the queried database, or a pandas data frame with results 41 | """ 42 | 43 | ids = ','.join([str(i) for i in ids]) 44 | use_bg = 0 45 | if ids_bg is not None: 46 | ids_bg = ','.join([str(i) for i in ids_bg]) 47 | ssl._create_default_https_context = ssl._create_unverified_context 48 | url = 'https://david.ncifcrf.gov/webservice/services/DAVIDWebService?wsdl' 49 | logging.getLogger("zeep").setLevel(logging.ERROR) 50 | # client = sudsclient(url) 51 | # client.wsdl.services[0].setlocation('https://david.ncifcrf.gov/webservice/services/DAVIDWebService.DAVIDWebServiceHttpSoap11Endpoint/') 52 | client = zeepclient(url) 53 | client_auth = client.service.authenticate(user) 54 | if verbose: 55 | print('User Authentication:', client_auth) 56 | sys.stdout.flush() 57 | size = client.service.addList(ids, database, name, 0) #| inputListIds,idType,listName,listType) 58 | if verbose: 59 | print('Mapping rate of ids: ', str(size)) 60 | sys.stdout.flush() 61 | if not float(size) > float(0): 62 | return None 63 | if ids_bg is not None: 64 | size_bg = client.service.addList(ids_bg, database, name_bg, 1) 65 | if verbose: 66 | print('Mapping rate of background ids: ', str(size_bg)) 67 | sys.stdout.flush() 68 | client_categories = client.service.setCategories(categories) 69 | if verbose: 70 | print('Categories used: ', client_categories) 71 | sys.stdout.flush() 72 | client_report = client.service.getChartReport(p, n) 73 | size_report = len(client_report) 74 | if verbose: 75 | print('Records reported: ', str(size_report)) 76 | sys.stdout.flush() 77 | 78 | if size_report > 0: 79 | df = [] 80 | for r in client_report: 81 | # d = dict(r) 82 | line = [] 83 | for f in david_fields: 84 | # line.append(str(d[f]).encode('ascii','ignore')) 85 | value = getattr(r, f, None) 86 | line.append(str(value).encode('ascii','ignore')) 87 | df.append(line) 88 | df = pd.DataFrame(df) 89 | df.columns=david_fields 90 | for col in david_fields: 91 | df[col] = df[col].apply(lambda x: x.decode()) 92 | else: 93 | df=None 94 | 95 | return df 96 | 97 | 98 | def id_nameDAVID(df,GTF=None,name_id=None): 99 | """ 100 | Given a DAVIDenrich output it converts ensembl gene ids to genes names and adds this column to the output 101 | 102 | :param df: a dataframe output from DAVIDenrich 103 | :param GTF: a GTF dataframe from readGTF() 104 | :param name_id: instead of a gtf dataframe a dataframe with the columns 'gene_name' and 'gene_id' can be given as input 105 | 106 | :returns: a pandas dataframe with a gene name column added to it. 
107 | """ 108 | if name_id is None: 109 | gene_name=retrieve_GTF_field('gene_name',GTF) 110 | gene_id=retrieve_GTF_field('gene_id', GTF) 111 | GTF=pd.concat([gene_name,gene_id],axis=1) 112 | else: 113 | GTF=name_id.copy() 114 | df['Gene_names']="genes" 115 | terms=df['termName'].tolist() 116 | enrichN=pd.DataFrame() 117 | for term in terms: 118 | tmp=df[df['termName']==term] 119 | tmp=tmp.reset_index(drop=True) 120 | ids=tmp.xs(0)['geneIds'] 121 | ids=pd.DataFrame(data=ids.split(", ")) 122 | ids.columns=['geneIds'] 123 | ids['geneIds']=ids['geneIds'].map(str.lower) 124 | GTF['gene_id']=GTF['gene_id'].astype(str) 125 | GTF['gene_id']=GTF['gene_id'].map(str.lower) 126 | ids=pd.merge(ids, GTF, how='left', left_on='geneIds', right_on='gene_id') 127 | names=ids['gene_name'].tolist() 128 | names= ', '.join(names) 129 | tmp["Gene_names"]=names 130 | #tmp=tmp.replace(to_replace=tmp.xs(0)['Gene_names'], value=names) 131 | enrichN=pd.concat([enrichN, tmp]) 132 | enrichN=enrichN.reset_index(drop=True) 133 | 134 | gene_names=enrichN[['Gene_names']] 135 | gpos=enrichN.columns.get_loc("geneIds") 136 | enrichN=enrichN.drop(['Gene_names'],axis=1) 137 | cols=enrichN.columns.tolist() 138 | enrichN=pd.concat([enrichN[cols[:gpos+1]],gene_names,enrichN[cols[gpos+1:]]],axis=1) 139 | 140 | return enrichN 141 | 142 | def DAVIDgetGeneAttribute(x,df,refCol="ensembl_gene_id",fieldTOretrieve="gene_name"): 143 | """ 144 | Returns a list of gene names for given gene ids. 145 | 146 | :param x: a string with the list of IDs separated by ', ' 147 | :param df: a dataframe with the reference column and a the column to retrieve 148 | :param refCol: the header of the column containing the identifiers 149 | :param fieldTOretrieve: the field to retrieve from parsedGTF eg. 'gene_name' 150 | 151 | :returns: list of fieldTOretrieve separeted by ', ' in the same order as the given in x 152 | """ 153 | 154 | l=x.split(", ") 155 | l=[ s.upper() for s in l ] 156 | tmpdf=pd.DataFrame({refCol:l},index=range(len(l))) 157 | df_fix=df[[refCol,fieldTOretrieve]].drop_duplicates() 158 | sys.stdout.flush() 159 | df_fix[refCol]=df_fix[refCol].apply(lambda x: x.upper()) 160 | ids=pd.merge(tmpdf,df_fix,how="left",on=[refCol]) 161 | ids=ids[fieldTOretrieve].tolist() 162 | ids=[ str(s) for s in ids ] 163 | ids=", ".join(ids) 164 | return ids 165 | 166 | 167 | def DAVIDplot(database, categories, user, df_ids, output, df_ids_bg = None, name = '', \ 168 | name_bg = '', verbose = False, p = 0.1, n = 2): 169 | """ 170 | Queries the DAVID database for an enrichment analysis and plots CellPlots as 171 | well as SymPlots (see plots). 172 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 173 | 174 | :param database: a string for the database to query, e.g. 'WORMBASE_GENE_ID' 175 | :param categories: a comma separated string with databases 176 | :param user: a user ID registered at DAVID for querying 177 | :param df_ids: a dataframe where the first column contains the identifiers 178 | to be queried and the second column the respective log2fc for each identifier. 179 | :param output: /path/to/output/prefix 180 | :param df_ids_bg: a dataframe where the first column contains the identifiers 181 | to be used as background. None for whole set. 
182 | :param name: a string with the name for the query set 183 | :param name_bg: a string with the name for the background set 184 | :param p: Maximum p value for enrichment of a term 185 | :param n: Minimum number of genes within a term 186 | 187 | :returns: Nothing 188 | """ 189 | 190 | idsc1=df_ids.columns.tolist()[0] 191 | idsc2=df_ids.columns.tolist()[1] 192 | 193 | ids=df_ids[idsc1].tolist() 194 | if type(df_ids_bg)==type(pd.DataFrame()): 195 | ids_bg=df_ids_bg[df_ids_bg.columns.tolist()[0]] 196 | else: 197 | ids_bg=None 198 | 199 | print(categories) 200 | 201 | david=DAVIDenrich(database, categories, user, ids, ids_bg = ids_bg, \ 202 | name = name, name_bg = name_bg, verbose = verbose, p = p, n = n) 203 | 204 | if type(david)!=type(pd.DataFrame()): 205 | print("For this dataset no enrichments could be returned.") 206 | sys.stdout.flush() 207 | else: 208 | david[idsc2]=david["geneIds"].apply(lambda x: \ 209 | DAVIDgetGeneAttribute(x,\ 210 | df_ids,\ 211 | refCol=idsc1,\ 212 | fieldTOretrieve=idsc2)) 213 | david[idsc2]=david[idsc2].apply(lambda x: x.replace(", ", ",")) 214 | EXC=pd.ExcelWriter(output+".xlsx") 215 | for category in list(set(david["categoryName"].tolist())): 216 | david_=david[david["categoryName"]==category] 217 | print(category) 218 | david_.to_excel(EXC,category) 219 | 220 | tmp=david_[:20] 221 | tmp["-log10(p)"]=np.log10(tmp["ease"].astype(float)) * -1 222 | #tmp["Term"]=tmp['termName'] 223 | #tmp["Annotated"]=tmp["listHits"] 224 | cellplot=CellPlot(tmp, output_file=output+"."+category, gene_expression_col=idsc2, gene_expression=idsc2, \ 225 | figure_title=category+"\n"+output.split("/")[-1], pvalCol="ease", \ 226 | lowerLimit=None, upperLimit=None, colorBarType='bwr', xaxis_label = "GO Term -log10(p-value)") 227 | 228 | symplot=SymPlot(tmp, output_file=output+"."+category, \ 229 | gene_expression_col=idsc2,\ 230 | figure_title=category+"\n"+output.split("/")[-1], \ 231 | pvalCol="ease", xaxis_label = "GO Term -log10(p-value)") 232 | EXC.save() 233 | -------------------------------------------------------------------------------- /AGEpy/fasta.py: -------------------------------------------------------------------------------- 1 | 2 | def getFasta(opened_file, sequence_name): 3 | """ 4 | Retrieves a sequence from an opened multifasta file 5 | 6 | :param opened_file: an opened multifasta file eg. opened_file=open("/path/to/file.fa",'r+') 7 | :param sequence_name: the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2) 8 | 9 | returns: a string with the sequence of interest 10 | """ 11 | 12 | lines = opened_file.readlines() 13 | seq=str("") 14 | for i in range(0, len(lines)): 15 | line = lines[i] 16 | if line[0] == ">": 17 | fChr=line.split(" ")[0].split("\n")[0] 18 | fChr=fChr[1:] 19 | if fChr == sequence_name: 20 | s=i 21 | code=['N','A','C','T','G'] 22 | firstbase=lines[s+1][0] 23 | while firstbase in code: 24 | s=s + 1 25 | seq=seq+lines[s] 26 | firstbase=lines[s+1][0] 27 | 28 | if len(seq)==0: 29 | seq=None 30 | else: 31 | seq=seq.split("\n") 32 | seq="".join(seq) 33 | 34 | return seq 35 | 36 | def writeFasta(sequence, sequence_name, output_file): 37 | """ 38 | Writes a fasta sequence into a file. 
39 | 
40 | :param sequence: a string with the sequence to be written
41 | :param sequence_name: name of the fasta sequence
42 | :param output_file: /path/to/file.fa to be written
43 | 
44 | :returns: nothing
45 | """
46 | i=0
47 | f=open(output_file,'w')
48 | f.write(">"+str(sequence_name)+"\n")
49 | while i <= len(sequence):
50 | f.write(sequence[i:i+60]+"\n")
51 | i=i+60
52 | f.close()
53 | 
54 | def rewriteFasta(sequence, sequence_name, fasta_in, fasta_out):
55 | """
56 | Rewrites a specific sequence in a multifasta file while keeping the sequence header.
57 | 
58 | :param sequence: a string with the sequence to be written
59 | :param sequence_name: the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2)
60 | :param fasta_in: /path/to/original.fa
61 | :param fasta_out: /path/to/destination.fa
62 | 
63 | :returns: nothing
64 | """
65 | f=open(fasta_in, 'r+')
66 | f2=open(fasta_out,'w')
67 | lines = f.readlines()
68 | i=0
69 | while i < len(lines):
70 | line = lines[i]
71 | if line[0] == ">":
72 | f2.write(line)
73 | fChr=line.split(" ")[0]
74 | fChr=fChr[1:]
75 | if fChr == sequence_name:
76 | code=['N','A','C','T','G']
77 | firstbase=lines[i+1][0]
78 | while firstbase in code:
79 | i=i+1
80 | firstbase=lines[i][0]
81 | s=0
82 | while s <= len(sequence):
83 | f2.write(sequence[s:s+60]+"\n")
84 | s=s+60
85 | else:
86 | i=i+1
87 | else:
88 | f2.write(line)
89 | i=i+1
90 | 
91 | f2.close()
92 | f.close()
93 | 
-------------------------------------------------------------------------------- /AGEpy/go.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | #import urllib2 # python2
3 | import urllib.request as urllib2
4 | # import StringIO # python2
5 | from io import BytesIO
6 | import gzip
7 | import sys
8 | 
9 | 
10 | def getGeneAssociation(URL_or_file):
11 | """
12 | This function collects GO annotation from http://geneontology.org/page/download-annotations.
13 | 
14 | :param URL_or_file: either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective downloaded .gz file.
15 | :returns: a Pandas dataframe with the parsed table.
16 | """
17 | if URL_or_file[:4] == "http":
18 | response = urllib2.urlopen(URL_or_file)
19 | compressedFile = BytesIO(response.read())
20 | decompressedFile = gzip.GzipFile(fileobj=compressedFile)
21 | else:
22 | decompressedFile = gzip.GzipFile(URL_or_file)
23 | out=decompressedFile.read().decode("utf-8").split("\n")
24 | 
25 | version=[ s for s in out if len(s) > 0 ]
26 | version=[ s for s in version if s[0] == '!' ]
27 | version=[ s for s in version if "!gaf-version:" in s ]
28 | version=version[0]
29 | 
30 | 
31 | if version=="!gaf-version: 2.0":
32 | reco=version
33 | else:
34 | reco=None
35 | 
36 | out=[ s for s in out if len(s) > 0 ]
37 | out=[ s for s in out if s[0] != "!"
] 38 | out=[s.split("\t") for s in out] 39 | out=pd.DataFrame(out) 40 | mgi_cols=["DB","DB_Object_ID","DB_Object_Symbol","Qualifier (this field is optional)","GO ID","DB:Reference","Evidence Code","Evidence Code Qualifier (optional)",\ 41 | "Aspect","DB_Object_Name","DB_Object_Synonym","DB_Object_Type","Taxon","Date","Assigned_by"] 42 | fb_cols=["DB","DB_Object_ID","DB_Object_Symbol","Qualifier","GO ID","DB:Reference","Evidence",\ 43 | "With (or) From","Aspect","DB_Object_Name","DB_Object_Synonym","DB_Object_Type","Taxon","Date","Assigned_by","Annotation Extension",\ 44 | "Gene Product Form ID"] 45 | gaf_20=["DB","DB Object ID","DB Object Symbol","Qualifier","GO ID","DB:Reference (|DB:Reference)","Evidence Code","With (or) From","Aspect","DB Object Name",\ 46 | "DB Object Synonym (|Synonym)","DB Object Type","Taxon(|taxon)","Date","Assigned By","Annotation Extension","Gene Product Form ID"] 47 | cols={"fb":fb_cols,"wb":fb_cols,"mgi":fb_cols,"!gaf-version: 2.0":gaf_20} 48 | colsType=URL_or_file.split(".") 49 | colsType=colsType[len(colsType)-2] 50 | if colsType=="gaf": 51 | colsType=reco 52 | if colsType in cols.keys(): 53 | try: 54 | cols=cols.get(colsType) 55 | out.columns=cols 56 | except ValueError as err: 57 | print("Could not fit headers.") 58 | print(err) 59 | sys.stdout.flush() 60 | else: 61 | print("Could not find headers for %s." %colsType) 62 | sys.stdout.flush() 63 | return out 64 | -------------------------------------------------------------------------------- /AGEpy/gtf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | from collections import OrderedDict 5 | try: 6 | import pybedtools 7 | except: 8 | print("pybedtools could not be imported") 9 | sys.stdout.flush() 10 | import csv 11 | 12 | def readGTF(infile): 13 | """ 14 | Reads a GTF file and labels the respective columns in agreement with GTF file standards: 15 | 'seqname','source','feature','start','end','score','strand','frame','attribute'. 
16 | 17 | :param infile: path/to/file.gtf 18 | :returns: a Pandas dataframe of the respective GTF 19 | 20 | """ 21 | df=pd.read_table(infile, sep='\t', comment="#", header=None, dtype=str) 22 | df.columns=['seqname','source','feature','start','end','score','strand','frame','attribute'] 23 | #df = df.astype(str) # from DTC 24 | return df 25 | 26 | def retrieve_GTF_field(field,gtf): 27 | """ 28 | Returns a field of choice from the attribute column of the GTF 29 | 30 | :param field: field to be retrieved 31 | :returns: a Pandas dataframe with one columns containing the field of choice 32 | 33 | """ 34 | inGTF=gtf.copy() 35 | def splits(x): 36 | l=x.split(";") 37 | l=[ s.split(" ") for s in l] 38 | res=np.nan 39 | for s in l: 40 | if field in s: 41 | if '"' in s[-1]: 42 | res=s[-1][1:-1] 43 | else: 44 | res=s[-1] 45 | return res 46 | 47 | inGTF[field]=inGTF['attribute'].apply(lambda x: splits(x)) 48 | return inGTF[[field]] 49 | 50 | def attributesGTF(inGTF): 51 | """ 52 | List the type of attributes in a the attribute section of a GTF file 53 | 54 | :param inGTF: GTF dataframe to be analysed 55 | :returns: a list of attributes present in the attribute section 56 | 57 | """ 58 | df=pd.DataFrame(inGTF['attribute'].str.split(";").tolist()) 59 | desc=[] 60 | for i in df.columns.tolist(): 61 | val=df[[i]].dropna() 62 | val=pd.DataFrame(val[i].str.split(' "').tolist())[0] 63 | val=list(set(val)) 64 | for v in val: 65 | if len(v) > 0: 66 | l=v.split(" ") 67 | if len(l)>1: 68 | l=l[1] 69 | else: 70 | l=l[0] 71 | desc.append(l) 72 | desc=list(set(desc)) 73 | finaldesc=[] 74 | for d in desc: 75 | if len(d) > 0: 76 | finaldesc.append(d) 77 | return finaldesc 78 | 79 | def parseGTF(inGTF): 80 | """ 81 | Reads an extracts all attributes in the attributes section of a GTF and constructs a new dataframe wiht one collumn per attribute instead of the attributes column 82 | 83 | :param inGTF: GTF dataframe to be parsed 84 | :returns: a dataframe of the orignal input GTF with attributes parsed. 85 | 86 | """ 87 | 88 | desc=attributesGTF(inGTF) 89 | ref=inGTF.copy() 90 | ref.reset_index(inplace=True, drop=True) 91 | df=ref.drop(['attribute'],axis=1).copy() 92 | for d in desc: 93 | field=retrieve_GTF_field(d,ref) 94 | df=pd.concat([df,field],axis=1) 95 | return df 96 | 97 | def writeGTF(inGTF,file_path): 98 | """ 99 | Write a GTF dataframe into a file 100 | 101 | :param inGTF: GTF dataframe to be written. It should either have 9 columns with the last one being the "attributes" section or more than 9 columns where all columns after the 8th will be colapsed into one. 
102 | :param file_path: path/to/the/file.gtf 103 | :returns: nothing 104 | """ 105 | cols=inGTF.columns.tolist() 106 | if len(cols) == 9: 107 | if 'attribute' in cols: 108 | df=inGTF 109 | else: 110 | df=inGTF[cols[:8]] 111 | df['attribute']="" 112 | for c in cols[8:]: 113 | if c == cols[len(cols)-1]: 114 | df['attribute']=df['attribute']+c+' "'+inGTF[c].astype(str)+'";' 115 | else: 116 | df['attribute']=df['attribute']+c+' "'+inGTF[c].astype(str)+'"; ' 117 | df.to_csv(file_path, sep="\t",header=None,index=None,quoting=csv.QUOTE_NONE) 118 | 119 | def GTFtoBED(inGTF,name): 120 | """ 121 | Transform a GTF dataframe into a bed dataframe 122 | 123 | :param inGTF: GTF dataframe for transformation 124 | :param name: field of the GTF data frame to be use for the bed 'name' positon 125 | 126 | returns: a bed dataframe with the corresponding bed fiels: 'chrom','chromStart','chromEnd','name','score','strand' 127 | """ 128 | 129 | bed=inGTF.copy() 130 | bed.reset_index(inplace=True, drop=True) 131 | if name not in bed.columns.tolist(): 132 | field=retrieve_GTF_field(name, bed) 133 | bed=pd.concat([bed,field],axis=1) 134 | bed=bed[['seqname','start','end',name,'score','strand']] 135 | bed.columns=['chrom','chromStart','chromEnd','name','score','strand'] 136 | bed.drop_duplicates(inplace=True) 137 | bed.reset_index(inplace=True,drop=True) 138 | return bed 139 | 140 | def MAPGenoToTrans(parsedGTF,feature): 141 | """ 142 | Gets all positions of all bases in an exon 143 | 144 | :param df: a Pandas dataframe with 'start','end', and 'strand' information for each entry. 145 | df must contain 'seqname','feature','start','end','strand','frame','gene_id', 146 | 'transcript_id','exon_id','exon_number'] 147 | :param feature: feature upon wich to generate the map, eg. 'exon' or 'transcript' 148 | 149 | :returns: a string with the comma separated positions of all bases in the exon 150 | """ 151 | GenTransMap=parsedGTF[parsedGTF["feature"]==feature] 152 | def getExonsPositions(df): 153 | start=int(df["start"]) 154 | stop=int(df["end"]) 155 | strand=df["strand"] 156 | r=range(start,stop+1) 157 | if strand=="-": 158 | r.sort(reverse=True) 159 | r=[ str(s) for s in r] 160 | return ",".join(r) 161 | 162 | GenTransMap["feature_bases"]=GenTransMap.apply(getExonsPositions, axis=1) 163 | GenTransMap=GenTransMap.sort_values(by=["transcript_id","exon_number"],ascending=True) 164 | def CombineExons(df): 165 | return pd.Series(dict( feature_bases = ','.join(df['feature_bases']) ) ) 166 | GenTransMap=GenTransMap.groupby("transcript_id").apply(CombineExons) 167 | GenTransMap=GenTransMap.to_dict().get("feature_bases") 168 | 169 | return GenTransMap 170 | 171 | def GetTransPosition(df,field,dic,refCol="transcript_id"): 172 | """ 173 | Maps a genome position to transcript positon" 174 | 175 | :param df: a Pandas dataframe 176 | :param field: the head of the column containing the genomic position 177 | :param dic: a dictionary containing for each transcript the respective bases eg. {ENST23923910:'234,235,236,1021,..'} 178 | :param refCol: header of the reference column with IDs, eg. 'transcript_id' 179 | 180 | :returns: position on transcript 181 | """ 182 | try: 183 | gen=str(int(df[field])) 184 | transid=df[refCol] 185 | bases=dic.get(transid).split(",") 186 | bases=bases.index(str(gen))+1 187 | except: 188 | bases=np.nan 189 | return bases 190 | 191 | def getPromotersBed(gtf,fa,upstream=2000,downstream=200): 192 | """ 193 | Reads a gtf file and returns a bed file for the promoter coordinates. 
194 | 195 | :param gtf: path/to/file.gtf. Must be an ensembl gtf. 196 | :param fa: path/to/fasta.fa. Must be an ensembl fasta file. 197 | :param upstream: number of bases upstream of transcript start sites the promoter should start 198 | :param downstream: number of bases downstream of transcript start sites the promoter should end 199 | :returns: a pandas dataframe in bed format 200 | 201 | """ 202 | chrsizes={} 203 | with open(fa, "r") as f: 204 | for line in f.readlines(): 205 | if line[0] == ">": 206 | l=line.split(" ") 207 | seqname=l[0][1:] 208 | size=int(l[2].split(":")[-2]) 209 | chrsizes[seqname]=size 210 | gtf=readGTF(gtf) 211 | gtf=gtf[gtf["feature"]=="transcript"] 212 | gtf.reset_index(inplace=True, drop=True) 213 | 214 | gtf["gene_id"]=retrieve_GTF_field(field="gene_id",gtf=gtf) 215 | gtf["gene_name"]=retrieve_GTF_field(field="gene_name",gtf=gtf) 216 | 217 | def getcoord(df): 218 | seqname=df["seqname"] 219 | strand=df["strand"] 220 | if strand == "+": 221 | tss=int(df["start"]) 222 | promoter_start=tss-upstream 223 | promoter_end=tss+downstream 224 | else: 225 | tss=int(df["end"]) 226 | promoter_start=tss-downstream 227 | promoter_end=tss+upstream 228 | 229 | if promoter_start < 0: 230 | promoter_start=0 231 | if promoter_end > chrsizes[seqname]: 232 | promoter_end=chrsizes[seqname] 233 | 234 | return str(promoter_start)+","+str(promoter_end) 235 | 236 | gtf["promoter"]=gtf.apply(getcoord, axis=1) 237 | gtf["start"]=gtf["promoter"].apply(lambda x: int(x.split(",")[0]) ) 238 | gtf["end"]=gtf["promoter"].apply(lambda x: int(x.split(",")[1]) ) 239 | 240 | gtf["id, name"]=gtf["gene_id"]+", "+gtf["gene_name"] 241 | gtf_=gtf.drop(["source","feature","attribute","promoter","gene_id","gene_name"],axis=1) 242 | gtf_=gtf_.drop_duplicates() 243 | gtf_counts=gtf_[["id, name"]] 244 | gtf_counts["#"]=1 245 | gtf_counts=gtf_counts.groupby(["id, name"]).sum() 246 | beds=gtf_[["seqname","start","end","id, name","score","strand"]] 247 | beds.columns=['chrom', 'start', 'stop', 'name', 'score', 'strand'] 248 | beds=beds[beds["name"].isin( gtf_counts[gtf_counts["#"]==1].index.tolist() )] 249 | genes=[ s for s in list(set(gtf_counts[gtf_counts["#"]>1].index.tolist())) if str(s).lower() != "nan" ] 250 | 251 | for gene_id in genes: 252 | tmp=gtf[gtf["id, name"]==gene_id] 253 | strand=tmp["strand"].tolist()[0] 254 | bed=GTFtoBED(inGTF=tmp,name="id, name") 255 | bed = pybedtools.BedTool.from_dataframe(bed) 256 | bed=bed.sort() 257 | bed=bed.merge() 258 | bed = pd.read_table(bed.fn, names=['chrom', 'start', 'stop' ]) 259 | bed["name"]=gene_id 260 | bed["score"]="." 
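# pybedtools' merge() keeps only chrom/start/stop, so the bed name, score and strand columns
# are re-attached to the merged intervals here before they are concatenated back into 'beds'.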
261 | bed["strand"]=strand 262 | beds=pd.concat([beds,bed]) 263 | 264 | beds = pybedtools.BedTool.from_dataframe(beds) 265 | beds = beds.sort() 266 | beds = pd.read_table(beds.fn, names=['chrom', 'start', 'stop', 'name', 'score', 'strand']) 267 | 268 | beds.reset_index(inplace=True, drop=True) 269 | beds["i"]=beds.index.tolist() 270 | beds["i"]=beds["i"].astype(str) 271 | beds["name"]=beds["i"]+": "+beds["name"] 272 | beds=beds.drop(["i"],axis=1) 273 | 274 | return beds 275 | -------------------------------------------------------------------------------- /AGEpy/homology.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | #import urllib2 # python2 4 | import urllib.request as urllib2 5 | 6 | def getHomoloGene(taxfile="build_inputs/taxid_taxname",\ 7 | genefile="homologene.data",\ 8 | proteinsfile="build_inputs/all_proteins.data",\ 9 | proteinsclusterfile="build_inputs/proteins_for_clustering.data",\ 10 | baseURL="http://ftp.ncbi.nih.gov/pub/HomoloGene/current/"): 11 | """ 12 | Returns NBCI's Homolog Gene tables. 13 | 14 | :param taxfile: path to local file or to baseURL/taxfile 15 | :param genefile: path to local file or to baseURL/genefile 16 | :param proteinsfile: path to local file or to baseURL/proteinsfile 17 | :param proteinsclusterfile: path to local file or to baseURL/proteinsclusterfile 18 | :param baseURL: baseURL for downloading files 19 | 20 | :returns genedf: Homolog gene Pandas dataframe 21 | :returns protclusdf: Pandas dataframe. Lists one protein per gene that were used for homologene clustering. 22 | If a gene has multiple protein accessions derived from alternative splicing, 23 | only one protein isoform that give most protein alignment to proteins in other species 24 | was selected for clustering and it is listed in this file. 25 | :returns proteinsdf: Pandas dataframe. Lists all proteins and their gene information. 26 | If a gene has multple protein accessions derived from alternative splicing event, 27 | each protein accession is list in a separate line. 
28 | """
29 | 
30 | def getDf(inputfile):
31 | if os.path.isfile(inputfile):
32 | df=pd.read_table(inputfile,header=None)
33 | else:
34 | df = urllib2.urlopen(baseURL+inputfile)
35 | df=df.read().decode("utf-8").split("\n")
36 | df=[ s for s in df if len(s) > 0 ]
37 | df=[s.split("\t") for s in df]
38 | df=pd.DataFrame(df)
39 | return df
40 | 
41 | taxdf=getDf(taxfile)
42 | taxdf.set_index([0],inplace=True)
43 | taxdi=taxdf.to_dict().get(1)
44 | 
45 | genedf=getDf(genefile)
46 | genecols=["HID","Taxonomy ID","Gene ID","Gene Symbol","Protein gi","Protein accession"]
47 | genedf.columns=genecols
48 | genedf["organism"]=genedf["Taxonomy ID"].apply(lambda x:taxdi.get(x))
49 | 
50 | proteinsdf=getDf(proteinsfile)
51 | proteinscols=["taxid","entrez GeneID","gene symbol","gene description","protein accession.ver","mrna accession.ver",\
52 | "length of protein listed in column 5","-11) contains data about gene location on the genome",\
53 | "starting position of gene in 0-based coordinate",\
54 | "end position of the gene in 0-based coordinate","strand","nucleotide gi of genomic sequence where this gene is annotated"]
55 | proteinsdf.columns=proteinscols
56 | proteinsdf["organism"]=proteinsdf["taxid"].apply(lambda x:taxdi.get(x))
57 | 
58 | protclusdf=getDf(proteinsclusterfile)
59 | protclustercols=["taxid","entrez GeneID","gene symbol","gene description","protein accession.ver","mrna accession.ver",\
60 | "length of protein listed in column 5","-11) contains data about gene location on the genome",\
61 | "starting position of gene in 0-based coordinate",\
62 | "end position of the gene in 0-based coordinate","strand","nucleotide gi of genomic sequence where this gene is annotated"]
63 | protclusdf.columns=protclustercols
64 | protclusdf["organism"]=protclusdf["taxid"].apply(lambda x:taxdi.get(x))
65 | 
66 | return genedf, protclusdf, proteinsdf
67 | 
-------------------------------------------------------------------------------- /AGEpy/meme.py: --------------------------------------------------------------------------------
1 | import sys
2 | def filterMotifs(memeFile,outFile, minSites):
3 | """
4 | Selects motifs from a meme file based on the number of sites.
5 | 6 | :param memeFile: MEME file to be read 7 | :param outFile: MEME file to be written 8 | :param minSites: minimum number of sites each motif needs to have to be valid 9 | 10 | :returns: nothing 11 | """ 12 | 13 | with open(memeFile, "r") as mF: 14 | oldMEME=mF.readlines() 15 | newMEME=oldMEME[:7] 16 | i=7 17 | while i < len(oldMEME): 18 | if oldMEME[i].split(" ")[0] == "MOTIF": 19 | print(oldMEME[i].split("\n")[0], int(oldMEME[i+2].split("nsites= ")[1].split(" ")[0])) 20 | sys.stdout.flush() 21 | if int(oldMEME[i+2].split("nsites= ")[1].split(" ")[0]) > minSites: 22 | newMEME.append(oldMEME[i]) 23 | f=i+1 24 | while oldMEME[f].split(" ")[0] != "MOTIF": 25 | newMEME.append(oldMEME[f]) 26 | f=f+1 27 | i=i+1 28 | else: 29 | i=i+1 30 | else: 31 | i=i+1 32 | with open(outFile, "w+") as out: 33 | out.write("".join(newMEME) ) 34 | 35 | return newMEME 36 | -------------------------------------------------------------------------------- /AGEpy/rbiom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | #try: 4 | # from rpy2.robjects.packages import importr 5 | # try: 6 | # biomaRt = importr("biomaRt") 7 | # except: 8 | # print "rpy2 could be loaded but 'biomaRt' could not be found.\nIf you want to use 'biomaRt' related functions please install 'biomaRt' in R.\n\n$ R\n> source('http://bioconductor.org/biocLite.R')\n> biocLite()\n> biocLite('biomaRt')\n> quit()" 9 | # sys.stdout.flush() 10 | #except: 11 | # print "Failed to import rpy2 module.\nPlease make sure you are using the same version of R you had when AGEpy was installed." 12 | # sys.stdout.flush() 13 | 14 | rbiomart_host="www.ensembl.org" 15 | 16 | def RdatabasesBM(host=rbiomart_host): 17 | """ 18 | Lists BioMart databases through a RPY2 connection. 19 | 20 | :param host: address of the host server, default='www.ensembl.org' 21 | 22 | :returns: nothing 23 | 24 | """ 25 | biomaRt = importr("biomaRt") 26 | print(biomaRt.listMarts(host=host)) 27 | 28 | def RdatasetsBM(database,host=rbiomart_host): 29 | """ 30 | Lists BioMart datasets through a RPY2 connection. 31 | 32 | :param database: a database listed in RdatabasesBM() 33 | :param host: address of the host server, default='www.ensembl.org' 34 | 35 | :returns: nothing 36 | 37 | """ 38 | biomaRt = importr("biomaRt") 39 | ensemblMart=biomaRt.useMart(database, host=host) 40 | print(biomaRt.listDatasets(ensemblMart)) 41 | 42 | def RfiltersBM(dataset,database,host=rbiomart_host): 43 | """ 44 | Lists BioMart filters through a RPY2 connection. 45 | 46 | :param dataset: a dataset listed in RdatasetsBM() 47 | :param database: a database listed in RdatabasesBM() 48 | :param host: address of the host server, default='www.ensembl.org' 49 | 50 | :returns: nothing 51 | 52 | """ 53 | biomaRt = importr("biomaRt") 54 | ensemblMart=biomaRt.useMart(database, host=host) 55 | ensembl=biomaRt.useDataset(dataset, mart=ensemblMart) 56 | print(biomaRt.listFilters(ensembl)) 57 | 58 | def RattributesBM(dataset,database,host=rbiomart_host): 59 | """ 60 | Lists BioMart attributes through a RPY2 connection. 
61 | 
62 | :param dataset: a dataset listed in RdatasetsBM()
63 | :param database: a database listed in RdatabasesBM()
64 | :param host: address of the host server, default='www.ensembl.org'
65 | 
66 | :returns: nothing
67 | 
68 | """
69 | biomaRt = importr("biomaRt")
70 | ensemblMart=biomaRt.useMart(database, host=host)
71 | ensembl=biomaRt.useDataset(dataset, mart=ensemblMart)
72 | print(biomaRt.listAttributes(ensembl))
73 | 
74 | def RqueryBM(query_filter,query_items,query_attributes,dataset,database,host=rbiomart_host):
75 | """
76 | Queries BioMart.
77 | 
78 | :param query_filter: one BioMart filter associated with the items being queried
79 | :param query_items: list of items to be queried (must associate with given filter)
80 | :param query_attributes: list of attributes to recover from BioMart
81 | :param dataset: dataset to query
82 | :param database: database to query
83 | :param host: address of the host server, default='www.ensembl.org'
84 | 
85 | :returns: a Pandas dataframe of the queried attributes
86 | 
87 | """
88 | 
89 | biomaRt = importr("biomaRt")
90 | ensemblMart=biomaRt.useMart(database, host=host)
91 | ensembl=biomaRt.useDataset(dataset, mart=ensemblMart)
92 | df=biomaRt.getBM(attributes=query_attributes, filters=query_filter, values=query_items, mart=ensembl)
93 | output = [tuple([df[j][i] for j in range(df.ncol)]) for i in range(df.nrow)]
94 | output = pd.DataFrame(output)
95 | output.columns = query_attributes
96 | return output
97 | 
-------------------------------------------------------------------------------- /AGEpy/sam.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | 
4 | def readSAM(SAMfile,header=False):
5 | """
6 | Reads and parses a sam file.
7 | 8 | :param SAMfile: /path/to/file.sam 9 | :param header: logical, if True, reads the header information 10 | 11 | :returns: a pandas dataframe with the respective SAM columns: 'QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL' and a list of the headers if header=True 12 | 13 | """ 14 | if header==True: 15 | f=open(SAMfile,"r+") 16 | head=[] 17 | for line in f.readlines(): 18 | if line[0]=="@": 19 | head.append(line) 20 | else: 21 | continue 22 | f.close() 23 | 24 | sam=pd.read_table(SAMfile,sep="this_gives_one_column",comment="@",header=None) 25 | sam=pd.DataFrame(sam[0].str.split("\t").tolist()) 26 | acols=[0,1,2,3,4,5,6,7,8,9] 27 | sam_=sam[acols] 28 | samcols=sam.columns.tolist() 29 | bcols=[ s for s in samcols if s not in acols ] 30 | sam_[10]=sam[bcols[0]] 31 | if len(bcols) > 1: 32 | for c in bcols[1:]: 33 | sam_[10]=sam_[10].astype(str) 34 | sam[c]=sam[c].astype(str) 35 | sam_[10]=sam_[10]+"\t"+sam[c] 36 | 37 | sam_.columns=['QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL'] 38 | 39 | if header==True: 40 | return sam_, head 41 | else: 42 | return sam_ 43 | 44 | def writeSAM(sam,SAMfile,header=None): 45 | """ 46 | Writes a pandas dataframe with the respective SAM columns: 'QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL' into a sam file 47 | 48 | :param sam: pandas dataframe to be written 49 | :param SAMfile: /path/to/file.sam 50 | 51 | :returns: nothing 52 | """ 53 | def toNone(x): 54 | if x=="None": 55 | x=np.nan 56 | return x 57 | 58 | sam.reset_index(inplace=True,drop=True) 59 | QUAL=pd.DataFrame(sam['QUAL'].str.split("\t").tolist()) 60 | cols=QUAL.columns.tolist() 61 | 62 | for c in cols: 63 | QUAL[c]=QUAL[c].apply(lambda x: toNone(x)) 64 | 65 | sam=sam.drop(['QUAL'],axis=1) 66 | sam=pd.concat([sam,QUAL],axis=1) 67 | sam=sam.astype(str) 68 | sam=sam.values # as_matrix() was removed in newer pandas; .values returns the same row-wise array 69 | 70 | tfile=open(SAMfile, "w+") 71 | 72 | if header != None: 73 | for l in header: 74 | tfile.write(l) 75 | 76 | for l in sam: 77 | l=[ s for s in l if s not in ['nan'] ] 78 | l="\t".join(l) 79 | tfile.write(l+"\n") 80 | 81 | tfile.close() 82 | 83 | def SAMflags(x): 84 | """ 85 | Explains a SAM flag.
86 | 87 | :param x: flag 88 | 89 | :returns: complete SAM flag explanaition 90 | """ 91 | flags=[] 92 | 93 | if x & 1: 94 | l="1: Read paired" 95 | else: 96 | l="0: Read unpaired" 97 | flags.append(l) 98 | 99 | if x & 2 : 100 | l="1: Read mapped in proper pair" 101 | else: 102 | l="0: Read not mapped in proper pair" 103 | flags.append(l) 104 | 105 | if x & 4 : 106 | l="1: Read unmapped" 107 | else: 108 | l="0: Read mapped" 109 | flags.append(l) 110 | 111 | if x & 8 : 112 | l="1: Mate unmapped" 113 | else: 114 | l="0: Mate mapped" 115 | flags.append(l) 116 | 117 | if x & 16 : 118 | l="1: Read reverse strand" 119 | else: 120 | l="0: Read direct strand" 121 | flags.append(l) 122 | 123 | if x & 32 : 124 | l="1: Mate reverse strand" 125 | else: 126 | l="0: Mate direct strand" 127 | flags.append(l) 128 | 129 | if x & 64 : 130 | l="1: First in pair" 131 | else: 132 | l="0: Second in pair" 133 | flags.append(l) 134 | 135 | if x & 128 : 136 | l="1: Second in pair" 137 | else: 138 | l="0: First in pair" 139 | flags.append(l) 140 | 141 | if x & 256 : 142 | l="1: Not primary alignment" 143 | else: 144 | l="0: Primary alignment" 145 | flags.append(l) 146 | 147 | if x & 512 : 148 | l="1: Read fails platform/vendor quality checks" 149 | else: 150 | l="0: Read passes platform/vendor quality checks" 151 | flags.append(l) 152 | 153 | if x & 1024 : 154 | l="1: Read is PCR or optical duplicate" 155 | else: 156 | l="0: Read is not PCR or optical duplicate" 157 | flags.append(l) 158 | 159 | if x & 2048 : 160 | l="1: Supplementary alignment" 161 | else: 162 | l="0: Not supplementary alignment" 163 | flags.append(l) 164 | 165 | return flags 166 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | 3 | RUN apt-get update && apt-get install -yq --no-install-recommends git gcc g++ libz-dev imagemagick imagemagick-doc && apt-get clean && rm -rf /var/lib/apt/lists/* 4 | 5 | RUN pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## AGEpy [![Build Status](https://travis-ci.org/mpg-age-bioinformatics/AGEpy.svg?branch=master)](https://travis-ci.org/mpg-age-bioinformatics/AGEpy) [![PyPI version](https://badge.fury.io/py/AGEpy.svg)](https://badge.fury.io/py/AGEpy) [![ReadtheDocs](https://readthedocs.org/projects/agepy/badge/?version=latest)](http://agepy.readthedocs.io) 2 | 3 | This python package contains Bioinformatics tools developed at the 4 | Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing. 5 | 6 | > Max Planck Institute for Biology of Ageing 7 | > Joseph-Stelzmann-Str. 9b 8 | > D-50931 Cologne 9 | > Germany 10 | 11 | [https://bioinformatics.age.mpg.de](https://bioinformatics.age.mpg.de) 12 | 13 | #### Read the Docs 14 | 15 | [agepy.readthedocs.io](http://agepy.readthedocs.io) 16 | 17 | #### Installation 18 | 19 | ###### pip 20 | 21 | ```bash 22 | pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git --user 23 | ``` 24 | 25 | To install a specific commit use: 26 | ``` 27 | pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git@ --user 28 | # eg. 29 | pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git@9b10b76d021652c44f93e8dd3850a7a937e6fcee --user 30 | ``` 31 | 32 | Alternatively you can also install the package with a symlink, so that changes 33 | to the source files will be immediately available to users of the package on 34 | your system: 35 | 36 | ```bash 37 | git clone https://github.com/mpg-age-bioinformatics/AGEpy 38 | cd AGEpy 39 | python setup.py develop --user 40 | ``` 41 | 42 | Be aware that with the develop option you won't be able to properly update once new scripts are added. 43 | 44 | #### Example usage 45 | 46 | ```python 47 | import AGEpy as age 48 | 49 | gtf=age.readGTF("/path/to/file.gtf") 50 | 51 | gtf.head() 52 | ``` 53 | 54 | #### Help 55 | 56 | In bash: 57 | 58 | ```bash 59 | pydoc AGEpy.AGEpy 60 | ``` 61 | 62 | In python: 63 | 64 | ```python 65 | help("AGEpy.AGEpy") 66 | ``` 67 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | AGEpy 2 | ^^^^^ 3 | 4 | This python package contains Bioinformatics tools developed at the 5 | Bioinformatics Core Facility of the Max Planck Institute for Biology of 6 | Ageing. 7 | 8 | Max Planck Institute for Biology of Ageing 9 | 10 | Joseph-Stelzmann-Str. 9b 11 | 12 | D-50931 Cologne Germany 13 | 14 | `https://bioinformatics.age.mpg.de`_ 15 | 16 | Read the Docs 17 | ^^^^^^^^^^^^^ 18 | 19 | `agepy.readthedocs.io`_ 20 | 21 | .. _agepy.readthedocs.io: http://agepy.readthedocs.io 22 | .. _https://bioinformatics.age.mpg.de: https://bioinformatics.age.mpg.de 23 | -------------------------------------------------------------------------------- /bin/abed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="abed is an annotation tool for bed files.",formatter_class = argparse.ArgumentDefaultsHelpFormatter ) 6 | parser.add_argument("-b", "--bed", help="/path/to/file.bed") 7 | parser.add_argument("-g", "--gtf", help="/path/to/file.gtf") 8 | parser.add_argument("-s", "--sizes", help="/path/to/file.genome. 
Tab separated values of 'chromosome name' and 'size' information.") 9 | parser.add_argument("-c", "--columns", help="A comma separated string of column headers to use when reading in the bed file. eg.: 'chr,start,end,name'." ) 10 | parser.add_argument("-p", "--promoter", help="A comma separated list containing the upstream start of the promoter region from the TSS and the downstream end of the promoter region from the TSS. eg.: '1000,200'.") 11 | parser.add_argument("-o", "--output", help="/path/to/output.tsv.") 12 | 13 | args = parser.parse_args() 14 | 15 | import AGEpy as age 16 | import pandas as pd 17 | 18 | promoters=args.promoter 19 | promoters=promoters.split(",") 20 | promoters=[ int(s) for s in promoters ] 21 | 22 | bed=age.AnnotateBED(args.bed,args.gtf, args.sizes, bedcols=args.columns, promoter=promoters) 23 | 24 | bed.to_csv(args.output, index=None, sep="\t") 25 | -------------------------------------------------------------------------------- /bin/blasto: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | sys.stdout.flush() 8 | 9 | # argparse arguments 10 | 11 | parser = argparse.ArgumentParser(description="This module will load a fasta formatted file and query each fasta sequence for blast \ 12 | The user may add blast parameters as space separated list after the sequence name. All queries are \ 13 | listed into a log table. The user can either let the program running while waiting for the results \ 14 | using the -C option, or quit and check if the results are ready later using -W -t ", \ 15 | formatter_class = argparse.ArgumentDefaultsHelpFormatter) 16 | # Flags 17 | parser.add_argument('-S', '--submitFromFasta', help = 'Read in fasta file and submit blast queries. Write out submitted query IDs.', \ 18 | action = 'store_true') 19 | parser.add_argument('-C', '--continueThrough', help = 'Read from fasta file, submit and continue checking. Write results when they are \ 20 | ready and exit after all results are finished.', action = 'store_true') 21 | parser.add_argument('-W', '--checkAndWriteResults', help = 'Read query IDs from tsv and check status. If results are ready, collect and safe.',\ 22 | action = 'store_true') 23 | # Input 24 | parser.add_argument("-f", "--inputFasta", default = '', help="Fasta formatted input file containing one or more input sequences. \ 25 | The sequence name may contain additional blast paramers, ") # include example 26 | parser.add_argument('-t', '--inputTsv', default = '',help='Tab separated input file containing sequence IDs, output prefix, query IDs, query arguments.') 27 | # Output 28 | parser.add_argument('-o', '--outputPrefix', default = '', help='Output prefix. All files will start with this prefix, blast output files will\ 29 | be written two _.') 30 | parser.add_argument('--format_type', default = 'Tabular', help='format of the blast output') 31 | parser.add_argument('--sleepTime', default = 60, type = int, help = 'time to wait before checking again if your jobs are done, only active if -C is on') 32 | parser.add_argument("--description", help="Get a description of what this script does.", action="store_true") 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | if args.description: 38 | print "This module will load a fasta formatted file and query each fasta sequence for blast \ 39 | The user may add blast parameters as space separated list after the sequence name. All queries are \ 40 | listed into a log table. 
The user can either let the program running while waiting for the results \ 41 | using the -C option, or quit and check if the results are ready later using -W -t " 42 | sys.exit(0) 43 | 44 | # test input and arguments 45 | 46 | # test if inputfiles are present, if -S then -f if -W then -t 47 | if args.submitFromFasta and args.inputFasta == '': 48 | print('ERROR: If you are trying to submit your jobs, you need to supply input fasta sequences using -f ') 49 | sys.exit(1) 50 | 51 | if args.submitFromFasta and not os.path.exists(args.inputFasta): 52 | print ('ERROR: No such input file: %s' %(args.inputFasta)) 53 | sys.exit(1) 54 | 55 | if args.checkAndWriteResults and args.inputTsv == '': 56 | print('ERROR: If you are trying to check and write your jobs, you need to supply input tab separated table -t ') 57 | sys.exit(1) 58 | 59 | if args.checkAndWriteResults and not os.path.exists(args.inputTsv): 60 | print ('ERROR: No such input file: %s' %(args.inputTsv)) 61 | sys.exit(1) 62 | 63 | # test if output location is writable 64 | if not os.path.isdir('/'.join(args.outputPrefix.split('/')[:-1])): 65 | print('ERROR: No such output directory: %s' %('/'.join(args.outputPrefix.split('/')[:-1]))) 66 | sys.exit(1) 67 | if not os.access('/'.join(args.outputPrefix.split('/')[:-1]), os.W_OK): 68 | print('ERROR: You do not have write permissions: %s' %('/'.join(args.outputPrefix.split('/')[:-1]))) 69 | 70 | # test if format_type belongs to possible format types 71 | if not args.format_type in ['Tabular', 'Text', 'XML', 'XML2', 'JSON2']: 72 | print('ERROR: Only Tabular, Text, XML, XML2, or JSON2 are a supported format_type right now. %s is not supported' %(args.format_type)) 73 | sys.exit(1) 74 | 75 | # import AGEpy and other packages 76 | import AGEpy as age 77 | import pandas as pd 78 | import numpy as np 79 | import time 80 | 81 | 82 | # read in fasta 83 | if args.submitFromFasta: 84 | I = open(args.inputFasta) 85 | FASTA = {} 86 | 87 | while True: 88 | tmp_seqID = I.readline() 89 | tmp_sequence = I.readline() 90 | if not tmp_seqID.startswith('>'): 91 | break 92 | FASTA[tmp_seqID.replace('\n', '')[1:]] = {'sequence': tmp_sequence.replace('\n', '')} 93 | 94 | I.close() 95 | 96 | # open query_output file 97 | queryID_output = open('%s.queryTable.tsv' %(args.outputPrefix), 'w') 98 | queryID_output.write('SequenceID\tuser_prefix\tqueryID\tparameters\n') 99 | 100 | # for each fasta make a query and save queryID 101 | for seq in FASTA: 102 | # initalize BLAST parameters 103 | database = 'nt'; program = 'blastn'; filter=None; format_type=None; expect=None 104 | nucl_reward=None; nucl_penalty=None; gapcosts=None; matrix=None; hitlist_size=None 105 | descriptions=None; alignments=None; ncbi_gi=None; threshold=None 106 | word_size=None; composition_based_statistics=None; organism=None; others=None 107 | num_threads=None; baseURL="http://blast.ncbi.nlm.nih.gov"; verbose=False 108 | # redifine paramters based on user input 109 | params = seq.split(' ')[1:] 110 | for p in params: 111 | exec(p) 112 | # correctly format gapcosts 113 | if gapcosts: 114 | gapcosts = gapcosts.replace(',', ' ') 115 | # submit BLAST 116 | RID=age.BLASTquery(FASTA[seq]['sequence'], database, program, filter=filter,\ 117 | format_type=format_type, expect=expect,\ 118 | nucl_reward=nucl_reward, nucl_penalty=nucl_penalty,\ 119 | gapcosts=gapcosts, matrix=matrix,\ 120 | hitlist_size=hitlist_size, descriptions=descriptions,\ 121 | alignments=alignments, ncbi_gi=ncbi_gi, threshold=threshold,\ 122 | word_size=word_size, 
composition_based_statistics=composition_based_statistics,\ 123 | organism=organism, others=others, num_threads=num_threads, baseURL=baseURL,\ 124 | verbose=verbose) 125 | print(FASTA[seq]['sequence']) 126 | print(RID) 127 | FASTA[seq]['queryID'] = RID 128 | FASTA[seq]['SeqID'] = seq.split(' ')[0] 129 | FASTA[seq]['params'] = seq.split(' ')[1:] 130 | # write query id to log table 131 | queryID_output.write('%s\t%s\t%s\t%s\n' %(seq.split(' ')[0], args.outputPrefix, RID, ' '.join(seq.split(' ')[1:]))) 132 | 133 | queryID_output.close() 134 | print('%s jobs have been submitted.' %(len(FASTA))) 135 | 136 | # exit if -C is not specified 137 | if not args.continueThrough: 138 | print('\nYou can find an overview here: %s.queryTable.tsv' %(args.outputPrefix)) 139 | print "\n\n*************************************\nDeveloped by the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing \n\nbioinformatics@age.mpg.de\n\n" 140 | exit(0) 141 | 142 | print('continuing ...') 143 | 144 | 145 | # read in tsv if the program stopped after submitting ### TODO get prefix from file 146 | if args.checkAndWriteResults: 147 | I = open(args.inputTsv) 148 | FASTA = {} 149 | I.readline() 150 | for line in I: 151 | L = line.replace('\n', '').split('\t') 152 | FASTA['%s %s' %(L[0], L[3])] = {'params' : L[3].split(' '), 'sequence' : '', 'SeqID': L[0], 'queryID': L[2]} 153 | I.close() 154 | 155 | 156 | # check if results are ready and write them if they are ready 157 | while len(FASTA) > 0: 158 | finished = [] 159 | for seq in FASTA: 160 | status, therearehits=age.BLASTcheck(FASTA[seq]['queryID']) 161 | if status == 'READY' and therearehits == 'yes': 162 | r=age.BLASTresults(FASTA[seq]['queryID'], format_type = args.format_type) 163 | if args.format_type == 'Tabular': 164 | r.insert(0, 'query_name', [FASTA[seq]['SeqID']] * r.shape[0]) 165 | r.to_csv('%s_%s.tsv' %(args.outputPrefix, FASTA[seq]['SeqID']), sep = '\t', index = False) 166 | elif format_type.lower() in ['html', 'Text', 'xml', 'xml2', 'json2']: 167 | O = open('%s_%s.%s' %(args.outputPrefix, FASTA[seq]['SeqID'], format_type.lower()), 'w') 168 | O.write(r) 169 | O.close() 170 | else: 171 | print('Only Tabular, Text, XML, XML2, or JSON2 are a supported format_type right now. 
%s is not supported' %(args.format_type)) 172 | finished += [seq] 173 | elif status == 'READY' and therearehits == 'no': 174 | print('Query %s is ready but has no hits' %(FASTA[seq]['SeqID'])) 175 | finished += [seq] 176 | else: 177 | print('Query %s is not ready yet' %(FASTA[seq]['SeqID'])) 178 | for seq in finished: 179 | del FASTA[seq] 180 | if not args.continueThrough: 181 | print('%s jobs are still running' %(len(FASTA))) 182 | print "\n\n*************************************\nDeveloped by the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing \n\nbioinformatics@age.mpg.de\n\n" 183 | exit(0) 184 | print('%s jobs are still running' %(len(FASTA))) 185 | if len(FASTA) > 0: 186 | print('waiting ...') 187 | time.sleep(args.sleepTime) 188 | 189 | print('finished') 190 | 191 | print "\n\n*************************************\nDeveloped by the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing \n\nbioinformatics@age.mpg.de\n\n" 192 | sys.exit() 193 | 194 | 195 | # python blasto -S -f /home/fmetge/Documents/corefacility/AGEpy/test_fasta.fa -o /home/fmetge/Documents/corefacility/AGEpy/test 196 | # python blasto -W -t /home/fmetge/Documents/corefacility/AGEpy/test.queryTable.tsv -o /home/fmetge/Documents/corefacility/AGEpy/test 197 | 198 | # python blasto -S -C -f /home/fmetge/Documents/corefacility/AGEpy/test_fasta.fa -o /home/fmetge/Documents/corefacility/AGEpy/test ... works in theory, but doesnt finit 199 | # python blasto -W -C -t /home/fmetge/Documents/corefacility/AGEpy/test.queryTable.tsv -o /home/fmetge/Documents/corefacility/AGEpy/test 200 | -------------------------------------------------------------------------------- /bin/david: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | sys.stdout.flush() 8 | 9 | parser = argparse.ArgumentParser(description="Queries the DAVID database for an enrichment \ 10 | analysis and plots CellPlots as well as SymPlots (see plots). \ 11 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == 'type' tag and categories == 'annot' tag.", \ 12 | formatter_class = argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument("-i", "--input", help="A file with tab separated values where \ 14 | the first column contains the identifiers to be queried and the second column the \ 15 | respective log2fc for each identifier.") 16 | parser.add_argument("-o", "--output", help="/path/to/output/prefix") 17 | parser.add_argument("-d", "--database", help="a string for the database to query, e.g. 'WORMBASE_GENE_ID'.") 18 | parser.add_argument("-c", "--categories", help="a comma separated list of categories.",\ 19 | default='GOTERM_BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY,BIOCARTA,PFAM,PROSITE') 20 | parser.add_argument("-u", "--user", help="a user ID registered at DAVID for querying") 21 | parser.add_argument("-v", "--verbose", help="Print more.",default=None, action="store_true") 22 | parser.add_argument("-p", "--pvalue", help="Maximum p value for enrichment of a term.", default=0.1) 23 | parser.add_argument("-n", "--ngenes", help="Minimum number of genes within a term.", default=2) 24 | parser.add_argument("-b", "--background", help="A file with tab separated values where \ 25 | the first column contains the identifiers to used as a background. 
\ 26 | None for whole DAVID database as background.", default=None) 27 | args = parser.parse_args() 28 | 29 | import pandas as pd 30 | import AGEpy as age 31 | 32 | df_ids=pd.read_csv(args.input, sep = '\t') 33 | if args.background: 34 | df_ids_bg=pd.read_csv(args.background, sep = '\t') 35 | else: 36 | df_ids_bg=None 37 | 38 | #categories=args.categories.split(",") 39 | 40 | age.DAVIDplot(args.database, args.categories, args.user, df_ids, args.output, \ 41 | df_ids_bg = df_ids_bg, name = '', name_bg = '', verbose = args.verbose, \ 42 | p = args.pvalue, n = args.ngenes) 43 | -------------------------------------------------------------------------------- /bin/obo2tsv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="obo to tsv parser", formatter_class = argparse.ArgumentDefaultsHelpFormatter) 6 | parser.add_argument("-i", "--input",help="go-basic.obo file. Files can be downloaded from http://geneontology.org/page/download-ontology.") 7 | parser.add_argument("-u", "--url", help="If no go-basic.obo input file is specified, a url to a target obo file can be specified instead.", default='http://geneontology.org/ontology/go-basic.obo') 8 | parser.add_argument("-o", "--output", help="Name of output tab separated file.", default="go-basic.tsv") 9 | parser.add_argument("-c", "--cpus", help="Number of cpus.", default=36) 10 | parser.add_argument("--organism", help="Optional, merge GO obo.tsv with a GO annotation for an organism: either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective downloded .gz file.",default=None) 11 | args = parser.parse_args() 12 | 13 | import pandas as pd 14 | import numpy as np 15 | import multiprocessing as mp 16 | import sys 17 | import contextlib 18 | import cStringIO 19 | @contextlib.contextmanager 20 | def nostdout(): 21 | save_stdout = sys.stdout 22 | sys.stdout = cStringIO.StringIO() 23 | yield 24 | sys.stdout = save_stdout 25 | with nostdout(): 26 | import AGEpy.AGEpy as age 27 | #age.checkImport() 28 | 29 | 30 | n_processors=int(args.cpus) 31 | 32 | if args.input: 33 | f=open(args.input) 34 | lis=f.readlines() 35 | r=args.input 36 | elif args.url: 37 | from urllib import urlopen 38 | f=urlopen(args.url).read().split("\n") 39 | lis=[str(x)+"\n" for x in f] 40 | r=args.url 41 | 42 | print "Finished importing %s" %r 43 | sys.stdout.flush() 44 | 45 | def getTerm(i,lines): 46 | term={} 47 | cats=[] 48 | GOid=lines[i].split("\n")[0].split(": ")[1] 49 | i+=1 50 | while lines[i] != "\n": 51 | line=lines[i].split("\n")[0].split(": ") 52 | cats.append(line[0]) 53 | if not line[0] in term.keys(): 54 | term[line[0]]=line[1] 55 | else: 56 | nval=term[line[0]]+"; "+line[1] 57 | term[line[0]]=nval 58 | i+=1 59 | return i+1, GOid, term, cats 60 | 61 | def collectUpper(x,GOdic): 62 | allUpper=[] 63 | is_a=GOdic[x]["is_a"] 64 | is_a=is_a.split("; ") 65 | is_a=[ s.split(" ! 
")[0] for s in is_a] 66 | return is_a 67 | 68 | def checkTop(x,df): 69 | name_spaces=set(df["namespace"].tolist()) 70 | if len(x) == 1: 71 | if x[0] in name_spaces: 72 | return True 73 | else: 74 | return False 75 | else: 76 | return False 77 | 78 | i=0 79 | GOdic={} 80 | cats=[] 81 | while i < len(lis): 82 | l=lis[i] 83 | if '[Term]' in l: 84 | i, GOid, term, c=getTerm(i+1,lis) 85 | GOdic[GOid]=term 86 | cats.append(c) 87 | 88 | else: 89 | i+=1 90 | cats=[item for sublist in cats for item in sublist] 91 | 92 | df=pd.DataFrame.from_dict(GOdic,orient="index") 93 | 94 | print "Collecting information on parent terms" 95 | sys.stdout.flush() 96 | 97 | for GO in GOdic.keys(): 98 | allUpper=[] 99 | if "is_a" in GOdic[GO].keys(): 100 | upper=collectUpper(GO,GOdic) 101 | allUpper.append(upper) 102 | while not checkTop(upper,df): 103 | sub=[] 104 | for u in upper: 105 | if "is_a" in GOdic[u].keys(): 106 | upper_=collectUpper(u,GOdic) 107 | sub.append(upper_) 108 | if len(sub)>0: 109 | sub=[item for sublist in sub for item in sublist] 110 | allUpper.append(sub) 111 | upper=sub 112 | else: 113 | break 114 | allUpper=list(set([item for sublist in allUpper for item in sublist])) 115 | allUpper="; ".join(allUpper) 116 | GOdic[GO]["parent_terms"]=allUpper 117 | 118 | #import json 119 | #with open("/beegfs/group_bit/home/JBoucas/GO_test/out.dic", 'w') as configfile: 120 | # json.dump(GOdic, configfile) 121 | #import json 122 | #with open("/beegfs/group_bit/home/JBoucas/GO_test/out.dic", 'r') as configfile: 123 | # GOdic=json.load(configfile) 124 | 125 | df=pd.DataFrame.from_dict(GOdic,orient="index") 126 | df["term"]=df.index.tolist() 127 | df=df.reset_index(drop=True) 128 | 129 | def getChildren(x,dfn=df): 130 | children=[] 131 | for i in range(len(dfn)): 132 | if str(dfn.ix[i,"parent_terms"])!="nan": 133 | if x in dfn.ix[i,"parent_terms"]: 134 | children.append(dfn.ix[i,"term"]) 135 | if len(children)>1: 136 | children="; ".join(children) 137 | elif len(children)==1: 138 | children=str(children[0]) 139 | else: 140 | children="None" 141 | return children 142 | 143 | def worker(df): 144 | df=pd.DataFrame(df) 145 | df["children"]=df['term'].apply(getChildren) 146 | df["result"]=df["term"].astype(str)+"-"+df["children"].astype(str) 147 | res=df["result"].tolist() 148 | return res 149 | 150 | def correctNones(x): 151 | if x == "None": 152 | return np.nan 153 | else: 154 | return x 155 | 156 | 157 | if __name__ == '__main__': 158 | print "Collecting information on child terms" 159 | sys.stdout.flush() 160 | 161 | reader = np.array_split(df,n_processors) 162 | pool = mp.Pool(n_processors) 163 | funclist = [] 164 | for d in reader: 165 | out_put = pool.apply_async(worker,[d]) 166 | funclist.append(out_put) 167 | 168 | dfCov=pd.DataFrame() 169 | for f in funclist: 170 | covs_=f.get() 171 | covs_=pd.DataFrame(covs_,index=range(len(covs_))) 172 | dfCov=pd.concat([dfCov,covs_],axis=0) 173 | dfCov.columns=['chil'] 174 | TSS=pd.DataFrame(dfCov['chil'].str.split("-").tolist()) 175 | TSS.columns=["term","children"] 176 | TSS["children"]=TSS["children"].apply(lambda x: correctNones(x) ) 177 | df=pd.merge(df,TSS,on="term",how="outer") 178 | col=['term','synonym','name', 'relationship', 'namespace', 'is_a', 'def', 'subset', 'comment', 'xref', 'is_obsolete', 'consider', 'alt_id','replaced_by', 'parent_terms', 'children'] 179 | df=df[col] 180 | if args.organism: 181 | org=age.getGeneAssociation(args.organism) 182 | if "GO ID" in org.columns.tolist(): 183 | df=pd.merge(org,df,left_on=["GO ID"], right_on=["term"],how="left") 
184 | else: 185 | check=org.ix[0] 186 | for i in range(len(check)): 187 | if "GO:" in check[i]: 188 | break 189 | df=pd.merge(org,df,left_on=[i], right_on=["term"],how="left") 190 | df.to_csv(args.output,sep="\t",index=None) 191 | print "Done" 192 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | from recommonmark.parser import CommonMarkParser 2 | 3 | source_parsers = { 4 | '.md': CommonMarkParser, 5 | } 6 | 7 | source_suffix = ['.rst', '.md'] 8 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/.DS_Store -------------------------------------------------------------------------------- /docs/cookbook.md: -------------------------------------------------------------------------------- 1 | ### Importing 2 | 3 | All functions in the AGEpy pakcage can be accessed using: 4 | 5 | ```python 6 | import AGEpy as age 7 | help(age.readGTF) 8 | ``` 9 | 10 | Alternatively, functions from the different modules can be accessed with for example: 11 | 12 | ```python 13 | from AGEpy import gtf 14 | help(gtf.readGTF) 15 | ``` 16 | 17 | ### Help 18 | 19 | In bash: 20 | 21 | ```bash 22 | pydoc AGEpy.AGEpy 23 | ``` 24 | 25 | In python: 26 | 27 | ```python 28 | help("AGEpy.AGEpy") 29 | ``` 30 | 31 | ### Example usage 32 | 33 | ```python 34 | import AGEpy as age 35 | 36 | gtf=age.readGTF("/path/to/file.gtf") 37 | 38 | gtf.head() 39 | ``` 40 | -------------------------------------------------------------------------------- /docs/executables/abed.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | `abed` is annotation tool for bed files. 4 | 5 | It annotates bed files with gene names, gene ids, and feature information from a 6 | provided annotations file (GTF). 7 | 8 | ## Examples 9 | 10 | ``` 11 | $ abed -b K27AC_chip1_peaks.bed -g hg38.83.gtf -s hg38.83.genome \ 12 | -c "chr,start,end,name,signal_value,strand,fold_change,p_value,Benjamini_Hochberg_FDR" \ 13 | -p 1000,200 -o annotated.bed.tsv 14 | ``` 15 | 16 | ## Output files 17 | 18 | * **`annotated.bed.tsv`** 19 | 20 | ``` 21 | chr start end name signal_value strand fold_change p_value Benjamini_Hochberg_FDR annotated_gene_features 22 | 1 1205710 1205930 chip1_peak_2799 241.68113891 . 12.1486518993 1.18025200195e-06 0.00504618822857 TNFRSF18/ENSG00000186891: promoter 23 | 1 1616560 1616780 chip1_peak_5889 71.050614487 . 3.57152066778 5.81256160902e-06 0.0113928182561 RP11-345P4.9/ENSG00000272106: promoter; MIB2/ENSG00000197530: five_prime_utr, exon, promoter, CDS 24 | 1 1892440 1892660 chip1_peak_3527 243.582136289 . 12.2442098543 1.93910073064e-06 0.00651852531479 RP1-140A9.1/ENSG00000231050: exon 25 | 1 2212540 2212870 chip1_peak_25 81.3040545107 . 4.08693314134 9.22431065693e-12 4.99354651514e-06 FAAP20/ENSG00000162585: exon, promoter; RP11-181G12.4/ENSG00000234396: exon, promoter 26 | 1 3712500 3712720 chip1_peak_6234 38.8954679096 . 1.95516912169 6.52541908518e-06 0.0120595193967 TP73/ENSG00000078900; RP5-1092A11.2/ENSG00000235131 27 | 1 3772780 3773000 chip1_peak_4768 120.93909338 . 6.0792784787 3.69904369916e-06 0.00905667169324 SMIM1/ENSG00000235169: five_prime_utr, exon, promoter 28 | 1 4680280 4680500 chip1_peak_7707 110.246753848 . 
5.54180372353 9.96101899764e-06 0.014735318932 AJAP1/ENSG00000196581 29 | 1 5652020 5652240 chip1_peak_526 145.04841094 . 7.29118813738 2.00633103721e-08 0.000477258277665 RP11-154H17.1/ENSG00000236948 30 | 1 6330720 6330940 chip1_peak_7213 153.651918104 . 7.72366298467 8.71110008982e-06 0.0138290963917 ACOT7/ENSG00000097021 31 | 1 6362730 6362950 chip1_peak_7153 87.6949697748 . 4.40818702657 8.54621275409e-06 0.0136908694171 ACOT7/ENSG00000097021 32 | 1 6421360 6421580 chip1_peak_5440 279.339262378 . 14.0416230896 4.90344812744e-06 0.0104638228061 HES2/ENSG00000069812 33 | 1 6423890 6424220 chip1_peak_3597 398.194019503 . 20.0161276677 2.01692509039e-06 0.00664985446589 ESPN/ENSG00000187017: promoter; HES2/ENSG00000069812 34 | 1 6859710 6860040 chip1_peak_53 79.510332304 . 3.99676761667 4.53361213715e-11 1.10018291319e-05 CAMTA1/ENSG00000171735 35 | 1 7400250 7400470 chip1_peak_5957 122.230907584 . 6.14421445654 5.936688264e-06 0.011503154056 CAMTA1/ENSG00000171735 36 | 1 7705060 7705390 chip1_peak_1184 96.6376920702 . 4.85771329365 1.53921999586e-07 0.001601221552 CAMTA1/ENSG00000171735 37 | 1 7745650 7745870 chip1_peak_746 105.6964562 . 5.31307266734 4.97095836632e-08 0.000833924420617 CAMTA1/ENSG00000171735: exon, CDS 38 | ``` 39 | 40 | ## Help 41 | 42 | ``` 43 | $ abed --help 44 | 45 | usage: abed [-h] [-b BED] [-g GTF] [-s SIZES] [-c COLUMNS] [-p PROMOTER] 46 | [-o OUTPUT] 47 | 48 | abed is an annotation tool for bed files. 49 | 50 | optional arguments: 51 | -h, --help show this help message and exit 52 | -b BED, --bed BED /path/to/file.bed (default: None) 53 | -g GTF, --gtf GTF /path/to/file.gtf (default: None) 54 | -s SIZES, --sizes SIZES 55 | /path/to/file.genome. Tab separated values of 56 | 'chromosome name' and 'size' information. (default: 57 | None) 58 | -c COLUMNS, --columns COLUMNS 59 | A comma separated string of column headers to use when 60 | reading in the bed file. eg.: 'chr,start,end,name'. 61 | (default: None) 62 | -p PROMOTER, --promoter PROMOTER 63 | A comma separated list containing the upstream start 64 | of the promoter region from the TSS and the downstream 65 | end of the promoter region from the TSS. eg.: 66 | '1000,200'. (default: None) 67 | -o OUTPUT, --output OUTPUT 68 | /path/to/output.tsv. (default: None) 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/executables/adiff.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | `aDiff` is an annotation tool for differential gene expression results generated by ***cuffdiff*** (Trapnell C., *Nature Biotechnology*, 2012). 4 | 5 | It annotates *cuffdiff* outputs with ensembl gene ids, gene ontology terms and kegg ids. 6 | 7 | Additonally it uses ***DAVID***s API (Huang DW, *Nature Protoc.*, 2009; Huang DW, *Nucleic Acids Res.*, 2009; Xiaoli J, *Bioinformatics*, 2012) to perform enrichment analysis. 8 | 9 | A ***Cytoscape*** (Shannon P, *Genome Research*, 2003) instance running with the ***String*** (Szklarczyk D, *Nucleic Acids Res.*, 2017) App installed can additionally be plugged in to generate expanded protein-protein interactions. 10 | 11 | For a full RNAseq pipeline including `aDiff` check: [http://bioinformatics.age.mpg.de/presentations-tutorials/presentations/modules/rnaseq-tuxedo-update/#/intro](http://bioinformatics.age.mpg.de/presentations-tutorials/presentations/modules/rnaseq-tuxedo-update/#/intro) 12 | 13 | ## Examples 14 | 15 | Example of an `aDiff` call on a *c. 
elegans* dataset: 16 | 17 | ``` 18 | $ aDiff -D -i cuffdiff_output -o adiff_output \ 19 | -G references/cel.latest.ensembl.gtf \ 20 | -C cuffmerge_output/merged.gtf \ 21 | --DAVIDuser "" \ 22 | --organismtag CEL \ 23 | --cytoscape_host 'localhost' \ 24 | --cytoscape_port 1234 25 | ``` 26 | 27 | Example of an `aDiff` call on a *d. melanogaster* dataset: 28 | 29 | ``` 30 | $ aDiff -D -i cuffdiff_output -o adiff_output \ 31 | -G references/Drosophila_melanogaster.BDGP6.90.gtf \ 32 | -C cuffmerge_output/merged.gtf \ 33 | --dataset dmelanogaster_gene_ensembl \ 34 | --filter flybase_gene_id \ 35 | --outputBiotypes 'flybase_gene_id gene_biotype' \ 36 | --outputGoterms 'flybase_gene_id go_id name_1006' \ 37 | --DAVIDid FLYBASE_GENE_ID \ 38 | --DAVIDuser "" \ 39 | --organismtag DMEL \ 40 | --species 'drosophila melanogaster' \ 41 | --cytoscape_host 'localhost' \ 42 | --cytoscape_port 1234 43 | ``` 44 | 45 | Example of an `aDiff` call on a *mus musculus* dataset: 46 | 47 | ``` 48 | $ aDiff -i cufdiff_output -o adiff_output \ 49 | -G ensembl.mus_musculus.83.original.gtf \ 50 | -C cuffmerge_output/merged.gtf \ 51 | --TSV \ 52 | --dataset mmusculus_gene_ensembl \ 53 | -u "" \ 54 | --DAVIDid ENSEMBL_GENE_ID \ 55 | --host http://dec2015.archive.ensembl.org/biomart \ 56 | --organismtag MUS \ 57 | --species 'mus musculus' \ 58 | --cytoscape_host 'localhost' \ 59 | --cytoscape_port 1234 60 | ``` 61 | 62 | Example of an `aDiff` call on a *h. sapiens* dataset: 63 | 64 | ``` 65 | $ aDiff -i cufdiff_output -o adiff_output \ 66 | -G ensembl.homo_sapiens.83.original.gtf \ 67 | -C cuffmerge_output/merged.gtf \ 68 | --TSV \ 69 | --dataset hsapiens_gene_ensembl \ 70 | -u "" \ 71 | --DAVIDid ENSEMBL_GENE_ID \ 72 | --host http://dec2015.archive.ensembl.org/biomart \ 73 | --organismtag HSA \ 74 | --species 'homo sapiens' \ 75 | --cytoscape_host 'localhost' \ 76 | --cytoscape_port 1234 77 | ``` 78 | 79 | ## Output files 80 | 81 | Example of the output for the the *h. sapiens* call above. 82 | 83 | * **`diff_sig_geneexp.xlsx`** this file reports significant differential gene expression. It is based on the *gene_exp.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 84 | 85 | * **`diff_sig_iso.xlsx`** this file reports significant differential isoform expression . It is based on the *isoform_exp.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 86 | 87 | * **`diff_sig_prom.xlsx`** this file reports significant differential promoter usage. It is based on the *promoters.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 88 | 89 | * **`diff_sig_splic.xlsx`** this file reprots significant differential splicing . It is based on the *splicing.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 90 | 91 | * **`diff_sig_cds.xlsx`** this file reports significant differential cds usage. It is based on the *cds.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 
92 | 93 | * **`geneexp_ALL.tsv`** this file is based on the *gene_exp.diff* file output of *cuffdiff* adding annotation columns to it. 94 | 95 | * **`iso_ALL.tsv`** this file is based on the *isoform_exp.diff* file output of *cuffdiff* adding annotation columns to it. 96 | 97 | * **`prom_ALL.tsv`** this file is based on the *promoters.diff* file output of *cuffdiff* adding annotation columns to it. 98 | 99 | * **`splic_ALL.tsv`** this file is based on the *splicing.diff* file output of *cuffdiff* adding annotation columns to it. 100 | 101 | * **`cds_ALL.tsv`** this file is based on the *cds.diff* file output of *cuffdiff* adding annotation columns to it. 102 | 103 | * **`diff_p.05.xlsx`** contains a sheet for each of the files above (ie. *geneexp_ALL.tsv*, *iso_ALL.tsv*, *prom_ALL.tsv*, *splic_ALL.tsv*, *cds_ALL.tsv* ) subset to p values below 0.05. 104 | 105 | * **`KEGG_PATHWAY_diff_sig_geneexp.xlsx`** this file is based on the *gene_exp.diff* file output of *cuffdiff*. It generates a result sheet for each pairwise comparison. It reports DAVID enrichment results for KEGG using genes labeled as significant by *cuffdiff*. 106 | 107 | * **`GOTERM_BP_FAT_diff_sig_splic.xlsx`** this file is based on the *splicing.diff* file output of *cuffdiff*. It generates a result sheet for each pairwise comparison. It reports DAVID enrichment results for Gene Ontology Biological Process (GOTERM BP) using genes labeled as significant by *cuffdiff*. 108 | 109 | * **`OMIM_DISEASE_diff_sig_geneexp.xlsx`** this file is based on the *gene_exp.diff* file output of *cuffdiff*. It generates a result sheet for each pairwise comparison. It reports DAVID enrichment results for OMIM DISEASE using genes labeled as significant by *cuffdiff*. 110 | 111 | DAVID output columns: 112 | 113 | * **categoryName**: Category name. eg.: GOTERM_BP_FAT. 114 | 115 | * **termName**: Term name. eg.: GO:0048468~cell development. 116 | 117 | * **listHits**: Number of items in the query list matching this term. 118 | 119 | * **percent**: Percentage of items in the query list matching this term. 120 | 121 | * **ease**: EASE test p value. 122 | 123 | * **geneIds**: gene ids. 124 | 125 | * **Gene_name**: gene name. 126 | 127 | * **listTotals**: number of genes in query list. 128 | 129 | * **popHits**: number of genes in background population list matching this term. 130 | 131 | * **popTotals**: number of genes in background population list. 132 | 133 | * **foldEnrichment**: Fold enrichment. 134 | 135 | * **bonferroni**: Bonferroni corrected p values. 136 | 137 | * **benjamini**: Benjamini-Hochberg corrected p values. 138 | 139 | * **afdr**: False discovery rate. 140 | 141 | More information on the standard output columns of *cuffdiff* can be found [here](http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/index.html). 142 | 143 | The `cytoscape` folder contains cytoscape session files `cys`, as well as `pdf`s and `png`s of the generated networks. Networks are generated by String PPI queries allowing a 25% size expansion and a confidence cutoff of 0.4. It also generates a subnetwork by ranking the genes by abs(log2(fold change)) and selecting the top 10% of nodes with edges and the respective first neighbours, as well as the same 10% selection but using diffusion. Node color maps log2(fold change) - blue down, red up - while node border color and size map normalized expression.
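The per-comparison sheets in these workbooks can be loaded back into Python for further filtering. Below is a minimal sketch, assuming `pandas` with an Excel engine (e.g. `openpyxl`) is installed; the `adiff_output` prefix follows the example calls above, and the `log2(fold_change)` column name follows *cuffdiff*'s standard output.

```python
import pandas as pd

# sheet_name=None returns a {comparison_name: DataFrame} dictionary,
# one entry per pairwise comparison reported by cuffdiff.
sheets = pd.read_excel("adiff_output/diff_sig_geneexp.xlsx", sheet_name=None)

for comparison, df in sheets.items():
    # keep significant genes with an absolute log2 fold change above 1
    strong = df[df["log2(fold_change)"].abs() > 1]
    print(comparison, len(strong), "genes")
```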
144 | 145 | ## Help 146 | 147 | ``` 148 | $ aDiff --help 149 | 150 | aDiff is an annotation tool for differential gene expression results generated 151 | by cuffdiff (Trapnell C., Nature Biotechnology, 2012). 152 | 153 | usage: aDiff [-h] [-D] [-i INPUTFOLDER] [-o OUTPUTFOLDER] [-G ORIGINALGTF] 154 | [-C CUFFCOMPAREGTF] [-f INPUTFILES] [-s SHORTOUTPUTNAME] 155 | [--sigOnly] [--TSV] [--TSVall] [--description] [--listMarts] 156 | [--mart MART] [--listDatasets] [--dataset DATASET] 157 | [--listFilters] [--filter FILTER] [--listAttributes] 158 | [--outputBiotypes OUTPUTBIOTYPES] [--outputGoterms OUTPUTGOTERMS] 159 | [--KEGG] [--listKEGGorganisms] [--KEGGorg KEGGORG] [--findKEGGdb] 160 | [--KEGGdb KEGGDB] [--DAVIDid DAVIDID] [--DAVIDcat DAVIDCAT] 161 | [-u DAVIDUSER] [--host HOST] [--organismtag {DMEL,CEL,MUS,HSA}] 162 | [--species SPECIES] [--limit LIMIT] [--cuttoff CUTTOFF] 163 | [--taxon TAXON] [--cytoscape_host CYTOSCAPE_HOST] 164 | [--cytoscape_port CYTOSCAPE_PORT] 165 | 166 | optional arguments: 167 | -h, --help show this help message and exit 168 | -D, --DAVID Use this flag to perform DAVID GO enrichment analysis 169 | (default: False) 170 | -i INPUTFOLDER, --inputFolder INPUTFOLDER 171 | Cuffdiff output folder (default: None) 172 | -o OUTPUTFOLDER, --outputFolder OUTPUTFOLDER 173 | Output folder (default: None) 174 | -G ORIGINALGTF, --originalGTF ORIGINALGTF 175 | Original/downloaded GTF (default: None) 176 | -C CUFFCOMPAREGTF, --cuffcompareGTF CUFFCOMPAREGTF 177 | Merged cuffcompared GTF (default: None) 178 | -f INPUTFILES, --inputFiles INPUTFILES 179 | Implies -s. Use this option to select which *.diff 180 | files you wish to analyse.'. (default: gene_exp.diff 181 | promoters.diff splicing.diff cds.diff 182 | isoform_exp.diff) 183 | -s SHORTOUTPUTNAME, --shortOutputName SHORTOUTPUTNAME 184 | Use this option to select a short outpput name for 185 | each *.diff file used in '-f'. No '.' (dots) allowed. 186 | (default: geneexp prom splic cds iso) 187 | --sigOnly Only create report tables for cuffdiff-labeled 188 | significantly changed genes (default: False) 189 | --TSV For p values > = 0.05 write tables as tab separated 190 | values (default: False) 191 | --TSVall Save p < 0.05 save tables as tab separated values in a 192 | folder called TSV (default: False) 193 | --description Get a description of what this script does. (default: 194 | False) 195 | --listMarts List biomaRt Marts (default: False) 196 | --mart MART Your mart of choice. (default: ENSEMBL_MART_ENSEMBL) 197 | --listDatasets List datasets for your mart (default: False) 198 | --dataset DATASET Dataset of your choice. (default: 199 | celegans_gene_ensembl) 200 | --listFilters List available filters (default: False) 201 | --filter FILTER Filter to use to identify your genes. (default: 202 | ensembl_gene_id) 203 | --listAttributes List available attributes for your dataset. (default: 204 | False) 205 | --outputBiotypes OUTPUTBIOTYPES 206 | Outputs/attributes for your biotypes data. Order has 207 | to be kept, ie. first IDs then biotype. (default: 208 | ensembl_gene_id gene_biotype) 209 | --outputGoterms OUTPUTGOTERMS 210 | Outputs/attributes for your goterms data. Order has to 211 | be kept, ie. 1st gene_id, then go_id, then 212 | go_term_name (default: ensembl_gene_id go_id 213 | name_1006) 214 | --KEGG Add KEGG annotations (default: False) 215 | --listKEGGorganisms List KEGG organisms. (default: False) 216 | --KEGGorg KEGGORG KEGG organism. (default: cel) 217 | --findKEGGdb KEGG has DB identifier for each linked DB. 
Use this 218 | function to find the label of your DB, eg: 'ensembl- 219 | hsa', 'FlyBase'. This option requires --originalGTF 220 | and --KEGGorg (default: False) 221 | --KEGGdb KEGGDB KEGG database linked to your ensembl organism. 222 | (default: EnsemblGenomes-Gn) 223 | --DAVIDid DAVIDID DAVID's id for your dataset. List of ids available in 224 | http://david.abcc.ncifcrf.gov/content.jsp?file=DAVID_A 225 | PI.html#input_list (default: WORMBASE_GENE_ID) 226 | --DAVIDcat DAVIDCAT DAVID's categories you wish to analyse. List of 227 | available categories in https://david.ncifcrf.gov/cont 228 | ent.jsp?file=DAVID_API.html#approved_list. (default: G 229 | OTERM_BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY, 230 | PFAM,PROSITE,GENETIC_ASSOCIATION_DB_DISEASE,OMIM_DISEA 231 | SE) 232 | -u DAVIDUSER, --DAVIDuser DAVIDUSER 233 | Your DAVID's user id. example: 'John.Doe@age.mpg.de' 234 | (default: None) 235 | --host HOST Ensembl host. Check http://www.ensembl.org/info/websit 236 | e/archives/index.html for older releases. (default: 237 | http://www.ensembl.org/biomart) 238 | --organismtag {DMEL,CEL,MUS,HSA} 239 | Organism tag. (default: None) 240 | --species SPECIES Species for string app query. eg. 'caenorhabditis 241 | elegans', 'drosophila melanogaster', 'mus musculus', 242 | 'homo sapiens'. Default='caenorhabditis elegans' 243 | (default: caenorhabditis elegans) 244 | --limit LIMIT Limit for string app query. Number of extra genes to 245 | recover. If None, limit=N(query_genes)*.25 (default: 246 | None) 247 | --cuttoff CUTTOFF Confidence cuttoff for sting app query. Default=0.4 248 | (default: 0.4) 249 | --taxon TAXON Taxon id for string app query. For the species shown 250 | above, taxon id will be automatically identified. 251 | (default: None) 252 | --cytoscape_host CYTOSCAPE_HOST 253 | Host address for cytoscape. (default: None) 254 | --cytoscape_port CYTOSCAPE_PORT 255 | Cytoscape port. (default: None) 256 | ``` 257 | -------------------------------------------------------------------------------- /docs/executables/blasto.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | This module will load a fasta formatted file and query each fasta sequence for blast. 4 | The user may add blast parameters as space separated list after the sequence name. All queries are 5 | listed into a log table. 
The user can either let the program running while waiting for the results 6 | using the -C option, or quit and check if the results are ready later using -W -t 7 | 8 | ## Examples 9 | 10 | ```bash 11 | $cat input.fa 12 | 13 | >sequence1 14 | GCGAAGCCCAAGAGGATGAAGCCAGAGATGGTGTTGGAGTTGCTGGGGCTGCTGAGGGTATTGATCTGTCTGTGACCTGCGATAGCATCAGAAGTTGTTTCACATTCTAGTTATAGCTGAGGGAGGTTATGTTTTGAGCAAGCAGGAAAC 15 | >Sequence2 16 | AGCTCCTGAGAAACTTGGGGGGCGCGACACAGATAGGGTGAAAGCAGAGTGATAGACCTGGGATGGTTACGGGACCAAGGGAAGACCAGGCTGGTTGGCATACACCGGTGAACGGATGGGAGTCCTAGGGAAAGATGATGCGCCTAACAG 17 | >sequence2_filtered database='nt' filter="T" nucl_penalty=-5 gapcosts='1,11' 18 | AGCTCCTGAGAAACTTGGGGGGCGCGACACAGATAGGGTGAAAGCAGAGTGATAGACCTGGGATGGTTACGGGACCAAGGGAAGACCAGGCTGGTTGGCATACACCGGTGAACGGATGGGAGTCCTAGGGAAAGATGATGCGCCTAACAG 19 | >sequence3 20 | TCGTTTGATTCTGCAAGCAGCACCTACTGTGGGGTATTGATAAGATCTCTGATGGCGTCTGAAATTCTTCTGAGATTAGAGGAAGATCAGGTGTGTTTTAATGTCGAGCAGGTGTTTCCCCAAGATTAGTGGGGGGATTCGGTTTTTCCT 21 | 22 | $blasto -S -f /usr/home/JDoe/project1/input.fa -o /usr/home/JDoe/project1/run1 23 | $blasto -W -t /usr/home/JDoe/project1/run1.queryTable.tsv -o /usr/home/JDoe/project1/run1 24 | ``` 25 | 26 | ## Help 27 | ```bash 28 | 29 | usage: blasto [-h] [-S] [-C] [-W] [-f INPUTFASTA] [-t INPUTTSV] 30 | [-o OUTPUTPREFIX] [--format_type FORMAT_TYPE] 31 | [--sleepTime SLEEPTIME] [--description] 32 | 33 | This module will load a fasta formatted file and query each fasta sequence for 34 | blast The user may add blast parameters as space separated list after the 35 | sequence name. All queries are listed into a log table. The user can either 36 | let the program running while waiting for the results using the -C option, or 37 | quit and check if the results are ready later using -W -t 38 | 39 | optional arguments: 40 | -h, --help show this help message and exit 41 | -S, --submitFromFasta 42 | Read in fasta file and submit blast queries. Write out 43 | submitted query IDs. (default: False) 44 | -C, --continueThrough 45 | Read from fasta file, submit and continue checking. 46 | Write results when they are ready and exit after all 47 | results are finished. (default: False) 48 | -W, --checkAndWriteResults 49 | Read query IDs from tsv and check status. If results 50 | are ready, collect and safe. (default: False) 51 | -f INPUTFASTA, --inputFasta INPUTFASTA 52 | Fasta formatted input file containing one or more 53 | input sequences. The sequence name may contain 54 | additional blast paramers, (default: ) 55 | -t INPUTTSV, --inputTsv INPUTTSV 56 | Tab separated input file containing sequence IDs, 57 | output prefix, query IDs, query arguments. (default: ) 58 | -o OUTPUTPREFIX, --outputPrefix OUTPUTPREFIX 59 | Output prefix. All files will start with this prefix, 60 | blast output files will be written two 61 | _. (default: ) 62 | --format_type FORMAT_TYPE 63 | format of the blast output (default: Tabular) 64 | --sleepTime SLEEPTIME 65 | time to wait before checking again if your jobs are 66 | done, only active if -C is on (default: 60) 67 | --description Get a description of what this script does. (default: 68 | False) 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/executables/david.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | Queries the DAVID database for an enrichment analysis and plots CellPlots as well as SymPlots (see plots). 
4 | 5 | ## Examples 6 | 7 | ```bash 8 | $ cat input.tsv 9 | 10 | ensembl_gene_id log2(fold_change) 11 | ENSG00000272449 1.859500 12 | ENSG00000130762 0.601051 13 | ENSG00000083444 -0.881957 14 | ENSG00000162493 -0.638433 15 | ENSG00000253368 0.654517 16 | 17 | $ david -i input.tsv -o /usr/home/JDoe/project1/datasetA -d ENSEMBL_GENE_ID -u 'email.registered@david.com' 18 | ``` 19 | 20 | ## Help 21 | 22 | ```bash 23 | $ david --help 24 | 25 | usage: david [-h] [-i INPUT] [-o OUTPUT] [-d DATABASE] [-c CATEGORIES] 26 | [-u USER] [-v] [-p PVALUE] [-n NGENES] [-b BACKGROUND] 27 | 28 | Queries the DAVID database for an enrichment analysis and plots CellPlots as 29 | well as SymPlots (see plots). Check 30 | https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == 31 | 'type' tag and categories == 'annot' tag. 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -i INPUT, --input INPUT 36 | A file with tab separated values where the first 37 | column contains the identifiers to be queried and the 38 | second column the respective log2fc for each 39 | identifier. (default: None) 40 | -o OUTPUT, --output OUTPUT 41 | /path/to/output/prefix (default: None) 42 | -d DATABASE, --database DATABASE 43 | a string for the database to query, e.g. 44 | 'WORMBASE_GENE_ID'. (default: None) 45 | -c CATEGORIES, --categories CATEGORIES 46 | a comma separated list of categories. (default: GOTERM 47 | _BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY,BIOCA 48 | RTA,PFAM,PROSITE) 49 | -u USER, --user USER a user ID registered at DAVID for querying (default: 50 | None) 51 | -v, --verbose Print more. (default: None) 52 | -p PVALUE, --pvalue PVALUE 53 | Maximum p value for enrichment of a term. (default: 54 | 0.1) 55 | -n NGENES, --ngenes NGENES 56 | Minimum number of genes within a term. (default: 2) 57 | -b BACKGROUND, --background BACKGROUND 58 | A file with tab separated values where the first 59 | column contains the identifiers to used as a 60 | background. None for whole DAVID database as 61 | background. (default: None) 62 | ``` 63 | -------------------------------------------------------------------------------- /docs/executables/obo2tsv.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | `obo2tsv` parses a gene ontology obo file to tsv. It will include for each term columns for parent terms as well as child terms. 4 | 5 | ## Examples 6 | 7 | ``` 8 | $ obo2tsv -u http://geneontology.org/ontology/go-basic.obo \ 9 | -o go-basic.tsv -c 4 \ 10 | --organism http://geneontology.org/gene-associations/gene_association.fb.gz 11 | ``` 12 | 13 | Links to other `--organism` can be found on [http://geneontology.org/page/download-annotations](http://geneontology.org/page/download-annotations). 14 | 15 | ## Help 16 | 17 | ``` 18 | $ obo2tsv --help 19 | 20 | usage: obo2tsv [-h] [-i INPUT] [-u URL] [-o OUTPUT] [-c CPUS] 21 | [--organism ORGANISM] 22 | 23 | obo to tsv parser 24 | 25 | optional arguments: 26 | -h, --help show this help message and exit 27 | -i INPUT, --input INPUT 28 | go-basic.obo file. Files can be downloaded from 29 | http://geneontology.org/page/download-ontology. 30 | (default: None) 31 | -u URL, --url URL If no go-basic.obo input file is specified, a url to a 32 | target obo file can be specified instead. (default: 33 | http://geneontology.org/ontology/go-basic.obo) 34 | -o OUTPUT, --output OUTPUT 35 | Name of output tab separated file. (default: go- 36 | basic.tsv) 37 | -c CPUS, --cpus CPUS Number of cpus. 
(default: 36) 38 | --organism ORGANISM Optional, merge GO obo.tsv with a GO annotation for an 39 | organism: either a link to a file on geneontology.org 40 | eg. http://geneontology.org/gene- 41 | associations/gene_association.fb.gz or the path for 42 | the respective downloded .gz file. (default: None) 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ## AGEpy [![Build Status](https://travis-ci.org/mpg-age-bioinformatics/AGEpy.svg?branch=master)](https://travis-ci.org/mpg-age-bioinformatics/AGEpy) [![PyPI version](https://badge.fury.io/py/AGEpy.svg)](https://badge.fury.io/py/AGEpy) ![ReadtheDocs](https://readthedocs.org/projects/agepy/badge/?version=latest) 2 | 3 | This python package contains Bioinformatics tools developed at the 4 | Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing. 5 | 6 | > Max Planck Institute for Biology of Ageing 7 | > Joseph-Stelzmann-Str. 9b 8 | > D-50931 Cologne 9 | > Germany 10 | 11 | [https://bioinformatics.age.mpg.de](https://bioinformatics.age.mpg.de) 12 | 13 | ### Installation 14 | 15 | ###### pip 16 | 17 | Latest pip release: 18 | 19 | ```bash 20 | pip install AGEpy --user 21 | ``` 22 | 23 | ###### github 24 | 25 | Get the latest development version from github: 26 | 27 | ```bash 28 | git clone https://github.com/mpg-age-bioinformatics/AGEpy 29 | ``` 30 | 31 | Install: 32 | 33 | ```bash 34 | cd AGEpy 35 | python setup.py install --user 36 | ``` 37 | 38 | and then update to the latest release whenever required with: 39 | 40 | ```bash 41 | cd AGEpy 42 | git pull 43 | python setup.py install --user --force 44 | ``` 45 | 46 | Alternatively you can also install the package with a symlink, so that changes 47 | to the source files will be immediately available to users of the package on 48 | your system: 49 | 50 | ```bash 51 | cd AGEpy 52 | python setup.py develop --user 53 | ``` 54 | 55 | Be aware that with the develop option you won't be able to properly update once new scripts are added. 56 | -------------------------------------------------------------------------------- /docs/modules/MA1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/MA1.png -------------------------------------------------------------------------------- /docs/modules/MA2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/MA2.png -------------------------------------------------------------------------------- /docs/modules/MA3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/MA3.png -------------------------------------------------------------------------------- /docs/modules/bed.md: -------------------------------------------------------------------------------- 1 | ## ___GetBEDnarrowPeakgz___ 2 | 3 | Reads a gz compressed BED narrow peak file from a web address or local file and returns a pandas dataframe. 4 | 5 | **`GetBEDnarrowPeakgz(URL_or_PATH_TO_file)`** 6 | 7 | * **`URL_or_PATH_TO_file`** source of input bed. Either a web link or a path to a local file. 
8 | 9 | * **`returns`** a pandas dataframe of the inpud bed. 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | 14 | >>> eCLIP_1_bednarrowPeak="https://www.encodeproject.org/files/ENCFF066PCT/@@download/ENCFF066PCT.bed.gz" 15 | >>> bed=age.GetBEDnarrowPeakgz(eCLIP_1_bednarrowPeak) 16 | >>> print bed.head() 17 | 18 | chrom chromStart chromEnd name score strand signalValue \ 19 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 20 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 21 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 22 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 23 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 24 | 25 | -log10(pValue) -log10(qvalue) peak 26 | 0 48.9834262537309 -1 -1 27 | 1 48.7463712698062 -1 -1 28 | 2 42.6519289009201 -1 -1 29 | 3 37.7848384917051 -1 -1 30 | 4 34.0756845242392 -1 -1 31 | ``` 32 | ___ 33 | 34 | ## ___writeBED___ 35 | 36 | Writes a bed dataframe into a bed file. 37 | 38 | **`writeBED(inBED, file_path)`** 39 | 40 | * **`inBED`** a pandas dataframe with the contents of the bed file to be written. 41 | * **`file_path`** path to target file. 42 | 43 | * **`returns`** nothing. 44 | 45 | ```python 46 | >>> import AGEpy as age 47 | >>> print bed.head() 48 | 49 | chrom chromStart chromEnd name score strand signalValue \ 50 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 51 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 52 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 53 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 54 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 55 | 56 | -log10(pValue) -log10(qvalue) peak 57 | 0 48.9834262537309 -1 -1 58 | 1 48.7463712698062 -1 -1 59 | 2 42.6519289009201 -1 -1 60 | 3 37.7848384917051 -1 -1 61 | 4 34.0756845242392 -1 -1 62 | 63 | >>> age.writeBED(bed,"/path/to/file.bed") 64 | ``` 65 | ___ 66 | 67 | ## ___dfTObedtool___ 68 | 69 | Transforms a pandas dataframe into a bedtool. Requires `bedtools` to be in your `path`. 70 | 71 | **`dfTObedtool(df)`** 72 | 73 | * **`df`** a pandas dataframe. 74 | * **`returns`** a bedtool. 
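A minimal, self-contained sketch of the conversion (toy intervals and hypothetical names, only for illustration; assumes `pybedtools` and `bedtools` are available as noted above). The fuller example below reuses the eCLIP peaks downloaded in the `GetBEDnarrowPeakgz` section:

```python
>>> # toy dataframe following the bed column layout used throughout this module
>>> import pandas as pd
>>> import AGEpy as age
>>> toy=pd.DataFrame([["chr1",100,200,"region_1",0,"+"],
...                   ["chr1",300,400,"region_2",0,"-"]],
...                  columns=["chrom","chromStart","chromEnd","name","score","strand"])
>>> toy_bt=age.dfTObedtool(toy)
```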
75 | 76 | ```python 77 | >>> import AGEpy as age 78 | >>> print bed.head() 79 | 80 | chrom chromStart chromEnd name score strand signalValue \ 81 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 82 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 83 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 84 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 85 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 86 | 87 | -log10(pValue) -log10(qvalue) peak 88 | 0 48.9834262537309 -1 -1 89 | 1 48.7463712698062 -1 -1 90 | 2 42.6519289009201 -1 -1 91 | 3 37.7848384917051 -1 -1 92 | 4 34.0756845242392 -1 -1 93 | 94 | >>> bedtool=age.dfTObedtool(bed) 95 | >>> print bedtool.head() 96 | 97 | chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 48.9834262537309 -1 -1 98 | chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 48.7463712698062 -1 -1 99 | chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 42.6519289009201 -1 -1 100 | chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 37.7848384917051 -1 -1 101 | chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 34.0756845242392 -1 -1 102 | chr7 128761857 128761952 Peak_5 1000 + 4.02131461357736 33.9350181783027 -1 -1 103 | chr7 121296414 121296454 Peak_6 1000 + 3.50632247892067 30.2512926812531 -1 -1 104 | chr7 139368342 139368352 Peak_7 1000 + 4.41912711395099 29.6666535015756 -1 -1 105 | chr7 87155583 87155635 Peak_8 1000 + 4.08769554637519 29.3752024210392 -1 -1 106 | chr7 105540000 105540028 Peak_9 1000 + 4.2212263105571 29.0451450847765 -1 -1 107 | 108 | >>> print type(bed) 109 | 110 | 111 | 112 | >>> print type(bedtool) 113 | 114 | 115 | ``` 116 | 117 | ___ 118 | 119 | ## ___GetPeaksExons___ 120 | 121 | Annotates a bedtool, BED narrow peak. 122 | 123 | **`GetPeaksExons(bed,parsedGTF)`** 124 | 125 | * **`bed`** a pandas dataframe in bed format 126 | * **`parsedGTF`** a parsed GTF file as outputed by parseGTF() 127 | 128 | * **`returns`** a Pandas dataframe 129 | 130 | ```python 131 | >>> import AGEpy as age 132 | >>> print bed.head() 133 | 134 | chrom chromStart chromEnd name score strand signalValue \ 135 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 136 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 137 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 138 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 139 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 140 | 141 | -log10(pValue) -log10(qvalue) peak 142 | 0 48.9834262537309 -1 -1 143 | 1 48.7463712698062 -1 -1 144 | 2 42.6519289009201 -1 -1 145 | 3 37.7848384917051 -1 -1 146 | 4 34.0756845242392 -1 -1 147 | 148 | >>> GTF=age.readGTF("/beegfs/group_bit/data/projects/departments/Bioinformatics/bit_RNAseq_eCLIP/downloads/gencode.v24.primary_assembly.annotation.gtf") 149 | >>> print GTF.head() 150 | 151 | seqname source feature start end score strand frame \ 152 | 0 chr1 HAVANA gene 11869 14409 . + . 153 | 1 chr1 HAVANA transcript 11869 14409 . + . 154 | 2 chr1 HAVANA exon 11869 12227 . + . 155 | 3 chr1 HAVANA exon 12613 12721 . + . 156 | 4 chr1 HAVANA exon 13221 14409 . + . 157 | 158 | attribute 159 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 160 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 161 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 162 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 163 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." 
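>>> # GetPeaksExons expects the GTF attributes already expanded into columns
>>> # (exon_id, transcript_id, gene_id), which is what parseGTF (see the gtf module) does next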
164 | 165 | >>> GTFpa=age.parseGTF(GTF) 166 | >>> print GTFpa.head() 167 | 168 | seqname source feature start end score strand frame gene_status \ 169 | 0 chr1 HAVANA gene 11869 14409 . + . KNOWN 170 | 1 chr1 HAVANA transcript 11869 14409 . + . KNOWN 171 | 2 chr1 HAVANA exon 11869 12227 . + . KNOWN 172 | 3 chr1 HAVANA exon 12613 12721 . + . KNOWN 173 | 4 chr1 HAVANA exon 13221 14409 . + . KNOWN 174 | 175 | havana_gene ... exon_id transcript_id \ 176 | 0 OTTHUMG00000000961.2 ... NaN NaN 177 | 1 OTTHUMG00000000961.2 ... NaN ENST00000456328.2 178 | 2 OTTHUMG00000000961.2 ... ENSE00002234944.1 ENST00000456328.2 179 | 3 OTTHUMG00000000961.2 ... ENSE00003582793.1 ENST00000456328.2 180 | 4 OTTHUMG00000000961.2 ... ENSE00002312635.1 ENST00000456328.2 181 | 182 | exon_number ont havana_transcript ccdsid transcript_name \ 183 | 0 NaN NaN NaN NaN NaN 184 | 1 NaN NaN OTTHUMT00000362751.1 NaN DDX11L1-002 185 | 2 1 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 186 | 3 2 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 187 | 4 3 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 188 | 189 | gene_type transcript_status gene_name 190 | 0 transcribed_unprocessed_pseudogene NaN DDX11L1 191 | 1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 192 | 2 transcribed_unprocessed_pseudogene KNOWN DDX11L1 193 | 3 transcribed_unprocessed_pseudogene KNOWN DDX11L1 194 | 4 transcribed_unprocessed_pseudogene KNOWN DDX11L1 195 | 196 | >>> bedAn=age.GetPeaksExons(bed,GTFpa) 197 | >>> print bedAn.head() 198 | 199 | chrom chromStart chromEnd name score strand signalValue \ 200 | 0 chr7 155781335 155781431 Peak_2 1000 + 3.704813 201 | 1 chr7 155781335 155781431 Peak_2 1000 + 3.704813 202 | 2 chr7 121296414 121296454 Peak_6 1000 + 3.506322 203 | 3 chr7 87155538 87155583 Peak_16 1000 + 4.077391 204 | 4 chr7 107904733 107904812 Peak_17 1000 + 3.674368 205 | 206 | -log10(pValue) -log10(qvalue) peak ... \ 207 | 0 42.651929 -1 -1 ... 208 | 1 42.651929 -1 -1 ... 209 | 2 30.251293 -1 -1 ... 210 | 3 22.798739 -1 -1 ... 211 | 4 21.118496 -1 -1 ... 212 | 213 | gene_id exon_id_count exon_id norm. mean -log10(pValue) \ 214 | 0 ENSG00000184863.10 1 42.651929 215 | 1 ENSG00000184863.10 1 42.651929 216 | 2 ENSG00000106034.17 1 30.251293 217 | 3 ENSG00000135164.18 3 2951.868281 218 | 4 ENSG00000091140.12 1 21.118496 219 | 220 | exon_id signalValue transcript_id_count \ 221 | 0 3.704813 1 222 | 1 3.704813 1 223 | 2 3.506322 1 224 | 3 42.703999 3 225 | 4 3.674368 1 226 | 227 | transcript_id norm. mean -log10(pValue) transcript_id signalValue \ 228 | 0 42.651929 3.704813 229 | 1 42.651929 3.704813 230 | 2 30.251293 3.506322 231 | 3 2951.868281 42.703999 232 | 4 21.118496 3.674368 233 | 234 | gene_id_count gene_id norm. mean -log10(pValue) gene_id signalValue 235 | 0 4 116.619012 17.830941 236 | 1 4 116.619012 17.830941 237 | 2 2 30.251293 2.144090 238 | 3 8 3300.707425 73.902289 239 | 4 5 135.139064 22.210269 240 | ``` 241 | **gene_id_count**: number of intervals overlapping this gene 242 | 243 | **transcript_id_count**: number of intervals overlapping this transcript 244 | 245 | **exon_id_count**: number of intervals overlapping this exon 246 | ___ 247 | 248 | ## ___AnnotateBED___ 249 | 250 | Annotates a bedtool, BED narrow peak. 251 | 252 | **`AnnotateBED(bed,GTF, genome_file, bedcols=None, promoter=[1000,200])`** 253 | 254 | * **`bed`** either a /path/to/file.bed or a Pandas dataframe in bed format. /path/to/file.bed implies bedcols. 
255 | * **`GTF`** /path/to/file.gtf 256 | * **`genome_file`** /path/to/file.genome - a tab separated values of chr name and size information 257 | * **`bedcols`** a comma separated string of column headers to use when reading in a bed file. eg: "chr,start,end,name" 258 | * **`promoter`** a list containing the upstream start of the promoter region from the TSS and the downstream end of the promoter region from the TSS. 259 | 260 | * **`returns`** a Pandas dataframe with the annotated bed file. exons and promoters will be reported as well in the annotated_gene_features column. 261 | 262 | ```python 263 | ```python 264 | >>> import AGEpy as age 265 | >>> print bed.head() 266 | 267 | chr start end name signal value strand fold change \ 268 | 0 2 175167300 175167740 chip1_peak_2 58.993528 . 2.965444 269 | 1 2 27052080 27052410 chip1_peak_3 154.897096 . 7.786255 270 | 2 1 243719300 243719630 chip1_peak_4 99.776458 . 5.015490 271 | 3 17 2564650 2564980 chip1_peak_5 72.892502 . 3.664107 272 | 4 7 44999240 44999570 chip1_peak_6 106.434435 . 5.350169 273 | 274 | p-value Benjamini-Hochberg FDR enriched in marker 275 | 0 5.747544e-15 4.044835e-08 control K27AC 276 | 1 2.197614e-14 8.934691e-08 control K27AC 277 | 2 2.915657e-14 8.934691e-08 control K27AC 278 | 3 3.173957e-14 8.934691e-08 control K27AC 279 | 4 3.871249e-14 9.081308e-08 control K27AC 280 | 281 | >>> bed=AnnotateBED(bed,"hg38.83.gtf","hg38.83.genome") 282 | >>> print bed.head() 283 | 284 | chr start end name signal value strand fold change \ 285 | 0 1 789880 791350 chip2_peak_44728 172.757426 . 8.473977 286 | 1 1 820750 822710 chip1_peak_22461 148.812870 . 11.672676 287 | 2 1 905550 905850 chip1_peak_1792 289.437404 . 13.231699 288 | 3 1 913500 913800 chip1_peak_4243 43.508330 . 1.988994 289 | 4 1 960150 960450 chip1_peak_1666 67.008675 . 3.063317 290 | 291 | p-value Benjamini-Hochberg FDR enriched in marker \ 292 | 0 6.043877e-06 0.000314 stretch H3K9 293 | 1 5.292319e-07 0.000057 control H3K9 294 | 2 9.544848e-07 0.004798 control H3K27 295 | 3 4.932846e-06 0.010117 control H3K27 296 | 4 8.347840e-07 0.004535 control H3K27 297 | 298 | annotated_gene_features 299 | 0 RP5-857K21.4; RP11-206L10.9 300 | 1 RP5-857K21.4 301 | 2 RP11-54O7.16 302 | 3 RP11-54O7.1: exon; RP11-54O7.2: promoter; RP11... 303 | 4 NOC2L: promoter; KLHL17: promoter 304 | ``` 305 | ___ 306 | -------------------------------------------------------------------------------- /docs/modules/biom.md: -------------------------------------------------------------------------------- 1 | ## ___datasetsBM___ 2 | 3 | Lists BioMart datasets. 
4 | 5 | **`datasetsBM(host=biomart_host)`** 6 | 7 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 8 | * **`returns`** nothing 9 | 10 | ```python 11 | >>> import AGEpy as age 12 | >>> age.datasetsBM() 13 | 14 | u'acarolinensis_gene_ensembl' Anole lizard genes (AnoCar2.0), 15 | u'acarolinensis_genomic_sequence' Anole lizard sequences (AnoCar2.0), 16 | u'amelanoleuca_gene_ensembl' Panda genes (ailMel1), 17 | u'amelanoleuca_genomic_sequence' Panda sequences (ailMel1), 18 | u'amexicanus_gene_ensembl' Cave fish genes (AstMex102), 19 | u'amexicanus_genomic_sequence' Cave fish sequences (AstMex102), 20 | u'anancymaae_gene_ensembl' Ma's night monkey genes (Anan_2.0), 21 | u'anancymaae_genomic_sequence' Ma's night monkey sequences (Anan_2.0), 22 | u'aplatyrhynchos_gene_ensembl' Duck genes (BGI_duck_1.0), 23 | u'aplatyrhynchos_genomic_sequence' Duck sequences (BGI_duck_1.0), 24 | u'btaurus_gene_ensembl' Cow genes (UMD3.1), 25 | u'btaurus_genomic_sequence' Cow sequences (UMD3.1), 26 | u'btaurus_marker_end' marker_feature_end, 27 | u'btaurus_marker_start' marker_feature, 28 | u'btaurus_qtl_feature' qtl_feature, 29 | . 30 | . 31 | . 32 | ``` 33 | ___ 34 | 35 | ## ___filtersBM___ 36 | 37 | Lists BioMart filters for a specific dataset. 38 | 39 | **`filtersBM(dataset,host=biomart_host)`** 40 | 41 | * **`dataset`** dataset to list filters of 42 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 43 | 44 | * **`returns`** nothing 45 | 46 | ```python 47 | >>> import AGEpy as age 48 | >>> age.filtersBM('hsapiens_gene_ensembl') 49 | 50 | u'affy_hc_g110' 'AFFY HC G110 probe ID(s) [e.g. 266_s_at]' (type id_list, values []), 51 | u'affy_hg_focus' 'AFFY HG Focus probe ID(s) [e.g. 212481_s_at]' (type id_list, values []), 52 | u'affy_hg_u133_plus_2' 'AFFY HG U133 Plus 2 probe ID(s) [e.g. 1553551_s_at]' (type id_list, values []), 53 | u'affy_hg_u133a' 'AFFY HG U133A probe ID(s) [e.g. 211600_at]' (type id_list, values []), 54 | u'affy_hg_u133a_2' 'AFFY HG U133A 2 probe ID(s) [e.g. 211600_at]' (type id_list, values []), 55 | u'affy_hg_u133b' 'AFFY HG U133B probe ID(s) [e.g. 224321_at]' (type id_list, values []), 56 | u'affy_hg_u95a' 'AFFY HG U95A probe ID(s) [e.g. 33866_at]' (type id_list, values []), 57 | u'affy_hg_u95av2' 'AFFY HG U95Av2 probe ID(s) [e.g. 33866_at]' (type id_list, values []), 58 | u'affy_hg_u95b' 'AFFY HG U95B probe ID(s) [e.g. 48794_s_at]' (type id_list, values []), 59 | u'affy_hg_u95c' 'AFFY HG U95C probe ID(s) [e.g. 66888_at]' (type id_list, values []), 60 | u'affy_hg_u95d' 'AFFY HG U95D probe ID(s) [e.g. 70806_at]' (type id_list, values []), 61 | u'affy_hg_u95e' 'AFFY HG U95E probe ID(s) [e.g. 88289_at]' (type id_list, values []), 62 | u'affy_hta_2_0' 'AFFY HTA 2 0 probe ID(s) [e.g. TC04001102.hg]' (type id_list, values []), 63 | u'affy_huex_1_0_st_v2' 'AFFY HuEx 1 0 st v2 probe ID(s) [e.g. 4037584]' (type id_list, values []), 64 | u'affy_hugene_1_0_st_v1' 'AFFY HuGene 1 0 st v1 probe ID(s) [e.g. 8165644]' (type id_list, values []), 65 | u'affy_hugene_2_0_st_v1' 'AFFY HuGene 2 0 st v1 probe ID(s) [e.g. 17100641]' (type id_list, values []), 66 | u'affy_hugenefl' 'AFFY HuGeneFL probe ID(s) [e.g. Z70759_at]' (type id_list, values []), 67 | u'affy_primeview' 'AFFY PrimeView probe ID(s) [e.g. 11761516_x_at]' (type id_list, values []), 68 | . 69 | . 70 | . 71 | 72 | ``` 73 | ___ 74 | 75 | ## ___attributesBM___ 76 | 77 | Lists BioMart attributes for a specific dataset. 
78 | 79 | **`attributesBM(dataset,host=biomart_host)`** 80 | 81 | * **`dataset`** dataset to list attributes of 82 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 83 | 84 | * **`returns`** nothing 85 | 86 | ```python 87 | >>> import AGEpy as age 88 | >>> age.attributesBM('hsapiens_gene_ensembl') 89 | 90 | u'3_utr_end' '3' UTR end' (default False), 91 | u'3_utr_start' '3' UTR start' (default False), 92 | u'3utr' '3' UTR' (default False), 93 | u'5_utr_end' '5' UTR end' (default False), 94 | u'5_utr_start' '5' UTR start' (default False), 95 | u'5utr' '5' UTR' (default False), 96 | u'acarolinensis_homolog_associated_gene_name' 'Anole lizard gene name' (default False), 97 | u'acarolinensis_homolog_canonical_transcript_protein' 'Query protein or transcript ID' (default False), 98 | u'acarolinensis_homolog_chrom_end' 'Anole lizard chromosome/scaffold end (bp)' (default False), 99 | u'acarolinensis_homolog_chrom_start' 'Anole lizard chromosome/scaffold start (bp)' (default False), 100 | u'acarolinensis_homolog_chromosome' 'Anole lizard chromosome/scaffold name' (default False), 101 | u'acarolinensis_homolog_dn' 'dN with Anole lizard' (default False), 102 | u'acarolinensis_homolog_ds' 'dS with Anole lizard' (default False), 103 | u'acarolinensis_homolog_ensembl_gene' 'Anole lizard gene stable ID' (default False), 104 | . 105 | . 106 | . 107 | 108 | ``` 109 | ___ 110 | 111 | ## ___queryBM___ 112 | 113 | Queries BioMart. 114 | 115 | **`queryBM(query_attributes,query_dataset,query_filter=None,query_items=None,query_dic=None,host=biomart_host)`** 116 | 117 | * **`query_attributes`** list of attributes to recover from BioMart 118 | * **`query_dataset`** dataset to query 119 | * **`query_filter`** one BioMart filter associated with the items being queried 120 | * **`query_items`** list of items to be queried (must assoiate with given filter) 121 | * **`query_querydic`** for complex queries this option should be used instead of 'filters' and 'items' and a dictionary of filters provided here eg. querydic={"filter1":["item1","item2"],"filter2":["item3","item4"]}. If using querydic, don't query more than 350 items at once. 122 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 123 | 124 | * **`returns`** a Pandas dataframe of the queried attributes 125 | 126 | ```python 127 | >>> import AGEpy as age 128 | >>> queryDf=queryBM(query_attributes=["ensembl_gene_id","external_gene_name", \ 129 | "go_id","name_1006","definition_1006"],\ 130 | query_dataset='hsapiens_gene_ensembl') 131 | >>> print queryDf.head() 132 | 133 | ensembl_gene_id external_gene_name go_id name_1006 \ 134 | 0 ENSG00000283891 MIR628 GO:0005615 extracellular space 135 | 1 ENSG00000251931 RNU6-871P 136 | 2 ENSG00000207766 MIR626 137 | 3 ENSG00000275323 AC012314.7 GO:0003723 RNA binding 138 | 4 ENSG00000275323 AC012314.7 GO:0005634 nucleus 139 | 140 | definition_1006 141 | 0 "That part of a multicellular organism outside..." 142 | 1 143 | 2 144 | 3 "Interacting selectively and non-covalently wi..." 145 | 4 "A membrane-bounded organelle of eukaryotic ce..." 146 | ``` 147 | ___ 148 | 149 | ## ___FilterGOstring___ 150 | 151 | Filters GO terms based on given strings using ENSEMBL's biomart homology mapping. 
152 | 153 | **`FilterGOstring(names_filter=["age-", "aging", "aged", 'aging', 'aging.', 'aging,'], exclude_names=["packaging","voltage","cleavage-", "stage-1","cage-like","message-specific", "damage-associated","stage-specific","foraging", "DNA-damaging","engaging","damaged","packaged"], defs_filter=[" age-", " aging", " aged", ' aging', ' aging.', ' aging,'], exclude_defs=["packaging","voltage","cleavage-", "stage-1","cage-like","message-specific", "damage-associated","stage-specific","foraging", "DNA-damaging","engaging","damaged","packaged"], host=biomart_host, HSA=None,MUS=None,CEL=None,DMEL=None)`** 154 | 155 | * **`names_filter`** list of substrings to filter GO names on. Default=["age-", "aging", "aged", 'aging', 'aging.', 'aging,'] 156 | * **`exclude_names`** list of substrings to be used for exclusion of GO names. Default=["packaging","voltage","cleavage-", 157 | "stage-1","cage-like","message-specific", 158 | "damage-associated","stage-specific","foraging", 159 | "DNA-damaging","engaging","damaged","packaged"] 160 | * **`defs_filter`** list of substrings to filter GO defenitions on. Default=[" age-", " aging", " aged", ' aging', ' aging.', ' aging,'] 161 | * **`exclude_defs`** list of substrings to be used for exclustion of GO defenitions. Default=["packaging","voltage","cleavage-", 162 | "stage-1","cage-like","message-specific", 163 | "damage-associated","stage-specific","foraging", 164 | "DNA-damaging","engaging","damaged","packaged"] 165 | * **`host`** biomart host server, default="http://www.ensembl.org/biomart" 166 | * **`HSA`** retrieved hsa dataframe 167 | * **`MUS`** retrieved mus dataframe 168 | * **`CEL`** retrieved cel dataframe 169 | * **`DMEL`** retrieved dmel dataframe 170 | 171 | * **`returns`** homology_df, HSA, MUS, CEL, DMEL 172 | 173 | ```python 174 | >>> import AGEpy as age 175 | >>> homology_df, HSA, MUS, CEL, DMEL=age.FilterGOstring() 176 | >>> print homology_df.head() 177 | 178 | HSA_ensembl_gene_id HSA_external_gene_name \ 179 | 0 ENSG00000000003 TSPAN6 180 | 1 ENSG00000000005 TNMD 181 | 2 ENSG00000000460 C1orf112 182 | 3 ENSG00000000971 CFH 183 | 4 ENSG00000002079 MYH16 184 | 185 | HSA_go_id \ 186 | 0 GO:0039532, , GO:0070062, GO:0016021, GO:00160... 187 | 1 GO:0005737, , GO:0016020, GO:0035990, GO:00717... 188 | 2 NaN 189 | 3 , GO:0030449, GO:0070062, GO:0045087, GO:00725... 190 | 4 NaN 191 | 192 | HSA_name_1006 \ 193 | 0 , negative regulation of NIK/NF-kappaB signali... 194 | 1 , nuclear envelope, cytoplasm, negative regula... 195 | 2 NaN 196 | 3 , innate immune response, heparan sulfate prot... 197 | 4 NaN 198 | 199 | HSA_definition_1006 MUS_ensembl_gene_id \ 200 | 0 "The component of a membrane consisting of the..." ENSMUSG00000067377 201 | 1 "The component of a membrane consisting of the..." ENSMUSG00000031250 202 | 2 NaN ENSMUSG00000041406 203 | 3 "Interacting selectively and non-covalently wi..." NaN 204 | 4 NaN NaN 205 | 206 | CEL_ensembl_gene_id DMEL_ensembl_gene_id MUS_external_gene_name \ 207 | 0 NaN NaN Tspan6 208 | 1 NaN NaN Tnmd 209 | 2 NaN NaN BC055324 210 | 3 NaN NaN None 211 | 4 NaN NaN None 212 | 213 | MUS_go_id ... \ 214 | 0 GO:0039532, , GO:0070062, GO:0016021, GO:00160... ... 215 | 1 GO:0016020, GO:0035990, GO:0071773, GO:0016021... ... 216 | 2 GO:0005575, GO:0008150, GO:0003674, ... 217 | 3 None ... 218 | 4 None ... 219 | 220 | MUS_definition_1006 CEL_external_gene_name \ 221 | 0 "The component of a membrane consisting of the..." None 222 | 1 "The component of a membrane consisting of the..." 
None 223 | 2 "Elemental activities, such as catalysis or bi..." None 224 | 3 None None 225 | 4 None None 226 | 227 | CEL_go_id CEL_name_1006 CEL_definition_1006 DMEL_external_gene_name \ 228 | 0 None None None None 229 | 1 None None None None 230 | 2 None None None None 231 | 3 None None None None 232 | 4 None None None None 233 | 234 | DMEL_go_id DMEL_name_1006 DMEL_definition_1006 evidence 235 | 0 None None None NaN 236 | 1 None None None NaN 237 | 2 None None None NaN 238 | 3 None None None NaN 239 | 4 None None None NaN 240 | 241 | ``` 242 | 243 | **evidence** indicates from which organisms there is evidence of the intended string 244 | -------------------------------------------------------------------------------- /docs/modules/blast.md: -------------------------------------------------------------------------------- 1 | ## ___BLASTquery___ 2 | 3 | Performs a blast query online. As in https://ncbi.github.io/blast-cloud/ 4 | 5 | **`BLASTquery(query,database,program,filter=None, format_type=None, expect=None, nucl_reward=None, nucl_penalty=None, gapcosts=None, matrix=None, hitlist_size=None, descriptions=None, alignments=None, ncbi_gi=None, threshold=None, word_size=None, composition_based_statistics=None, organism=None, others=None, num_threads=None, baseURL="http://blast.ncbi.nlm.nih.gov", verbose=False)`** 6 | 7 | * **`query`** Search query. Allowed values: Accession, GI, or FASTA. 8 | * **`database`** BLAST database. Allowed values: nt, nr, refseq_rna, refseq_protein, swissprot, pdbaa, pdbnt 9 | * **`program`** BLAST program. Allowed values: blastn, megablast, blastp, blastx, tblastn, tblastx 10 | * **`filter`** Low complexity filtering. Allowed values: F to disable. T or L to enable. Prepend “m” for mask at lookup (e.g., mL) 11 | * **`format_type`** Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. HTML is the default. 12 | * **`expect`** Expect value. Allowed values: Number greater than zero. 13 | * **`nucl_reward`** Reward for matching bases (BLASTN and megaBLAST). Allowed values: Integer greater than zero. 14 | * **`nucl_penalty`** Cost for mismatched bases (BLASTN and megaBLAST). Allowed values: Integer less than zero. 15 | * **`gapcosts`** Gap existence and extension costs. Allowed values: Pair of positive integers separated by a space such as “11 1”. 16 | * **`matrix`** Scoring matrix name. Allowed values: One of BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, PAM250, PAM30 or PAM70. Default: BLOSUM62 for all applicable programs. 17 | * **`hitlist_size`** Number of databases sequences to keep. Allowed values: Integer greater than zero. 18 | * **`descriptions`** Number of descriptions to print (applies to HTML and Text). Allowed values: Integer greater than zero. 19 | * **`alignments`** Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 20 | * **`ncbi_gi`** Show NCBI GIs in report. Allowed values: T or F. 21 | * **`threshold`** Neighboring score for initial words. Allowed values: Positive integer (BLASTP default is 11). Does not apply to BLASTN or MegaBLAST). 22 | * **`word_size`** Size of word for initial matches. Allowed values: Positive integer. 23 | * **`composition_based_statistics`** Composition based statistics algorithm to use. Allowed values: One of 0, 1, 2, or 3. See comp_based_stats command line option in the BLAST+ user manual for details. 
24 | * **`organism`** an organism as in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 25 | * **`others`** here you can add other parameters as seen in a blast bookmarked page. Define you query in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 26 | Once your query is defined click on "Bookmark" on right upper side of the page. You can copy fragments of the URL 27 | which define the query. Eg. For organism "Homo sapiens (taxid:9606)" you will see the string "EQ_MENU=Homo%20sapiens%20%28taxid%3A9606%29" - this is 28 | the string you can use here in others. 29 | * **`num_threads`** Number of virtual CPUs to use. Allowed values: Integer greater than zero (default is 1). Supported only on the cloud. 30 | * **`verbose`** print more 31 | 32 | * **`returns`** BLAST search request identifier 33 | 34 | ```python 35 | >>> import AGEpy as age 36 | >>> seq="CTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTAC" 37 | >>> RID=age.BLASTquery(seq,"nt","blastn") 38 | >>> print RID 39 | 40 | 4MS2JV8T014 41 | ``` 42 | ___ 43 | 44 | ## ___BLASTcheck___ 45 | 46 | Checks the status of a query. 47 | 48 | **`BLASTcheck(rid,baseURL="http://blast.ncbi.nlm.nih.gov")`** 49 | 50 | * **`rid`** BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 51 | * **`baseURL`** server url. Default=http://blast.ncbi.nlm.nih.gov 52 | 53 | * **`returns status`** status for the query. 54 | * **`returns therearehist`** yes or no for existing hits on a finished query. 55 | 56 | ```python 57 | >>> import AGEpy as age 58 | >>> status, therearehits=age.BLASTcheck(RID) 59 | 60 | RID: 4MRYDZSC014; status:READY; hits: yes 61 | 62 | >>> print status, therearehits 63 | 64 | READY yes 65 | ``` 66 | ___ 67 | 68 | ## ___BLASTresults___ 69 | 70 | Retrieves results for an RID. 71 | 72 | **`BLASTresults(rid, format_type="Tabular", hitlist_size= None, alignments=None, ncbi_gi = None, format_object=None, baseURL="http://blast.ncbi.nlm.nih.gov")`** 73 | 74 | * **`rid`** BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 75 | * **`format_type`** Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. 76 | * **`hitlist_size`** Number of databases sequences to keep. Allowed values: Integer greater than zero. 77 | * **`alignments`** Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 78 | * **`ncbi_gi`** Show NCBI GIs in report. Allowed values: T or F. 79 | * **`format_object`** Object type. Allowed values: SearchInfo (status check) or Alignment (report formatting). 80 | * **`baseURL`** server url. Default=http://blast.ncbi.nlm.nih.gov 81 | 82 | * **`returns`** the result of a BLAST query. If format_type="Tabular" it will parse the content into a Pandas dataframe. 83 | 84 | ```python 85 | >>> import AGEpy as age 86 | >>> r=age.BLASTresults(RID) 87 | >>> print r.head() 88 | 89 | query id subject ids \ 90 | 0 Query_17381 gi|1012955506|gb|JN214348.1| 91 | 1 Query_17381 gi|631786534|tpe|HG975427.1| 92 | 2 Query_17381 gi|369762889|gb|JN900492.1| 93 | 3 Query_17381 gi|371502118|ref|NM_001126118.1| 94 | 4 Query_17381 gi|371502115|ref|NM_001126112.2|;gi|454521556|... 
95 | 96 | query acc.ver subject acc.ver % identity alignment length mismatches \ 97 | 0 Query_17381 JN214348.1 100.000 1190 0 98 | 1 Query_17381 HG975427.1 100.000 1190 0 99 | 2 Query_17381 JN900492.1 100.000 1190 0 100 | 3 Query_17381 NM_001126118.1 100.000 1190 0 101 | 4 Query_17381 NM_001126112.2 100.000 1190 0 102 | 103 | gap opens q. start q. end s. start s. end evalue bit scor 104 | 0 0 1 1190 614 1803 0.0 2147 105 | 1 0 1 1190 766 1955 0.0 2147 106 | 2 0 1 1190 877 2066 0.0 2147 107 | 3 0 1 1190 888 2077 0.0 2147 108 | 4 0 1 1190 768 1957 0.0 2147 109 | ``` 110 | -------------------------------------------------------------------------------- /docs/modules/cellplot.CellPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/cellplot.CellPlot.png -------------------------------------------------------------------------------- /docs/modules/cytoscape.md: -------------------------------------------------------------------------------- 1 | ## ___checkCytoscapeVersion___ 2 | 3 | Checks cytoscape version. 4 | 5 | **`CheckResponse(r)`** 6 | 7 | * **`host`** cytoscape host address, default=cytoscape_host 8 | * **`port`** cytoscape port, defaul=cytoscape_port 9 | * **`returns`** nothing 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> age.checkCytoscapeVersion() 14 | 15 | cytoscapeVersion 3.6.0 16 | apiVersion v1 17 | ``` 18 | ___ 19 | ## ___cytoscape___ 20 | 21 | General function for interacting with Cytoscape API. 22 | 23 | **`cytoscape(namespace,command="",PARAMS={},host=cytoscape_host,port=cytoscape_port,method="POST",verbose=False)`** 24 | 25 | * **`namespace`** namespace where the request should be executed. eg. "string" 26 | * **`commnand`** command to execute. eg. "protein query" 27 | * **`PARAMs`** a dictionary with the parameters. Check your swagger normaly running on 28 | 'http://localhost:1234/v1/swaggerUI/swagger-ui/index.html?url=http://localhost:1234/v1/commands/swagger.json' 29 | * **`host`** cytoscape host address, default=cytoscape_host 30 | * **`port`** cytoscape port, default=cytoscape_port 31 | * **`method`** type of http call, ie. "POST" or "GET" or "HELP". 32 | * **`verbose`** print more information 33 | * **`returns`** For "POST" the data in the content's response. For "GET" None. 34 | 35 | ```python 36 | >>> import AGEpy as age 37 | >>> response=age.cytoscape("string","pubmed query",{"pubmed":"p53 p21","limit":"50"}) 38 | >>> print response 39 | 40 | {u'SUID': 37560} 41 | ``` 42 | ___ 43 | ## ___result___ 44 | 45 | Displays the current network. 46 | 47 | **`result(filetype="PNG", saveas=None, host=cytoscape_host, port=cytoscape_port)`** 48 | 49 | * **`filetype`** file type, default="PNG" 50 | * **`saveas`** /path/to/non/tmp/file.prefix 51 | * **`host`** cytoscape host address, default=cytoscape_host 52 | * **`port`** cytoscape port, default=cytoscape_port 53 | * **`returns`** an image 54 | 55 | ```python 56 | >>> import AGEpy as age 57 | >>> response=age.result() 58 | >>> response 59 | ``` 60 | ![cytoscape](p53.png) 61 | ___ 62 | 63 | ## ___getTableColumns___ 64 | 65 | Gets tables from cytoscape. 66 | 67 | **`getTableColumns(table, columns, namespace = "default", network = "current", host=cytoscape_host,port=cytoscape_port,verbose=False)`** 68 | 69 | * **`table`** table to retrieve eg. 
node 70 | * **`columns`** columns to retrieve in list format 71 | * **`namespace`** namepsace, default="default" 72 | * **`network`** a network name or id, default="current" 73 | * **`host`** cytoscape host address, default=cytoscape_host 74 | * **`port`** cytoscape port, default=cytoscape_port 75 | * **`verbose`** print more information 76 | * **`returns`** a pandas dataframe 77 | 78 | ```python 79 | >>> import AGEpy as age 80 | >>> response=age.getTableColumns('node',['display name']) 81 | >>> print response 82 | 83 | display name 84 | 9606.ENSP00000367207 MYC 85 | 9606.ENSP00000356150 MDM4 86 | 9606.ENSP00000228872 CDKN1B 87 | 9606.ENSP00000361021 PTEN 88 | 9606.ENSP00000265734 CDK6 89 | ``` 90 | ___ 91 | ## ___loadTableData___ 92 | 93 | Loads tables into cytoscape. 94 | 95 | **`loadTableData(df, df_key='index',table="node", table_key_column = "name", network="current", namespace="default", host=cytoscape_host, port=cytoscape_port, verbose=False)`** 96 | 97 | * **`df`** a pandas dataframe to load 98 | * **`df_key`** key column in df, defaul="index" 99 | * **`table`** target table, default="node" 100 | * **`table_key_column`** table key column, default="name" 101 | * **`network`** a network name or id, default="current" 102 | * **`host`** cytoscape host address, default=cytoscape_host 103 | * **`port`** cytoscape port, default=cytoscape_port 104 | * **`verbose`** print more information 105 | * **`returns`** output of put request 106 | 107 | ```python 108 | >>> import AGEpy as age 109 | >>> print df.head() 110 | 111 | display name 112 | 9606.ENSP00000367207 MYC 113 | 9606.ENSP00000356150 MDM4 114 | 9606.ENSP00000228872 CDKN1B 115 | 9606.ENSP00000361021 PTEN 116 | 9606.ENSP00000265734 CDK6 117 | 118 | >>> def MarkCKDs(x): 119 | ... if "CDK" in x: 120 | ... res="yes" 121 | ... else: 122 | ... res="not" 123 | ... return res 124 | >>> df["CDK"]=df["display name"].apply( lambda x: MarkCKDs(x) ) 125 | >>> print df.head() 126 | 127 | display name CDK 128 | 9606.ENSP00000367207 MYC not 129 | 9606.ENSP00000356150 MDM4 not 130 | 9606.ENSP00000228872 CDKN1B yes 131 | 9606.ENSP00000361021 PTEN not 132 | 9606.ENSP00000265734 CDK6 yes 133 | 134 | >>> response=age.loadTableData(df[["CDK"]]) 135 | ``` 136 | ___ 137 | 138 | ## ***simple_defaults*** 139 | 140 | Simplifies default layouts. 141 | 142 | **`simple_defaults(defaults_dic)`** 143 | 144 | * **`defaults_dic`** a dictionary of the form { visualProperty_A:value_A, visualProperty_B:value_B, ..} 145 | * **`returns`** a list of dictionaries with each item corresponding to a given key in defaults_dic 146 | 147 | ```python 148 | >>> import AGEpy as age 149 | >>> defaults_dic={"NODE_SHAPE":"ellipse",\ 150 | "NODE_SIZE":60,\ 151 | "NODE_FILL_COLOR":"#AAAAAA",\ 152 | "EDGE_TRANSPARENCY":120} 153 | >>> defaults_list=age.simple_defaults(defaults_dic) 154 | >>> print defaults_list 155 | 156 | [{'visualProperty': 'NODE_SIZE', 'value': 60}, \ 157 | {'visualProperty': 'NODE_FILL_COLOR', 'value': '#AAAAAA'}, \ 158 | {'visualProperty': 'NODE_SHAPE', 'value': 'ellipse'}, \ 159 | {'visualProperty': 'EDGE_TRANSPARENCY', 'value': 120}] 160 | ``` 161 | ___ 162 | 163 | ## ***create_styles*** 164 | 165 | Creates a new visual style. 
166 | 167 | **`create_styles(title,defaults=None,mappings=None,host=cytoscape_host,port=cytoscape_port)`** 168 | 169 | * **`title`** title of the visual style 170 | * **`defaults`** a list of dictionaries for each visualProperty 171 | * **`mappings`** a list of dictionaries for each visualProperty 172 | * **`host`** cytoscape host address, default=cytoscape_host 173 | * **`port`** cytoscape port, default=cytoscape_port 174 | * **`returns`** nothing 175 | 176 | ```python 177 | >>> import AGEpy as age 178 | >>> print defaults_list 179 | 180 | [{'visualProperty': 'NODE_SIZE', 'value': 60}, \ 181 | {'visualProperty': 'NODE_FILL_COLOR', 'value': '#AAAAAA'}, \ 182 | {'visualProperty': 'NODE_SHAPE', 'value': 'ellipse'}, \ 183 | {'visualProperty': 'EDGE_TRANSPARENCY', 'value': 120}] 184 | 185 | >>> response=age.create_styles("newStyle",defaults=defaults_list) 186 | ``` 187 | ___ 188 | 189 | ## ***update_style*** 190 | 191 | Updates a visual style. 192 | 193 | **`update_style(title, defaults=None, mappings=None, host=cytoscape_host, port=cytoscape_port, verbose=False)`** 194 | 195 | * **`title`** title of the visual style 196 | * **`defaults`** a list of dictionaries for each visualProperty 197 | * **`mappings`** a list of dictionaries for each visualProperty 198 | * **`host`** cytoscape host address, default=cytoscape_host 199 | * **`port`** cytoscape port, default=cytoscape_port 200 | * **`returns`** nothing 201 | 202 | ```python 203 | >>> import AGEpy as age 204 | >>> print new_defaults_list 205 | 206 | [{'visualProperty': 'NODE_SIZE', 'value': 80}, \ 207 | {'visualProperty': 'NODE_FILL_COLOR', 'value': '#AAAAAA'}, \ 208 | {'visualProperty': 'NODE_SHAPE', 'value': 'ellipse'}, \ 209 | {'visualProperty': 'EDGE_TRANSPARENCY', 'value': 120}] 210 | 211 | >>> response=age.update_style("newStyle",defaults=new_defaults_list) 212 | ``` 213 | ___ 214 | 215 | ## ***mapVisualProperty*** 216 | 217 | Generates a dictionary for a given visual property. 218 | 219 | **`mapVisualProperty(visualProperty, mappingType, mappingColumn, lower=None,center=None,upper=None, discrete=None, network="current",table="node", namespace="default", host=cytoscape_host, port=cytoscape_port, verbose=False)`** 220 | 221 | * **`visualProperty`** name of the visual property to map, eg. 'NODE_SHAPE' or 'NODE_FILL_COLOR' 222 | * **`mappingType`** type of mapping: 'passthrough', 'discrete' or 'continuous' 223 | * **`mappingColumn`** table column the mapping is based on, eg. 'display name' or 'CDK' 224 | * **`lower`** for "continuous" mappings a list of the form [value,rgb_string] 225 | * **`center`** for "continuous" mappings a list of the form [value,rgb_string] 226 | * **`upper`** for "continuous" mappings a list of the form [value,rgb_string] 227 | * **`discrete`** for discrete mappings, a list of lists of the form [ list_of_keys, list_of_values ] 228 | * **`network`** a network name or id, default="current" 229 | * **`host`** cytoscape host address, default=cytoscape_host 230 | * **`port`** cytoscape port, default=cytoscape_port 231 | * **`returns`** a dictionary for the respective visual property 232 | 233 | ```python 234 | >>> import AGEpy as age 235 | >>> import matplotlib 236 | 237 | >>> NODE_LABEL=age.mapVisualProperty("NODE_LABEL","passthrough","display name") 238 | >>> print NODE_LABEL 239 | 240 | {'mappingType': 'passthrough', 'visualProperty': 'NODE_LABEL', 'mappingColumnType': u'String', 'mappingColumn': 'display name'} 241 | 242 | >>> NODE_SHAPE=age.mapVisualProperty('NODE_SHAPE','discrete','CDK',\ 243 | discrete=[ ["yes","not"], \ 244 | ["DIAMOND", "ellipse"] ]) 245 | 246 | >>> NODE_SIZE=age.mapVisualProperty('NODE_SIZE','discrete','CDK',\ 247 | discrete=[ ["yes","not"],\ 248 | 
["100.0","60.0"] ]) 249 | 250 | # imagine you have a log2(fold_change) column in your cytoscape table 251 | >>> cmap = matplotlib.cm.get_cmap("bwr") 252 | >>> norm = matplotlib.colors.Normalize(vmin=-4, vmax=4) 253 | >>> min_color=matplotlib.colors.rgb2hex(cmap(norm(-4))) 254 | >>> center_color=matplotlib.colors.rgb2hex(cmap(norm(0))) 255 | >>> max_color=matplotlib.colors.rgb2hex(cmap(norm(4))) 256 | >>> NODE_FILL_COLOR=age.mapVisualProperty('NODE_FILL_COLOR','continuous','log2(fold_change)',\ 257 | lower=[-4,min_color],center=[0.0,center_color],upper=[4,max_color]) 258 | ``` 259 | ___ 260 | 261 | ## ***aDiffCytoscape*** 262 | 263 | Plots tables from aDiff/cuffdiff into cytoscape using String protein queries. 264 | Uses top changed genes as well as first neighbours and difusion fo generate subnetworks. 265 | 266 | **`aDiffCytoscape(df, aging_genes, target, species="caenorhabditis elegans", limit=None, cutoff=0.4, taxon=None, cytoscape_host=cytoscape_host, cytoscape_port=cytoscape_port)`** 267 | 268 | * **`df`** df as outputed by aDiff for differential gene expression 269 | * **`aging_genes`** ENS gene ids to be labeled with a diagonal 270 | * **`target`** target destination for saving files without prefix. eg. "/beegfs/group_bit/home/JBoucas/test/N2_vs_daf2" 271 | * **`species`** species for string app query. eg. "caenorhabditis elegans", "drosophila melanogaster", "mus musculus", "homo sapiens" 272 | * **`limit`** limit for string app query. Number of extra genes to recover. If None, limit=N(query_genes)*.25 273 | * **`cuttoff`** confidence cuttoff for sting app query. Default=0.4 274 | * **`taxon`** taxon id for string app query. For the species shown above, taxon id will be automatically identified 275 | * **`cytoscape_host`** host address for cytoscape, default=cytoscape_host 276 | * **`cytoscape_port`** cytoscape port, defaut=cytoscape_port 277 | * **`returns`** nothing 278 | 279 | ```python 280 | >>> import AGEpy as age 281 | >>> print genes[:10] 282 | 283 | ['WBGene00008288', 'WBGene00002169', 'WBGene00008733', 'WBGene00004178', 'WBGene00004178', 'WBGene00004179', 'WBGene00004179', 'WBGene00020581', 'WBGene00001877', 'WBGene00001881'] 284 | 285 | >>> print df.head() 286 | 287 | ensembl_gene_id gene locus sample_1 sample_2 status \ 288 | 0 WBGene00022275 Y74C9A.1 I:43732-44677 N2 daf2 OK 289 | 1 WBGene00004418 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 290 | 2 WBGene00018774 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 291 | 3 WBGene00018772 F53G12.4 I:134336-137282 N2 daf2 OK 292 | 4 WBGene00018958 F56C11.6 I:171339-175991 N2 daf2 OK 293 | 294 | value_1 value_2 log2(fold_change) test_stat p_value q_value \ 295 | 0 0.195901 0.986634 2.332390 2.32959 0.00570 0.031216 296 | 1 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 297 | 2 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 298 | 3 1.235670 2.992460 1.276040 3.16508 0.00005 0.000556 299 | 4 2.651180 3.795600 0.517696 1.73994 0.00410 0.024157 300 | 301 | significant GO_id \ 302 | 0 yes NaN 303 | 1 yes GO:0003735; GO:0000463; GO:0044822; GO:0002181... 304 | 2 yes NaN 305 | 3 yes NaN 306 | 4 yes GO:0016787; GO:0005615; GO:0004104 307 | 308 | GO_term gene_biotype \ 309 | 0 NaN protein_coding 310 | 1 structural constituent of ribosome; maturation... protein_coding 311 | 2 NaN protein_coding 312 | 3 NaN protein_coding 313 | 4 hydrolase activity; extracellular space; choli... 
protein_coding 314 | 315 | NormInt evidence 316 | 0 -0.356904 no 317 | 1 3.458609 no 318 | 2 3.458609 no 319 | 3 0.283965 no 320 | 4 0.501360 no 321 | 322 | >>> age.aDiffCytoscape(df,genes,"/u/home/JBoucas/cytoscape/cyto") 323 | ``` 324 | -------------------------------------------------------------------------------- /docs/modules/david.md: -------------------------------------------------------------------------------- 1 | ## ___DAVIDenrich___ 2 | 3 | Queries the DAVID database for an enrichment analysis. 4 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 5 | 6 | **`DAVIDenrich(database, categories, user, ids, ids_bg = None, name = '', name_bg = '', verbose = False, p = 0.1, n = 2)`** 7 | 8 | * **`database`** A string for the database to query, e.g. 'WORMBASE_GENE_ID' 9 | * **`categories`** A comma separated string with databases 10 | * **`user`** A user ID registered at DAVID for querying 11 | * **`ids`** A list with identifiers 12 | * **`name`** A string with the name for the query set 13 | * **`ids_bg`** A list with the background identifiers to enrich against, 'None' for whole set 14 | * **`name_bg`** A string with the name for the background set 15 | * **`p`** Maximum p value for enrichment of a term 16 | * **`n`** Minimum number of genes within a term 17 | * **`returns`** None if no ids match the queried database, or a pandas dataframe with results 18 | 19 | ```python 20 | >>> import AGEpy as age 21 | >>> print sigGenes[:10] 22 | 23 | [u'WBGene00022275', u'WBGene00004418', u'WBGene00018774', 24 | u'WBGene00018772', u'WBGene00018958', u'WBGene00021662', 25 | u'WBGene00255594', u'WBGene00021658', u'WBGene00021026', 26 | u'WBGene00022042'] 27 | 28 | >>> categories=['GOTERM_BP_FAT', 'GOTERM_CC_FAT', 'GOTERM_MF_FAT', 'KEGG_PATHWAY','BIOCARTA', 'PFAM', 'PROSITE' ] 29 | >>> DAVIDdf=age.DAVIDenrich('WORMBASE_GENE_ID', categories, 'email.registered@david.com', sigGenes) 30 | >>> print DAVIDdf.head() 31 | 32 | categoryName termName listHits \ 33 | 0 GOTERM_BP_FAT GO:0006412~translation 177 34 | 1 GOTERM_BP_FAT GO:0006518~peptide metabolic process 198 35 | 2 GOTERM_BP_FAT GO:0043043~peptide biosynthetic process 177 36 | 3 GOTERM_BP_FAT GO:0043604~amide biosynthetic process 180 37 | 4 GOTERM_BP_FAT GO:0043603~cellular amide metabolic process 206 38 | 39 | percent ease \ 40 | 0 5.85704831238 4.32627669357e-43 41 | 1 6.55195234944 1.36601477909e-42 42 | 2 5.85704831238 4.04090150003e-42 43 | 3 5.95632031767 1.05565138148e-40 44 | 4 6.81667769689 3.74871147863e-40 45 | 46 | geneIds listTotals popHits \ 47 | 0 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 379 48 | 1 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 455 49 | 2 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 384 50 | 3 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 402 51 | 4 WBGENE00002063, WBGENE00006626, WBGENE00007584... 
1878 499 52 | 53 | popTotals foldEnrichment bonferroni benjamini \ 54 | 0 11221 2.79042292227 1.28576943333e-39 1.28576943333e-39 55 | 1 11221 2.60009830425 4.05979592345e-39 2.02989796172e-39 56 | 2 11221 2.75408929047 1.20095592581e-38 4.00318641936e-39 57 | 3 11221 2.6753612131 3.13739590576e-37 7.84348976441e-38 58 | 4 11221 2.46662227543 1.11411705145e-36 2.2282341029e-37 59 | 60 | afdr 61 | 0 7.78207551683e-40 62 | 1 2.45717759656e-39 63 | 2 7.26874466353e-39 64 | 3 1.89889814083e-37 65 | 4 6.7431553467e-37 66 | ``` 67 | ___ 68 | 69 | ## ___DAVIDgetGeneAttribute___ 70 | 71 | Returns a list of gene names for given gene ids. 72 | 73 | **`DAVIDgetGeneAttribute(x, df, refCol="ensembl_gene_id", fieldTOretrieve="gene_name")`** 74 | 75 | * **`x`** a string with the list of IDs separated by ', ' 76 | * **`df`** a dataframe with the reference column and a the column to retrieve 77 | * **`refCol`** the header of the column containing the identifiers 78 | * **`fieldTOretrieve`** the field to retrieve from parsedGTF eg. 'gene_name' 79 | * **`returns`** list of fieldTOretrieve separeted by ', ' in the same order as the given in x 80 | 81 | ```python 82 | >>> import AGEpy as age 83 | >>> print df.head() 84 | 85 | ensembl_gene_id gene locus sample_1 sample_2 status \ 86 | 0 WBGene00022275 Y74C9A.1 I:43732-44677 N2 daf2 OK 87 | 1 WBGene00004418 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 88 | 2 WBGene00018774 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 89 | 3 WBGene00018772 F53G12.4 I:134336-137282 N2 daf2 OK 90 | 4 WBGene00018958 F56C11.6 I:171339-175991 N2 daf2 OK 91 | 92 | value_1 value_2 log2(fold_change) test_stat p_value q_value \ 93 | 0 0.195901 0.986634 2.332390 2.32959 0.00570 0.031216 94 | 1 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 95 | 2 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 96 | 3 1.235670 2.992460 1.276040 3.16508 0.00005 0.000556 97 | 4 2.651180 3.795600 0.517696 1.73994 0.00410 0.024157 98 | 99 | significant GO_id \ 100 | 0 yes NaN 101 | 1 yes GO:0003735; GO:0000463; GO:0044822; GO:0002181... 102 | 2 yes NaN 103 | 3 yes NaN 104 | 4 yes GO:0016787; GO:0005615; GO:0004104 105 | 106 | GO_term gene_biotype \ 107 | 0 NaN protein_coding 108 | 1 structural constituent of ribosome; maturation... protein_coding 109 | 2 NaN protein_coding 110 | 3 NaN protein_coding 111 | 4 hydrolase activity; extracellular space; choli... protein_coding 112 | 113 | NormInt evidence 114 | 0 -0.356904 no 115 | 1 3.458609 no 116 | 2 3.458609 no 117 | 3 0.283965 no 118 | 4 0.501360 no 119 | 120 | >>> print DAVIDdf.head() 121 | 122 | categoryName termName listHits \ 123 | 0 GOTERM_BP_FAT GO:0006412~translation 177 124 | 1 GOTERM_BP_FAT GO:0006518~peptide metabolic process 198 125 | 2 GOTERM_BP_FAT GO:0043043~peptide biosynthetic process 177 126 | 3 GOTERM_BP_FAT GO:0043604~amide biosynthetic process 180 127 | 4 GOTERM_BP_FAT GO:0043603~cellular amide metabolic process 206 128 | 129 | percent ease \ 130 | 0 5.85704831238 4.32627669357e-43 131 | 1 6.55195234944 1.36601477909e-42 132 | 2 5.85704831238 4.04090150003e-42 133 | 3 5.95632031767 1.05565138148e-40 134 | 4 6.81667769689 3.74871147863e-40 135 | 136 | geneIds listTotals popHits \ 137 | 0 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 379 138 | 1 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 455 139 | 2 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 384 140 | 3 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 402 141 | 4 WBGENE00002063, WBGENE00006626, WBGENE00007584... 
1878 499 142 | 143 | popTotals foldEnrichment bonferroni benjamini \ 144 | 0 11221 2.79042292227 1.28576943333e-39 1.28576943333e-39 145 | 1 11221 2.60009830425 4.05979592345e-39 2.02989796172e-39 146 | 2 11221 2.75408929047 1.20095592581e-38 4.00318641936e-39 147 | 3 11221 2.6753612131 3.13739590576e-37 7.84348976441e-38 148 | 4 11221 2.46662227543 1.11411705145e-36 2.2282341029e-37 149 | 150 | afdr 151 | 0 7.78207551683e-40 152 | 1 2.45717759656e-39 153 | 2 7.26874466353e-39 154 | 3 1.89889814083e-37 155 | 4 6.7431553467e-37 156 | 157 | >>> gene_names=df[["ensembl_gene_id","gene"]].drop_duplicates() 158 | >>> DAVIDdf["gene_names"]=DAVIDdf["geneIds"].apply(lambda x: \ 159 | age.DAVIDgetGeneAttribute(x,\ 160 | gene_names,\ 161 | refCol="ensembl_gene_id",\ 162 | fieldTOretrieve="gene")) 163 | >>> print DAVIDdf.head() 164 | 165 | categoryName termName listHits \ 166 | 0 GOTERM_BP_FAT GO:0006412~translation 177 167 | 1 GOTERM_BP_FAT GO:0006518~peptide metabolic process 198 168 | 2 GOTERM_BP_FAT GO:0043043~peptide biosynthetic process 177 169 | 3 GOTERM_BP_FAT GO:0043604~amide biosynthetic process 180 170 | 4 GOTERM_BP_FAT GO:0043603~cellular amide metabolic process 206 171 | 172 | percent ease \ 173 | 0 5.85704831238 4.32627669357e-43 174 | 1 6.55195234944 1.36601477909e-42 175 | 2 5.85704831238 4.04090150003e-42 176 | 3 5.95632031767 1.05565138148e-40 177 | 4 6.81667769689 3.74871147863e-40 178 | 179 | geneIds listTotals popHits \ 180 | 0 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 379 181 | 1 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 455 182 | 2 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 384 183 | 3 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 402 184 | 4 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 499 185 | 186 | popTotals foldEnrichment bonferroni benjamini \ 187 | 0 11221 2.79042292227 1.28576943333e-39 1.28576943333e-39 188 | 1 11221 2.60009830425 4.05979592345e-39 2.02989796172e-39 189 | 2 11221 2.75408929047 1.20095592581e-38 4.00318641936e-39 190 | 3 11221 2.6753612131 3.13739590576e-37 7.84348976441e-38 191 | 4 11221 2.46662227543 1.11411705145e-36 2.2282341029e-37 192 | 193 | afdr gene_names 194 | 0 7.78207551683e-40 ife-5, Y105E8A.20, tsn-1, yars-1, ife-3, C14C1... 195 | 1 2.45717759656e-39 ife-5, tsn-1, C14C10.1, ife-3, rps-30, iff-2, ... 196 | 2 7.26874466353e-39 ife-5, Y105E8A.20, tsn-1, yars-1, ife-3, C14C1... 197 | 3 1.89889814083e-37 ife-5, Y105E8A.20, tsn-1, yars-1, ife-3, C14C1... 198 | 4 6.7431553467e-37 ife-5, tsn-1, C14C10.1, ife-3, rps-30, Y51H4A.... 199 | ``` 200 | ___ 201 | 202 | ## ***DAVIDplot*** 203 | 204 | Queries the DAVID database for an enrichment analysis and plots CellPlots as 205 | well as SymPlots (see plots) using the 20 most significant terms. 206 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 207 | 208 | **`DAVIDplot(database, categories, user, df_ids, output, df_ids_bg = None, name = '', name_bg = '', verbose = False, p = 0.1, n = 2)`** 209 | 210 | * **`database`** a string for the database to query, e.g. 'WORMBASE_GENE_ID' 211 | * **`categories`** a comma separated string with databases 212 | * **`user`** a user ID registered at DAVID for querying 213 | * **`df_ids`** a dataframe where the first column contains the identifiers 214 | to be queried and the second column the respective log2fc for each identifier. 
215 | * **`output`** /path/to/output/prefix 216 | * **`df_ids_bg`** a dataframe where the first column contains the identifiers to be used as background. 'None' for whole set 217 | * **`name`** a string with the name for the query set 218 | * **`name_bg`** a string with the name for the background set 219 | * **`p`** Maximum p value for enrichment of a term 220 | * **`n`** Minimum number of genes within a term 221 | 222 | * **`returns`** nothing 223 | 224 | ```python 225 | >>> import AGEpy as age 226 | >>> print df.head() 227 | 228 | ensembl_gene_id log2(fold_change) 229 | 0 ENSG00000272449 1.859500 230 | 1 ENSG00000130762 0.601051 231 | 2 ENSG00000083444 -0.881957 232 | 3 ENSG00000162493 -0.638433 233 | 4 ENSG00000253368 0.654517 234 | 235 | >>> categories=['GOTERM_BP_FAT', 'GOTERM_CC_FAT', 'GOTERM_MF_FAT', 'KEGG_PATHWAY','BIOCARTA', 'PFAM', 'PROSITE' ] 236 | >>> DAVIDdf=DAVIDplot('ENSEMBL_GENE_ID', categories, 'email.registered@david.com', df, "/usr/home/JDoe/mydataset") 237 | ``` 238 | ___ 239 | -------------------------------------------------------------------------------- /docs/modules/fasta.md: -------------------------------------------------------------------------------- 1 | ## ___getFasta___ 2 | 3 | Retrieves a sequence from an opened multifasta file. 4 | 5 | **`getFasta(opened_file, sequence_name)`** 6 | 7 | * **`opened_file`** an opened multifasta file eg. opened_file=open("/path/to/file.fa",'r+') 8 | * **`sequence_name`** the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2) 9 | * **`returns`** a string with the sequence of interest 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> fafile="/path/to/GRCm38.dna.primary_assembly.fa" 14 | >>> with open(fafile, "r") as fastafile: 15 | ... chr2=age.getFasta(fastafile, "2") 16 | >>> print len(chr2) 17 | 18 | 182113224 19 | 20 | >>> print chr2[82113224:82113284] 21 | 22 | AGGGTGAATGATGTTTCTGGTACAGTGTACCAGTAAACCTAGCAGTAGGAGCATCAGTAT 23 | ``` 24 | ___ 25 | 26 | ## ___writeFasta___ 27 | 28 | Writes a fasta sequence into a file. 29 | 30 | **`writeFasta(sequence, sequence_name, output_file)`** 31 | 32 | * **`sequence`** a string with the sequence to be written 33 | * **`sequence_name`** name of the the fasta sequence 34 | * **`output_file`** /path/to/file.fa to be written 35 | * **`returns`** nothing 36 | 37 | ```python 38 | >>> import AGEpy as age 39 | >>> print len(chr2) 40 | 41 | 182113224 42 | 43 | >>> print chr2[82113224:82113284] 44 | 45 | AGGGTGAATGATGTTTCTGGTACAGTGTACCAGTAAACCTAGCAGTAGGAGCATCAGTAT 46 | 47 | >>> age.writeFasta(chr2,"2 my version of this sequence","/path/to/out/file.fa") 48 | ``` 49 | ___ 50 | 51 | ## ___rewriteFasta___ 52 | 53 | Rewrites a specific sequence in a multifasta file while keeping the sequence header. 54 | 55 | **`rewriteFasta(sequence, sequence_name, fasta_in, fasta_out)`** 56 | 57 | * **`sequence`** a string with the sequence to be written 58 | * **`sequence_name`** the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2) 59 | * **`fasta_in`** /path/to/original.fa 60 | * **`fasta_out`** /path/to/destination.fa 61 | * **`returns`** nothing 62 | 63 | ```python 64 | >>> import AGEpy as age 65 | >>> fafile="/path/to/GRCm38.dna.primary_assembly.fa" 66 | >>> with open(fafile, "r") as fastafile: 67 | ... 
chr2=age.getFasta(fastafile, "2") 68 | >>> chr2=chr2.strip("N") 69 | >>> age.rewriteFasta(chr2, "2", fafile, "/path/to/modified/file.fa") 70 | ``` 71 | ___ 72 | -------------------------------------------------------------------------------- /docs/modules/go.md: -------------------------------------------------------------------------------- 1 | ## ___getGeneAssociation___ 2 | 3 | This function collects GO annotation from http://geneontology.org/page/download-annotations. 4 | 5 | **`getGeneAssociation(URL_or_file)`** 6 | 7 | * **`URL_or_file`** either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective downloded .gz file. 8 | 9 | * **`returns`** a Pandas dataframe with the parsed table. 10 | 11 | ```python 12 | >>> import pandas as pd 13 | >>> gA=age.getGeneAssociation("http://geneontology.org/gene-associations/gene_association.wb.gz") 14 | >>> print gA.head() 15 | 16 | DB DB_Object_ID DB_Object_Symbol Qualifier GO ID \ 17 | 0 WB WBGene00000001 aap-1 GO:0005942 18 | 1 WB WBGene00000001 aap-1 GO:0005942 19 | 2 WB WBGene00000001 aap-1 GO:0008286 20 | 3 WB WBGene00000001 aap-1 GO:0008286 21 | 4 WB WBGene00000001 aap-1 GO:0008286 22 | 23 | DB:Reference Evidence With (or) From Aspect \ 24 | 0 GO_REF:0000002 IEA InterPro:IPR001720 C 25 | 1 WB_REF:WBPaper00005614|PMID:12393910 IDA C 26 | 2 WB_REF:WBPaper00005614|PMID:12393910 IGI WB:WBGene00000090 P 27 | 3 WB_REF:WBPaper00005614|PMID:12393910 IGI WB:WBGene00000898 P 28 | 4 WB_REF:WBPaper00005614|PMID:12393910 IMP P 29 | 30 | DB_Object_Name DB_Object_Synonym DB_Object_Type Taxon Date \ 31 | 0 Y110A7A.10 gene taxon:6239 20170321 32 | 1 Y110A7A.10 gene taxon:6239 20151214 33 | 2 Y110A7A.10 gene taxon:6239 20151214 34 | 3 Y110A7A.10 gene taxon:6239 20151214 35 | 4 Y110A7A.10 gene taxon:6239 20060302 36 | 37 | Assigned_by Annotation Extension Gene Product Form ID 38 | 0 WB 39 | 1 WB 40 | 2 WB 41 | 3 WB 42 | 4 WB 43 | ``` 44 | ___ 45 | -------------------------------------------------------------------------------- /docs/modules/gtf.md: -------------------------------------------------------------------------------- 1 | ## ___readGTF___ 2 | 3 | Reads a GTF file and labels the respective columns in agreement with GTF file standards: 4 | 'seqname','source','feature','start','end','score','strand','frame','attribute'. 5 | 6 | **`readGTF(infile)`** 7 | 8 | * **`infile`** /path/to/file.gtf 9 | * **`returns`** a Pandas dataframe of the respective GTF 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> GTF=age.readGTF("gencode.v24.primary_assembly.annotation.gtf") 14 | >>> print GTF.head() 15 | 16 | seqname source feature start end score strand frame \ 17 | 0 chr1 HAVANA gene 11869 14409 . + . 18 | 1 chr1 HAVANA transcript 11869 14409 . + . 19 | 2 chr1 HAVANA exon 11869 12227 . + . 20 | 3 chr1 HAVANA exon 12613 12721 . + . 21 | 4 chr1 HAVANA exon 13221 14409 . + . 22 | 23 | attribute 24 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 25 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 26 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 27 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 28 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." 29 | ``` 30 | ___ 31 | 32 | ## ***retrieve_GTF_field*** 33 | 34 | Returns a field of choice from the attribute column of the GTF. 
35 | 36 | **`retrieve_GTF_field(field,gtf)`** 37 | 38 | * **`field`** field to be retrieved * **`gtf`** GTF dataframe from which the field will be retrieved 39 | * **`returns`** a Pandas dataframe with one column containing the field of choice 40 | 41 | ```python 42 | >>> import AGEpy as age 43 | >>> GTF=age.readGTF("/gencode.v24.primary_assembly.annotation.gtf") 44 | >>> print GTF.head() 45 | 46 | seqname source feature start end score strand frame \ 47 | 0 chr1 HAVANA gene 11869 14409 . + . 48 | 1 chr1 HAVANA transcript 11869 14409 . + . 49 | 2 chr1 HAVANA exon 11869 12227 . + . 50 | 3 chr1 HAVANA exon 12613 12721 . + . 51 | 4 chr1 HAVANA exon 13221 14409 . + . 52 | 53 | attribute 54 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 55 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 56 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 57 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 58 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." 59 | 60 | >>> GTF["gene_id"]=age.retrieve_GTF_field("gene_id",GTF) 61 | >>> print GTF.head() 62 | 63 | seqname source feature start end score strand frame \ 64 | 0 chr1 HAVANA gene 11869 14409 . + . 65 | 1 chr1 HAVANA transcript 11869 14409 . + . 66 | 2 chr1 HAVANA exon 11869 12227 . + . 67 | 3 chr1 HAVANA exon 12613 12721 . + . 68 | 4 chr1 HAVANA exon 13221 14409 . + . 69 | 70 | attribute gene_id 71 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." ENSG00000223972.5 72 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 73 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 74 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 75 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 76 | ``` 77 | ___ 78 | 79 | ## ___attributesGTF___ 80 | 81 | Lists the types of attributes present in the attribute section of a GTF file. 82 | 83 | **`attributesGTF(inGTF)`** 84 | 85 | * **`inGTF`** GTF dataframe to be analysed 86 | * **`returns`** a list of attributes present in the attribute section 87 | 88 | ```python 89 | >>> import AGEpy as age 90 | >>> attributes=age.attributesGTF(GTF) 91 | >>> print attributes 92 | 93 | ['gene_status', 'havana_gene', 'transcript_support_level', 'level', 'transcript_type', 'tag', 'protein_id', 'gene_id', 'exon_id', 'transcript_id', 'exon_number', 'ont', 'havana_transcript', 'ccdsid', 'transcript_name', 'gene_type', 'transcript_status', 'gene_name'] 94 | ``` 95 | ___ 96 | ## ___parseGTF___ 97 | 98 | Reads and extracts all attributes from the attribute section of a GTF and constructs a new dataframe with one column per attribute instead of the single attribute column. 99 | 100 | **`parseGTF(inGTF)`** 101 | 102 | * **`inGTF`** GTF dataframe to be parsed 103 | * **`returns`** a dataframe of the original input GTF with the attributes parsed 104 | 105 | ```python 106 | >>> GTF=age.readGTF("gencode.v24.primary_assembly.annotation.gtf") 107 | >>> print GTF.head() 108 | 109 | seqname source feature start end score strand frame \ 110 | 0 chr1 HAVANA gene 11869 14409 . + . 111 | 1 chr1 HAVANA transcript 11869 14409 . + . 112 | 2 chr1 HAVANA exon 11869 12227 . + . 113 | 3 chr1 HAVANA exon 12613 12721 . + . 114 | 4 chr1 HAVANA exon 13221 14409 . + . 115 | 116 | attribute 117 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 118 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 119 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 120 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 121 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..."
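>>> # parseGTF (next step) expands each key of the attribute column into its own column; attributes missing for a given feature come out as NaN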
122 | 123 | >>> GTFpa=age.parseGTF(GTF) 124 | >>> print GTFpa.head() 125 | 126 | seqname source feature start end score strand frame gene_status \ 127 | 0 chr1 HAVANA gene 11869 14409 . + . KNOWN 128 | 1 chr1 HAVANA transcript 11869 14409 . + . KNOWN 129 | 2 chr1 HAVANA exon 11869 12227 . + . KNOWN 130 | 3 chr1 HAVANA exon 12613 12721 . + . KNOWN 131 | 4 chr1 HAVANA exon 13221 14409 . + . KNOWN 132 | 133 | havana_gene ... exon_id transcript_id \ 134 | 0 OTTHUMG00000000961.2 ... NaN NaN 135 | 1 OTTHUMG00000000961.2 ... NaN ENST00000456328.2 136 | 2 OTTHUMG00000000961.2 ... ENSE00002234944.1 ENST00000456328.2 137 | 3 OTTHUMG00000000961.2 ... ENSE00003582793.1 ENST00000456328.2 138 | 4 OTTHUMG00000000961.2 ... ENSE00002312635.1 ENST00000456328.2 139 | 140 | exon_number ont havana_transcript ccdsid transcript_name \ 141 | 0 NaN NaN NaN NaN NaN 142 | 1 NaN NaN OTTHUMT00000362751.1 NaN DDX11L1-002 143 | 2 1 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 144 | 3 2 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 145 | 4 3 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 146 | 147 | gene_type transcript_status gene_name 148 | 0 transcribed_unprocessed_pseudogene NaN DDX11L1 149 | 1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 150 | 2 transcribed_unprocessed_pseudogene KNOWN DDX11L1 151 | 3 transcribed_unprocessed_pseudogene KNOWN DDX11L1 152 | 4 transcribed_unprocessed_pseudogene KNOWN DDX11L1 153 | ``` 154 | ___ 155 | 156 | ## ___writeGTF___ 157 | 158 | Writes a GTF dataframe into a file. 159 | 160 | **`writeGTF(inGTF,file_path)`** 161 | 162 | * **`inGTF`** GTF dataframe to be written. It should either have 9 columns with the last one being the "attributes" section or more than 9 columns where all columns after the 8th will be collapsed into one. 163 | * **`file_path`** /path/to/the/file.gtf 164 | * **`returns`** nothing 165 | 166 | ```python 167 | >>> import AGEpy as age 168 | >>> age.writeGTF(GTFpa,"/path/to/new/file.gtf") 169 | ``` 170 | ___ 171 | 172 | ## ___MAPGenoToTrans___ 173 | 174 | Gets the positions of all bases in each exon (or other feature), grouped by transcript. 175 | 176 | **`MAPGenoToTrans(parsedGTF,feature)`** 177 | 178 | * **`parsedGTF`** a Pandas dataframe with 'start','end', and 'strand' information for each entry. It must contain ['seqname','feature','start','end','strand','frame','gene_id', 'transcript_id','exon_id','exon_number'] 179 | * **`feature`** feature upon which to generate the map, eg. 'exon' or 'transcript' 180 | * **`returns`** a dictionary with, for each transcript, a string of the comma separated positions of all its bases 181 | 182 | ```python 183 | >>> import AGEpy as age 184 | >>> print GTF.head() 185 | 186 | seqname source feature start end score strand frame \ 187 | 0 chr1 HAVANA gene 11869 14409 . + . 188 | 1 chr1 HAVANA transcript 11869 14409 . + . 189 | 2 chr1 HAVANA exon 11869 12227 . + . 190 | 3 chr1 HAVANA exon 12613 12721 . + . 191 | 4 chr1 HAVANA exon 13221 14409 . + . 192 | 193 | attribute gene_id \ 194 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." ENSG00000223972.5 195 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 196 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 197 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 198 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..."
ENSG00000223972.5 199 | 200 | transcript_id exon_id exon_number 201 | 0 NaN NaN NaN 202 | 1 ENST00000456328.2 NaN NaN 203 | 2 ENST00000456328.2 ENSE00002234944.1 1 204 | 3 ENST00000456328.2 ENSE00003582793.1 2 205 | 4 ENST00000456328.2 ENSE00002312635.1 3 206 | 207 | >>> GtoT=age.MAPGenoToTrans(GTF,"exon") 208 | >>> print GtoT 209 | 210 | {ENST23923910:'234,235,236,1021,..'} 211 | ``` 212 | ___ 213 | 214 | ## ___GTFtoBED___ 215 | 216 | Transforms a GTF dataframe into a bed dataframe. 217 | 218 | **`GTFtoBED(inGTF,name)`** 219 | 220 | * **`inGTF`** GTF dataframe for transformation 221 | * **`name`** field of the GTF dataframe to be used for the bed 'name' position 222 | * **`returns`** a bed dataframe with the corresponding bed fields: 'chrom','chromStart','chromEnd','name','score','strand' 223 | 224 | ```python 225 | >>> import AGEpy as age 226 | >>> bed = age.GTFtoBED(GTF, "gene_id") 227 | ``` 228 | ___ 229 | ## ___GetTransPosition___ 230 | 231 | Maps a genome position to a transcript position. 232 | 233 | **`GetTransPosition(df, field, dic, refCol="transcript_id")`** 234 | 235 | * **`df`** a Pandas dataframe 236 | * **`field`** the header of the column containing the genomic position 237 | * **`dic`** a dictionary containing for each transcript the respective bases eg. {ENST23923910:'234,235,236,1021,..'}. See *MAPGenoToTrans*. 238 | * **`refCol`** header of the reference column with IDs, eg. 'transcript_id' 239 | 240 | ```python 241 | >>> import AGEpy as age 242 | >>> print GTF_.head() 243 | 244 | seqname source feature start end score strand frame \ 245 | 2 chr1 HAVANA exon 11869 12227 . + . 246 | 3 chr1 HAVANA exon 12613 12721 . + . 247 | 4 chr1 HAVANA exon 13221 14409 . + . 248 | 6 chr1 HAVANA exon 12010 12057 . + . 249 | 7 chr1 HAVANA exon 12179 12227 . + . 250 | 251 | attribute gene_id \ 252 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 253 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 254 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 255 | 6 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 256 | 7 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 257 | 258 | transcript_id exon_id exon_number target 259 | 2 ENST00000456328.2 ENSE00002234944.1 1 12000 260 | 3 ENST00000456328.2 ENSE00003582793.1 2 12617 261 | 4 ENST00000456328.2 ENSE00002312635.1 3 14000 262 | 6 ENST00000450305.2 ENSE00001948541.1 1 12040 263 | 7 ENST00000450305.2 ENSE00001671638.2 2 12210 264 | 265 | >>> GTF_["transcript target"]=GTF_.apply(age.GetTransPosition, \ 266 | args=("target",GtoT),axis=1) 267 | >>> print GTF_.head() 268 | 269 | seqname source feature start end score strand frame \ 270 | 2 chr1 HAVANA exon 11869 12227 . + . 271 | 3 chr1 HAVANA exon 12613 12721 . + . 272 | 4 chr1 HAVANA exon 13221 14409 . + . 273 | 6 chr1 HAVANA exon 12010 12057 . + . 274 | 7 chr1 HAVANA exon 12179 12227 . + . 275 | 276 | attribute gene_id \ 277 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 278 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 279 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 280 | 6 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 281 | 7 gene_id "ENSG00000223972.5"; transcript_id "EN..."
ENSG00000223972.5 282 | 283 | transcript_id exon_id exon_number target transcript target 284 | 2 ENST00000456328.2 ENSE00002234944.1 1 12000 132 285 | 3 ENST00000456328.2 ENSE00003582793.1 2 12617 364 286 | 4 ENST00000456328.2 ENSE00002312635.1 3 14000 1248 287 | 6 ENST00000450305.2 ENSE00001948541.1 1 12040 31 288 | 7 ENST00000450305.2 ENSE00001671638.2 2 12210 80 289 | ``` 290 | ___ 291 | 292 | ## ***getPromotersBed*** 293 | 294 | Reads a gtf file and returns a bed-format dataframe with the promoter coordinates. 295 | 296 | **`getPromotersBed(gtf,fa,upstream=2000,downstream=200)`** 297 | 298 | * **`gtf`** path/to/file.gtf. Must be an ensembl gtf. 299 | * **`fa`** path/to/fasta.fa. Must be an ensembl fasta file. 300 | * **`upstream`** number of bases upstream of the transcript start site at which the promoter should start 301 | * **`downstream`** number of bases downstream of the transcript start site at which the promoter should end 302 | 303 | * **`returns`** a pandas dataframe in bed format 304 | 305 | ```python 306 | >>> import AGEpy as age 307 | >>> bed=age.getPromotersBed(gtf="Caenorhabditis_elegans.WBcel235.89.gtf",\ 308 | fa="Caenorhabditis_elegans.WBcel235.dna.toplevel.fa",\ 309 | upstream=2000,downstream=200) 310 | >>> print(bed.head()) 311 | 312 | chrom chromStart chromEnd name score strand 313 | 0 V 0 380 WBGene00197333, cTel3X.2 . + 314 | 1 V 0 5857 WBGene00015153, B0348.5 . + 315 | 2 V 129 2329 WBGene00198386, cTel3X.3 . - 316 | 3 V 7622 9822 WBGene00002061, ife-3 . - 317 | 4 V 8539 10739 WBGene00255704, B0348.10 . - 318 | ``` 319 | ___ 320 | 321 | -------------------------------------------------------------------------------- /docs/modules/homology.md: -------------------------------------------------------------------------------- 1 | ## ___getHomoloGene___ 2 | 3 | Returns NCBI's HomoloGene tables. 4 | 5 | **`getHomoloGene(taxfile="build_inputs/taxid_taxname", genefile="homologene.data", proteinsfile="build_inputs/all_proteins.data", proteinsclusterfile="build_inputs/proteins_for_clustering.data", baseURL="http://ftp.ncbi.nih.gov/pub/HomoloGene/current/")`** 6 | 7 | * **`taxfile`** path to local file or to baseURL/taxfile, default="build_inputs/taxid_taxname" 8 | * **`genefile`** path to local file or to baseURL/genefile, default="homologene.data" 9 | * **`proteinsfile`** path to local file or to baseURL/proteinsfile, default="build_inputs/all_proteins.data" 10 | * **`proteinsclusterfile`** path to local file or to baseURL/proteinsclusterfile, default="build_inputs/proteins_for_clustering.data" 11 | * **`baseURL`** baseURL for downloading files, default="http://ftp.ncbi.nih.gov/pub/HomoloGene/current/" 12 | * **`returns genedf`** HomoloGene Pandas dataframe 13 | * **`returns protclusdf`** Pandas dataframe. Lists the one protein per gene that was used for homologene clustering. 14 | If a gene has multiple protein accessions derived from alternative splicing, 15 | only the protein isoform that gives the most protein alignment to proteins in other species 16 | was selected for clustering and it is listed in this file. 17 | * **`returns proteinsdf`** Pandas dataframe. Lists all proteins and their gene information. 18 | If a gene has multiple protein accessions derived from an alternative splicing event, 19 | each protein accession is listed on a separate line.
20 | 21 | ```python 22 | >>> import AGEpy as age 23 | >>> genedf, protclusdf, proteinsdf = age.getHomoloGene() 24 | >>> print genedf.head() 25 | 26 | HID Taxonomy ID Gene ID Gene Symbol Protein gi Protein accession \ 27 | 0 3 9606 34 ACADM 4557231 NP_000007.1 28 | 1 3 9598 469356 ACADM 160961497 NP_001104286.1 29 | 2 3 9544 705168 ACADM 109008502 XP_001101274.1 30 | 3 3 9615 490207 ACADM 545503811 XP_005622188.1 31 | 4 3 9913 505968 ACADM 115497690 NP_001068703.1 32 | 33 | organism 34 | 0 Homo sapiens 35 | 1 Pan troglodytes 36 | 2 Macaca mulatta 37 | 3 Canis lupus familiaris 38 | 4 Bos taurus 39 | 40 | >>> print protclusdf.head() 41 | 42 | taxid entrez GeneID gene symbol gene description protein accession.ver \ 43 | 0 3702 10723019 AT1G27045 AT1G27045 NP_001185103.1 44 | 1 3702 10723020 AT2G41231 AT2G41231 NP_001189726.1 45 | 2 3702 10723023 AT1G24095 AT1G24095 NP_001185076.1 46 | 3 3702 10723026 AT1G12855 AT1G12855 NP_001184976.1 47 | 4 3702 10723027 AT4G22758 AT4G22758 NP_001190802.1 48 | 49 | mrna accession.ver length of protein listed in column 5 \ 50 | 0 NM_001198174.1 227 51 | 1 NM_001202797.1 99 52 | 2 NM_001198147.1 213 53 | 3 NM_001198047.1 462 54 | 4 NM_001203873.1 255 55 | 56 | -11) contains data about gene location on the genome \ 57 | 0 240254421 58 | 1 240254678 59 | 2 240254421 60 | 3 240254421 61 | 4 240256243 62 | 63 | starting position of gene in 0-based coordinate \ 64 | 0 9391608 65 | 1 17195291 66 | 2 8523246 67 | 3 4382159 68 | 4 11958309 69 | 70 | end position of the gene in 0-based coordinate strand \ 71 | 0 9393018 + 72 | 1 17195914 + 73 | 2 8524928 + 74 | 3 4383610 + 75 | 4 11960035 + 76 | 77 | nucleotide gi of genomic sequence where this gene is annotated \ 78 | 0 AT1G27045 79 | 1 AT2G41231 80 | 2 AT1G24095 81 | 3 AT1G12855 82 | 4 AT4G22758 83 | 84 | organism 85 | 0 Arabidopsis thaliana 86 | 1 Arabidopsis thaliana 87 | 2 Arabidopsis thaliana 88 | 3 Arabidopsis thaliana 89 | 4 Arabidopsis thaliana 90 | 91 | >>> print proteinsdf.head() 92 | 93 | taxid entrez GeneID gene symbol gene description protein accession.ver \ 94 | 0 3702 10723019 AT1G27045 AT1G27045 NP_001185103.1 95 | 1 3702 10723020 AT2G41231 AT2G41231 NP_001189725.1 96 | 2 3702 10723020 AT2G41231 AT2G41231 NP_001189726.1 97 | 3 3702 10723023 AT1G24095 AT1G24095 NP_001185076.1 98 | 4 3702 10723026 AT1G12855 AT1G12855 NP_001184976.1 99 | 100 | mrna accession.ver length of protein listed in column 5 \ 101 | 0 NM_001198174.1 227 102 | 1 NM_001202796.1 104 103 | 2 NM_001202797.1 99 104 | 3 NM_001198147.1 213 105 | 4 NM_001198047.1 462 106 | 107 | -11) contains data about gene location on the genome \ 108 | 0 240254421 109 | 1 240254678 110 | 2 240254678 111 | 3 240254421 112 | 4 240254421 113 | 114 | starting position of gene in 0-based coordinate \ 115 | 0 9391608 116 | 1 17195291 117 | 2 17195291 118 | 3 8523246 119 | 4 4382159 120 | 121 | end position of the gene in 0-based coordinate strand \ 122 | 0 9393018 + 123 | 1 17195914 + 124 | 2 17195914 + 125 | 3 8524928 + 126 | 4 4383610 + 127 | 128 | nucleotide gi of genomic sequence where this gene is annotated \ 129 | 0 AT1G27045 130 | 1 AT2G41231 131 | 2 AT2G41231 132 | 3 AT1G24095 133 | 4 AT1G12855 134 | 135 | organism 136 | 0 Arabidopsis thaliana 137 | 1 Arabidopsis thaliana 138 | 2 Arabidopsis thaliana 139 | 3 Arabidopsis thaliana 140 | 4 Arabidopsis thaliana 141 | ``` 142 | ___ 143 | -------------------------------------------------------------------------------- /docs/modules/meme.md: 
-------------------------------------------------------------------------------- 1 | ## ___filterMotifs___ 2 | 3 | Selects motifs from a meme file based on the number of sites. 4 | 5 | **`filterMotifs(memeFile,outFile, minSites)`** 6 | 7 | * **`memeFile`** MEME file to be read 8 | * **`outFile`** MEME file to be written 9 | * **`minSites`** minimum number of sites each motif needs to have to be valid 10 | * **`returns`** nothing 11 | 12 | ```python 13 | >>> import AGEpy as age 14 | >>> age.filterMotifs("/path/to/input.meme","/path/to/output.meme", 15) 15 | ``` 16 | ___ 17 | -------------------------------------------------------------------------------- /docs/modules/p53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/p53.png -------------------------------------------------------------------------------- /docs/modules/plots.md: -------------------------------------------------------------------------------- 1 | ## ___CellPlot___ 2 | 3 | Python implementation of the CellPlot from the CellPlot package for R. 4 | -inf or inf enrichments will come out as min found float or max found float, respectively. 5 | 6 | **`CellPlot(df, output_file=None, term_col="termName", gene_expression_col='log2fc', gene_expression="log2FC",x_values="-log10(p)", xaxis_label = "-log10(p)", pvalCol="ease", figure_title="CellPlot", lowerLimit=None, upperLimit=None, colorBarType='coolwarm')`** 7 | 8 | * **`df`** pandas dataframe with the following columns - 'Enrichment', 'Term', and 'log2fc'. 9 | For log2fc each cell must contain a comma separated string with the log2fc for the genes enriched in the respective term. 10 | eg. '-inf,-1,2,3.4,3.66,inf' 11 | * **`output_file`** prefix for an output file. If given it will create output_file.CellPlot.svg and output_file.CellPlot.png 12 | * **`gene_expression_col`** column with gene expression data separated by a comma (ie. ',') 13 | * **`gene_expression`** label for the color gradient bar. 14 | * **`x_values`** values to use on the x-axis 15 | * **`xaxis_label`** label for the x-axis 16 | * **`figure_title`** Figure title. 17 | * **`term_col`** the column with the term names 18 | * **`pvalCol`** name of the column containing the p values to determine if the terms should be marked as NS - not significant, use None for no marking 19 | * **`lowerLimit`** lower limit for the heatmap bar (default is the 0.1 percentile) 20 | * **`upperLimit`** upper limit for the heatmap bar (default is the 0.9 percentile) 21 | * **`colorBarType`** type of heatmap, 'coolwarm' is the default, alternatives eg. 'Spectral', 'seismic' 22 | * **`returns`** a matplotlib figure 23 | 24 | ```python 25 | >>> import AGEpy as age 26 | >>> print df.head() 27 | 28 | categoryName termName listHits \ 29 | 0 GOTERM_BP_FAT GO:0006396~RNA processing 716 30 | 1 GOTERM_BP_FAT GO:0043933~macromolecular complex subunit orga... 1433 31 | 2 GOTERM_BP_FAT GO:0016071~mRNA metabolic process 523 32 | 3 GOTERM_BP_FAT GO:0044085~cellular component biogenesis 1596 33 | 4 GOTERM_BP_FAT GO:0022613~ribonucleoprotein complex biogenesis 398 34 | 35 | percent ease \ 36 | 0 10.599556 8.904648e-157 37 | 1 21.213916 2.144221e-124 38 | 2 7.742413 1.473027e-109 39 | 3 23.626943 2.398988e-108 40 | 4 5.891932 7.142953e-99 41 | 42 | geneIds listTotals popHits \ 43 | 0 ENSG00000151304, ENSG00000091127, ENSG00000171... 6085 910 44 | 1 ENSG00000166337, ENSG00000110075, ENSG00000110...
6085 2461 45 | 2 ENSG00000138385, ENSG00000106355, ENSG00000110... 6085 672 46 | 3 ENSG00000110075, ENSG00000110074, ENSG00000164... 6085 2914 47 | 4 ENSG00000151304, ENSG00000215301, ENSG00000171... 6085 480 48 | 49 | popTotals foldEnrichment bonferroni benjamini afdr \ 50 | 0 16650 2.152907 1.021719e-152 1.021719e-152 1.842717e-153 51 | 1 16650 1.593266 2.460279e-120 1.230140e-120 4.437224e-121 52 | 2 16650 2.129541 1.690151e-105 5.633836e-106 3.048263e-106 53 | 3 16650 1.498639 2.752599e-104 6.881498e-105 4.964436e-105 54 | 4 16650 2.268796 8.195824e-95 1.639165e-95 1.478154e-95 55 | 56 | genes name \ 57 | 0 SRFBP1, PUS7, CHD7, SSB, LSM5, NOB1, GTF2H1, A... 58 | 1 TAF10, PPP6R3, FOXRED1, PSMC1, ILK, EP400, CTB... 59 | 2 SSB, LSM5, GTF2H1, ALYREF, RPS10, SNRNP35, CNO... 60 | 3 PPP6R3, FOXRED1, UTP15, SIX1, LLGL1, RPL9, TRI... 61 | 4 SRFBP1, DDX3X, CHD7, NOB1, RPS10, UTP15, RPL30... 62 | 63 | log2fc -log10(p) 64 | 0 0.153, 0.37, 0.023, 0.321, 0.084, 0.61, 0.118,... 156.050383 65 | 1 -0.309, 0.078, -0.063, 0.005, 0.054, -0.051, 0... 123.668730 66 | 2 0.321, 0.084, 0.118, -0.013, 0.06, -0.055, 0.0... 108.831789 67 | 3 0.078, -0.063, -0.303, -0.39, -0.254, 0.092, -... 107.619972 68 | 4 0.153, 0.221, 0.023, 0.61, 0.06, -0.303, 0.15,... 98.146122 69 | 70 | >>> cellplot=age.CellPlot(df[:20]) 71 | ``` 72 | ![cellplot](cellplot.CellPlot.png) 73 | ___ 74 | 75 | ## ___SymPlot___ 76 | 77 | Python implementation of the SymPlot from the CellPlot package for R. 78 | -inf or inf enrichments will come out as min found float or max found float, respectively. 79 | 80 | **`SymPlot(df,output_file=None,figure_title="SymPlot", pvalCol="ease", term_col="termName", x_values="-log10(p)", n_terms_col="listHits", gene_expression_col="log2fc" , xaxis_label = "-log10(p)", colorBarType='coolwarm')`** 81 | 82 | * **`df`** pandas dataframe with the following columns - 'Enrichment', 'Significant', 'Annotated', 'Term', and 'log2fc'. 'Annotated' stands for the number of genes annotated with the respective GO term, as reported in DAVID by listHits. For log2fc each cell must contain a comma separated string with the log2fc for the genes enriched in the respective term, eg. '-inf,-1,2,3.4,3.66,inf' 83 | * **`gene_expression_col`** column with gene expression data separated by a comma (ie. ',') 84 | * **`gene_expression`** label for the color gradient bar. 85 | * **`x_values`** values to use on the x-axis 86 | * **`xaxis_label`** label for the x-axis 87 | * **`term_col`** the column with the term names 88 | * **`output_file`** prefix for an output file. If given it will create output_file.SymPlot.svg and output_file.SymPlot.png 89 | * **`figure_title`** Figure title. 90 | * **`pvalCol`** name of the column containing the p values to determine if the terms should be marked as NS - not significant, use None for no marking 91 | * **`colorBarType`** type of heatmap, 'coolwarm' is the default, alternatives eg. 'seismic', 'Spectral', 'bwr' 92 | * **`returns`** a matplotlib figure 93 | 94 | ```python 95 | >>> import AGEpy as age 96 | >>> symplot=age.SymPlot(df[:20],"symplot", "mutant 1",pvalCol="ease") 97 | ``` 98 | ![symplot](symplot.SymPlot.png) 99 | ___ 100 | ## ___MA___ 101 | 102 | Plots an MA-like plot.
103 | 104 | **`MA(df, title, figName, c, daType="counts", nbins=10, perc=.5, deg=3, eq=True, splines=True, spec=None, Targets=None, ylim=None, sizeRed=8)`** 105 | 106 | * **`df`** dataframe output of GetData() 107 | * **`title`** plot title, 'Genes' or 'Transcripts' 108 | * **`figName`** /path/to/saved/figure/prefix 109 | * **`c`** pair of samples to be plotted in list format 110 | * **`daType`** data type, ie. 'counts' or 'FPKM' 111 | * **`nbins`** number of bins on normalized intensities to fit the splines 112 | * **`perc`** log2(fold change) percentile to which the splines will be fitted 113 | * **`deg`** degrees of freedom used to fit the splines 114 | * **`eq`** if True, assumes for each bin that the lower and upper values are equally distant to 0, taking the smaller distance for both 115 | * **`splines`** plot splines, default=True 116 | * **`spec`** list of ids to be highlighted 117 | * **`Targets`** list of ids that will be highlighted if outside of the fitted splines 118 | * **`ylim`** a list of limits to apply on the y-axis of the plot 119 | * **`sizeRed`** size of the highlight marker 120 | * **`returns df_`** a Pandas dataframe similar to the GetData() output with normalized intensities and rows outside the spline bounds marked with 1. 121 | * **`returns red`** list of ids that are highlighted 122 | 123 | ```python 124 | >>> import AGEpy as age 125 | >>> print df.head() 126 | 127 | gene_id gene wt0 wt20 log2(wt20/wt0) \ 128 | 0 ENSG00000223972 DDX11L1 0.0 0.0 NaN 129 | 1 ENSG00000243485 MIR1302-2,RP11-34P13.3 0.0 0.0 NaN 130 | 2 ENSG00000274890 MIR1302-2,RP11-34P13.3 0.0 0.0 NaN 131 | 3 ENSG00000268020 OR4G4P 0.0 0.0 NaN 132 | 4 ENSG00000240361 OR4G11P 0.0 0.0 NaN 133 | 134 | p_value q_value significant 135 | 0 1.0 1.0 no 136 | 1 1.0 1.0 no 137 | 2 1.0 1.0 no 138 | 3 1.0 1.0 no 139 | 4 1.0 1.0 no 140 | 141 | >>> madf1,sig1=age.MA(dge_, 'Genes',"MA1",["wt0","wt20"], daType="FPKM") 142 | ``` 143 | ![ma1](MA1.png) 144 | ```python 145 | >>> sigGenes=df[df["significant"]=="yes"]["gene_id"].tolist() 146 | >>> madf2,sig2=age.MA(dge_, 'Genes',"MA2", ["wt0","wt20"], splines=False, daType="FPKM",spec=sigGenes) 147 | ``` 148 | ![ma2](MA2.png) 149 | ```python 150 | >>> madf3,sig3=age.MA(dge_, 'Genes',"MA3", ["wt0","wt20"], splines=True, daType="FPKM",Targets=sigGenes) 151 | ``` 152 | ![ma3](MA3.png) 153 | ___ 154 | -------------------------------------------------------------------------------- /docs/modules/sam.md: -------------------------------------------------------------------------------- 1 | ## ___readSAM___ 2 | 3 | Reads and parses a sam file. 4 | 5 | **`readSAM(SAMfile,header=False)`** 6 | 7 | * **`SAMfile`** /path/to/file.sam 8 | * **`header`** logical, if True, reads the header information 9 | * **`returns`** a pandas dataframe with the respective SAM columns: 'QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL' and a list of the headers if header=True 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> SAMdf=age.readSAM("sample1.sam") 14 | >>> print SAMdf.head() 15 | 16 | CIGAR \ 17 | 0 J00137:91:HJG75BBXX:6:1101:27458:1244 4 * 0 0 * 18 | 1 J00137:91:HJG75BBXX:6:1101:2483:1226 4 * 0 0 * 19 | 2 J00137:91:HJG75BBXX:6:1101:6593:1244 16 II 11210427 255 2S146M 20 | 3 J00137:91:HJG75BBXX:6:1101:9293:1244 0 I 10433525 255 150M 21 | 4 J00137:91:HJG75BBXX:6:1101:13271:1244 16 III 5277278 255 150M 22 | 23 | RNEXT PNEXT TLEN SEQ \ 24 | 0 * 0 0 CCAAAATCAGTTACAAAAAAATTAAATATCGAGTTCCTCCCCCAGA... 25 | 1 * 0 0 ACGTGACCGATGGTTGGCATGGCACGCATACCACGGAAGCGTCTGC...
26 | 2 * 0 0 AACAACAGCAGCAGCAGATTTACCAAAGGTTCCCAGCAAGACTAAT... 27 | 3 * 0 0 CTTGATTGTACTGCTGTGGTGGACCGCGTGGTCCTCCTTGTTGGTT... 28 | 4 * 0 0 GGACATGATGATCATGGCCACGACTCTCATGGACATAGTCATGATC... 29 | 30 | QUAL 31 | 0 AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ... 32 | 1 A->> import AGEpy as age 50 | >>> age.writeSAM(SAMdf,"modified.sam") 51 | ``` 52 | ___ 53 | 54 | ## ___SAMflags___ 55 | Explains a SAM flag. 56 | 57 | **`SAMflags(x)`** 58 | 59 | * **`x`** flag 60 | * **`returns`** complete SAM flag explanation 61 | 62 | ``` 63 | >>> import AGEpy as age 64 | >>> print age.SAMflags(64) 65 | ``` 66 | ["0: Read unpaired", 67 | "0: Read not mapped in proper pair", 68 | "0: Read mapped", 69 | "0: Mate mapped", 70 | "0: Read direct strand", 71 | "0: Mate direct strand", 72 | "1: First in pair", 73 | "0: First in pair", 74 | "0: Primary alignment", 75 | "0: Read passes platform/vendor quality checks", 76 | "0: Read is not PCR or optical duplicate", 77 | "0: Not supplementary alignment"] 78 | 79 | ___ 80 | -------------------------------------------------------------------------------- /docs/modules/symplot.SymPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/symplot.SymPlot.png -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | AGEpy 5 | 6 | 7 | 8 |

Redirect to new page...

9 | 10 | 11 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | theme: readthedocs 2 | site_name: AGEpy 3 | pages: 4 | - Home: index.md 5 | - Cookbook: cookbook.md 6 | - Modules: 7 | - bed: modules/bed.md 8 | - biom: modules/biom.md 9 | - blast: modules/blast.md 10 | - cytoscape: modules/cytoscape.md 11 | - david: modules/david.md 12 | - fasta: modules/fasta.md 13 | - go: modules/go.md 14 | - gtf: modules/gtf.md 15 | - homology: modules/homology.md 16 | - kegg: modules/kegg.md 17 | - meme: modules/meme.md 18 | - plots: modules/plots.md 19 | - sam: modules/sam.md 20 | - Executables: 21 | - aDiff: executables/adiff.md 22 | - abed: executables/abed.md 23 | - obo2tsv: executables/obo2tsv.md 24 | - david: executables/david.md 25 | - blasto: executables/blasto.md 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | AGEpy 2 | Pandas>=0.15.2 3 | numpy>=1.9.2 4 | requests>=2.20.0 5 | openpyxl 6 | suds 7 | zeep 8 | xlrd 9 | biomart 10 | matplotlib 11 | xlsxwriter 12 | pybedtools 13 | wand 14 | paramiko 15 | ipaddress 16 | seaborn 17 | scipy 18 | scikit-learn 19 | statsmodels 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | import re 4 | import sys 5 | from setuptools import setup 6 | 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | 9 | 10 | def read(*parts): 11 | # intentionally *not* adding an encoding option to open, See: 12 | # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690 13 | with codecs.open(os.path.join(here, *parts), 'r') as fp: 14 | return fp.read() 15 | 16 | setup(name = 'AGEpy', 17 | version = '0.8.2', 18 | description = 'Bioinformatics tools for Python developed at the MPI for Biology of Ageing', 19 | long_description = read('README.rst'), 20 | url = 'https://github.com/mpg-age-bioinformatics/AGEpy', 21 | author = 'Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing', 22 | author_email = 'bioinformatics@age.mpg.de', 23 | license = 'MIT', 24 | packages = [ 'AGEpy' ], 25 | install_requires = [ 'Pandas>=0.15.2', 'numpy>=1.9.2','requests>=2.20.0', \ 26 | 'suds', 'zeep', 'openpyxl','xlrd', 'biomart', 'matplotlib','pybedtools', \ 27 | 'xlsxwriter','wand','paramiko','ipaddress', 'seaborn', \ 28 | 'scipy', 'scikit-learn', 'statsmodels'], 29 | zip_safe = False, 30 | scripts=['bin/obo2tsv','bin/aDiff','bin/abed','bin/david', 'bin/blasto', 'bin/QC_plots'] 31 | ) 32 | --------------------------------------------------------------------------------
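For orientation, the snippet below sketches how the package built by `setup.py` is typically used together with the functions documented in `docs/modules/`. It is a minimal sketch only: it assumes AGEpy has already been installed (for example with `pip install .` run from the repository root), and `annotation.gtf` is a placeholder file name, not a file shipped with the repository.

```python
# Minimal usage sketch (assumptions: AGEpy is installed and a GTF annotation
# file is available locally; "annotation.gtf" is a placeholder file name).
import AGEpy as age

gtf = age.readGTF("annotation.gtf")                      # GTF as a pandas dataframe
gtf["gene_id"] = age.retrieve_GTF_field("gene_id", gtf)  # extract one attribute into its own column
bed = age.GTFtoBED(gtf, "gene_id")                       # chrom, chromStart, chromEnd, name, score, strand
print(bed.head())
```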