├── .gitignore ├── .travis.yml ├── AGEpy ├── AGEpy.py ├── __init__.py ├── bed.py ├── biom.py ├── blast.py ├── cytoscape.py ├── david.py ├── fasta.py ├── go.py ├── gtf.py ├── homology.py ├── kegg.py ├── meme.py ├── plots.py ├── rbiom.py └── sam.py ├── Dockerfile ├── LICENSE ├── README.md ├── README.rst ├── bin ├── QC_plots ├── aDiff ├── abed ├── blasto ├── david └── obo2tsv ├── conf.py ├── docs ├── .DS_Store ├── cookbook.md ├── executables │ ├── abed.md │ ├── adiff.md │ ├── blasto.md │ ├── david.md │ └── obo2tsv.md ├── index.md └── modules │ ├── MA1.png │ ├── MA2.png │ ├── MA3.png │ ├── bed.md │ ├── biom.md │ ├── blast.md │ ├── cellplot.CellPlot.png │ ├── cytoscape.md │ ├── david.md │ ├── fasta.md │ ├── go.md │ ├── gtf.md │ ├── homology.md │ ├── kegg.md │ ├── meme.md │ ├── p53.png │ ├── plots.md │ ├── sam.md │ └── symplot.SymPlot.png ├── index.html ├── mkdocs.yml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | /build/ 7 | site 8 | 9 | # Python egg metadata, regenerated from source files by setuptools. 10 | /*.egg-info 11 | 12 | # Other 13 | *.swp 14 | .DS_Store 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.7 3 | dist: xenial 4 | sudo: true 5 | 6 | install: 7 | - pip3 install . 8 | 9 | script: 10 | - echo "TODO" 11 | -------------------------------------------------------------------------------- /AGEpy/AGEpy.py: -------------------------------------------------------------------------------- 1 | """Bioinformatics tools developed at the Max Planck Institute for Biology of Ageing""" 2 | from .bed import * 3 | from .biom import * 4 | from .david import * 5 | from .fasta import * 6 | from .go import * 7 | from .gtf import * 8 | from .homology import * 9 | from .kegg import * 10 | from .meme import * 11 | from .plots import * 12 | from .rbiom import * 13 | from .sam import * 14 | from .cytoscape import * 15 | from .blast import * 16 | -------------------------------------------------------------------------------- /AGEpy/__init__.py: -------------------------------------------------------------------------------- 1 | """Bioinformatics tools developed at the Max Planck Institute for Biology of Ageing""" 2 | from .bed import * 3 | from .biom import * 4 | from .david import * 5 | from .fasta import * 6 | from .go import * 7 | from .gtf import * 8 | from .homology import * 9 | from .kegg import * 10 | from .meme import * 11 | from .plots import * 12 | from .rbiom import * 13 | from .sam import * 14 | from .cytoscape import * 15 | from .blast import * 16 | -------------------------------------------------------------------------------- /AGEpy/bed.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | #from urllib import urlopen # python2 6 | #import urllib2 # python2 7 | import urllib.request as urllib2 8 | #import StringIO python2 9 | from io import StringIO 10 | import gzip 11 | try: 12 | import pybedtools 13 | from pybedtools import BedTool 14 | except: 15 | print("pybedtools could not be imported") 16 | sys.stdout.flush() 17 | from .gtf import GTFtoBED 18 | from .gtf import readGTF 19 | from .gtf import retrieve_GTF_field 20 | 21 | 22 | def 
writeBED(inBED, file_path):
23 | """
24 | Writes a bed dataframe into a bed file.
25 | Bed format: 'chrom','chromStart','chromEnd','name','score','strand'
26 | 
27 | :param inBED: bed dataframe to be written.
28 | :param file_path: /path/to/file.bed
29 | 
30 | :returns: nothing
31 | 
32 | """
33 | inBED.to_csv(file_path,index=None,sep="\t",header=None)
34 | 
35 | def GetBEDnarrowPeakgz(URL_or_PATH_TO_file):
36 | """
37 | Reads a gz compressed BED narrow peak file from a web address or local file
38 | 
39 | :param URL_or_PATH_TO_file: web address or path to a local file
40 | 
41 | :returns: a Pandas dataframe
42 | """
43 | 
44 | if os.path.isfile(URL_or_PATH_TO_file):
45 | # local file: read the gzip-compressed bytes directly
46 | response=open(URL_or_PATH_TO_file, "rb")
47 | compressed=response.read()
48 | else:
49 | response = urllib2.urlopen(URL_or_PATH_TO_file)
50 | compressed=response.read()
51 | out=gzip.decompress(compressed).decode("utf-8").split("\n")
52 | out=[ s.split("\t") for s in out]
53 | out=pd.DataFrame(out)
54 | out.columns=["chrom","chromStart","chromEnd","name","score","strand","signalValue","-log10(pValue)","-log10(qvalue)","peak"]
55 | out["name"]=out.index.tolist()
56 | out["name"]="Peak_"+out["name"].astype(str)
57 | out=out[:-1]
58 | return out
59 | 
60 | def dfTObedtool(df):
61 | """
62 | Transforms a pandas dataframe into a bedtool
63 | 
64 | :param df: Pandas dataframe
65 | 
66 | :returns: a bedtool
67 | """
68 | 
69 | df=df.astype(str)
70 | df=df.drop_duplicates()
71 | df=df.values.tolist()
72 | df=["\t".join(s) for s in df ]
73 | df="\n".join(df)
74 | df=BedTool(df, from_string=True)
75 | return df
76 | 
77 | def GetPeaksExons(bed,parsedGTF):
78 | """
79 | Annotates a BED narrow peak dataframe with exon, transcript and gene information
80 | 
81 | :param bed: a pandas dataframe in bed format
82 | :param parsedGTF: a parsed GTF dataframe as output by parseGTF(), containing at least the 'feature', 'exon_id', 'transcript_id' and 'gene_id' columns
83 | 
84 | :returns: a Pandas dataframe
85 | """
86 | 
87 | bedtool_AB=dfTObedtool(bed)
88 | 
89 | exonsGTF=parsedGTF[parsedGTF["feature"]=="exon"]
90 | exonsGTF.reset_index(inplace=True, drop=True)
91 | 
92 | exonsBED=GTFtoBED(exonsGTF, "exon_id")
93 | exonsBED.columns=['chrom', 'chromStart', 'chromEnd', 'exon_id', 'score', 'strand']
94 | exonsBEDcols=exonsBED.columns.tolist()
95 | 
96 | bedcols=bed.columns.tolist()
97 | exonsBEDcols_=[]
98 | for c in exonsBEDcols:
99 | if c in bedcols:
100 | exonsBEDcols_.append(c+"_exon")
101 | else:
102 | exonsBEDcols_.append(c)
103 | 
104 | cols=[bedcols,exonsBEDcols_,["overlap"] ]
105 | cols=[item for sublist in cols for item in sublist]
106 | 
107 | bedtool_exons=dfTObedtool(exonsBED)
108 | 
109 | bedtool_target_exons=bedtool_AB.intersect(bedtool_exons, wo=True, s=True)
110 | dfTargetE=pd.read_table(bedtool_target_exons.fn, names=cols)
111 | ExonsTransGenes=parsedGTF[["exon_id","transcript_id","gene_id"]].drop_duplicates()
112 | dfTargets=pd.merge(dfTargetE,ExonsTransGenes,on=["exon_id"],how="left")
113 | dfTargets["count"]=1
114 | 
115 | def getCounts(df,field):
116 | """
117 | For each field in a bed narrow peak returns the number of times that field is present,\
118 | the normalized mean of the '-log10(pValue)' and normalized mean of the signal value.
119 | 
120 | :param df: a Pandas dataframe of a bed narrow peak
121 | :param field: field to analyse, e.g. exons or transcripts
122 | 
123 | :returns: a Pandas dataframe
124 | """
125 | 
126 | tmp=df[[field,'name',"count"]].drop_duplicates()
127 | tmp=tmp.drop(["name"],axis=1)
128 | tmp["count"]=tmp["count"].astype(int)
129 | tmp.columns=[field,"%s_count" %str(field)]
130 | tmp=tmp.groupby(field, as_index=False).sum()
131 | df=pd.merge(df,tmp,on=field,how="left")
132 | 
133 | tmp=df[[field,'name',"-log10(pValue)"]].drop_duplicates()
134 | tmp=tmp.drop(["name"],axis=1)
135 | tmp["-log10(pValue)"]=tmp["-log10(pValue)"].astype(float)
136 | tmp=tmp.groupby(field).apply(lambda l: l["-log10(pValue)"].prod() )
137 | tmp=pd.DataFrame(tmp)
138 | tmp.reset_index(inplace=True,drop=False)
139 | tmp.columns=[field,"%s norm. mean -log10(pValue)" %str(field)]
140 | df=pd.merge(df,tmp,on=field,how="left")
141 | 
142 | tmp=df[[field,'name',"signalValue"]].drop_duplicates()
143 | tmp=tmp.drop(["name"],axis=1)
144 | tmp["signalValue"]=tmp["signalValue"].astype(float)
145 | tmp=tmp.groupby(field).apply(lambda l: l["signalValue"].prod() )
146 | tmp=pd.DataFrame(tmp)
147 | tmp.reset_index(inplace=True,drop=False)
148 | tmp.columns=[field,"%s signalValue" %str(field)]
149 | df=pd.merge(df,tmp,on=field,how="left")
150 | 
151 | return df
152 | 
153 | for f in ["exon_id","transcript_id"]:
154 | dfTargets=getCounts(dfTargets,f)
155 | 
156 | def getCounts_GeneIDs(df):
157 | """
158 | For each gene id in a bed narrow peak returns the number of times that gene id is present,\
159 | the normalized mean of the '-log10(pValue)' and normalized mean of the signal value.
160 | 
161 | :param df: a Pandas dataframe of a bed narrow peak
162 | 
163 | :returns: a Pandas dataframe
164 | """
165 | 
166 | field="gene_id"
167 | 
168 | tmp=df[[field,"transcript_id","transcript_id_count"]].drop_duplicates()
169 | tmp=tmp.drop(["transcript_id"],axis=1)
170 | tmp["transcript_id_count"]=tmp["transcript_id_count"].astype(int)
171 | tmp.columns=[field,"%s_count" %str(field)]
172 | tmp=tmp.groupby(field, as_index=False).sum()
173 | df=pd.merge(df,tmp,on=field,how="left")
174 | 
175 | tmp=df[[field,'transcript_id',"transcript_id norm. mean -log10(pValue)"]].drop_duplicates()
176 | tmp=tmp.drop(["transcript_id"],axis=1)
177 | tmp["transcript_id norm. mean -log10(pValue)"]=tmp["transcript_id norm. mean -log10(pValue)"].astype(float)
178 | tmp.columns=[field,"%s norm. mean -log10(pValue)" %str(field)]
179 | tmp=tmp.groupby(field, as_index=False).sum()
180 | df=pd.merge(df,tmp,on=field,how="left")
181 | 
182 | 
183 | 
184 | tmp=df[[field,'transcript_id',"transcript_id signalValue"]].drop_duplicates()
185 | tmp=tmp.drop(["transcript_id"],axis=1)
186 | tmp["transcript_id signalValue"]=tmp["transcript_id signalValue"].astype(float)
187 | tmp.columns=[field,"%s signalValue" %str(field)]
188 | tmp=tmp.groupby(field, as_index=False).sum()
189 | df=pd.merge(df,tmp,on=field,how="left")
190 | 
191 | return df
192 | 
193 | dfTargets=getCounts_GeneIDs(dfTargets)
194 | 
195 | 
196 | dfTargets=dfTargets.drop(["count"],axis=1)
197 | return dfTargets
198 | 
199 | def AnnotateBED(bed, GTF, genome_file, bedcols=None, promoter=[1000,200]):
200 | """
201 | Annotates a bed file.
202 | 
203 | :param bed: either a /path/to/file.bed or a Pandas dataframe in bed format. /path/to/file.bed implies bedcols.
204 | :param GTF: /path/to/file.gtf
205 | :param genome_file: /path/to/file.genome - a tab separated file with chromosome name and size information
206 | :param bedcols: a comma separated string of column headers to use when reading in a bed file.
eg: "chr,start,end,name" 207 | :param promoter: a list containing the upstream start of the promoter region from the TSS and the downstream end of the promoter region from the TSS. 208 | 209 | :returns: a Pandas dataframe with the annotated bed file. exons and promoters will be reported as well in the annotated_gene_features column. 210 | """ 211 | if type(bed) == type("string"): 212 | bed=pd.read_table(bed,header=None) 213 | bed.columns=bedcols.split(",") 214 | 215 | print("Reading GTF file.") 216 | sys.stdout.flush() 217 | 218 | GTF=readGTF(GTF) 219 | GTF["gene_name"]=retrieve_GTF_field("gene_name", GTF) 220 | GTF["gene_id"]=retrieve_GTF_field("gene_id", GTF) 221 | GTF["gene_name"]=GTF["gene_name"]+"/"+GTF["gene_id"] 222 | GTF=GTF.drop(["gene_id"],axis=1) 223 | 224 | print("Generating promoters annotation.") 225 | sys.stdout.flush() 226 | 227 | promoters=GTF[GTF["feature"]=="transcript"] 228 | promoters_plus=promoters[promoters["strand"]=="+"] 229 | promoters_minus=promoters[promoters["strand"]=="-"] 230 | 231 | upstream=promoter[0] 232 | downstream=promoter[1] 233 | 234 | promoters_plus.loc[:,"promoter_start"]=promoters_plus.loc[:,"start"].astype(int)-upstream 235 | promoters_plus.loc[:,"promoter_end"]=promoters_plus.loc[:,"start"].astype(int)+downstream 236 | 237 | promoters_minus.loc[:,"promoter_start"]=promoters_minus["end"].astype(int)-downstream 238 | promoters_minus.loc[:,"promoter_end"]=promoters_minus["end"].astype(int)+upstream 239 | 240 | promoters=pd.concat([promoters_plus,promoters_minus]) 241 | 242 | promoters=promoters[["seqname","feature","promoter_start","promoter_end","gene_name"]] 243 | promoters.columns=["seqname","feature","start","end","gene_name"] 244 | 245 | promoters.loc[:,"feature"]="promoter" 246 | promoters.drop_duplicates(inplace=True) 247 | promoters.reset_index(inplace=True, drop=True) 248 | 249 | chr_sizes=pd.read_table(genome_file,header=None) 250 | chr_sizes.columns=["seqname","size"] 251 | chr_sizes.loc[:,"seqname"]=chr_sizes["seqname"].astype(str) 252 | promoters.loc[:,"seqname"]=promoters["seqname"].astype(str) 253 | 254 | promoters=pd.merge(promoters,chr_sizes,how="left",on=["seqname"]) 255 | def CorrectStart(df): 256 | s=df["start"] 257 | if s < 0: 258 | s=0 259 | return s 260 | 261 | def CorrectEnd(df): 262 | s=df["end"] 263 | e=df["size"] 264 | if s > e: 265 | s=e 266 | return s 267 | 268 | promoters.loc[:,"start"]=promoters.apply(CorrectStart,axis=1) 269 | promoters.loc[:,"end"]=promoters.apply(CorrectEnd,axis=1) 270 | 271 | promoters.drop(["size"],axis=1, inplace=True) 272 | 273 | GTFs=GTF[["seqname","feature","start","end","gene_name"]] 274 | GTFs=GTFs[ GTFs["feature"]!= "gene"] 275 | 276 | GTFs.drop_duplicates(inplace=True) 277 | GTFs.reset_index(inplace=True, drop=True) 278 | 279 | GTFs=pd.concat([GTFs,promoters]) 280 | 281 | def NewName(df): 282 | name=df["gene_name"] 283 | feature=df["feature"] 284 | if feature == "transcript": 285 | res=name 286 | else: 287 | res=name+":"+feature 288 | return res 289 | 290 | GTFs.loc[:,"gene_name"]=GTFs.apply(NewName, axis=1) 291 | GTFs=GTFs[["seqname","start","end","gene_name"]] 292 | 293 | print( "Intersecting annotation tables and bed." 
) 294 | sys.stdout.flush() 295 | 296 | refGTF=dfTObedtool(GTFs) 297 | pos=dfTObedtool(bed) 298 | 299 | colsGTF=GTFs.columns.tolist() 300 | newCols=bed.columns.tolist() 301 | 302 | for f in colsGTF: 303 | newCols.append(f+"_") 304 | newCols_=[ s for s in newCols if s not in ["seqname_","start_", "end_"]] 305 | 306 | pos=pos.intersect(refGTF, loj=True) 307 | pos=pd.read_table(pos.fn , names=newCols) 308 | pos=pos[newCols_] 309 | 310 | print("Merging features.") 311 | sys.stdout.flush() 312 | 313 | def GetFeature(x): 314 | if ":" in x: 315 | res=x.split(":")[1] 316 | else: 317 | res=np.nan 318 | return res 319 | 320 | def GetName(x): 321 | if ":" in x: 322 | res=x.split(":")[0] 323 | elif type(x) == type("string"): 324 | if x != ".": 325 | res=x 326 | else: 327 | res=np.nan 328 | else: 329 | res=np.nan 330 | return res 331 | 332 | pos["gene_feature_"]=pos["gene_name_"].apply( lambda x: GetFeature(x) ) 333 | pos["gene_name_"]=pos["gene_name_"].apply( lambda x: GetName(x) ) 334 | 335 | refcol=pos.columns.tolist() 336 | refcol=[ s for s in refcol if s != "gene_feature_" ] 337 | 338 | def CombineAnn(df): 339 | def JOIN(x): 340 | return ', '.join([ str(s) for s in list(set(df[x])) if str(s) != "nan" ] ) 341 | return pd.Series(dict( gene_feature_ = JOIN("gene_feature_") ) ) 342 | 343 | pos_=pos.groupby(refcol).apply(CombineAnn) 344 | pos_.reset_index(inplace=True, drop=False) 345 | 346 | def MergeNameFeatures(df): 347 | name=df["gene_name_"] 348 | feature=df["gene_feature_"] 349 | if (type(name) == type("string")) & (name != ".") : 350 | if type(feature) == type("string"): 351 | if len(feature) > 0: 352 | res=name+": "+feature 353 | else: 354 | res=name 355 | else: 356 | res=name 357 | else: 358 | res=np.nan 359 | return res 360 | 361 | pos_["annotated_gene_features"]=pos_.apply(MergeNameFeatures,axis=1) 362 | 363 | pos_=pos_.drop(["gene_name_","gene_feature_"],axis=1) 364 | 365 | def CombineAnn(df): 366 | def JOIN(x): 367 | return '; '.join([ str(s) for s in list(set(df[x])) if str(s) != "nan" ] ) 368 | return pd.Series(dict( annotated_gene_features = JOIN("annotated_gene_features") ) ) 369 | 370 | refcol=[ s for s in refcol if s != "gene_name_" ] 371 | pos_=pos_.groupby(refcol).apply(CombineAnn) 372 | pos_.reset_index(inplace=True, drop=False) 373 | 374 | return pos_ 375 | -------------------------------------------------------------------------------- /AGEpy/blast.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import itertools 3 | import pandas as pd 4 | import sys 5 | 6 | def variablename(var): 7 | """ 8 | Returns the string of a variable name. 9 | """ 10 | s=[tpl[0] for tpl in itertools.ifilter(lambda x: var is x[1], globals().items())] 11 | s=s[0].upper() 12 | return s 13 | 14 | def BLASTquery(query,database,program,filter=None,\ 15 | format_type=None, expect=None,\ 16 | nucl_reward=None, nucl_penalty=None,\ 17 | gapcosts=None, matrix=None,\ 18 | hitlist_size=None, descriptions=None,\ 19 | alignments=None,\ 20 | ncbi_gi=None, threshold=None,\ 21 | word_size=None, composition_based_statistics=None,\ 22 | organism=None, others=None,\ 23 | num_threads=None, baseURL="http://blast.ncbi.nlm.nih.gov",\ 24 | verbose=False): 25 | """ 26 | Performs a blast query online. 27 | 28 | As in https://ncbi.github.io/blast-cloud/ 29 | 30 | :param query: Search query. Allowed values: Accession, GI, or FASTA. 31 | :param database: BLAST database. 
Allowed values: nt, nr, refseq_rna, refseq_protein, swissprot, pdbaa, pdbnt 32 | :param program: BLAST program. Allowed values: blastn, megablast, blastp, blastx, tblastn, tblastx 33 | :param filter: Low complexity filtering. Allowed values: F to disable. T or L to enable. Prepend "m" for mask at lookup (e.g., mL) 34 | :param format_type: Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. HTML is the default. 35 | :param expect: Expect value. Allowed values: Number greater than zero. 36 | :param nucl_reward: Reward for matching bases (BLASTN and megaBLAST). Allowed values: Integer greater than zero. 37 | :param nucl_penalty: Cost for mismatched bases (BLASTN and megaBLAST). Allowed values: Integer less than zero. 38 | :param gapcosts: Gap existence and extension costs. Allowed values: Pair of positive integers separated by a space such as "11 1". 39 | :param matrix: Scoring matrix name. Allowed values: One of BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, PAM250, PAM30 or PAM70. Default: BLOSUM62 for all applicable programs. 40 | :param hitlist_size: Number of databases sequences to keep. Allowed values: Integer greater than zero. 41 | :param descriptions: Number of descriptions to print (applies to HTML and Text). Allowed values: Integer greater than zero. 42 | :param alignments: Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 43 | :param ncbi_gi: Show NCBI GIs in report. Allowed values: T or F. 44 | :param threshold: Neighboring score for initial words. Allowed values: Positive integer (BLASTP default is 11). Does not apply to BLASTN or MegaBLAST). 45 | :param word_size: Size of word for initial matches. Allowed values: Positive integer. 46 | :param composition_based_statistics: Composition based statistics algorithm to use. Allowed values: One of 0, 1, 2, or 3. See comp_based_stats command line option in the BLAST+ user manual for details. 47 | :param organism: an organism as in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 48 | :param others: here you can add other parameters as seen in a blast bookmarked page. Define you query in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 49 | Once your query is defined click on "Bookmark" on right upper side of the page. You can copy fragments of the URL 50 | which define the query. Eg. For organism "Homo sapiens (taxid:9606)" you will see the string "EQ_MENU=Homo%20sapiens%20%28taxid%3A9606%29" - this is 51 | the string you can use here in others. 52 | :param num_threads: Number of virtual CPUs to use. Allowed values: Integer greater than zero (default is 1). Supported only on the cloud. 53 | :param verbose: print more 54 | 55 | :returns: BLAST search request identifier 56 | """ 57 | 58 | if organism: 59 | organism=organism.replace(" ", "%20").replace("(", "%28").replace(")", "%29").replace(":", "%3A") 60 | EQ_MENU=organism 61 | else: 62 | EQ_MENU=None 63 | 64 | URL=baseURL+"/Blast.cgi?" 
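# The query is submitted by appending CGI parameters to the Blast.cgi endpoint and finishing with CMD=Put;
# a built submission URL looks roughly like (illustrative values only):
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?QUERY=<accession or FASTA>&DATABASE=nt&PROGRAM=blastn&CMD=Put
# Optional arguments below are only appended when they are set.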
65 | URL=URL+"QUERY="+str(query)+"&DATABASE="+str(database)+"&PROGRAM="+str(program) 66 | for o,varname in zip([filter, format_type, expect, nucl_reward, nucl_penalty,\ 67 | gapcosts, matrix, hitlist_size, descriptions, alignments,\ 68 | ncbi_gi, threshold, word_size, composition_based_statistics,\ 69 | EQ_MENU, num_threads],\ 70 | ['FILTER' , 'FORMAT_TYPE', 'EXPECT', 'NUCL_REWARD', 'NUCL_PENALTY',\ 71 | 'GAPCOSTS', 'MATRIX', 'HITLIST_SIZE', 'DESCRIPTIONS', 'ALIGNMENTS',\ 72 | 'NCBI_GI', 'THRESHOLD', 'WORD_SIZE', 'COMPOSITION_BASED_STATISTICS',\ 73 | 'EQ_MENU', 'NUM_THREADS']): 74 | if o: 75 | URL=URL+"&"+ varname +"="+str(o) 76 | 77 | if others: 78 | URL=URL+"&"+others 79 | 80 | URL=URL+"&CMD=Put" 81 | 82 | if verbose: 83 | print(URL) 84 | sys.stdout.flush() 85 | 86 | response=requests.get(url = URL) 87 | r=response.content.split("\n") 88 | RID=[ s for s in r if "RID = " in s ] 89 | if len(RID) > 0: 90 | RID=RID[0].split(" ")[-1] 91 | else: 92 | print("Could not return an RID for this query.") 93 | RID=None 94 | return RID 95 | 96 | def BLASTcheck(rid,baseURL="http://blast.ncbi.nlm.nih.gov"): 97 | """ 98 | Checks the status of a query. 99 | 100 | :param rid: BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 101 | :param baseURL: server url. Default=http://blast.ncbi.nlm.nih.gov 102 | 103 | :returns status: status for the query. 104 | :returns therearehist: yes or no for existing hits on a finished query. 105 | """ 106 | 107 | URL=baseURL+"/Blast.cgi?" 108 | URL=URL+"FORMAT_OBJECT=SearchInfo&RID="+rid+"&CMD=Get" 109 | response=requests.get(url = URL) 110 | r=response.content.split("\n") 111 | try: 112 | status=[ s for s in r if "Status=" in s ][0].split("=")[-1] 113 | ThereAreHits=[ s for s in r if "ThereAreHits=" in s ][0].split("=")[-1] 114 | except: 115 | status=None 116 | ThereAreHits=None 117 | 118 | print(rid, status, ThereAreHits) 119 | sys.stdout.flush() 120 | 121 | return status, ThereAreHits 122 | 123 | def BLASTresults(rid, format_type="Tabular", \ 124 | hitlist_size= None, alignments=None, \ 125 | ncbi_gi = None, format_object=None,\ 126 | baseURL="http://blast.ncbi.nlm.nih.gov"): 127 | """ 128 | Retrieves results for an RID. 129 | 130 | :param rid: BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 131 | :param format_type: Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. Tabular is the default. 132 | :param hitlist_size: Number of databases sequences to keep. Allowed values: Integer greater than zero. 133 | :param alignments: Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 134 | :param ncbi_gi: Show NCBI GIs in report. Allowed values: T or F. 135 | :param format_object: Object type. Allowed values: SearchInfo (status check) or Alignment (report formatting). 136 | :param baseURL: server url. Default=http://blast.ncbi.nlm.nih.gov 137 | 138 | :returns: the result of a BLAST query. If format_type="Tabular" it will parse the content into a Pandas dataframe. 139 | """ 140 | 141 | URL=baseURL+"/Blast.cgi?" 
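# Results are fetched from the same Blast.cgi endpoint using the RID returned at submission,
# the requested FORMAT_TYPE and CMD=Get; the retrieval URL looks roughly like (illustrative values only):
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?RID=<request id>&FORMAT_TYPE=Tabular&CMD=Get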
142 | URL=URL+"RID="+str(rid)+"&FORMAT_TYPE="+str(format_type) 143 | for o in [ hitlist_size, alignments,\ 144 | ncbi_gi, format_object]: 145 | if o: 146 | URL=URL+"&"+ variablename(var) +"="+str(o) 147 | URL=URL+"&CMD=Get" 148 | response=requests.get(url = URL) 149 | response=response.content 150 | 151 | if format_type=="Tabular": 152 | result=response.split("\n") 153 | result=[ s.split("\t") for s in result][6:] 154 | header=result[:7] 155 | content=result[7:] 156 | fields=header[5][0].strip("# Fields: ").split(", ") 157 | result=pd.DataFrame(content,columns=fields) 158 | response=result[:int(header[-1][0].split(" ")[1])] 159 | 160 | return response 161 | -------------------------------------------------------------------------------- /AGEpy/david.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | # from suds.client import Client as sudsclient 4 | from zeep import Client as zeepclient 5 | import logging 6 | import ssl 7 | from .plots import * 8 | 9 | david_categories = [ 10 | 'GOTERM_BP_FAT', 'GOTERM_CC_FAT', 'GOTERM_MF_FAT', 'KEGG_PATHWAY', 11 | 'BIOCARTA', 'PFAM', 'PROSITE' ] 12 | 13 | david_fields = [ 14 | 'categoryName', 'termName', 'listHits', 'percent', 15 | 'ease', 'geneIds', 'listTotals', 'popHits', 'popTotals', 16 | 'foldEnrichment', 'bonferroni', 'benjamini', 'afdr'] 17 | # include: 18 | # 'fisher' 19 | # 'termName' to 'term' and 'term_name' 20 | 21 | def DAVIDenrich(database, categories, user, ids, ids_bg = None, name = '', name_bg = '', verbose = False, p = 0.1, n = 2): 22 | # Modified from https://david.ncifcrf.gov/content.jsp?file=WS.html 23 | # by courtesy of HuangYi @ 20110424 24 | 25 | """ 26 | Queries the DAVID database for an enrichment analysis 27 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 28 | 29 | :param database: A string for the database to query, e.g. 
'WORMBASE_GENE_ID' 30 | :param categories: A comma separated string with databases 31 | :param user: A user ID registered at DAVID for querying 32 | :param ids: A list with identifiers 33 | :param name: A string with the name for the query set 34 | :param ids_bg: A list with the background identifiers to enrich against, 35 | 'None' for whole set 36 | :param name_bg: A string with the name for the background set 37 | :param p: Maximum p value for enrichment of a term 38 | :param n: Minimum number of genes within a term 39 | 40 | :returns: None if no ids match the queried database, or a pandas data frame with results 41 | """ 42 | 43 | ids = ','.join([str(i) for i in ids]) 44 | use_bg = 0 45 | if ids_bg is not None: 46 | ids_bg = ','.join([str(i) for i in ids_bg]) 47 | ssl._create_default_https_context = ssl._create_unverified_context 48 | url = 'https://david.ncifcrf.gov/webservice/services/DAVIDWebService?wsdl' 49 | logging.getLogger("zeep").setLevel(logging.ERROR) 50 | # client = sudsclient(url) 51 | # client.wsdl.services[0].setlocation('https://david.ncifcrf.gov/webservice/services/DAVIDWebService.DAVIDWebServiceHttpSoap11Endpoint/') 52 | client = zeepclient(url) 53 | client_auth = client.service.authenticate(user) 54 | if verbose: 55 | print('User Authentication:', client_auth) 56 | sys.stdout.flush() 57 | size = client.service.addList(ids, database, name, 0) #| inputListIds,idType,listName,listType) 58 | if verbose: 59 | print('Mapping rate of ids: ', str(size)) 60 | sys.stdout.flush() 61 | if not float(size) > float(0): 62 | return None 63 | if ids_bg is not None: 64 | size_bg = client.service.addList(ids_bg, database, name_bg, 1) 65 | if verbose: 66 | print('Mapping rate of background ids: ', str(size_bg)) 67 | sys.stdout.flush() 68 | client_categories = client.service.setCategories(categories) 69 | if verbose: 70 | print('Categories used: ', client_categories) 71 | sys.stdout.flush() 72 | client_report = client.service.getChartReport(p, n) 73 | size_report = len(client_report) 74 | if verbose: 75 | print('Records reported: ', str(size_report)) 76 | sys.stdout.flush() 77 | 78 | if size_report > 0: 79 | df = [] 80 | for r in client_report: 81 | # d = dict(r) 82 | line = [] 83 | for f in david_fields: 84 | # line.append(str(d[f]).encode('ascii','ignore')) 85 | value = getattr(r, f, None) 86 | line.append(str(value).encode('ascii','ignore')) 87 | df.append(line) 88 | df = pd.DataFrame(df) 89 | df.columns=david_fields 90 | for col in david_fields: 91 | df[col] = df[col].apply(lambda x: x.decode()) 92 | else: 93 | df=None 94 | 95 | return df 96 | 97 | 98 | def id_nameDAVID(df,GTF=None,name_id=None): 99 | """ 100 | Given a DAVIDenrich output it converts ensembl gene ids to genes names and adds this column to the output 101 | 102 | :param df: a dataframe output from DAVIDenrich 103 | :param GTF: a GTF dataframe from readGTF() 104 | :param name_id: instead of a gtf dataframe a dataframe with the columns 'gene_name' and 'gene_id' can be given as input 105 | 106 | :returns: a pandas dataframe with a gene name column added to it. 
107 | """ 108 | if name_id is None: 109 | gene_name=retrieve_GTF_field('gene_name',GTF) 110 | gene_id=retrieve_GTF_field('gene_id', GTF) 111 | GTF=pd.concat([gene_name,gene_id],axis=1) 112 | else: 113 | GTF=name_id.copy() 114 | df['Gene_names']="genes" 115 | terms=df['termName'].tolist() 116 | enrichN=pd.DataFrame() 117 | for term in terms: 118 | tmp=df[df['termName']==term] 119 | tmp=tmp.reset_index(drop=True) 120 | ids=tmp.xs(0)['geneIds'] 121 | ids=pd.DataFrame(data=ids.split(", ")) 122 | ids.columns=['geneIds'] 123 | ids['geneIds']=ids['geneIds'].map(str.lower) 124 | GTF['gene_id']=GTF['gene_id'].astype(str) 125 | GTF['gene_id']=GTF['gene_id'].map(str.lower) 126 | ids=pd.merge(ids, GTF, how='left', left_on='geneIds', right_on='gene_id') 127 | names=ids['gene_name'].tolist() 128 | names= ', '.join(names) 129 | tmp["Gene_names"]=names 130 | #tmp=tmp.replace(to_replace=tmp.xs(0)['Gene_names'], value=names) 131 | enrichN=pd.concat([enrichN, tmp]) 132 | enrichN=enrichN.reset_index(drop=True) 133 | 134 | gene_names=enrichN[['Gene_names']] 135 | gpos=enrichN.columns.get_loc("geneIds") 136 | enrichN=enrichN.drop(['Gene_names'],axis=1) 137 | cols=enrichN.columns.tolist() 138 | enrichN=pd.concat([enrichN[cols[:gpos+1]],gene_names,enrichN[cols[gpos+1:]]],axis=1) 139 | 140 | return enrichN 141 | 142 | def DAVIDgetGeneAttribute(x,df,refCol="ensembl_gene_id",fieldTOretrieve="gene_name"): 143 | """ 144 | Returns a list of gene names for given gene ids. 145 | 146 | :param x: a string with the list of IDs separated by ', ' 147 | :param df: a dataframe with the reference column and a the column to retrieve 148 | :param refCol: the header of the column containing the identifiers 149 | :param fieldTOretrieve: the field to retrieve from parsedGTF eg. 'gene_name' 150 | 151 | :returns: list of fieldTOretrieve separeted by ', ' in the same order as the given in x 152 | """ 153 | 154 | l=x.split(", ") 155 | l=[ s.upper() for s in l ] 156 | tmpdf=pd.DataFrame({refCol:l},index=range(len(l))) 157 | df_fix=df[[refCol,fieldTOretrieve]].drop_duplicates() 158 | sys.stdout.flush() 159 | df_fix[refCol]=df_fix[refCol].apply(lambda x: x.upper()) 160 | ids=pd.merge(tmpdf,df_fix,how="left",on=[refCol]) 161 | ids=ids[fieldTOretrieve].tolist() 162 | ids=[ str(s) for s in ids ] 163 | ids=", ".join(ids) 164 | return ids 165 | 166 | 167 | def DAVIDplot(database, categories, user, df_ids, output, df_ids_bg = None, name = '', \ 168 | name_bg = '', verbose = False, p = 0.1, n = 2): 169 | """ 170 | Queries the DAVID database for an enrichment analysis and plots CellPlots as 171 | well as SymPlots (see plots). 172 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 173 | 174 | :param database: a string for the database to query, e.g. 'WORMBASE_GENE_ID' 175 | :param categories: a comma separated string with databases 176 | :param user: a user ID registered at DAVID for querying 177 | :param df_ids: a dataframe where the first column contains the identifiers 178 | to be queried and the second column the respective log2fc for each identifier. 179 | :param output: /path/to/output/prefix 180 | :param df_ids_bg: a dataframe where the first column contains the identifiers 181 | to be used as background. None for whole set. 
182 | :param name: a string with the name for the query set 183 | :param name_bg: a string with the name for the background set 184 | :param p: Maximum p value for enrichment of a term 185 | :param n: Minimum number of genes within a term 186 | 187 | :returns: Nothing 188 | """ 189 | 190 | idsc1=df_ids.columns.tolist()[0] 191 | idsc2=df_ids.columns.tolist()[1] 192 | 193 | ids=df_ids[idsc1].tolist() 194 | if type(df_ids_bg)==type(pd.DataFrame()): 195 | ids_bg=df_ids_bg[df_ids_bg.columns.tolist()[0]] 196 | else: 197 | ids_bg=None 198 | 199 | print(categories) 200 | 201 | david=DAVIDenrich(database, categories, user, ids, ids_bg = ids_bg, \ 202 | name = name, name_bg = name_bg, verbose = verbose, p = p, n = n) 203 | 204 | if type(david)!=type(pd.DataFrame()): 205 | print("For this dataset no enrichments could be returned.") 206 | sys.stdout.flush() 207 | else: 208 | david[idsc2]=david["geneIds"].apply(lambda x: \ 209 | DAVIDgetGeneAttribute(x,\ 210 | df_ids,\ 211 | refCol=idsc1,\ 212 | fieldTOretrieve=idsc2)) 213 | david[idsc2]=david[idsc2].apply(lambda x: x.replace(", ", ",")) 214 | EXC=pd.ExcelWriter(output+".xlsx") 215 | for category in list(set(david["categoryName"].tolist())): 216 | david_=david[david["categoryName"]==category] 217 | print(category) 218 | david_.to_excel(EXC,category) 219 | 220 | tmp=david_[:20] 221 | tmp["-log10(p)"]=np.log10(tmp["ease"].astype(float)) * -1 222 | #tmp["Term"]=tmp['termName'] 223 | #tmp["Annotated"]=tmp["listHits"] 224 | cellplot=CellPlot(tmp, output_file=output+"."+category, gene_expression_col=idsc2, gene_expression=idsc2, \ 225 | figure_title=category+"\n"+output.split("/")[-1], pvalCol="ease", \ 226 | lowerLimit=None, upperLimit=None, colorBarType='bwr', xaxis_label = "GO Term -log10(p-value)") 227 | 228 | symplot=SymPlot(tmp, output_file=output+"."+category, \ 229 | gene_expression_col=idsc2,\ 230 | figure_title=category+"\n"+output.split("/")[-1], \ 231 | pvalCol="ease", xaxis_label = "GO Term -log10(p-value)") 232 | EXC.save() 233 | -------------------------------------------------------------------------------- /AGEpy/fasta.py: -------------------------------------------------------------------------------- 1 | 2 | def getFasta(opened_file, sequence_name): 3 | """ 4 | Retrieves a sequence from an opened multifasta file 5 | 6 | :param opened_file: an opened multifasta file eg. opened_file=open("/path/to/file.fa",'r+') 7 | :param sequence_name: the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2) 8 | 9 | returns: a string with the sequence of interest 10 | """ 11 | 12 | lines = opened_file.readlines() 13 | seq=str("") 14 | for i in range(0, len(lines)): 15 | line = lines[i] 16 | if line[0] == ">": 17 | fChr=line.split(" ")[0].split("\n")[0] 18 | fChr=fChr[1:] 19 | if fChr == sequence_name: 20 | s=i 21 | code=['N','A','C','T','G'] 22 | firstbase=lines[s+1][0] 23 | while firstbase in code: 24 | s=s + 1 25 | seq=seq+lines[s] 26 | firstbase=lines[s+1][0] 27 | 28 | if len(seq)==0: 29 | seq=None 30 | else: 31 | seq=seq.split("\n") 32 | seq="".join(seq) 33 | 34 | return seq 35 | 36 | def writeFasta(sequence, sequence_name, output_file): 37 | """ 38 | Writes a fasta sequence into a file. 
39 | 
40 | :param sequence: a string with the sequence to be written
41 | :param sequence_name: name of the fasta sequence
42 | :param output_file: /path/to/file.fa to be written
43 | 
44 | :returns: nothing
45 | """
46 | i=0
47 | f=open(output_file,'w')
48 | f.write(">"+str(sequence_name)+"\n")
49 | while i <= len(sequence):
50 | f.write(sequence[i:i+60]+"\n")
51 | i=i+60
52 | f.close()
53 | 
54 | def rewriteFasta(sequence, sequence_name, fasta_in, fasta_out):
55 | """
56 | Rewrites a specific sequence in a multifasta file while keeping the sequence header.
57 | 
58 | :param sequence: a string with the sequence to be written
59 | :param sequence_name: the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2)
60 | :param fasta_in: /path/to/original.fa
61 | :param fasta_out: /path/to/destination.fa
62 | 
63 | :returns: nothing
64 | """
65 | f=open(fasta_in, 'r+')
66 | f2=open(fasta_out,'w')
67 | lines = f.readlines()
68 | i=0
69 | while i < len(lines):
70 | line = lines[i]
71 | if line[0] == ">":
72 | f2.write(line)
73 | fChr=line.split(" ")[0]
74 | fChr=fChr[1:]
75 | if fChr == sequence_name:
76 | code=['N','A','C','T','G']
77 | firstbase=lines[i+1][0]
78 | while firstbase in code:
79 | i=i+1
80 | firstbase=lines[i][0]
81 | s=0
82 | while s <= len(sequence):
83 | f2.write(sequence[s:s+60]+"\n")
84 | s=s+60
85 | else:
86 | i=i+1
87 | else:
88 | f2.write(line)
89 | i=i+1
90 | 
91 | f2.close()
92 | f.close()
93 | 
-------------------------------------------------------------------------------- /AGEpy/go.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | #import urllib2 # python2
3 | import urllib.request as urllib2
4 | # import StringIO # python2
5 | from io import BytesIO
6 | import gzip
7 | import sys
8 | 
9 | 
10 | def getGeneAssociation(URL_or_file):
11 | """
12 | This function collects GO annotation from http://geneontology.org/page/download-annotations.
13 | 
14 | :param URL_or_file: either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective downloaded .gz file.
15 | :returns: a Pandas dataframe with the parsed table.
16 | """
17 | if URL_or_file[:4] == "http":
18 | response = urllib2.urlopen(URL_or_file)
19 | compressedFile = BytesIO(response.read())
20 | decompressedFile = gzip.GzipFile(fileobj=compressedFile)
21 | else:
22 | decompressedFile = gzip.GzipFile(URL_or_file)
23 | out=decompressedFile.read().decode("utf-8").split("\n")
24 | 
25 | version=[ s for s in out if len(s) > 0 ]
26 | version=[ s for s in version if s[0] == '!' ]
27 | version=[ s for s in version if "!gaf-version:" in s ]
28 | version=version[0]
29 | 
30 | 
31 | if version=="!gaf-version: 2.0":
32 | reco=version
33 | else:
34 | reco=None
35 | 
36 | out=[ s for s in out if len(s) > 0 ]
37 | out=[ s for s in out if s[0] != "!"
] 38 | out=[s.split("\t") for s in out] 39 | out=pd.DataFrame(out) 40 | mgi_cols=["DB","DB_Object_ID","DB_Object_Symbol","Qualifier (this field is optional)","GO ID","DB:Reference","Evidence Code","Evidence Code Qualifier (optional)",\ 41 | "Aspect","DB_Object_Name","DB_Object_Synonym","DB_Object_Type","Taxon","Date","Assigned_by"] 42 | fb_cols=["DB","DB_Object_ID","DB_Object_Symbol","Qualifier","GO ID","DB:Reference","Evidence",\ 43 | "With (or) From","Aspect","DB_Object_Name","DB_Object_Synonym","DB_Object_Type","Taxon","Date","Assigned_by","Annotation Extension",\ 44 | "Gene Product Form ID"] 45 | gaf_20=["DB","DB Object ID","DB Object Symbol","Qualifier","GO ID","DB:Reference (|DB:Reference)","Evidence Code","With (or) From","Aspect","DB Object Name",\ 46 | "DB Object Synonym (|Synonym)","DB Object Type","Taxon(|taxon)","Date","Assigned By","Annotation Extension","Gene Product Form ID"] 47 | cols={"fb":fb_cols,"wb":fb_cols,"mgi":fb_cols,"!gaf-version: 2.0":gaf_20} 48 | colsType=URL_or_file.split(".") 49 | colsType=colsType[len(colsType)-2] 50 | if colsType=="gaf": 51 | colsType=reco 52 | if colsType in cols.keys(): 53 | try: 54 | cols=cols.get(colsType) 55 | out.columns=cols 56 | except ValueError as err: 57 | print("Could not fit headers.") 58 | print(err) 59 | sys.stdout.flush() 60 | else: 61 | print("Could not find headers for %s." %colsType) 62 | sys.stdout.flush() 63 | return out 64 | -------------------------------------------------------------------------------- /AGEpy/gtf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | from collections import OrderedDict 5 | try: 6 | import pybedtools 7 | except: 8 | print("pybedtools could not be imported") 9 | sys.stdout.flush() 10 | import csv 11 | 12 | def readGTF(infile): 13 | """ 14 | Reads a GTF file and labels the respective columns in agreement with GTF file standards: 15 | 'seqname','source','feature','start','end','score','strand','frame','attribute'. 
16 | 17 | :param infile: path/to/file.gtf 18 | :returns: a Pandas dataframe of the respective GTF 19 | 20 | """ 21 | df=pd.read_table(infile, sep='\t', comment="#", header=None, dtype=str) 22 | df.columns=['seqname','source','feature','start','end','score','strand','frame','attribute'] 23 | #df = df.astype(str) # from DTC 24 | return df 25 | 26 | def retrieve_GTF_field(field,gtf): 27 | """ 28 | Returns a field of choice from the attribute column of the GTF 29 | 30 | :param field: field to be retrieved 31 | :returns: a Pandas dataframe with one columns containing the field of choice 32 | 33 | """ 34 | inGTF=gtf.copy() 35 | def splits(x): 36 | l=x.split(";") 37 | l=[ s.split(" ") for s in l] 38 | res=np.nan 39 | for s in l: 40 | if field in s: 41 | if '"' in s[-1]: 42 | res=s[-1][1:-1] 43 | else: 44 | res=s[-1] 45 | return res 46 | 47 | inGTF[field]=inGTF['attribute'].apply(lambda x: splits(x)) 48 | return inGTF[[field]] 49 | 50 | def attributesGTF(inGTF): 51 | """ 52 | List the type of attributes in a the attribute section of a GTF file 53 | 54 | :param inGTF: GTF dataframe to be analysed 55 | :returns: a list of attributes present in the attribute section 56 | 57 | """ 58 | df=pd.DataFrame(inGTF['attribute'].str.split(";").tolist()) 59 | desc=[] 60 | for i in df.columns.tolist(): 61 | val=df[[i]].dropna() 62 | val=pd.DataFrame(val[i].str.split(' "').tolist())[0] 63 | val=list(set(val)) 64 | for v in val: 65 | if len(v) > 0: 66 | l=v.split(" ") 67 | if len(l)>1: 68 | l=l[1] 69 | else: 70 | l=l[0] 71 | desc.append(l) 72 | desc=list(set(desc)) 73 | finaldesc=[] 74 | for d in desc: 75 | if len(d) > 0: 76 | finaldesc.append(d) 77 | return finaldesc 78 | 79 | def parseGTF(inGTF): 80 | """ 81 | Reads an extracts all attributes in the attributes section of a GTF and constructs a new dataframe wiht one collumn per attribute instead of the attributes column 82 | 83 | :param inGTF: GTF dataframe to be parsed 84 | :returns: a dataframe of the orignal input GTF with attributes parsed. 85 | 86 | """ 87 | 88 | desc=attributesGTF(inGTF) 89 | ref=inGTF.copy() 90 | ref.reset_index(inplace=True, drop=True) 91 | df=ref.drop(['attribute'],axis=1).copy() 92 | for d in desc: 93 | field=retrieve_GTF_field(d,ref) 94 | df=pd.concat([df,field],axis=1) 95 | return df 96 | 97 | def writeGTF(inGTF,file_path): 98 | """ 99 | Write a GTF dataframe into a file 100 | 101 | :param inGTF: GTF dataframe to be written. It should either have 9 columns with the last one being the "attributes" section or more than 9 columns where all columns after the 8th will be colapsed into one. 
102 | :param file_path: path/to/the/file.gtf 103 | :returns: nothing 104 | """ 105 | cols=inGTF.columns.tolist() 106 | if len(cols) == 9: 107 | if 'attribute' in cols: 108 | df=inGTF 109 | else: 110 | df=inGTF[cols[:8]] 111 | df['attribute']="" 112 | for c in cols[8:]: 113 | if c == cols[len(cols)-1]: 114 | df['attribute']=df['attribute']+c+' "'+inGTF[c].astype(str)+'";' 115 | else: 116 | df['attribute']=df['attribute']+c+' "'+inGTF[c].astype(str)+'"; ' 117 | df.to_csv(file_path, sep="\t",header=None,index=None,quoting=csv.QUOTE_NONE) 118 | 119 | def GTFtoBED(inGTF,name): 120 | """ 121 | Transform a GTF dataframe into a bed dataframe 122 | 123 | :param inGTF: GTF dataframe for transformation 124 | :param name: field of the GTF data frame to be use for the bed 'name' positon 125 | 126 | returns: a bed dataframe with the corresponding bed fiels: 'chrom','chromStart','chromEnd','name','score','strand' 127 | """ 128 | 129 | bed=inGTF.copy() 130 | bed.reset_index(inplace=True, drop=True) 131 | if name not in bed.columns.tolist(): 132 | field=retrieve_GTF_field(name, bed) 133 | bed=pd.concat([bed,field],axis=1) 134 | bed=bed[['seqname','start','end',name,'score','strand']] 135 | bed.columns=['chrom','chromStart','chromEnd','name','score','strand'] 136 | bed.drop_duplicates(inplace=True) 137 | bed.reset_index(inplace=True,drop=True) 138 | return bed 139 | 140 | def MAPGenoToTrans(parsedGTF,feature): 141 | """ 142 | Gets all positions of all bases in an exon 143 | 144 | :param df: a Pandas dataframe with 'start','end', and 'strand' information for each entry. 145 | df must contain 'seqname','feature','start','end','strand','frame','gene_id', 146 | 'transcript_id','exon_id','exon_number'] 147 | :param feature: feature upon wich to generate the map, eg. 'exon' or 'transcript' 148 | 149 | :returns: a string with the comma separated positions of all bases in the exon 150 | """ 151 | GenTransMap=parsedGTF[parsedGTF["feature"]==feature] 152 | def getExonsPositions(df): 153 | start=int(df["start"]) 154 | stop=int(df["end"]) 155 | strand=df["strand"] 156 | r=range(start,stop+1) 157 | if strand=="-": 158 | r.sort(reverse=True) 159 | r=[ str(s) for s in r] 160 | return ",".join(r) 161 | 162 | GenTransMap["feature_bases"]=GenTransMap.apply(getExonsPositions, axis=1) 163 | GenTransMap=GenTransMap.sort_values(by=["transcript_id","exon_number"],ascending=True) 164 | def CombineExons(df): 165 | return pd.Series(dict( feature_bases = ','.join(df['feature_bases']) ) ) 166 | GenTransMap=GenTransMap.groupby("transcript_id").apply(CombineExons) 167 | GenTransMap=GenTransMap.to_dict().get("feature_bases") 168 | 169 | return GenTransMap 170 | 171 | def GetTransPosition(df,field,dic,refCol="transcript_id"): 172 | """ 173 | Maps a genome position to transcript positon" 174 | 175 | :param df: a Pandas dataframe 176 | :param field: the head of the column containing the genomic position 177 | :param dic: a dictionary containing for each transcript the respective bases eg. {ENST23923910:'234,235,236,1021,..'} 178 | :param refCol: header of the reference column with IDs, eg. 'transcript_id' 179 | 180 | :returns: position on transcript 181 | """ 182 | try: 183 | gen=str(int(df[field])) 184 | transid=df[refCol] 185 | bases=dic.get(transid).split(",") 186 | bases=bases.index(str(gen))+1 187 | except: 188 | bases=np.nan 189 | return bases 190 | 191 | def getPromotersBed(gtf,fa,upstream=2000,downstream=200): 192 | """ 193 | Reads a gtf file and returns a bed file for the promoter coordinates. 
194 | 195 | :param gtf: path/to/file.gtf. Must be an ensembl gtf. 196 | :param fa: path/to/fasta.fa. Must be an ensembl fasta file. 197 | :param upstream: number of bases upstream of transcript start sites the promoter should start 198 | :param downstream: number of bases downstream of transcript start sites the promoter should end 199 | :returns: a pandas dataframe in bed format 200 | 201 | """ 202 | chrsizes={} 203 | with open(fa, "r") as f: 204 | for line in f.readlines(): 205 | if line[0] == ">": 206 | l=line.split(" ") 207 | seqname=l[0][1:] 208 | size=int(l[2].split(":")[-2]) 209 | chrsizes[seqname]=size 210 | gtf=readGTF(gtf) 211 | gtf=gtf[gtf["feature"]=="transcript"] 212 | gtf.reset_index(inplace=True, drop=True) 213 | 214 | gtf["gene_id"]=retrieve_GTF_field(field="gene_id",gtf=gtf) 215 | gtf["gene_name"]=retrieve_GTF_field(field="gene_name",gtf=gtf) 216 | 217 | def getcoord(df): 218 | seqname=df["seqname"] 219 | strand=df["strand"] 220 | if strand == "+": 221 | tss=int(df["start"]) 222 | promoter_start=tss-upstream 223 | promoter_end=tss+downstream 224 | else: 225 | tss=int(df["end"]) 226 | promoter_start=tss-downstream 227 | promoter_end=tss+upstream 228 | 229 | if promoter_start < 0: 230 | promoter_start=0 231 | if promoter_end > chrsizes[seqname]: 232 | promoter_end=chrsizes[seqname] 233 | 234 | return str(promoter_start)+","+str(promoter_end) 235 | 236 | gtf["promoter"]=gtf.apply(getcoord, axis=1) 237 | gtf["start"]=gtf["promoter"].apply(lambda x: int(x.split(",")[0]) ) 238 | gtf["end"]=gtf["promoter"].apply(lambda x: int(x.split(",")[1]) ) 239 | 240 | gtf["id, name"]=gtf["gene_id"]+", "+gtf["gene_name"] 241 | gtf_=gtf.drop(["source","feature","attribute","promoter","gene_id","gene_name"],axis=1) 242 | gtf_=gtf_.drop_duplicates() 243 | gtf_counts=gtf_[["id, name"]] 244 | gtf_counts["#"]=1 245 | gtf_counts=gtf_counts.groupby(["id, name"]).sum() 246 | beds=gtf_[["seqname","start","end","id, name","score","strand"]] 247 | beds.columns=['chrom', 'start', 'stop', 'name', 'score', 'strand'] 248 | beds=beds[beds["name"].isin( gtf_counts[gtf_counts["#"]==1].index.tolist() )] 249 | genes=[ s for s in list(set(gtf_counts[gtf_counts["#"]>1].index.tolist())) if str(s).lower() != "nan" ] 250 | 251 | for gene_id in genes: 252 | tmp=gtf[gtf["id, name"]==gene_id] 253 | strand=tmp["strand"].tolist()[0] 254 | bed=GTFtoBED(inGTF=tmp,name="id, name") 255 | bed = pybedtools.BedTool.from_dataframe(bed) 256 | bed=bed.sort() 257 | bed=bed.merge() 258 | bed = pd.read_table(bed.fn, names=['chrom', 'start', 'stop' ]) 259 | bed["name"]=gene_id 260 | bed["score"]="." 
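# pybedtools' merge() keeps only chrom/start/stop, so the bed name, score and strand columns
# are re-attached to the merged intervals here before they are concatenated back into 'beds'.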
261 | bed["strand"]=strand 262 | beds=pd.concat([beds,bed]) 263 | 264 | beds = pybedtools.BedTool.from_dataframe(beds) 265 | beds = beds.sort() 266 | beds = pd.read_table(beds.fn, names=['chrom', 'start', 'stop', 'name', 'score', 'strand']) 267 | 268 | beds.reset_index(inplace=True, drop=True) 269 | beds["i"]=beds.index.tolist() 270 | beds["i"]=beds["i"].astype(str) 271 | beds["name"]=beds["i"]+": "+beds["name"] 272 | beds=beds.drop(["i"],axis=1) 273 | 274 | return beds 275 | -------------------------------------------------------------------------------- /AGEpy/homology.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | #import urllib2 # python2 4 | import urllib.request as urllib2 5 | 6 | def getHomoloGene(taxfile="build_inputs/taxid_taxname",\ 7 | genefile="homologene.data",\ 8 | proteinsfile="build_inputs/all_proteins.data",\ 9 | proteinsclusterfile="build_inputs/proteins_for_clustering.data",\ 10 | baseURL="http://ftp.ncbi.nih.gov/pub/HomoloGene/current/"): 11 | """ 12 | Returns NBCI's Homolog Gene tables. 13 | 14 | :param taxfile: path to local file or to baseURL/taxfile 15 | :param genefile: path to local file or to baseURL/genefile 16 | :param proteinsfile: path to local file or to baseURL/proteinsfile 17 | :param proteinsclusterfile: path to local file or to baseURL/proteinsclusterfile 18 | :param baseURL: baseURL for downloading files 19 | 20 | :returns genedf: Homolog gene Pandas dataframe 21 | :returns protclusdf: Pandas dataframe. Lists one protein per gene that were used for homologene clustering. 22 | If a gene has multiple protein accessions derived from alternative splicing, 23 | only one protein isoform that give most protein alignment to proteins in other species 24 | was selected for clustering and it is listed in this file. 25 | :returns proteinsdf: Pandas dataframe. Lists all proteins and their gene information. 26 | If a gene has multple protein accessions derived from alternative splicing event, 27 | each protein accession is list in a separate line. 
28 | """
29 | 
30 | def getDf(inputfile):
31 | if os.path.isfile(inputfile):
32 | df=pd.read_table(inputfile,header=None)
33 | else:
34 | df = urllib2.urlopen(baseURL+inputfile)
35 | df=df.read().decode("utf-8").split("\n")
36 | df=[ s for s in df if len(s) > 0 ]
37 | df=[s.split("\t") for s in df]
38 | df=pd.DataFrame(df)
39 | return df
40 | 
41 | taxdf=getDf(taxfile)
42 | taxdf.set_index([0],inplace=True)
43 | taxdi=taxdf.to_dict().get(1)
44 | 
45 | genedf=getDf(genefile)
46 | genecols=["HID","Taxonomy ID","Gene ID","Gene Symbol","Protein gi","Protein accession"]
47 | genedf.columns=genecols
48 | genedf["organism"]=genedf["Taxonomy ID"].apply(lambda x:taxdi.get(x))
49 | 
50 | proteinsdf=getDf(proteinsfile)
51 | proteinscols=["taxid","entrez GeneID","gene symbol","gene description","protein accession.ver","mrna accession.ver",\
52 | "length of protein listed in column 5","-11) contains data about gene location on the genome",\
53 | "starting position of gene in 0-based coordinate",\
54 | "end position of the gene in 0-based coordinate","strand","nucleotide gi of genomic sequence where this gene is annotated"]
55 | proteinsdf.columns=proteinscols
56 | proteinsdf["organism"]=proteinsdf["taxid"].apply(lambda x:taxdi.get(x))
57 | 
58 | protclusdf=getDf(proteinsclusterfile)
59 | protclustercols=["taxid","entrez GeneID","gene symbol","gene description","protein accession.ver","mrna accession.ver",\
60 | "length of protein listed in column 5","-11) contains data about gene location on the genome",\
61 | "starting position of gene in 0-based coordinate",\
62 | "end position of the gene in 0-based coordinate","strand","nucleotide gi of genomic sequence where this gene is annotated"]
63 | protclusdf.columns=protclustercols
64 | protclusdf["organism"]=protclusdf["taxid"].apply(lambda x:taxdi.get(x))
65 | 
66 | return genedf, protclusdf, proteinsdf
67 | 
-------------------------------------------------------------------------------- /AGEpy/meme.py: --------------------------------------------------------------------------------
1 | import sys
2 | def filterMotifs(memeFile,outFile, minSites):
3 | """
4 | Selects motifs from a meme file based on the number of sites.
5 | 6 | :param memeFile: MEME file to be read 7 | :param outFile: MEME file to be written 8 | :param minSites: minimum number of sites each motif needs to have to be valid 9 | 10 | :returns: nothing 11 | """ 12 | 13 | with open(memeFile, "r") as mF: 14 | oldMEME=mF.readlines() 15 | newMEME=oldMEME[:7] 16 | i=7 17 | while i < len(oldMEME): 18 | if oldMEME[i].split(" ")[0] == "MOTIF": 19 | print(oldMEME[i].split("\n")[0], int(oldMEME[i+2].split("nsites= ")[1].split(" ")[0])) 20 | sys.stdout.flush() 21 | if int(oldMEME[i+2].split("nsites= ")[1].split(" ")[0]) > minSites: 22 | newMEME.append(oldMEME[i]) 23 | f=i+1 24 | while oldMEME[f].split(" ")[0] != "MOTIF": 25 | newMEME.append(oldMEME[f]) 26 | f=f+1 27 | i=i+1 28 | else: 29 | i=i+1 30 | else: 31 | i=i+1 32 | with open(outFile, "w+") as out: 33 | out.write("".join(newMEME) ) 34 | 35 | return newMEME 36 | -------------------------------------------------------------------------------- /AGEpy/rbiom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | #try: 4 | # from rpy2.robjects.packages import importr 5 | # try: 6 | # biomaRt = importr("biomaRt") 7 | # except: 8 | # print "rpy2 could be loaded but 'biomaRt' could not be found.\nIf you want to use 'biomaRt' related functions please install 'biomaRt' in R.\n\n$ R\n> source('http://bioconductor.org/biocLite.R')\n> biocLite()\n> biocLite('biomaRt')\n> quit()" 9 | # sys.stdout.flush() 10 | #except: 11 | # print "Failed to import rpy2 module.\nPlease make sure you are using the same version of R you had when AGEpy was installed." 12 | # sys.stdout.flush() 13 | 14 | rbiomart_host="www.ensembl.org" 15 | 16 | def RdatabasesBM(host=rbiomart_host): 17 | """ 18 | Lists BioMart databases through a RPY2 connection. 19 | 20 | :param host: address of the host server, default='www.ensembl.org' 21 | 22 | :returns: nothing 23 | 24 | """ 25 | biomaRt = importr("biomaRt") 26 | print(biomaRt.listMarts(host=host)) 27 | 28 | def RdatasetsBM(database,host=rbiomart_host): 29 | """ 30 | Lists BioMart datasets through a RPY2 connection. 31 | 32 | :param database: a database listed in RdatabasesBM() 33 | :param host: address of the host server, default='www.ensembl.org' 34 | 35 | :returns: nothing 36 | 37 | """ 38 | biomaRt = importr("biomaRt") 39 | ensemblMart=biomaRt.useMart(database, host=host) 40 | print(biomaRt.listDatasets(ensemblMart)) 41 | 42 | def RfiltersBM(dataset,database,host=rbiomart_host): 43 | """ 44 | Lists BioMart filters through a RPY2 connection. 45 | 46 | :param dataset: a dataset listed in RdatasetsBM() 47 | :param database: a database listed in RdatabasesBM() 48 | :param host: address of the host server, default='www.ensembl.org' 49 | 50 | :returns: nothing 51 | 52 | """ 53 | biomaRt = importr("biomaRt") 54 | ensemblMart=biomaRt.useMart(database, host=host) 55 | ensembl=biomaRt.useDataset(dataset, mart=ensemblMart) 56 | print(biomaRt.listFilters(ensembl)) 57 | 58 | def RattributesBM(dataset,database,host=rbiomart_host): 59 | """ 60 | Lists BioMart attributes through a RPY2 connection. 
61 | 
62 | :param dataset: a dataset listed in RdatasetsBM()
63 | :param database: a database listed in RdatabasesBM()
64 | :param host: address of the host server, default='www.ensembl.org'
65 | 
66 | :returns: nothing
67 | 
68 | """
69 | biomaRt = importr("biomaRt")
70 | ensemblMart=biomaRt.useMart(database, host=host)
71 | ensembl=biomaRt.useDataset(dataset, mart=ensemblMart)
72 | print(biomaRt.listAttributes(ensembl))
73 | 
74 | def RqueryBM(query_filter,query_items,query_attributes,dataset,database,host=rbiomart_host):
75 | """
76 | Queries BioMart.
77 | 
78 | :param query_filter: one BioMart filter associated with the items being queried
79 | :param query_items: list of items to be queried (must associate with given filter)
80 | :param query_attributes: list of attributes to recover from BioMart
81 | :param dataset: dataset to query
82 | :param database: database to query
83 | :param host: address of the host server, default='www.ensembl.org'
84 | 
85 | :returns: a Pandas dataframe of the queried attributes
86 | 
87 | """
88 | 
89 | biomaRt = importr("biomaRt")
90 | ensemblMart=biomaRt.useMart(database, host=host)
91 | ensembl=biomaRt.useDataset(dataset, mart=ensemblMart)
92 | df=biomaRt.getBM(attributes=query_attributes, filters=query_filter, values=query_items, mart=ensembl)
93 | output = [tuple([df[j][i] for j in range(df.ncol)]) for i in range(df.nrow)]
94 | output = pd.DataFrame(output)
95 | output.columns = query_attributes
96 | return output
97 | 
-------------------------------------------------------------------------------- /AGEpy/sam.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | 
4 | def readSAM(SAMfile,header=False):
5 | """
6 | Reads and parses a sam file.
7 | 8 | :param SAMfile: /path/to/file.sam 9 | :param header: logical, if True, reads the header information 10 | 11 | :returns: a pandas dataframe with the respective SAM columns: 'QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL' and a list of the headers if header=True 12 | 13 | """ 14 | if header==True: 15 | f=open(SAMfile,"r+") 16 | head=[] 17 | for line in f.readlines(): 18 | if line[0]=="@": 19 | head.append(line) 20 | else: 21 | continue 22 | f.close() 23 | 24 | sam=pd.read_table(SAMfile,sep="this_gives_one_column",comment="@",header=None) 25 | sam=pd.DataFrame(sam[0].str.split("\t").tolist()) 26 | acols=[0,1,2,3,4,5,6,7,8,9] 27 | sam_=sam[acols] 28 | samcols=sam.columns.tolist() 29 | bcols=[ s for s in samcols if s not in acols ] 30 | sam_[10]=sam[bcols[0]] 31 | if len(bcols) > 1: 32 | for c in bcols[1:]: 33 | sam_[10]=sam_[10].astype(str) 34 | sam[c]=sam[c].astype(str) 35 | sam_[10]=sam_[10]+"\t"+sam[c] 36 | 37 | sam_.columns=['QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL'] 38 | 39 | if header==True: 40 | return sam_, head 41 | else: 42 | return sam_ 43 | 44 | def writeSAM(sam,SAMfile,header=None): 45 | """ 46 | Writes a pandas dataframe with the respective SAM columns: 'QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL' into a sam file 47 | 48 | :param sam: pandas dataframe to be written 49 | :param SAMfile: /path/to/file.sam 50 | 51 | :returns: nothing 52 | """ 53 | def toNone(x): 54 | if x=="None": 55 | x=np.nan 56 | return x 57 | 58 | sam.reset_index(inplace=True,drop=True) 59 | QUAL=pd.DataFrame(sam['QUAL'].str.split("\t").tolist()) 60 | cols=QUAL.columns.tolist() 61 | 62 | for c in cols: 63 | QUAL[c]=QUAL[c].apply(lambda x: toNone(x)) 64 | 65 | sam=sam.drop(['QUAL'],axis=1) 66 | sam=pd.concat([sam,QUAL],axis=1) 67 | sam=sam.astype(str) 68 | sam=sam.values # as_matrix() was removed in newer pandas; .values returns the same row-wise array 69 | 70 | tfile=open(SAMfile, "w+") 71 | 72 | if header != None: 73 | for l in header: 74 | tfile.write(l) 75 | 76 | for l in sam: 77 | l=[ s for s in l if s not in ['nan'] ] 78 | l="\t".join(l) 79 | tfile.write(l+"\n") 80 | 81 | tfile.close() 82 | 83 | def SAMflags(x): 84 | """ 85 | Explains a SAM flag.
86 | 87 | :param x: flag 88 | 89 | :returns: complete SAM flag explanaition 90 | """ 91 | flags=[] 92 | 93 | if x & 1: 94 | l="1: Read paired" 95 | else: 96 | l="0: Read unpaired" 97 | flags.append(l) 98 | 99 | if x & 2 : 100 | l="1: Read mapped in proper pair" 101 | else: 102 | l="0: Read not mapped in proper pair" 103 | flags.append(l) 104 | 105 | if x & 4 : 106 | l="1: Read unmapped" 107 | else: 108 | l="0: Read mapped" 109 | flags.append(l) 110 | 111 | if x & 8 : 112 | l="1: Mate unmapped" 113 | else: 114 | l="0: Mate mapped" 115 | flags.append(l) 116 | 117 | if x & 16 : 118 | l="1: Read reverse strand" 119 | else: 120 | l="0: Read direct strand" 121 | flags.append(l) 122 | 123 | if x & 32 : 124 | l="1: Mate reverse strand" 125 | else: 126 | l="0: Mate direct strand" 127 | flags.append(l) 128 | 129 | if x & 64 : 130 | l="1: First in pair" 131 | else: 132 | l="0: Second in pair" 133 | flags.append(l) 134 | 135 | if x & 128 : 136 | l="1: Second in pair" 137 | else: 138 | l="0: First in pair" 139 | flags.append(l) 140 | 141 | if x & 256 : 142 | l="1: Not primary alignment" 143 | else: 144 | l="0: Primary alignment" 145 | flags.append(l) 146 | 147 | if x & 512 : 148 | l="1: Read fails platform/vendor quality checks" 149 | else: 150 | l="0: Read passes platform/vendor quality checks" 151 | flags.append(l) 152 | 153 | if x & 1024 : 154 | l="1: Read is PCR or optical duplicate" 155 | else: 156 | l="0: Read is not PCR or optical duplicate" 157 | flags.append(l) 158 | 159 | if x & 2048 : 160 | l="1: Supplementary alignment" 161 | else: 162 | l="0: Not supplementary alignment" 163 | flags.append(l) 164 | 165 | return flags 166 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | 3 | RUN apt-get update && apt-get install -yq --no-install-recommends git gcc g++ libz-dev imagemagick imagemagick-doc && apt-get clean && rm -rf /var/lib/apt/lists/* 4 | 5 | RUN pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## AGEpy [![Build Status](https://travis-ci.org/mpg-age-bioinformatics/AGEpy.svg?branch=master)](https://travis-ci.org/mpg-age-bioinformatics/AGEpy) [![PyPI version](https://badge.fury.io/py/AGEpy.svg)](https://badge.fury.io/py/AGEpy) [![ReadtheDocs](https://readthedocs.org/projects/agepy/badge/?version=latest)](http://agepy.readthedocs.io) 2 | 3 | This python package contains Bioinformatics tools developed at the 4 | Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing. 5 | 6 | > Max Planck Institute for Biology of Ageing 7 | > Joseph-Stelzmann-Str. 9b 8 | > D-50931 Cologne 9 | > Germany 10 | 11 | [https://bioinformatics.age.mpg.de](https://bioinformatics.age.mpg.de) 12 | 13 | #### Read the Docs 14 | 15 | [agepy.readthedocs.io](http://agepy.readthedocs.io) 16 | 17 | #### Installation 18 | 19 | ###### pip 20 | 21 | ```bash 22 | pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git --user 23 | ``` 24 | 25 | To install a specific commit use: 26 | ``` 27 | pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git@ --user 28 | # eg. 29 | pip3 install git+https://github.com/mpg-age-bioinformatics/AGEpy.git@9b10b76d021652c44f93e8dd3850a7a937e6fcee --user 30 | ``` 31 | 32 | Alternatively you can also install the package with a symlink, so that changes 33 | to the source files will be immediately available to users of the package on 34 | your system: 35 | 36 | ```bash 37 | git clone https://github.com/mpg-age-bioinformatics/AGEpy 38 | cd AGEpy 39 | python setup.py develop --user 40 | ``` 41 | 42 | Be aware that with the develop option you won't be able to properly update once new scripts are added. 43 | 44 | #### Example usage 45 | 46 | ```python 47 | import AGEpy as age 48 | 49 | gtf=age.readGTF("/path/to/file.gtf") 50 | 51 | gtf.head() 52 | ``` 53 | 54 | #### Help 55 | 56 | In bash: 57 | 58 | ```bash 59 | pydoc AGEpy.AGEpy 60 | ``` 61 | 62 | In python: 63 | 64 | ```python 65 | help("AGEpy.AGEpy") 66 | ``` 67 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | AGEpy 2 | ^^^^^ 3 | 4 | This python package contains Bioinformatics tools developed at the 5 | Bioinformatics Core Facility of the Max Planck Institute for Biology of 6 | Ageing. 7 | 8 | Max Planck Institute for Biology of Ageing 9 | 10 | Joseph-Stelzmann-Str. 9b 11 | 12 | D-50931 Cologne Germany 13 | 14 | `https://bioinformatics.age.mpg.de`_ 15 | 16 | Read the Docs 17 | ^^^^^^^^^^^^^ 18 | 19 | `agepy.readthedocs.io`_ 20 | 21 | .. _agepy.readthedocs.io: http://agepy.readthedocs.io 22 | .. _https://bioinformatics.age.mpg.de: https://bioinformatics.age.mpg.de 23 | -------------------------------------------------------------------------------- /bin/abed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="abed is an annotation tool for bed files.",formatter_class = argparse.ArgumentDefaultsHelpFormatter ) 6 | parser.add_argument("-b", "--bed", help="/path/to/file.bed") 7 | parser.add_argument("-g", "--gtf", help="/path/to/file.gtf") 8 | parser.add_argument("-s", "--sizes", help="/path/to/file.genome. 
Tab separated values of 'chromosome name' and 'size' information.") 9 | parser.add_argument("-c", "--columns", help="A comma separated string of column headers to use when reading in the bed file. eg.: 'chr,start,end,name'." ) 10 | parser.add_argument("-p", "--promoter", help="A comma separated list containing the upstream start of the promoter region from the TSS and the downstream end of the promoter region from the TSS. eg.: '1000,200'.") 11 | parser.add_argument("-o", "--output", help="/path/to/output.tsv.") 12 | 13 | args = parser.parse_args() 14 | 15 | import AGEpy as age 16 | import pandas as pd 17 | 18 | promoters=args.promoter 19 | promoters=promoters.split(",") 20 | promoters=[ int(s) for s in promoters ] 21 | 22 | bed=age.AnnotateBED(args.bed,args.gtf, args.sizes, bedcols=args.columns, promoter=promoters) 23 | 24 | bed.to_csv(args.output, index=None, sep="\t") 25 | -------------------------------------------------------------------------------- /bin/blasto: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | sys.stdout.flush() 8 | 9 | # argparse arguments 10 | 11 | parser = argparse.ArgumentParser(description="This module will load a fasta formatted file and query each fasta sequence for blast \ 12 | The user may add blast parameters as space separated list after the sequence name. All queries are \ 13 | listed into a log table. The user can either let the program running while waiting for the results \ 14 | using the -C option, or quit and check if the results are ready later using -W -t ", \ 15 | formatter_class = argparse.ArgumentDefaultsHelpFormatter) 16 | # Flags 17 | parser.add_argument('-S', '--submitFromFasta', help = 'Read in fasta file and submit blast queries. Write out submitted query IDs.', \ 18 | action = 'store_true') 19 | parser.add_argument('-C', '--continueThrough', help = 'Read from fasta file, submit and continue checking. Write results when they are \ 20 | ready and exit after all results are finished.', action = 'store_true') 21 | parser.add_argument('-W', '--checkAndWriteResults', help = 'Read query IDs from tsv and check status. If results are ready, collect and safe.',\ 22 | action = 'store_true') 23 | # Input 24 | parser.add_argument("-f", "--inputFasta", default = '', help="Fasta formatted input file containing one or more input sequences. \ 25 | The sequence name may contain additional blast paramers, ") # include example 26 | parser.add_argument('-t', '--inputTsv', default = '',help='Tab separated input file containing sequence IDs, output prefix, query IDs, query arguments.') 27 | # Output 28 | parser.add_argument('-o', '--outputPrefix', default = '', help='Output prefix. All files will start with this prefix, blast output files will\ 29 | be written two _.') 30 | parser.add_argument('--format_type', default = 'Tabular', help='format of the blast output') 31 | parser.add_argument('--sleepTime', default = 60, type = int, help = 'time to wait before checking again if your jobs are done, only active if -C is on') 32 | parser.add_argument("--description", help="Get a description of what this script does.", action="store_true") 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | if args.description: 38 | print "This module will load a fasta formatted file and query each fasta sequence for blast \ 39 | The user may add blast parameters as space separated list after the sequence name. All queries are \ 40 | listed into a log table. 
The user can either let the program running while waiting for the results \ 41 | using the -C option, or quit and check if the results are ready later using -W -t " 42 | sys.exit(0) 43 | 44 | # test input and arguments 45 | 46 | # test if inputfiles are present, if -S then -f if -W then -t 47 | if args.submitFromFasta and args.inputFasta == '': 48 | print('ERROR: If you are trying to submit your jobs, you need to supply input fasta sequences using -f ') 49 | sys.exit(1) 50 | 51 | if args.submitFromFasta and not os.path.exists(args.inputFasta): 52 | print ('ERROR: No such input file: %s' %(args.inputFasta)) 53 | sys.exit(1) 54 | 55 | if args.checkAndWriteResults and args.inputTsv == '': 56 | print('ERROR: If you are trying to check and write your jobs, you need to supply input tab separated table -t ') 57 | sys.exit(1) 58 | 59 | if args.checkAndWriteResults and not os.path.exists(args.inputTsv): 60 | print ('ERROR: No such input file: %s' %(args.inputTsv)) 61 | sys.exit(1) 62 | 63 | # test if output location is writable 64 | if not os.path.isdir('/'.join(args.outputPrefix.split('/')[:-1])): 65 | print('ERROR: No such output directory: %s' %('/'.join(args.outputPrefix.split('/')[:-1]))) 66 | sys.exit(1) 67 | if not os.access('/'.join(args.outputPrefix.split('/')[:-1]), os.W_OK): 68 | print('ERROR: You do not have write permissions: %s' %('/'.join(args.outputPrefix.split('/')[:-1]))) 69 | 70 | # test if format_type belongs to possible format types 71 | if not args.format_type in ['Tabular', 'Text', 'XML', 'XML2', 'JSON2']: 72 | print('ERROR: Only Tabular, Text, XML, XML2, or JSON2 are a supported format_type right now. %s is not supported' %(args.format_type)) 73 | sys.exit(1) 74 | 75 | # import AGEpy and other packages 76 | import AGEpy as age 77 | import pandas as pd 78 | import numpy as np 79 | import time 80 | 81 | 82 | # read in fasta 83 | if args.submitFromFasta: 84 | I = open(args.inputFasta) 85 | FASTA = {} 86 | 87 | while True: 88 | tmp_seqID = I.readline() 89 | tmp_sequence = I.readline() 90 | if not tmp_seqID.startswith('>'): 91 | break 92 | FASTA[tmp_seqID.replace('\n', '')[1:]] = {'sequence': tmp_sequence.replace('\n', '')} 93 | 94 | I.close() 95 | 96 | # open query_output file 97 | queryID_output = open('%s.queryTable.tsv' %(args.outputPrefix), 'w') 98 | queryID_output.write('SequenceID\tuser_prefix\tqueryID\tparameters\n') 99 | 100 | # for each fasta make a query and save queryID 101 | for seq in FASTA: 102 | # initalize BLAST parameters 103 | database = 'nt'; program = 'blastn'; filter=None; format_type=None; expect=None 104 | nucl_reward=None; nucl_penalty=None; gapcosts=None; matrix=None; hitlist_size=None 105 | descriptions=None; alignments=None; ncbi_gi=None; threshold=None 106 | word_size=None; composition_based_statistics=None; organism=None; others=None 107 | num_threads=None; baseURL="http://blast.ncbi.nlm.nih.gov"; verbose=False 108 | # redifine paramters based on user input 109 | params = seq.split(' ')[1:] 110 | for p in params: 111 | exec(p) 112 | # correctly format gapcosts 113 | if gapcosts: 114 | gapcosts = gapcosts.replace(',', ' ') 115 | # submit BLAST 116 | RID=age.BLASTquery(FASTA[seq]['sequence'], database, program, filter=filter,\ 117 | format_type=format_type, expect=expect,\ 118 | nucl_reward=nucl_reward, nucl_penalty=nucl_penalty,\ 119 | gapcosts=gapcosts, matrix=matrix,\ 120 | hitlist_size=hitlist_size, descriptions=descriptions,\ 121 | alignments=alignments, ncbi_gi=ncbi_gi, threshold=threshold,\ 122 | word_size=word_size, 
composition_based_statistics=composition_based_statistics,\ 123 | organism=organism, others=others, num_threads=num_threads, baseURL=baseURL,\ 124 | verbose=verbose) 125 | print(FASTA[seq]['sequence']) 126 | print(RID) 127 | FASTA[seq]['queryID'] = RID 128 | FASTA[seq]['SeqID'] = seq.split(' ')[0] 129 | FASTA[seq]['params'] = seq.split(' ')[1:] 130 | # write query id to log table 131 | queryID_output.write('%s\t%s\t%s\t%s\n' %(seq.split(' ')[0], args.outputPrefix, RID, ' '.join(seq.split(' ')[1:]))) 132 | 133 | queryID_output.close() 134 | print('%s jobs have been submitted.' %(len(FASTA))) 135 | 136 | # exit if -C is not specified 137 | if not args.continueThrough: 138 | print('\nYou can find an overview here: %s.queryTable.tsv' %(args.outputPrefix)) 139 | print "\n\n*************************************\nDeveloped by the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing \n\nbioinformatics@age.mpg.de\n\n" 140 | exit(0) 141 | 142 | print('continuing ...') 143 | 144 | 145 | # read in tsv if the program stopped after submitting ### TODO get prefix from file 146 | if args.checkAndWriteResults: 147 | I = open(args.inputTsv) 148 | FASTA = {} 149 | I.readline() 150 | for line in I: 151 | L = line.replace('\n', '').split('\t') 152 | FASTA['%s %s' %(L[0], L[3])] = {'params' : L[3].split(' '), 'sequence' : '', 'SeqID': L[0], 'queryID': L[2]} 153 | I.close() 154 | 155 | 156 | # check if results are ready and write them if they are ready 157 | while len(FASTA) > 0: 158 | finished = [] 159 | for seq in FASTA: 160 | status, therearehits=age.BLASTcheck(FASTA[seq]['queryID']) 161 | if status == 'READY' and therearehits == 'yes': 162 | r=age.BLASTresults(FASTA[seq]['queryID'], format_type = args.format_type) 163 | if args.format_type == 'Tabular': 164 | r.insert(0, 'query_name', [FASTA[seq]['SeqID']] * r.shape[0]) 165 | r.to_csv('%s_%s.tsv' %(args.outputPrefix, FASTA[seq]['SeqID']), sep = '\t', index = False) 166 | elif format_type.lower() in ['html', 'Text', 'xml', 'xml2', 'json2']: 167 | O = open('%s_%s.%s' %(args.outputPrefix, FASTA[seq]['SeqID'], format_type.lower()), 'w') 168 | O.write(r) 169 | O.close() 170 | else: 171 | print('Only Tabular, Text, XML, XML2, or JSON2 are a supported format_type right now. 
%s is not supported' %(args.format_type)) 172 | finished += [seq] 173 | elif status == 'READY' and therearehits == 'no': 174 | print('Query %s is ready but has no hits' %(FASTA[seq]['SeqID'])) 175 | finished += [seq] 176 | else: 177 | print('Query %s is not ready yet' %(FASTA[seq]['SeqID'])) 178 | for seq in finished: 179 | del FASTA[seq] 180 | if not args.continueThrough: 181 | print('%s jobs are still running' %(len(FASTA))) 182 | print "\n\n*************************************\nDeveloped by the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing \n\nbioinformatics@age.mpg.de\n\n" 183 | exit(0) 184 | print('%s jobs are still running' %(len(FASTA))) 185 | if len(FASTA) > 0: 186 | print('waiting ...') 187 | time.sleep(args.sleepTime) 188 | 189 | print('finished') 190 | 191 | print "\n\n*************************************\nDeveloped by the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing \n\nbioinformatics@age.mpg.de\n\n" 192 | sys.exit() 193 | 194 | 195 | # python blasto -S -f /home/fmetge/Documents/corefacility/AGEpy/test_fasta.fa -o /home/fmetge/Documents/corefacility/AGEpy/test 196 | # python blasto -W -t /home/fmetge/Documents/corefacility/AGEpy/test.queryTable.tsv -o /home/fmetge/Documents/corefacility/AGEpy/test 197 | 198 | # python blasto -S -C -f /home/fmetge/Documents/corefacility/AGEpy/test_fasta.fa -o /home/fmetge/Documents/corefacility/AGEpy/test ... works in theory, but doesnt finit 199 | # python blasto -W -C -t /home/fmetge/Documents/corefacility/AGEpy/test.queryTable.tsv -o /home/fmetge/Documents/corefacility/AGEpy/test 200 | -------------------------------------------------------------------------------- /bin/david: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | sys.stdout.flush() 8 | 9 | parser = argparse.ArgumentParser(description="Queries the DAVID database for an enrichment \ 10 | analysis and plots CellPlots as well as SymPlots (see plots). \ 11 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == 'type' tag and categories == 'annot' tag.", \ 12 | formatter_class = argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument("-i", "--input", help="A file with tab separated values where \ 14 | the first column contains the identifiers to be queried and the second column the \ 15 | respective log2fc for each identifier.") 16 | parser.add_argument("-o", "--output", help="/path/to/output/prefix") 17 | parser.add_argument("-d", "--database", help="a string for the database to query, e.g. 'WORMBASE_GENE_ID'.") 18 | parser.add_argument("-c", "--categories", help="a comma separated list of categories.",\ 19 | default='GOTERM_BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY,BIOCARTA,PFAM,PROSITE') 20 | parser.add_argument("-u", "--user", help="a user ID registered at DAVID for querying") 21 | parser.add_argument("-v", "--verbose", help="Print more.",default=None, action="store_true") 22 | parser.add_argument("-p", "--pvalue", help="Maximum p value for enrichment of a term.", default=0.1) 23 | parser.add_argument("-n", "--ngenes", help="Minimum number of genes within a term.", default=2) 24 | parser.add_argument("-b", "--background", help="A file with tab separated values where \ 25 | the first column contains the identifiers to used as a background. 
\ 26 | None for whole DAVID database as background.", default=None) 27 | args = parser.parse_args() 28 | 29 | import pandas as pd 30 | import AGEpy as age 31 | 32 | df_ids=pd.read_csv(args.input, sep = '\t') 33 | if args.background: 34 | df_ids_bg=pd.read_csv(args.background, sep = '\t') 35 | else: 36 | df_ids_bg=None 37 | 38 | #categories=args.categories.split(",") 39 | 40 | age.DAVIDplot(args.database, args.categories, args.user, df_ids, args.output, \ 41 | df_ids_bg = df_ids_bg, name = '', name_bg = '', verbose = args.verbose, \ 42 | p = args.pvalue, n = args.ngenes) 43 | -------------------------------------------------------------------------------- /bin/obo2tsv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="obo to tsv parser", formatter_class = argparse.ArgumentDefaultsHelpFormatter) 6 | parser.add_argument("-i", "--input",help="go-basic.obo file. Files can be downloaded from http://geneontology.org/page/download-ontology.") 7 | parser.add_argument("-u", "--url", help="If no go-basic.obo input file is specified, a url to a target obo file can be specified instead.", default='http://geneontology.org/ontology/go-basic.obo') 8 | parser.add_argument("-o", "--output", help="Name of output tab separated file.", default="go-basic.tsv") 9 | parser.add_argument("-c", "--cpus", help="Number of cpus.", default=36) 10 | parser.add_argument("--organism", help="Optional, merge GO obo.tsv with a GO annotation for an organism: either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective downloded .gz file.",default=None) 11 | args = parser.parse_args() 12 | 13 | import pandas as pd 14 | import numpy as np 15 | import multiprocessing as mp 16 | import sys 17 | import contextlib 18 | import cStringIO 19 | @contextlib.contextmanager 20 | def nostdout(): 21 | save_stdout = sys.stdout 22 | sys.stdout = cStringIO.StringIO() 23 | yield 24 | sys.stdout = save_stdout 25 | with nostdout(): 26 | import AGEpy.AGEpy as age 27 | #age.checkImport() 28 | 29 | 30 | n_processors=int(args.cpus) 31 | 32 | if args.input: 33 | f=open(args.input) 34 | lis=f.readlines() 35 | r=args.input 36 | elif args.url: 37 | from urllib import urlopen 38 | f=urlopen(args.url).read().split("\n") 39 | lis=[str(x)+"\n" for x in f] 40 | r=args.url 41 | 42 | print "Finished importing %s" %r 43 | sys.stdout.flush() 44 | 45 | def getTerm(i,lines): 46 | term={} 47 | cats=[] 48 | GOid=lines[i].split("\n")[0].split(": ")[1] 49 | i+=1 50 | while lines[i] != "\n": 51 | line=lines[i].split("\n")[0].split(": ") 52 | cats.append(line[0]) 53 | if not line[0] in term.keys(): 54 | term[line[0]]=line[1] 55 | else: 56 | nval=term[line[0]]+"; "+line[1] 57 | term[line[0]]=nval 58 | i+=1 59 | return i+1, GOid, term, cats 60 | 61 | def collectUpper(x,GOdic): 62 | allUpper=[] 63 | is_a=GOdic[x]["is_a"] 64 | is_a=is_a.split("; ") 65 | is_a=[ s.split(" ! 
")[0] for s in is_a] 66 | return is_a 67 | 68 | def checkTop(x,df): 69 | name_spaces=set(df["namespace"].tolist()) 70 | if len(x) == 1: 71 | if x[0] in name_spaces: 72 | return True 73 | else: 74 | return False 75 | else: 76 | return False 77 | 78 | i=0 79 | GOdic={} 80 | cats=[] 81 | while i < len(lis): 82 | l=lis[i] 83 | if '[Term]' in l: 84 | i, GOid, term, c=getTerm(i+1,lis) 85 | GOdic[GOid]=term 86 | cats.append(c) 87 | 88 | else: 89 | i+=1 90 | cats=[item for sublist in cats for item in sublist] 91 | 92 | df=pd.DataFrame.from_dict(GOdic,orient="index") 93 | 94 | print "Collecting information on parent terms" 95 | sys.stdout.flush() 96 | 97 | for GO in GOdic.keys(): 98 | allUpper=[] 99 | if "is_a" in GOdic[GO].keys(): 100 | upper=collectUpper(GO,GOdic) 101 | allUpper.append(upper) 102 | while not checkTop(upper,df): 103 | sub=[] 104 | for u in upper: 105 | if "is_a" in GOdic[u].keys(): 106 | upper_=collectUpper(u,GOdic) 107 | sub.append(upper_) 108 | if len(sub)>0: 109 | sub=[item for sublist in sub for item in sublist] 110 | allUpper.append(sub) 111 | upper=sub 112 | else: 113 | break 114 | allUpper=list(set([item for sublist in allUpper for item in sublist])) 115 | allUpper="; ".join(allUpper) 116 | GOdic[GO]["parent_terms"]=allUpper 117 | 118 | #import json 119 | #with open("/beegfs/group_bit/home/JBoucas/GO_test/out.dic", 'w') as configfile: 120 | # json.dump(GOdic, configfile) 121 | #import json 122 | #with open("/beegfs/group_bit/home/JBoucas/GO_test/out.dic", 'r') as configfile: 123 | # GOdic=json.load(configfile) 124 | 125 | df=pd.DataFrame.from_dict(GOdic,orient="index") 126 | df["term"]=df.index.tolist() 127 | df=df.reset_index(drop=True) 128 | 129 | def getChildren(x,dfn=df): 130 | children=[] 131 | for i in range(len(dfn)): 132 | if str(dfn.ix[i,"parent_terms"])!="nan": 133 | if x in dfn.ix[i,"parent_terms"]: 134 | children.append(dfn.ix[i,"term"]) 135 | if len(children)>1: 136 | children="; ".join(children) 137 | elif len(children)==1: 138 | children=str(children[0]) 139 | else: 140 | children="None" 141 | return children 142 | 143 | def worker(df): 144 | df=pd.DataFrame(df) 145 | df["children"]=df['term'].apply(getChildren) 146 | df["result"]=df["term"].astype(str)+"-"+df["children"].astype(str) 147 | res=df["result"].tolist() 148 | return res 149 | 150 | def correctNones(x): 151 | if x == "None": 152 | return np.nan 153 | else: 154 | return x 155 | 156 | 157 | if __name__ == '__main__': 158 | print "Collecting information on child terms" 159 | sys.stdout.flush() 160 | 161 | reader = np.array_split(df,n_processors) 162 | pool = mp.Pool(n_processors) 163 | funclist = [] 164 | for d in reader: 165 | out_put = pool.apply_async(worker,[d]) 166 | funclist.append(out_put) 167 | 168 | dfCov=pd.DataFrame() 169 | for f in funclist: 170 | covs_=f.get() 171 | covs_=pd.DataFrame(covs_,index=range(len(covs_))) 172 | dfCov=pd.concat([dfCov,covs_],axis=0) 173 | dfCov.columns=['chil'] 174 | TSS=pd.DataFrame(dfCov['chil'].str.split("-").tolist()) 175 | TSS.columns=["term","children"] 176 | TSS["children"]=TSS["children"].apply(lambda x: correctNones(x) ) 177 | df=pd.merge(df,TSS,on="term",how="outer") 178 | col=['term','synonym','name', 'relationship', 'namespace', 'is_a', 'def', 'subset', 'comment', 'xref', 'is_obsolete', 'consider', 'alt_id','replaced_by', 'parent_terms', 'children'] 179 | df=df[col] 180 | if args.organism: 181 | org=age.getGeneAssociation(args.organism) 182 | if "GO ID" in org.columns.tolist(): 183 | df=pd.merge(org,df,left_on=["GO ID"], right_on=["term"],how="left") 
184 | else: 185 | check=org.ix[0] 186 | for i in range(len(check)): 187 | if "GO:" in check[i]: 188 | break 189 | df=pd.merge(org,df,left_on=[i], right_on=["term"],how="left") 190 | df.to_csv(args.output,sep="\t",index=None) 191 | print "Done" 192 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | from recommonmark.parser import CommonMarkParser 2 | 3 | source_parsers = { 4 | '.md': CommonMarkParser, 5 | } 6 | 7 | source_suffix = ['.rst', '.md'] 8 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/.DS_Store -------------------------------------------------------------------------------- /docs/cookbook.md: -------------------------------------------------------------------------------- 1 | ### Importing 2 | 3 | All functions in the AGEpy pakcage can be accessed using: 4 | 5 | ```python 6 | import AGEpy as age 7 | help(age.readGTF) 8 | ``` 9 | 10 | Alternatively, functions from the different modules can be accessed with for example: 11 | 12 | ```python 13 | from AGEpy import gtf 14 | help(gtf.readGTF) 15 | ``` 16 | 17 | ### Help 18 | 19 | In bash: 20 | 21 | ```bash 22 | pydoc AGEpy.AGEpy 23 | ``` 24 | 25 | In python: 26 | 27 | ```python 28 | help("AGEpy.AGEpy") 29 | ``` 30 | 31 | ### Example usage 32 | 33 | ```python 34 | import AGEpy as age 35 | 36 | gtf=age.readGTF("/path/to/file.gtf") 37 | 38 | gtf.head() 39 | ``` 40 | -------------------------------------------------------------------------------- /docs/executables/abed.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | `abed` is annotation tool for bed files. 4 | 5 | It annotates bed files with gene names, gene ids, and feature information from a 6 | provided annotations file (GTF). 7 | 8 | ## Examples 9 | 10 | ``` 11 | $ abed -b K27AC_chip1_peaks.bed -g hg38.83.gtf -s hg38.83.genome \ 12 | -c "chr,start,end,name,signal_value,strand,fold_change,p_value,Benjamini_Hochberg_FDR" \ 13 | -p 1000,200 -o annotated.bed.tsv 14 | ``` 15 | 16 | ## Output files 17 | 18 | * **`annotated.bed.tsv`** 19 | 20 | ``` 21 | chr start end name signal_value strand fold_change p_value Benjamini_Hochberg_FDR annotated_gene_features 22 | 1 1205710 1205930 chip1_peak_2799 241.68113891 . 12.1486518993 1.18025200195e-06 0.00504618822857 TNFRSF18/ENSG00000186891: promoter 23 | 1 1616560 1616780 chip1_peak_5889 71.050614487 . 3.57152066778 5.81256160902e-06 0.0113928182561 RP11-345P4.9/ENSG00000272106: promoter; MIB2/ENSG00000197530: five_prime_utr, exon, promoter, CDS 24 | 1 1892440 1892660 chip1_peak_3527 243.582136289 . 12.2442098543 1.93910073064e-06 0.00651852531479 RP1-140A9.1/ENSG00000231050: exon 25 | 1 2212540 2212870 chip1_peak_25 81.3040545107 . 4.08693314134 9.22431065693e-12 4.99354651514e-06 FAAP20/ENSG00000162585: exon, promoter; RP11-181G12.4/ENSG00000234396: exon, promoter 26 | 1 3712500 3712720 chip1_peak_6234 38.8954679096 . 1.95516912169 6.52541908518e-06 0.0120595193967 TP73/ENSG00000078900; RP5-1092A11.2/ENSG00000235131 27 | 1 3772780 3773000 chip1_peak_4768 120.93909338 . 6.0792784787 3.69904369916e-06 0.00905667169324 SMIM1/ENSG00000235169: five_prime_utr, exon, promoter 28 | 1 4680280 4680500 chip1_peak_7707 110.246753848 . 
5.54180372353 9.96101899764e-06 0.014735318932 AJAP1/ENSG00000196581 29 | 1 5652020 5652240 chip1_peak_526 145.04841094 . 7.29118813738 2.00633103721e-08 0.000477258277665 RP11-154H17.1/ENSG00000236948 30 | 1 6330720 6330940 chip1_peak_7213 153.651918104 . 7.72366298467 8.71110008982e-06 0.0138290963917 ACOT7/ENSG00000097021 31 | 1 6362730 6362950 chip1_peak_7153 87.6949697748 . 4.40818702657 8.54621275409e-06 0.0136908694171 ACOT7/ENSG00000097021 32 | 1 6421360 6421580 chip1_peak_5440 279.339262378 . 14.0416230896 4.90344812744e-06 0.0104638228061 HES2/ENSG00000069812 33 | 1 6423890 6424220 chip1_peak_3597 398.194019503 . 20.0161276677 2.01692509039e-06 0.00664985446589 ESPN/ENSG00000187017: promoter; HES2/ENSG00000069812 34 | 1 6859710 6860040 chip1_peak_53 79.510332304 . 3.99676761667 4.53361213715e-11 1.10018291319e-05 CAMTA1/ENSG00000171735 35 | 1 7400250 7400470 chip1_peak_5957 122.230907584 . 6.14421445654 5.936688264e-06 0.011503154056 CAMTA1/ENSG00000171735 36 | 1 7705060 7705390 chip1_peak_1184 96.6376920702 . 4.85771329365 1.53921999586e-07 0.001601221552 CAMTA1/ENSG00000171735 37 | 1 7745650 7745870 chip1_peak_746 105.6964562 . 5.31307266734 4.97095836632e-08 0.000833924420617 CAMTA1/ENSG00000171735: exon, CDS 38 | ``` 39 | 40 | ## Help 41 | 42 | ``` 43 | $ abed --help 44 | 45 | usage: abed [-h] [-b BED] [-g GTF] [-s SIZES] [-c COLUMNS] [-p PROMOTER] 46 | [-o OUTPUT] 47 | 48 | abed is an annotation tool for bed files. 49 | 50 | optional arguments: 51 | -h, --help show this help message and exit 52 | -b BED, --bed BED /path/to/file.bed (default: None) 53 | -g GTF, --gtf GTF /path/to/file.gtf (default: None) 54 | -s SIZES, --sizes SIZES 55 | /path/to/file.genome. Tab separated values of 56 | 'chromosome name' and 'size' information. (default: 57 | None) 58 | -c COLUMNS, --columns COLUMNS 59 | A comma separated string of column headers to use when 60 | reading in the bed file. eg.: 'chr,start,end,name'. 61 | (default: None) 62 | -p PROMOTER, --promoter PROMOTER 63 | A comma separated list containing the upstream start 64 | of the promoter region from the TSS and the downstream 65 | end of the promoter region from the TSS. eg.: 66 | '1000,200'. (default: None) 67 | -o OUTPUT, --output OUTPUT 68 | /path/to/output.tsv. (default: None) 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/executables/adiff.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | `aDiff` is an annotation tool for differential gene expression results generated by ***cuffdiff*** (Trapnell C., *Nature Biotechnology*, 2012). 4 | 5 | It annotates *cuffdiff* outputs with ensembl gene ids, gene ontology terms and kegg ids. 6 | 7 | Additonally it uses ***DAVID***s API (Huang DW, *Nature Protoc.*, 2009; Huang DW, *Nucleic Acids Res.*, 2009; Xiaoli J, *Bioinformatics*, 2012) to perform enrichment analysis. 8 | 9 | A ***Cytoscape*** (Shannon P, *Genome Research*, 2003) instance running with the ***String*** (Szklarczyk D, *Nucleic Acids Res.*, 2017) App installed can additionally be plugged in to generate expanded protein-protein interactions. 10 | 11 | For a full RNAseq pipeline including `aDiff` check: [http://bioinformatics.age.mpg.de/presentations-tutorials/presentations/modules/rnaseq-tuxedo-update/#/intro](http://bioinformatics.age.mpg.de/presentations-tutorials/presentations/modules/rnaseq-tuxedo-update/#/intro) 12 | 13 | ## Examples 14 | 15 | Example of an `aDiff` call on a *c. 
elegans* dataset: 16 | 17 | ``` 18 | $ aDiff -D -i cuffdiff_output -o adiff_output \ 19 | -G references/cel.latest.ensembl.gtf \ 20 | -C cuffmerge_output/merged.gtf \ 21 | --DAVIDuser "" \ 22 | --organismtag CEL \ 23 | --cytoscape_host 'localhost' \ 24 | --cytoscape_port 1234 25 | ``` 26 | 27 | Example of an `aDiff` call on a *d. melanogaster* dataset: 28 | 29 | ``` 30 | $ aDiff -D -i cuffdiff_output -o adiff_output \ 31 | -G references/Drosophila_melanogaster.BDGP6.90.gtf \ 32 | -C cuffmerge_output/merged.gtf \ 33 | --dataset dmelanogaster_gene_ensembl \ 34 | --filter flybase_gene_id \ 35 | --outputBiotypes 'flybase_gene_id gene_biotype' \ 36 | --outputGoterms 'flybase_gene_id go_id name_1006' \ 37 | --DAVIDid FLYBASE_GENE_ID \ 38 | --DAVIDuser "" \ 39 | --organismtag DMEL \ 40 | --species 'drosophila melanogaster' \ 41 | --cytoscape_host 'localhost' \ 42 | --cytoscape_port 1234 43 | ``` 44 | 45 | Example of an `aDiff` call on a *mus musculus* dataset: 46 | 47 | ``` 48 | $ aDiff -i cufdiff_output -o adiff_output \ 49 | -G ensembl.mus_musculus.83.original.gtf \ 50 | -C cuffmerge_output/merged.gtf \ 51 | --TSV \ 52 | --dataset mmusculus_gene_ensembl \ 53 | -u "" \ 54 | --DAVIDid ENSEMBL_GENE_ID \ 55 | --host http://dec2015.archive.ensembl.org/biomart \ 56 | --organismtag MUS \ 57 | --species 'mus musculus' \ 58 | --cytoscape_host 'localhost' \ 59 | --cytoscape_port 1234 60 | ``` 61 | 62 | Example of an `aDiff` call on a *h. sapiens* dataset: 63 | 64 | ``` 65 | $ aDiff -i cufdiff_output -o adiff_output \ 66 | -G ensembl.homo_sapiens.83.original.gtf \ 67 | -C cuffmerge_output/merged.gtf \ 68 | --TSV \ 69 | --dataset hsapiens_gene_ensembl \ 70 | -u "" \ 71 | --DAVIDid ENSEMBL_GENE_ID \ 72 | --host http://dec2015.archive.ensembl.org/biomart \ 73 | --organismtag HSA \ 74 | --species 'homo sapiens' \ 75 | --cytoscape_host 'localhost' \ 76 | --cytoscape_port 1234 77 | ``` 78 | 79 | ## Output files 80 | 81 | Example of the output for the the *h. sapiens* call above. 82 | 83 | * **`diff_sig_geneexp.xlsx`** this file reports significant differential gene expression. It is based on the *gene_exp.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 84 | 85 | * **`diff_sig_iso.xlsx`** this file reports significant differential isoform expression . It is based on the *isoform_exp.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 86 | 87 | * **`diff_sig_prom.xlsx`** this file reports significant differential promoter usage. It is based on the *promoters.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 88 | 89 | * **`diff_sig_splic.xlsx`** this file reprots significant differential splicing . It is based on the *splicing.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 90 | 91 | * **`diff_sig_cds.xlsx`** this file reports significant differential cds usage. It is based on the *cds.diff* file output of *cuffdiff* adding annotation columns to it. It contains one sheet for each pairwise comparison filtered to significant values (as defined in *cuffdiff*). 
92 | 93 | * **`geneexp_ALL.tsv`** this file is based on the *gene_exp.diff* file output of *cuffdiff* adding annotation columns to it. 94 | 95 | * **`iso_ALL.tsv`** this file is based on the *isoform_exp.diff* file output of *cuffdiff* adding annotation columns to it. 96 | 97 | * **`prom_ALL.tsv`** this file is based on the *promoters.diff* file output of *cuffdiff* adding annotation columns to it. 98 | 99 | * **`splic_ALL.tsv`** this file is based on the *splicing.diff* file output of *cuffdiff* adding annotation columns to it. 100 | 101 | * **`cds_ALL.tsv`** this file is based on the *cds.diff* file output of *cuffdiff* adding annotation columns to it. 102 | 103 | * **`diff_p.05.xlsx`** contains a sheet for each of the files above (ie. *geneexp_ALL.tsv*, *iso_ALL.tsv*, *prom_ALL.tsv*, *splic_ALL.tsv*, *cds_ALL.tsv* ) subset to p values below 0.05. 104 | 105 | * **`KEGG_PATHWAY_diff_sig_geneexp.xlsx`** this file is based on the *gene_exp.diff* file output of *cuffdiff*. It generates a result sheet for each pairwise comparison. It reports DAVID enrichment results for KEGG using genes labeled as significant by *cuffdiff*. 106 | 107 | * **`GOTERM_BP_FAT_diff_sig_splic.xlsx`** this file is based on the *splicing.diff* file output of *cuffdiff*. It generates a result sheet for each pairwise comparison. It reports DAVID enrichment results for Gene Ontology Biological Process (GOTERM BP) using genes labeled as significant by *cuffdiff*. 108 | 109 | * **`OMIM_DISEASE_diff_sig_geneexp.xlsx`** this file is based on the *gene_exp.diff* file output of *cuffdiff*. It generates a result sheet for each pairwise comparison. It reports DAVID enrichment results for OMIM DISEASE using genes labeled as significant by *cuffdiff*. 110 | 111 | DAVID output columns: 112 | 113 | * **categoryName**: Category name. eg.: GOTERM_BP_FAT. 114 | 115 | * **termName**: Term name. eg.: GO:0048468~cell development. 116 | 117 | * **listHits**: Number of items in the query list matching this term. 118 | 119 | * **percent**: Percentage of items in the query list matching this term. 120 | 121 | * **ease**: EASE test p value. 122 | 123 | * **geneIds**: gene ids. 124 | 125 | * **Gene_name**: gene name. 126 | 127 | * **listTotals**: number of genes in query list. 128 | 129 | * **popHits**: number of genes in background population list matching this term. 130 | 131 | * **popTotals**: number of genes in background population list. 132 | 133 | * **foldEnrichment**: Fold enrichment. 134 | 135 | * **bonferroni**: Bonferroni corrected p values. 136 | 137 | * **benjamini**: Benjamini-Hochberg corrected p values. 138 | 139 | * **afdr**: False discovery rate. 140 | 141 | More information on the standard output columns of *cuffdiff* can be found [here](http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/index.html). 142 | 143 | The `cytoscape` folder contains cytoscape session files `cys`, as well as `pdf`s and `png`s of the generated networks. Networks are generated by String PPI queries allowing a 25% size expansion and a confidence cutoff of 0.4. It also generates a subnetwork by ranking the genes by abs(log2(fold change)) and selecting the top 10% of nodes with edges and the respective first neighbours, as well as the same 10% selection but using diffusion. Node color maps log2(fold change) - blue down, red up - while node border color and size map normalized expression.
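The per-comparison sheets in these workbooks can be loaded back into Python for further filtering. Below is a minimal sketch, assuming `pandas` with an Excel engine (e.g. `openpyxl`) is installed; the `adiff_output` prefix follows the example calls above, and the `log2(fold_change)` column name follows *cuffdiff*'s standard output.

```python
import pandas as pd

# sheet_name=None returns a {comparison_name: DataFrame} dictionary,
# one entry per pairwise comparison reported by cuffdiff.
sheets = pd.read_excel("adiff_output/diff_sig_geneexp.xlsx", sheet_name=None)

for comparison, df in sheets.items():
    # keep significant genes with an absolute log2 fold change above 1
    strong = df[df["log2(fold_change)"].abs() > 1]
    print(comparison, len(strong), "genes")
```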
144 | 145 | ## Help 146 | 147 | ``` 148 | $ aDiff --help 149 | 150 | aDiff is an annotation tool for differential gene expression results generated 151 | by cuffdiff (Trapnell C., Nature Biotechnology, 2012). 152 | 153 | usage: aDiff [-h] [-D] [-i INPUTFOLDER] [-o OUTPUTFOLDER] [-G ORIGINALGTF] 154 | [-C CUFFCOMPAREGTF] [-f INPUTFILES] [-s SHORTOUTPUTNAME] 155 | [--sigOnly] [--TSV] [--TSVall] [--description] [--listMarts] 156 | [--mart MART] [--listDatasets] [--dataset DATASET] 157 | [--listFilters] [--filter FILTER] [--listAttributes] 158 | [--outputBiotypes OUTPUTBIOTYPES] [--outputGoterms OUTPUTGOTERMS] 159 | [--KEGG] [--listKEGGorganisms] [--KEGGorg KEGGORG] [--findKEGGdb] 160 | [--KEGGdb KEGGDB] [--DAVIDid DAVIDID] [--DAVIDcat DAVIDCAT] 161 | [-u DAVIDUSER] [--host HOST] [--organismtag {DMEL,CEL,MUS,HSA}] 162 | [--species SPECIES] [--limit LIMIT] [--cuttoff CUTTOFF] 163 | [--taxon TAXON] [--cytoscape_host CYTOSCAPE_HOST] 164 | [--cytoscape_port CYTOSCAPE_PORT] 165 | 166 | optional arguments: 167 | -h, --help show this help message and exit 168 | -D, --DAVID Use this flag to perform DAVID GO enrichment analysis 169 | (default: False) 170 | -i INPUTFOLDER, --inputFolder INPUTFOLDER 171 | Cuffdiff output folder (default: None) 172 | -o OUTPUTFOLDER, --outputFolder OUTPUTFOLDER 173 | Output folder (default: None) 174 | -G ORIGINALGTF, --originalGTF ORIGINALGTF 175 | Original/downloaded GTF (default: None) 176 | -C CUFFCOMPAREGTF, --cuffcompareGTF CUFFCOMPAREGTF 177 | Merged cuffcompared GTF (default: None) 178 | -f INPUTFILES, --inputFiles INPUTFILES 179 | Implies -s. Use this option to select which *.diff 180 | files you wish to analyse.'. (default: gene_exp.diff 181 | promoters.diff splicing.diff cds.diff 182 | isoform_exp.diff) 183 | -s SHORTOUTPUTNAME, --shortOutputName SHORTOUTPUTNAME 184 | Use this option to select a short outpput name for 185 | each *.diff file used in '-f'. No '.' (dots) allowed. 186 | (default: geneexp prom splic cds iso) 187 | --sigOnly Only create report tables for cuffdiff-labeled 188 | significantly changed genes (default: False) 189 | --TSV For p values > = 0.05 write tables as tab separated 190 | values (default: False) 191 | --TSVall Save p < 0.05 save tables as tab separated values in a 192 | folder called TSV (default: False) 193 | --description Get a description of what this script does. (default: 194 | False) 195 | --listMarts List biomaRt Marts (default: False) 196 | --mart MART Your mart of choice. (default: ENSEMBL_MART_ENSEMBL) 197 | --listDatasets List datasets for your mart (default: False) 198 | --dataset DATASET Dataset of your choice. (default: 199 | celegans_gene_ensembl) 200 | --listFilters List available filters (default: False) 201 | --filter FILTER Filter to use to identify your genes. (default: 202 | ensembl_gene_id) 203 | --listAttributes List available attributes for your dataset. (default: 204 | False) 205 | --outputBiotypes OUTPUTBIOTYPES 206 | Outputs/attributes for your biotypes data. Order has 207 | to be kept, ie. first IDs then biotype. (default: 208 | ensembl_gene_id gene_biotype) 209 | --outputGoterms OUTPUTGOTERMS 210 | Outputs/attributes for your goterms data. Order has to 211 | be kept, ie. 1st gene_id, then go_id, then 212 | go_term_name (default: ensembl_gene_id go_id 213 | name_1006) 214 | --KEGG Add KEGG annotations (default: False) 215 | --listKEGGorganisms List KEGG organisms. (default: False) 216 | --KEGGorg KEGGORG KEGG organism. (default: cel) 217 | --findKEGGdb KEGG has DB identifier for each linked DB. 
Use this 218 | function to find the label of your DB, eg: 'ensembl- 219 | hsa', 'FlyBase'. This option requires --originalGTF 220 | and --KEGGorg (default: False) 221 | --KEGGdb KEGGDB KEGG database linked to your ensembl organism. 222 | (default: EnsemblGenomes-Gn) 223 | --DAVIDid DAVIDID DAVID's id for your dataset. List of ids available in 224 | http://david.abcc.ncifcrf.gov/content.jsp?file=DAVID_A 225 | PI.html#input_list (default: WORMBASE_GENE_ID) 226 | --DAVIDcat DAVIDCAT DAVID's categories you wish to analyse. List of 227 | available categories in https://david.ncifcrf.gov/cont 228 | ent.jsp?file=DAVID_API.html#approved_list. (default: G 229 | OTERM_BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY, 230 | PFAM,PROSITE,GENETIC_ASSOCIATION_DB_DISEASE,OMIM_DISEA 231 | SE) 232 | -u DAVIDUSER, --DAVIDuser DAVIDUSER 233 | Your DAVID's user id. example: 'John.Doe@age.mpg.de' 234 | (default: None) 235 | --host HOST Ensembl host. Check http://www.ensembl.org/info/websit 236 | e/archives/index.html for older releases. (default: 237 | http://www.ensembl.org/biomart) 238 | --organismtag {DMEL,CEL,MUS,HSA} 239 | Organism tag. (default: None) 240 | --species SPECIES Species for string app query. eg. 'caenorhabditis 241 | elegans', 'drosophila melanogaster', 'mus musculus', 242 | 'homo sapiens'. Default='caenorhabditis elegans' 243 | (default: caenorhabditis elegans) 244 | --limit LIMIT Limit for string app query. Number of extra genes to 245 | recover. If None, limit=N(query_genes)*.25 (default: 246 | None) 247 | --cuttoff CUTTOFF Confidence cuttoff for sting app query. Default=0.4 248 | (default: 0.4) 249 | --taxon TAXON Taxon id for string app query. For the species shown 250 | above, taxon id will be automatically identified. 251 | (default: None) 252 | --cytoscape_host CYTOSCAPE_HOST 253 | Host address for cytoscape. (default: None) 254 | --cytoscape_port CYTOSCAPE_PORT 255 | Cytoscape port. (default: None) 256 | ``` 257 | -------------------------------------------------------------------------------- /docs/executables/blasto.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | This module will load a fasta formatted file and query each fasta sequence for blast. 4 | The user may add blast parameters as space separated list after the sequence name. All queries are 5 | listed into a log table. 
The user can either let the program running while waiting for the results 6 | using the -C option, or quit and check if the results are ready later using -W -t 7 | 8 | ## Examples 9 | 10 | ```bash 11 | $cat input.fa 12 | 13 | >sequence1 14 | GCGAAGCCCAAGAGGATGAAGCCAGAGATGGTGTTGGAGTTGCTGGGGCTGCTGAGGGTATTGATCTGTCTGTGACCTGCGATAGCATCAGAAGTTGTTTCACATTCTAGTTATAGCTGAGGGAGGTTATGTTTTGAGCAAGCAGGAAAC 15 | >Sequence2 16 | AGCTCCTGAGAAACTTGGGGGGCGCGACACAGATAGGGTGAAAGCAGAGTGATAGACCTGGGATGGTTACGGGACCAAGGGAAGACCAGGCTGGTTGGCATACACCGGTGAACGGATGGGAGTCCTAGGGAAAGATGATGCGCCTAACAG 17 | >sequence2_filtered database='nt' filter="T" nucl_penalty=-5 gapcosts='1,11' 18 | AGCTCCTGAGAAACTTGGGGGGCGCGACACAGATAGGGTGAAAGCAGAGTGATAGACCTGGGATGGTTACGGGACCAAGGGAAGACCAGGCTGGTTGGCATACACCGGTGAACGGATGGGAGTCCTAGGGAAAGATGATGCGCCTAACAG 19 | >sequence3 20 | TCGTTTGATTCTGCAAGCAGCACCTACTGTGGGGTATTGATAAGATCTCTGATGGCGTCTGAAATTCTTCTGAGATTAGAGGAAGATCAGGTGTGTTTTAATGTCGAGCAGGTGTTTCCCCAAGATTAGTGGGGGGATTCGGTTTTTCCT 21 | 22 | $blasto -S -f /usr/home/JDoe/project1/input.fa -o /usr/home/JDoe/project1/run1 23 | $blasto -W -t /usr/home/JDoe/project1/run1.queryTable.tsv -o /usr/home/JDoe/project1/run1 24 | ``` 25 | 26 | ## Help 27 | ```bash 28 | 29 | usage: blasto [-h] [-S] [-C] [-W] [-f INPUTFASTA] [-t INPUTTSV] 30 | [-o OUTPUTPREFIX] [--format_type FORMAT_TYPE] 31 | [--sleepTime SLEEPTIME] [--description] 32 | 33 | This module will load a fasta formatted file and query each fasta sequence for 34 | blast The user may add blast parameters as space separated list after the 35 | sequence name. All queries are listed into a log table. The user can either 36 | let the program running while waiting for the results using the -C option, or 37 | quit and check if the results are ready later using -W -t 38 | 39 | optional arguments: 40 | -h, --help show this help message and exit 41 | -S, --submitFromFasta 42 | Read in fasta file and submit blast queries. Write out 43 | submitted query IDs. (default: False) 44 | -C, --continueThrough 45 | Read from fasta file, submit and continue checking. 46 | Write results when they are ready and exit after all 47 | results are finished. (default: False) 48 | -W, --checkAndWriteResults 49 | Read query IDs from tsv and check status. If results 50 | are ready, collect and safe. (default: False) 51 | -f INPUTFASTA, --inputFasta INPUTFASTA 52 | Fasta formatted input file containing one or more 53 | input sequences. The sequence name may contain 54 | additional blast paramers, (default: ) 55 | -t INPUTTSV, --inputTsv INPUTTSV 56 | Tab separated input file containing sequence IDs, 57 | output prefix, query IDs, query arguments. (default: ) 58 | -o OUTPUTPREFIX, --outputPrefix OUTPUTPREFIX 59 | Output prefix. All files will start with this prefix, 60 | blast output files will be written two 61 | _. (default: ) 62 | --format_type FORMAT_TYPE 63 | format of the blast output (default: Tabular) 64 | --sleepTime SLEEPTIME 65 | time to wait before checking again if your jobs are 66 | done, only active if -C is on (default: 60) 67 | --description Get a description of what this script does. (default: 68 | False) 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/executables/david.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | Queries the DAVID database for an enrichment analysis and plots CellPlots as well as SymPlots (see plots). 
4 | 5 | ## Examples 6 | 7 | ```bash 8 | $ cat input.tsv 9 | 10 | ensembl_gene_id log2(fold_change) 11 | ENSG00000272449 1.859500 12 | ENSG00000130762 0.601051 13 | ENSG00000083444 -0.881957 14 | ENSG00000162493 -0.638433 15 | ENSG00000253368 0.654517 16 | 17 | $ david -i input.tsv -o /usr/home/JDoe/project1/datasetA -d ENSEMBL_GENE_ID -u 'email.registered@david.com' 18 | ``` 19 | 20 | ## Help 21 | 22 | ```bash 23 | $ david --help 24 | 25 | usage: david [-h] [-i INPUT] [-o OUTPUT] [-d DATABASE] [-c CATEGORIES] 26 | [-u USER] [-v] [-p PVALUE] [-n NGENES] [-b BACKGROUND] 27 | 28 | Queries the DAVID database for an enrichment analysis and plots CellPlots as 29 | well as SymPlots (see plots). Check 30 | https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == 31 | 'type' tag and categories == 'annot' tag. 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -i INPUT, --input INPUT 36 | A file with tab separated values where the first 37 | column contains the identifiers to be queried and the 38 | second column the respective log2fc for each 39 | identifier. (default: None) 40 | -o OUTPUT, --output OUTPUT 41 | /path/to/output/prefix (default: None) 42 | -d DATABASE, --database DATABASE 43 | a string for the database to query, e.g. 44 | 'WORMBASE_GENE_ID'. (default: None) 45 | -c CATEGORIES, --categories CATEGORIES 46 | a comma separated list of categories. (default: GOTERM 47 | _BP_FAT,GOTERM_CC_FAT,GOTERM_MF_FAT,KEGG_PATHWAY,BIOCA 48 | RTA,PFAM,PROSITE) 49 | -u USER, --user USER a user ID registered at DAVID for querying (default: 50 | None) 51 | -v, --verbose Print more. (default: None) 52 | -p PVALUE, --pvalue PVALUE 53 | Maximum p value for enrichment of a term. (default: 54 | 0.1) 55 | -n NGENES, --ngenes NGENES 56 | Minimum number of genes within a term. (default: 2) 57 | -b BACKGROUND, --background BACKGROUND 58 | A file with tab separated values where the first 59 | column contains the identifiers to used as a 60 | background. None for whole DAVID database as 61 | background. (default: None) 62 | ``` 63 | -------------------------------------------------------------------------------- /docs/executables/obo2tsv.md: -------------------------------------------------------------------------------- 1 | ## Intro 2 | 3 | `obo2tsv` parses a gene ontology obo file to tsv. It will include for each term columns for parent terms as well as child terms. 4 | 5 | ## Examples 6 | 7 | ``` 8 | $ obo2tsv -u http://geneontology.org/ontology/go-basic.obo \ 9 | -o go-basic.tsv -c 4 \ 10 | --organism http://geneontology.org/gene-associations/gene_association.fb.gz 11 | ``` 12 | 13 | Links to other `--organism` can be found on [http://geneontology.org/page/download-annotations](http://geneontology.org/page/download-annotations). 14 | 15 | ## Help 16 | 17 | ``` 18 | $ obo2tsv --help 19 | 20 | usage: obo2tsv [-h] [-i INPUT] [-u URL] [-o OUTPUT] [-c CPUS] 21 | [--organism ORGANISM] 22 | 23 | obo to tsv parser 24 | 25 | optional arguments: 26 | -h, --help show this help message and exit 27 | -i INPUT, --input INPUT 28 | go-basic.obo file. Files can be downloaded from 29 | http://geneontology.org/page/download-ontology. 30 | (default: None) 31 | -u URL, --url URL If no go-basic.obo input file is specified, a url to a 32 | target obo file can be specified instead. (default: 33 | http://geneontology.org/ontology/go-basic.obo) 34 | -o OUTPUT, --output OUTPUT 35 | Name of output tab separated file. (default: go- 36 | basic.tsv) 37 | -c CPUS, --cpus CPUS Number of cpus. 
(default: 36) 38 | --organism ORGANISM Optional, merge GO obo.tsv with a GO annotation for an 39 | organism: either a link to a file on geneontology.org 40 | eg. http://geneontology.org/gene- 41 | associations/gene_association.fb.gz or the path for 42 | the respective downloded .gz file. (default: None) 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ## AGEpy [![Build Status](https://travis-ci.org/mpg-age-bioinformatics/AGEpy.svg?branch=master)](https://travis-ci.org/mpg-age-bioinformatics/AGEpy) [![PyPI version](https://badge.fury.io/py/AGEpy.svg)](https://badge.fury.io/py/AGEpy) ![ReadtheDocs](https://readthedocs.org/projects/agepy/badge/?version=latest) 2 | 3 | This python package contains Bioinformatics tools developed at the 4 | Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing. 5 | 6 | > Max Planck Institute for Biology of Ageing 7 | > Joseph-Stelzmann-Str. 9b 8 | > D-50931 Cologne 9 | > Germany 10 | 11 | [https://bioinformatics.age.mpg.de](https://bioinformatics.age.mpg.de) 12 | 13 | ### Installation 14 | 15 | ###### pip 16 | 17 | Latest pip release: 18 | 19 | ```bash 20 | pip install AGEpy --user 21 | ``` 22 | 23 | ###### github 24 | 25 | Get the latest development version from github: 26 | 27 | ```bash 28 | git clone https://github.com/mpg-age-bioinformatics/AGEpy 29 | ``` 30 | 31 | Install: 32 | 33 | ```bash 34 | cd AGEpy 35 | python setup.py install --user 36 | ``` 37 | 38 | and then update to the latest release whenever required with: 39 | 40 | ```bash 41 | cd AGEpy 42 | git pull 43 | python setup.py install --user --force 44 | ``` 45 | 46 | Alternatively you can also install the package with a symlink, so that changes 47 | to the source files will be immediately available to users of the package on 48 | your system: 49 | 50 | ```bash 51 | cd AGEpy 52 | python setup.py develop --user 53 | ``` 54 | 55 | Be aware that with the develop option you won't be able to properly update once new scripts are added. 56 | -------------------------------------------------------------------------------- /docs/modules/MA1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/MA1.png -------------------------------------------------------------------------------- /docs/modules/MA2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/MA2.png -------------------------------------------------------------------------------- /docs/modules/MA3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/MA3.png -------------------------------------------------------------------------------- /docs/modules/bed.md: -------------------------------------------------------------------------------- 1 | ## ___GetBEDnarrowPeakgz___ 2 | 3 | Reads a gz compressed BED narrow peak file from a web address or local file and returns a pandas dataframe. 4 | 5 | **`GetBEDnarrowPeakgz(URL_or_PATH_TO_file)`** 6 | 7 | * **`URL_or_PATH_TO_file`** source of input bed. Either a web link or a path to a local file. 
8 | 9 | * **`returns`** a pandas dataframe of the inpud bed. 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | 14 | >>> eCLIP_1_bednarrowPeak="https://www.encodeproject.org/files/ENCFF066PCT/@@download/ENCFF066PCT.bed.gz" 15 | >>> bed=age.GetBEDnarrowPeakgz(eCLIP_1_bednarrowPeak) 16 | >>> print bed.head() 17 | 18 | chrom chromStart chromEnd name score strand signalValue \ 19 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 20 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 21 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 22 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 23 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 24 | 25 | -log10(pValue) -log10(qvalue) peak 26 | 0 48.9834262537309 -1 -1 27 | 1 48.7463712698062 -1 -1 28 | 2 42.6519289009201 -1 -1 29 | 3 37.7848384917051 -1 -1 30 | 4 34.0756845242392 -1 -1 31 | ``` 32 | ___ 33 | 34 | ## ___writeBED___ 35 | 36 | Writes a bed dataframe into a bed file. 37 | 38 | **`writeBED(inBED, file_path)`** 39 | 40 | * **`inBED`** a pandas dataframe with the contents of the bed file to be written. 41 | * **`file_path`** path to target file. 42 | 43 | * **`returns`** nothing. 44 | 45 | ```python 46 | >>> import AGEpy as age 47 | >>> print bed.head() 48 | 49 | chrom chromStart chromEnd name score strand signalValue \ 50 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 51 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 52 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 53 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 54 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 55 | 56 | -log10(pValue) -log10(qvalue) peak 57 | 0 48.9834262537309 -1 -1 58 | 1 48.7463712698062 -1 -1 59 | 2 42.6519289009201 -1 -1 60 | 3 37.7848384917051 -1 -1 61 | 4 34.0756845242392 -1 -1 62 | 63 | >>> age.writeBED(bed,"/path/to/file.bed") 64 | ``` 65 | ___ 66 | 67 | ## ___dfTObedtool___ 68 | 69 | Transforms a pandas dataframe into a bedtool. Requires `bedtools` to be in your `path`. 70 | 71 | **`dfTObedtool(df)`** 72 | 73 | * **`df`** a pandas dataframe. 74 | * **`returns`** a bedtool. 
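A minimal, self-contained sketch of the conversion (toy intervals and hypothetical names, only for illustration; assumes `pybedtools` and `bedtools` are available as noted above). The fuller example below reuses the eCLIP peaks downloaded in the `GetBEDnarrowPeakgz` section:

```python
>>> # toy dataframe following the bed column layout used throughout this module
>>> import pandas as pd
>>> import AGEpy as age
>>> toy=pd.DataFrame([["chr1",100,200,"region_1",0,"+"],
...                   ["chr1",300,400,"region_2",0,"-"]],
...                  columns=["chrom","chromStart","chromEnd","name","score","strand"])
>>> toy_bt=age.dfTObedtool(toy)
```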
75 | 76 | ```python 77 | >>> import AGEpy as age 78 | >>> print bed.head() 79 | 80 | chrom chromStart chromEnd name score strand signalValue \ 81 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 82 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 83 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 84 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 85 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 86 | 87 | -log10(pValue) -log10(qvalue) peak 88 | 0 48.9834262537309 -1 -1 89 | 1 48.7463712698062 -1 -1 90 | 2 42.6519289009201 -1 -1 91 | 3 37.7848384917051 -1 -1 92 | 4 34.0756845242392 -1 -1 93 | 94 | >>> bedtool=age.dfTObedtool(bed) 95 | >>> print bedtool.head() 96 | 97 | chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 48.9834262537309 -1 -1 98 | chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 48.7463712698062 -1 -1 99 | chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 42.6519289009201 -1 -1 100 | chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 37.7848384917051 -1 -1 101 | chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 34.0756845242392 -1 -1 102 | chr7 128761857 128761952 Peak_5 1000 + 4.02131461357736 33.9350181783027 -1 -1 103 | chr7 121296414 121296454 Peak_6 1000 + 3.50632247892067 30.2512926812531 -1 -1 104 | chr7 139368342 139368352 Peak_7 1000 + 4.41912711395099 29.6666535015756 -1 -1 105 | chr7 87155583 87155635 Peak_8 1000 + 4.08769554637519 29.3752024210392 -1 -1 106 | chr7 105540000 105540028 Peak_9 1000 + 4.2212263105571 29.0451450847765 -1 -1 107 | 108 | >>> print type(bed) 109 | 110 | 111 | 112 | >>> print type(bedtool) 113 | 114 | 115 | ``` 116 | 117 | ___ 118 | 119 | ## ___GetPeaksExons___ 120 | 121 | Annotates a bedtool, BED narrow peak. 122 | 123 | **`GetPeaksExons(bed,parsedGTF)`** 124 | 125 | * **`bed`** a pandas dataframe in bed format 126 | * **`parsedGTF`** a parsed GTF file as outputed by parseGTF() 127 | 128 | * **`returns`** a Pandas dataframe 129 | 130 | ```python 131 | >>> import AGEpy as age 132 | >>> print bed.head() 133 | 134 | chrom chromStart chromEnd name score strand signalValue \ 135 | 0 chr7 139371278 139371296 Peak_0 1000 + 5.09062636514014 136 | 1 chr7 139371257 139371278 Peak_1 1000 + 5.0840236303159 137 | 2 chr7 155781335 155781431 Peak_2 1000 + 3.70481328524336 138 | 3 chr7 87156569 87156676 Peak_3 1000 + 3.95023151551588 139 | 4 chr7 105073472 105073521 Peak_4 1000 + 4.14556204062503 140 | 141 | -log10(pValue) -log10(qvalue) peak 142 | 0 48.9834262537309 -1 -1 143 | 1 48.7463712698062 -1 -1 144 | 2 42.6519289009201 -1 -1 145 | 3 37.7848384917051 -1 -1 146 | 4 34.0756845242392 -1 -1 147 | 148 | >>> GTF=age.readGTF("/beegfs/group_bit/data/projects/departments/Bioinformatics/bit_RNAseq_eCLIP/downloads/gencode.v24.primary_assembly.annotation.gtf") 149 | >>> print GTF.head() 150 | 151 | seqname source feature start end score strand frame \ 152 | 0 chr1 HAVANA gene 11869 14409 . + . 153 | 1 chr1 HAVANA transcript 11869 14409 . + . 154 | 2 chr1 HAVANA exon 11869 12227 . + . 155 | 3 chr1 HAVANA exon 12613 12721 . + . 156 | 4 chr1 HAVANA exon 13221 14409 . + . 157 | 158 | attribute 159 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 160 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 161 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 162 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 163 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." 
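>>> # GetPeaksExons expects the GTF attributes already expanded into columns
>>> # (exon_id, transcript_id, gene_id), which is what parseGTF (see the gtf module) does next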
164 | 165 | >>> GTFpa=age.parseGTF(GTF) 166 | >>> print GTFpa.head() 167 | 168 | seqname source feature start end score strand frame gene_status \ 169 | 0 chr1 HAVANA gene 11869 14409 . + . KNOWN 170 | 1 chr1 HAVANA transcript 11869 14409 . + . KNOWN 171 | 2 chr1 HAVANA exon 11869 12227 . + . KNOWN 172 | 3 chr1 HAVANA exon 12613 12721 . + . KNOWN 173 | 4 chr1 HAVANA exon 13221 14409 . + . KNOWN 174 | 175 | havana_gene ... exon_id transcript_id \ 176 | 0 OTTHUMG00000000961.2 ... NaN NaN 177 | 1 OTTHUMG00000000961.2 ... NaN ENST00000456328.2 178 | 2 OTTHUMG00000000961.2 ... ENSE00002234944.1 ENST00000456328.2 179 | 3 OTTHUMG00000000961.2 ... ENSE00003582793.1 ENST00000456328.2 180 | 4 OTTHUMG00000000961.2 ... ENSE00002312635.1 ENST00000456328.2 181 | 182 | exon_number ont havana_transcript ccdsid transcript_name \ 183 | 0 NaN NaN NaN NaN NaN 184 | 1 NaN NaN OTTHUMT00000362751.1 NaN DDX11L1-002 185 | 2 1 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 186 | 3 2 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 187 | 4 3 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 188 | 189 | gene_type transcript_status gene_name 190 | 0 transcribed_unprocessed_pseudogene NaN DDX11L1 191 | 1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 192 | 2 transcribed_unprocessed_pseudogene KNOWN DDX11L1 193 | 3 transcribed_unprocessed_pseudogene KNOWN DDX11L1 194 | 4 transcribed_unprocessed_pseudogene KNOWN DDX11L1 195 | 196 | >>> bedAn=age.GetPeaksExons(bed,GTFpa) 197 | >>> print bedAn.head() 198 | 199 | chrom chromStart chromEnd name score strand signalValue \ 200 | 0 chr7 155781335 155781431 Peak_2 1000 + 3.704813 201 | 1 chr7 155781335 155781431 Peak_2 1000 + 3.704813 202 | 2 chr7 121296414 121296454 Peak_6 1000 + 3.506322 203 | 3 chr7 87155538 87155583 Peak_16 1000 + 4.077391 204 | 4 chr7 107904733 107904812 Peak_17 1000 + 3.674368 205 | 206 | -log10(pValue) -log10(qvalue) peak ... \ 207 | 0 42.651929 -1 -1 ... 208 | 1 42.651929 -1 -1 ... 209 | 2 30.251293 -1 -1 ... 210 | 3 22.798739 -1 -1 ... 211 | 4 21.118496 -1 -1 ... 212 | 213 | gene_id exon_id_count exon_id norm. mean -log10(pValue) \ 214 | 0 ENSG00000184863.10 1 42.651929 215 | 1 ENSG00000184863.10 1 42.651929 216 | 2 ENSG00000106034.17 1 30.251293 217 | 3 ENSG00000135164.18 3 2951.868281 218 | 4 ENSG00000091140.12 1 21.118496 219 | 220 | exon_id signalValue transcript_id_count \ 221 | 0 3.704813 1 222 | 1 3.704813 1 223 | 2 3.506322 1 224 | 3 42.703999 3 225 | 4 3.674368 1 226 | 227 | transcript_id norm. mean -log10(pValue) transcript_id signalValue \ 228 | 0 42.651929 3.704813 229 | 1 42.651929 3.704813 230 | 2 30.251293 3.506322 231 | 3 2951.868281 42.703999 232 | 4 21.118496 3.674368 233 | 234 | gene_id_count gene_id norm. mean -log10(pValue) gene_id signalValue 235 | 0 4 116.619012 17.830941 236 | 1 4 116.619012 17.830941 237 | 2 2 30.251293 2.144090 238 | 3 8 3300.707425 73.902289 239 | 4 5 135.139064 22.210269 240 | ``` 241 | **gene_id_count**: number of intervals overlapping this gene 242 | 243 | **transcript_id_count**: number of intervals overlapping this transcript 244 | 245 | **exon_id_count**: number of intervals overlapping this exon 246 | ___ 247 | 248 | ## ___AnnotateBED___ 249 | 250 | Annotates a bedtool, BED narrow peak. 251 | 252 | **`AnnotateBED(bed,GTF, genome_file, bedcols=None, promoter=[1000,200])`** 253 | 254 | * **`bed`** either a /path/to/file.bed or a Pandas dataframe in bed format. /path/to/file.bed implies bedcols. 
255 | * **`GTF`** /path/to/file.gtf 256 | * **`genome_file`** /path/to/file.genome - a tab separated values of chr name and size information 257 | * **`bedcols`** a comma separated string of column headers to use when reading in a bed file. eg: "chr,start,end,name" 258 | * **`promoter`** a list containing the upstream start of the promoter region from the TSS and the downstream end of the promoter region from the TSS. 259 | 260 | * **`returns`** a Pandas dataframe with the annotated bed file. exons and promoters will be reported as well in the annotated_gene_features column. 261 | 262 | ```python 263 | ```python 264 | >>> import AGEpy as age 265 | >>> print bed.head() 266 | 267 | chr start end name signal value strand fold change \ 268 | 0 2 175167300 175167740 chip1_peak_2 58.993528 . 2.965444 269 | 1 2 27052080 27052410 chip1_peak_3 154.897096 . 7.786255 270 | 2 1 243719300 243719630 chip1_peak_4 99.776458 . 5.015490 271 | 3 17 2564650 2564980 chip1_peak_5 72.892502 . 3.664107 272 | 4 7 44999240 44999570 chip1_peak_6 106.434435 . 5.350169 273 | 274 | p-value Benjamini-Hochberg FDR enriched in marker 275 | 0 5.747544e-15 4.044835e-08 control K27AC 276 | 1 2.197614e-14 8.934691e-08 control K27AC 277 | 2 2.915657e-14 8.934691e-08 control K27AC 278 | 3 3.173957e-14 8.934691e-08 control K27AC 279 | 4 3.871249e-14 9.081308e-08 control K27AC 280 | 281 | >>> bed=AnnotateBED(bed,"hg38.83.gtf","hg38.83.genome") 282 | >>> print bed.head() 283 | 284 | chr start end name signal value strand fold change \ 285 | 0 1 789880 791350 chip2_peak_44728 172.757426 . 8.473977 286 | 1 1 820750 822710 chip1_peak_22461 148.812870 . 11.672676 287 | 2 1 905550 905850 chip1_peak_1792 289.437404 . 13.231699 288 | 3 1 913500 913800 chip1_peak_4243 43.508330 . 1.988994 289 | 4 1 960150 960450 chip1_peak_1666 67.008675 . 3.063317 290 | 291 | p-value Benjamini-Hochberg FDR enriched in marker \ 292 | 0 6.043877e-06 0.000314 stretch H3K9 293 | 1 5.292319e-07 0.000057 control H3K9 294 | 2 9.544848e-07 0.004798 control H3K27 295 | 3 4.932846e-06 0.010117 control H3K27 296 | 4 8.347840e-07 0.004535 control H3K27 297 | 298 | annotated_gene_features 299 | 0 RP5-857K21.4; RP11-206L10.9 300 | 1 RP5-857K21.4 301 | 2 RP11-54O7.16 302 | 3 RP11-54O7.1: exon; RP11-54O7.2: promoter; RP11... 303 | 4 NOC2L: promoter; KLHL17: promoter 304 | ``` 305 | ___ 306 | -------------------------------------------------------------------------------- /docs/modules/biom.md: -------------------------------------------------------------------------------- 1 | ## ___datasetsBM___ 2 | 3 | Lists BioMart datasets. 
4 | 5 | **`datasetsBM(host=biomart_host)`** 6 | 7 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 8 | * **`returns`** nothing 9 | 10 | ```python 11 | >>> import AGEpy as age 12 | >>> age.datasetsBM() 13 | 14 | u'acarolinensis_gene_ensembl' Anole lizard genes (AnoCar2.0), 15 | u'acarolinensis_genomic_sequence' Anole lizard sequences (AnoCar2.0), 16 | u'amelanoleuca_gene_ensembl' Panda genes (ailMel1), 17 | u'amelanoleuca_genomic_sequence' Panda sequences (ailMel1), 18 | u'amexicanus_gene_ensembl' Cave fish genes (AstMex102), 19 | u'amexicanus_genomic_sequence' Cave fish sequences (AstMex102), 20 | u'anancymaae_gene_ensembl' Ma's night monkey genes (Anan_2.0), 21 | u'anancymaae_genomic_sequence' Ma's night monkey sequences (Anan_2.0), 22 | u'aplatyrhynchos_gene_ensembl' Duck genes (BGI_duck_1.0), 23 | u'aplatyrhynchos_genomic_sequence' Duck sequences (BGI_duck_1.0), 24 | u'btaurus_gene_ensembl' Cow genes (UMD3.1), 25 | u'btaurus_genomic_sequence' Cow sequences (UMD3.1), 26 | u'btaurus_marker_end' marker_feature_end, 27 | u'btaurus_marker_start' marker_feature, 28 | u'btaurus_qtl_feature' qtl_feature, 29 | . 30 | . 31 | . 32 | ``` 33 | ___ 34 | 35 | ## ___filtersBM___ 36 | 37 | Lists BioMart filters for a specific dataset. 38 | 39 | **`filtersBM(dataset,host=biomart_host)`** 40 | 41 | * **`dataset`** dataset to list filters of 42 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 43 | 44 | * **`returns`** nothing 45 | 46 | ```python 47 | >>> import AGEpy as age 48 | >>> age.filtersBM('hsapiens_gene_ensembl') 49 | 50 | u'affy_hc_g110' 'AFFY HC G110 probe ID(s) [e.g. 266_s_at]' (type id_list, values []), 51 | u'affy_hg_focus' 'AFFY HG Focus probe ID(s) [e.g. 212481_s_at]' (type id_list, values []), 52 | u'affy_hg_u133_plus_2' 'AFFY HG U133 Plus 2 probe ID(s) [e.g. 1553551_s_at]' (type id_list, values []), 53 | u'affy_hg_u133a' 'AFFY HG U133A probe ID(s) [e.g. 211600_at]' (type id_list, values []), 54 | u'affy_hg_u133a_2' 'AFFY HG U133A 2 probe ID(s) [e.g. 211600_at]' (type id_list, values []), 55 | u'affy_hg_u133b' 'AFFY HG U133B probe ID(s) [e.g. 224321_at]' (type id_list, values []), 56 | u'affy_hg_u95a' 'AFFY HG U95A probe ID(s) [e.g. 33866_at]' (type id_list, values []), 57 | u'affy_hg_u95av2' 'AFFY HG U95Av2 probe ID(s) [e.g. 33866_at]' (type id_list, values []), 58 | u'affy_hg_u95b' 'AFFY HG U95B probe ID(s) [e.g. 48794_s_at]' (type id_list, values []), 59 | u'affy_hg_u95c' 'AFFY HG U95C probe ID(s) [e.g. 66888_at]' (type id_list, values []), 60 | u'affy_hg_u95d' 'AFFY HG U95D probe ID(s) [e.g. 70806_at]' (type id_list, values []), 61 | u'affy_hg_u95e' 'AFFY HG U95E probe ID(s) [e.g. 88289_at]' (type id_list, values []), 62 | u'affy_hta_2_0' 'AFFY HTA 2 0 probe ID(s) [e.g. TC04001102.hg]' (type id_list, values []), 63 | u'affy_huex_1_0_st_v2' 'AFFY HuEx 1 0 st v2 probe ID(s) [e.g. 4037584]' (type id_list, values []), 64 | u'affy_hugene_1_0_st_v1' 'AFFY HuGene 1 0 st v1 probe ID(s) [e.g. 8165644]' (type id_list, values []), 65 | u'affy_hugene_2_0_st_v1' 'AFFY HuGene 2 0 st v1 probe ID(s) [e.g. 17100641]' (type id_list, values []), 66 | u'affy_hugenefl' 'AFFY HuGeneFL probe ID(s) [e.g. Z70759_at]' (type id_list, values []), 67 | u'affy_primeview' 'AFFY PrimeView probe ID(s) [e.g. 11761516_x_at]' (type id_list, values []), 68 | . 69 | . 70 | . 71 | 72 | ``` 73 | ___ 74 | 75 | ## ___attributesBM___ 76 | 77 | Lists BioMart attributes for a specific dataset. 
78 | 79 | **`attributesBM(dataset,host=biomart_host)`** 80 | 81 | * **`dataset`** dataset to list attributes of 82 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 83 | 84 | * **`returns`** nothing 85 | 86 | ```python 87 | >>> import AGEpy as age 88 | >>> age.attributesBM('hsapiens_gene_ensembl') 89 | 90 | u'3_utr_end' '3' UTR end' (default False), 91 | u'3_utr_start' '3' UTR start' (default False), 92 | u'3utr' '3' UTR' (default False), 93 | u'5_utr_end' '5' UTR end' (default False), 94 | u'5_utr_start' '5' UTR start' (default False), 95 | u'5utr' '5' UTR' (default False), 96 | u'acarolinensis_homolog_associated_gene_name' 'Anole lizard gene name' (default False), 97 | u'acarolinensis_homolog_canonical_transcript_protein' 'Query protein or transcript ID' (default False), 98 | u'acarolinensis_homolog_chrom_end' 'Anole lizard chromosome/scaffold end (bp)' (default False), 99 | u'acarolinensis_homolog_chrom_start' 'Anole lizard chromosome/scaffold start (bp)' (default False), 100 | u'acarolinensis_homolog_chromosome' 'Anole lizard chromosome/scaffold name' (default False), 101 | u'acarolinensis_homolog_dn' 'dN with Anole lizard' (default False), 102 | u'acarolinensis_homolog_ds' 'dS with Anole lizard' (default False), 103 | u'acarolinensis_homolog_ensembl_gene' 'Anole lizard gene stable ID' (default False), 104 | . 105 | . 106 | . 107 | 108 | ``` 109 | ___ 110 | 111 | ## ___queryBM___ 112 | 113 | Queries BioMart. 114 | 115 | **`queryBM(query_attributes,query_dataset,query_filter=None,query_items=None,query_dic=None,host=biomart_host)`** 116 | 117 | * **`query_attributes`** list of attributes to recover from BioMart 118 | * **`query_dataset`** dataset to query 119 | * **`query_filter`** one BioMart filter associated with the items being queried 120 | * **`query_items`** list of items to be queried (must assoiate with given filter) 121 | * **`query_querydic`** for complex queries this option should be used instead of 'filters' and 'items' and a dictionary of filters provided here eg. querydic={"filter1":["item1","item2"],"filter2":["item3","item4"]}. If using querydic, don't query more than 350 items at once. 122 | * **`host`** address of the host server, default='http://www.ensembl.org/biomart' 123 | 124 | * **`returns`** a Pandas dataframe of the queried attributes 125 | 126 | ```python 127 | >>> import AGEpy as age 128 | >>> queryDf=queryBM(query_attributes=["ensembl_gene_id","external_gene_name", \ 129 | "go_id","name_1006","definition_1006"],\ 130 | query_dataset='hsapiens_gene_ensembl') 131 | >>> print queryDf.head() 132 | 133 | ensembl_gene_id external_gene_name go_id name_1006 \ 134 | 0 ENSG00000283891 MIR628 GO:0005615 extracellular space 135 | 1 ENSG00000251931 RNU6-871P 136 | 2 ENSG00000207766 MIR626 137 | 3 ENSG00000275323 AC012314.7 GO:0003723 RNA binding 138 | 4 ENSG00000275323 AC012314.7 GO:0005634 nucleus 139 | 140 | definition_1006 141 | 0 "That part of a multicellular organism outside..." 142 | 1 143 | 2 144 | 3 "Interacting selectively and non-covalently wi..." 145 | 4 "A membrane-bounded organelle of eukaryotic ce..." 146 | ``` 147 | ___ 148 | 149 | ## ___FilterGOstring___ 150 | 151 | Filters GO terms based on given strings using ENSEMBL's biomart homology mapping. 
152 | 153 | **`FilterGOstring(names_filter=["age-", "aging", "aged", 'aging', 'aging.', 'aging,'], exclude_names=["packaging","voltage","cleavage-", "stage-1","cage-like","message-specific", "damage-associated","stage-specific","foraging", "DNA-damaging","engaging","damaged","packaged"], defs_filter=[" age-", " aging", " aged", ' aging', ' aging.', ' aging,'], exclude_defs=["packaging","voltage","cleavage-", "stage-1","cage-like","message-specific", "damage-associated","stage-specific","foraging", "DNA-damaging","engaging","damaged","packaged"], host=biomart_host, HSA=None,MUS=None,CEL=None,DMEL=None)`** 154 | 155 | * **`names_filter`** list of substrings to filter GO names on. Default=["age-", "aging", "aged", 'aging', 'aging.', 'aging,'] 156 | * **`exclude_names`** list of substrings to be used for exclusion of GO names. Default=["packaging","voltage","cleavage-", 157 | "stage-1","cage-like","message-specific", 158 | "damage-associated","stage-specific","foraging", 159 | "DNA-damaging","engaging","damaged","packaged"] 160 | * **`defs_filter`** list of substrings to filter GO defenitions on. Default=[" age-", " aging", " aged", ' aging', ' aging.', ' aging,'] 161 | * **`exclude_defs`** list of substrings to be used for exclustion of GO defenitions. Default=["packaging","voltage","cleavage-", 162 | "stage-1","cage-like","message-specific", 163 | "damage-associated","stage-specific","foraging", 164 | "DNA-damaging","engaging","damaged","packaged"] 165 | * **`host`** biomart host server, default="http://www.ensembl.org/biomart" 166 | * **`HSA`** retrieved hsa dataframe 167 | * **`MUS`** retrieved mus dataframe 168 | * **`CEL`** retrieved cel dataframe 169 | * **`DMEL`** retrieved dmel dataframe 170 | 171 | * **`returns`** homology_df, HSA, MUS, CEL, DMEL 172 | 173 | ```python 174 | >>> import AGEpy as age 175 | >>> homology_df, HSA, MUS, CEL, DMEL=age.FilterGOstring() 176 | >>> print homology_df.head() 177 | 178 | HSA_ensembl_gene_id HSA_external_gene_name \ 179 | 0 ENSG00000000003 TSPAN6 180 | 1 ENSG00000000005 TNMD 181 | 2 ENSG00000000460 C1orf112 182 | 3 ENSG00000000971 CFH 183 | 4 ENSG00000002079 MYH16 184 | 185 | HSA_go_id \ 186 | 0 GO:0039532, , GO:0070062, GO:0016021, GO:00160... 187 | 1 GO:0005737, , GO:0016020, GO:0035990, GO:00717... 188 | 2 NaN 189 | 3 , GO:0030449, GO:0070062, GO:0045087, GO:00725... 190 | 4 NaN 191 | 192 | HSA_name_1006 \ 193 | 0 , negative regulation of NIK/NF-kappaB signali... 194 | 1 , nuclear envelope, cytoplasm, negative regula... 195 | 2 NaN 196 | 3 , innate immune response, heparan sulfate prot... 197 | 4 NaN 198 | 199 | HSA_definition_1006 MUS_ensembl_gene_id \ 200 | 0 "The component of a membrane consisting of the..." ENSMUSG00000067377 201 | 1 "The component of a membrane consisting of the..." ENSMUSG00000031250 202 | 2 NaN ENSMUSG00000041406 203 | 3 "Interacting selectively and non-covalently wi..." NaN 204 | 4 NaN NaN 205 | 206 | CEL_ensembl_gene_id DMEL_ensembl_gene_id MUS_external_gene_name \ 207 | 0 NaN NaN Tspan6 208 | 1 NaN NaN Tnmd 209 | 2 NaN NaN BC055324 210 | 3 NaN NaN None 211 | 4 NaN NaN None 212 | 213 | MUS_go_id ... \ 214 | 0 GO:0039532, , GO:0070062, GO:0016021, GO:00160... ... 215 | 1 GO:0016020, GO:0035990, GO:0071773, GO:0016021... ... 216 | 2 GO:0005575, GO:0008150, GO:0003674, ... 217 | 3 None ... 218 | 4 None ... 219 | 220 | MUS_definition_1006 CEL_external_gene_name \ 221 | 0 "The component of a membrane consisting of the..." None 222 | 1 "The component of a membrane consisting of the..." 
None 223 | 2 "Elemental activities, such as catalysis or bi..." None 224 | 3 None None 225 | 4 None None 226 | 227 | CEL_go_id CEL_name_1006 CEL_definition_1006 DMEL_external_gene_name \ 228 | 0 None None None None 229 | 1 None None None None 230 | 2 None None None None 231 | 3 None None None None 232 | 4 None None None None 233 | 234 | DMEL_go_id DMEL_name_1006 DMEL_definition_1006 evidence 235 | 0 None None None NaN 236 | 1 None None None NaN 237 | 2 None None None NaN 238 | 3 None None None NaN 239 | 4 None None None NaN 240 | 241 | ``` 242 | 243 | **evidence** indicates from which organisms there is evidence of the intended string 244 | -------------------------------------------------------------------------------- /docs/modules/blast.md: -------------------------------------------------------------------------------- 1 | ## ___BLASTquery___ 2 | 3 | Performs a blast query online. As in https://ncbi.github.io/blast-cloud/ 4 | 5 | **`BLASTquery(query,database,program,filter=None, format_type=None, expect=None, nucl_reward=None, nucl_penalty=None, gapcosts=None, matrix=None, hitlist_size=None, descriptions=None, alignments=None, ncbi_gi=None, threshold=None, word_size=None, composition_based_statistics=None, organism=None, others=None, num_threads=None, baseURL="http://blast.ncbi.nlm.nih.gov", verbose=False)`** 6 | 7 | * **`query`** Search query. Allowed values: Accession, GI, or FASTA. 8 | * **`database`** BLAST database. Allowed values: nt, nr, refseq_rna, refseq_protein, swissprot, pdbaa, pdbnt 9 | * **`program`** BLAST program. Allowed values: blastn, megablast, blastp, blastx, tblastn, tblastx 10 | * **`filter`** Low complexity filtering. Allowed values: F to disable. T or L to enable. Prepend “m” for mask at lookup (e.g., mL) 11 | * **`format_type`** Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. HTML is the default. 12 | * **`expect`** Expect value. Allowed values: Number greater than zero. 13 | * **`nucl_reward`** Reward for matching bases (BLASTN and megaBLAST). Allowed values: Integer greater than zero. 14 | * **`nucl_penalty`** Cost for mismatched bases (BLASTN and megaBLAST). Allowed values: Integer less than zero. 15 | * **`gapcosts`** Gap existence and extension costs. Allowed values: Pair of positive integers separated by a space such as “11 1”. 16 | * **`matrix`** Scoring matrix name. Allowed values: One of BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, PAM250, PAM30 or PAM70. Default: BLOSUM62 for all applicable programs. 17 | * **`hitlist_size`** Number of databases sequences to keep. Allowed values: Integer greater than zero. 18 | * **`descriptions`** Number of descriptions to print (applies to HTML and Text). Allowed values: Integer greater than zero. 19 | * **`alignments`** Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 20 | * **`ncbi_gi`** Show NCBI GIs in report. Allowed values: T or F. 21 | * **`threshold`** Neighboring score for initial words. Allowed values: Positive integer (BLASTP default is 11). Does not apply to BLASTN or MegaBLAST). 22 | * **`word_size`** Size of word for initial matches. Allowed values: Positive integer. 23 | * **`composition_based_statistics`** Composition based statistics algorithm to use. Allowed values: One of 0, 1, 2, or 3. See comp_based_stats command line option in the BLAST+ user manual for details. 
24 | * **`organism`** an organism as in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 25 | * **`others`** here you can add other parameters as seen in a blast bookmarked page. Define you query in https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome 26 | Once your query is defined click on "Bookmark" on right upper side of the page. You can copy fragments of the URL 27 | which define the query. Eg. For organism "Homo sapiens (taxid:9606)" you will see the string "EQ_MENU=Homo%20sapiens%20%28taxid%3A9606%29" - this is 28 | the string you can use here in others. 29 | * **`num_threads`** Number of virtual CPUs to use. Allowed values: Integer greater than zero (default is 1). Supported only on the cloud. 30 | * **`verbose`** print more 31 | 32 | * **`returns`** BLAST search request identifier 33 | 34 | ```python 35 | >>> import AGEpy as age 36 | >>> seq="CTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTAC" 37 | >>> RID=age.BLASTquery(seq,"nt","blastn") 38 | >>> print RID 39 | 40 | 4MS2JV8T014 41 | ``` 42 | ___ 43 | 44 | ## ___BLASTcheck___ 45 | 46 | Checks the status of a query. 47 | 48 | **`BLASTcheck(rid,baseURL="http://blast.ncbi.nlm.nih.gov")`** 49 | 50 | * **`rid`** BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 51 | * **`baseURL`** server url. Default=http://blast.ncbi.nlm.nih.gov 52 | 53 | * **`returns status`** status for the query. 54 | * **`returns therearehist`** yes or no for existing hits on a finished query. 55 | 56 | ```python 57 | >>> import AGEpy as age 58 | >>> status, therearehits=age.BLASTcheck(RID) 59 | 60 | RID: 4MRYDZSC014; status:READY; hits: yes 61 | 62 | >>> print status, therearehits 63 | 64 | READY yes 65 | ``` 66 | ___ 67 | 68 | ## ___BLASTresults___ 69 | 70 | Retrieves results for an RID. 71 | 72 | **`BLASTresults(rid, format_type="Tabular", hitlist_size= None, alignments=None, ncbi_gi = None, format_object=None, baseURL="http://blast.ncbi.nlm.nih.gov")`** 73 | 74 | * **`rid`** BLAST search request identifier. Allowed values: The Request ID (RID) returned when the search was submitted 75 | * **`format_type`** Report type. Allowed values: HTML, Text, XML, XML2, JSON2, or Tabular. 76 | * **`hitlist_size`** Number of databases sequences to keep. Allowed values: Integer greater than zero. 77 | * **`alignments`** Number of alignments to print (applies to HTML and Text). Allowed values: Integer greater than zero. 78 | * **`ncbi_gi`** Show NCBI GIs in report. Allowed values: T or F. 79 | * **`format_object`** Object type. Allowed values: SearchInfo (status check) or Alignment (report formatting). 80 | * **`baseURL`** server url. Default=http://blast.ncbi.nlm.nih.gov 81 | 82 | * **`returns`** the result of a BLAST query. If format_type="Tabular" it will parse the content into a Pandas dataframe. 83 | 84 | ```python 85 | >>> import AGEpy as age 86 | >>> r=age.BLASTresults(RID) 87 | >>> print r.head() 88 | 89 | query id subject ids \ 90 | 0 Query_17381 gi|1012955506|gb|JN214348.1| 91 | 1 Query_17381 gi|631786534|tpe|HG975427.1| 92 | 2 Query_17381 gi|369762889|gb|JN900492.1| 93 | 3 Query_17381 gi|371502118|ref|NM_001126118.1| 94 | 4 Query_17381 gi|371502115|ref|NM_001126112.2|;gi|454521556|... 
95 | 96 | query acc.ver subject acc.ver % identity alignment length mismatches \ 97 | 0 Query_17381 JN214348.1 100.000 1190 0 98 | 1 Query_17381 HG975427.1 100.000 1190 0 99 | 2 Query_17381 JN900492.1 100.000 1190 0 100 | 3 Query_17381 NM_001126118.1 100.000 1190 0 101 | 4 Query_17381 NM_001126112.2 100.000 1190 0 102 | 103 | gap opens q. start q. end s. start s. end evalue bit scor 104 | 0 0 1 1190 614 1803 0.0 2147 105 | 1 0 1 1190 766 1955 0.0 2147 106 | 2 0 1 1190 877 2066 0.0 2147 107 | 3 0 1 1190 888 2077 0.0 2147 108 | 4 0 1 1190 768 1957 0.0 2147 109 | ``` 110 | -------------------------------------------------------------------------------- /docs/modules/cellplot.CellPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/cellplot.CellPlot.png -------------------------------------------------------------------------------- /docs/modules/cytoscape.md: -------------------------------------------------------------------------------- 1 | ## ___checkCytoscapeVersion___ 2 | 3 | Checks cytoscape version. 4 | 5 | **`CheckResponse(r)`** 6 | 7 | * **`host`** cytoscape host address, default=cytoscape_host 8 | * **`port`** cytoscape port, defaul=cytoscape_port 9 | * **`returns`** nothing 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> age.checkCytoscapeVersion() 14 | 15 | cytoscapeVersion 3.6.0 16 | apiVersion v1 17 | ``` 18 | ___ 19 | ## ___cytoscape___ 20 | 21 | General function for interacting with Cytoscape API. 22 | 23 | **`cytoscape(namespace,command="",PARAMS={},host=cytoscape_host,port=cytoscape_port,method="POST",verbose=False)`** 24 | 25 | * **`namespace`** namespace where the request should be executed. eg. "string" 26 | * **`commnand`** command to execute. eg. "protein query" 27 | * **`PARAMs`** a dictionary with the parameters. Check your swagger normaly running on 28 | 'http://localhost:1234/v1/swaggerUI/swagger-ui/index.html?url=http://localhost:1234/v1/commands/swagger.json' 29 | * **`host`** cytoscape host address, default=cytoscape_host 30 | * **`port`** cytoscape port, default=cytoscape_port 31 | * **`method`** type of http call, ie. "POST" or "GET" or "HELP". 32 | * **`verbose`** print more information 33 | * **`returns`** For "POST" the data in the content's response. For "GET" None. 34 | 35 | ```python 36 | >>> import AGEpy as age 37 | >>> response=age.cytoscape("string","pubmed query",{"pubmed":"p53 p21","limit":"50"}) 38 | >>> print response 39 | 40 | {u'SUID': 37560} 41 | ``` 42 | ___ 43 | ## ___result___ 44 | 45 | Displays the current network. 46 | 47 | **`result(filetype="PNG", saveas=None, host=cytoscape_host, port=cytoscape_port)`** 48 | 49 | * **`filetype`** file type, default="PNG" 50 | * **`saveas`** /path/to/non/tmp/file.prefix 51 | * **`host`** cytoscape host address, default=cytoscape_host 52 | * **`port`** cytoscape port, default=cytoscape_port 53 | * **`returns`** an image 54 | 55 | ```python 56 | >>> import AGEpy as age 57 | >>> response=age.result() 58 | >>> response 59 | ``` 60 | ![cytoscape](p53.png) 61 | ___ 62 | 63 | ## ___getTableColumns___ 64 | 65 | Gets tables from cytoscape. 66 | 67 | **`getTableColumns(table, columns, namespace = "default", network = "current", host=cytoscape_host,port=cytoscape_port,verbose=False)`** 68 | 69 | * **`table`** table to retrieve eg. 
node 70 | * **`columns`** columns to retrieve in list format 71 | * **`namespace`** namepsace, default="default" 72 | * **`network`** a network name or id, default="current" 73 | * **`host`** cytoscape host address, default=cytoscape_host 74 | * **`port`** cytoscape port, default=cytoscape_port 75 | * **`verbose`** print more information 76 | * **`returns`** a pandas dataframe 77 | 78 | ```python 79 | >>> import AGEpy as age 80 | >>> response=age.getTableColumns('node',['display name']) 81 | >>> print response 82 | 83 | display name 84 | 9606.ENSP00000367207 MYC 85 | 9606.ENSP00000356150 MDM4 86 | 9606.ENSP00000228872 CDKN1B 87 | 9606.ENSP00000361021 PTEN 88 | 9606.ENSP00000265734 CDK6 89 | ``` 90 | ___ 91 | ## ___loadTableData___ 92 | 93 | Loads tables into cytoscape. 94 | 95 | **`loadTableData(df, df_key='index',table="node", table_key_column = "name", network="current", namespace="default", host=cytoscape_host, port=cytoscape_port, verbose=False)`** 96 | 97 | * **`df`** a pandas dataframe to load 98 | * **`df_key`** key column in df, defaul="index" 99 | * **`table`** target table, default="node" 100 | * **`table_key_column`** table key column, default="name" 101 | * **`network`** a network name or id, default="current" 102 | * **`host`** cytoscape host address, default=cytoscape_host 103 | * **`port`** cytoscape port, default=cytoscape_port 104 | * **`verbose`** print more information 105 | * **`returns`** output of put request 106 | 107 | ```python 108 | >>> import AGEpy as age 109 | >>> print df.head() 110 | 111 | display name 112 | 9606.ENSP00000367207 MYC 113 | 9606.ENSP00000356150 MDM4 114 | 9606.ENSP00000228872 CDKN1B 115 | 9606.ENSP00000361021 PTEN 116 | 9606.ENSP00000265734 CDK6 117 | 118 | >>> def MarkCKDs(x): 119 | ... if "CDK" in x: 120 | ... res="yes" 121 | ... else: 122 | ... res="not" 123 | ... return res 124 | >>> df["CDK"]=df["display name"].apply( lambda x: MarkCKDs(x) ) 125 | >>> print df.head() 126 | 127 | display name CDK 128 | 9606.ENSP00000367207 MYC not 129 | 9606.ENSP00000356150 MDM4 not 130 | 9606.ENSP00000228872 CDKN1B yes 131 | 9606.ENSP00000361021 PTEN not 132 | 9606.ENSP00000265734 CDK6 yes 133 | 134 | >>> response=age.loadTableData(df[["CDK"]]) 135 | ``` 136 | ___ 137 | 138 | ## ***simple_defaults*** 139 | 140 | Simplifies default layouts. 141 | 142 | **`simple_defaults(defaults_dic)`** 143 | 144 | * **`defaults_dic`** a dictionary of the form { visualProperty_A:value_A, visualProperty_B:value_B, ..} 145 | * **`returns`** a list of dictionaries with each item corresponding to a given key in defaults_dic 146 | 147 | ```python 148 | >>> import AGEpy as age 149 | >>> defaults_dic={"NODE_SHAPE":"ellipse",\ 150 | "NODE_SIZE":60,\ 151 | "NODE_FILL_COLOR":"#AAAAAA",\ 152 | "EDGE_TRANSPARENCY":120} 153 | >>> defaults_list=age.simple_defaults(defaults_dic) 154 | >>> print defaults_list 155 | 156 | [{'visualProperty': 'NODE_SIZE', 'value': 60}, \ 157 | {'visualProperty': 'NODE_FILL_COLOR', 'value': '#AAAAAA'}, \ 158 | {'visualProperty': 'NODE_SHAPE', 'value': 'ellipse'}, \ 159 | {'visualProperty': 'EDGE_TRANSPARENCY', 'value': 120}] 160 | ``` 161 | ___ 162 | 163 | ## ***create_styles*** 164 | 165 | Creates a new visual style. 
166 | 167 | **`create_styles(title,defaults=None,mappings=None,host=cytoscape_host,port=cytoscape_port)`** 168 | 169 | * **`title`** title of the visual style 170 | * **`defaults`** a list of dictionaries for each visualProperty 171 | * **`mappings`** a list of dictionaries for each visualProperty 172 | * **`host`** cytoscape host address, default=cytoscape_host 173 | * **`port`** cytoscape port, default=cytoscape_port 174 | * **`returns`** nothing 175 | 176 | ```python 177 | >>> import AGEpy as age 178 | >>> print defaults_list 179 | 180 | [{'visualProperty': 'NODE_SIZE', 'value': 60}, \ 181 | {'visualProperty': 'NODE_FILL_COLOR', 'value': '#AAAAAA'}, \ 182 | {'visualProperty': 'NODE_SHAPE', 'value': 'ellipse'}, \ 183 | {'visualProperty': 'EDGE_TRANSPARENCY', 'value': 120}] 184 | 185 | >>> response=age.create_styles("newStyle",defaults=defaults_list) 186 | ``` 187 | ___ 188 | 189 | ## ***update_style*** 190 | 191 | Updates a visual style. 192 | 193 | **`update_style(title, defaults=None, mappings=None, host=cytoscape_host, port=cytoscape_port, verbose=False)`** 194 | 195 | * **`title`** title of the visual style 196 | * **`defaults`** a list of dictionaries for each visualProperty 197 | * **`mappings`** a list of dictionaries for each visualProperty 198 | * **`host`** cytoscape host address, default=cytoscape_host 199 | * **`port`** cytoscape port, default=cytoscape_port 200 | * **`returns`** nothing 201 | 202 | ```python 203 | >>> import AGEpy as age 204 | >>> print new_defaults_list 205 | 206 | [{'visualProperty': 'NODE_SIZE', 'value': 80}, \ 207 | {'visualProperty': 'NODE_FILL_COLOR', 'value': '#AAAAAA'}, \ 208 | {'visualProperty': 'NODE_SHAPE', 'value': 'ellipse'}, \ 209 | {'visualProperty': 'EDGE_TRANSPARENCY', 'value': 120}] 210 | 211 | >>> response=age.update_style("newStyle",defaults=new_defaults_list) 212 | ``` 213 | ___ 214 | 215 | ## ***mapVisualProperty*** 216 | 217 | Generates a dictionary for a given visual property. 218 | 219 | **`mapVisualProperty(visualProperty, mappingType, mappingColumn, lower=None,center=None,upper=None, discrete=None, network="current",table="node", namespace="default", host=cytoscape_host, port=cytoscape_port, verbose=False)`** 220 | 221 | * **`visualProperty`** name of the visual property to map, eg. 'NODE_SHAPE' or 'NODE_FILL_COLOR' 222 | * **`mappingType`** type of mapping: 'passthrough', 'discrete' or 'continuous' 223 | * **`mappingColumn`** table column the mapping is based on, eg. 'display name' or 'CDK' 224 | * **`lower`** for "continuous" mappings a list of the form [value,rgb_string] 225 | * **`center`** for "continuous" mappings a list of the form [value,rgb_string] 226 | * **`upper`** for "continuous" mappings a list of the form [value,rgb_string] 227 | * **`discrete`** for discrete mappings, a list of lists of the form [ list_of_keys, list_of_values ] 228 | * **`network`** a network name or id, default="current" 229 | * **`host`** cytoscape host address, default=cytoscape_host 230 | * **`port`** cytoscape port, default=cytoscape_port 231 | * **`returns`** a dictionary for the respective visual property 232 | 233 | ```python 234 | >>> import AGEpy as age 235 | >>> import matplotlib 236 | 237 | >>> NODE_LABEL=age.mapVisualProperty("NODE_LABEL","passthrough","display name") 238 | >>> print NODE_LABEL 239 | 240 | {'mappingType': 'passthrough', 'visualProperty': 'NODE_LABEL', 'mappingColumnType': u'String', 'mappingColumn': 'display name'} 241 | 242 | >>> NODE_SHAPE=age.mapVisualProperty('NODE_SHAPE','discrete','CDK',\ 243 | discrete=[ ["yes","not"], \ 244 | ["DIAMOND", "ellipse"] ]) 245 | 246 | >>> NODE_SIZE=age.mapVisualProperty('NODE_SIZE','discrete','CDK',\ 247 | discrete=[ ["yes","not"],\ 248 | 
["100.0","60.0"] ]) 249 | 250 | # imagine you have a log2(fold_change) column in your cytoscape table 251 | >>> cmap = matplotlib.cm.get_cmap("bwr") 252 | >>> norm = matplotlib.colors.Normalize(vmin=-4, vmax=4) 253 | >>> min_color=matplotlib.colors.rgb2hex(cmap(norm(-4))) 254 | >>> center_color=matplotlib.colors.rgb2hex(cmap(norm(0))) 255 | >>> max_color=matplotlib.colors.rgb2hex(cmap(norm(4))) 256 | >>> NODE_FILL_COLOR=age.mapVisualProperty('NODE_FILL_COLOR','continuous','log2(fold_change)',\ 257 | lower=[-4,min_color],center=[0.0,center_color],upper=[4,max_color]) 258 | ``` 259 | ___ 260 | 261 | ## ***aDiffCytoscape*** 262 | 263 | Plots tables from aDiff/cuffdiff into cytoscape using String protein queries. 264 | Uses top changed genes as well as first neighbours and difusion fo generate subnetworks. 265 | 266 | **`aDiffCytoscape(df, aging_genes, target, species="caenorhabditis elegans", limit=None, cutoff=0.4, taxon=None, cytoscape_host=cytoscape_host, cytoscape_port=cytoscape_port)`** 267 | 268 | * **`df`** df as outputed by aDiff for differential gene expression 269 | * **`aging_genes`** ENS gene ids to be labeled with a diagonal 270 | * **`target`** target destination for saving files without prefix. eg. "/beegfs/group_bit/home/JBoucas/test/N2_vs_daf2" 271 | * **`species`** species for string app query. eg. "caenorhabditis elegans", "drosophila melanogaster", "mus musculus", "homo sapiens" 272 | * **`limit`** limit for string app query. Number of extra genes to recover. If None, limit=N(query_genes)*.25 273 | * **`cuttoff`** confidence cuttoff for sting app query. Default=0.4 274 | * **`taxon`** taxon id for string app query. For the species shown above, taxon id will be automatically identified 275 | * **`cytoscape_host`** host address for cytoscape, default=cytoscape_host 276 | * **`cytoscape_port`** cytoscape port, defaut=cytoscape_port 277 | * **`returns`** nothing 278 | 279 | ```python 280 | >>> import AGEpy as age 281 | >>> print genes[:10] 282 | 283 | ['WBGene00008288', 'WBGene00002169', 'WBGene00008733', 'WBGene00004178', 'WBGene00004178', 'WBGene00004179', 'WBGene00004179', 'WBGene00020581', 'WBGene00001877', 'WBGene00001881'] 284 | 285 | >>> print df.head() 286 | 287 | ensembl_gene_id gene locus sample_1 sample_2 status \ 288 | 0 WBGene00022275 Y74C9A.1 I:43732-44677 N2 daf2 OK 289 | 1 WBGene00004418 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 290 | 2 WBGene00018774 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 291 | 3 WBGene00018772 F53G12.4 I:134336-137282 N2 daf2 OK 292 | 4 WBGene00018958 F56C11.6 I:171339-175991 N2 daf2 OK 293 | 294 | value_1 value_2 log2(fold_change) test_stat p_value q_value \ 295 | 0 0.195901 0.986634 2.332390 2.32959 0.00570 0.031216 296 | 1 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 297 | 2 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 298 | 3 1.235670 2.992460 1.276040 3.16508 0.00005 0.000556 299 | 4 2.651180 3.795600 0.517696 1.73994 0.00410 0.024157 300 | 301 | significant GO_id \ 302 | 0 yes NaN 303 | 1 yes GO:0003735; GO:0000463; GO:0044822; GO:0002181... 304 | 2 yes NaN 305 | 3 yes NaN 306 | 4 yes GO:0016787; GO:0005615; GO:0004104 307 | 308 | GO_term gene_biotype \ 309 | 0 NaN protein_coding 310 | 1 structural constituent of ribosome; maturation... protein_coding 311 | 2 NaN protein_coding 312 | 3 NaN protein_coding 313 | 4 hydrolase activity; extracellular space; choli... 
protein_coding 314 | 315 | NormInt evidence 316 | 0 -0.356904 no 317 | 1 3.458609 no 318 | 2 3.458609 no 319 | 3 0.283965 no 320 | 4 0.501360 no 321 | 322 | >>> age.aDiffCytoscape(df,genes,"/u/home/JBoucas/cytoscape/cyto") 323 | ``` 324 | -------------------------------------------------------------------------------- /docs/modules/david.md: -------------------------------------------------------------------------------- 1 | ## ___DAVIDenrich___ 2 | 3 | Queries the DAVID database for an enrichment analysis. 4 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 5 | 6 | **`DAVIDenrich(database, categories, user, ids, ids_bg = None, name = '', name_bg = '', verbose = False, p = 0.1, n = 2)`** 7 | 8 | * **`database`** A string for the database to query, e.g. 'WORMBASE_GENE_ID' 9 | * **`categories`** A comma separated string with databases 10 | * **`user`** A user ID registered at DAVID for querying 11 | * **`ids`** A list with identifiers 12 | * **`name`** A string with the name for the query set 13 | * **`ids_bg`** A list with the background identifiers to enrich against, 'None' for whole set 14 | * **`name_bg`** A string with the name for the background set 15 | * **`p`** Maximum p value for enrichment of a term 16 | * **`n`** Minimum number of genes within a term 17 | * **`returns`** None if no ids match the queried database, or a pandas dataframe with results 18 | 19 | ```python 20 | >>> import AGEpy as age 21 | >>> print sigGenes[:10] 22 | 23 | [u'WBGene00022275', u'WBGene00004418', u'WBGene00018774', 24 | u'WBGene00018772', u'WBGene00018958', u'WBGene00021662', 25 | u'WBGene00255594', u'WBGene00021658', u'WBGene00021026', 26 | u'WBGene00022042'] 27 | 28 | >>> categories=['GOTERM_BP_FAT', 'GOTERM_CC_FAT', 'GOTERM_MF_FAT', 'KEGG_PATHWAY','BIOCARTA', 'PFAM', 'PROSITE' ] 29 | >>> DAVIDdf=age.DAVIDenrich('WORMBASE_GENE_ID', categories, 'email.registered@david.com', sigGenes) 30 | >>> print DAVIDdf.head() 31 | 32 | categoryName termName listHits \ 33 | 0 GOTERM_BP_FAT GO:0006412~translation 177 34 | 1 GOTERM_BP_FAT GO:0006518~peptide metabolic process 198 35 | 2 GOTERM_BP_FAT GO:0043043~peptide biosynthetic process 177 36 | 3 GOTERM_BP_FAT GO:0043604~amide biosynthetic process 180 37 | 4 GOTERM_BP_FAT GO:0043603~cellular amide metabolic process 206 38 | 39 | percent ease \ 40 | 0 5.85704831238 4.32627669357e-43 41 | 1 6.55195234944 1.36601477909e-42 42 | 2 5.85704831238 4.04090150003e-42 43 | 3 5.95632031767 1.05565138148e-40 44 | 4 6.81667769689 3.74871147863e-40 45 | 46 | geneIds listTotals popHits \ 47 | 0 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 379 48 | 1 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 455 49 | 2 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 384 50 | 3 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 402 51 | 4 WBGENE00002063, WBGENE00006626, WBGENE00007584... 
1878 499 52 | 53 | popTotals foldEnrichment bonferroni benjamini \ 54 | 0 11221 2.79042292227 1.28576943333e-39 1.28576943333e-39 55 | 1 11221 2.60009830425 4.05979592345e-39 2.02989796172e-39 56 | 2 11221 2.75408929047 1.20095592581e-38 4.00318641936e-39 57 | 3 11221 2.6753612131 3.13739590576e-37 7.84348976441e-38 58 | 4 11221 2.46662227543 1.11411705145e-36 2.2282341029e-37 59 | 60 | afdr 61 | 0 7.78207551683e-40 62 | 1 2.45717759656e-39 63 | 2 7.26874466353e-39 64 | 3 1.89889814083e-37 65 | 4 6.7431553467e-37 66 | ``` 67 | ___ 68 | 69 | ## ___DAVIDgetGeneAttribute___ 70 | 71 | Returns a list of gene names for given gene ids. 72 | 73 | **`DAVIDgetGeneAttribute(x, df, refCol="ensembl_gene_id", fieldTOretrieve="gene_name")`** 74 | 75 | * **`x`** a string with the list of IDs separated by ', ' 76 | * **`df`** a dataframe with the reference column and a the column to retrieve 77 | * **`refCol`** the header of the column containing the identifiers 78 | * **`fieldTOretrieve`** the field to retrieve from parsedGTF eg. 'gene_name' 79 | * **`returns`** list of fieldTOretrieve separeted by ', ' in the same order as the given in x 80 | 81 | ```python 82 | >>> import AGEpy as age 83 | >>> print df.head() 84 | 85 | ensembl_gene_id gene locus sample_1 sample_2 status \ 86 | 0 WBGene00022275 Y74C9A.1 I:43732-44677 N2 daf2 OK 87 | 1 WBGene00004418 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 88 | 2 WBGene00018774 F53G12.9,rpl-7 I:111037-113672 N2 daf2 OK 89 | 3 WBGene00018772 F53G12.4 I:134336-137282 N2 daf2 OK 90 | 4 WBGene00018958 F56C11.6 I:171339-175991 N2 daf2 OK 91 | 92 | value_1 value_2 log2(fold_change) test_stat p_value q_value \ 93 | 0 0.195901 0.986634 2.332390 2.32959 0.00570 0.031216 94 | 1 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 95 | 2 3354.820000 2463.480000 -0.445539 -2.71381 0.00005 0.000556 96 | 3 1.235670 2.992460 1.276040 3.16508 0.00005 0.000556 97 | 4 2.651180 3.795600 0.517696 1.73994 0.00410 0.024157 98 | 99 | significant GO_id \ 100 | 0 yes NaN 101 | 1 yes GO:0003735; GO:0000463; GO:0044822; GO:0002181... 102 | 2 yes NaN 103 | 3 yes NaN 104 | 4 yes GO:0016787; GO:0005615; GO:0004104 105 | 106 | GO_term gene_biotype \ 107 | 0 NaN protein_coding 108 | 1 structural constituent of ribosome; maturation... protein_coding 109 | 2 NaN protein_coding 110 | 3 NaN protein_coding 111 | 4 hydrolase activity; extracellular space; choli... protein_coding 112 | 113 | NormInt evidence 114 | 0 -0.356904 no 115 | 1 3.458609 no 116 | 2 3.458609 no 117 | 3 0.283965 no 118 | 4 0.501360 no 119 | 120 | >>> print DAVIDdf.head() 121 | 122 | categoryName termName listHits \ 123 | 0 GOTERM_BP_FAT GO:0006412~translation 177 124 | 1 GOTERM_BP_FAT GO:0006518~peptide metabolic process 198 125 | 2 GOTERM_BP_FAT GO:0043043~peptide biosynthetic process 177 126 | 3 GOTERM_BP_FAT GO:0043604~amide biosynthetic process 180 127 | 4 GOTERM_BP_FAT GO:0043603~cellular amide metabolic process 206 128 | 129 | percent ease \ 130 | 0 5.85704831238 4.32627669357e-43 131 | 1 6.55195234944 1.36601477909e-42 132 | 2 5.85704831238 4.04090150003e-42 133 | 3 5.95632031767 1.05565138148e-40 134 | 4 6.81667769689 3.74871147863e-40 135 | 136 | geneIds listTotals popHits \ 137 | 0 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 379 138 | 1 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 455 139 | 2 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 384 140 | 3 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 402 141 | 4 WBGENE00002063, WBGENE00006626, WBGENE00007584... 
1878 499 142 | 143 | popTotals foldEnrichment bonferroni benjamini \ 144 | 0 11221 2.79042292227 1.28576943333e-39 1.28576943333e-39 145 | 1 11221 2.60009830425 4.05979592345e-39 2.02989796172e-39 146 | 2 11221 2.75408929047 1.20095592581e-38 4.00318641936e-39 147 | 3 11221 2.6753612131 3.13739590576e-37 7.84348976441e-38 148 | 4 11221 2.46662227543 1.11411705145e-36 2.2282341029e-37 149 | 150 | afdr 151 | 0 7.78207551683e-40 152 | 1 2.45717759656e-39 153 | 2 7.26874466353e-39 154 | 3 1.89889814083e-37 155 | 4 6.7431553467e-37 156 | 157 | >>> gene_names=df[["ensembl_gene_id","gene"]].drop_duplicates() 158 | >>> DAVIDdf["gene_names"]=DAVIDdf["geneIds"].apply(lambda x: \ 159 | age.DAVIDgetGeneAttribute(x,\ 160 | gene_names,\ 161 | refCol="ensembl_gene_id",\ 162 | fieldTOretrieve="gene")) 163 | >>> print DAVIDdf.head() 164 | 165 | categoryName termName listHits \ 166 | 0 GOTERM_BP_FAT GO:0006412~translation 177 167 | 1 GOTERM_BP_FAT GO:0006518~peptide metabolic process 198 168 | 2 GOTERM_BP_FAT GO:0043043~peptide biosynthetic process 177 169 | 3 GOTERM_BP_FAT GO:0043604~amide biosynthetic process 180 170 | 4 GOTERM_BP_FAT GO:0043603~cellular amide metabolic process 206 171 | 172 | percent ease \ 173 | 0 5.85704831238 4.32627669357e-43 174 | 1 6.55195234944 1.36601477909e-42 175 | 2 5.85704831238 4.04090150003e-42 176 | 3 5.95632031767 1.05565138148e-40 177 | 4 6.81667769689 3.74871147863e-40 178 | 179 | geneIds listTotals popHits \ 180 | 0 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 379 181 | 1 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 455 182 | 2 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 384 183 | 3 WBGENE00002063, WBGENE00013678, WBGENE00006626... 1878 402 184 | 4 WBGENE00002063, WBGENE00006626, WBGENE00007584... 1878 499 185 | 186 | popTotals foldEnrichment bonferroni benjamini \ 187 | 0 11221 2.79042292227 1.28576943333e-39 1.28576943333e-39 188 | 1 11221 2.60009830425 4.05979592345e-39 2.02989796172e-39 189 | 2 11221 2.75408929047 1.20095592581e-38 4.00318641936e-39 190 | 3 11221 2.6753612131 3.13739590576e-37 7.84348976441e-38 191 | 4 11221 2.46662227543 1.11411705145e-36 2.2282341029e-37 192 | 193 | afdr gene_names 194 | 0 7.78207551683e-40 ife-5, Y105E8A.20, tsn-1, yars-1, ife-3, C14C1... 195 | 1 2.45717759656e-39 ife-5, tsn-1, C14C10.1, ife-3, rps-30, iff-2, ... 196 | 2 7.26874466353e-39 ife-5, Y105E8A.20, tsn-1, yars-1, ife-3, C14C1... 197 | 3 1.89889814083e-37 ife-5, Y105E8A.20, tsn-1, yars-1, ife-3, C14C1... 198 | 4 6.7431553467e-37 ife-5, tsn-1, C14C10.1, ife-3, rps-30, Y51H4A.... 199 | ``` 200 | ___ 201 | 202 | ## ***DAVIDplot*** 203 | 204 | Queries the DAVID database for an enrichment analysis and plots CellPlots as 205 | well as SymPlots (see plots) using the 20 most significant terms. 206 | Check https://david.ncifcrf.gov/content.jsp?file=DAVID_API.html for database == "type" tag and categories == "annot" tag. 207 | 208 | **`DAVIDplot(database, categories, user, df_ids, output, df_ids_bg = None, name = '', name_bg = '', verbose = False, p = 0.1, n = 2)`** 209 | 210 | * **`database`** a string for the database to query, e.g. 'WORMBASE_GENE_ID' 211 | * **`categories`** a comma separated string with databases 212 | * **`user`** a user ID registered at DAVID for querying 213 | * **`df_ids`** a dataframe where the first column contains the identifiers 214 | to be queried and the second column the respective log2fc for each identifier. 
215 | * **`output`** /path/to/output/prefix 216 | * **`df_ids_bg`** a dataframe where the first column contains the identifiers to be used as background. 'None' for whole set 217 | * **`name`** a string with the name for the query set 218 | * **`name_bg`** a string with the name for the background set 219 | * **`p`** Maximum p value for enrichment of a term 220 | * **`n`** Minimum number of genes within a term 221 | 222 | * **`returns`** nothing 223 | 224 | ```python 225 | >>> import AGEpy as age 226 | >>> print df.head() 227 | 228 | ensembl_gene_id log2(fold_change) 229 | 0 ENSG00000272449 1.859500 230 | 1 ENSG00000130762 0.601051 231 | 2 ENSG00000083444 -0.881957 232 | 3 ENSG00000162493 -0.638433 233 | 4 ENSG00000253368 0.654517 234 | 235 | >>> categories=['GOTERM_BP_FAT', 'GOTERM_CC_FAT', 'GOTERM_MF_FAT', 'KEGG_PATHWAY','BIOCARTA', 'PFAM', 'PROSITE' ] 236 | >>> DAVIDdf=DAVIDplot('ENSEMBL_GENE_ID', categories, 'email.registered@david.com', df, "/usr/home/JDoe/mydataset") 237 | ``` 238 | ___ 239 | -------------------------------------------------------------------------------- /docs/modules/fasta.md: -------------------------------------------------------------------------------- 1 | ## ___getFasta___ 2 | 3 | Retrieves a sequence from an opened multifasta file. 4 | 5 | **`getFasta(opened_file, sequence_name)`** 6 | 7 | * **`opened_file`** an opened multifasta file eg. opened_file=open("/path/to/file.fa",'r+') 8 | * **`sequence_name`** the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2) 9 | * **`returns`** a string with the sequence of interest 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> fafile="/path/to/GRCm38.dna.primary_assembly.fa" 14 | >>> with open(fafile, "r") as fastafile: 15 | ... chr2=age.getFasta(fastafile, "2") 16 | >>> print len(chr2) 17 | 18 | 182113224 19 | 20 | >>> print chr2[82113224:82113284] 21 | 22 | AGGGTGAATGATGTTTCTGGTACAGTGTACCAGTAAACCTAGCAGTAGGAGCATCAGTAT 23 | ``` 24 | ___ 25 | 26 | ## ___writeFasta___ 27 | 28 | Writes a fasta sequence into a file. 29 | 30 | **`writeFasta(sequence, sequence_name, output_file)`** 31 | 32 | * **`sequence`** a string with the sequence to be written 33 | * **`sequence_name`** name of the the fasta sequence 34 | * **`output_file`** /path/to/file.fa to be written 35 | * **`returns`** nothing 36 | 37 | ```python 38 | >>> import AGEpy as age 39 | >>> print len(chr2) 40 | 41 | 182113224 42 | 43 | >>> print chr2[82113224:82113284] 44 | 45 | AGGGTGAATGATGTTTCTGGTACAGTGTACCAGTAAACCTAGCAGTAGGAGCATCAGTAT 46 | 47 | >>> age.writeFasta(chr2,"2 my version of this sequence","/path/to/out/file.fa") 48 | ``` 49 | ___ 50 | 51 | ## ___rewriteFasta___ 52 | 53 | Rewrites a specific sequence in a multifasta file while keeping the sequence header. 54 | 55 | **`rewriteFasta(sequence, sequence_name, fasta_in, fasta_out)`** 56 | 57 | * **`sequence`** a string with the sequence to be written 58 | * **`sequence_name`** the name of the sequence to be retrieved eg. for '>2 dna:chromosome chromosome:GRCm38:2:1:182113224:1 REF' use: sequence_name=str(2) 59 | * **`fasta_in`** /path/to/original.fa 60 | * **`fasta_out`** /path/to/destination.fa 61 | * **`returns`** nothing 62 | 63 | ```python 64 | >>> import AGEpy as age 65 | >>> fafile="/path/to/GRCm38.dna.primary_assembly.fa" 66 | >>> with open(fafile, "r") as fastafile: 67 | ... 
chr2=age.getFasta(fastafile, "2") 68 | >>> chr2=chr2.strip("N") 69 | >>> age.rewriteFasta(chr2, "2", fafile, "/path/to/modified/file.fa") 70 | ``` 71 | ___ 72 | -------------------------------------------------------------------------------- /docs/modules/go.md: -------------------------------------------------------------------------------- 1 | ## ___getGeneAssociation___ 2 | 3 | This function collects GO annotation from http://geneontology.org/page/download-annotations. 4 | 5 | **`getGeneAssociation(URL_or_file)`** 6 | 7 | * **`URL_or_file`** either a link to a file on geneontology.org eg. http://geneontology.org/gene-associations/gene_association.fb.gz or the path for the respective downloded .gz file. 8 | 9 | * **`returns`** a Pandas dataframe with the parsed table. 10 | 11 | ```python 12 | >>> import pandas as pd 13 | >>> gA=age.getGeneAssociation("http://geneontology.org/gene-associations/gene_association.wb.gz") 14 | >>> print gA.head() 15 | 16 | DB DB_Object_ID DB_Object_Symbol Qualifier GO ID \ 17 | 0 WB WBGene00000001 aap-1 GO:0005942 18 | 1 WB WBGene00000001 aap-1 GO:0005942 19 | 2 WB WBGene00000001 aap-1 GO:0008286 20 | 3 WB WBGene00000001 aap-1 GO:0008286 21 | 4 WB WBGene00000001 aap-1 GO:0008286 22 | 23 | DB:Reference Evidence With (or) From Aspect \ 24 | 0 GO_REF:0000002 IEA InterPro:IPR001720 C 25 | 1 WB_REF:WBPaper00005614|PMID:12393910 IDA C 26 | 2 WB_REF:WBPaper00005614|PMID:12393910 IGI WB:WBGene00000090 P 27 | 3 WB_REF:WBPaper00005614|PMID:12393910 IGI WB:WBGene00000898 P 28 | 4 WB_REF:WBPaper00005614|PMID:12393910 IMP P 29 | 30 | DB_Object_Name DB_Object_Synonym DB_Object_Type Taxon Date \ 31 | 0 Y110A7A.10 gene taxon:6239 20170321 32 | 1 Y110A7A.10 gene taxon:6239 20151214 33 | 2 Y110A7A.10 gene taxon:6239 20151214 34 | 3 Y110A7A.10 gene taxon:6239 20151214 35 | 4 Y110A7A.10 gene taxon:6239 20060302 36 | 37 | Assigned_by Annotation Extension Gene Product Form ID 38 | 0 WB 39 | 1 WB 40 | 2 WB 41 | 3 WB 42 | 4 WB 43 | ``` 44 | ___ 45 | -------------------------------------------------------------------------------- /docs/modules/gtf.md: -------------------------------------------------------------------------------- 1 | ## ___readGTF___ 2 | 3 | Reads a GTF file and labels the respective columns in agreement with GTF file standards: 4 | 'seqname','source','feature','start','end','score','strand','frame','attribute'. 5 | 6 | **`readGTF(infile)`** 7 | 8 | * **`infile`** /path/to/file.gtf 9 | * **`returns`** a Pandas dataframe of the respective GTF 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> GTF=age.readGTF("gencode.v24.primary_assembly.annotation.gtf") 14 | >>> print GTF.head() 15 | 16 | seqname source feature start end score strand frame \ 17 | 0 chr1 HAVANA gene 11869 14409 . + . 18 | 1 chr1 HAVANA transcript 11869 14409 . + . 19 | 2 chr1 HAVANA exon 11869 12227 . + . 20 | 3 chr1 HAVANA exon 12613 12721 . + . 21 | 4 chr1 HAVANA exon 13221 14409 . + . 22 | 23 | attribute 24 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 25 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 26 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 27 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 28 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." 29 | ``` 30 | ___ 31 | 32 | ## ***retrieve_GTF_field*** 33 | 34 | Returns a field of choice from the attribute column of the GTF. 
35 | 36 | **`retrieve_GTF_field(field,gtf)`** 37 | 38 | * **`field`** field to be retrieved * **`gtf`** GTF dataframe from which the field will be retrieved 39 | * **`returns`** a Pandas dataframe with one column containing the field of choice 40 | 41 | ```python 42 | >>> import AGEpy as age 43 | >>> GTF=age.readGTF("/gencode.v24.primary_assembly.annotation.gtf") 44 | >>> print GTF.head() 45 | 46 | seqname source feature start end score strand frame \ 47 | 0 chr1 HAVANA gene 11869 14409 . + . 48 | 1 chr1 HAVANA transcript 11869 14409 . + . 49 | 2 chr1 HAVANA exon 11869 12227 . + . 50 | 3 chr1 HAVANA exon 12613 12721 . + . 51 | 4 chr1 HAVANA exon 13221 14409 . + . 52 | 53 | attribute 54 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 55 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 56 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 57 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 58 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." 59 | 60 | >>> GTF["gene_id"]=age.retrieve_GTF_field("gene_id",GTF) 61 | >>> print GTF.head() 62 | 63 | seqname source feature start end score strand frame \ 64 | 0 chr1 HAVANA gene 11869 14409 . + . 65 | 1 chr1 HAVANA transcript 11869 14409 . + . 66 | 2 chr1 HAVANA exon 11869 12227 . + . 67 | 3 chr1 HAVANA exon 12613 12721 . + . 68 | 4 chr1 HAVANA exon 13221 14409 . + . 69 | 70 | attribute gene_id 71 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." ENSG00000223972.5 72 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 73 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 74 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 75 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 76 | ``` 77 | ___ 78 | 79 | ## ___attributesGTF___ 80 | 81 | Lists the types of attributes present in the attribute section of a GTF file. 82 | 83 | **`attributesGTF(inGTF)`** 84 | 85 | * **`inGTF`** GTF dataframe to be analysed 86 | * **`returns`** a list of attributes present in the attribute section 87 | 88 | ```python 89 | >>> import AGEpy as age 90 | >>> attributes=age.attributesGTF(GTF) 91 | >>> print attributes 92 | 93 | ['gene_status', 'havana_gene', 'transcript_support_level', 'level', 'transcript_type', 'tag', 'protein_id', 'gene_id', 'exon_id', 'transcript_id', 'exon_number', 'ont', 'havana_transcript', 'ccdsid', 'transcript_name', 'gene_type', 'transcript_status', 'gene_name'] 94 | ``` 95 | ___ 96 | ## ___parseGTF___ 97 | 98 | Reads and extracts all attributes from the attribute section of a GTF and constructs a new dataframe with one column per attribute instead of the single attribute column. 99 | 100 | **`parseGTF(inGTF)`** 101 | 102 | * **`inGTF`** GTF dataframe to be parsed 103 | * **`returns`** a dataframe of the original input GTF with the attributes parsed 104 | 105 | ```python 106 | >>> GTF=age.readGTF("gencode.v24.primary_assembly.annotation.gtf") 107 | >>> print GTF.head() 108 | 109 | seqname source feature start end score strand frame \ 110 | 0 chr1 HAVANA gene 11869 14409 . + . 111 | 1 chr1 HAVANA transcript 11869 14409 . + . 112 | 2 chr1 HAVANA exon 11869 12227 . + . 113 | 3 chr1 HAVANA exon 12613 12721 . + . 114 | 4 chr1 HAVANA exon 13221 14409 . + . 115 | 116 | attribute 117 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." 118 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." 119 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." 120 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." 121 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..."
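>>> # parseGTF (next step) expands each key of the attribute column into its own column; attributes missing for a given feature come out as NaN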
122 | 123 | >>> GTFpa=age.parseGTF(GTF) 124 | >>> print GTFpa.head() 125 | 126 | seqname source feature start end score strand frame gene_status \ 127 | 0 chr1 HAVANA gene 11869 14409 . + . KNOWN 128 | 1 chr1 HAVANA transcript 11869 14409 . + . KNOWN 129 | 2 chr1 HAVANA exon 11869 12227 . + . KNOWN 130 | 3 chr1 HAVANA exon 12613 12721 . + . KNOWN 131 | 4 chr1 HAVANA exon 13221 14409 . + . KNOWN 132 | 133 | havana_gene ... exon_id transcript_id \ 134 | 0 OTTHUMG00000000961.2 ... NaN NaN 135 | 1 OTTHUMG00000000961.2 ... NaN ENST00000456328.2 136 | 2 OTTHUMG00000000961.2 ... ENSE00002234944.1 ENST00000456328.2 137 | 3 OTTHUMG00000000961.2 ... ENSE00003582793.1 ENST00000456328.2 138 | 4 OTTHUMG00000000961.2 ... ENSE00002312635.1 ENST00000456328.2 139 | 140 | exon_number ont havana_transcript ccdsid transcript_name \ 141 | 0 NaN NaN NaN NaN NaN 142 | 1 NaN NaN OTTHUMT00000362751.1 NaN DDX11L1-002 143 | 2 1 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 144 | 3 2 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 145 | 4 3 NaN OTTHUMT00000362751.1 NaN DDX11L1-002 146 | 147 | gene_type transcript_status gene_name 148 | 0 transcribed_unprocessed_pseudogene NaN DDX11L1 149 | 1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 150 | 2 transcribed_unprocessed_pseudogene KNOWN DDX11L1 151 | 3 transcribed_unprocessed_pseudogene KNOWN DDX11L1 152 | 4 transcribed_unprocessed_pseudogene KNOWN DDX11L1 153 | ``` 154 | ___ 155 | 156 | ## ___writeGTF___ 157 | 158 | Writes a GTF dataframe into a file. 159 | 160 | **`writeGTF(inGTF,file_path)`** 161 | 162 | * **`inGTF`** GTF dataframe to be written. It should either have 9 columns with the last one being the "attributes" section or more than 9 columns where all columns after the 8th will be collapsed into one. 163 | * **`file_path`** /path/to/the/file.gtf 164 | * **`returns`** nothing 165 | 166 | ```python 167 | >>> import AGEpy as age 168 | >>> age.writeGTF(GTFpa,"/path/to/new/file.gtf") 169 | ``` 170 | ___ 171 | 172 | ## ___MAPGenoToTrans___ 173 | 174 | Gets the positions of all bases in each exon (or other feature), grouped by transcript. 175 | 176 | **`MAPGenoToTrans(parsedGTF,feature)`** 177 | 178 | * **`parsedGTF`** a Pandas dataframe with 'start','end', and 'strand' information for each entry. It must contain ['seqname','feature','start','end','strand','frame','gene_id', 'transcript_id','exon_id','exon_number'] 179 | * **`feature`** feature upon which to generate the map, eg. 'exon' or 'transcript' 180 | * **`returns`** a dictionary with, for each transcript, a string of the comma separated positions of all its bases 181 | 182 | ```python 183 | >>> import AGEpy as age 184 | >>> print GTF.head() 185 | 186 | seqname source feature start end score strand frame \ 187 | 0 chr1 HAVANA gene 11869 14409 . + . 188 | 1 chr1 HAVANA transcript 11869 14409 . + . 189 | 2 chr1 HAVANA exon 11869 12227 . + . 190 | 3 chr1 HAVANA exon 12613 12721 . + . 191 | 4 chr1 HAVANA exon 13221 14409 . + . 192 | 193 | attribute gene_id \ 194 | 0 gene_id "ENSG00000223972.5"; gene_type "transc..." ENSG00000223972.5 195 | 1 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 196 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 197 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 198 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..."
ENSG00000223972.5 199 | 200 | transcript_id exon_id exon_number 201 | 0 NaN NaN NaN 202 | 1 ENST00000456328.2 NaN NaN 203 | 2 ENST00000456328.2 ENSE00002234944.1 1 204 | 3 ENST00000456328.2 ENSE00003582793.1 2 205 | 4 ENST00000456328.2 ENSE00002312635.1 3 206 | 207 | >>> GtoT=age.MAPGenoToTrans(GTF,"exon") 208 | >>> print GtoT 209 | 210 | {ENST23923910:'234,235,236,1021,..'} 211 | ``` 212 | ___ 213 | 214 | ## ___GTFtoBED___ 215 | 216 | Transforms a GTF dataframe into a bed dataframe. 217 | 218 | **`GTFtoBED(inGTF,name)`** 219 | 220 | * **`inGTF`** GTF dataframe for transformation 221 | * **`name`** field of the GTF dataframe to be used for the bed 'name' position 222 | * **`returns`** a bed dataframe with the corresponding bed fields: 'chrom','chromStart','chromEnd','name','score','strand' 223 | 224 | ```python 225 | >>> import AGEpy as age 226 | >>> bed = age.GTFtoBED(GTF, "gene_id") 227 | ``` 228 | ___ 229 | ## ___GetTransPosition___ 230 | 231 | Maps a genome position to a transcript position. 232 | 233 | **`GetTransPosition(df, field, dic, refCol="transcript_id")`** 234 | 235 | * **`df`** a Pandas dataframe 236 | * **`field`** the header of the column containing the genomic position 237 | * **`dic`** a dictionary containing for each transcript the respective bases eg. {ENST23923910:'234,235,236,1021,..'}. See *MAPGenoToTrans*. 238 | * **`refCol`** header of the reference column with IDs, eg. 'transcript_id' 239 | 240 | ```python 241 | >>> import AGEpy as age 242 | >>> print GTF_.head() 243 | 244 | seqname source feature start end score strand frame \ 245 | 2 chr1 HAVANA exon 11869 12227 . + . 246 | 3 chr1 HAVANA exon 12613 12721 . + . 247 | 4 chr1 HAVANA exon 13221 14409 . + . 248 | 6 chr1 HAVANA exon 12010 12057 . + . 249 | 7 chr1 HAVANA exon 12179 12227 . + . 250 | 251 | attribute gene_id \ 252 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 253 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 254 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 255 | 6 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 256 | 7 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 257 | 258 | transcript_id exon_id exon_number target 259 | 2 ENST00000456328.2 ENSE00002234944.1 1 12000 260 | 3 ENST00000456328.2 ENSE00003582793.1 2 12617 261 | 4 ENST00000456328.2 ENSE00002312635.1 3 14000 262 | 6 ENST00000450305.2 ENSE00001948541.1 1 12040 263 | 7 ENST00000450305.2 ENSE00001671638.2 2 12210 264 | 265 | >>> GTF_["transcript target"]=GTF_.apply(age.GetTransPosition, \ 266 | args=("target",GtoT),axis=1) 267 | >>> print GTF_.head() 268 | 269 | seqname source feature start end score strand frame \ 270 | 2 chr1 HAVANA exon 11869 12227 . + . 271 | 3 chr1 HAVANA exon 12613 12721 . + . 272 | 4 chr1 HAVANA exon 13221 14409 . + . 273 | 6 chr1 HAVANA exon 12010 12057 . + . 274 | 7 chr1 HAVANA exon 12179 12227 . + . 275 | 276 | attribute gene_id \ 277 | 2 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 278 | 3 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 279 | 4 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 280 | 6 gene_id "ENSG00000223972.5"; transcript_id "EN..." ENSG00000223972.5 281 | 7 gene_id "ENSG00000223972.5"; transcript_id "EN..."
ENSG00000223972.5 282 | 283 | transcript_id exon_id exon_number target transcript target 284 | 2 ENST00000456328.2 ENSE00002234944.1 1 12000 132 285 | 3 ENST00000456328.2 ENSE00003582793.1 2 12617 364 286 | 4 ENST00000456328.2 ENSE00002312635.1 3 14000 1248 287 | 6 ENST00000450305.2 ENSE00001948541.1 1 12040 31 288 | 7 ENST00000450305.2 ENSE00001671638.2 2 12210 80 289 | ``` 290 | ___ 291 | 292 | ## ***getPromotersBed*** 293 | 294 | Reads a gtf file and returns a bed-format dataframe with the promoter coordinates. 295 | 296 | **`getPromotersBed(gtf,fa,upstream=2000,downstream=200)`** 297 | 298 | * **`gtf`** path/to/file.gtf. Must be an ensembl gtf. 299 | * **`fa`** path/to/fasta.fa. Must be an ensembl fasta file. 300 | * **`upstream`** number of bases upstream of the transcript start site at which the promoter should start 301 | * **`downstream`** number of bases downstream of the transcript start site at which the promoter should end 302 | 303 | * **`returns`** a pandas dataframe in bed format 304 | 305 | ```python 306 | >>> import AGEpy as age 307 | >>> bed=age.getPromotersBed(gtf="Caenorhabditis_elegans.WBcel235.89.gtf",\ 308 | fa="Caenorhabditis_elegans.WBcel235.dna.toplevel.fa",\ 309 | upstream=2000,downstream=200) 310 | >>> print(bed.head()) 311 | 312 | chrom chromStart chromEnd name score strand 313 | 0 V 0 380 WBGene00197333, cTel3X.2 . + 314 | 1 V 0 5857 WBGene00015153, B0348.5 . + 315 | 2 V 129 2329 WBGene00198386, cTel3X.3 . - 316 | 3 V 7622 9822 WBGene00002061, ife-3 . - 317 | 4 V 8539 10739 WBGene00255704, B0348.10 . - 318 | ``` 319 | ___ 320 | 321 | -------------------------------------------------------------------------------- /docs/modules/homology.md: -------------------------------------------------------------------------------- 1 | ## ___getHomoloGene___ 2 | 3 | Returns NCBI's HomoloGene tables. 4 | 5 | **`getHomoloGene(taxfile="build_inputs/taxid_taxname", genefile="homologene.data", proteinsfile="build_inputs/all_proteins.data", proteinsclusterfile="build_inputs/proteins_for_clustering.data", baseURL="http://ftp.ncbi.nih.gov/pub/HomoloGene/current/")`** 6 | 7 | * **`taxfile`** path to local file or to baseURL/taxfile, default="build_inputs/taxid_taxname" 8 | * **`genefile`** path to local file or to baseURL/genefile, default="homologene.data" 9 | * **`proteinsfile`** path to local file or to baseURL/proteinsfile, default="build_inputs/all_proteins.data" 10 | * **`proteinsclusterfile`** path to local file or to baseURL/proteinsclusterfile, default="build_inputs/proteins_for_clustering.data" 11 | * **`baseURL`** baseURL for downloading files, default="http://ftp.ncbi.nih.gov/pub/HomoloGene/current/" 12 | * **`returns genedf`** HomoloGene Pandas dataframe 13 | * **`returns protclusdf`** Pandas dataframe. Lists the one protein per gene that was used for homologene clustering. 14 | If a gene has multiple protein accessions derived from alternative splicing, 15 | only the protein isoform that gives the most protein alignment to proteins in other species 16 | was selected for clustering and it is listed in this file. 17 | * **`returns proteinsdf`** Pandas dataframe. Lists all proteins and their gene information. 18 | If a gene has multiple protein accessions derived from an alternative splicing event, 19 | each protein accession is listed on a separate line.
20 | 21 | ```python 22 | >>> import AGEpy as age 23 | >>> genedf, protclusdf, proteinsdf = age.getHomoloGene() 24 | >>> print genedf.head() 25 | 26 | HID Taxonomy ID Gene ID Gene Symbol Protein gi Protein accession \ 27 | 0 3 9606 34 ACADM 4557231 NP_000007.1 28 | 1 3 9598 469356 ACADM 160961497 NP_001104286.1 29 | 2 3 9544 705168 ACADM 109008502 XP_001101274.1 30 | 3 3 9615 490207 ACADM 545503811 XP_005622188.1 31 | 4 3 9913 505968 ACADM 115497690 NP_001068703.1 32 | 33 | organism 34 | 0 Homo sapiens 35 | 1 Pan troglodytes 36 | 2 Macaca mulatta 37 | 3 Canis lupus familiaris 38 | 4 Bos taurus 39 | 40 | >>> print protclusdf.head() 41 | 42 | taxid entrez GeneID gene symbol gene description protein accession.ver \ 43 | 0 3702 10723019 AT1G27045 AT1G27045 NP_001185103.1 44 | 1 3702 10723020 AT2G41231 AT2G41231 NP_001189726.1 45 | 2 3702 10723023 AT1G24095 AT1G24095 NP_001185076.1 46 | 3 3702 10723026 AT1G12855 AT1G12855 NP_001184976.1 47 | 4 3702 10723027 AT4G22758 AT4G22758 NP_001190802.1 48 | 49 | mrna accession.ver length of protein listed in column 5 \ 50 | 0 NM_001198174.1 227 51 | 1 NM_001202797.1 99 52 | 2 NM_001198147.1 213 53 | 3 NM_001198047.1 462 54 | 4 NM_001203873.1 255 55 | 56 | -11) contains data about gene location on the genome \ 57 | 0 240254421 58 | 1 240254678 59 | 2 240254421 60 | 3 240254421 61 | 4 240256243 62 | 63 | starting position of gene in 0-based coordinate \ 64 | 0 9391608 65 | 1 17195291 66 | 2 8523246 67 | 3 4382159 68 | 4 11958309 69 | 70 | end position of the gene in 0-based coordinate strand \ 71 | 0 9393018 + 72 | 1 17195914 + 73 | 2 8524928 + 74 | 3 4383610 + 75 | 4 11960035 + 76 | 77 | nucleotide gi of genomic sequence where this gene is annotated \ 78 | 0 AT1G27045 79 | 1 AT2G41231 80 | 2 AT1G24095 81 | 3 AT1G12855 82 | 4 AT4G22758 83 | 84 | organism 85 | 0 Arabidopsis thaliana 86 | 1 Arabidopsis thaliana 87 | 2 Arabidopsis thaliana 88 | 3 Arabidopsis thaliana 89 | 4 Arabidopsis thaliana 90 | 91 | >>> print proteinsdf.head() 92 | 93 | taxid entrez GeneID gene symbol gene description protein accession.ver \ 94 | 0 3702 10723019 AT1G27045 AT1G27045 NP_001185103.1 95 | 1 3702 10723020 AT2G41231 AT2G41231 NP_001189725.1 96 | 2 3702 10723020 AT2G41231 AT2G41231 NP_001189726.1 97 | 3 3702 10723023 AT1G24095 AT1G24095 NP_001185076.1 98 | 4 3702 10723026 AT1G12855 AT1G12855 NP_001184976.1 99 | 100 | mrna accession.ver length of protein listed in column 5 \ 101 | 0 NM_001198174.1 227 102 | 1 NM_001202796.1 104 103 | 2 NM_001202797.1 99 104 | 3 NM_001198147.1 213 105 | 4 NM_001198047.1 462 106 | 107 | -11) contains data about gene location on the genome \ 108 | 0 240254421 109 | 1 240254678 110 | 2 240254678 111 | 3 240254421 112 | 4 240254421 113 | 114 | starting position of gene in 0-based coordinate \ 115 | 0 9391608 116 | 1 17195291 117 | 2 17195291 118 | 3 8523246 119 | 4 4382159 120 | 121 | end position of the gene in 0-based coordinate strand \ 122 | 0 9393018 + 123 | 1 17195914 + 124 | 2 17195914 + 125 | 3 8524928 + 126 | 4 4383610 + 127 | 128 | nucleotide gi of genomic sequence where this gene is annotated \ 129 | 0 AT1G27045 130 | 1 AT2G41231 131 | 2 AT2G41231 132 | 3 AT1G24095 133 | 4 AT1G12855 134 | 135 | organism 136 | 0 Arabidopsis thaliana 137 | 1 Arabidopsis thaliana 138 | 2 Arabidopsis thaliana 139 | 3 Arabidopsis thaliana 140 | 4 Arabidopsis thaliana 141 | ``` 142 | ___ 143 | -------------------------------------------------------------------------------- /docs/modules/meme.md: 
-------------------------------------------------------------------------------- 1 | ## ___filterMotifs___ 2 | 3 | Selects motifs from a meme file based on the number of sites. 4 | 5 | **`filterMotifs(memeFile,outFile, minSites)`** 6 | 7 | * **`memeFile`** MEME file to be read 8 | * **`outFile`** MEME file to be written 9 | * **`minSites`** minimum number of sites each motif needs to have to be valid 10 | * **`returns`** nothing 11 | 12 | ```python 13 | >>> import AGEpy as age 14 | >>> age.filterMotifs("/path/to/input.meme","/path/to/output.meme", 15) 15 | ``` 16 | ___ 17 | -------------------------------------------------------------------------------- /docs/modules/p53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/p53.png -------------------------------------------------------------------------------- /docs/modules/plots.md: -------------------------------------------------------------------------------- 1 | ## ___CellPlot___ 2 | 3 | Python implementation of the CellPlot from the CellPlot package for R. 4 | -inf or inf enrichments will come out as min found float or max found float, respectively. 5 | 6 | **`CellPlot(df, output_file=None, term_col="termName", gene_expression_col='log2fc', gene_expression="log2FC",x_values="-log10(p)", xaxis_label = "-log10(p)", pvalCol="ease", figure_title="CellPlot", lowerLimit=None, upperLimit=None, colorBarType='coolwarm')`** 7 | 8 | * **`df`** pandas dataframe with the following columns - 'Enrichment', 'Term', and 'log2fc'. 9 | For log2fc each cell must contain a comma separated string with the log2fc for the genes enriched in the respective term. 10 | eg. '-inf,-1,2,3.4,3.66,inf' 11 | * **`output_file`** prefix for an output file. If given it will create output_file.CellPlot.svg and output_file.CellPlot.png 12 | * **`gene_expression_col`** column with gene expression data separated by a comma (ie. ',') 13 | * **`gene_expression`** label for the color gradient bar. 14 | * **`x_values`** values to use on the x-axis 15 | * **`xaxis_label`** label for the x-axis 16 | * **`figure_title`** Figure title. 17 | * **`term_col`** the column with the term names 18 | * **`pvalCol`** name of the column containing the p values to determine if the terms should be marked as NS - not significant, use None for no marking 19 | * **`lowerLimit`** lower limit for the heatmap bar (default is the 0.1 percentile) 20 | * **`upperLimit`** upper limit for the heatmap bar (default is the 0.9 percentile) 21 | * **`colorBarType`** type of heatmap, 'coolwarm' is the default, alternatives eg. 'Spectral', 'seismic' 22 | * **`returns`** a matplotlib figure 23 | 24 | ```python 25 | >>> import AGEpy as age 26 | >>> print df.head() 27 | 28 | categoryName termName listHits \ 29 | 0 GOTERM_BP_FAT GO:0006396~RNA processing 716 30 | 1 GOTERM_BP_FAT GO:0043933~macromolecular complex subunit orga... 1433 31 | 2 GOTERM_BP_FAT GO:0016071~mRNA metabolic process 523 32 | 3 GOTERM_BP_FAT GO:0044085~cellular component biogenesis 1596 33 | 4 GOTERM_BP_FAT GO:0022613~ribonucleoprotein complex biogenesis 398 34 | 35 | percent ease \ 36 | 0 10.599556 8.904648e-157 37 | 1 21.213916 2.144221e-124 38 | 2 7.742413 1.473027e-109 39 | 3 23.626943 2.398988e-108 40 | 4 5.891932 7.142953e-99 41 | 42 | geneIds listTotals popHits \ 43 | 0 ENSG00000151304, ENSG00000091127, ENSG00000171... 6085 910 44 | 1 ENSG00000166337, ENSG00000110075, ENSG00000110...
6085 2461 45 | 2 ENSG00000138385, ENSG00000106355, ENSG00000110... 6085 672 46 | 3 ENSG00000110075, ENSG00000110074, ENSG00000164... 6085 2914 47 | 4 ENSG00000151304, ENSG00000215301, ENSG00000171... 6085 480 48 | 49 | popTotals foldEnrichment bonferroni benjamini afdr \ 50 | 0 16650 2.152907 1.021719e-152 1.021719e-152 1.842717e-153 51 | 1 16650 1.593266 2.460279e-120 1.230140e-120 4.437224e-121 52 | 2 16650 2.129541 1.690151e-105 5.633836e-106 3.048263e-106 53 | 3 16650 1.498639 2.752599e-104 6.881498e-105 4.964436e-105 54 | 4 16650 2.268796 8.195824e-95 1.639165e-95 1.478154e-95 55 | 56 | genes name \ 57 | 0 SRFBP1, PUS7, CHD7, SSB, LSM5, NOB1, GTF2H1, A... 58 | 1 TAF10, PPP6R3, FOXRED1, PSMC1, ILK, EP400, CTB... 59 | 2 SSB, LSM5, GTF2H1, ALYREF, RPS10, SNRNP35, CNO... 60 | 3 PPP6R3, FOXRED1, UTP15, SIX1, LLGL1, RPL9, TRI... 61 | 4 SRFBP1, DDX3X, CHD7, NOB1, RPS10, UTP15, RPL30... 62 | 63 | log2fc -log10(p) 64 | 0 0.153, 0.37, 0.023, 0.321, 0.084, 0.61, 0.118,... 156.050383 65 | 1 -0.309, 0.078, -0.063, 0.005, 0.054, -0.051, 0... 123.668730 66 | 2 0.321, 0.084, 0.118, -0.013, 0.06, -0.055, 0.0... 108.831789 67 | 3 0.078, -0.063, -0.303, -0.39, -0.254, 0.092, -... 107.619972 68 | 4 0.153, 0.221, 0.023, 0.61, 0.06, -0.303, 0.15,... 98.146122 69 | 70 | >>> cellplot=age.CellPlot(df[:20]) 71 | ``` 72 | ![cellplot](cellplot.CellPlot.png) 73 | ___ 74 | 75 | ## ___SymPlot___ 76 | 77 | Python implementation of the SymPlot from the CellPlot package for R. 78 | -inf or inf enrichments will come out as min found float or max found float, respectively. 79 | 80 | **`SymPlot(df,output_file=None,figure_title="SymPlot", pvalCol="ease", term_col="termName", x_values="-log10(p)", n_terms_col="listHits", gene_expression_col="log2fc" , xaxis_label = "-log10(p)", colorBarType='coolwarm')`** 81 | 82 | * **`df`** pandas dataframe with the following columns - 'Enrichment', 'Significant', 'Annotated', 'Term', and 'log2fc'. 'Annotated' stands for the number of genes annotated with the respective GO term, as reported in DAVID by listHits. For log2fc each cell must contain a comma separated string with the log2fc for the genes enriched in the respective term, eg. '-inf,-1,2,3.4,3.66,inf' 83 | * **`gene_expression_col`** column with gene expression data separated by a comma (ie. ',') 84 | * **`gene_expression`** label for the color gradient bar. 85 | * **`x_values`** values to use on the x-axis 86 | * **`xaxis_label`** label for the x-axis 87 | * **`term_col`** the column with the term names 88 | * **`output_file`** prefix for an output file. If given it will create output_file.SymPlot.svg and output_file.SymPlot.png 89 | * **`figure_title`** Figure title. 90 | * **`pvalCol`** name of the column containing the p values to determine if the terms should be marked as NS - not significant, use None for no marking 91 | * **`colorBarType`** type of heatmap, 'coolwarm' is the default, alternatives eg. 'seismic', 'Spectral', 'bwr' 92 | * **`returns`** a matplotlib figure 93 | 94 | ```python 95 | >>> import AGEpy as age 96 | >>> symplot=age.SymPlot(df[:20],"symplot", "mutant 1",pvalCol="ease") 97 | ``` 98 | ![symplot](symplot.SymPlot.png) 99 | ___ 100 | ## ___MA___ 101 | 102 | Plots an MA-like plot.
103 | 104 | **`MA(df, title, figName, c, daType="counts", nbins=10, perc=.5, deg=3, eq=True, splines=True, spec=None, Targets=None, ylim=None, sizeRed=8)`** 105 | 106 | * **`df`** dataframe output of GetData() 107 | * **`title`** plot title, 'Genes' or 'Transcripts' 108 | * **`figName`** /path/to/saved/figure/prefix 109 | * **`c`** pair of samples to be plotted in list format 110 | * **`daType`** data type, ie. 'counts' or 'FPKM' 111 | * **`nbins`** number of bins on normalized intensities to fit the splines 112 | * **`perc`** log2(fold change) percentile to which the splines will be fitted 113 | * **`deg`** degrees of freedom used to fit the splines 114 | * **`eq`** if True, assumes for each bin that the lower and upper values are equally distant to 0, taking the smaller distance for both 115 | * **`splines`** plot splines, default=True 116 | * **`spec`** list of ids to be highlighted 117 | * **`Targets`** list of ids that will be highlighted if outside of the fitted splines 118 | * **`ylim`** a list of limits to apply on the y-axis of the plot 119 | * **`sizeRed`** size of the highlight marker 120 | * **`returns df_`** a Pandas dataframe similar to the GetData() output with normalized intensities and rows outside the spline bounds marked with 1. 121 | * **`returns red`** list of ids that are highlighted 122 | 123 | ```python 124 | >>> import AGEpy as age 125 | >>> print df.head() 126 | 127 | gene_id gene wt0 wt20 log2(wt20/wt0) \ 128 | 0 ENSG00000223972 DDX11L1 0.0 0.0 NaN 129 | 1 ENSG00000243485 MIR1302-2,RP11-34P13.3 0.0 0.0 NaN 130 | 2 ENSG00000274890 MIR1302-2,RP11-34P13.3 0.0 0.0 NaN 131 | 3 ENSG00000268020 OR4G4P 0.0 0.0 NaN 132 | 4 ENSG00000240361 OR4G11P 0.0 0.0 NaN 133 | 134 | p_value q_value significant 135 | 0 1.0 1.0 no 136 | 1 1.0 1.0 no 137 | 2 1.0 1.0 no 138 | 3 1.0 1.0 no 139 | 4 1.0 1.0 no 140 | 141 | >>> madf1,sig1=age.MA(dge_, 'Genes',"MA1",["wt0","wt20"], daType="FPKM") 142 | ``` 143 | ![ma1](MA1.png) 144 | ```python 145 | >>> sigGenes=df[df["significant"]=="yes"]["gene_id"].tolist() 146 | >>> madf2,sig2=age.MA(dge_, 'Genes',"MA2", ["wt0","wt20"], splines=False, daType="FPKM",spec=sigGenes) 147 | ``` 148 | ![ma2](MA2.png) 149 | ```python 150 | >>> madf3,sig3=age.MA(dge_, 'Genes',"MA3", ["wt0","wt20"], splines=True, daType="FPKM",Targets=sigGenes) 151 | ``` 152 | ![ma3](MA3.png) 153 | ___ 154 | -------------------------------------------------------------------------------- /docs/modules/sam.md: -------------------------------------------------------------------------------- 1 | ## ___readSAM___ 2 | 3 | Reads and parses a sam file. 4 | 5 | **`readSAM(SAMfile,header=False)`** 6 | 7 | * **`SAMfile`** /path/to/file.sam 8 | * **`header`** logical, if True, reads the header information 9 | * **`returns`** a pandas dataframe with the respective SAM columns: 'QNAME','FLAG','RNAME','POS','MAPQ','CIGAR','RNEXT','PNEXT','TLEN','SEQ','QUAL' and a list of the headers if header=True 10 | 11 | ```python 12 | >>> import AGEpy as age 13 | >>> SAMdf=age.readSAM("sample1.sam") 14 | >>> print SAMdf.head() 15 | 16 | CIGAR \ 17 | 0 J00137:91:HJG75BBXX:6:1101:27458:1244 4 * 0 0 * 18 | 1 J00137:91:HJG75BBXX:6:1101:2483:1226 4 * 0 0 * 19 | 2 J00137:91:HJG75BBXX:6:1101:6593:1244 16 II 11210427 255 2S146M 20 | 3 J00137:91:HJG75BBXX:6:1101:9293:1244 0 I 10433525 255 150M 21 | 4 J00137:91:HJG75BBXX:6:1101:13271:1244 16 III 5277278 255 150M 22 | 23 | RNEXT PNEXT TLEN SEQ \ 24 | 0 * 0 0 CCAAAATCAGTTACAAAAAAATTAAATATCGAGTTCCTCCCCCAGA... 25 | 1 * 0 0 ACGTGACCGATGGTTGGCATGGCACGCATACCACGGAAGCGTCTGC...
26 | 2 * 0 0 AACAACAGCAGCAGCAGATTTACCAAAGGTTCCCAGCAAGACTAAT... 27 | 3 * 0 0 CTTGATTGTACTGCTGTGGTGGACCGCGTGGTCCTCCTTGTTGGTT... 28 | 4 * 0 0 GGACATGATGATCATGGCCACGACTCTCATGGACATAGTCATGATC... 29 | 30 | QUAL 31 | 0 AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ... 32 | 1 A->> import AGEpy as age 50 | >>> age.writeSAM(SAMdf,"modified.sam") 51 | ``` 52 | ___ 53 | 54 | ## ___SAMflags___ 55 | Explains a SAM flag. 56 | 57 | **`SAMflags(x)`** 58 | 59 | * **`x`** flag 60 | * **`returns`** complete SAM flag explanation 61 | 62 | ``` 63 | >>> import AGEpy as age 64 | >>> print age.SAMflags(64) 65 | ``` 66 | ["0: Read unpaired", 67 | "0: Read not mapped in proper pair", 68 | "0: Read mapped", 69 | "0: Mate mapped", 70 | "0: Read direct strand", 71 | "0: Mate direct strand", 72 | "1: First in pair", 73 | "0: First in pair", 74 | "0: Primary alignment", 75 | "0: Read passes platform/vendor quality checks", 76 | "0: Read is not PCR or optical duplicate", 77 | "0: Not supplementary alignment"] 78 | 79 | ___ 80 | -------------------------------------------------------------------------------- /docs/modules/symplot.SymPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpg-age-bioinformatics/AGEpy/51bf9d0459f995659b94aba34128956b09ea4b7c/docs/modules/symplot.SymPlot.png -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | AGEpy 5 | 6 | 7 | 8 |

Redirect to new page...

9 | 10 | 11 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | theme: readthedocs 2 | site_name: AGEpy 3 | pages: 4 | - Home: index.md 5 | - Cookbook: cookbook.md 6 | - Modules: 7 | - bed: modules/bed.md 8 | - biom: modules/biom.md 9 | - blast: modules/blast.md 10 | - cytoscape: modules/cytoscape.md 11 | - david: modules/david.md 12 | - fasta: modules/fasta.md 13 | - go: modules/go.md 14 | - gtf: modules/gtf.md 15 | - homology: modules/homology.md 16 | - kegg: modules/kegg.md 17 | - meme: modules/meme.md 18 | - plots: modules/plots.md 19 | - sam: modules/sam.md 20 | - Executables: 21 | - aDiff: executables/adiff.md 22 | - abed: executables/abed.md 23 | - obo2tsv: executables/obo2tsv.md 24 | - david: executables/david.md 25 | - blasto: executables/blasto.md 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | AGEpy 2 | Pandas>=0.15.2 3 | numpy>=1.9.2 4 | requests>=2.20.0 5 | openpyxl 6 | suds 7 | zeep 8 | xlrd 9 | biomart 10 | matplotlib 11 | xlsxwriter 12 | pybedtools 13 | wand 14 | paramiko 15 | ipaddress 16 | seaborn 17 | scipy 18 | scikit-learn 19 | statsmodels 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | import re 4 | import sys 5 | from setuptools import setup 6 | 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | 9 | 10 | def read(*parts): 11 | # intentionally *not* adding an encoding option to open, See: 12 | # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690 13 | with codecs.open(os.path.join(here, *parts), 'r') as fp: 14 | return fp.read() 15 | 16 | setup(name = 'AGEpy', 17 | version = '0.8.2', 18 | description = 'Bioinformatics tools for Python developed at the MPI for Biology of Ageing', 19 | long_description = read('README.rst'), 20 | url = 'https://github.com/mpg-age-bioinformatics/AGEpy', 21 | author = 'Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing', 22 | author_email = 'bioinformatics@age.mpg.de', 23 | license = 'MIT', 24 | packages = [ 'AGEpy' ], 25 | install_requires = [ 'Pandas>=0.15.2', 'numpy>=1.9.2','requests>=2.20.0', \ 26 | 'suds', 'zeep', 'openpyxl','xlrd', 'biomart', 'matplotlib','pybedtools', \ 27 | 'xlsxwriter','wand','paramiko','ipaddress', 'seaborn', \ 28 | 'scipy', 'scikit-learn', 'statsmodels'], 29 | zip_safe = False, 30 | scripts=['bin/obo2tsv','bin/aDiff','bin/abed','bin/david', 'bin/blasto', 'bin/QC_plots'] 31 | ) 32 | --------------------------------------------------------------------------------
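For orientation, the snippet below sketches how the package built by `setup.py` is typically used together with the functions documented in `docs/modules/`. It is a minimal sketch only: it assumes AGEpy has already been installed (for example with `pip install .` run from the repository root), and `annotation.gtf` is a placeholder file name, not a file shipped with the repository.

```python
# Minimal usage sketch (assumptions: AGEpy is installed and a GTF annotation
# file is available locally; "annotation.gtf" is a placeholder file name).
import AGEpy as age

gtf = age.readGTF("annotation.gtf")                      # GTF as a pandas dataframe
gtf["gene_id"] = age.retrieve_GTF_field("gene_id", gtf)  # extract one attribute into its own column
bed = age.GTFtoBED(gtf, "gene_id")                       # chrom, chromStart, chromEnd, name, score, strand
print(bed.head())
```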