├── ExTraMapper.py
├── ExTraMapper_Figure.jpg
├── Human-Monkey-Processed-Data
│   ├── README.md
│   ├── config.human-monkey.conf
│   ├── extMpreprocess
│   └── scripts
│       ├── ensemblUtils.py
│       ├── liftOver
│       ├── liftover-withMultiples
│       ├── parseAndPicklePerPair.py
│       └── splitExonsIntoIndividualFiles.py
├── Human-Mouse-Preprocess-Data
│   ├── README.md
│   ├── config.human-mouse.conf
│   ├── extMpreprocess
│   └── scripts
│       ├── ensemblUtils.py
│       ├── liftOver
│       ├── liftover-withMultiples
│       ├── parseAndPicklePerPair.py
│       └── splitExonsIntoIndividualFiles.py
├── LICENSE
├── README.md
├── Result
│   ├── Exon-Pairs
│   │   └── README.md
│   └── Transcript-Pairs
│       ├── ExTraMapper_Transcript_Mapping_ENSMBL102_Genome_Build_Human_vs_Mouse.xlsx
│       ├── ExTraMapper_Transcript_Mapping_ENSMBL102_Human_vs_Monkey.xlsx
│       ├── ExTraMapper_Transcript_Mapping_ENSMBL81_Genome_Build_Human_vs_Mouse.xlsx
│       └── README.md
└── extMsummarise

/ExTraMapper_Figure.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ay-lab/ExTraMapper/ff8bf6399e457c041e10ab8d94c83ae54414b273/ExTraMapper_Figure.jpg
--------------------------------------------------------------------------------
/Human-Monkey-Processed-Data/README.md:
--------------------------------------------------------------------------------
 1 | ## Steps to generate the input files (Human - Monkey)
 2 | The users should run _extMpreprocess_ to generate the input files. All the input files will be generated under the _preprocess/data_ folder. All the required executables and scripts are provided here. _extMpreprocess_ has 7 individual steps and should be run in the following manner.
 3 | 
 4 | ### Run the following steps
 5 | 
 6 | - ![#f03c15](https://via.placeholder.com/15/f03c15/000000?text=+) For help, type
 7 | 
 8 | ```bash
 9 | ./extMpreprocess help
10 | 
11 | This script will download and preprocess the dataset required for exon-pair and transcript-pair finding by ExTraMapper.
12 | Type ./extMpreprocess <config.conf> <step> to execute the script.
13 | Type ./extMpreprocess example to print an example config.conf file.
14 | 
15 | This script will run seven (7) sequential steps to create the inputs for the ExTraMapper program.
16 | Users can provide step numbers (1-7) or all as the argument of this script.
17 | Short description of the individual steps:
18 | Step 1: Download per-organism files, e.g. reference genomes and gene annotation files.
19 | Step 2: Create genomedata archives with the genomes of org1 and org2 (make sure the genomedata package is installed).
20 | Step 3: Create pickle files for each homologous gene pair.
21 | Step 4: Perform coordinate liftOver of exons with multiple mappings (this step requires the bedtools and liftOver executables).
22 | Steps 5-7: Postprocess the liftOver files.
23 | 
24 | example:
25 | 
26 | ./extMpreprocess config.human-monkey.conf all
27 | ```
28 | 
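The steps do not have to be run in one shot: as the help text above states, the script accepts either `all` or a single step number (1-7). A minimal usage sketch, assuming the bundled `config.human-monkey.conf` is used unchanged:

```bash
# run everything end to end
./extMpreprocess config.human-monkey.conf all

# or run one step at a time, e.g. the downloads first, then the genomedata archives
./extMpreprocess config.human-monkey.conf 1
./extMpreprocess config.human-monkey.conf 2
```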
29 |
 30 | - ![#f03c15](https://via.placeholder.com/15/f03c15/000000?text=+) The script requires the genomedata package, which can be installed by running the following command.
31 | 32 | ```bash 33 | $ pip install genomedata --user 34 | ``` 35 |
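Step 2 of _extMpreprocess_ builds the archives by calling the `genomedata-load-seq` and `genomedata-close-data` command-line tools, so it is worth confirming that the package imported correctly and that those tools are on your `PATH` before starting the long run. This is only a sanity check, not part of the pipeline itself:

```bash
# check the Python package is importable (Genome is the class used by the bundled scripts)
python -c "from genomedata import Genome; print('genomedata import OK')"

# check the command-line tools used by step 2 are visible
which genomedata-load-seq genomedata-close-data
```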
36 | 37 |
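Step 4 additionally expects `bedtools` and the UCSC `liftOver` binary (a copy ships in the `scripts/` folder and is also referenced from the config file). A short pre-flight check, assuming you rely on the bundled liftOver copy:

```bash
chmod u+x scripts/liftOver            # as noted in the config file, the binary must be executable
which bedtools                        # bedtools must be on your PATH for step 4
./scripts/liftOver 2>&1 | head -n 5   # liftOver with no arguments prints its usage message
```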
 38 | 
 39 | #### Once finished, the _extMpreprocess_ script should produce the _preprocess_ folder with the following subfolders.
40 | 41 | ```bash 42 | ./preprocess 43 | |-- bin 44 | | `-- liftOver 45 | `-- data 46 | |-- human-rhesus 47 | | |-- GTFsummaries 48 | | | |-- onlyOrthologAndCodingGenes 49 | | | | |-- org1-allExons-GTFparsed.txt 50 | | | | |-- org1-allGenes-GTFparsed.txt 51 | | | | |-- org1-allTranscripts-GTFparsed.txt 52 | | | | |-- org2-allExons-GTFparsed.txt 53 | | | | |-- org2-allGenes-GTFparsed.txt 54 | | | | `-- org2-allTranscripts-GTFparsed.txt 55 | | | |-- org1-allExons-GTFparsed.txt 56 | | | |-- org1-allGenes-GTFparsed.txt 57 | | | |-- org1-allTranscripts-GTFparsed.txt 58 | | | |-- org2-allExons-GTFparsed.txt 59 | | | |-- org2-allGenes-GTFparsed.txt 60 | | | `-- org2-allTranscripts-GTFparsed.txt 61 | | |-- ensemblDownloads 62 | | | |-- org1.gtf 63 | | | |-- org1.gtf.gz 64 | | | |-- org1_homolog_org2.txt 65 | | | |-- org1_homolog_org2.txt.gz 66 | | | |-- org2.gtf 67 | | | |-- org2.gtf.gz 68 | | | |-- org2_homolog_org1.txt 69 | | | `-- org2_homolog_org1.txt.gz 70 | | |-- genePairsSummary-one2one.txt 71 | | |-- genomedataArchives 72 | | | |-- org1 [27 entries exceeds filelimit, not opening dir] 73 | | | `-- org2 [23 entries exceeds filelimit, not opening dir] 74 | | |-- liftoverRelatedFiles [56 entries exceeds filelimit, not opening dir] 75 | | |-- perExonLiftoverCoords 76 | | | |-- org1 [619127 entries exceeds filelimit, not opening dir] 77 | | | `-- org2 [260616 entries exceeds filelimit, not opening dir] 78 | | `-- perGenePairPickledInfo [16150 entries exceeds filelimit, not opening dir] 79 | |-- liftover_chains 80 | | |-- hg38 81 | | | `-- liftOver 82 | | | `-- hg38ToRheMac10.over.chain.gz 83 | | `-- rheMac10 84 | | `-- liftOver 85 | | `-- rheMac10ToHg38.over.chain.gz 86 | `-- reference_genomes 87 | |-- hg38 [27 entries exceeds filelimit, not opening dir] 88 | `-- rheMac10 [24 entries exceeds filelimit, not opening dir] 89 | ``` 90 | 91 | ##### The whole process should take several hours to complete! 92 | ##### [(Check also the Human-Mouse data processing steps)](https://github.com/ay-lab/ExTraMapper/tree/master/Human-Mouse-Preprocess-Data) 93 | -------------------------------------------------------------------------------- /Human-Monkey-Processed-Data/config.human-monkey.conf: -------------------------------------------------------------------------------- 1 | # reference genome versions 2 | ref1=hg38 3 | ref2=rheMac10 4 | 5 | # short names of organisms 6 | org1=human 7 | org2=rhesus 8 | 9 | # Ensembl release version number to be used for both organisms 10 | releaseNo=102 11 | 12 | # Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/ 13 | org1EnsemblName=homo_sapiens 14 | org2EnsemblName=macaca_mulatta 15 | 16 | # Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_81 17 | org1EnsemblMartName=hsapiens 18 | org2EnsemblMartName=mmulatta 19 | org1EnsemblMartNameShort=hsap 20 | org2EnsemblMartNameShort=mmul 21 | 22 | #liftOver executable path (Please make sure it is executable, chmod u+x liftOver) 23 | liftOver=/Human-Monkey-Preprocess-Data/scripts/liftOver 24 | -------------------------------------------------------------------------------- /Human-Monkey-Processed-Data/extMpreprocess: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## This script will download and preprocess the dataset required for 4 | ## exon-pair and transcript pair finding by ExTraMapper. 
5 | ## The script requires a config.conf file which will direct this script 6 | ## to download and process the essential data. 7 | 8 | ##################### config.conf file ##################### 9 | ## Example of human-monkey confif.conf file: 10 | ## 11 | ## #Reference genome versions 12 | ## ref1=hg38 13 | ## ref2=rheMac10 14 | ## 15 | ## #Short names of organisms 16 | ## org1=human 17 | ## org2=rhesus 18 | ## 19 | ## #Ensembl release version number to be used for both organisms 20 | ## releaseNo=102 21 | ## 22 | ## #Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/ 23 | ## org1EnsemblName=homo_sapiens 24 | ## org2EnsemblName=macaca_mulatta 25 | ## 26 | ## #Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_102 27 | ## org1EnsemblMartName=hsapiens 28 | ## org2EnsemblMartName=mmulatta 29 | ## org1EnsemblMartNameShort=hsap 30 | ## org2EnsemblMartNameShort=mmul 31 | ## 32 | ## #liftOver executable path (Check here https://hgdownload.cse.ucsc.edu/admin/exe) 33 | ## liftOver=./usr/bin/liftOver 34 | ## 35 | ## 36 | ## Example of human-mouse confif.conf file: 37 | ## 38 | ## #Reference genome versions 39 | ## ref1=hg38 40 | ## ref2=mm10 41 | ## 42 | ## #Short names of organisms 43 | ## org1=human 44 | ## org2=mouse 45 | ## 46 | ## #Ensembl release version number to be used for both organisms 47 | ## releaseNo=102 48 | ## 49 | ## #Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/ 50 | ## org1EnsemblName=homo_sapiens 51 | ## org2EnsemblName=mus_musculus 52 | ## 53 | ## #Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_102 54 | ## org1EnsemblMartName=hsapiens 55 | ## org2EnsemblMartName=mmusculus 56 | ## org1EnsemblMartNameShort=hsap 57 | ## org2EnsemblMartNameShort=mmus 58 | ## 59 | ## #liftOver executable path (Check here https://hgdownload.cse.ucsc.edu/admin/exe) 60 | ## liftOver=./usr/bin/liftOver 61 | ## 62 | ############################################################ 63 | 64 | if ($#ARGV == -1 || $ARGV[0] eq "help") { 65 | print ("\n"); 66 | print ("This script will download and preprocess the dataset required for exon-pair and transcript pair finding by ExTraMapper.\n"); 67 | print ("Type ./extMpreprocess to execute the script.\n"); 68 | print ("Type ./extMpreprocess example to print a example config.conf file.\n\n"); 69 | print ("This script will run seven (7) sequential steps to create the inputs for ExTraMapper program.\n"); 70 | print ("Users can provide step numbers (1-7) or all in the arugemt of this script.\n"); 71 | print ("Short description of the individual scripts:\n"); 72 | print ("Step 1: Download per organism specific files e.g. 
reference genomes, gene annotation files.\n"); 73 | print ("Step 2: Will create genomedata archives with the genomes of org1 and org2 (Make sure to install genomedata package).\n"); 74 | print ("Step 3: Pickle files for each homologous gene pair will be created.\n"); 75 | print ("Step 4: Perform coordinate liftOver of exons with multiple mappings (This step requires bedtools and liftOver executables).\n"); 76 | print ("Step 5-7: postprocessing the liftOver files.\n"); 77 | print ("\n"); 78 | exit(); 79 | } elsif ($ARGV[0] eq "example") { 80 | my @exmpl = "# reference genome versions 81 | ref1=hg38 82 | ref2=mm10 83 | 84 | # short names of organisms 85 | org1=human 86 | org2=mouse 87 | 88 | # Ensembl release version number to be used for both organisms 89 | releaseNo=102 90 | 91 | # Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/ 92 | org1EnsemblName=homo_sapiens 93 | org2EnsemblName=mus_musculus 94 | 95 | # Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_81 96 | org1EnsemblMartName=hsapiens 97 | org2EnsemblMartName=mmusculus 98 | org1EnsemblMartNameShort=hsap 99 | org2EnsemblMartNameShort=mmus 100 | 101 | #liftOver executable path (Check here https://hgdownload.cse.ucsc.edu/admin/exe) 102 | liftOver=/usr/bin/liftOver\n"; 103 | 104 | print (@exmpl); 105 | print ("\n"); 106 | open (out, ">config.human-mouse.conf"); 107 | print out @exmpl; 108 | close out; 109 | print ("The example config.human-mouse.conf file is written\n"); 110 | exit; 111 | } 112 | my ($configfile, $step) = @ARGV; 113 | chomp ($configfile, $step); 114 | 115 | #### File and folder check #### 116 | die "The $configfile does not exists, exit!" 
unless -e "$configfile"; 117 | 118 | 119 | #### Get the environmental variables #### 120 | $ENV{'EXTRAMAPPER_DIR'} = $ENV{'PWD'}; 121 | open(in, $configfile); 122 | while (my $var = ) { 123 | chomp $var; 124 | if ($var =~ /=/) { 125 | $var_n = (split(/=/,$var))[0]; 126 | $var_v = (split(/=/,$var))[1]; 127 | $ENV{$var_n} = $var_v; 128 | } 129 | } 130 | close in; 131 | 132 | #### Set the variable folders and files #### 133 | $dataDir = "$ENV{'EXTRAMAPPER_DIR'}/preprocess/data"; 134 | $dataDirPerPair = "$ENV{'EXTRAMAPPER_DIR'}/preprocess/data/$ENV{'org1'}-$ENV{'org2'}"; 135 | $referenceGenomesDir = "$dataDir/reference_genomes"; 136 | $chainsDir = "$dataDir/liftover_chains"; 137 | $ensemblDir = "$dataDirPerPair/ensemblDownloads"; 138 | $genomedataDir = "$dataDirPerPair/genomedataArchives"; 139 | $GTFsummaryDir = "$dataDirPerPair/GTFsummaries"; 140 | $perGenePairPickleDir= "$dataDirPerPair/perGenePairPickledInfo"; 141 | $liftOverFilesDir = "$dataDirPerPair/liftoverRelatedFiles"; 142 | $perExonLiftoverDir = "$dataDirPerPair/perExonLiftoverCoords"; 143 | 144 | #### Main functions and sub-routines #### 145 | sub getfasta { 146 | my $path = $_[0]; 147 | my $org = $_[1]; 148 | my %chr; 149 | open(chrname,"$path/$org/name_chr.txt"); 150 | while ( ){ 151 | chomp $_; 152 | $chr{$_} = 1; 153 | } 154 | close (chrname); 155 | 156 | my $file = "$path/$org/$org.fa.gz"; 157 | open(in, "zcat $file |"); 158 | while ( ) { 159 | chomp $_; 160 | if ($_ =~ />/) { 161 | $name = $_; 162 | $ckpt = 0; 163 | $name =~ s/>//g; 164 | if ($chr{$name} ne "") { 165 | print ("Extracting $name from $org.fa.gz file\n"); 166 | $ckpt = 1; 167 | open($out,"|gzip -c > $path/$org/$name.fa.gz"); 168 | print $out (">$name\n"); 169 | } else { 170 | close ($out); 171 | } 172 | } else { 173 | if ($ckpt == 1) { 174 | print $out ("$_\n"); 175 | } 176 | } 177 | } 178 | close(in); 179 | system("rm -rf $path/$org/$org.fa.gz"); 180 | print ("Finished extracting chromosomes and writing the individual *.fa.gz files\n"); 181 | print ("Removed $path/$org/$org.fa.gz\n"); 182 | } 183 | 184 | sub downloadrefgenome { 185 | 186 | my $path = $_[0]; 187 | my $org = $_[1]; 188 | if (!-d "$path/$org") { 189 | print ("Creating $path/$org folder\n"); 190 | system("mkdir -p $path/$org"); 191 | print ("Running: wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/chromosomes/* --directory-prefix=$path/$org 2>&1 | grep \"Login incorrect\"\n"); 192 | my $error = `wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/chromosomes/* --directory-prefix=$path/$org 2>&1 | grep "No such directory"`; 193 | if ($error =~ "No such directory") { 194 | print ("There is no chromosome folder for $org. 
So, downloding the bigZip file and extracting them\n"); 195 | print ("Running: wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/bigZips/$org.fa.gz --directory-prefix=$path/$org 2> /dev/null\n"); 196 | system("wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/bigZips/$org.fa.gz --directory-prefix=$path/$org 2> /dev/null"); 197 | print ("Extracting the individual chromosomes\n"); 198 | print ("zcat $path/$org/$org.fa.gz |grep \">\" |grep -v \"_random\" |grep -v \"chrUn\" |sed 's/>//g' > $path/$org/name_chr.txt\n"); 199 | system("zcat $path/$org/$org.fa.gz |grep \">\" |grep -v \"_random\" |grep -v \"chrUn\" |sed 's/>//g' > $path/$org/name_chr.txt"); 200 | getfasta($path, $org); 201 | print "Reference genomes are downloaded in $path/$org\n"; 202 | } else { 203 | system("rm -rf $path/$org/*_random*"); 204 | system("rm -rf $path/$org/chrUn*"); 205 | system("rm -rf $path/$org/*_alt*"); 206 | } 207 | } else { 208 | print ("$path/$org folder already exists, skipping downloading the dataset\n"); 209 | } 210 | } 211 | 212 | sub downloadliftoverfiles { 213 | 214 | my $path = $_[0]; 215 | my $org1 = $_[1]; 216 | my $org2 = $_[2]; 217 | if (!-d "$path/$org1/liftOver") { 218 | print ("Creating $path/$org1/liftOver folder\n"); 219 | system("mkdir -p $path/$org1/liftOver"); 220 | my $ref2Cap =`echo $org2 | python -c "s=input(); print (s[0].upper()+s[1:])"`; 221 | chomp $ref2Cap; 222 | my $chain_name = $org1."To".$ref2Cap; 223 | print ("Running: wget http://hgdownload.cse.ucsc.edu/goldenPath/$org1/liftOver/$chain_name.over.chain.gz --directory-prefix=$path/$org1/liftOver\n"); 224 | system("wget http://hgdownload.cse.ucsc.edu/goldenPath/$org1/liftOver/$chain_name.over.chain.gz --directory-prefix=$path/$org1/liftOver 2> /dev/null"); 225 | print ("LiftOver chain saved to $path/$org1/liftOver/$chain_name.over.chain.gz\n"); 226 | } else { 227 | print ("$path/$org1 folder already exists, skipping download\n"); 228 | } 229 | } 230 | 231 | sub downloadensmblfiles { 232 | 233 | my $path = $_[0]; 234 | my $releaseNo = $_[1]; 235 | my $org1EnsemblName = $_[2]; 236 | my $org1EnsemblMartName = $_[3]; 237 | my $org2EnsemblName = $_[4]; 238 | my $org2EnsemblMartName = $_[5]; 239 | 240 | print ("Downloading GTF files\n"); 241 | if (!-e "$path/org1.gtf.gz") { 242 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org1EnsemblName/*.$releaseNo.gtf.gz -O $path/org1.gtf.gz\n"); 243 | system("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org1EnsemblName/*.$releaseNo.gtf.gz -O $path/org1.gtf.gz 2> /dev/null"); 244 | print ("GTF files downloaded in $path\n"); 245 | } else { 246 | print ("$path/org1.gtf.gz file exists, skipping download\n"); 247 | } 248 | if (!-e "$path/org2.gtf.gz") { 249 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org2EnsemblName/*.$releaseNo.gtf.gz -O $path/org2.gtf.gz\n"); 250 | system("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org2EnsemblName/*.$releaseNo.gtf.gz -O $path/org2.gtf.gz 2> /dev/null"); 251 | print ("GTF files downloaded in $path\n"); 252 | } else { 253 | print ("$path/org2.gtf.gz file exists, skipping download\n"); 254 | } 255 | 256 | print ("Downloading ENSEMBL homologs\n"); 257 | if (!-e "$path/org1_homolog_org2.txt.gz") { 258 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org1EnsemblMartName\_gene_ensembl__homolog_$org2EnsemblMartName\__dm.txt.gz -O $path/org1_homolog_org2.txt.gz\n"); 259 | system("wget 
ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org1EnsemblMartName\_gene_ensembl__homolog_$org2EnsemblMartName\__dm.txt.gz -O $path/org1_homolog_org2.txt.gz 2> /dev/null"); 260 | print ("ENSEMBL homolog downloaded in $path\n"); 261 | } else { 262 | print ("$path/org1_homolog_org2.txt.gz file exists, skipping download\n"); 263 | } 264 | 265 | if (!-e "$path/org2_homolog_org1.txt.gz") { 266 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org2EnsemblMartName\_gene_ensembl__homolog_$org1EnsemblMartName\__dm.txt.gz -O $path/org2_homolog_org1.txt.gz\n"); 267 | system("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org2EnsemblMartName\_gene_ensembl__homolog_$org1EnsemblMartName\__dm.txt.gz -O $path/org2_homolog_org1.txt.gz 2> /dev/null"); 268 | print ("ENSEMBL homolog downloaded in $path\n"); 269 | } else { 270 | print ("$path/org2_homolog_org1.txt.gz file exists, skipping download\n"); 271 | } 272 | 273 | } 274 | 275 | sub ltime { 276 | 277 | my $time = localtime; 278 | return($time); 279 | } 280 | 281 | sub genomedataarchive { 282 | 283 | my $path = $_[0]; 284 | my $org = $_[1]; 285 | my $ref = $_[2]; 286 | my $referenceGenomesDir = $_[3]; 287 | my $old_path = $ENV{'PWD'}; 288 | chdir $path; 289 | if (-e "$ref.fa") { 290 | print ("Deleting the existing $ref.fa\n"); 291 | system("rm -rf $ref.fa"); 292 | } 293 | if (!-d $org) { 294 | print ("Running : zcat $referenceGenomesDir/$ref/*.fa.gz > $ref.fa\n"); 295 | print ("Started at ",ltime(),"\n"); 296 | system("zcat $referenceGenomesDir/$ref/*.fa.gz > $ref.fa"); 297 | print ("Ended at ",ltime(),"\n"); 298 | print ("Running : genomedata-load-seq -d $org $ref.fa\n"); 299 | print ("Started at ",ltime(),"\n"); 300 | system("genomedata-load-seq -d $org $ref.fa"); 301 | system("genomedata-close-data $org"); 302 | print ("Ended at ",ltime(),"\n"); 303 | system("rm -rf $ref.fa"); 304 | } else { 305 | print ("$org genomedata exists, skipping the step\n"); 306 | } 307 | chdir $old_path; 308 | } 309 | 310 | sub parseAndPicklePerPair { 311 | 312 | my $extmapper_path = $_[0]; 313 | my $ensemblDir = $_[1]; 314 | my $dataDirPerPair = $_[2]; 315 | my $GTFsummaryDir = $_[3]; 316 | my $perGenePairPickleDir = $_[4]; 317 | 318 | if (!-e "$ensemblDir/org1.gtf") { 319 | print ("Running : gunzip -k $ensemblDir/org1.gtf.gz\n"); 320 | system("gunzip -k $ensemblDir/org1.gtf.gz"); 321 | } else { 322 | print ("$ensemblDir/org1.gtf file present, skipping gunzip action\n"); 323 | } 324 | if (!-e "$ensemblDir/org2.gtf") { 325 | print ("Running : gunzip -k $ensemblDir/org2.gtf.gz\n"); 326 | system("gunzip -k $ensemblDir/org2.gtf.gz"); 327 | } else { 328 | print ("$ensemblDir/org2.gtf file present, skipping gunzip action\n"); 329 | } 330 | if (!-e "$ensemblDir/org1_homolog_org2.txt") { 331 | print ("Running : gunzip -k $ensemblDir/org1_homolog_org2.txt.gz\n"); 332 | system("gunzip -k $ensemblDir/org1_homolog_org2.txt.gz"); 333 | } else { 334 | print ("$ensemblDir/org1_homolog_org2.txt file present, skipping gunzip action\n"); 335 | } 336 | if (!-e "$ensemblDir/org2_homolog_org1.txt") { 337 | print ("Running : gunzip -k $ensemblDir/org2_homolog_org1.txt.gz\n"); 338 | system("gunzip -k $ensemblDir/org2_homolog_org1.txt.gz"); 339 | } else { 340 | print ("$ensemblDir/org2_homolog_org1.txt file present, skipping gunzip action\n"); 341 | } 342 | 343 | if (!-d $perGenePairPickleDir) { 344 | print ("Running : python $extmapper_path/scripts/parseAndPicklePerPair.py $dataDirPerPair 
$GTFsummaryDir $perGenePairPickleDir\n"); 345 | print ("Started at ",ltime(),"\n"); 346 | system("python $extmapper_path/scripts/parseAndPicklePerPair.py $dataDirPerPair $GTFsummaryDir $perGenePairPickleDir"); 347 | print ("Ended at ",ltime(),"\n"); 348 | system("mv $perGenePairPickleDir/genePairsSummary-one2one.txt $dataDirPerPair/genePairsSummary-one2one.txt"); 349 | } else { 350 | print ("perGenePairPickleDir found, skipping\n"); 351 | } 352 | } 353 | 354 | sub liftoverexonmultiplemapping { 355 | 356 | my $GTFsummaryDir = $_[0]; 357 | my $liftOverFilesDir = $_[1]; 358 | my $chainsDir = $_[2]; 359 | my $ref1 = $_[3]; 360 | my $ref2 = $_[4]; 361 | my $extmapper_path = $_[5]; 362 | 363 | my $indir = "$GTFsummaryDir/onlyOrthologAndCodingGenes"; 364 | 365 | print ("Running : cat $indir/org1-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_allExonsList.bed\n"); 366 | print ("Started at ",ltime(),"\n"); 367 | system("cat $indir/org1-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_allExonsList.bed"); 368 | print ("Ended at ",ltime(),"\n"); 369 | 370 | print ("Running : cat $indir/org2-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allExonsList.bed\n"); 371 | print ("Started at ",ltime(),"\n"); 372 | system("cat $indir/org2-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allExonsList.bed"); 373 | print ("Ended at ",ltime(),"\n"); 374 | 375 | print ("Running : cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_partCodingExonsList.bed\n"); 376 | print ("Started at ",ltime(),"\n"); 377 | system("cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_partCodingExonsList.bed"); 378 | print ("Ended at ",ltime(),"\n"); 379 | 380 | print ("Running : cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_partCodingExonsList.bed\n"); 381 | print ("Started at ",ltime(),"\n"); 382 | system("cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_partCodingExonsList.bed"); 383 | print ("Ended at ",ltime(),"\n"); 384 | 385 | print ("Running : cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org1_f.temp\n"); 386 | print ("Started at ",ltime(),"\n"); 387 | system("cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org1_f.temp"); 388 | print ("Ended at ",ltime(),"\n"); 389 | 390 | print ("Running : cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org2_f.temp\n"); 391 | print ("Started at ",ltime(),"\n"); 392 | system("cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org2_f.temp"); 393 | print ("Ended at ",ltime(),"\n"); 394 | 395 | print ("Running : cat $liftOverFilesDir/org1_partCodingExonsList.bed $liftOverFilesDir/org1_f.temp | sort 
-k1,1 -k2,2n > $liftOverFilesDir/org1_allCodingExonsList.bed\n"); 396 | print ("Started at ",ltime(),"\n"); 397 | system("cat $liftOverFilesDir/org1_partCodingExonsList.bed $liftOverFilesDir/org1_f.temp | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_allCodingExonsList.bed"); 398 | print ("Ended at ",ltime(),"\n"); 399 | 400 | print ("Running : cat $liftOverFilesDir/org2_partCodingExonsList.bed $liftOverFilesDir/org2_f.temp | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allCodingExonsList.bed\n"); 401 | print ("Started at ",ltime(),"\n"); 402 | system("cat $liftOverFilesDir/org2_partCodingExonsList.bed $liftOverFilesDir/org2_f.temp | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allCodingExonsList.bed"); 403 | print ("Ended at ",ltime(),"\n"); 404 | 405 | print ("Running : cat $liftOverFilesDir/org1_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allCodingExonsList.sorted.temp\n"); 406 | print ("Started at ",ltime(),"\n"); 407 | system("cat $liftOverFilesDir/org1_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allCodingExonsList.sorted.temp"); 408 | print ("Ended at ",ltime(),"\n"); 409 | 410 | print ("Running : cat $liftOverFilesDir/org2_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allCodingExonsList.sorted.temp\n"); 411 | print ("Started at ",ltime(),"\n"); 412 | system("cat $liftOverFilesDir/org2_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allCodingExonsList.sorted.temp"); 413 | print ("Ended at ",ltime(),"\n"); 414 | 415 | print ("Running : cat $liftOverFilesDir/org1_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allExonsList.sorted.temp\n"); 416 | print ("Started at ",ltime(),"\n"); 417 | system("cat $liftOverFilesDir/org1_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allExonsList.sorted.temp"); 418 | print ("Ended at ",ltime(),"\n"); 419 | 420 | print ("Running : cat $liftOverFilesDir/org2_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allExonsList.sorted.temp\n"); 421 | print ("Started at ",ltime(),"\n"); 422 | system("cat $liftOverFilesDir/org2_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allExonsList.sorted.temp"); 423 | print ("Ended at ",ltime(),"\n"); 424 | 425 | print ("Running : cat $liftOverFilesDir/org1_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_partCodingExonsList.sorted.temp\n"); 426 | print ("Started at ",ltime(),"\n"); 427 | system("cat $liftOverFilesDir/org1_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_partCodingExonsList.sorted.temp"); 428 | print ("Ended at ",ltime(),"\n"); 429 | 430 | print ("Running : cat $liftOverFilesDir/org2_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_partCodingExonsList.sorted.temp\n"); 431 | print ("Started at ",ltime(),"\n"); 432 | system("cat $liftOverFilesDir/org2_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_partCodingExonsList.sorted.temp"); 433 | print ("Ended at ",ltime(),"\n"); 434 | 435 | my $chain1to2=`ls $chainsDir/$ref1/liftOver/*.over.chain.gz`; 436 | my $chain2to1=`ls $chainsDir/$ref2/liftOver/*.over.chain.gz`; 437 | chomp ($chain1to2, $chain2to1); 438 | 439 | foreach my $minMatch (qw{1 0.95 0.9}) { 440 | print ("Running : $extmapper_path/scripts/liftover-withMultiples 0 $minMatch $chain1to2 
$chain2to1\n"); 441 | print ("Started at ",ltime(),"\n"); 442 | system("$extmapper_path/scripts/liftover-withMultiples 0 $minMatch $chain1to2 $chain2to1"); 443 | print ("Ended at ",ltime(),"\n"); 444 | } 445 | system("rm -rf $liftOverFilesDir/org2_allExonsList.sorted.temp"); 446 | system("rm -rf $liftOverFilesDir/org1_allExonsList.sorted.temp"); 447 | system("rm -rf $liftOverFilesDir/org2_partCodingExonsList.sorted.temp"); 448 | system("rm -rf $liftOverFilesDir/org1_partCodingExonsList.sorted.temp"); 449 | system("rm -rf $liftOverFilesDir/org2_allCodingExonsList.sorted.temp"); 450 | system("rm -rf $liftOverFilesDir/org1_allCodingExonsList.sorted.temp"); 451 | } 452 | 453 | sub liftoverfilesprocess { 454 | 455 | my $indir = $_[0]; 456 | my $outdir = $_[1]; 457 | my $flank = $_[2]; 458 | my $extmapper_path = $_[3]; 459 | 460 | if (-e "oneHugeFile-2to1-partCoding.txt") { 461 | system("rm -rf oneHugeFile-2to1-partCoding.txt"); 462 | } 463 | if (-e "oneHugeFile-1to2-partCoding.txt") { 464 | system("rm -rf oneHugeFile-1to2-partCoding.txt"); 465 | } 466 | 467 | foreach my $minMatch (qw{1 0.95 0.9}) { 468 | $suffix="flank$flank-minMatch$minMatch-multiples-partCoding"; 469 | print ("Running : zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt\n"); 470 | print ("Started at ",ltime(),"\n"); 471 | system("zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt"); 472 | print ("Ended at ",ltime(),"\n"); 473 | 474 | print ("Running : zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt\n"); 475 | print ("Started at ",ltime(),"\n"); 476 | system("zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt"); 477 | print ("Started at ",ltime(),"\n"); 478 | } 479 | 480 | if (-e "oneHugeFile-2to1-others.txt") { 481 | system("rm -rf oneHugeFile-2to1-others.txt"); 482 | } 483 | if (-e "oneHugeFile-1to2-others.txt") { 484 | system("rm -rf oneHugeFile-1to2-others.txt"); 485 | } 486 | 487 | foreach my $minMatch (qw{1 0.95 0.9}) { 488 | $suffix="flank$flank-minMatch$minMatch-multiples"; 489 | print ("Running : zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-others.txt\n"); 490 | print ("Started at ",ltime(),"\n"); 491 | system("zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-others.txt"); 492 | print ("Ended at ",ltime(),"\n"); 493 | 494 | print ("Running : zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-1to2-others.txt\n"); 495 | print ("Started at ",ltime(),"\n"); 496 | system("zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' 
s=$suffix >> oneHugeFile-1to2-others.txt"); 497 | print ("Ended at ",ltime(),"\n"); 498 | } 499 | 500 | print ("Running : cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k10,10 >oneHugeFile-1to2.txt.sorted\n"); 501 | print ("Started at ",ltime(),"\n"); 502 | system("cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k10,10 >oneHugeFile-1to2.txt.sorted"); 503 | print ("Ended at ",ltime(),"\n"); 504 | 505 | print ("Running : cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k10,10 >oneHugeFile-2to1.txt.sorted\n"); 506 | print ("Started at ",ltime(),"\n"); 507 | system("cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k10,10 >oneHugeFile-2to1.txt.sorted"); 508 | print ("Ended at ",ltime(),"\n"); 509 | 510 | system("mkdir -p $outdir/org1 $outdir/org2"); 511 | $whichCol=10; 512 | $fileSuffix="_mapped.txt"; 513 | 514 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix\n"); 515 | print ("Started at ",ltime(),"\n"); 516 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix"); 517 | print ("Ended at ",ltime(),"\n"); 518 | 519 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix\n"); 520 | print ("Started at ",ltime(),"\n"); 521 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix"); 522 | print ("Ended at ",ltime(),"\n"); 523 | 524 | print ("Removing temporary files\n"); 525 | system("rm -rf oneHugeFile*.txt"); 526 | 527 | } 528 | 529 | sub liftoverfilesprocessunmappedexons { 530 | 531 | my $indir = $_[0]; 532 | my $outdir = $_[1]; 533 | my $flank = $_[2]; 534 | my $extmapper_path = $_[3]; 535 | 536 | if (-e "oneHugeFile-2to1-partCoding.txt") { 537 | system("rm -rf oneHugeFile-2to1-partCoding.txt"); 538 | } 539 | if (-e "oneHugeFile-1to2-partCoding.txt") { 540 | system("rm -rf oneHugeFile-1to2-partCoding.txt"); 541 | } 542 | 543 | foreach my $minMatch (qw{1 0.95 0.9}) { 544 | $suffix="flank$flank-minMatch$minMatch-multiples-partCoding"; 545 | print ("Running : zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt\n"); 546 | print ("Started at ",ltime(),"\n"); 547 | system("zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt"); 548 | print ("Ended at ",ltime(),"\n"); 549 | 550 | print ("Running : zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt\n"); 551 | print ("Started at ",ltime(),"\n"); 552 | system("zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt"); 553 | print ("Ended at ",ltime(),"\n"); 554 | } 555 | 556 | if (-e "oneHugeFile-2to1-others.txt") { 557 | system("rm -rf oneHugeFile-2to1-others.txt"); 558 | } 559 | if (-e "oneHugeFile-1to2-others.txt") { 560 | system("rm -rf oneHugeFile-1to2-others.txt"); 561 | } 562 | 563 | foreach my $minMatch (qw{1 0.95 0.9}) { 564 | 
$suffix="flank$flank-minMatch$minMatch-multiples"; 565 | print ("Running : zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt\n"); 566 | print ("Started at ",ltime(),"\n"); 567 | system("zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt"); 568 | print ("Ended at ",ltime(),"\n"); 569 | 570 | print ("Running : zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt\n"); 571 | print ("Started at ",ltime(),"\n"); 572 | system("zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt"); 573 | print ("Ended at ",ltime(),"\n"); 574 | } 575 | 576 | print ("Running : cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted\n"); 577 | print ("Started at ",ltime(),"\n"); 578 | system("cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted"); 579 | print ("Ended at ",ltime(),"\n"); 580 | 581 | print ("Running : cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted\n"); 582 | print ("Started at ",ltime(),"\n"); 583 | system("cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted"); 584 | print ("Ended at ",ltime(),"\n"); 585 | 586 | system("mkdir -p $outdir/org1 $outdir/org2"); 587 | $whichCol=5; 588 | $fileSuffix="_unmapped.txt"; 589 | 590 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix\n"); 591 | print ("Started at ",ltime(),"\n"); 592 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix"); 593 | print ("Ended at ",ltime(),"\n"); 594 | 595 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix\n"); 596 | print ("Started at ",ltime(),"\n"); 597 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix"); 598 | print ("Ended at ",ltime(),"\n"); 599 | } 600 | 601 | sub liftoverfilesprocessmappedexons { 602 | 603 | my $indir = $_[0]; 604 | my $outdir = $_[1]; 605 | my $flank = $_[2]; 606 | my $extmapper_path = $_[3]; 607 | 608 | if (-e "oneHugeFile-2to1-others.txt") { 609 | system("rm -rf oneHugeFile-2to1-others.txt"); 610 | } 611 | if (-e "oneHugeFile-1to2-others.txt") { 612 | system("rm -rf oneHugeFile-1to2-others.txt"); 613 | } 614 | 615 | foreach my $minMatch (qw{1 0.95 0.9}) { 616 | $suffix="flank$flank-minMatch$minMatch-multiples"; 617 | print ("Running : zcat $indir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt\n"); 618 | print ("Started at ",ltime(),"\n"); 619 | system("zcat $indir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt"); 620 | print ("Ended at ",ltime(),"\n"); 621 | 622 | print ("Running : zcat $indir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed.gz |awk '{print 
\$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt\n"); 623 | print ("Started at ",ltime(),"\n"); 624 | system("zcat $indir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt"); 625 | print ("Ended at ",ltime(),"\n"); 626 | } 627 | 628 | print ("Running : cat oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted\n"); 629 | print ("Started at ",ltime(),"\n"); 630 | system("cat oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted"); 631 | print ("Ended at ",ltime(),"\n"); 632 | 633 | print ("Running : cat oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted\n"); 634 | print ("Started at ",ltime(),"\n"); 635 | system("cat oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted"); 636 | print ("Ended at ",ltime(),"\n"); 637 | 638 | system("mkdir -p $outdir/org1 $outdir/org2"); 639 | $whichCol=5; 640 | $fileSuffix="_nonintersecting.txt"; 641 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix\n"); 642 | print ("Started at ",ltime(),"\n"); 643 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix"); 644 | print ("Ended at ",ltime(),"\n"); 645 | 646 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix\n"); 647 | print ("Started at ",ltime(),"\n"); 648 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix"); 649 | print ("Ended at ",ltime(),"\n"); 650 | 651 | print ("Removing temporary files\n"); 652 | system("rm -rf oneHugeFile* dummy.txt"); 653 | } 654 | 655 | sub step { 656 | 657 | my $step = $_[0]; 658 | 659 | if ($step == 1 || $step eq "all" || $step eq "All" || $step eq "ALL") { 660 | 661 | print ("Running step 1:\n"); 662 | print ("Downloading per organism specific files and keep the original organism names for future reuse\n"); 663 | print ("Downloading the two reference genomes from UCSC and get rid of unknown, random and alt contigs\n"); 664 | 665 | system("mkdir -p $referenceGenomesDir"); 666 | downloadrefgenome($referenceGenomesDir, $ENV{'ref1'}); 667 | downloadrefgenome($referenceGenomesDir, $ENV{'ref2'}); 668 | 669 | system("mkdir -p $chainsDir"); 670 | downloadliftoverfiles($chainsDir, $ENV{'ref1'}, $ENV{'ref2'}); 671 | downloadliftoverfiles($chainsDir, $ENV{'ref2'}, $ENV{'ref1'}); 672 | 673 | system("mkdir -p $ensemblDir"); 674 | downloadensmblfiles($ensemblDir, $ENV{'releaseNo'}, $ENV{'org1EnsemblName'}, $ENV{'org1EnsemblMartName'}, $ENV{'org2EnsemblName'}, $ENV{'org2EnsemblMartName'}); 675 | print ("---------------------- Step 1 Finished ----------------------\n"); 676 | } 677 | 678 | if ($step == 2 || $step eq "all" || $step eq "All" || $step eq "ALL") { 679 | 680 | print ("Running step 2:\n"); 681 | print ("Initialize the genomedata archives with the genomes of org1 and org2\n"); 682 | print ("Make sure genomedata is installed first\n"); 683 | print ("Installation: pip install genomedata --user\n"); 684 | system("mkdir -p $genomedataDir"); 685 | 686 | genomedataarchive($genomedataDir, "org1", $ENV{'ref1'}, $referenceGenomesDir); 687 | genomedataarchive($genomedataDir, "org2", $ENV{'ref2'}, $referenceGenomesDir); 688 | print ("---------------------- Step 2 
Finished ----------------------\n"); 689 | } 690 | 691 | if ($step == 3 || $step eq "all" || $step eq "All" || $step eq "ALL") { 692 | print ("Running step 3:\n"); 693 | print ("Creating pickle files\n"); 694 | parseAndPicklePerPair($ENV{'EXTRAMAPPER_DIR'}, $ensemblDir, $dataDirPerPair, $GTFsummaryDir, $perGenePairPickleDir); 695 | print ("---------------------- Step 3 Finished ----------------------\n"); 696 | } 697 | 698 | if ($step == 4 || $step eq "all" || $step eq "All" || $step eq "ALL") { 699 | print ("Running step 4:\n"); 700 | print ("liftOver the exon lists but this time allow multiple mappings and also compute intersections with the other set of exons\n"); 701 | system("mkdir -p $liftOverFilesDir"); 702 | system("mkdir -p preprocess/bin"); 703 | if (!-e "./preprocess/bin/liftOver") { 704 | system("ln -s \$(readlink $ENV{liftOver}) ./preprocess/bin"); 705 | } 706 | liftoverexonmultiplemapping($GTFsummaryDir, $liftOverFilesDir, $chainsDir, $ENV{'ref1'}, $ENV{'ref2'}, $ENV{'EXTRAMAPPER_DIR'}); 707 | print ("---------------------- Step 4 Finished ----------------------\n"); 708 | } 709 | 710 | if ($step == 5 || $step eq "all" || $step eq "All" || $step eq "ALL") { 711 | print ("Running step 5:\n"); 712 | print ("Putting together, sorting, making them uniq and then splitting into one file per exon for all the liftover files created so far\n"); 713 | liftoverfilesprocess($liftOverFilesDir, $perExonLiftoverDir, 0, $ENV{'EXTRAMAPPER_DIR'}); 714 | print ("---------------------- Step 5 Finished ----------------------\n"); 715 | } 716 | 717 | if ($step == 6 || $step eq "all" || $step eq "All" || $step eq "ALL") { 718 | print ("Running step 6:\n"); 719 | print ("Putting together, sorting, making them uniq and then splitting into one file per exon for all the liftover files created for UNMAPPED EXONS so far\n"); 720 | liftoverfilesprocessunmappedexons($liftOverFilesDir, $perExonLiftoverDir, 0, $ENV{'EXTRAMAPPER_DIR'}); 721 | print ("---------------------- Step 6 Finished ----------------------\n"); 722 | } 723 | 724 | if ($step == 7 || $step eq "all" || $step eq "All" || $step eq "ALL") { 725 | print ("Runing step 7:\n"); 726 | print ("Putting together, sorting, making them uniq and then splitting into one file per exon for all the liftover files for MAPPED EXONS that DO NOT INTERSECT WITH AN EXON so far\n"); 727 | liftoverfilesprocessmappedexons($liftOverFilesDir, $perExonLiftoverDir, 0, $ENV{'EXTRAMAPPER_DIR'}); 728 | print ("---------------------- Step 7 Finished ----------------------\n"); 729 | print ("Preporcessing steps finished!\n"); 730 | } 731 | } 732 | 733 | step($step); 734 | -------------------------------------------------------------------------------- /Human-Monkey-Processed-Data/scripts/ensemblUtils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################## 3 | ### To use the functions in this lib simply import this python module using 4 | ### import ensemblUtils 5 | ### Then you'll able able to call functions with the proper arguments using 6 | ### returnVal=ensemblUtils.func1(arg1,arg2) 7 | ############################################################################## 8 | ############################################################################## 9 | import sys 10 | import os 11 | 12 | ############################################################## 13 | # Genomedata is off by one, seq[0] returns you the 1st bp. 
14 | # Have to account for this by subtracting one from each seq coordinate 15 | # before retrieving sequence from Genomedata. 16 | # installation: pip install genomedata --user 17 | from genomedata import Genome 18 | ############################################################## 19 | 20 | import string 21 | import math 22 | import gzip 23 | import _pickle as pickle 24 | import numpy as np 25 | 26 | complement = str.maketrans('atcgn', 'tagcn') 27 | 28 | def parse_ensembl_geneAndProtein_pairings(infilename,proteinToGeneDic,proteinPairsDic): 29 | """ 30 | This function parses a given Ensembl file downloaded from below folders 31 | and parses out the gene and protein pairings. 32 | For the first files, say org1 to org2, it only reports protein pairings. 33 | For the second file it reports back the protein and gene pairings using 34 | the protein pairings from the first proteinPairingsSoFar dictionary. 35 | Fields of these files are (as far as I understand): 36 | someScore chr notsure start end orthologytype 37 | 38 | 0.14740 MT 192515 3307 4262 ortholog_one2one 73.39810 Euarchontoglires 77 39 | ENSG00000198888 ENSP00000354687 ENSMUSP00000080991 77 0 40 | 41 | """ 42 | sys.stderr.write("Parsing gene and protein pairings from file "+infilename+"\n") 43 | 44 | isFirstFile=True 45 | if bool(proteinToGeneDic): # empty dic evaluates to False 46 | isFirstFile=False 47 | 48 | genePairsDic={} 49 | if infilename.endswith(".gz"): 50 | infile=gzip.open(infilename,'rb') 51 | else: 52 | infile=open(infilename,'r') 53 | # 54 | 55 | # it doesn't have a header 56 | lineCount=0 57 | for line in infile: 58 | words=line.rstrip().split() 59 | someScore,chr,notsure,st,en,orthologyType,someOtherScore,phylo,\ 60 | gene1PercentIdentity,gene1,protein1,protein2,genename,gene2PercentIdentity,notsure=words ## genename variable added 61 | 62 | # skip the chromosomes that are not 1..23 or X or Y 63 | if chr=="MT" or len(chr)>2: 64 | continue 65 | 66 | if "ortholog" not in orthologyType: 67 | continue 68 | 69 | proteinToGeneDic[protein1]=gene1 70 | if protein1 not in proteinPairsDic: 71 | proteinPairsDic[protein1]=[] 72 | proteinPairsDic[protein1].append(protein2) 73 | 74 | if not isFirstFile: 75 | if protein2 not in proteinToGeneDic: # second gene is not from a 1,22 or X,Y chr 76 | #print [chr,gene1,protein1,protein2] 77 | continue 78 | gene2=proteinToGeneDic[protein2] 79 | # below checks ensure I only get one entry per g1-g2 pair 80 | if gene1 not in genePairsDic: 81 | genePairsDic[gene1]=[] 82 | genePairsDic[gene1].append([gene2,orthologyType,gene1PercentIdentity,gene2PercentIdentity]) 83 | else: 84 | if gene2 not in [a[0] for a in genePairsDic[gene1]]: 85 | genePairsDic[gene1].append([gene2,orthologyType,gene1PercentIdentity,gene2PercentIdentity]) 86 | # 87 | # 88 | types={} 89 | for g1 in genePairsDic: 90 | type=genePairsDic[g1][0][1] 91 | num=len(genePairsDic[g1]) 92 | if type=="ortholog_one2one" and num>1: 93 | sys.exit("Matching should be one2one but it isn't\t"+g1) 94 | if type not in types: 95 | types[type]=0 96 | types[type]+=1 97 | # 98 | infile.close() 99 | 100 | sys.stderr.write("Gene pair mappings summary: "+repr(types)+"\n\n") 101 | 102 | return proteinToGeneDic,genePairsDic,proteinPairsDic 103 | 104 | def parse_ensembl_gene_pairings(infilename): 105 | """ 106 | This function parses a given Ensembl file (comma seperated) 107 | that matches the genes of the first organism to the second. 
108 | """ 109 | sys.stderr.write("Parsing gene pairings from file "+infilename+"\n") 110 | 111 | genePairsDic={} 112 | if infilename.endswith(".gz"): 113 | infile=gzip.open(infilename,'rb') 114 | else: 115 | infile=open(infilename,'r') 116 | # 117 | ######################################################################### 118 | ## Commented by Abhijit 119 | ## The file header is changed so the following section needs to be recoded 120 | ## 121 | ##columnIndices={"GeneID1" : -1, "GeneID2" : -1, "Homology Type" : -1, "Orthology confidence": -1, "Percent Identity" : -1, "Chromosome Name" : []} 122 | ## parse the information header first 123 | ##columnNames=infile.readline().strip().split(",") 124 | ##i=0 #0-based column indices 125 | ##for c in columnNames: 126 | ## if c=="Ensembl Gene ID": 127 | ## columnIndices["GeneID1"]=i 128 | ## elif c.endswith("Ensembl Gene ID"): 129 | ## columnIndices["GeneID2"]=i 130 | ## elif c=="Homology Type": 131 | ## columnIndices["Homology Type"]=i 132 | ## elif "Orthology confidence" in c: 133 | ## columnIndices["Orthology confidence"]=i 134 | ## elif c.endswith("Identity with respect to query gene"): 135 | ## columnIndices["Percent Identity"]=i 136 | ## elif c.endswith("Chromosome Name"): 137 | ## columnIndices["Chromosome Name"].append(i) 138 | ## i+=1 139 | ######################################################################### 140 | 141 | ######## Rewritten ######## 142 | columnIndices={"GeneID1" : -1, "GeneID2" : -1, "Homology Type" : -1, "Orthology confidence": -1, "Percent Identity" : -1, "Chromosome Name1" : -1, "Chromosome Name2" : -1} 143 | ## parse the information header first 144 | columnNames=infile.readline().strip().split(",") 145 | i=0 #0-based column indices 146 | for c in columnNames: 147 | if c=="GeneID1": 148 | columnIndices["GeneID1"]=i 149 | elif c=="GeneID2": 150 | columnIndices["GeneID2"]=i 151 | elif c=="Homology Type": 152 | columnIndices["Homology Type"]=i 153 | elif c=="Orthology confidence": 154 | columnIndices["Orthology confidence"]=i 155 | elif c=="Percent Identity": 156 | columnIndices["Percent Identity"]=i 157 | elif c=="Chromosome Name1": 158 | columnIndices["Chromosome Name1"]=i 159 | elif c=="Chromosome Name2": 160 | columnIndices["Chromosome Name2"]=i 161 | i+=1 162 | # 163 | lineCount=0 164 | for line in infile: 165 | words=line.rstrip().split(",") 166 | # skip the chromosomes that are not 1..23 or X or Y 167 | ##ch1,ch2 = words[columnIndices["Chromosome Name"][0]],words[columnIndices["Chromosome Name"][1]] 168 | ch1,ch2 = words[columnIndices["Chromosome Name1"]],words[columnIndices["Chromosome Name2"]] 169 | if ch1=="MT" or len(ch1)>2 or ch2=="MT" or len(ch2)>2: 170 | continue 171 | # 172 | gene1,gene2= words[columnIndices["GeneID1"]], words[columnIndices["GeneID2"]] 173 | homologyType= words[columnIndices["Homology Type"]] 174 | orthologyConfidence=words[columnIndices["Orthology confidence"]] 175 | percentIdentity=words[columnIndices["Percent Identity"]] 176 | # below checks ensure I only get one entry per g1-g2 pair 177 | if gene1 not in genePairsDic: 178 | genePairsDic[gene1]=[] 179 | genePairsDic[gene1].append([gene2,homologyType,orthologyConfidence,percentIdentity]) 180 | else: 181 | if gene2 not in [a[0] for a in genePairsDic[gene1]]: 182 | genePairsDic[gene1].append([gene2,homologyType,orthologyConfidence,percentIdentity]) 183 | # 184 | # 185 | lineCount+=1 186 | # 187 | #print len(genePairsDic) 188 | types={} 189 | for g1 in genePairsDic: 190 | type=genePairsDic[g1][0][1] 191 | num=len(genePairsDic[g1]) 192 | 
if type=="ortholog_one2one" and num>1: 193 | sys.exit("Matching should be one2one but it isn't\t"+g1) 194 | if type not in types: 195 | types[type]=0 196 | types[type]+=1 197 | # 198 | infile.close() 199 | 200 | sys.stderr.write("Gene pair mappings summary: "+repr(types)+"\n\n") 201 | 202 | return genePairsDic 203 | 204 | 205 | def parse_organism_GTF(orgID, infilename, outdir): 206 | """ 207 | This function parses a given Ensembl GTF file into the 208 | internal data structure for that organism. 209 | Does not output anything if outdir is "None" 210 | """ 211 | sys.stderr.write("Parsing organism GTF for "+orgID+" from file "+infilename+"\n") 212 | geneDic={} 213 | transcriptDic={} 214 | exonDic={} 215 | infoDic={} # build, version, accession 216 | if infilename.endswith(".gz"): 217 | infile=gzip.open(infilename,'rb') 218 | else: 219 | infile=open(infilename,'r') 220 | # parse the information header first 221 | elemCounts={"CDS" : 0, "exon" : 0, "gene" : 0, "start_codon" : 0, 222 | "stop_codon" : 0, "transcript" : 0, "UTR" : 0, "other" : 0} 223 | lineCount=0 224 | lastReadExon="dummy" 225 | for line in infile: 226 | if line.startswith("#"): 227 | key, item = line.split()[:2] 228 | infoDic[key.split("-")[-1]]=item 229 | else: 230 | elemType=line.rstrip().split("\t")[2] 231 | chrName=line.rstrip().split("\t")[0] 232 | if chrName=="MT" or len(chrName)>2: 233 | #print chrName 234 | continue 235 | if elemType=="gene": 236 | newGene=EnsemblGene(line) 237 | geneDic[newGene.basicInfoDic["gene_id"]]=newGene 238 | elif elemType=="transcript": 239 | newTranscript=EnsemblTranscript(line) 240 | transcriptDic[newTranscript.basicInfoDic["transcript_id"]]=newTranscript 241 | geneId=newTranscript.basicInfoDic["gene_id"] 242 | geneDic[geneId].add_transcript(newTranscript) 243 | elif elemType=="exon": 244 | newExon=EnsemblExon(line) 245 | lastReadExon=newExon.basicInfoDic["exon_id"] 246 | 247 | # DELETEME!! 248 | #print("ALL_EXON_ENTRY\t%s\n" % (newExon.get_summary_string())), 249 | 250 | # Make sure to store certain information about the exon if it appears multiple times 251 | if lastReadExon not in exonDic: 252 | exonDic[lastReadExon]=newExon 253 | else: 254 | # DELETEME!! 255 | #print("DUPLICATE_EXON_ENTRY\t%s\t%s\n" % (exonDic[lastReadExon].get_summary_string(),newExon.get_summary_string())), 256 | exonDic[lastReadExon].add_another_instance(newExon) 257 | # 258 | geneId=newExon.basicInfoDic["gene_id"] 259 | transcriptId=newExon.basicInfoDic["transcript_id"] 260 | ## Add the exon to a gene, don't care about ordering and simply owerwrite if exists 261 | geneDic[geneId].add_exon(newExon) 262 | ## Add to exon to a transcript, make sure this exon insertion is ordered 263 | ## Luckily entrys come ordered! 264 | ## also same exon doesn't appear twice in once transcript so that case is not handled 265 | transcriptDic[transcriptId].add_exon(newExon) 266 | ## no need for below line because Python is Pass-by-object-reference 267 | #geneDic[geneId].transcripts[transcriptId].add_exon(newExon) 268 | elif elemType=="CDS": 269 | #meaning previously read exon is completely/partially coding 270 | newLocus=EnsemblLocus(line) 271 | transcriptDic[transcriptId].handle_CDS(newLocus) 272 | exonDic[lastReadExon].handle_CDS(newLocus) 273 | # DELETEME!! 
274 | #print "handleCDS\t%s\t%s\n" % (lastReadExon,newLocus.get_summary_string()), 275 | elif elemType=="UTR" or elemType=="stop_codon" or elemType=="start_codon": 276 | newLocus=EnsemblLocus(line) 277 | transcriptDic[transcriptId].add_locus(newLocus,elemType) 278 | # 279 | if elemType not in elemCounts: 280 | elemType="other" 281 | elemCounts[elemType]=elemCounts[elemType]+1 282 | lineCount+=1 283 | if lineCount%100000==0: 284 | sys.stderr.write(str(lineCount)+"\t") 285 | # 286 | # 287 | sys.stderr.write("\n") 288 | sys.stderr.write("GTF parsing summary: " +repr(elemCounts)+"\n\n") 289 | infile.close() 290 | if outdir!="None": 291 | print_some_summary(orgID, geneDic,transcriptDic,exonDic,elemCounts, outdir) 292 | return (geneDic,transcriptDic,exonDic,infoDic) 293 | 294 | 295 | def print_some_summary(orgID, geneDic,transcriptDic,exonDic,elemCounts, outdir): 296 | """ 297 | Print a summary for the genes, transcripts and exons in the 298 | read GTF file. 299 | """ 300 | outfile=open(outdir+"/"+orgID+"-allGenes-GTFparsed.txt",'w') 301 | outfile.write("chrName\tstartCoord\tendCoord\tstrand\tgeneID\tgeneName\tgeneType\tnoOfTranscripts\tnoOfExons\telementType\n") 302 | for g in geneDic: 303 | outfile.write(geneDic[g].get_summary_string()+"\n") 304 | #print geneDic[g].get_summary_string() 305 | # 306 | outfile.close() 307 | 308 | totalNumberOfExons=0 309 | outfile=open(outdir+"/"+orgID+"-allTranscripts-GTFparsed.txt",'w') 310 | outfile.write("chrName\tstartCoord\tendCoord\tstrand\ttranscriptID\ttranscriptName\ttranscriptType\tgeneID\tgeneName\texonIDs\texonTypes\texonStarts\texonEnds\tcodingStarts\tcodingEnds\tstartCodon\tstopCodon\tUTRstarts\tUTRends\tproteinID\telementType\n") 311 | for t in transcriptDic: 312 | outfile.write(transcriptDic[t].get_summary_string()+"\n") 313 | totalNumberOfExons+=len(transcriptDic[t].exon_types) 314 | #print transcriptDic[t].get_summary_string() 315 | # 316 | outfile.close() 317 | # print ["totalNumberOfExons", totalNumberOfExons] 318 | 319 | outfile=open(outdir+"/"+orgID+"-allExons-GTFparsed.txt",'w') 320 | outfile.write("chrName\tstartCoord\tendCoord\tstrand\texonID\texonType\tcodingStart\tcodingEnd\ttranscriptIDs\texonNumbers\tgeneID\texonLength\tacceptor2bp\tdonor2bp\tavgCodingConsScore\tavgConsScore\tfirstMidLastCounts\telementType\n") 321 | for e in exonDic: 322 | outfile.write(exonDic[e].get_summary_string()+"\n") 323 | #print exonDic[e].get_summary_string() 324 | # 325 | outfile.close() 326 | 327 | return 328 | 329 | def overlapping_combined( orig_data, reverse = False): 330 | """ 331 | Return list of intervals with overlapping neighbours merged together 332 | Assumes sorted intervals unless reverse is set 333 | 334 | """ 335 | if not orig_data or not len(orig_data): return [] 336 | if len(orig_data) == 1: 337 | return orig_data 338 | 339 | new_data = [] 340 | 341 | if reverse: 342 | data = orig_data[:] 343 | data.reverse() 344 | else: 345 | data = orig_data 346 | 347 | if not data[0][0] <= data[1][0]: 348 | print((data, reverse)) 349 | assert(data[0][0] <= data[1][0]) 350 | 351 | # start with the first interval 352 | prev_beg, prev_end = data[0] 353 | 354 | # check if any subsequent intervals overlap 355 | for beg, end in data[1:]: 356 | if beg - prev_end + 1 > 0: 357 | new_data.append((prev_beg, prev_end)) 358 | prev_beg = beg 359 | prev_end = max(end, prev_end) 360 | 361 | new_data.append((prev_beg, prev_end)) 362 | 363 | if reverse: 364 | new_data.reverse() 365 | return new_data 366 | 367 | 368 | def get_overlap_between_intervals(a, b): 369 | """ 370 
| Finds the overlap between two intervals, end points inclusive.
371 | #Makes sure not to report overlap beyond either interval length.
372 | # a=[10,20]; b=[10,20] --> f(a,b)=11 (end points inclusive)
373 | # a=[10,20]; b=[20,30] --> f(a,b)=1
374 | a=[10,20]; b=[15,30] --> f(a,b)=6
375 | """
376 | #lena=abs(float(a[1])-float(a[0]))
377 | #lenb=abs(float(b[1])-float(b[0]))
378 | overlap=max(0, min(float(a[1]), float(b[1])) - max(float(a[0]), float(b[0]))+1)
379 | #minlen=min(lena,lenb)
380 | #return min(minlen,overlap)
381 | return overlap
382 | 
383 | def sort_by_column(somelist, n):
384 | """
385 | Given a list with 1 or more columns this function sorts it according
386 | to the desired column n [0, len(list)). Does this in-place.
387 | """
388 | somelist[:] = [(x[n], x) for x in somelist]
389 | somelist.sort()
390 | somelist[:] = [val for (key, val) in somelist]
391 | return
392 | 
393 | def chr_name_conversion(chrIn,org):
394 | """
395 | Given an identifier for the chromosome name (str) or a chromosome number (int)
396 | and an organism this function converts the identifier to the other representation.
397 | Example:
398 | converts 'chrX' or 'X' to 23 for human
399 | converts 23 to 'chrX' and 1 to 'chr1' for human
400 | """
401 | if isinstance(chrIn, int): # int to str
402 | if org=='human':
403 | if chrIn<23 and chrIn>0:
404 | chrOut='chr'+str(chrIn)
405 | elif chrIn==23:
406 | chrOut='chrX'
407 | elif chrIn==24:
408 | chrOut='chrY'
409 | else:
410 | return 'problem'
411 | elif org=='mouse':
412 | if chrIn<20 and chrIn>0:
413 | chrOut='chr'+str(chrIn)
414 | elif chrIn==20:
415 | chrOut='chrX'
416 | elif chrIn==21:
417 | chrOut='chrY'
418 | else:
419 | return 'problem'
420 | else:
421 | chrOut='chr'+str(chrIn)
422 | else: # str to int
423 | if 'chr' in chrIn:
424 | chrIn=chrIn[3:] # cut the 'chr' prefix
425 | if org=='human':
426 | if chrIn=='X':
427 | chrOut=23
428 | elif chrIn=='Y':
429 | chrOut=24
430 | else:
431 | chrOut=int(chrIn)
432 | elif org=='mouse':
433 | if chrIn=='X':
434 | chrOut=20
435 | elif chrIn=='Y':
436 | chrOut=21
437 | else:
438 | chrOut=int(chrIn)
439 | return chrOut
440 | 
441 | 
442 | 
443 | ################################# BEGIN ExtendedExon ##################################
444 | ### NOT USED FOR NOW, NOT YET IMPLEMENTED ####
445 | class ExtendedExon:
446 | """
447 | This class is a container for exons that combines input from multiple different
448 | sources/files. Below is a list of these sources:
449 | - Ensembl Exon: This will initiate the instance of the ExtendedExon class
450 | - LiftOver Files:
451 | - Genomedata Archive:
452 | - PhastCons Scores:
453 | - BLAT within species:
454 | """
455 | def __init__(self, ensemblExon):
456 | self.basicInfoDic= ensemblExon.basicInfoDic
457 | ################################# END ExtendedExon ##################################
458 | 
459 | 
460 | 
461 | ################################# BEGIN EnsemblExon ##################################
462 | class EnsemblExon:
463 | """
464 | This class is a container for Ensembl exons
465 | """
466 | def __init__(self, line):
467 | # parse the exon line
468 | chr,d,elemType,start_coord,end_coord,d,strand,d,others=line.rstrip().split("\t")
469 | if elemType!="exon":
470 | sys.exit("Not an exon parsed in class EnsemblExon:\t"+elemType)
471 | # 
472 | #basic information about the exon
473 | self.basicInfoDic={"chromosome" : chr, "start_coord" : int(start_coord), "end_coord" : int(end_coord), "strand" : strand}
474 | 
475 | # there are 13 or more items for exons. We keep only 7 relevant ones. 
476 | # e.g: gene_id "ENSG00000167468"; gene_version "14"; 477 | # transcript_id "ENST00000593032"; transcript_version "3"; exon_number "2"; 478 | # gene_name "GPX4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 479 | # transcript_name "GPX4-006"; transcript_source "havana"; transcript_biotype "protein_coding"; 480 | # exon_id "ENSE00003420595"; exon_version "1"; tag "seleno"; tag "cds_end_NF"; 481 | items=others.replace('"', '').split(";") 482 | for item in items: 483 | wds=item.lstrip().split() 484 | if len(wds)>1: 485 | key, val = wds[0],wds[1] 486 | if key in ["gene_id", "gene_name", "transcript_id", "transcript_name", "transcript_biotype", "exon_id", "exon_number"]: 487 | self.basicInfoDic[key]=val 488 | # 489 | 490 | self.exon_type = "nonCoding" # by default 491 | self.codingExon = [-1,-1] # the coordinates of below codingExon will change if partialCoding or coding 492 | self.transcriptIds=[self.basicInfoDic["transcript_id"]] 493 | self.exonNumbers=[int(self.basicInfoDic["exon_number"])] 494 | self.acceptor2bp="NN" 495 | self.donor2bp="NN" 496 | self.phastConsScores=[] 497 | self.avgConsScore=0 498 | self.avgCodingConsScore=0 499 | self.firstMidLast=[0,0,0] # appearences of this exon as first, mid and last exons. Single exons are counted as first and last. 500 | # 501 | 502 | def handle_CDS(self,newLocus): 503 | lastSt, lastEn=self.basicInfoDic["start_coord"], self.basicInfoDic["end_coord"] 504 | newSt, newEn= newLocus.basicInfoDic["start_coord"], newLocus.basicInfoDic["end_coord"] 505 | if lastSt==newSt and lastEn==newEn: 506 | self.exon_type="fullCoding" 507 | elif get_overlap_between_intervals([lastSt,lastEn], [newSt,newEn])>0: 508 | self.exon_type="partCoding" 509 | else: 510 | sys.exit("Reached a CDS entry that doesn't overlap with previous exon\t"\ 511 | +newLocus.get_summary_string()+"\n") 512 | # 513 | self.codingExon=[newSt,newEn] 514 | # 515 | def add_another_instance(self,newExon): 516 | # an exon may appear in only one gene but for many different transcripts 517 | self.transcriptIds.append(newExon.basicInfoDic["transcript_id"]) 518 | self.exonNumbers.append(int(newExon.basicInfoDic["exon_number"])) 519 | # 520 | # data containers within this class 521 | __slots__ = ["basicInfoDic", "exon_type", "codingExon", "transcriptIds", "exonNumbers", \ 522 | "acceptor2bp", "donor2bp", "phastConsScores", "avgCodingConsScore", "avgConsScore", "firstMidLast"] 523 | 524 | # get one liner summary of the given instance 525 | def get_summary_string(self): 526 | summary="chr"+self.basicInfoDic["chromosome"]+"\t"+str(self.basicInfoDic["start_coord"])+"\t"+\ 527 | str(self.basicInfoDic["end_coord"])+"\t"+self.basicInfoDic["strand"]+"\t"+self.basicInfoDic["exon_id"]+"\t"+\ 528 | self.exon_type +"\t"+str(self.codingExon[0])+"\t"+str(self.codingExon[1])+"\t"+\ 529 | ",".join(self.transcriptIds)+"\t"+",".join([str(e) for e in self.exonNumbers])+"\t"+\ 530 | self.basicInfoDic["gene_id"]+"\t"+str(abs(self.basicInfoDic["end_coord"]-self.basicInfoDic["start_coord"]))+"\t"+\ 531 | self.acceptor2bp+"\t"+self.donor2bp+"\t"+str(self.avgCodingConsScore)+"\t"+str(self.avgConsScore)+"\t"+\ 532 | ",".join([str(e) for e in self.firstMidLast])+"\texon" 533 | #self.basicInfoDic["transcript_id"]+"\t"+self.basicInfoDic["exon_number"]+"\t"+\ 534 | return summary 535 | ################################# END EnsemblExon ################################## 536 | 537 | ################################# BEGIN EnsemblLocus ################################## 538 | class EnsemblLocus: 539 | """ 540 | This 
class is a container for a basic locus that has chr, start, end, strand 541 | fields. UTRs, start and stop codons from Ensembl are of this type. 542 | """ 543 | def __init__(self, line): 544 | # parse the locus line 545 | chr,d,elemType,start_coord,end_coord,d,strand,d,others=line.rstrip().split("\t") 546 | if elemType!="UTR" and elemType!="stop_codon" and elemType!="start_codon" and elemType!="CDS": 547 | sys.exit("Not a basic locus as intended parsed in from Ensemble line:\t"+line) 548 | # 549 | #basic information about the locus 550 | self.basicInfoDic={"chromosome" : chr, "start_coord" : int(start_coord), \ 551 | "end_coord" : int(end_coord), "strand" : strand, "locus_type" : elemType} 552 | 553 | items=others.replace('"', '').split(";") 554 | for item in items: 555 | wds=item.lstrip().split() 556 | if len(wds)>1: 557 | key, val = wds[0],wds[1] 558 | if key in ["gene_id", "gene_name", "transcript_id", "transcript_name", "transcript_biotype", "protein_id"]: 559 | self.basicInfoDic[key]=val 560 | # 561 | 562 | 563 | # 564 | 565 | __slots__ = ["basicInfoDic"] 566 | 567 | # get one liner summary of the given instance 568 | def get_summary_string(self): 569 | summary="chr"+self.basicInfoDic["chromosome"]+"\t"+str(self.basicInfoDic["start_coord"])+"\t"+\ 570 | str(self.basicInfoDic["end_coord"])+"\t"+self.basicInfoDic["strand"]+"\t"+self.basicInfoDic["locus_type"]+"\t"+\ 571 | self.basicInfoDic["transcript_id"]+"\t"+self.basicInfoDic["transcript_name"]+"\t"+\ 572 | self.basicInfoDic["transcript_biotype"]+"\t"+self.basicInfoDic["gene_id"]+"\t"+self.basicInfoDic["gene_name"]+"\tlocus" 573 | return summary 574 | ################################# END EnsemblLocus ################################## 575 | 576 | 577 | ################################# BEGIN EnsemblTranscript ################################## 578 | class EnsemblTranscript: 579 | """ 580 | This class is a container for Ensembl transcripts 581 | """ 582 | def __init__(self, line): 583 | # parse the transcript line 584 | chr,d,elemType,start_coord,end_coord,d,strand,d,others=line.rstrip().split("\t") 585 | if elemType!="transcript": 586 | sys.exit("Not a transcript parsed in class EnsemblTranscript:\t"+elemType) 587 | # 588 | #basic information about the transcript 589 | self.basicInfoDic={"chromosome" : chr, "start_coord" : int(start_coord), "end_coord" : int(end_coord), "strand" : strand} 590 | 591 | # there are 10 or more items for transcripts. We keep only 5 relevant ones. 
592 | # e.g: gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; 593 | # transcript_version "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; 594 | # transcript_name "DDX11L1-002"; transcript_source "havana"; transcript_biotype "processed_transcript"; 595 | # 596 | 597 | items=others.replace('"', '').split(";") 598 | for item in items: 599 | wds=item.lstrip().split() 600 | if len(wds)>1: 601 | key, val = wds[0],wds[1] 602 | #key, val = (item.lstrip().split())[0:2] 603 | if key in ["gene_id", "gene_name", "transcript_id", "transcript_name", "transcript_biotype"]: 604 | self.basicInfoDic[key]=val 605 | # 606 | if "gene_name" not in self.basicInfoDic: 607 | self.basicInfoDic["gene_name"]=self.basicInfoDic["gene_id"] 608 | if "transcript_name" not in self.basicInfoDic: 609 | self.basicInfoDic["transcript_name"]=self.basicInfoDic["transcript_id"] 610 | 611 | self.start_codon=["cds_start_NF"] # by default make them non-confirmed 612 | self.stop_codon=["cds_stop_NF"] # by default make them non-confirmed 613 | self.exons = [] 614 | self.codingExons = [] 615 | self.exon_types= [] 616 | self.protein_id="None" 617 | self.UTRs=[] 618 | # 619 | 620 | # adding an exon to the list of exons of the transcript "in order" 621 | def add_exon(self,newExon): 622 | exonCountSoFar=len(self.exons) 623 | #print "to add\t"+str(newExon.basicInfoDic["exon_number"])+"\t"+str(newExon.basicInfoDic["exon_id"]) 624 | if int(newExon.basicInfoDic["exon_number"])==exonCountSoFar+1: 625 | exonEntry=[newExon.basicInfoDic["start_coord"],newExon.basicInfoDic["end_coord"], 626 | newExon.basicInfoDic["exon_id"]] 627 | self.exons.append(exonEntry) 628 | # the coordinates of below codingExon will change if partialCoding or coding 629 | self.codingExons.append([-1,-1]) 630 | exonType="nonCoding" # by default 631 | self.exon_types.append(exonType) 632 | else: 633 | sys.exit("Exon entry is being entered out of order to the transcript\t" 634 | +self.basicInfoDic["transcript_id"]) 635 | # 636 | # 637 | def add_locus(self,newLocus,locus_type): 638 | if locus_type=='start_codon': 639 | self.start_codon=[newLocus.basicInfoDic["start_coord"],newLocus.basicInfoDic["end_coord"]] 640 | elif locus_type=='stop_codon': 641 | self.stop_codon=[newLocus.basicInfoDic["start_coord"],newLocus.basicInfoDic["end_coord"]] 642 | elif locus_type=='UTR': 643 | self.UTRs.append([newLocus.basicInfoDic["start_coord"],newLocus.basicInfoDic["end_coord"]]) 644 | else: 645 | sys.exit("Unknow locus type being inserted to transcript\t"\ 646 | +self.basicInfoDic["transcript_id"]) 647 | # 648 | 649 | def handle_CDS(self,newLocus): 650 | exonType="nonCoding" # by default 651 | exonCountSoFar=len(self.exons) 652 | lastAddedExon=self.exons[exonCountSoFar-1] 653 | lastSt,lastEn=self.exons[exonCountSoFar-1][0:2] 654 | newSt, newEn= newLocus.basicInfoDic["start_coord"], newLocus.basicInfoDic["end_coord"] 655 | if lastSt==newSt and lastEn==newEn: 656 | exonType="fullCoding" 657 | elif get_overlap_between_intervals([lastSt,lastEn], [newSt,newEn])>0: 658 | exonType="partCoding" 659 | else: 660 | sys.exit("Reached a CDS entry that doesn't overlap with previous exon\t"\ 661 | +newLocus.get_summary_string()+"\n") 662 | # 663 | self.codingExons[exonCountSoFar-1]=[newSt,newEn] 664 | self.exon_types[exonCountSoFar-1]=exonType # replace with the previous nonCoding tag 665 | self.protein_id=newLocus.basicInfoDic["protein_id"] 666 | # 667 | 668 | # data containers within this class 669 | __slots__ = [ 670 | 
"basicInfoDic", 671 | "start_codon", 672 | "stop_codon", 673 | "exons", 674 | "codingExons", 675 | "exon_types", 676 | "protein_id", 677 | "UTRs" 678 | ] 679 | # get one liner summary of the given instance 680 | def get_summary_string(self): 681 | if len(self.UTRs)==0: 682 | self.UTRs.append(["None","None"]) 683 | # 684 | summary="chr"+self.basicInfoDic["chromosome"]+"\t"+str(self.basicInfoDic["start_coord"])+"\t"+\ 685 | str(self.basicInfoDic["end_coord"])+"\t"+self.basicInfoDic["strand"]+"\t"+self.basicInfoDic["transcript_id"]+"\t"+\ 686 | self.basicInfoDic["transcript_name"]+"\t"+self.basicInfoDic["transcript_biotype"]+"\t"+\ 687 | self.basicInfoDic["gene_id"]+"\t"+self.basicInfoDic["gene_name"]+"\t"+\ 688 | ",".join([e[2] for e in self.exons])+"\t"+",".join(self.exon_types)+"\t"+\ 689 | ",".join([str(e[0]) for e in self.exons])+"\t"+",".join([str(e[1]) for e in self.exons])+"\t"+\ 690 | ",".join([str(e[0]) for e in self.codingExons])+"\t"+",".join([str(e[1]) for e in self.codingExons])+"\t"+\ 691 | ",".join([str(i) for i in self.start_codon])+"\t"+",".join([str(i) for i in self.stop_codon])+"\t"+\ 692 | ",".join([str(e[0]) for e in self.UTRs])+"\t"+",".join([str(e[1]) for e in self.UTRs])+"\t"+\ 693 | self.protein_id+"\t"+"transcript" 694 | return summary 695 | ################################# END EnsemblTranscript ################################## 696 | 697 | 698 | 699 | ################################# BEGIN EnsemblGene ################################## 700 | class EnsemblGene: 701 | """ 702 | This class is a container for Ensembl genes. 703 | """ 704 | def __init__(self, line): 705 | # parse the gene line 706 | chr,d,elemType,start_coord,end_coord,d,strand,d,others=line.rstrip().split("\t") 707 | if elemType!="gene": 708 | sys.exit("Not a gene parsed in class EnsemblGene:\t"+elemType) 709 | # 710 | #basic information about the gene 711 | self.basicInfoDic={"chromosome" : chr, "start_coord" : int(start_coord), "end_coord" : int(end_coord), "strand" : strand} 712 | 713 | # there are 5 items for genes: 714 | # e.g: gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed"; 715 | items=others.replace('"', '').split(";") 716 | for item in items: 717 | if len(item)>1: 718 | key, val = item.lstrip().split() 719 | self.basicInfoDic[key]=val 720 | if "gene_name" not in self.basicInfoDic: 721 | self.basicInfoDic["gene_name"]=self.basicInfoDic["gene_id"] 722 | self.exons = {} 723 | self.transcripts = {} 724 | # 725 | 726 | # data containers within this class 727 | __slots__ = [ 728 | "basicInfoDic", 729 | "exons", 730 | "transcripts"] 731 | 732 | # adding a transcript to the gene 733 | def add_transcript(self,newTranscript): 734 | self.transcripts[newTranscript.basicInfoDic["transcript_id"]]=newTranscript 735 | # adding an exon to the list of exons of the gene 736 | def add_exon(self,newExon): 737 | self.exons[newExon.basicInfoDic["exon_id"]]=\ 738 | [newExon.basicInfoDic["start_coord"],newExon.basicInfoDic["end_coord"]] 739 | # get one liner summary of the given instance 740 | def get_summary_string(self): 741 | summary="chr"+self.basicInfoDic["chromosome"]+"\t"+str(self.basicInfoDic["start_coord"])+"\t"+\ 742 | str(self.basicInfoDic["end_coord"])+"\t"+self.basicInfoDic["strand"]+"\t"+self.basicInfoDic["gene_id"]+"\t"+\ 743 | self.basicInfoDic["gene_name"]+"\t"+self.basicInfoDic["gene_biotype"]+"\t"+\ 744 | str(len(self.transcripts))+"\t"+str(len(self.exons))+"\tgene" 745 | return summary 746 | # 747 | #def gene_wrap_up(): 748 | # 
self.beg = min(self.exons[e][0] for e in self.exons) 749 | # self.end = max(self.exons[e][1] for e in self.exons) 750 | # 751 | 752 | ################################# END EnsemblGene ################################## 753 | 754 | 755 | def convert_UCSC_to_bed_format(l): 756 | """ 757 | Given a locus in UCSC format this function converts it to bed format with 3 fields 758 | chr1:121-21111 --> ['chr1', 121, 21111] 759 | """ 760 | chr=l[:l.find(':')] 761 | st=int(l[l.find(':')+1:l.find('-')]) 762 | en=int(l[l.find('-')+1:]) 763 | return (chr,st,en) 764 | 765 | 766 | def consistency_check(org1TOorg2,org2TOorg1): 767 | """ 768 | Check the consistency between the two matchings (e.g. human-to-mouse, mouse-to-human) 769 | read from separate Ensembl file. This function will do nothing if all is consistent. 770 | """ 771 | for g1 in org1TOorg2: 772 | type=org1TOorg2[g1][0][1] 773 | if type=="ortholog_one2one": 774 | g2=org1TOorg2[g1][0][0] 775 | if g2 not in org2TOorg1: 776 | sys.exit("Reverse entry for a one2one match couldn't be found\t"+g1+"\t"+g2) 777 | elif org2TOorg1[g2][0][0]!=g1: 778 | sys.exit("Reverse entry for a one2one match mismatches with original one\t"+g1+"\t"+g2) 779 | # else good 780 | else: 781 | for oneMatch1 in org1TOorg2[g1]: 782 | g2=oneMatch1[0] 783 | if g2 not in org2TOorg1: 784 | sys.exit("Reverse entry for a NON-one2one match couldn't be found\t"+g1+"\t"+g2) 785 | else: 786 | reverseFound=False 787 | for oneMatch2 in org2TOorg1[g2]: 788 | if oneMatch2[0]==g1: 789 | reverseFound=True 790 | break 791 | # 792 | if reverseFound==False: 793 | sys.exit("Reverse entry for a NON-one2one match mismatches with original one\t"+g1+"\t"+g2) 794 | # else good 795 | # 796 | # 797 | for g1 in org2TOorg1: 798 | type=org2TOorg1[g1][0][1] 799 | if type=="ortholog_one2one": 800 | g2=org2TOorg1[g1][0][0] 801 | if g2 not in org1TOorg2: 802 | sys.exit("Reverse entry for a one2one match couldn't be found\t"+g1+"\t"+g2) 803 | elif org1TOorg2[g2][0][0]!=g1: 804 | sys.exit("Reverse entry for a one2one match mismatches with original one\t"+g1+"\t"+g2) 805 | # else good 806 | else: 807 | for oneMatch1 in org2TOorg1[g1]: 808 | g2=oneMatch1[0] 809 | if g2 not in org1TOorg2: 810 | sys.exit("Reverse entry for a NON-one2one match couldn't be found\t"+g1+"\t"+g2) 811 | else: 812 | reverseFound=False 813 | for oneMatch2 in org1TOorg2[g2]: 814 | if oneMatch2[0]==g1: 815 | reverseFound=True 816 | break 817 | # 818 | if reverseFound==False: 819 | sys.exit("Reverse entry for a NON-one2one match mismatches with original one\t"+g1+"\t"+g2) 820 | # else good 821 | # 822 | # 823 | return 824 | 825 | 826 | def pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir): 827 | """ 828 | Pickle the gene, transcript and exon dictionaries for each pair of ortholog_one2one genes. 829 | There are around 16.5k such genes for human-mouse and 15.8k are protein_coding pairs. 
830 | """ 831 | geneOnlyOrthoDic1,transcriptOnlyOrthoDic1,exonOnlyOrthoDic1={},{},{} 832 | geneOnlyOrthoDic2,transcriptOnlyOrthoDic2,exonOnlyOrthoDic2={},{},{} 833 | 834 | for g1 in genePairsDic: 835 | type=genePairsDic[g1][0][1] 836 | if type=="ortholog_one2one": 837 | g2=genePairsDic[g1][0][0] 838 | else: 839 | continue 840 | 841 | if geneDic1[g1].basicInfoDic["gene_biotype"]!="protein_coding" or geneDic2[g2].basicInfoDic["gene_biotype"]!="protein_coding": 842 | continue 843 | 844 | # small dictionaries that have only the relevant stuff for one gene pair 845 | newGeneDic1={}; newGeneDic2={} 846 | newExonDic1={}; newExonDic2={} 847 | newTranscriptDic1={}; newTranscriptDic2={} 848 | # 849 | newGeneDic1[g1]=geneDic1[g1] 850 | newGeneDic2[g2]=geneDic2[g2] 851 | geneOnlyOrthoDic1[g1]=geneDic1[g1] 852 | geneOnlyOrthoDic2[g2]=geneDic2[g2] 853 | for tId in geneDic1[g1].transcripts: 854 | newTranscriptDic1[tId]=transcriptDic1[tId] 855 | transcriptOnlyOrthoDic1[tId]=transcriptDic1[tId] 856 | for tId in geneDic2[g2].transcripts: 857 | newTranscriptDic2[tId]=transcriptDic2[tId] 858 | transcriptOnlyOrthoDic2[tId]=transcriptDic2[tId] 859 | # 860 | for eId in geneDic1[g1].exons: 861 | newExonDic1[eId]=exonDic1[eId] 862 | exonOnlyOrthoDic1[eId]=exonDic1[eId] 863 | for eId in geneDic2[g2].exons: 864 | newExonDic2[eId]=exonDic2[eId] 865 | exonOnlyOrthoDic2[eId]=exonDic2[eId] 866 | # 867 | os.system("mkdir -p "+ outdir+"/"+g1+"-"+g2) 868 | #print geneDic1[g1].get_summary_string()+"\t"+geneDic2[g2].get_summary_string() 869 | outfilename=outdir+"/"+g1+"-"+g2+"/org1.pickledDictionaries" 870 | pickle.dump((newGeneDic1,newTranscriptDic1,newExonDic1), open(outfilename,"wb")) 871 | outfilename=outdir+"/"+g1+"-"+g2+"/org2.pickledDictionaries" 872 | pickle.dump((newGeneDic2,newTranscriptDic2,newExonDic2), open(outfilename,"wb")) 873 | # to load use: 874 | #geneDic1,transcriptDic1,exonDic1=pickle.load(open("pickled.stuff","rb")) 875 | # 876 | 877 | return (geneOnlyOrthoDic1,transcriptOnlyOrthoDic1,exonOnlyOrthoDic1, geneOnlyOrthoDic2,transcriptOnlyOrthoDic2,exonOnlyOrthoDic2) 878 | 879 | def print_one2one_genePairs(genePairsDic, geneDic1,geneDic2,outfilename): 880 | """ 881 | Print one liner for each pair of genes that match each other one to one. 882 | There are around 16.5k such genes for human-mouse and 15.8k are protein_coding pairs. 883 | """ 884 | outfile=open(outfilename,'w') 885 | outfile.write("chrName1\tstart_coord1\tend_coord1\tstrand1\tgeneID1\tgeneName1\tgeneType1\tnoOfTranscripts1\tnoOfExons1\ttype1\t") 886 | outfile.write("chrName2\tstart_coord2\tend_coord2\tstrand2\tgeneID2\tgeneName2\tgeneType2\tnoOfTranscripts2\tnoOfExons2\ttype2\n") 887 | #print "chrName1\tstart_coord1\tend_coord1\tstrand1\tgeneID1\tgeneName1\tgeneType1\tnoOfTranscripts1\tnoOfExons1\ttype1\t", 888 | #print "chrName2\tstart_coord2\tend_coord2\tstrand2\tgeneID2\tgeneName2\tgeneType2\tnoOfTranscripts2\tnoOfExons2\ttype2" 889 | for g1 in genePairsDic: 890 | type=genePairsDic[g1][0][1] 891 | if type=="ortholog_one2one": 892 | g2=genePairsDic[g1][0][0] 893 | else: 894 | continue 895 | outfile.write(geneDic1[g1].get_summary_string()+"\t"+geneDic2[g2].get_summary_string()+"\n") 896 | #print geneDic1[g1].get_summary_string()+"\t"+geneDic2[g2].get_summary_string() 897 | # 898 | outfile.close() 899 | return 900 | 901 | def print_one2one_transcriptListPairs(genePairsDic, geneDic1,geneDic2,transcriptDic1,transcriptDic2,orgId1,orgId2,outdir): 902 | """ 903 | Print the lists of transcripts for each one to one mapped gene pair. 
904 | There are around 16.5k such genes for human-mouse and 15.8k are protein_coding pairs. 905 | """ 906 | for g1 in genePairsDic: 907 | type=genePairsDic[g1][0][1] 908 | if type=="ortholog_one2one": 909 | g2=genePairsDic[g1][0][0] 910 | else: 911 | continue 912 | # 913 | outdirTemp=outdir+"/"+g1+"-"+g2; os.system("mkdir -p "+outdirTemp) 914 | outfile1=open(outdirTemp+"/"+orgId1+"_transcripts.bed",'w') 915 | outfile2=open(outdirTemp+"/"+orgId2+"_transcripts.bed",'w') 916 | 917 | transcripts1=geneDic1[g1].transcripts 918 | transcripts2=geneDic2[g2].transcripts 919 | for t1 in transcripts1: 920 | outfile1.write(transcriptDic1[t1].get_summary_string()+"\n") 921 | for t2 in transcripts2: 922 | outfile2.write(transcriptDic2[t2].get_summary_string()+"\n") 923 | # 924 | outfile1.close() 925 | outfile2.close() 926 | # 927 | return 928 | 929 | def print_one2one_exonListPairs(genePairsDic, geneDic1,geneDic2,exonDic1,exonDic2,orgId1,orgId2,outdir): 930 | """ 931 | Print the lists of exons for each one to one mapped gene pair. 932 | There are around 16.5k such genes for human-mouse and 15.8k are protein_coding pairs. 933 | """ 934 | for g1 in genePairsDic: 935 | type=genePairsDic[g1][0][1] 936 | if type=="ortholog_one2one": 937 | g2=genePairsDic[g1][0][0] 938 | else: 939 | continue 940 | # 941 | outdirTemp=outdir+"/"+g1+"-"+g2; os.system("mkdir -p "+outdirTemp) 942 | outfile1=open(outdirTemp+"/"+orgId1+"_exons.bed",'w') 943 | outfile2=open(outdirTemp+"/"+orgId2+"_exons.bed",'w') 944 | 945 | exons1=geneDic1[g1].exons 946 | exons2=geneDic2[g2].exons 947 | for e1 in exons1: 948 | outfile1.write(exonDic1[e1].get_summary_string()+"\n") 949 | for e2 in exons2: 950 | outfile2.write(exonDic2[e2].get_summary_string()+"\n") 951 | # 952 | # print exonDic[e].get_summary_string() 953 | 954 | outfile1.close() 955 | outfile2.close() 956 | 957 | # 958 | return 959 | 960 | def extract_fasta_files_for_exons(refGD,exonDic,typ,fivePrimeFlank,threePrimeFlank,outfilename): 961 | """ 962 | With the help of genomedata archive extract the nucleotide sequences 963 | from and around each exon and write them in a .fa file. 964 | refGD is the genomedata archive created for the reference genome. 965 | typ can be one of the following: 966 | "allExon": Extract the sequence of the whole exon. 967 | "allExonPlusMinus": Like allExon but with flanking 5' and 3'. 968 | "intronExon": Extract the sequence from the juction of this 969 | exon and the previous intron. 970 | "exonIntron": Extract the sequence from the juction of this 971 | exon and the next intron. 972 | fivePrimeFlank is the amount to extract extra from the 5' end. 973 | threePrimeFlank is the amount to extract extra from the 3' end. 
974 | 975 | """ 976 | if typ=="allExon": 977 | fivePrimeFlank=0; threePrimeFlank=0 978 | # 979 | outfile=open(outfilename,'w') 980 | with Genome(refGD) as genome: 981 | for id in exonDic: 982 | e=exonDic[id] 983 | ch,st,en="chr"+e.basicInfoDic["chromosome"], e.basicInfoDic["start_coord"], e.basicInfoDic["end_coord"] 984 | strand,id=e.basicInfoDic["strand"], e.basicInfoDic["exon_id"] 985 | # off by one error fix by -1 986 | st=int(st)-1 987 | en=int(en)-1 988 | if strand=="+": 989 | if typ=="intronExon": 990 | en=st # make sure we're around the first bp of exon 991 | st=st-fivePrimeFlank # make sure 5' part is of size fivePrimeFlank including st 992 | en=en+threePrimeFlank # make sure 3' part is of size threePrimeFlank including en 993 | elif typ=="exonIntron": 994 | st=en # make sure we're around the last bp of exon 995 | st=st-fivePrimeFlank+1 996 | en=en+threePrimeFlank+1 997 | elif typ=="allExonPlusMinus" or typ=="allExon": 998 | st=st-fivePrimeFlank 999 | en=en+threePrimeFlank+1 1000 | # 1001 | id=id+"_plusStrand" 1002 | sq=genome[ch].seq[st:en].tostring().lower().upper() 1003 | else: 1004 | if typ=="intronExon": 1005 | st=en # make sure we're around the first bp of exon 1006 | en=en+fivePrimeFlank+1 1007 | st=st-threePrimeFlank+1 1008 | elif typ=="exonIntron": 1009 | en=st # make sure we're around the last bp of exon 1010 | en=en+fivePrimeFlank 1011 | st=st-threePrimeFlank 1012 | elif typ=="allExonPlusMinus" or typ=="allExon": 1013 | st=st-threePrimeFlank 1014 | en=en+fivePrimeFlank+1 1015 | # 1016 | id=id+"_minusStrand" 1017 | sq=genome[ch].seq[st:en].tostring() 1018 | #sq=sq.lower()[::-1].upper() # reverse 1019 | #sq=sq.lower().translate(complement).upper() # complement 1020 | sq=sq.lower().translate(complement)[::-1].upper() # reverse complement 1021 | # 1022 | outfile.write(">"+id+"_"+typ+"\n") 1023 | outfile.write(sq+"\n") 1024 | # 1025 | # 1026 | outfile.close() 1027 | return 1028 | 1029 | def extract_conservation_stats_for_exons(refGD,exonDic,typ,fivePrimeFlank,threePrimeFlank,outfilename): 1030 | """ 1031 | With the help of genomedata archive extract the nucleotide sequences 1032 | from and around each exon and convservation scores and write them to a file. 1033 | refGD is the genomedata archive created for the reference genome. 1034 | typ can be one of the following: 1035 | "allExon": Extract the sequence of the whole exon. 1036 | "allExonPlusMinus": Like allExon but with flanking 5' and 3'. 1037 | "intronExon": Extract the sequence from the juction of this 1038 | exon and the previous intron. 1039 | "exonIntron": Extract the sequence from the juction of this 1040 | exon and the next intron. 1041 | fivePrimeFlank is the amount to extract extra from the 5' end. 1042 | threePrimeFlank is the amount to extract extra from the 3' end. 1043 | IF outfilename is "None" then no output file is written, only 1044 | relevant fields are added to the exon in exonDic. 
1045 | """ 1046 | sys.stderr.write("Extracting conservation stats and acceptor donor sites for exons from genomedata archive\n") 1047 | # this is the trackname for phastCons scores loaded from wig files 1048 | trackName="phastCons" 1049 | # 1050 | 1051 | if typ=="allExon": 1052 | fivePrimeFlank=0; threePrimeFlank=0 1053 | # 1054 | if outfilename!="None": 1055 | outfile=open(outfilename,'w') 1056 | # header line 1057 | outfile.write("CHR\tstart\tend\tstrand\tExonID\tacceptor2bp\tdonor2bp\tpreAcceptorCons\taccepterCons1\taccepterCons2\texon5primeCons\texonMidCons\texon3primeCons\tdonorCons1\tdonorCons2\tpostDonorCons\n") 1058 | # 1059 | with Genome(refGD) as genome: 1060 | lineCount=0 1061 | for id in exonDic: 1062 | print (id) 1063 | e=exonDic[id] 1064 | ch,st,en="chr"+e.basicInfoDic["chromosome"], e.basicInfoDic["start_coord"], e.basicInfoDic["end_coord"] 1065 | if e.exon_type=="partCoding": 1066 | codingSt=min(int(e.codingExon[0])-1,int(e.codingExon[1])-1) 1067 | codingEn=max(int(e.codingExon[0])-1,int(e.codingExon[1])-1) 1068 | # 1069 | stOrig=st; enOrig=en; 1070 | strand,id=e.basicInfoDic["strand"], e.basicInfoDic["exon_id"] 1071 | # off by one error fix by -1 1072 | st=int(st)-1 1073 | en=int(en)-1 1074 | if strand=="+": 1075 | if typ=="intronExon": 1076 | en=st # make sure we're around the first bp of exon 1077 | st=st-fivePrimeFlank # make sure 5' part is of size fivePrimeFlank including st 1078 | en=en+threePrimeFlank # make sure 3' part is of size threePrimeFlank including en 1079 | elif typ=="exonIntron": 1080 | st=en # make sure we're around the last bp of exon 1081 | st=st-fivePrimeFlank+1 1082 | en=en+threePrimeFlank+1 1083 | elif typ=="allExonPlusMinus": 1084 | st=st-fivePrimeFlank 1085 | en=en+threePrimeFlank+1 1086 | # 1087 | #id=id+"_plusStrand" 1088 | sq=genome[ch].seq[st:en].tostring().lower().upper() 1089 | allScores=(genome[ch])[st:en,trackName] 1090 | else: 1091 | if typ=="intronExon": 1092 | st=en # make sure we're around the first bp of exon 1093 | en=en+fivePrimeFlank+1 1094 | st=st-threePrimeFlank+1 1095 | elif typ=="exonIntron": 1096 | en=st # make sure we're around the last bp of exon 1097 | en=en+fivePrimeFlank 1098 | st=st-threePrimeFlank 1099 | elif typ=="allExonPlusMinus": 1100 | st=st-threePrimeFlank 1101 | en=en+fivePrimeFlank+1 1102 | # 1103 | #id=id+"_minusStrand" 1104 | sq=genome[ch].seq[st:en].tostring() 1105 | sq=sq.lower().translate(complement)[::-1].upper() # reverse complement 1106 | allScores=(genome[ch])[st:en,trackName][::-1] 1107 | # 1108 | print (sq) 1109 | print (allScores) 1110 | print (genome[ch].seq[st:en].tostring()) 1111 | if e.exon_type=="partCoding": 1112 | codingScores=(genome[ch])[codingSt:codingEn,trackName][::-1] 1113 | ### Extract all the scores to be written to the output file ### 1114 | acceptor2bp=sq[fivePrimeFlank-2:fivePrimeFlank] 1115 | donor2bp=(sq[-threePrimeFlank:])[0:2] 1116 | # 1117 | x=allScores[:fivePrimeFlank-2] 1118 | preAcceptorCons=np.nanmean(x) 1119 | # 1120 | accepterCons1=allScores[fivePrimeFlank-2] 1121 | accepterCons2=allScores[fivePrimeFlank-1] 1122 | # 1123 | x=allScores[fivePrimeFlank:fivePrimeFlank+(fivePrimeFlank-2)] 1124 | exon5primeCons=np.nanmean(x) 1125 | # 1126 | x=allScores[fivePrimeFlank+(fivePrimeFlank-2):-(threePrimeFlank+(threePrimeFlank-2))] 1127 | exonMidCons=np.nanmean(x) 1128 | # 1129 | x=allScores[-(threePrimeFlank+(threePrimeFlank+2)):-threePrimeFlank] 1130 | exon3primeCons=np.nanmean(x) 1131 | # 1132 | donorCons1=allScores[-threePrimeFlank] 1133 | donorCons2=allScores[-threePrimeFlank+1] 
1134 | # 1135 | x=allScores[-threePrimeFlank+2:] 1136 | postDonorCons=np.nanmean(x) 1137 | # 1138 | #first20bp=allScores[:20] 1139 | #outfile.write("%s\t%d\t%d\t%s\t%s\t%s\t%s\t" % (ch,stOrig,enOrig,strand,id,acceptor2bp,donor2bp)) 1140 | #outfile.write("\t".join([repr(x) for x in first20bp])+"\n") 1141 | exonDic[id].acceptor2bp=acceptor2bp 1142 | exonDic[id].donor2bp=donor2bp 1143 | exonDic[id].phastConsScores=allScores 1144 | exonDic[id].avgConsScore=np.nanmean(allScores[fivePrimeFlank:-threePrimeFlank]) 1145 | if e.exon_type=="partCoding": 1146 | exonDic[id].avgCodingConsScore=np.nanmean(codingScores) 1147 | 1148 | if lineCount%100000==0: 1149 | sys.stderr.write(str(lineCount)+"\t") 1150 | lineCount+=1 1151 | print (sq) 1152 | print ("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % \ 1153 | (ch,stOrig,enOrig,strand,id,acceptor2bp,donor2bp, preAcceptorCons, accepterCons1, accepterCons2, exon5primeCons,\ 1154 | exonMidCons, exon3primeCons, donorCons1, donorCons2, postDonorCons, exonDic[id].avgCodingConsScore, exonDic[id].avgConsScore)) 1155 | 1156 | if outfilename!="None": 1157 | outfile.write("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % \ 1158 | (ch,stOrig,enOrig,strand,id,acceptor2bp,donor2bp, preAcceptorCons, accepterCons1, accepterCons2, exon5primeCons,\ 1159 | exonMidCons, exon3primeCons, donorCons1, donorCons2, postDonorCons, exonDic[id].avgCodingConsScore, exonDic[id].avgConsScore)) 1160 | # 1161 | ### 1162 | # 1163 | # 1164 | sys.stderr.write("\n\n") 1165 | if outfilename!="None": 1166 | outfile.close() 1167 | return exonDic 1168 | 1169 | def assign_firstMidLast_exon_counts(exonDic,transcriptDic): 1170 | for e in exonDic: 1171 | for i in range(len(exonDic[e].exonNumbers)): 1172 | transcriptLength=len(transcriptDic[exonDic[e].transcriptIds[i]].exons) 1173 | tempExNo=exonDic[e].exonNumbers[i] 1174 | #print [transcriptLength, tempExNo] 1175 | #single exon 1176 | if tempExNo==1 and tempExNo==transcriptLength: 1177 | exonDic[e].firstMidLast[0]+=1 1178 | exonDic[e].firstMidLast[2]+=1 1179 | #first exon 1180 | elif tempExNo==1: 1181 | exonDic[e].firstMidLast[0]+=1 1182 | #last exon 1183 | elif tempExNo==transcriptLength: 1184 | exonDic[e].firstMidLast[2]+=1 1185 | else: 1186 | exonDic[e].firstMidLast[1]+=1 1187 | # 1188 | #if exonDic[e].firstMidLast[0]>0 or exonDic[e].firstMidLast[2]>0: 1189 | #print exonDic[e].get_summary_string() 1190 | # 1191 | return exonDic 1192 | 1193 | # Testing functionalities 1194 | def main(argv): 1195 | orgId1="human"; orgId2="mouse"; 1196 | refGD1="/home/fao150/proj/2015orthoR01/results/2015-03-17_creating-genomedata-archives-for-refs/hg38" 1197 | refGD2="/home/fao150/proj/2015orthoR01/results/2015-03-17_creating-genomedata-archives-for-refs/mm10" 1198 | 1199 | # outdir="GTFsummaries"; 1200 | if len(argv)==1: 1201 | return 1202 | 1203 | outdir=argv[1] 1204 | os.system("mkdir -p "+outdir) 1205 | os.system("mkdir -p "+outdir+"/GTFsummaries") 1206 | #infilename="/projects/b1017/shared/Ensembl-files/Homo_sapiens.GRCh38.78.gtf.gz" 1207 | infilename="Homo_sapiens.GRCh38.102.gtf" 1208 | #geneDic1,transcriptDic1,exonDic1,infoDic1=parse_organism_GTF(orgId1, infilename, outdir+"/GTFsummaries") 1209 | 1210 | #infilename="/projects/b1017/shared/Ensembl-files/Mus_musculus.GRCm38.78.gtf.gz" 1211 | infilename="Mus_musculus.GRCm38.102.gtf" 1212 | #geneDic2,transcriptDic2,exonDic2,infoDic2=parse_organism_GTF(orgId2, infilename, outdir+"/GTFsummaries") 1213 | 1214 | ## these 
two files were downloaded by hand selecting columns from Ensembl's Biomart 1215 | ## I weren't able to redo the same column selections recently so I decided to switch to 1216 | ## parsing the orthology information from readily available Ensembl files like below ones: 1217 | ## ftp://ftp.ensembl.org/pub/release-80/mysql/ensembl_mart_80/ 1218 | ## hsapiens_gene_ensembl__homolog_mmus__dm.txt.gz 1219 | 1220 | #infilename="/projects/b1017/shared/Ensembl-files/Ensembl-human-GRCh38-to-mouse-GRCm38.p3.txt.gz" 1221 | #genePairsHumanToMouse=parse_ensembl_gene_pairings(orgId1,orgId2,infilename) 1222 | #infilename="/projects/b1017/shared/Ensembl-files/Ensembl-mouse-GRCm38.p3-to-human-GRCh38.txt.gz" 1223 | #genePairsHumanToMouse=parse_ensembl_gene_pairings(orgId1,orgId2,infilename) 1224 | 1225 | #### Rewritten: Abhijit #### 1226 | infilename="Ensembl-human-GRCh38-to-mouse-GRCm38.Formatted.txt" 1227 | #genePairsHumanToMouse=parse_ensembl_gene_pairings(infilename) 1228 | 1229 | infilename="Ensembl-mouse-GRCm38-to-human-GRCh38.Formatted.txt" 1230 | #genePairsMouseToHuman=parse_ensembl_gene_pairings(infilename) 1231 | #consistency_check(genePairsHumanToMouse,genePairsMouseToHuman) 1232 | 1233 | ## if consistency check is ok then just use one side. This is OK for one2one mappings. 1234 | #genePairsDic=genePairsHumanToMouse 1235 | #os.system("mkdir -p "+outdir+"/perGenePairPickledInfo") 1236 | #pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir+"/perGenePairPickledInfo") 1237 | 1238 | #infilename="/projects/b1017/shared/Ensembl-files/hsapiens_gene_ensembl__homolog_mmus__dm.txt.gz" 1239 | #infilename="hsapiens_gene_ensembl__homolog_mmusculus__dm.txt" 1240 | #proteinToGeneDic,genePairsDic,proteinPairsDic=parse_ensembl_geneAndProtein_pairings(infilename,{},{}) 1241 | #print (["1",len(proteinToGeneDic),len(genePairsDic),len(proteinPairsDic)]) 1242 | 1243 | #infilename="/projects/b1017/shared/Ensembl-files/mmusculus_gene_ensembl__homolog_hsap__dm.txt.gz" 1244 | #infilename="mmusculus_gene_ensembl__homolog_hsapiens__dm.txt" 1245 | #proteinToGeneDic,genePairsDic,proteinPairsDic=parse_ensembl_geneAndProtein_pairings(infilename,proteinToGeneDic,proteinPairsDic) 1246 | #print (["2",len(proteinToGeneDic),len(genePairsDic),len(proteinPairsDic)]) 1247 | 1248 | 1249 | #exonDic1=assign_firstMidLast_exon_counts(exonDic1,transcriptDic1) 1250 | #exonDic2=assign_firstMidLast_exon_counts(exonDic2,transcriptDic2) 1251 | 1252 | typ="allExonPlusMinus" 1253 | outfilename="None" 1254 | fivePrimeFlank=12; threePrimeFlank=12 1255 | #exonDic1=extract_conservation_stats_for_exons(refGD1,exonDic1,typ,fivePrimeFlank,threePrimeFlank,outfilename) 1256 | #exonDic2=extract_conservation_stats_for_exons(refGD2,exonDic2,typ,fivePrimeFlank,threePrimeFlank,outfilename) 1257 | 1258 | #outdir=argv[1]+"/after" 1259 | #os.system("mkdir -p "+outdir) 1260 | #print_some_summary(orgId1, geneDic1,transcriptDic1,exonDic1,{}, outdir) 1261 | #print_some_summary(orgId2, geneDic2,transcriptDic2,exonDic2,{}, outdir) 1262 | 1263 | # outdir="perGenePairExonLists" 1264 | if len(argv)==2: 1265 | return 1266 | 1267 | #outdir=argv[2] 1268 | #os.system("mkdir -p "+outdir) 1269 | 1270 | #pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir) 1271 | 1272 | # outfilename=outdir+"/genePairsSummary-one2one.txt" 1273 | # print_one2one_genePairs(genePairsDic,geneDic1,geneDic2,outfilename) # either way is ok since one2one 1274 | # 1275 | # 
print_one2one_exonListPairs(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,orgId1,orgId2,outdir) 1276 | # print_one2one_transcriptListPairs(genePairsDic,geneDic1,geneDic2,transcriptDic1,transcriptDic2,orgId1,orgId2,outdir) 1277 | 1278 | return 1279 | 1280 | if __name__ == "__main__": 1281 | main(sys.argv) 1282 | 1283 | 1284 | -------------------------------------------------------------------------------- /Human-Monkey-Processed-Data/scripts/liftOver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ay-lab/ExTraMapper/ff8bf6399e457c041e10ab8d94c83ae54414b273/Human-Monkey-Processed-Data/scripts/liftOver -------------------------------------------------------------------------------- /Human-Monkey-Processed-Data/scripts/liftover-withMultiples: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | set -o pipefail 3 | set -o errexit 4 | 5 | source config.conf 6 | 7 | 8 | dataDir=${EXTRAMAPPER_DIR}/preprocess/data 9 | dataDirPerPair=${EXTRAMAPPER_DIR}/preprocess/data/$org1-$org2 10 | 11 | chainsDir=$dataDir/liftover_chains 12 | ensemblDir=$dataDirPerPair/ensemblDownloads 13 | 14 | liftOverFilesDir=$dataDirPerPair/liftoverRelatedFiles 15 | perExonLiftoverDir=$dataDirPerPair/perExonLiftoverCoords 16 | 17 | outdir=$liftOverFilesDir 18 | flank=$1 19 | minMatch=$2 20 | 21 | chain1to2=$3 22 | chain2to1=$4 23 | 24 | mkdir -p $ensemblDir 25 | 26 | GTFfile1=$ensemblDir/org1.gtf.gz 27 | GTFfile2=$ensemblDir/org2.gtf.gz 28 | org1to2homologFile=$ensemblDir/org1_homolog_org2.txt.gz 29 | org2to1homologFile=$ensemblDir/org2_homolog_org1.txt.gz 30 | refGDdir1=$ensemblDir/org1 # genomedata archive for org1 31 | refGDdir2=$ensemblDir/org2 # genomedata archive for org2 32 | 33 | ########################## need to add 1 to liftedOver coordinates to match UCSC coordinates ################### 34 | ############## HOWEVER, this is only correct if original/lifted strands are same -/- or +/+ #################### 35 | ############## THEREFORE, I account manually for this by checking the strand pairs #################### 36 | 37 | ############## ALSO, liftOver does not CHANGE the strand of original coordinates when used ###################### 38 | ############# without the -multiple option and it DOES with -multiple. ######################### 39 | ############ HENCE, I handle these two cases differently. 
######################## 40 | 41 | ## OLDER AND INCORRECT WAY #1 ########################################################################################################### 42 | # zcat $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed.gz | awk '{print $1"\t"$2+1"\t"$3+1"\t"$4"\t"$5}' \ 43 | # > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp 44 | # 45 | # zcat $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed.gz | awk '{print $1"\t"$2+1"\t"$3+1"\t"$4"\t"$5}' \ 46 | # > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 47 | # 48 | #rm -rf $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 49 | ############################################################################################################################################### 50 | 51 | 52 | # 53 | # first work on the partCoding exons 54 | suffix=flank$flank-minMatch$minMatch-multiples-partCoding 55 | 56 | # fourth fields stays the same, fifth is replaced by multiplicity, sixth will be the new strand after liftover 57 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org1_partCodingExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 58 | $chain1to2 org2_mapped-$suffix.bed org2_unmapped-$suffix.bed -minMatch=$minMatch -multiple 59 | 60 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org2_partCodingExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 61 | $chain2to1 org1_mapped-$suffix.bed org1_unmapped-$suffix.bed -minMatch=$minMatch -multiple 62 | 63 | # chr, start, end, exonId, Multiplicity, strand (after conversion) 64 | cat org1_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed 65 | cat org2_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 66 | 67 | # chr, start, end, exonId, Why unmapped, strand (before conversion) 68 | cat org1_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 69 | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed 70 | cat org2_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 71 | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 72 | 73 | rm -rf org1_mapped-$suffix.bed org2_mapped-$suffix.bed org1_unmapped-$suffix.bed org2_unmapped-$suffix.bed 74 | 75 | # take the intersections 76 | ## NEW AND CORRECT WAY - FOR ONLY liftOver with -multiple OPTION ########################################################################### 77 | cat $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp 78 | join $outdir/org2_partCodingExonsList.sorted.temp mapped.temp | \ 79 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \ 80 | | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp 81 | bedtools intersect -a $outdir/org1_allCodingExonsList.bed \ 82 | -b $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -wao \ 83 | > $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed 84 | 85 | bedtools intersect -b $outdir/org1_allCodingExonsList.bed \ 86 | -a $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -v \ 87 | > $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed 88 | 89 | cat $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed | awk '{print 
$4,$0}' | sort -k1,1 > mapped.temp 90 | join $outdir/org1_partCodingExonsList.sorted.temp mapped.temp | \ 91 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \ 92 | | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 93 | 94 | bedtools intersect -a $outdir/org2_allCodingExonsList.bed \ 95 | -b $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -wao \ 96 | > $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed 97 | 98 | bedtools intersect -b $outdir/org2_allCodingExonsList.bed \ 99 | -a $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -v \ 100 | > $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed 101 | ############################################################################################################################################### 102 | 103 | rm -rf $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp mapped.temp 104 | 105 | gzip $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 106 | gzip $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 107 | gzip $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed 108 | gzip $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed 109 | 110 | # 111 | # now work on all exons including the partCoding, nonCoding and fullCoding ones 112 | 113 | suffix=flank$flank-minMatch$minMatch-multiples 114 | 115 | # fourth fields stays the same, fifth is replaced by multiplicity, sixth will be the new strand after liftover 116 | #liftOver <(cat $outdir/org1_allExonsList.bed | awk '{if ($4=="+") print $1,$2-s,$3+s,$5,$4,$4; else print $1,$2-s,$3+s,$5,$4,$4;}' s=$flank) \ 117 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org1_allExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 118 | $chain1to2 org2_mapped-$suffix.bed org2_unmapped-$suffix.bed -minMatch=$minMatch -multiple 119 | 120 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org2_allExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 121 | $chain2to1 org1_mapped-$suffix.bed org1_unmapped-$suffix.bed -minMatch=$minMatch -multiple 122 | 123 | cat org1_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed 124 | cat org2_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 125 | #cat org1_unmapped-$suffix.bed | awk 'NR%2==1' | sort | uniq - 126 | #cat org2_unmapped-$suffix.bed | awk 'NR%2==1' | sort | uniq - 127 | 128 | cat org1_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 129 | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed 130 | cat org2_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 131 | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 132 | 133 | rm -rf org1_mapped-$suffix.bed org2_mapped-$suffix.bed org1_unmapped-$suffix.bed org2_unmapped-$suffix.bed 134 | 135 | # take the intersections 136 | ## NEW AND CORRECT WAY - FOR ONLY liftOver with -multiple OPTION 
###########################
137 | # This correction in coordinates leads to some exons that do not appear in any mapped, unmapped, or nonintersecting file. ##
138 | # There are only 2 such exons and they will be deemed unmapped (i.e., deleted from the second organism) #
139 | cat $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp
140 | join $outdir/org2_allExonsList.sorted.temp mapped.temp | \
141 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \
142 | | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp
143 | bedtools intersect -a $outdir/org1_allExonsList.bed \
144 | -b $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -wao \
145 | > $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed
146 | bedtools intersect -b $outdir/org1_allExonsList.bed \
147 | -a $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -v \
148 | > $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed
149 | 
150 | cat $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp
151 | join $outdir/org1_allExonsList.sorted.temp mapped.temp | \
152 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \
153 | | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp
154 | 
155 | bedtools intersect -a $outdir/org2_allExonsList.bed \
156 | -b $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -wao \
157 | > $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed
158 | 
159 | bedtools intersect -b $outdir/org2_allExonsList.bed \
160 | -a $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -v \
161 | > $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed
162 | 
163 | rm -rf $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp mapped.temp
164 | 
165 | gzip $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed
166 | gzip $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed
167 | gzip $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed
168 | gzip $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed
169 | 
170 | ###############################################################################################################################################
171 | 
172 | 
--------------------------------------------------------------------------------
/Human-Monkey-Processed-Data/scripts/parseAndPicklePerPair.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | ##############################################################################
3 | ### To use the functions in this lib simply import this python module using
4 | ### import ensemblUtils
5 | ### Then you'll be able to call functions with the proper arguments using
6 | ### returnVal=ensemblUtils.func1(arg1,arg2)
7 | ##############################################################################
8 | ##############################################################################
9 | import sys
10 | import os
11 | 
import string 12 | import math 13 | import gzip 14 | import _pickle as pickle 15 | 16 | # reads from exported environment variable 17 | ExTraMapperPath=os.environ['EXTRAMAPPER_DIR'] 18 | sys.path.append(ExTraMapperPath+"/scripts") 19 | from ensemblUtils import * 20 | 21 | # Testing functionalities 22 | def main(argv): 23 | indir=argv[1] 24 | orgId1="org1"; orgId2="org2"; 25 | refGD1=indir+"/genomedataArchives/org1" 26 | refGD2=indir+"/genomedataArchives/org2" 27 | 28 | 29 | # outdir="GTFsummaries"; 30 | if len(argv)==2: 31 | return 32 | 33 | outdir=argv[2] 34 | os.system("mkdir -p "+outdir) 35 | 36 | #infilename=indir+"/ensemblDownloads/org1.gtf.gz" 37 | infilename=indir+"/ensemblDownloads/org1.gtf" ## Abhijit 38 | geneDic1,transcriptDic1,exonDic1,infoDic1=parse_organism_GTF(orgId1, infilename, outdir) 39 | 40 | #infilename=indir+"/ensemblDownloads/org2.gtf.gz" 41 | infilename=indir+"/ensemblDownloads/org2.gtf" ## Abhijit 42 | geneDic2,transcriptDic2,exonDic2,infoDic2=parse_organism_GTF(orgId2, infilename, outdir) 43 | 44 | ## these two files were downloaded by hand selecting columns from Ensembl's Biomart 45 | ## I weren't able to redo the same column selections recently so I decided to switch to 46 | ## parsing the orthology information from readily available Ensembl files like below ones: 47 | ## ftp://ftp.ensembl.org/pub/release-80/mysql/ensembl_mart_80/ 48 | ## hsapiens_gene_ensembl__homolog_mmus__dm.txt.gz 49 | #infilename="/projects/b1017/shared/Ensembl-files/Ensembl-human-GRCh38-to-mouse-GRCm38.p3.txt.gz" 50 | #genePairsHumanToMouse=parse_ensembl_gene_pairings(infilename) 51 | #infilename="/projects/b1017/shared/Ensembl-files/Ensembl-mouse-GRCm38.p3-to-human-GRCh38.txt.gz" 52 | #genePairsMouseToHuman=parse_ensembl_gene_pairings(infilename) 53 | #consistency_check(genePairsHumanToMouse,genePairsMouseToHuman) 54 | ## if consistency check is ok then just use one side. This is OK for one2one mappings. 
55 | #genePairsDic=genePairsHumanToMouse 56 | #pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir) 57 | 58 | #infilename=indir+"/ensemblDownloads/org1_homolog_org2.txt.gz" 59 | infilename=indir+"/ensemblDownloads/org1_homolog_org2.txt" ## Abhijit 60 | proteinToGeneDic,genePairsDic,proteinPairsDic=parse_ensembl_geneAndProtein_pairings(infilename,{},{}) 61 | print (["1",len(proteinToGeneDic),len(genePairsDic),len(proteinPairsDic)]) 62 | 63 | #infilename=indir+"/ensemblDownloads/org2_homolog_org1.txt.gz" 64 | infilename=indir+"/ensemblDownloads/org2_homolog_org1.txt" ## Abhijit 65 | proteinToGeneDic,genePairsDic,proteinPairsDic=parse_ensembl_geneAndProtein_pairings(infilename,proteinToGeneDic,proteinPairsDic) 66 | print (["2",len(proteinToGeneDic),len(genePairsDic),len(proteinPairsDic)]) 67 | 68 | 69 | exonDic1=assign_firstMidLast_exon_counts(exonDic1,transcriptDic1) 70 | exonDic2=assign_firstMidLast_exon_counts(exonDic2,transcriptDic2) 71 | 72 | typ="allExonPlusMinus" 73 | outfilename="None" 74 | fivePrimeFlank=12; threePrimeFlank=12 75 | 76 | ###### Not required ###### 77 | #exonDic1=extract_conservation_stats_for_exons(refGD1,exonDic1,typ,fivePrimeFlank,threePrimeFlank,outfilename) 78 | #exonDic2=extract_conservation_stats_for_exons(refGD2,exonDic2,typ,fivePrimeFlank,threePrimeFlank,outfilename) 79 | ###### 80 | 81 | outdir=argv[2] # overwrite previous summaries 82 | os.system("mkdir -p "+outdir) 83 | print_some_summary(orgId1, geneDic1,transcriptDic1,exonDic1,{}, outdir) 84 | print_some_summary(orgId2, geneDic2,transcriptDic2,exonDic2,{}, outdir) 85 | 86 | # outdir="perGenePairExonLists" 87 | if len(argv)==3: 88 | return 89 | 90 | outdir=argv[3] 91 | os.system("mkdir -p "+outdir) 92 | 93 | outfilename=outdir+"/genePairsSummary-one2one.txt" 94 | print_one2one_genePairs(genePairsDic,geneDic1,geneDic2,outfilename) # either way is ok since one2one 95 | 96 | geneOnlyOrthoDic1,transcriptOnlyOrthoDic1,exonOnlyOrthoDic1, geneOnlyOrthoDic2,transcriptOnlyOrthoDic2,exonOnlyOrthoDic2=pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir) 97 | 98 | print ([len(geneDic1), len(geneDic2)]) 99 | print (len(geneOnlyOrthoDic1)) 100 | print (len(geneOnlyOrthoDic2)) 101 | 102 | outdir=argv[2]+"/onlyOrthologAndCodingGenes" 103 | os.system("mkdir -p "+outdir) 104 | print (outdir) 105 | print_some_summary(orgId1, geneOnlyOrthoDic1,transcriptOnlyOrthoDic1,exonOnlyOrthoDic1,{}, outdir) 106 | print_some_summary(orgId2, geneOnlyOrthoDic2,transcriptOnlyOrthoDic2,exonOnlyOrthoDic2,{}, outdir) 107 | 108 | # 109 | # print_one2one_exonListPairs(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,orgId1,orgId2,outdir) 110 | # print_one2one_transcriptListPairs(genePairsDic,geneDic1,geneDic2,transcriptDic1,transcriptDic2,orgId1,orgId2,outdir) 111 | 112 | return 113 | 114 | if __name__ == "__main__": 115 | main(sys.argv) 116 | 117 | -------------------------------------------------------------------------------- /Human-Monkey-Processed-Data/scripts/splitExonsIntoIndividualFiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | def main(argv): 5 | infilename=argv[1] 6 | outdir=argv[2] 7 | whichCol=int(argv[3])-1 8 | fileSuffix=argv[4] 9 | infile=open(infilename,'r') 10 | lastExon="dummy" 11 | outfile=open("dummy.txt",'w') 12 | for line in infile: 13 | newExon=line.rstrip().split()[whichCol] # where exon name is 14 | 
if newExon!=lastExon:
15 |             outfile.close()
16 |             outfile=open(outdir+"/"+newExon+fileSuffix,'w')
17 |         #
18 |         outfile.write(line)
19 |         lastExon=newExon
20 |     #
21 |     outfile.close()
22 |     return
23 | 
24 | if __name__ == "__main__":
25 |     main(sys.argv)
26 | #
27 | 
28 | 
--------------------------------------------------------------------------------
/Human-Mouse-Preprocess-Data/README.md:
--------------------------------------------------------------------------------
1 | ## Steps to generate the input files (Human - Mouse)
2 | Users should run the _extMpreprocess_ script to generate the input files. All input files will be generated under the _preprocess/data_ folder. All required executables and scripts are provided here. The _extMpreprocess_ script has 7 individual steps and should be run in the following manner.
3 | 
4 | ### Run the following steps
5 | 
6 | - ![#f03c15](https://via.placeholder.com/15/f03c15/000000?text=+) For help, type<br>
7 | 
8 | ```bash
9 | ./extMpreprocess help
10 | 
11 | This script will download and preprocess the dataset required for exon-pair and transcript pair finding by ExTraMapper.
12 | Type ./extMpreprocess <config.conf> <step> to execute the script.
13 | Type ./extMpreprocess example to print an example config.conf file.
14 | 
15 | This script will run seven (7) sequential steps to create the inputs for the ExTraMapper program.
16 | Users can provide step numbers (1-7) or all as the argument of this script.
17 | Short description of the individual steps:
18 | Step 1: Download per organism specific files e.g. reference genomes, gene annotation files.
19 | Step 2: Will create genomedata archives with the genomes of org1 and org2 (Make sure to install genomedata package).
20 | Step 3: Pickle files for each homologous gene pair will be created.
21 | Step 4: Perform coordinate liftOver of exons with multiple mappings (This step requires bedtools and liftOver executables).
22 | Step 5-7: postprocessing the liftOver files.
23 | 
24 | example:
25 | 
26 | ./extMpreprocess config.human-mouse.conf all
27 | ```
28 | 
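Besides `all`, a single step number (1-7) can be passed as the second argument, which is handy for resuming a run after a failed download. A minimal sketch (assuming the config file shipped in this folder and that each step's prerequisites have already completed):

```bash
# Run the preprocessing step by step instead of passing "all"
./extMpreprocess config.human-mouse.conf 1   # download genomes, liftOver chains, Ensembl GTFs and homolog tables
./extMpreprocess config.human-mouse.conf 2   # build the genomedata archives
./extMpreprocess config.human-mouse.conf 3   # pickle per gene-pair information
```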
29 |
30 | - ![#f03c15](https://via.placeholder.com/15/f03c15/000000?text=+) The script requires the genomedata package, which can be installed by running the following command.<br>
31 | 32 | ```bash 33 | $ pip install genomedata --user 34 | ``` 35 |
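Step 2 of the pipeline calls the `genomedata-load-seq` and `genomedata-close-data` command-line tools, so it is worth checking that they ended up on your `PATH` after installation. A minimal sketch (the `~/.local/bin` location is only the usual default for `pip install --user`):

```bash
# Verify the genomedata command-line tools used in step 2 are reachable
command -v genomedata-load-seq genomedata-close-data \
  || echo "genomedata tools not found; add your pip --user bin directory (e.g. ~/.local/bin) to PATH"
```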
36 | 37 |
38 | 
39 | #### Once finished, the _extMpreprocess_ script should produce the _preprocess_ folder with the following subfolders.<br>
40 | 41 | ```bash 42 | ./preprocess 43 | |-- bin 44 | | `-- liftOver 45 | |-- data 46 | |-- human-mouse 47 | | |-- GTFsummaries 48 | | | |-- onlyOrthologAndCodingGenes 49 | | | | |-- org1-allExons-GTFparsed.txt 50 | | | | |-- org1-allGenes-GTFparsed.txt 51 | | | | |-- org1-allTranscripts-GTFparsed.txt 52 | | | | |-- org2-allExons-GTFparsed.txt 53 | | | | |-- org2-allGenes-GTFparsed.txt 54 | | | | `-- org2-allTranscripts-GTFparsed.txt 55 | | | |-- org1-allExons-GTFparsed.txt 56 | | | |-- org1-allGenes-GTFparsed.txt 57 | | | |-- org1-allTranscripts-GTFparsed.txt 58 | | | |-- org2-allExons-GTFparsed.txt 59 | | | |-- org2-allGenes-GTFparsed.txt 60 | | | `-- org2-allTranscripts-GTFparsed.txt 61 | | |-- ensemblDownloads 62 | | | |-- org1.gtf 63 | | | |-- org1.gtf.gz 64 | | | |-- org1_homolog_org2.txt 65 | | | |-- org1_homolog_org2.txt.gz 66 | | | |-- org2.gtf 67 | | | |-- org2.gtf.gz 68 | | | |-- org2_homolog_org1.txt 69 | | | `-- org2_homolog_org1.txt.gz 70 | | |-- genePairsSummary-one2one.txt 71 | | |-- genomedataArchives 72 | | | |-- org1 [25 entries exceeds filelimit, not opening dir] 73 | | | `-- org2 [22 entries exceeds filelimit, not opening dir] 74 | | |-- liftoverRelatedFiles [56 entries exceeds filelimit, not opening dir] 75 | | |-- perExonLiftoverCoords 76 | | | |-- org1 [654707 entries exceeds filelimit, not opening dir] 77 | | | `-- org2 [484860 entries exceeds filelimit, not opening dir] 78 | | |-- perGenePairPickledInfo [15804 entries exceeds filelimit, not opening dir] 79 | | 80 | |-- liftover_chains 81 | | |-- hg38 82 | | | `-- liftOver 83 | | | `-- hg38ToMm10.over.chain.gz 84 | | `-- mm10 85 | | `-- liftOver 86 | | `-- mm10ToHg38.over.chain.gz 87 | `-- reference_genomes 88 | |-- hg38 [27 entries exceeds filelimit, not opening dir] 89 | `-- mm10 [24 entries exceeds filelimit, not opening dir] 90 | 91 | ``` 92 |
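A quick sanity check after the run is to look at the one-to-one ortholog summary and the parsed GTF summaries listed in the tree above. A minimal sketch, assuming the default output location:

```bash
# Number of one-to-one ortholog gene pairs found between human and mouse
wc -l preprocess/data/human-mouse/genePairsSummary-one2one.txt

# First lines of the parsed exon summary for org1 (human), orthologous coding genes only
head preprocess/data/human-mouse/GTFsummaries/onlyOrthologAndCodingGenes/org1-allExons-GTFparsed.txt
```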
93 | 
94 | ##### The whole process should take several hours to complete!
95 | ##### [(Check also the Human-Monkey data processing steps)](https://github.com/ay-lab/ExTraMapper/tree/master/Human-Monkey-Processed-Data)
96 | 
--------------------------------------------------------------------------------
/Human-Mouse-Preprocess-Data/config.human-mouse.conf:
--------------------------------------------------------------------------------
1 | # reference genome versions
2 | ref1=hg38
3 | ref2=mm10
4 | 
5 | # short names of organisms
6 | org1=human
7 | org2=mouse
8 | 
9 | # Ensembl release version number to be used for both organisms
10 | releaseNo=102
11 | 
12 | # Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/
13 | org1EnsemblName=homo_sapiens
14 | org2EnsemblName=mus_musculus
15 | 
16 | # Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_81
17 | org1EnsemblMartName=hsapiens
18 | org2EnsemblMartName=mmusculus
19 | org1EnsemblMartNameShort=hsap
20 | org2EnsemblMartNameShort=mmus
21 | 
22 | #liftOver executable path (Please make sure it is executable, chmod u+x liftOver)
23 | liftOver=/Human-Mouse-Preprocess-Data/scripts/liftOver
24 | 
--------------------------------------------------------------------------------
/Human-Mouse-Preprocess-Data/extMpreprocess:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | 
3 | ## This script will download and preprocess the dataset required for
4 | ## exon-pair and transcript pair finding by ExTraMapper.
5 | ## The script requires a config.conf file which will direct this script
6 | ## to download and process the essential data.
7 | 
8 | ##################### config.conf file #####################
9 | ## Example of a human-monkey config.conf file:
10 | ##
11 | ## #Reference genome versions
12 | ## ref1=hg38
13 | ## ref2=rheMac10
14 | ##
15 | ## #Short names of organisms
16 | ## org1=human
17 | ## org2=rhesus
18 | ##
19 | ## #Ensembl release version number to be used for both organisms
20 | ## releaseNo=102
21 | ##
22 | ## #Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/
23 | ## org1EnsemblName=homo_sapiens
24 | ## org2EnsemblName=macaca_mulatta
25 | ##
26 | ## #Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_102
27 | ## org1EnsemblMartName=hsapiens
28 | ## org2EnsemblMartName=mmulatta
29 | ## org1EnsemblMartNameShort=hsap
30 | ## org2EnsemblMartNameShort=mmul
31 | ##
32 | ## #liftOver executable path (Check here https://hgdownload.cse.ucsc.edu/admin/exe)
33 | ## liftOver=/usr/bin/liftOver
34 | ##
35 | ##
36 | ## Example of a human-mouse config.conf file:
37 | ##
38 | ## #Reference genome versions
39 | ## ref1=hg38
40 | ## ref2=mm10
41 | ##
42 | ## #Short names of organisms
43 | ## org1=human
44 | ## org2=mouse
45 | ##
46 | ## #Ensembl release version number to be used for both organisms
47 | ## releaseNo=102
48 | ##
49 | ## #Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/
50 | ## org1EnsemblName=homo_sapiens
51 | ## org2EnsemblName=mus_musculus
52 | ##
53 | ## #Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_102
54 | ## org1EnsemblMartName=hsapiens
55 | ## org2EnsemblMartName=mmusculus
56 | ## org1EnsemblMartNameShort=hsap
57 | ## org2EnsemblMartNameShort=mmus
58 | ##
59 | ## #liftOver executable path (Check here https://hgdownload.cse.ucsc.edu/admin/exe)
60 | ## liftOver=/usr/bin/liftOver
61 | ##
62 | ############################################################
63 | 
64 | if ($#ARGV == -1 || $ARGV[0] eq "help") {
65 |     print ("\n");
66 |     print ("This script will download and preprocess the dataset required for exon-pair and transcript pair finding by ExTraMapper.\n");
67 |     print ("Type ./extMpreprocess <config.conf> <step> to execute the script.\n");
68 |     print ("Type ./extMpreprocess example to print an example config.conf file.\n\n");
69 |     print ("This script will run seven (7) sequential steps to create the inputs for the ExTraMapper program.\n");
70 |     print ("Users can provide step numbers (1-7) or all as the argument of this script.\n");
71 |     print ("Short description of the individual steps:\n");
72 |     print ("Step 1: Download per organism specific files e.g. 
reference genomes, gene annotation files.\n"); 73 | print ("Step 2: Will create genomedata archives with the genomes of org1 and org2 (Make sure to install genomedata package).\n"); 74 | print ("Step 3: Pickle files for each homologous gene pair will be created.\n"); 75 | print ("Step 4: Perform coordinate liftOver of exons with multiple mappings (This step requires bedtools and liftOver executables).\n"); 76 | print ("Step 5-7: postprocessing the liftOver files.\n"); 77 | print ("\n"); 78 | exit(); 79 | } elsif ($ARGV[0] eq "example") { 80 | my @exmpl = "# reference genome versions 81 | ref1=hg38 82 | ref2=mm10 83 | 84 | # short names of organisms 85 | org1=human 86 | org2=mouse 87 | 88 | # Ensembl release version number to be used for both organisms 89 | releaseNo=102 90 | 91 | # Find out the standard Ensembl names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/gtf/ 92 | org1EnsemblName=homo_sapiens 93 | org2EnsemblName=mus_musculus 94 | 95 | # Find out the full and short Ensembl Mart names for your organisms of interest from ftp://ftp.ensembl.org/pub/release-81/mysql/ensembl_mart_81 96 | org1EnsemblMartName=hsapiens 97 | org2EnsemblMartName=mmusculus 98 | org1EnsemblMartNameShort=hsap 99 | org2EnsemblMartNameShort=mmus 100 | 101 | #liftOver executable path (Check here https://hgdownload.cse.ucsc.edu/admin/exe) 102 | liftOver=/usr/bin/liftOver\n"; 103 | 104 | print (@exmpl); 105 | print ("\n"); 106 | open (out, ">config.human-mouse.conf"); 107 | print out @exmpl; 108 | close out; 109 | print ("The example config.human-mouse.conf file is written\n"); 110 | exit; 111 | } 112 | my ($configfile, $step) = @ARGV; 113 | chomp ($configfile, $step); 114 | 115 | #### File and folder check #### 116 | die "The $configfile does not exists, exit!" 
unless -e "$configfile";
117 | 
118 | 
119 | #### Get the environmental variables ####
120 | $ENV{'EXTRAMAPPER_DIR'} = $ENV{'PWD'};
121 | open(in, $configfile);
122 | while (my $var = <in>) {
123 |     chomp $var;
124 |     if ($var =~ /=/) {
125 |         $var_n = (split(/=/,$var))[0];
126 |         $var_v = (split(/=/,$var))[1];
127 |         $ENV{$var_n} = $var_v;
128 |     }
129 | }
130 | close in;
131 | 
132 | #### Set the variable folders and files ####
133 | $dataDir = "$ENV{'EXTRAMAPPER_DIR'}/preprocess/data";
134 | $dataDirPerPair = "$ENV{'EXTRAMAPPER_DIR'}/preprocess/data/$ENV{'org1'}-$ENV{'org2'}";
135 | $referenceGenomesDir = "$dataDir/reference_genomes";
136 | $chainsDir = "$dataDir/liftover_chains";
137 | $ensemblDir = "$dataDirPerPair/ensemblDownloads";
138 | $genomedataDir = "$dataDirPerPair/genomedataArchives";
139 | $GTFsummaryDir = "$dataDirPerPair/GTFsummaries";
140 | $perGenePairPickleDir= "$dataDirPerPair/perGenePairPickledInfo";
141 | $liftOverFilesDir = "$dataDirPerPair/liftoverRelatedFiles";
142 | $perExonLiftoverDir = "$dataDirPerPair/perExonLiftoverCoords";
143 | 
144 | #### Main functions and sub-routines ####
145 | sub getfasta {
146 |     my $path = $_[0];
147 |     my $org = $_[1];
148 |     my %chr;
149 |     open(chrname,"$path/$org/name_chr.txt");
150 |     while (<chrname>){
151 |         chomp $_;
152 |         $chr{$_} = 1;
153 |     }
154 |     close (chrname);
155 | 
156 |     my $file = "$path/$org/$org.fa.gz";
157 |     open(in, "zcat $file |");
158 |     while (<in>) {
159 |         chomp $_;
160 |         if ($_ =~ />/) {
161 |             $name = $_;
162 |             $ckpt = 0;
163 |             $name =~ s/>//g;
164 |             if ($chr{$name} ne "") {
165 |                 print ("Extracting $name from $org.fa.gz file\n");
166 |                 $ckpt = 1;
167 |                 open($out,"|gzip -c > $path/$org/$name.fa.gz");
168 |                 print $out (">$name\n");
169 |             } else {
170 |                 close ($out);
171 |             }
172 |         } else {
173 |             if ($ckpt == 1) {
174 |                 print $out ("$_\n");
175 |             }
176 |         }
177 |     }
178 |     close(in);
179 |     system("rm -rf $path/$org/$org.fa.gz");
180 |     print ("Finished extracting chromosomes and writing the individual *.fa.gz files\n");
181 |     print ("Removed $path/$org/$org.fa.gz\n");
182 | }
183 | 
184 | sub downloadrefgenome {
185 | 
186 |     my $path = $_[0];
187 |     my $org = $_[1];
188 |     if (!-d "$path/$org") {
189 |         print ("Creating $path/$org folder\n");
190 |         system("mkdir -p $path/$org");
191 |         print ("Running: wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/chromosomes/* --directory-prefix=$path/$org 2>&1 | grep \"No such directory\"\n");
192 |         my $error = `wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/chromosomes/* --directory-prefix=$path/$org 2>&1 | grep "No such directory"`;
193 |         if ($error =~ "No such directory") {
194 |             print ("There is no chromosome folder for $org. 
So, downloding the bigZip file and extracting them\n"); 195 | print ("Running: wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/bigZips/$org.fa.gz --directory-prefix=$path/$org 2> /dev/null\n"); 196 | system("wget --timestamping ftp://hgdownload.cse.ucsc.edu/goldenPath/$org/bigZips/$org.fa.gz --directory-prefix=$path/$org 2> /dev/null"); 197 | print ("Extracting the individual chromosomes\n"); 198 | print ("zcat $path/$org/$org.fa.gz |grep \">\" |grep -v \"_random\" |grep -v \"chrUn\" |sed 's/>//g' > $path/$org/name_chr.txt\n"); 199 | system("zcat $path/$org/$org.fa.gz |grep \">\" |grep -v \"_random\" |grep -v \"chrUn\" |sed 's/>//g' > $path/$org/name_chr.txt"); 200 | getfasta($path, $org); 201 | print "Reference genomes are downloaded in $path/$org\n"; 202 | } else { 203 | system("rm -rf $path/$org/*_random*"); 204 | system("rm -rf $path/$org/chrUn*"); 205 | system("rm -rf $path/$org/*_alt*"); 206 | } 207 | } else { 208 | print ("$path/$org folder already exists, skipping downloading the dataset\n"); 209 | } 210 | } 211 | 212 | sub downloadliftoverfiles { 213 | 214 | my $path = $_[0]; 215 | my $org1 = $_[1]; 216 | my $org2 = $_[2]; 217 | if (!-d "$path/$org1/liftOver") { 218 | print ("Creating $path/$org1/liftOver folder\n"); 219 | system("mkdir -p $path/$org1/liftOver"); 220 | my $ref2Cap =`echo $org2 | python -c "s=input(); print (s[0].upper()+s[1:])"`; 221 | chomp $ref2Cap; 222 | my $chain_name = $org1."To".$ref2Cap; 223 | print ("Running: wget http://hgdownload.cse.ucsc.edu/goldenPath/$org1/liftOver/$chain_name.over.chain.gz --directory-prefix=$path/$org1/liftOver\n"); 224 | system("wget http://hgdownload.cse.ucsc.edu/goldenPath/$org1/liftOver/$chain_name.over.chain.gz --directory-prefix=$path/$org1/liftOver 2> /dev/null"); 225 | print ("LiftOver chain saved to $path/$org1/liftOver/$chain_name.over.chain.gz\n"); 226 | } else { 227 | print ("$path/$org1 folder already exists, skipping download\n"); 228 | } 229 | } 230 | 231 | sub downloadensmblfiles { 232 | 233 | my $path = $_[0]; 234 | my $releaseNo = $_[1]; 235 | my $org1EnsemblName = $_[2]; 236 | my $org1EnsemblMartName = $_[3]; 237 | my $org2EnsemblName = $_[4]; 238 | my $org2EnsemblMartName = $_[5]; 239 | 240 | print ("Downloading GTF files\n"); 241 | if (!-e "$path/org1.gtf.gz") { 242 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org1EnsemblName/*.$releaseNo.gtf.gz -O $path/org1.gtf.gz\n"); 243 | system("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org1EnsemblName/*.$releaseNo.gtf.gz -O $path/org1.gtf.gz 2> /dev/null"); 244 | print ("GTF files downloaded in $path\n"); 245 | } else { 246 | print ("$path/org1.gtf.gz file exists, skipping download\n"); 247 | } 248 | if (!-e "$path/org2.gtf.gz") { 249 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org2EnsemblName/*.$releaseNo.gtf.gz -O $path/org2.gtf.gz\n"); 250 | system("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/gtf/$org2EnsemblName/*.$releaseNo.gtf.gz -O $path/org2.gtf.gz 2> /dev/null"); 251 | print ("GTF files downloaded in $path\n"); 252 | } else { 253 | print ("$path/org2.gtf.gz file exists, skipping download\n"); 254 | } 255 | 256 | print ("Downloading ENSEMBL homologs\n"); 257 | if (!-e "$path/org1_homolog_org2.txt.gz") { 258 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org1EnsemblMartName\_gene_ensembl__homolog_$org2EnsemblMartName\__dm.txt.gz -O $path/org1_homolog_org2.txt.gz\n"); 259 | system("wget 
ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org1EnsemblMartName\_gene_ensembl__homolog_$org2EnsemblMartName\__dm.txt.gz -O $path/org1_homolog_org2.txt.gz 2> /dev/null"); 260 | print ("ENSEMBL homolog downloaded in $path\n"); 261 | } else { 262 | print ("$path/org1_homolog_org2.txt.gz file exists, skipping download\n"); 263 | } 264 | 265 | if (!-e "$path/org2_homolog_org1.txt.gz") { 266 | print ("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org2EnsemblMartName\_gene_ensembl__homolog_$org1EnsemblMartName\__dm.txt.gz -O $path/org2_homolog_org1.txt.gz\n"); 267 | system("wget ftp://ftp.ensembl.org/pub/release-$releaseNo/mysql/ensembl_mart_$releaseNo/$org2EnsemblMartName\_gene_ensembl__homolog_$org1EnsemblMartName\__dm.txt.gz -O $path/org2_homolog_org1.txt.gz 2> /dev/null"); 268 | print ("ENSEMBL homolog downloaded in $path\n"); 269 | } else { 270 | print ("$path/org2_homolog_org1.txt.gz file exists, skipping download\n"); 271 | } 272 | 273 | } 274 | 275 | sub ltime { 276 | 277 | my $time = localtime; 278 | return($time); 279 | } 280 | 281 | sub genomedataarchive { 282 | 283 | my $path = $_[0]; 284 | my $org = $_[1]; 285 | my $ref = $_[2]; 286 | my $referenceGenomesDir = $_[3]; 287 | my $old_path = $ENV{'PWD'}; 288 | chdir $path; 289 | if (-e "$ref.fa") { 290 | print ("Deleting the existing $ref.fa\n"); 291 | system("rm -rf $ref.fa"); 292 | } 293 | if (!-d $org) { 294 | print ("Running : zcat $referenceGenomesDir/$ref/*.fa.gz > $ref.fa\n"); 295 | print ("Started at ",ltime(),"\n"); 296 | system("zcat $referenceGenomesDir/$ref/*.fa.gz > $ref.fa"); 297 | print ("Ended at ",ltime(),"\n"); 298 | print ("Running : genomedata-load-seq -d $org $ref.fa\n"); 299 | print ("Started at ",ltime(),"\n"); 300 | system("genomedata-load-seq -d $org $ref.fa"); 301 | system("genomedata-close-data $org"); 302 | print ("Ended at ",ltime(),"\n"); 303 | system("rm -rf $ref.fa"); 304 | } else { 305 | print ("$org genomedata exists, skipping the step\n"); 306 | } 307 | chdir $old_path; 308 | } 309 | 310 | sub parseAndPicklePerPair { 311 | 312 | my $extmapper_path = $_[0]; 313 | my $ensemblDir = $_[1]; 314 | my $dataDirPerPair = $_[2]; 315 | my $GTFsummaryDir = $_[3]; 316 | my $perGenePairPickleDir = $_[4]; 317 | 318 | if (!-e "$ensemblDir/org1.gtf") { 319 | print ("Running : gunzip -k $ensemblDir/org1.gtf.gz\n"); 320 | system("gunzip -k $ensemblDir/org1.gtf.gz"); 321 | } else { 322 | print ("$ensemblDir/org1.gtf file present, skipping gunzip action\n"); 323 | } 324 | if (!-e "$ensemblDir/org2.gtf") { 325 | print ("Running : gunzip -k $ensemblDir/org2.gtf.gz\n"); 326 | system("gunzip -k $ensemblDir/org2.gtf.gz"); 327 | } else { 328 | print ("$ensemblDir/org2.gtf file present, skipping gunzip action\n"); 329 | } 330 | if (!-e "$ensemblDir/org1_homolog_org2.txt") { 331 | print ("Running : gunzip -k $ensemblDir/org1_homolog_org2.txt.gz\n"); 332 | system("gunzip -k $ensemblDir/org1_homolog_org2.txt.gz"); 333 | } else { 334 | print ("$ensemblDir/org1_homolog_org2.txt file present, skipping gunzip action\n"); 335 | } 336 | if (!-e "$ensemblDir/org2_homolog_org1.txt") { 337 | print ("Running : gunzip -k $ensemblDir/org2_homolog_org1.txt.gz\n"); 338 | system("gunzip -k $ensemblDir/org2_homolog_org1.txt.gz"); 339 | } else { 340 | print ("$ensemblDir/org2_homolog_org1.txt file present, skipping gunzip action\n"); 341 | } 342 | 343 | if (!-d $perGenePairPickleDir) { 344 | print ("Running : python $extmapper_path/scripts/parseAndPicklePerPair.py $dataDirPerPair 
$GTFsummaryDir $perGenePairPickleDir\n"); 345 | print ("Started at ",ltime(),"\n"); 346 | system("python $extmapper_path/scripts/parseAndPicklePerPair.py $dataDirPerPair $GTFsummaryDir $perGenePairPickleDir"); 347 | print ("Ended at ",ltime(),"\n"); 348 | system("mv $perGenePairPickleDir/genePairsSummary-one2one.txt $dataDirPerPair/genePairsSummary-one2one.txt"); 349 | } else { 350 | print ("perGenePairPickleDir found, skipping\n"); 351 | } 352 | } 353 | 354 | sub liftoverexonmultiplemapping { 355 | 356 | my $GTFsummaryDir = $_[0]; 357 | my $liftOverFilesDir = $_[1]; 358 | my $chainsDir = $_[2]; 359 | my $ref1 = $_[3]; 360 | my $ref2 = $_[4]; 361 | my $extmapper_path = $_[5]; 362 | 363 | my $indir = "$GTFsummaryDir/onlyOrthologAndCodingGenes"; 364 | 365 | print ("Running : cat $indir/org1-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_allExonsList.bed\n"); 366 | print ("Started at ",ltime(),"\n"); 367 | system("cat $indir/org1-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_allExonsList.bed"); 368 | print ("Ended at ",ltime(),"\n"); 369 | 370 | print ("Running : cat $indir/org2-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allExonsList.bed\n"); 371 | print ("Started at ",ltime(),"\n"); 372 | system("cat $indir/org2-allExons-GTFparsed.txt | awk -v OFS='\\t' 'NR>1{print \$1,\$2,\$3,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allExonsList.bed"); 373 | print ("Ended at ",ltime(),"\n"); 374 | 375 | print ("Running : cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_partCodingExonsList.bed\n"); 376 | print ("Started at ",ltime(),"\n"); 377 | system("cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_partCodingExonsList.bed"); 378 | print ("Ended at ",ltime(),"\n"); 379 | 380 | print ("Running : cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_partCodingExonsList.bed\n"); 381 | print ("Started at ",ltime(),"\n"); 382 | system("cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"partCoding\" {print \$1,\$7,\$8,\$4,\$5}' | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_partCodingExonsList.bed"); 383 | print ("Ended at ",ltime(),"\n"); 384 | 385 | print ("Running : cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org1_f.temp\n"); 386 | print ("Started at ",ltime(),"\n"); 387 | system("cat $indir/org1-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org1_f.temp"); 388 | print ("Ended at ",ltime(),"\n"); 389 | 390 | print ("Running : cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org2_f.temp\n"); 391 | print ("Started at ",ltime(),"\n"); 392 | system("cat $indir/org2-allExons-GTFparsed.txt |awk -v OFS='\\t' '\$6==\"fullCoding\" {print \$1,\$2,\$3,\$4,\$5}' > $liftOverFilesDir/org2_f.temp"); 393 | print ("Ended at ",ltime(),"\n"); 394 | 395 | print ("Running : cat $liftOverFilesDir/org1_partCodingExonsList.bed $liftOverFilesDir/org1_f.temp | sort 
-k1,1 -k2,2n > $liftOverFilesDir/org1_allCodingExonsList.bed\n"); 396 | print ("Started at ",ltime(),"\n"); 397 | system("cat $liftOverFilesDir/org1_partCodingExonsList.bed $liftOverFilesDir/org1_f.temp | sort -k1,1 -k2,2n > $liftOverFilesDir/org1_allCodingExonsList.bed"); 398 | print ("Ended at ",ltime(),"\n"); 399 | 400 | print ("Running : cat $liftOverFilesDir/org2_partCodingExonsList.bed $liftOverFilesDir/org2_f.temp | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allCodingExonsList.bed\n"); 401 | print ("Started at ",ltime(),"\n"); 402 | system("cat $liftOverFilesDir/org2_partCodingExonsList.bed $liftOverFilesDir/org2_f.temp | sort -k1,1 -k2,2n > $liftOverFilesDir/org2_allCodingExonsList.bed"); 403 | print ("Ended at ",ltime(),"\n"); 404 | 405 | print ("Running : cat $liftOverFilesDir/org1_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allCodingExonsList.sorted.temp\n"); 406 | print ("Started at ",ltime(),"\n"); 407 | system("cat $liftOverFilesDir/org1_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allCodingExonsList.sorted.temp"); 408 | print ("Ended at ",ltime(),"\n"); 409 | 410 | print ("Running : cat $liftOverFilesDir/org2_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allCodingExonsList.sorted.temp\n"); 411 | print ("Started at ",ltime(),"\n"); 412 | system("cat $liftOverFilesDir/org2_allCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allCodingExonsList.sorted.temp"); 413 | print ("Ended at ",ltime(),"\n"); 414 | 415 | print ("Running : cat $liftOverFilesDir/org1_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allExonsList.sorted.temp\n"); 416 | print ("Started at ",ltime(),"\n"); 417 | system("cat $liftOverFilesDir/org1_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_allExonsList.sorted.temp"); 418 | print ("Ended at ",ltime(),"\n"); 419 | 420 | print ("Running : cat $liftOverFilesDir/org2_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allExonsList.sorted.temp\n"); 421 | print ("Started at ",ltime(),"\n"); 422 | system("cat $liftOverFilesDir/org2_allExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_allExonsList.sorted.temp"); 423 | print ("Ended at ",ltime(),"\n"); 424 | 425 | print ("Running : cat $liftOverFilesDir/org1_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_partCodingExonsList.sorted.temp\n"); 426 | print ("Started at ",ltime(),"\n"); 427 | system("cat $liftOverFilesDir/org1_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org1_partCodingExonsList.sorted.temp"); 428 | print ("Ended at ",ltime(),"\n"); 429 | 430 | print ("Running : cat $liftOverFilesDir/org2_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_partCodingExonsList.sorted.temp\n"); 431 | print ("Started at ",ltime(),"\n"); 432 | system("cat $liftOverFilesDir/org2_partCodingExonsList.bed |awk '{print \$5,\$0}' | sort -k1,1 > $liftOverFilesDir/org2_partCodingExonsList.sorted.temp"); 433 | print ("Ended at ",ltime(),"\n"); 434 | 435 | my $chain1to2=`ls $chainsDir/$ref1/liftOver/*.over.chain.gz`; 436 | my $chain2to1=`ls $chainsDir/$ref2/liftOver/*.over.chain.gz`; 437 | chomp ($chain1to2, $chain2to1); 438 | 439 | foreach my $minMatch (qw{1 0.95 0.9}) { 440 | print ("Running : $extmapper_path/scripts/liftover-withMultiples 0 $minMatch $chain1to2 
$chain2to1\n"); 441 | print ("Started at ",ltime(),"\n"); 442 | system("$extmapper_path/scripts/liftover-withMultiples 0 $minMatch $chain1to2 $chain2to1"); 443 | print ("Ended at ",ltime(),"\n"); 444 | } 445 | system("rm -rf $liftOverFilesDir/org2_allExonsList.sorted.temp"); 446 | system("rm -rf $liftOverFilesDir/org1_allExonsList.sorted.temp"); 447 | system("rm -rf $liftOverFilesDir/org2_partCodingExonsList.sorted.temp"); 448 | system("rm -rf $liftOverFilesDir/org1_partCodingExonsList.sorted.temp"); 449 | system("rm -rf $liftOverFilesDir/org2_allCodingExonsList.sorted.temp"); 450 | system("rm -rf $liftOverFilesDir/org1_allCodingExonsList.sorted.temp"); 451 | } 452 | 453 | sub liftoverfilesprocess { 454 | 455 | my $indir = $_[0]; 456 | my $outdir = $_[1]; 457 | my $flank = $_[2]; 458 | my $extmapper_path = $_[3]; 459 | 460 | if (-e "oneHugeFile-2to1-partCoding.txt") { 461 | system("rm -rf oneHugeFile-2to1-partCoding.txt"); 462 | } 463 | if (-e "oneHugeFile-1to2-partCoding.txt") { 464 | system("rm -rf oneHugeFile-1to2-partCoding.txt"); 465 | } 466 | 467 | foreach my $minMatch (qw{1 0.95 0.9}) { 468 | $suffix="flank$flank-minMatch$minMatch-multiples-partCoding"; 469 | print ("Running : zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt\n"); 470 | print ("Started at ",ltime(),"\n"); 471 | system("zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt"); 472 | print ("Ended at ",ltime(),"\n"); 473 | 474 | print ("Running : zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt\n"); 475 | print ("Started at ",ltime(),"\n"); 476 | system("zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\".\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt"); 477 | print ("Started at ",ltime(),"\n"); 478 | } 479 | 480 | if (-e "oneHugeFile-2to1-others.txt") { 481 | system("rm -rf oneHugeFile-2to1-others.txt"); 482 | } 483 | if (-e "oneHugeFile-1to2-others.txt") { 484 | system("rm -rf oneHugeFile-1to2-others.txt"); 485 | } 486 | 487 | foreach my $minMatch (qw{1 0.95 0.9}) { 488 | $suffix="flank$flank-minMatch$minMatch-multiples"; 489 | print ("Running : zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-others.txt\n"); 490 | print ("Started at ",ltime(),"\n"); 491 | system("zcat $indir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-2to1-others.txt"); 492 | print ("Ended at ",ltime(),"\n"); 493 | 494 | print ("Running : zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' s=$suffix >> oneHugeFile-1to2-others.txt\n"); 495 | print ("Started at ",ltime(),"\n"); 496 | system("zcat $indir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed.gz |awk -v OFS='\\t' '\$6!=\"\.\"{print \$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8,\$11,\$9,\$12,s}' 
s=$suffix >> oneHugeFile-1to2-others.txt"); 497 | print ("Ended at ",ltime(),"\n"); 498 | } 499 | 500 | print ("Running : cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k10,10 >oneHugeFile-1to2.txt.sorted\n"); 501 | print ("Started at ",ltime(),"\n"); 502 | system("cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k10,10 >oneHugeFile-1to2.txt.sorted"); 503 | print ("Ended at ",ltime(),"\n"); 504 | 505 | print ("Running : cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k10,10 >oneHugeFile-2to1.txt.sorted\n"); 506 | print ("Started at ",ltime(),"\n"); 507 | system("cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k10,10 >oneHugeFile-2to1.txt.sorted"); 508 | print ("Ended at ",ltime(),"\n"); 509 | 510 | system("mkdir -p $outdir/org1 $outdir/org2"); 511 | $whichCol=10; 512 | $fileSuffix="_mapped.txt"; 513 | 514 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix\n"); 515 | print ("Started at ",ltime(),"\n"); 516 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix"); 517 | print ("Ended at ",ltime(),"\n"); 518 | 519 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix\n"); 520 | print ("Started at ",ltime(),"\n"); 521 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix"); 522 | print ("Ended at ",ltime(),"\n"); 523 | 524 | print ("Removing temporary files\n"); 525 | system("rm -rf oneHugeFile*.txt"); 526 | 527 | } 528 | 529 | sub liftoverfilesprocessunmappedexons { 530 | 531 | my $indir = $_[0]; 532 | my $outdir = $_[1]; 533 | my $flank = $_[2]; 534 | my $extmapper_path = $_[3]; 535 | 536 | if (-e "oneHugeFile-2to1-partCoding.txt") { 537 | system("rm -rf oneHugeFile-2to1-partCoding.txt"); 538 | } 539 | if (-e "oneHugeFile-1to2-partCoding.txt") { 540 | system("rm -rf oneHugeFile-1to2-partCoding.txt"); 541 | } 542 | 543 | foreach my $minMatch (qw{1 0.95 0.9}) { 544 | $suffix="flank$flank-minMatch$minMatch-multiples-partCoding"; 545 | print ("Running : zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt\n"); 546 | print ("Started at ",ltime(),"\n"); 547 | system("zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-partCoding.txt"); 548 | print ("Ended at ",ltime(),"\n"); 549 | 550 | print ("Running : zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt\n"); 551 | print ("Started at ",ltime(),"\n"); 552 | system("zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk -v OFS='\\t' '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-partCoding.txt"); 553 | print ("Ended at ",ltime(),"\n"); 554 | } 555 | 556 | if (-e "oneHugeFile-2to1-others.txt") { 557 | system("rm -rf oneHugeFile-2to1-others.txt"); 558 | } 559 | if (-e "oneHugeFile-1to2-others.txt") { 560 | system("rm -rf oneHugeFile-1to2-others.txt"); 561 | } 562 | 563 | foreach my $minMatch (qw{1 0.95 0.9}) { 564 | 
$suffix="flank$flank-minMatch$minMatch-multiples"; 565 | print ("Running : zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt\n"); 566 | print ("Started at ",ltime(),"\n"); 567 | system("zcat $indir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt"); 568 | print ("Ended at ",ltime(),"\n"); 569 | 570 | print ("Running : zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt\n"); 571 | print ("Started at ",ltime(),"\n"); 572 | system("zcat $indir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt"); 573 | print ("Ended at ",ltime(),"\n"); 574 | } 575 | 576 | print ("Running : cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted\n"); 577 | print ("Started at ",ltime(),"\n"); 578 | system("cat oneHugeFile-1to2-partCoding.txt oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted"); 579 | print ("Ended at ",ltime(),"\n"); 580 | 581 | print ("Running : cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted\n"); 582 | print ("Started at ",ltime(),"\n"); 583 | system("cat oneHugeFile-2to1-partCoding.txt oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted"); 584 | print ("Ended at ",ltime(),"\n"); 585 | 586 | system("mkdir -p $outdir/org1 $outdir/org2"); 587 | $whichCol=5; 588 | $fileSuffix="_unmapped.txt"; 589 | 590 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix\n"); 591 | print ("Started at ",ltime(),"\n"); 592 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix"); 593 | print ("Ended at ",ltime(),"\n"); 594 | 595 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix\n"); 596 | print ("Started at ",ltime(),"\n"); 597 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix"); 598 | print ("Ended at ",ltime(),"\n"); 599 | } 600 | 601 | sub liftoverfilesprocessmappedexons { 602 | 603 | my $indir = $_[0]; 604 | my $outdir = $_[1]; 605 | my $flank = $_[2]; 606 | my $extmapper_path = $_[3]; 607 | 608 | if (-e "oneHugeFile-2to1-others.txt") { 609 | system("rm -rf oneHugeFile-2to1-others.txt"); 610 | } 611 | if (-e "oneHugeFile-1to2-others.txt") { 612 | system("rm -rf oneHugeFile-1to2-others.txt"); 613 | } 614 | 615 | foreach my $minMatch (qw{1 0.95 0.9}) { 616 | $suffix="flank$flank-minMatch$minMatch-multiples"; 617 | print ("Running : zcat $indir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt\n"); 618 | print ("Started at ",ltime(),"\n"); 619 | system("zcat $indir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-2to1-others.txt"); 620 | print ("Ended at ",ltime(),"\n"); 621 | 622 | print ("Running : zcat $indir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed.gz |awk '{print 
\$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt\n"); 623 | print ("Started at ",ltime(),"\n"); 624 | system("zcat $indir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed.gz |awk '{print \$1,\$2,\$3,\$6,\$4,\$5,s}' s=$suffix >> oneHugeFile-1to2-others.txt"); 625 | print ("Ended at ",ltime(),"\n"); 626 | } 627 | 628 | print ("Running : cat oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted\n"); 629 | print ("Started at ",ltime(),"\n"); 630 | system("cat oneHugeFile-1to2-others.txt | sort -k5,5 >oneHugeFile-1to2.txt.sorted"); 631 | print ("Ended at ",ltime(),"\n"); 632 | 633 | print ("Running : cat oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted\n"); 634 | print ("Started at ",ltime(),"\n"); 635 | system("cat oneHugeFile-2to1-others.txt | sort -k5,5 >oneHugeFile-2to1.txt.sorted"); 636 | print ("Ended at ",ltime(),"\n"); 637 | 638 | system("mkdir -p $outdir/org1 $outdir/org2"); 639 | $whichCol=5; 640 | $fileSuffix="_nonintersecting.txt"; 641 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix\n"); 642 | print ("Started at ",ltime(),"\n"); 643 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-1to2.txt.sorted $outdir/org1 $whichCol $fileSuffix"); 644 | print ("Ended at ",ltime(),"\n"); 645 | 646 | print ("Running : python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix\n"); 647 | print ("Started at ",ltime(),"\n"); 648 | system("python $extmapper_path/scripts/splitExonsIntoIndividualFiles.py oneHugeFile-2to1.txt.sorted $outdir/org2 $whichCol $fileSuffix"); 649 | print ("Ended at ",ltime(),"\n"); 650 | 651 | print ("Removing temporary files\n"); 652 | system("rm -rf oneHugeFile* dummy.txt"); 653 | } 654 | 655 | sub step { 656 | 657 | my $step = $_[0]; 658 | 659 | if ($step == 1 || $step eq "all" || $step eq "All" || $step eq "ALL") { 660 | 661 | print ("Running step 1:\n"); 662 | print ("Downloading per organism specific files and keep the original organism names for future reuse\n"); 663 | print ("Downloading the two reference genomes from UCSC and get rid of unknown, random and alt contigs\n"); 664 | 665 | system("mkdir -p $referenceGenomesDir"); 666 | downloadrefgenome($referenceGenomesDir, $ENV{'ref1'}); 667 | downloadrefgenome($referenceGenomesDir, $ENV{'ref2'}); 668 | 669 | system("mkdir -p $chainsDir"); 670 | downloadliftoverfiles($chainsDir, $ENV{'ref1'}, $ENV{'ref2'}); 671 | downloadliftoverfiles($chainsDir, $ENV{'ref2'}, $ENV{'ref1'}); 672 | 673 | system("mkdir -p $ensemblDir"); 674 | downloadensmblfiles($ensemblDir, $ENV{'releaseNo'}, $ENV{'org1EnsemblName'}, $ENV{'org1EnsemblMartName'}, $ENV{'org2EnsemblName'}, $ENV{'org2EnsemblMartName'}); 675 | print ("---------------------- Step 1 Finished ----------------------\n"); 676 | } 677 | 678 | if ($step == 2 || $step eq "all" || $step eq "All" || $step eq "ALL") { 679 | 680 | print ("Running step 2:\n"); 681 | print ("Initialize the genomedata archives with the genomes of org1 and org2\n"); 682 | print ("Make sure genomedata is installed first\n"); 683 | print ("Installation: pip install genomedata --user\n"); 684 | system("mkdir -p $genomedataDir"); 685 | 686 | genomedataarchive($genomedataDir, "org1", $ENV{'ref1'}, $referenceGenomesDir); 687 | genomedataarchive($genomedataDir, "org2", $ENV{'ref2'}, $referenceGenomesDir); 688 | print ("---------------------- Step 2 
Finished ----------------------\n"); 689 | } 690 | 691 | if ($step == 3 || $step eq "all" || $step eq "All" || $step eq "ALL") { 692 | print ("Running step 3:\n"); 693 | print ("Creating pickle files\n"); 694 | parseAndPicklePerPair($ENV{'EXTRAMAPPER_DIR'}, $ensemblDir, $dataDirPerPair, $GTFsummaryDir, $perGenePairPickleDir); 695 | print ("---------------------- Step 3 Finished ----------------------\n"); 696 | } 697 | 698 | if ($step == 4 || $step eq "all" || $step eq "All" || $step eq "ALL") { 699 | print ("Running step 4:\n"); 700 | print ("liftOver the exon lists but this time allow multiple mappings and also compute intersections with the other set of exons\n"); 701 | system("mkdir -p $liftOverFilesDir"); 702 | system("mkdir -p preprocess/bin"); 703 | if (!-e "./preprocess/bin/liftOver") { 704 | system("ln -s \$(readlink $ENV{liftOver}) ./preprocess/bin"); 705 | } 706 | liftoverexonmultiplemapping($GTFsummaryDir, $liftOverFilesDir, $chainsDir, $ENV{'ref1'}, $ENV{'ref2'}, $ENV{'EXTRAMAPPER_DIR'}); 707 | print ("---------------------- Step 4 Finished ----------------------\n"); 708 | } 709 | 710 | if ($step == 5 || $step eq "all" || $step eq "All" || $step eq "ALL") { 711 | print ("Running step 5:\n"); 712 | print ("Putting together, sorting, making them uniq and then splitting into one file per exon for all the liftover files created so far\n"); 713 | liftoverfilesprocess($liftOverFilesDir, $perExonLiftoverDir, 0, $ENV{'EXTRAMAPPER_DIR'}); 714 | print ("---------------------- Step 5 Finished ----------------------\n"); 715 | } 716 | 717 | if ($step == 6 || $step eq "all" || $step eq "All" || $step eq "ALL") { 718 | print ("Running step 6:\n"); 719 | print ("Putting together, sorting, making them uniq and then splitting into one file per exon for all the liftover files created for UNMAPPED EXONS so far\n"); 720 | liftoverfilesprocessunmappedexons($liftOverFilesDir, $perExonLiftoverDir, 0, $ENV{'EXTRAMAPPER_DIR'}); 721 | print ("---------------------- Step 6 Finished ----------------------\n"); 722 | } 723 | 724 | if ($step == 7 || $step eq "all" || $step eq "All" || $step eq "ALL") { 725 | print ("Runing step 7:\n"); 726 | print ("Putting together, sorting, making them uniq and then splitting into one file per exon for all the liftover files for MAPPED EXONS that DO NOT INTERSECT WITH AN EXON so far\n"); 727 | liftoverfilesprocessmappedexons($liftOverFilesDir, $perExonLiftoverDir, 0, $ENV{'EXTRAMAPPER_DIR'}); 728 | print ("---------------------- Step 7 Finished ----------------------\n"); 729 | print ("Preporcessing steps finished!\n"); 730 | } 731 | } 732 | 733 | step($step); 734 | -------------------------------------------------------------------------------- /Human-Mouse-Preprocess-Data/scripts/liftOver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ay-lab/ExTraMapper/ff8bf6399e457c041e10ab8d94c83ae54414b273/Human-Mouse-Preprocess-Data/scripts/liftOver -------------------------------------------------------------------------------- /Human-Mouse-Preprocess-Data/scripts/liftover-withMultiples: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | set -o pipefail 3 | set -o errexit 4 | 5 | source config.conf 6 | 7 | 8 | dataDir=${EXTRAMAPPER_DIR}/preprocess/data 9 | dataDirPerPair=${EXTRAMAPPER_DIR}/preprocess/data/$org1-$org2 10 | 11 | chainsDir=$dataDir/liftover_chains 12 | ensemblDir=$dataDirPerPair/ensemblDownloads 13 | 14 | 
liftOverFilesDir=$dataDirPerPair/liftoverRelatedFiles 15 | perExonLiftoverDir=$dataDirPerPair/perExonLiftoverCoords 16 | 17 | outdir=$liftOverFilesDir 18 | flank=$1 19 | minMatch=$2 20 | 21 | chain1to2=$3 22 | chain2to1=$4 23 | 24 | mkdir -p $ensemblDir 25 | 26 | GTFfile1=$ensemblDir/org1.gtf.gz 27 | GTFfile2=$ensemblDir/org2.gtf.gz 28 | org1to2homologFile=$ensemblDir/org1_homolog_org2.txt.gz 29 | org2to1homologFile=$ensemblDir/org2_homolog_org1.txt.gz 30 | refGDdir1=$ensemblDir/org1 # genomedata archive for org1 31 | refGDdir2=$ensemblDir/org2 # genomedata archive for org2 32 | 33 | ########################## need to add 1 to liftedOver coordinates to match UCSC coordinates ################### 34 | ############## HOWEVER, this is only correct if original/lifted strands are same -/- or +/+ #################### 35 | ############## THEREFORE, I account manually for this by checking the strand pairs #################### 36 | 37 | ############## ALSO, liftOver does not CHANGE the strand of original coordinates when used ###################### 38 | ############# without the -multiple option and it DOES with -multiple. ######################### 39 | ############ HENCE, I handle these two cases differently. ######################## 40 | 41 | ## OLDER AND INCORRECT WAY #1 ########################################################################################################### 42 | # zcat $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed.gz | awk '{print $1"\t"$2+1"\t"$3+1"\t"$4"\t"$5}' \ 43 | # > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp 44 | # 45 | # zcat $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed.gz | awk '{print $1"\t"$2+1"\t"$3+1"\t"$4"\t"$5}' \ 46 | # > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 47 | # 48 | #rm -rf $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 49 | ############################################################################################################################################### 50 | 51 | 52 | # 53 | # first work on the partCoding exons 54 | suffix=flank$flank-minMatch$minMatch-multiples-partCoding 55 | 56 | # fourth fields stays the same, fifth is replaced by multiplicity, sixth will be the new strand after liftover 57 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org1_partCodingExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 58 | $chain1to2 org2_mapped-$suffix.bed org2_unmapped-$suffix.bed -minMatch=$minMatch -multiple 59 | 60 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org2_partCodingExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 61 | $chain2to1 org1_mapped-$suffix.bed org1_unmapped-$suffix.bed -minMatch=$minMatch -multiple 62 | 63 | # chr, start, end, exonId, Multiplicity, strand (after conversion) 64 | cat org1_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed 65 | cat org2_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 66 | 67 | # chr, start, end, exonId, Why unmapped, strand (before conversion) 68 | cat org1_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 69 | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed 70 | cat org2_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 71 | sort -k1,1 -k2,2n > 
$outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 72 | 73 | rm -rf org1_mapped-$suffix.bed org2_mapped-$suffix.bed org1_unmapped-$suffix.bed org2_unmapped-$suffix.bed 74 | 75 | # take the intersections 76 | ## NEW AND CORRECT WAY - FOR ONLY liftOver with -multiple OPTION ########################################################################### 77 | cat $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp 78 | join $outdir/org2_partCodingExonsList.sorted.temp mapped.temp | \ 79 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \ 80 | | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp 81 | bedtools intersect -a $outdir/org1_allCodingExonsList.bed \ 82 | -b $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -wao \ 83 | > $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed 84 | 85 | bedtools intersect -b $outdir/org1_allCodingExonsList.bed \ 86 | -a $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -v \ 87 | > $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed 88 | 89 | cat $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp 90 | join $outdir/org1_partCodingExonsList.sorted.temp mapped.temp | \ 91 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \ 92 | | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 93 | 94 | bedtools intersect -a $outdir/org2_allCodingExonsList.bed \ 95 | -b $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -wao \ 96 | > $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed 97 | 98 | bedtools intersect -b $outdir/org2_allCodingExonsList.bed \ 99 | -a $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -v \ 100 | > $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed 101 | ############################################################################################################################################### 102 | 103 | rm -rf $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp mapped.temp 104 | 105 | gzip $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 106 | gzip $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 107 | gzip $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed 108 | gzip $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed 109 | 110 | # 111 | # now work on all exons including the partCoding, nonCoding and fullCoding ones 112 | 113 | suffix=flank$flank-minMatch$minMatch-multiples 114 | 115 | # fourth fields stays the same, fifth is replaced by multiplicity, sixth will be the new strand after liftover 116 | #liftOver <(cat $outdir/org1_allExonsList.bed | awk '{if ($4=="+") print $1,$2-s,$3+s,$5,$4,$4; else print $1,$2-s,$3+s,$5,$4,$4;}' s=$flank) \ 117 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver <(cat $outdir/org1_allExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 118 | $chain1to2 org2_mapped-$suffix.bed org2_unmapped-$suffix.bed -minMatch=$minMatch -multiple 119 | 120 | ${EXTRAMAPPER_DIR}/preprocess/bin/liftOver 
<(cat $outdir/org2_allExonsList.bed | awk '{print $1,$2-s,$3+s,$5,$4,$4}' s=$flank) \ 121 | $chain2to1 org1_mapped-$suffix.bed org1_unmapped-$suffix.bed -minMatch=$minMatch -multiple 122 | 123 | cat org1_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed 124 | cat org2_mapped-$suffix.bed | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 125 | #cat org1_unmapped-$suffix.bed | awk 'NR%2==1' | sort | uniq - 126 | #cat org2_unmapped-$suffix.bed | awk 'NR%2==1' | sort | uniq - 127 | 128 | cat org1_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 129 | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed 130 | cat org2_unmapped-$suffix.bed | awk '{l1=$1; getline; printf("%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,l1,$5)}' |\ 131 | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 132 | 133 | rm -rf org1_mapped-$suffix.bed org2_mapped-$suffix.bed org1_unmapped-$suffix.bed org2_unmapped-$suffix.bed 134 | 135 | # take the intersections 136 | ## NEW AND CORRECT WAY - FOR ONLY liftOver with -multiple OPTION ########################################################################### 137 | # This correction in coordinates leads to some exons with that doesn't have any file mapped, unmapped, nonintersecting. ## 138 | # There is only 2 such exons and they be deemed unmapped (i.e., deleted from the second organism) # 139 | cat $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp 140 | join $outdir/org2_allExonsList.sorted.temp mapped.temp | \ 141 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \ 142 | | sort -k1,1 -k2,2n > $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp 143 | bedtools intersect -a $outdir/org1_allExonsList.bed \ 144 | -b $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -wao \ 145 | > $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed 146 | bedtools intersect -b $outdir/org1_allExonsList.bed \ 147 | -a $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp -sorted -v \ 148 | > $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed 149 | 150 | cat $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed | awk '{print $4,$0}' | sort -k1,1 > mapped.temp 151 | join $outdir/org1_allExonsList.sorted.temp mapped.temp | \ 152 | awk '{s=$8; e=$9; if ($5!=$12) {s=s+1; e=e+1;}; print $7"\t"s"\t"e"\t"$10"\t"$11"\t"$12}' \ 153 | | sort -k1,1 -k2,2n > $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp 154 | 155 | bedtools intersect -a $outdir/org2_allExonsList.bed \ 156 | -b $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -wao \ 157 | > $outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed 158 | 159 | bedtools intersect -b $outdir/org2_allExonsList.bed \ 160 | -a $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp -sorted -v \ 161 | > $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed 162 | 163 | rm -rf $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.temp $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.temp mapped.temp 164 | 165 | gzip $outdir/org2_to_org1_liftOver_mappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_mappedExonsList-$suffix.bed 166 | gzip $outdir/org2_to_org1_liftOver_unmappedExonsList-$suffix.bed $outdir/org1_to_org2_liftOver_unmappedExonsList-$suffix.bed 167 | gzip 
$outdir/org2_VS_org1_to_org2_intersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_intersectingExonsList-$suffix.bed 168 | gzip $outdir/org2_VS_org1_to_org2_nonintersectingExonsList-$suffix.bed $outdir/org1_VS_org2_to_org1_nonintersectingExonsList-$suffix.bed 169 | 170 | ############################################################################################################################################### 171 | 172 | -------------------------------------------------------------------------------- /Human-Mouse-Preprocess-Data/scripts/parseAndPicklePerPair.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################## 3 | ### To use the functions in this lib simply import this python module using 4 | ### import ensemblUtils 5 | ### Then you'll able able to call functions with the proper arguments using 6 | ### returnVal=ensemblUtils.func1(arg1,arg2) 7 | ############################################################################## 8 | ############################################################################## 9 | import sys 10 | import os 11 | import string 12 | import math 13 | import gzip 14 | import _pickle as pickle 15 | 16 | # reads from exported environment variable 17 | ExTraMapperPath=os.environ['EXTRAMAPPER_DIR'] 18 | sys.path.append(ExTraMapperPath+"/scripts") 19 | from ensemblUtils import * 20 | 21 | # Testing functionalities 22 | def main(argv): 23 | indir=argv[1] 24 | orgId1="org1"; orgId2="org2"; 25 | refGD1=indir+"/genomedataArchives/org1" 26 | refGD2=indir+"/genomedataArchives/org2" 27 | 28 | 29 | # outdir="GTFsummaries"; 30 | if len(argv)==2: 31 | return 32 | 33 | outdir=argv[2] 34 | os.system("mkdir -p "+outdir) 35 | 36 | #infilename=indir+"/ensemblDownloads/org1.gtf.gz" 37 | infilename=indir+"/ensemblDownloads/org1.gtf" ## Abhijit 38 | geneDic1,transcriptDic1,exonDic1,infoDic1=parse_organism_GTF(orgId1, infilename, outdir) 39 | 40 | #infilename=indir+"/ensemblDownloads/org2.gtf.gz" 41 | infilename=indir+"/ensemblDownloads/org2.gtf" ## Abhijit 42 | geneDic2,transcriptDic2,exonDic2,infoDic2=parse_organism_GTF(orgId2, infilename, outdir) 43 | 44 | ## these two files were downloaded by hand selecting columns from Ensembl's Biomart 45 | ## I weren't able to redo the same column selections recently so I decided to switch to 46 | ## parsing the orthology information from readily available Ensembl files like below ones: 47 | ## ftp://ftp.ensembl.org/pub/release-80/mysql/ensembl_mart_80/ 48 | ## hsapiens_gene_ensembl__homolog_mmus__dm.txt.gz 49 | #infilename="/projects/b1017/shared/Ensembl-files/Ensembl-human-GRCh38-to-mouse-GRCm38.p3.txt.gz" 50 | #genePairsHumanToMouse=parse_ensembl_gene_pairings(infilename) 51 | #infilename="/projects/b1017/shared/Ensembl-files/Ensembl-mouse-GRCm38.p3-to-human-GRCh38.txt.gz" 52 | #genePairsMouseToHuman=parse_ensembl_gene_pairings(infilename) 53 | #consistency_check(genePairsHumanToMouse,genePairsMouseToHuman) 54 | ## if consistency check is ok then just use one side. This is OK for one2one mappings. 
55 | #genePairsDic=genePairsHumanToMouse 56 | #pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir) 57 | 58 | #infilename=indir+"/ensemblDownloads/org1_homolog_org2.txt.gz" 59 | infilename=indir+"/ensemblDownloads/org1_homolog_org2.txt" ## Abhijit 60 | proteinToGeneDic,genePairsDic,proteinPairsDic=parse_ensembl_geneAndProtein_pairings(infilename,{},{}) 61 | print (["1",len(proteinToGeneDic),len(genePairsDic),len(proteinPairsDic)]) 62 | 63 | #infilename=indir+"/ensemblDownloads/org2_homolog_org1.txt.gz" 64 | infilename=indir+"/ensemblDownloads/org2_homolog_org1.txt" ## Abhijit 65 | proteinToGeneDic,genePairsDic,proteinPairsDic=parse_ensembl_geneAndProtein_pairings(infilename,proteinToGeneDic,proteinPairsDic) 66 | print (["2",len(proteinToGeneDic),len(genePairsDic),len(proteinPairsDic)]) 67 | 68 | 69 | exonDic1=assign_firstMidLast_exon_counts(exonDic1,transcriptDic1) 70 | exonDic2=assign_firstMidLast_exon_counts(exonDic2,transcriptDic2) 71 | 72 | typ="allExonPlusMinus" 73 | outfilename="None" 74 | fivePrimeFlank=12; threePrimeFlank=12 75 | 76 | ###### Not required ###### 77 | #exonDic1=extract_conservation_stats_for_exons(refGD1,exonDic1,typ,fivePrimeFlank,threePrimeFlank,outfilename) 78 | #exonDic2=extract_conservation_stats_for_exons(refGD2,exonDic2,typ,fivePrimeFlank,threePrimeFlank,outfilename) 79 | ###### 80 | 81 | outdir=argv[2] # overwrite previous summaries 82 | os.system("mkdir -p "+outdir) 83 | print_some_summary(orgId1, geneDic1,transcriptDic1,exonDic1,{}, outdir) 84 | print_some_summary(orgId2, geneDic2,transcriptDic2,exonDic2,{}, outdir) 85 | 86 | # outdir="perGenePairExonLists" 87 | if len(argv)==3: 88 | return 89 | 90 | outdir=argv[3] 91 | os.system("mkdir -p "+outdir) 92 | 93 | outfilename=outdir+"/genePairsSummary-one2one.txt" 94 | print_one2one_genePairs(genePairsDic,geneDic1,geneDic2,outfilename) # either way is ok since one2one 95 | 96 | geneOnlyOrthoDic1,transcriptOnlyOrthoDic1,exonOnlyOrthoDic1, geneOnlyOrthoDic2,transcriptOnlyOrthoDic2,exonOnlyOrthoDic2=pickle_one2one_genePairs_allInfo(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,transcriptDic1,transcriptDic2,outdir) 97 | 98 | print ([len(geneDic1), len(geneDic2)]) 99 | print (len(geneOnlyOrthoDic1)) 100 | print (len(geneOnlyOrthoDic2)) 101 | 102 | outdir=argv[2]+"/onlyOrthologAndCodingGenes" 103 | os.system("mkdir -p "+outdir) 104 | print (outdir) 105 | print_some_summary(orgId1, geneOnlyOrthoDic1,transcriptOnlyOrthoDic1,exonOnlyOrthoDic1,{}, outdir) 106 | print_some_summary(orgId2, geneOnlyOrthoDic2,transcriptOnlyOrthoDic2,exonOnlyOrthoDic2,{}, outdir) 107 | 108 | # 109 | # print_one2one_exonListPairs(genePairsDic,geneDic1,geneDic2,exonDic1,exonDic2,orgId1,orgId2,outdir) 110 | # print_one2one_transcriptListPairs(genePairsDic,geneDic1,geneDic2,transcriptDic1,transcriptDic2,orgId1,orgId2,outdir) 111 | 112 | return 113 | 114 | if __name__ == "__main__": 115 | main(sys.argv) 116 | 117 | -------------------------------------------------------------------------------- /Human-Mouse-Preprocess-Data/scripts/splitExonsIntoIndividualFiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | def main(argv): 5 | infilename=argv[1] 6 | outdir=argv[2] 7 | whichCol=int(argv[3])-1 8 | fileSuffix=argv[4] 9 | infile=open(infilename,'r') 10 | lastExon="dummy" 11 | outfile=open("dummy.txt",'w') 12 | for line in infile: 13 | newExon=line.rstrip().split()[whichCol] # where exon name is 14 | 
if newExon!=lastExon: 15 | outfile.close() 16 | outfile=open(outdir+"/"+newExon+fileSuffix,'w') 17 | # 18 | outfile.write(line) 19 | lastExon=newExon 20 | # 21 | outfile.close() 22 | return 23 | 24 | if __name__ == "__main__": 25 | main(sys.argv) 26 | # 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 ay-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ExTraMapper 2 | ExTraMapper is a tool to find Exon and Transcript-level Mappings of a given pair of orthologous genes between two organisms using sequence conservation. The figure below shows the overall schematic description of ExTraMapper mapping the homologous transcript and exon pairs between the human and mouse genomes. 3 | 4 | 5 | ![ExTraMapper_Figure](https://user-images.githubusercontent.com/18036388/90572310-8b693e00-e168-11ea-9fbc-8188c2834de9.jpg) 6 | 7 | # Steps to run ExTraMapper (for Python 3 or later) 8 | 9 | ### Step 1: Prepare the input files 10 | ExTraMapper requires a set of preprocessed files to find the conservation scores. Examples to create these files are provided within the following folders: 11 | 12 | 1. [__Human-Mouse-Preprocessed-Data__](https://github.com/ay-lab/ExTraMapper/tree/master/Human-Mouse-Preprocess-Data) 13 | 14 | and 15 | 16 | 2. [__Human-Rhesus_macaque-Preprocessed-Data__](https://github.com/ay-lab/ExTraMapper/tree/master/Human-Monkey-Processed-Data) 17 | 18 | ### Steps to generate the input files 19 | Users should run the _extMpreprocess_ script within the above Preprocessed-Data folders to generate the input files. All the input files will be generated under the _preprocess/data_ folder. All the required executables and scripts are provided there. The _extMpreprocess_ script has 7 individual steps and should be run in the following manner: 20 | 21 | - ![#f03c15](https://via.placeholder.com/15/f03c15/000000?text=+) For help, type<br>
22 | 23 | ```bash 24 | ./extMpreprocess help 25 | 26 | This script will download and preprocess the dataset required for exon-pair and transcript pair finding by ExTraMapper. 27 | Type ./extMpreprocess to execute the script. 28 | Type ./extMpreprocess example to print an example config.conf file. 29 | 30 | This script will run seven (7) sequential steps to create the inputs for ExTraMapper program. 31 | Users can provide step numbers (1-7) or all in the argument of this script. 32 | Short description of the individual scripts: 33 | Step 1: Download per organism specific files e.g. reference genomes, gene annotation files. 34 | Step 2: Will create genomedata archives with the genomes of org1 and org2 (Make sure to install genomedata package). 35 | Step 3: Pickle files for each homologous gene pair will be created. 36 | Step 4: Perform coordinate liftOver of exons with multiple mappings (This step requires bedtools and liftOver executables). 37 | Step 5-7: postprocessing the liftOver files. 38 | 39 | example: 40 | 41 | ./extMpreprocess config.human-mouse.conf all 42 | ``` 43 | <br>
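For example, if you prefer to run the preprocessing in stages (say, on a cluster where the liftOver step is scheduled separately), the help text above says each step can be requested by its number; a minimal sketch using the same config file:

```bash
# Run the seven preprocessing steps one at a time with the same config file;
# passing "all" instead of a number runs the whole pipeline in one call.
for step in 1 2 3 4 5 6 7; do
    ./extMpreprocess config.human-mouse.conf $step
done
```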
44 |
45 | 46 | ### Step 2: Set the following path 47 | ```bash export EXTRAMAPPER_DIR=/path/to/this/folder``` 48 | 49 |
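Every downstream script reads this variable (for example `parseAndPicklePerPair.py` calls `os.environ['EXTRAMAPPER_DIR']`), so it is worth verifying it before running anything. A quick sanity check, assuming the variable should point at the root of your local clone of this repository (the path below is a placeholder):

```bash
# EXTRAMAPPER_DIR must be set in the shell that launches the scripts
export EXTRAMAPPER_DIR=/path/to/ExTraMapper
echo "$EXTRAMAPPER_DIR"
ls "$EXTRAMAPPER_DIR/ExTraMapper.py"   # should list the main script if the path is correct
```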
50 | 51 | ### Step 3: Run ExTraMapper individually 52 | ```bash 53 | $ python ExTraMapper.py -h 54 | usage: ExTraMapper.py [-h] -m MAPPING -o1 ORG1 -o2 ORG2 -p ORTHOLOG 55 | 56 | Check the help flag 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | -m MAPPING ExTraMapper Exon threshold value [e.g. 1] 61 | -o1 ORG1 First organism name [e.g. human] 62 | -o2 ORG2 Second organism name [e.g. mouse] 63 | -p ORTHOLOG Orthologous gene pair [e.g. ENSG00000141510-ENSMUSG00000059552 OR all] 64 | ``` 65 | 66 | #### Example run of ExTraMapper.py using orthologous gene pair ENSG00000141510-ENSMUSG00000059552 67 | ```bash 68 | $ python ExTraMapper.py -m 1 -o1 human -o2 mouse -p ENSG00000141510-ENSMUSG00000059552 69 | 70 | Finding exon mappings for gene pair number 0 ENSG00000141510-ENSMUSG00000059552 71 | ***************************************************************** 72 | Gene pair ID: ENSG00000141510-ENSMUSG00000059552 73 | 74 | Information about each gene. Last two numbers are no of transcripts and exons 75 | ENSG00000141510 chr17 7661779 7687538 - ENSG00000141510 TP53 protein_coding 27 49 gene 76 | ENSMUSG00000059552 chr11 69580359 69591873 + ENSMUSG00000059552 Trp53 protein_coding 6 24 gene 77 | 78 | Number of exons before and after duplicate removal according to coordinates 79 | Org1 49 40 80 | Org2 24 20 81 | 82 | ***************************************************************** 83 | 84 | ***************************************************************** 85 | GCGCTGGGGACCTGTCCCTAGGGGGCAGATGAGACACTGATGGGCGTACTTAGAGATTTGCCATGAAGTGGGTTTGAAGAATGGAGCTGTGTGTGAAAT 86 | Exon file type summaries for the first gene from: ENSG00000141510-ENSMUSG00000059552 87 | 0 exons with: No file exists 88 | 22 exons with: Only Mapped 89 | 0 exons with: Only nonintersecting 90 | 11 exons with: Only unmapped 91 | 15 exons with: Mapped and unmapped 92 | 0 exons with: Mapped and nonintersecting 93 | 1 exons with: Nonintersecting and unmapped 94 | 0 exons with: All three files 95 | Exon file type summaries for the second gene from: ENSG00000141510-ENSMUSG00000059552 96 | 0 exons with: No file exists 97 | 14 exons with: Only Mapped 98 | 0 exons with: Only nonintersecting 99 | 3 exons with: Only unmapped 100 | 7 exons with: Mapped and unmapped 101 | 0 exons with: Mapped and nonintersecting 102 | 0 exons with: Nonintersecting and unmapped 103 | 0 exons with: All three files 104 | Writing exon-level similarity scores into file: 105 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/exonLevelSimilarities-1.0.txt 106 | 107 | Writing exon classes into file: 108 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/exonClasses-1.0.txt 109 | For org1: Mapped exons= 17, Unmapped exons= 21, Nonintersecting exons= 1, OTHER= 10 110 | For org2: Mapped exons= 13, Unmapped exons= 7, Nonintersecting exons= 0, OTHER= 4 111 | ***************************************************************** 112 | 113 | ***************************************************************** 114 | Writing exon-level mappings into file: 115 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/exonLevelMappings-1.0.txt 116 | Writing trascript-level similarity scores into file: 117 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/transcriptLevelSimilarities-1.0.txt 118 | Writing transcript-level mappings into file: 119 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/transcriptLevelMappings-1.0.txt 120 | 121 | Condition counter from the greedy transcript mapping stage: 122 | 5 pairs 
with Condition1: Unique winner pair 123 | 0 pairs with Condition2: Tie in one score, not in the other 124 | 0 pairs with Condition3: Tie in both scores but coding exon length diff breaks the tie 125 | 0 pairs with Condition4: Tie in both scores and coding exon length diff but overall exon length breaks the tie 126 | 1 pairs with Condition5: Tie in all the above but coding length (bp) diff breaks the tie 127 | 0 pairs with Condition6: Tie in all the above, just give up and report all 128 | 129 | Writing UCSC browser bed output for org1 into file: 130 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/org1-ucsc-1.0.bed 131 | Writing UCSC browser bed output for org2 into file: 132 | /path/output/human-mouse/ENSG00000141510-ENSMUSG00000059552/org2-ucsc-1.0.bed 133 | 134 | ........ 135 | ExTraMapper ran successfully for 1 gene pairs between: human and mouse 136 | 137 | 138 | ***************************************************************** 139 | $ tree ./output 140 | 141 | ./output 142 | `-- human-mouse 143 | `-- ENSG00000141510-ENSMUSG00000059552 144 | |-- exonClasses-1.0.txt 145 | |-- exonLevelMappings-1.0.txt 146 | |-- exonLevelSimilarities-1.0.txt 147 | |-- org1-ucsc-1.0.bed 148 | |-- org2-ucsc-1.0.bed 149 | |-- transcriptLevelMappings-1.0.txt 150 | `-- transcriptLevelSimilarities-1.0.txt 151 | ``` 152 | 153 | Note: The __exonLevelMappings-1.0.txt__ & __transcriptLevelMappings-1.0.txt__ file contains the mapped exon and transcript pairs from __ENSG00000141510-ENSMUSG00000059552__ orthologous gene-pair. 154 | 155 |
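If you only want a quick look at one pair's transcript mappings without running the summariser described below, the mapping files are whitespace-delimited; judging from the fields that _extMsummarise_ extracts (transcript IDs in columns 5 and 11, the two similarity scores in columns 19 and 20, 1-based), a sketch would be:

```bash
# Print transcript ID pairs with their overall and coding similarity scores
# (column positions inferred from how extMsummarise parses this file; NR>1 skips the header)
awk 'NR>1 {print $5, $11, $19, $20}' \
    ./output/human-mouse/ENSG00000141510-ENSMUSG00000059552/transcriptLevelMappings-1.0.txt
```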
156 | 157 | # OR 158 | 159 | ### Step 3: Run ExTraMapper for all the gene pairs 160 | ```bash 161 | $ python ExTraMapper.py -h 162 | usage: ExTraMapper.py [-h] -m MAPPING -o1 ORG1 -o2 ORG2 -p all 163 | ``` 164 | 165 |
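The same command maps every orthologous gene pair when `-p all` is given; for the human-mouse setup used above this is simply:

```bash
# Map all orthologous gene pairs at once; per-pair results are written
# under ./output/human-mouse/ exactly as in the single-pair example above
python ExTraMapper.py -m 1 -o1 human -o2 mouse -p all
```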
166 | 167 | ### Summarise the ExTraMapper results ### 168 | Run the _extMsummarise_ script to generate a concatenated file with all the results. Run the script in the following manner: 169 | ```bash 170 | $ ./extMsummarise help 171 | Type ./extMsummarise <preprocess_folder> <extramapper_folder> <orthologous_genepair_list> <org1name> <org2name> <outputprefix> 172 | preprocess_folder : Path to the preprocess folder generated by the extMpreprocess script 173 | extramapper_folder : Path to the output folder generated by the ExTraMapper program 174 | orthologous_genepair_list : A list of orthologous gene-pairs 175 | org1name : org1 name e.g. human 176 | org2name : org2 name e.g. mouse 177 | outputprefix : output file prefix 178 | 179 | example : 180 | ./extMsummarise ./preprocess ./output gene-pair.list human mouse extramapper-result 181 | ``` 182 | <br>
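The `orthologous_genepair_list` is expected to contain one gene-pair identifier per line (e.g. `ENSG00000141510-ENSMUSG00000059552`), matching the per-pair folder names that ExTraMapper writes under its output directory. One simple way to build it and run the summariser, assuming the default `./output` location used above:

```bash
# Build the pair list from the per-pair output folders, then summarise everything
ls ./output/human-mouse > gene-pair.list
./extMsummarise ./preprocess ./output gene-pair.list human mouse extramapper-result
```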
183 | 184 | 185 | # Precomputed Results 186 | 187 | Check the [Result/Exon-Pairs](https://github.com/ay-lab/ExTraMapper/tree/master/Result/Exon-Pairs) and [Result/Transcript-Pairs](https://github.com/ay-lab/ExTraMapper/tree/master/Result/Transcript-Pairs) folders to download the precomputed ExTraMapper results for human-mouse and human-rhesus orthologous exon and transcript pairs. 188 | 189 | <br>
190 | 191 | ### Cite this work 192 | [_ExTraMapper: Exon- and Transcript-level mappings for orthologous gene pairs._](https://academic.oup.com/bioinformatics/advance-article-abstract/doi/10.1093/bioinformatics/btab393/6278896?redirectedFrom=fulltext) 193 | 194 | __Chakraborty A, Ay F, Davuluri RV. ExTraMapper: Exon- and Transcript-level mappings for orthologous gene pairs. Bioinformatics. 2021 May 20:btab393. doi: 10.1093/bioinformatics/btab393. Epub ahead of print. PMID: 34014317.__ 195 | 196 | The analyses shown in the above paper were performed using the Human & Mouse ENSEMBL release 81 annotations with Python 2.7 code. 197 | The current update uses ENSEMBL release 102 and Python 3 or later. To see the older code and data, please 198 | switch the __Branch__ from __master__ to [__ExTraMapper-python2v__](https://github.com/ay-lab/ExTraMapper/tree/ExTraMapper-python2v). 199 | 200 | ### Check the webserver for a nice visualization 201 | https://ay-lab-tools.lji.org/extramapper/index.html 202 | -------------------------------------------------------------------------------- /Result/Exon-Pairs/README.md: -------------------------------------------------------------------------------- 1 | ## Download the exon pair files 2 | 3 | [ExTraMapper_Exon_Mapping_ENSMBL102_Human_vs_Monkey](https://drive.google.com/file/d/1L9Ef7vYr9R66xW-zz4wfVU4moVDCVddX/view?usp=sharing) 4 | 5 | (This file contains Human and monkey (Rhesus macaque) orthologous exon pairs from ExTraMapper. The results were generated using ENSEMBL version 102) 6 | 7 | [ExTraMapper_Exon_Mapping_ENSMBL102_Genome_Build_Human_vs_Mouse](https://drive.google.com/file/d/1vpJCW5hmNDmWdmGn6cxDWHvyC7oiCRFy/view?usp=sharing) 8 | 9 | (This file contains Human and mouse orthologous exon pairs from ExTraMapper. The results were generated using ENSEMBL version 102) 10 | 11 | [ExTraMapper_Exon_Mapping_ENSMBL81_Genome_Build_Human_vs_Mouse](https://drive.google.com/file/d/1eeJ9_ck6-WKMox2Kw1A4VT43z3IDEJYU/view?usp=sharing) 12 | 13 | (This file contains Human and mouse orthologous exon pairs from ExTraMapper. 
The results were generated using ENSEMBL version 81) 14 | -------------------------------------------------------------------------------- /Result/Transcript-Pairs/ExTraMapper_Transcript_Mapping_ENSMBL102_Genome_Build_Human_vs_Mouse.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ay-lab/ExTraMapper/ff8bf6399e457c041e10ab8d94c83ae54414b273/Result/Transcript-Pairs/ExTraMapper_Transcript_Mapping_ENSMBL102_Genome_Build_Human_vs_Mouse.xlsx -------------------------------------------------------------------------------- /Result/Transcript-Pairs/ExTraMapper_Transcript_Mapping_ENSMBL102_Human_vs_Monkey.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ay-lab/ExTraMapper/ff8bf6399e457c041e10ab8d94c83ae54414b273/Result/Transcript-Pairs/ExTraMapper_Transcript_Mapping_ENSMBL102_Human_vs_Monkey.xlsx -------------------------------------------------------------------------------- /Result/Transcript-Pairs/ExTraMapper_Transcript_Mapping_ENSMBL81_Genome_Build_Human_vs_Mouse.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ay-lab/ExTraMapper/ff8bf6399e457c041e10ab8d94c83ae54414b273/Result/Transcript-Pairs/ExTraMapper_Transcript_Mapping_ENSMBL81_Genome_Build_Human_vs_Mouse.xlsx -------------------------------------------------------------------------------- /Result/Transcript-Pairs/README.md: -------------------------------------------------------------------------------- 1 | ## File Description 2 | 3 | #### 1. ExTraMapper_Transcript_Mapping_ENSMBL102_Genome_Build_Human_vs_Mouse.xlsx : 4 | 5 | This file contains Human and mouse orthologous transcript pairs from ExTraMapper. The results were generated using ENSEMBL version 102. 6 | 7 | #### 2. ExTraMapper_Transcript_Mapping_ENSMBL81_Genome_Build_Human_vs_Mouse.xlsx : 8 | 9 | This file contains Human and mouse orthologous transcript pairs from ExTraMapper. The results were generated using ENSEMBL version 81. 10 | 11 | #### 3. ExTraMapper_Transcript_Mapping_ENSMBL102_Human_vs_Monkey.xlsx : 12 | 13 | This file contains Human and monkey (Rhesus macaque) orthologous transcript pairs from ExTraMapper. The results were generated using ENSEMBL version 102. 14 | -------------------------------------------------------------------------------- /extMsummarise: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | sub generatefiles { 4 | 5 | my $htgtf = $_[0]; 6 | my $mtgtf = $_[1]; 7 | my $ogene = $_[2]; 8 | my $etmfl = $_[3]; 9 | my $mping = $_[4]; 10 | my $outpt = $_[5]; 11 | my %geneid; 12 | 13 | open(hgtf_in, $htgtf); 14 | while (my $line = <hgtf_in>) { 15 | chomp $line; 16 | if ($. > 1) { 17 | my $tname = (split(/\s+/,$line))[4]; 18 | my $tgene = (split(/\s+/,$line))[5]; 19 | my @exons = split(/,/,(split(/\s+/,$line))[9]); 20 | $geneid{'h'}{'t'}{$tname} = $tgene; 21 | foreach my $e (@exons) { 22 | chomp $e; 23 | if ($geneid{'h'}{'e'}{$e} eq "") { 24 | $geneid{'h'}{'e'}{$e} = $tgene; 25 | } else { 26 | $geneid{'h'}{'e'}{$e} = "$geneid{'h'}{'e'}{$e},$tgene"; 27 | } 28 | } 29 | undef @exons; 30 | } 31 | } 32 | close(hgtf_in); 33 | 34 | open(mgtf_in, $mtgtf); 35 | while (my $line = <mgtf_in>) { 36 | chomp $line; 37 | if ($. 
> 1) { 38 | my $tname = (split(/\s+/,$line))[4]; 39 | my $tgene = (split(/\s+/,$line))[5]; 40 | my @exons = split(/,/,(split(/\s+/,$line))[9]); 41 | $geneid{'m'}{'t'}{$tname} = $tgene; 42 | foreach my $e (@exons) { 43 | chomp $e; 44 | if ($geneid{'m'}{'e'}{$e} eq "") { 45 | $geneid{'m'}{'e'}{$e} = $tgene; 46 | } else { 47 | $geneid{'m'}{'e'}{$e} = "$geneid{'m'}{'e'}{$e},$tgene"; 48 | } 49 | } 50 | undef @exons; 51 | } 52 | } 53 | close(mgtf_in); 54 | 55 | open(out_tpair,">$outpt.transcriptLevelMappings-$mping.txt"); 56 | open(out_epair,">$outpt.exonLevelMappings-$mping.txt"); 57 | print out_tpair ("chrName1\tstartCoord1\tendCoord1\tstrand1\tchrName2\tstartCoord2\tendCoord2\tstrand2\ttranscriptID1\ttranscriptID2\ttranscriptName1\ttranscriptName2\ttranscriptType1\ttranscriptType2\toverallSimScore\tcodingSimScore\tortholog\n"); 58 | print out_epair ("chrName1\tstartCoord1\tendCoord1\tstrand1\tchrName2\tstartCoord2\tendCoord2\tstrand2\texonID1\texonID2\texonName1\texonName2\texonType1\texonType2\toverlapScoreFromFullLength\toverlapScoreFromPartialCodingPart\tortholog\n"); 59 | open(ogene_in, $ogene); 60 | while (my $line = <ogene_in>) { 61 | chomp $line; 62 | open (extresult_trans_in, "$etmfl/$line/transcriptLevelMappings-$mping.txt"); 63 | while (my $r = <extresult_trans_in>) { 64 | if ($. > 1) { 65 | my $chrName1 = (split(/\s+/, $r))[0]; 66 | my $startCoord1 = (split(/\s+/, $r))[1]; 67 | my $endCoord1 = (split(/\s+/, $r))[2]; 68 | my $strand1 = (split(/\s+/, $r))[3]; 69 | my $chrName2 = (split(/\s+/, $r))[6]; 70 | my $startCoord2 = (split(/\s+/, $r))[7]; 71 | my $endCoord2 = (split(/\s+/, $r))[8]; 72 | my $strand2 = (split(/\s+/, $r))[9]; 73 | my $transcriptID1 = (split(/\s+/, $r))[4]; 74 | my $transcriptType1 = (split(/\s+/, $r))[5]; 75 | my $transcriptID2 = (split(/\s+/, $r))[10]; 76 | my $transcriptType2 = (split(/\s+/, $r))[11]; 77 | my $overallSimScore = (split(/\s+/, $r))[18]; 78 | my $codingSimScore = (split(/\s+/, $r))[19]; 79 | my $transcriptName1 = $geneid{'h'}{'t'}{$transcriptID1}; 80 | my $transcriptName2 = $geneid{'m'}{'t'}{$transcriptID2}; 81 | print out_tpair ("$chrName1\t$startCoord1\t$endCoord1\t$strand1\t$chrName2\t$startCoord2\t$endCoord2\t$strand2\t$transcriptID1\t$transcriptID2\t$transcriptName1\t$transcriptName2\t$transcriptType1\t$transcriptType2\t$overallSimScore\t$codingSimScore\t$line\n"); 82 | } 83 | } 84 | close(extresult_trans_in); 85 | 86 | open (extresult_exons_in, "$etmfl/$line/exonLevelMappings-$mping.txt"); 87 | while (my $r = <extresult_exons_in>) { 88 | if ($. 
> 1) { 89 | my $chrName1 = (split(/\s+/, $r))[0]; 90 | my $startCoord1 = (split(/\s+/, $r))[1]; 91 | my $endCoord1 = (split(/\s+/, $r))[2]; 92 | my $strand1 = (split(/\s+/, $r))[3]; 93 | my $chrName2 = (split(/\s+/, $r))[6]; 94 | my $startCoord2 = (split(/\s+/, $r))[7]; 95 | my $endCoord2 = (split(/\s+/, $r))[8]; 96 | my $strand2 = (split(/\s+/, $r))[9]; 97 | my $exonID1 = (split(/\s+/, $r))[4]; 98 | my $exonType1 = (split(/\s+/, $r))[5]; 99 | my $exonID2 = (split(/\s+/, $r))[10]; 100 | my $exonType2 = (split(/\s+/, $r))[11]; 101 | my $overlapScoreFromFullLength = (split(/\s+/, $r))[12]; 102 | my $overlapScoreFromPartialCodingPart = (split(/\s+/, $r))[13]; 103 | my $exonName1 = $geneid{'h'}{'e'}{$exonID1}; 104 | my $exonName2 = $geneid{'m'}{'e'}{$exonID2}; 105 | print out_epair ("$chrName1\t$startCoord1\t$endCoord1\t$strand1\t$chrName2\t$startCoord2\t$endCoord2\t$strand2\t$exonID1\t$exonID2\t$exonName1\t$exonName2\t$exonType1\t$exonType2\t$overlapScoreFromFullLength\t$overlapScoreFromPartialCodingPart\t$line\n"); 106 | } 107 | } 108 | close(extresult_exons_in); 109 | } 110 | close(out_tpair); 111 | close(out_epair); 112 | close(ogene_in); 113 | } 114 | 115 | if ($#ARGV == -1 || $ARGV[0] eq "help" || $#ARGV < 5) { 116 | print ("Type ./extMsummarise <preprocess_folder> <extramapper_folder> <orthologous_genepair_list> <org1name> <org2name> <outputprefix>\n"); 117 | print ("preprocess_folder : Path to the preprocess folder generated by the extMpreprocess script\n"); 118 | print ("extramapper_folder : Path to the output folder generated by the ExTraMapper program\n"); 119 | print ("orthologous_genepair_list : A list of orthologous gene-pairs\n"); 120 | print ("org1name : org1 name e.g. human\n"); 121 | print ("org2name : org2 name e.g. mouse\n"); 122 | print ("outputprefix : output file prefix\n\n"); 123 | exit; 124 | } 125 | else { 126 | my ($preprocess_folder, $extmapper_result, $pair_list, $org1, $org2, $output) = @ARGV; 127 | chomp ($preprocess_folder, $extmapper_result, $pair_list, $org1, $org2, $output); 128 | 129 | my $org1_transcript_gtf = "$preprocess_folder/data/$org1-$org2/GTFsummaries/org1-allTranscripts-GTFparsed.txt"; 130 | my $org2_transcript_gtf = "$preprocess_folder/data/$org1-$org2/GTFsummaries/org2-allTranscripts-GTFparsed.txt"; 131 | my $ogene = $pair_list; 132 | my $etmfl = "$extmapper_result/$org1-$org2"; 133 | generatefiles($org1_transcript_gtf,$org2_transcript_gtf,$ogene,$etmfl,"0.8",$output); 134 | } 135 | --------------------------------------------------------------------------------
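For reference, with the mapping threshold of 0.8 that the wrapper passes to `generatefiles`, a run like the example above would leave two tab-separated summary files named after the output prefix; a quick way to inspect them (sketch):

```bash
# Peek at the headers of the two concatenated summary files written by extMsummarise
head -2 extramapper-result.transcriptLevelMappings-0.8.txt
head -2 extramapper-result.exonLevelMappings-0.8.txt
```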