├── CreateChrBedFromVCF.pl ├── CreateChrBedFromVCF2.pl ├── MS2LDhat.pl ├── MS2PHASE.pl ├── README.md ├── fastPHASE2LDhat.pl ├── fastPHASE2VCF.pl ├── license.txt ├── thinVCF.pl ├── vcf2MS.pl ├── vcf2PHASE.pl ├── vcf2fastPHASE.pl ├── vcf2fastPHASE_4males.pl └── vcf_merge.pl /CreateChrBedFromVCF.pl: --------------------------------------------------------------------------------
#! /usr/bin/perl

#Created: December 6, 2011
#Last Modified: January 7, 2013

#program reads in a vcf file and extracts coordinates
#in bed format
#of a specific number of SNPs in a window
#
#Usage: CreateChrBedFromVCF.pl <input VCF> <synteny block number>
#
#Only lines that start with "chr" and whose 7th column matches PASS are
#counted as SNPs. For each chromosome two files receive the same rows
#("chr<TAB>start<TAB>end"):
#   <chr>-input.BED                          (appended to)
#   <chr>.synteny_block.<block>.blocks.txt   (overwritten)
#With the counts below every window spans 4000 SNPs and consecutive windows
#share 200 SNPs; the last window of a chromosome ends at its last SNP.

use strict;
use warnings;

#SNP counts that drive the sliding window. The start of the next window is
#remembered when the counter reaches a "hold" value (the counter then restarts
#at 0) and the current window is closed $WINDOW_END_COUNT SNPs later.
my $FIRST_HOLD_COUNT = 3801;    #hold point inside the first window of a chromosome
my $LATER_HOLD_COUNT = 3800;    #hold point inside every following window
my $WINDOW_END_COUNT = 199;     #SNPs counted after a hold before the window ends

unless (@ARGV == 2) {
    die "Error: Please provide filename of input VCF file and synteny block number on command line!\n\n";
} #end unless

my ($vcf, $block) = @ARGV;

my ($bed_fh, $blocks_fh);       #output handles of the current chromosome

#opens the two output files for one chromosome
sub open_outputs {
    my ($chr) = @_;
    my $bed_name    = "$chr-input.BED";
    my $blocks_name = "$chr.synteny_block.$block.blocks.txt";
    open(my $bed, '>>', $bed_name)
        or die "Error: cannot open '$bed_name' for appending: $!\n";
    open(my $blocks, '>', $blocks_name)
        or die "Error: cannot open '$blocks_name' for writing: $!\n";
    return ($bed, $blocks);
} #end open_outputs

#writes the same text to both output files
sub print_both {
    my ($text) = @_;
    print {$bed_fh} $text;
    print {$blocks_fh} $text;
    return;
} #end print_both

#closes both output files; buffered write errors surface here
sub close_outputs {
    close($bed_fh)    or die "Error: cannot close BED output: $!\n";
    close($blocks_fh) or die "Error: cannot close blocks output: $!\n";
    return;
} #end close_outputs

open(my $vcf_fh, '<', $vcf) or die "Error: cannot open VCF file '$vcf': $!\n";

my $SNP_counter          = 0;
my $chromosome;                 #chromosome of the previous counted SNP
my $last_position;              #position of the previous counted SNP
my $first_SNP            = 1;   #the next counted SNP starts a chromosome
my $first_interval       = 0;   #still inside the first window of the chromosome
my $after_first_interval = 0;   #the first window of the chromosome has been closed
my $made_past_hold       = 0;   #a next-window start is held but not yet written
my ($chr_next, $start_next);    #held start of the next window

while (my $vcf_line = <$vcf_fh>) {
    chomp $vcf_line;
    next if $vcf_line =~ /^\#/;    #do not read in any header lines

    my @variant = split(/\s+/, $vcf_line);
    my ($chr, $pos, $filter) = map { defined $_ ? $_ : '' } @variant[0, 1, 6];

    unless ($vcf_line =~ /^chr/ && $filter =~ /PASS/) {
        print STDERR "Non-pass sites in VCF at $chr $pos bc site is $filter!\n";
        next;
    } #end unless
    $SNP_counter++;

    if (!defined $chromosome) {    #first counted SNP opens first output
        ($bed_fh, $blocks_fh) = open_outputs($chr);
    } elsif ($chromosome ne $chr) {    #when new chromosome, open new output
        #compared with "ne": a pattern match would treat chr1 as the same
        #chromosome when the previous one was chr10
        print STDERR "finished $chromosome, now starting $chr...\n";
        print_both("$last_position\n");
        close_outputs();
        ($bed_fh, $blocks_fh) = open_outputs($chr);
        $first_SNP   = 1;
        $SNP_counter = 1;
        #a start held on the previous chromosome must not be written into
        #the files of the new one
        $made_past_hold = 0;
    } #end elsif

    if ($SNP_counter == 1 && $first_SNP == 1) {    #prints first line of BED file
        print_both("$chr\t$pos\t");
        $first_SNP            = 0;
        $first_interval       = 1;
        $after_first_interval = 0;
    } elsif ($SNP_counter == $WINDOW_END_COUNT && $made_past_hold == 1) {    #prints end position and next start position
        print_both("$pos\n$chr_next\t$start_next\t");
        $made_past_hold       = 0;
        $after_first_interval = 1;
    } elsif ($SNP_counter == $LATER_HOLD_COUNT && $after_first_interval == 1 && $made_past_hold == 0) {    #holds next beginning
        ($chr_next, $start_next) = ($chr, $pos);
        $SNP_counter    = 0;
        $made_past_hold = 1;
    } elsif ($SNP_counter == $FIRST_HOLD_COUNT && $first_interval == 1) {    #holds next beginning
        ($chr_next, $start_next) = ($chr, $pos);
        $SNP_counter    = 0;
        $made_past_hold = 1;
        $first_interval = 0;
    } #end if

    $chromosome    = $chr;
    $last_position = $pos;
} #end while

close($vcf_fh);

#close the last window; nothing was opened when no SNP was counted
if (defined $bed_fh) {
    print_both("$last_position\n");
    close_outputs();
} #end if
-------------------------------------------------------------------------------- /CreateChrBedFromVCF2.pl: --------------------------------------------------------------------------------
#! /usr/bin/perl

#Created: December 6, 2011
#Last Modified: April 11, 2014

#program reads in a vcf file and extracts coordinates
#in bed format
#of a specific number of SNPs in a window
#
#Usage: CreateChrBedFromVCF2.pl <input VCF> <synteny block number>
#
#Same windowing as CreateChrBedFromVCF.pl but with windows of 400 SNPs that
#share 100 SNPs, and with the synteny block number as a 4th column
#("chr<TAB>start<TAB>end<TAB>block"). Output files per chromosome:
#   <chr>-input.BED                          (appended to)
#   <chr>.synteny_block.<block>.blocks.txt   (overwritten)

use strict;
use warnings;

#see CreateChrBedFromVCF.pl for the meaning of the three counts
my $FIRST_HOLD_COUNT = 301;
my $LATER_HOLD_COUNT = 300;
my $WINDOW_END_COUNT = 99;

unless (@ARGV == 2) {
    die "Error: Please provide filename of input VCF file and synteny block number on command line!\n\n";
} #end unless

my ($vcf, $block) = @ARGV;

my ($bed_fh, $blocks_fh);       #output handles of the current chromosome

#opens the two output files for one chromosome
sub open_outputs {
    my ($chr) = @_;
    my $bed_name    = "$chr-input.BED";
    my $blocks_name = "$chr.synteny_block.$block.blocks.txt";
    open(my $bed, '>>', $bed_name)
        or die "Error: cannot open '$bed_name' for appending: $!\n";
    open(my $blocks, '>', $blocks_name)
        or die "Error: cannot open '$blocks_name' for writing: $!\n";
    return ($bed, $blocks);
} #end open_outputs

#writes the same text to both output files
sub print_both {
    my ($text) = @_;
    print {$bed_fh} $text;
    print {$blocks_fh} $text;
    return;
} #end print_both

#closes both output files; buffered write errors surface here
sub close_outputs {
    close($bed_fh)    or die "Error: cannot close BED output: $!\n";
    close($blocks_fh) or die "Error: cannot close blocks output: $!\n";
    return;
} #end close_outputs

open(my $vcf_fh, '<', $vcf) or die "Error: cannot open VCF file '$vcf': $!\n";

my $SNP_counter          = 0;
my $chromosome;                 #chromosome of the previous counted SNP
my $last_position;              #position of the previous counted SNP
my $first_SNP            = 1;   #the next counted SNP starts a chromosome
my $first_interval       = 0;   #still inside the first window of the chromosome
my $after_first_interval = 0;   #the first window of the chromosome has been closed
my $made_past_hold       = 0;   #a next-window start is held but not yet written
my ($chr_next, $start_next);    #held start of the next window

while (my $vcf_line = <$vcf_fh>) {
    chomp $vcf_line;
    next if $vcf_line =~ /^\#/;    #do not read in any header lines

    my @variant = split(/\s+/, $vcf_line);
    my ($chr, $pos, $filter) = map { defined $_ ? $_ : '' } @variant[0, 1, 6];

    unless ($vcf_line =~ /^chr/ && $filter =~ /PASS/) {
        print STDERR "Non-pass sites in VCF at $chr $pos bc site is $filter!\n";
        next;
    } #end unless
    $SNP_counter++;

    if (!defined $chromosome) {    #first counted SNP opens first output
        ($bed_fh, $blocks_fh) = open_outputs($chr);
    } elsif ($chromosome ne $chr) {    #when new chromosome, open new output
        print STDERR "finished $chromosome, now starting $chr...\n";
        #the block column is written here too so that every row of the
        #output has the same four columns
        print_both("$last_position\t$block\n");
        close_outputs();
        ($bed_fh, $blocks_fh) = open_outputs($chr);
        $first_SNP   = 1;
        $SNP_counter = 1;
        #a start held on the previous chromosome must not be written into
        #the files of the new one
        $made_past_hold = 0;
    } #end elsif

    if ($SNP_counter == 1 && $first_SNP == 1) {    #prints first line of BED file
        print_both("$chr\t$pos\t");
        $first_SNP            = 0;
        $first_interval       = 1;
        $after_first_interval = 0;
    } elsif ($SNP_counter == $WINDOW_END_COUNT && $made_past_hold == 1) {    #prints end position and next start position
        print_both("$pos\t$block\n$chr_next\t$start_next\t");
        $made_past_hold       = 0;
        $after_first_interval = 1;
    } elsif ($SNP_counter == $LATER_HOLD_COUNT && $after_first_interval == 1 && $made_past_hold == 0) {    #holds next beginning
        ($chr_next, $start_next) = ($chr, $pos);
        $SNP_counter    = 0;
        $made_past_hold = 1;
    } elsif ($SNP_counter == $FIRST_HOLD_COUNT && $first_interval == 1) {    #holds next beginning
        ($chr_next, $start_next) = ($chr, $pos);
        $SNP_counter    = 0;
        $made_past_hold = 1;
        $first_interval = 0;
    } #end if

    $chromosome    = $chr;
    $last_position = $pos;
} #end while

close($vcf_fh);

#close the last window; nothing was opened when no SNP was counted
if (defined $bed_fh) {
    print_both("$last_position\t$block\n");
    close_outputs();
} #end if
-------------------------------------------------------------------------------- /MS2LDhat.pl: --------------------------------------------------------------------------------
#! /usr/bin/perl

#Program reads in MS formatted input (*.hud) and converts to LDhat input format
#
#Usage: MS2LDhat.pl <input> <output prefix> <number of individuals>
#
#The input is read as: line 1 = number of sites, line 2 = whitespace
#separated positions, then one haplotype per line. Two haplotypes per
#individual are copied. Writes <output prefix>.ldhat.sites and
#<output prefix>.ldhat.locs.

use strict;
use warnings;

unless (@ARGV == 3) {
    die "Please provide input and output filenames and number of individuals on command line.\n\n";
} #end unless

my ($input, $output, $num_ind) = @ARGV;
unless ($num_ind =~ /^\d+$/) {
    die "Error: number of individuals must be a whole number, got '$num_ind'.\n";
} #end unless
my $num_haps = $num_ind * 2;

open(my $in_fh, '<', $input)
    or die "Error: cannot open input file '$input': $!\n";
open(my $sites_fh, '>', "$output.ldhat.sites")
    or die "Error: cannot open '$output.ldhat.sites' for writing: $!\n";
open(my $locs_fh, '>', "$output.ldhat.locs")
    or die "Error: cannot open '$output.ldhat.locs' for writing: $!\n";

my $num_sites      = <$in_fh>;
my $positions_line = <$in_fh>;
unless (defined $num_sites && defined $positions_line) {
    die "Error: '$input' must start with a site count line and a positions line.\n";
} #end unless
chomp($num_sites, $positions_line);

my @positions = split(' ', $positions_line);
unless (@positions) {
    die "Error: no positions found on line 2 of '$input'.\n";
} #end unless

#NOTE(review): the region length in the header is printed as read while the
#positions below are divided by 1000 - confirm the header should not be
#scaled as well (fastPHASE2LDhat.pl divides its header value by 1000).
print {$locs_fh} "$num_sites\t$positions[-1]\tL\n";

for my $position (@positions) {
    printf {$locs_fh} "%0.3f\n", $position / 1000;
} #end for

print {$sites_fh} "$num_haps\t$num_sites\t1\n";

my $hap_number = 1;

while (my $haplotype = <$in_fh>) {
    last if $hap_number > $num_haps;
    chomp $haplotype;
    print {$sites_fh} ">haplotype $hap_number\n$haplotype\n";
    $hap_number++;
} #end while

if ($hap_number - 1 < $num_haps) {
    my $found = $hap_number - 1;
    print STDERR "Warning: expected $num_haps haplotypes in '$input' but found only $found.\n";
} #end if

close($in_fh);
close($sites_fh) or die "Error: cannot close '$output.ldhat.sites': $!\n";
close($locs_fh)  or die "Error: cannot close '$output.ldhat.locs': $!\n";
-------------------------------------------------------------------------------- /MS2PHASE.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl

#Program reads in MS formatted input (*.hud) and converts to PHASE input format
#
#Usage: MS2PHASE.pl <input> <output> <number of individuals>
#
#The input is read as: line 1 = number of loci, line 2 = whitespace separated
#positions, then one haplotype per line. The output starts with the number
#of individuals and the number of loci, followed by a "P" line with every
#position multiplied by 1000000, a line with one "S" per locus, and the
#haplotypes. The first haplotype of each pair is preceded by a "#<n>" label
#(n = 2, 4, 6, ...).

use strict;
use warnings;

unless (@ARGV == 3) {
    die "Please provide input and output filenames and number of individuals on command line.\n\n";
} #end unless

my ($input, $output, $num_ind) = @ARGV;
unless ($num_ind =~ /^\d+$/) {
    die "Error: number of individuals must be a whole number, got '$num_ind'.\n";
} #end unless
my $num_haps = $num_ind * 2;

open(my $in_fh, '<', $input)
    or die "Error: cannot open input file '$input': $!\n";
open(my $out_fh, '>', $output)
    or die "Error: cannot open output file '$output': $!\n";

my $num_loci       = <$in_fh>;
my $positions_line = <$in_fh>;
unless (defined $num_loci && defined $positions_line) {
    die "Error: '$input' must start with a loci count line and a positions line.\n";
} #end unless
chomp($num_loci, $positions_line);

print {$out_fh} "$num_ind\n";
print {$out_fh} "$num_loci\n";    #first line of input (number of loci)

#next two lines of output: positions, then all loci specified as SNPs
my @positions = split(' ', $positions_line);
print {$out_fh} "P";
for my $position (@positions) {
    my $locus = $position * 1000000;
    print {$out_fh} " $locus";
} #end for
print {$out_fh} "\n";
print {$out_fh} "S" x scalar(@positions), "\n";

my $hap_count = 0;

while (my $haplotype = <$in_fh>) {
    last if $hap_count >= $num_haps;
    chomp $haplotype;

    if ($hap_count % 2 == 0) {    #first haplotype of a pair needs a name line
        my $label = $hap_count + 2;
        print {$out_fh} "#$label\n";
    } #end if

    print {$out_fh} "$haplotype\n";
    $hap_count++;
} #end while

if ($hap_count < $num_haps) {
    print STDERR "Warning: expected $num_haps haplotypes in '$input' but found only $hap_count.\n";
} #end if

close($in_fh);
close($out_fh) or die "Error: cannot close output file '$output': $!\n";
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | vcf-conversion-tools 2 | ==================== 3 | 4 | Tools to (1) convert to and from vcf format, (2) convert between LDhat, fastPHASE, and PHASE formats, and (3) convert VCF to BED format and get overlapping bins of 4000 or 400 SNPs.
5 | 6 | (1) Files in this folder will help convert between vcf and other formats: 7 | 'fastPHASE2VCF.pl' #converts fastPHASE output to VCF 8 | 'thinVCF.pl' #thins VCF files (slightly different algorithm from vcftools, removes all but one site if sites are close to each other) 9 | 'vcf_merge.pl' #merges multiple VCF files into a single file 10 | 'vcf2fastPHASE.pl' #converts a VCF file to fastPHASE input for autosomes and females 11 | 'vcf2fastPHASE_4males.pl' #converts VCF file to fastPHASE input for males on the X 12 | 'vcf2MS.pl' #converts VCF to MS format 13 | 14 | 15 | (2) A few other programs are included as they can be used with the others to convert to and from LDhat formats: 16 | 'fastPHASE2LDhat.pl' #converts fastPHASE output to LDhat input 17 | 'MS2LDhat.pl' #converts MS input to LDhat input 18 | 'MS2PHASE.pl' #converts MS input to PHASE input 19 | 20 | (3) The following programs are useful for creating bed files of a subset of overlapping SNPs from a VCF file: 21 | 'CreateChrBedFromVCF.pl' #creates BED file with bins of 4k SNPs, 100 overlapping, useful for running LDhat 22 | 'CreateChrBedFromVCF2.pl' #creates BED file with bins of 400 SNPs, 100 overlapping, useful for running PHASE 23 | 24 | For citing purposes: http://dx.doi.org/10.5281/zenodo.10288 25 | -------------------------------------------------------------------------------- /fastPHASE2LDhat.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl

#program converts fastPHASE output to LDhat sites input
#locs input file should be generated with 'vcf2fastPHASE.pl' program
#August 13th, 2012
#Modified September 4, 2012
#
#Usage: fastPHASE2LDhat.pl <fastPHASE output> <fastPHASE input> <positions file>
#
#The positions file holds one block per line; its first two whitespace
#separated columns are read as the start and end position of the block.
#The fastPHASE input filename is split on "."; piece 0 (called <chr> below)
#and piece 2 (<block>) select the site list
#   <chr>.synteny_block.<block>.positions.out
#whose first line is skipped (vcf2fastPHASE.pl writes a "CHR<TAB>POS" header
#to its positions output). For every block the program writes
#   LDhat-inputs/<chr>/<chr>.<start>.<end>.ldhat.locs
#   LDhat-inputs/<chr>/<chr>.<start>.<end>.ldhat.sites
#and appends "<chr><TAB><start><TAB><end>" to <chr>-input.BED.
#The LDhat-inputs/<chr> directory must already exist.

use strict;
use warnings;

unless (@ARGV == 3) {
    die "Please provide name of fastPHASE output filename, fastPHASE input filename, and the positions filename on command line\n\n";
} #end unless

my ($fP_output, $fP_input, $positions) = @ARGV;

my @fP_pieces1 = split(/\./, $fP_input);
unless (@fP_pieces1 >= 3) {
    die "Error: fastPHASE input filename '$fP_input' needs at least three '.' separated pieces.\n";
} #end unless
my ($chr, $block) = @fP_pieces1[0, 2];

print STDERR "Reading in positions file...";
open(my $pos_fh, '<', $positions)
    or die "\nError: cannot open positions file '$positions': $!\n";
my @positions_start = ();
my @positions_end   = ();

while (my $line = <$pos_fh>) {
    chomp $line;
    my @input = split(/\s+/, $line);
    next unless defined $input[1];    #blank or one-column line: no block
    push(@positions_start, $input[0]);
    push(@positions_end, $input[1]);
} #end while
close($pos_fh);

print STDERR "done.\nReading in fastPHASE output file...";
open(my $out_fh, '<', $fP_output)
    or die "\nError: cannot open fastPHASE output file '$fP_output': $!\n";

my %haplotypes = ();    #haplotype name => reference to list of alleles
my @hap_names  = ();    #haplotype names in file order
my $in_genotypes = 0;   #inside the BEGIN GENOTYPES / END GENOTYPES section
my $stage = 0;          #0 = individual line, 1 = first haplotype, 2 = second
my ($hap1_name, $hap2_name);

while (my $line = <$out_fh>) {
    chomp $line;
    if ($line =~ /BEGIN GENOTYPES/) {
        $in_genotypes = 1;
        $stage = 0;
        next;
    } #end if
    next unless $in_genotypes;
    last if $line =~ /END GENOTYPES/;

    if ($stage == 0) {
        #the second whitespace separated field names the individual
        my @input_array = split(/\s+/, $line);
        my $name = defined $input_array[1] ? $input_array[1] : '';
        $hap1_name = ">" . $name . "-0";
        $hap2_name = ">" . $name . "-1";
        push(@hap_names, $hap1_name);
        $stage = 1;
    } elsif ($stage == 1) {
        push @{ $haplotypes{$hap1_name} }, split(/\s+/, $line);
        push(@hap_names, $hap2_name);
        $stage = 2;
    } else {
        push @{ $haplotypes{$hap2_name} }, split(/\s+/, $line);
        $stage = 0;
    } #end else
} #end while
close($out_fh);

print STDERR "done.\nWriting LDhat output files...";

my $positions_file = "$chr.synteny_block.$block.positions.out";
open(my $sites_list_fh, '<', $positions_file)
    or die "\nError: cannot open site list '$positions_file': $!\n";
my $header = <$sites_list_fh>;    #header line, not a site
my @sites = ();

while (my $line = <$sites_list_fh>) {
    chomp $line;
    push(@sites, $line);
} #end while
close($sites_list_fh);

open(my $fp_in_fh, '<', $fP_input)
    or die "\nError: cannot open fastPHASE input file '$fP_input': $!\n";
my $num_indv = <$fp_in_fh>;
unless (defined $num_indv) {
    die "\nError: fastPHASE input file '$fP_input' is empty.\n";
} #end unless
chomp($num_indv);
close($fp_in_fh);
my $num_hap = $num_indv * 2;

my $bed_name = "$chr-input.BED";
open(my $bed_fh, '>>', $bed_name)
    or die "\nError: cannot open '$bed_name' for appending: $!\n";

for my $i (0 .. $#positions_start) {
    my ($start_pos, $end_pos) = ($positions_start[$i], $positions_end[$i]);

    #indices are looked up afresh for every block, and start and end are
    #tested independently so that a one-site block is found as well
    my ($start_k, $end_k);
    for my $k (0 .. $#sites) {
        $start_k = $k if $sites[$k] == $start_pos;
        if ($sites[$k] == $end_pos) {
            $end_k = $k;
            last;
        } #end if
    } #end for

    unless (defined $start_k && defined $end_k) {
        print STDERR "\nWarning: block $start_pos-$end_pos not found in '$positions_file', skipped.\n";
        next;
    } #end unless

    my $prefix = "LDhat-inputs/$chr/$chr.$start_pos.$end_pos.ldhat";
    my $num_sites_i_block = $end_k - $start_k + 1;
    my $end_block = $sites[$end_k] / 1000;

    open(my $locs_fh, '>', "$prefix.locs")
        or die "\nError: cannot open '$prefix.locs' for writing: $!\n";
    print {$locs_fh} "$num_sites_i_block $end_block L\n";
    for my $k ($start_k .. $end_k) {
        printf {$locs_fh} "%0.3f\n", $sites[$k] / 1000;    #positions in kb
    } #end for
    close($locs_fh) or die "\nError: cannot close '$prefix.locs': $!\n";

    open(my $sites_fh, '>', "$prefix.sites")
        or die "\nError: cannot open '$prefix.sites' for writing: $!\n";
    print {$sites_fh} "$num_hap $num_sites_i_block 1";
    for my $current_hap_name (@hap_names) {
        my @current_hap = @{ $haplotypes{$current_hap_name} || [] };
        print {$sites_fh} "\n$current_hap_name\n";
        print {$sites_fh} join('', map { defined $_ ? $_ : '' } @current_hap[$start_k .. $end_k]);
    } #end for
    close($sites_fh) or die "\nError: cannot close '$prefix.sites': $!\n";

    #written directly instead of through a shell "echo ... >>file"
    print {$bed_fh} "$chr\t$sites[$start_k]\t$sites[$end_k]\n";
} #end for

close($bed_fh) or die "\nError: cannot close '$bed_name': $!\n";

print STDERR "done.\n";
-------------------------------------------------------------------------------- /fastPHASE2VCF.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl

#program converts fastPHASE output to VCF file
#program needs original VCF file and input file for fastPHASE
#January 4th, 2013
#Last Modified Jan 8th, 2013
#
#Usage: fastPHASE2VCF.pl <fastPHASE input> <fastPHASE output> <original VCF>
#                        <output VCF> <synteny block number>
#
#VCF sites whose position is not the next position of the fastPHASE input
#are reported on STDERR and left out of the output. For every kept site the
#genotype of each individual is replaced by the phased alleles (0 = REF,
#1 = ALT) and ";synteny_block=<block>" is appended to the INFO column.
#Genotypes that changed zygosity during phasing are appended to
#error_report.<name>.txt, where <name> is the second "/" separated piece of
#the fastPHASE input filename up to its first ".".

use strict;
use warnings;

unless (@ARGV == 5) {
    die "Please provide name of fastPHASE input and output filenames, the original VCF filename and an output VCF filename, and the synteny block number on command line\n\n";
} #end unless

my ($fP_input, $fP_output, $VCF, $VCF_out, $block) = @ARGV;

my @filename = split(/\./, $fP_input);
my @chr = split(/\//, $filename[0]);
my $error_report = "error_report." . (defined $chr[1] ? $chr[1] : '') . ".txt";
my $report_fh;    #opened on the first report so no empty file is created

#appends one line to the error report
sub report {
    my ($text) = @_;
    unless (defined $report_fh) {
        open($report_fh, '>>', $error_report)
            or die "\nError: cannot open '$error_report' for appending: $!\n";
    } #end unless
    print {$report_fh} "$text\n";
    return;
} #end report

#turns a phased allele into its VCF index: "0" for REF, "1" for ALT; any
#other value is returned unchanged. Alleles are compared as whole strings,
#so REF/ALT values such as "." are never treated as patterns.
sub recode_allele {
    my ($allele, $ref, $alt) = @_;
    return ''  unless defined $allele;
    return '0' if $allele eq $ref;
    return '1' if $allele eq $alt;
    return $allele;
} #end recode_allele

print STDERR "Reading in fastPHASE input file...";
open(my $fp_in_fh, '<', $fP_input)
    or die "\nError: cannot open fastPHASE input file '$fP_input': $!\n";

my @original_positions = ();
my $positions_line = 0;

#positions start on the line beginning with "P" and may continue on the
#following lines; the first line containing "#" ends them
while (my $line = <$fp_in_fh>) {
    chomp $line;
    if ($line =~ /^\s*P\s+/) {
        $positions_line = 1;
    } elsif ($line =~ /\#/) {
        last;
    } #end elsif

    if ($positions_line == 1) {
        push(@original_positions, split(' ', $line));
    } #end if
} #end while
close($fp_in_fh);

shift @original_positions;    #drop the leading "P"

my $first_position = @original_positions ? $original_positions[0]  : '';
my $final_position = @original_positions ? $original_positions[-1] : '';
print STDERR "positions: $first_position-$final_position...done.\nReading in fastPHASE output file...";
open(my $fp_out_fh, '<', $fP_output)
    or die "\nError: cannot open fastPHASE output file '$fP_output': $!\n";

my %haplotypes = ();    #haplotype name => reference to list of alleles
my @ind_names  = ();    #individual names in file order
my $in_genotypes = 0;   #inside the BEGIN GENOTYPES / END GENOTYPES section
my $stage = 0;          #0 = individual line, 1 = first haplotype, 2 = second
my ($hap1_name, $hap2_name);

while (my $line = <$fp_out_fh>) {
    chomp $line;
    if ($line =~ /BEGIN GENOTYPES/) {
        $in_genotypes = 1;
        $stage = 0;
        next;
    } #end if
    next unless $in_genotypes;
    last if $line =~ /END GENOTYPES/;

    if ($stage == 0) {
        #the second whitespace separated field names the individual
        my @input_array = split(/\s+/, $line);
        my $name = defined $input_array[1] ? $input_array[1] : '';
        $hap1_name = $name . "_1";
        $hap2_name = $name . "_2";
        push(@ind_names, $name);
        $stage = 1;
    } elsif ($stage == 1) {
        push @{ $haplotypes{$hap1_name} }, split(/\s+/, $line);
        $stage = 2;
    } else {
        push @{ $haplotypes{$hap2_name} }, split(/\s+/, $line);
        $stage = 0;
    } #end else
} #end while
close($fp_out_fh);

unless (@ind_names) {
    die "\nError: no genotypes found in fastPHASE output file '$fP_output'.\n";
} #end unless

#compare number of sites to @original_positions array from fastPHASE input file!
my @output_size = @{ $haplotypes{$hap2_name} || [] };
if ($#output_size != $#original_positions) {
    die "Error: Original number of sites does not match output number of sites in fastPHASE files!\n";
} #end if

my $loop_size = $#ind_names + 9;    #index of the last sample column in the VCF
my $num_indv  = $#ind_names + 1;
my $while_loop_counter = 0;         #index of the next expected fastPHASE site
my @names = ();                     #sample names taken from the VCF header
print STDERR "done.\nReading in original VCF file with $num_indv individuals and writing output VCF file...";
open(my $vcf_out_fh, '>', $VCF_out)
    or die "\nError: cannot open output VCF file '$VCF_out': $!\n";
open(my $vcf_fh, '<', $VCF)
    or die "\nError: cannot open VCF file '$VCF': $!\n";

while (my $line = <$vcf_fh>) {
    chomp $line;
    next if $line =~ /^\s*$/;
    if ($line =~ /^\#\#/) {
        print {$vcf_out_fh} "$line\n";
        next;
    } elsif ($line =~ /^\#/) {
        my @header = split(/\s+/, $line);
        for my $column (9 .. $loop_size) {
            my $vcf_name = defined $header[$column] ? $header[$column] : '';
            push(@names, $vcf_name);
            #the fastPHASE name has to occur inside the VCF name
            if ($vcf_name !~ /\Q$ind_names[$column - 9]\E/) {
                die "error: names in fastPHASE file and VCF don't match\n\n";
            } #end if
        } #end for

        #no newline here: every site line below starts with one
        print {$vcf_out_fh} "$line";
        next;
    } #end elsif

    my @input_line = split(/\s+/, $line);
    my $position = $input_line[1];

    #compare position to @original_positions array from fastPHASE input file!
    my $expected = $while_loop_counter <= $#original_positions
        ? $original_positions[$while_loop_counter]
        : '';
    if ($expected eq '' || $expected != $position) {
        print STDERR "Error: position in VCF file ($position) does not match position in fastPHASE input file ($expected)!\n";
        next;
    } #end if

    my ($ref, $alt) = @input_line[3, 4];
    print {$vcf_out_fh} "\n" . join("\t", @input_line[0 .. 6], "$input_line[7];synteny_block=$block", $input_line[8]);

    for my $column (9 .. $loop_size) {
        my $name = $names[$column - 9];
        my @genotype = split(":", defined $input_line[$column] ? $input_line[$column] : '');
        my $before = defined $genotype[0] ? $genotype[0] : '';

        my $allele1 = recode_allele($haplotypes{$name . "_1"}[$while_loop_counter], $ref, $alt);
        my $allele2 = recode_allele($haplotypes{$name . "_2"}[$while_loop_counter], $ref, $alt);

        #genotype can either be homo-ref, homo-alt, het, or missing data
        my @alleles = split(/\||\//, $before);
        my ($before1, $before2) = map { defined $_ ? $_ : '' } @alleles[0, 1];

        my $problem;
        if ($before1 =~ /0/ && $before2 =~ /0/) {    #original vcf is homozygous reference allele
            $problem = "after phasing is not homo ref"
                if $allele1 ne '0' || $allele2 ne '0';
        } elsif ($before1 =~ /1/ && $before2 =~ /1/) {    #original vcf is homozygous alternate allele
            $problem = "after phasing is not homo alt"
                if $allele1 ne '1' || $allele2 ne '1';
        } elsif (($before1 =~ /0/ && $before2 =~ /1/) || ($before1 =~ /1/ && $before2 =~ /0/)) {    #original vcf is heterozygous
            $problem = "after phasing is homozygous"
                if ($allele1 eq '1' && $allele2 eq '1') || ($allele1 eq '0' && $allele2 eq '0');
        } #end elsif

        if (defined $problem) {
            report("Potential genotyping error found at position $input_line[1] for $name; $problem");
            report("Before phasing: $before; allele 1: $before1; allele 2: $before2; After phasing: Allele 1: $allele1; Allele 2: $allele2");
        } #end if

        #keep the remaining FORMAT values; no trailing ":" when GT is the only one
        shift @genotype;
        print {$vcf_out_fh} "\t" . join(":", "$allele1|$allele2", @genotype);
    } #end for

    $while_loop_counter++;
} #end while

print {$vcf_out_fh} "\n";
close($vcf_fh);
close($vcf_out_fh) or die "\nError: cannot close output VCF file '$VCF_out': $!\n";
if (defined $report_fh) {
    close($report_fh) or die "\nError: cannot close '$error_report': $!\n";
} #end if

print STDERR "done.\n";
-------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Laurie Stevison 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /thinVCF.pl: --------------------------------------------------------------------------------
#! /usr/bin/perl

#program thins sites in VCF file that are too close together
#August 15th, 2012
#
#Usage: thinVCF.pl <input VCF> <output VCF>
#
#Header lines (starting with "#") are copied. A site is written only when
#it is far enough from both the site before it and the site after it, so
#every member of a run of close sites is dropped. Two sites on the same
#chromosome are too close when (position - previous position + 1) is 15 or
#less; sites on different chromosomes are never too close.

use strict;
use warnings;

my $MAX_CLOSE_SPAN = 15;    #largest (position difference + 1) still counted as too close

unless (@ARGV == 2) {
    die "Please provide name of input and output vcf files on command line\n\n";
} #end unless

my ($vcf, $output) = @ARGV;

open(my $vcf_fh, '<', $vcf)
    or die "Error: cannot open VCF file '$vcf': $!\n";
open(my $out_fh, '>', $output)
    or die "Error: cannot open output file '$output': $!\n";
print STDERR "Reading in VCF file...";

my ($last_site, $last_chr, $last_pos);    #previous site, not yet written
my $last_too_close = 0;                   #previous site is too close to the one before it

while (my $line = <$vcf_fh>) {
    chomp $line;
    if ($line =~ /^\#/) {
        print {$out_fh} "$line\n";
        next;
    } #end if

    my ($current_chr, $current_pos) = (split(/\s+/, $line))[0, 1];
    next unless defined $current_pos;    #blank line: not a site

    if (defined $last_site) {
        my $difference = $current_pos - $last_pos + 1;
        my $too_close = ($current_chr eq $last_chr && $difference <= $MAX_CLOSE_SPAN) ? 1 : 0;

        #the previous site is kept only when it has no close neighbour on
        #either side
        if (!$too_close && !$last_too_close) {
            print {$out_fh} "$last_site\n";
        } #end if
        $last_too_close = $too_close;
    } #end if

    ($last_site, $last_chr, $last_pos) = ($line, $current_chr, $current_pos);
} #end while

#the final site has no successor; this also keeps a file's only site
if (defined $last_site && !$last_too_close) {
    print {$out_fh} "$last_site\n";
} #end if

close($vcf_fh);
close($out_fh) or die "Error: cannot close output file '$output': $!\n";

print STDERR "done.\n";
-------------------------------------------------------------------------------- /vcf2MS.pl: --------------------------------------------------------------------------------
#! /usr/bin/perl

#program converts vcf file to MS input file format
#January 17th, 2013
#
#Usage: vcf2MS.pl <input VCF> <output> <sample size>
#
#The first <sample size> sample columns of the VCF are used. Output: the
#number of sites, one line with all positions (each followed by a space),
#then two lines per sample holding the first and the second allele of its
#genotype at every site, written exactly as coded in the VCF.

use strict;
use warnings;

unless (@ARGV == 3) {
    die "Please provide name of input vcf file, filename for MS formatted output, and sample size on command line\n\n";
} #end unless

my ($vcf, $output, $sample_size) = @ARGV;
unless ($sample_size =~ /^\d+$/ && $sample_size > 0) {
    die "Error: sample size must be a positive whole number, got '$sample_size'.\n";
} #end unless

open(my $vcf_fh, '<', $vcf) or die "Error: cannot open VCF file '$vcf': $!\n";

my @positions = ();
my @names     = ();
my $loop_size = $sample_size + 8;    #index of the last sample column that is used
my %genotypes = ();                  #"<sample>_1" / "<sample>_2" => reference to list of alleles

print STDERR "Reading in VCF file...";

while (my $line = <$vcf_fh>) {
    chomp $line;
    next if $line =~ /^\#\#/;

    my @input_line = split(/\s+/, $line);
    if ($line =~ /^\#/) {
        if ($loop_size > $#input_line) {
            die "\nError: sample size $sample_size is larger than the number of samples in '$vcf'.\n";
        } #end if
        @names = @input_line[9 .. $loop_size];
        next;
    } #end if

    next unless @input_line > 1;    #blank line
    unless (@names) {
        die "\nError: no header line with sample names found before the first site in '$vcf'.\n";
    } #end unless

    push(@positions, $input_line[1]);

    for my $column (9 .. $loop_size) {
        my $name = $names[$column - 9];
        my ($genotype) = split(":", defined $input_line[$column] ? $input_line[$column] : '');
        my @haplotypes = split(/[\|\/]/, defined $genotype ? $genotype : '');
        push @{ $genotypes{$name . "_1"} }, defined $haplotypes[0] ? $haplotypes[0] : '';
        push @{ $genotypes{$name . "_2"} }, defined $haplotypes[1] ? $haplotypes[1] : '';
    } #end for
} #end while
close($vcf_fh);

print STDERR "done.\nNow printing output...";

open(my $out_fh, '>', $output)
    or die "\nError: cannot open output file '$output': $!\n";
my $number_loci = scalar @positions;

print {$out_fh} "$number_loci\n";
print {$out_fh} join('', map { "$_ " } @positions);

for my $name (@names) {
    print {$out_fh} "\n";
    print {$out_fh} join('', @{ $genotypes{$name . "_1"} || [] });
    print {$out_fh} "\n";
    print {$out_fh} join('', @{ $genotypes{$name . "_2"} || [] });
} #end for

print {$out_fh} "\n";
close($out_fh) or die "\nError: cannot close output file '$output': $!\n";

print STDERR "done.\n";
-------------------------------------------------------------------------------- /vcf2PHASE.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl

#program converts vcf file to PHASE input format
#Last Modified: April 11, 2014
#
#Usage: vcf2PHASE.pl <input VCF> <output> <sample size>
#
#The first <sample size> sample columns of the VCF are used. Output: the
#sample size, the number of sites, a "P" line with all positions, a line
#with one "S" per site, and for every sample a "#<name>" line followed by
#two allele lines. Allele 0 is written as the REF value, allele 1 as the
#ALT value and "." as "?".

use strict;
use warnings;

unless (@ARGV == 3) {
    die "Please provide name of input vcf file, filename for PHASE formatted output, and sample size on command line\n\n";
} #end unless

my ($vcf, $output, $sample_size) = @ARGV;
unless ($sample_size =~ /^\d+$/ && $sample_size > 0) {
    die "Error: sample size must be a positive whole number, got '$sample_size'.\n";
} #end unless

#turns one allele of a GT value into the text written to the PHASE file.
#Only a whole allele of "0" or "1" is replaced, so other allele indices
#are passed through unchanged.
sub allele_to_base {
    my ($allele, $ref, $alt) = @_;
    return '' unless defined $allele;
    my $base = $allele eq '0' ? $ref
             : $allele eq '1' ? $alt
             :                  $allele;
    $base =~ s/\./\?/g;    #missing data
    return $base;
} #end allele_to_base

open(my $vcf_fh, '<', $vcf) or die "Error: cannot open VCF file '$vcf': $!\n";

my @positions = ();
my @names     = ();
my $loop_size = $sample_size + 8;    #index of the last sample column that is used
my %genotypes = ();                  #"<sample>_1" / "<sample>_2" => reference to list of alleles

print STDERR "Reading in VCF file...";

while (my $line = <$vcf_fh>) {
    chomp $line;
    next if $line =~ /^\#\#/;

    my @input_line = split(/\s+/, $line);
    if ($line =~ /^\#/) {
        if ($loop_size > $#input_line) {
            die "\nError: sample size $sample_size is larger than the number of samples in '$vcf'.\n";
        } #end if
        @names = @input_line[9 .. $loop_size];
        next;
    } #end if

    next unless @input_line > 4;    #blank or truncated line
    unless (@names) {
        die "\nError: no header line with sample names found before the first site in '$vcf'.\n";
    } #end unless

    push(@positions, $input_line[1]);

    my ($ref, $alt) = @input_line[3, 4];

    for my $column (9 .. $loop_size) {
        my $name = $names[$column - 9];
        my ($genotype) = split(":", defined $input_line[$column] ? $input_line[$column] : '');
        my @haplotypes = split(/[\|\/]/, defined $genotype ? $genotype : '');
        push @{ $genotypes{$name . "_1"} }, allele_to_base($haplotypes[0], $ref, $alt);
        push @{ $genotypes{$name . "_2"} }, allele_to_base($haplotypes[1], $ref, $alt);
    } #end for
} #end while
close($vcf_fh);

print STDERR "done.\nNow printing output...";

open(my $out_fh, '>', $output)
    or die "\nError: cannot open output file '$output': $!\n";

my $number_loci = scalar @positions;
print {$out_fh} "$sample_size\n$number_loci\nP ";
print {$out_fh} join('', map { "$_ " } @positions);
print {$out_fh} "\n";
print {$out_fh} "S" x $number_loci;

for my $name (@names) {
    print {$out_fh} "\n\#$name\n";
    print {$out_fh} join('', @{ $genotypes{$name . "_1"} || [] });
    print {$out_fh} "\n";
    print {$out_fh} join('', @{ $genotypes{$name . "_2"} || [] });
} #end for

print {$out_fh} "\n";
close($out_fh) or die "\nError: cannot close output file '$output': $!\n";

print STDERR "done.\n";
-------------------------------------------------------------------------------- /vcf2fastPHASE.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | #program converts vcf file to fastPHASE input format 4 | #September 13th, 2012 5 | 6 | $vcf = $ARGV[0]; 7 | $output = $ARGV[1]; 8 | $output2 = $ARGV[2]; 9 | $sample_size = $ARGV[3]; 10 | 11 | unless ($#ARGV==3) { 12 | print STDERR "Please provide the name of your input vcf file.
Additionally, provide a filename for the two outputs - the fastPHASE formatted genotype output and a positions file. Finally, please include the sample size on command line\n\n"; 13 | die; 14 | } #end unless 15 | 16 | open(VCF, $vcf); 17 | 18 | @positions = (); 19 | @names = (); 20 | $loop_size = $sample_size + 8; 21 | %genotypes = (); 22 | 23 | print STDERR "Reading in VCF file..."; 24 | 25 | while() { 26 | chomp; 27 | if ($_=~/\#\#/) { 28 | next; 29 | } elsif ($_=~/\#/) { 30 | @input_line = split(/\s+/, $_); 31 | for ($a=9; $a<=$loop_size; $a++) { 32 | push(@names, $input_line[$a]); 33 | } #end for 34 | next; 35 | } #end elsif 36 | 37 | @input_line = split(/\s+/, $_); 38 | push(@positions, $input_line[1]); 39 | 40 | $ref = $input_line[3]; 41 | $alt = $input_line[4]; 42 | 43 | for ($i=9; $i<=$loop_size; $i++) { 44 | $o = $i - 9; 45 | $hap1 = $names[$o] . "_1"; 46 | $hap2 = $names[$o] . "_2"; 47 | @genotype = split(":", $input_line[$i]); 48 | # print STDERR "Hap1: $hap1; Hap2: $hap2; $Before: $genotype[0]\t"; 49 | $genotype[0] =~ s/0/$ref/g; 50 | $genotype[0] =~ s/1/$alt/g; 51 | $genotype[0] =~ s/\./\?/g; 52 | # print STDERR "After: $genotype[0]\n"; 53 | @haplotypes = split(/[\|\/]/, $genotype[0]); 54 | push @{$genotypes{$hap1}}, $haplotypes[0]; 55 | push @{$genotypes{$hap2}}, $haplotypes[1]; 56 | } #end for 57 | } #end while 58 | 59 | print STDERR "done.\nNow printing output..."; 60 | 61 | open(OUTPUT, ">$output"); 62 | open(OUTPUT2, ">$output2"); 63 | 64 | $number_loci = $#positions + 1; 65 | 66 | print OUTPUT "$sample_size\n$number_loci\nP "; 67 | print OUTPUT2 "CHR\tPOS\n"; 68 | $positions_line = "P "; 69 | 70 | for ($b=0; $b<=$#positions; $b++) { 71 | $positions_line .= "$positions[$b] "; 72 | $cnt = length($positions_line); 73 | 74 | if ($cnt<500000) { 75 | print OUTPUT "$positions[$b] "; 76 | } elsif ($cnt>=500000) { 77 | print OUTPUT "\n$positions[$b] "; 78 | $positions_line = "$positions[$b] "; 79 | } #end elsif 80 | print OUTPUT2 "$positions[$b]\n"; 81 | 
} #end for 82 | 83 | for ($c=0; $c<=$#names; $c++) { 84 | print OUTPUT "\n\# $names[$c]\n"; 85 | 86 | $hap1 = $names[$c] . "_1"; 87 | @hap1_geno = @{$genotypes{$hap1}}; 88 | $hap1_line = ""; 89 | 90 | for ($d=0; $d<=$#hap1_geno; $d++) { 91 | $hap1_line .= $hap1_geno[$d]; 92 | $cnt2 = length($hap1_line); 93 | if ($cnt2<500000) { 94 | print OUTPUT "$hap1_geno[$d]"; 95 | } elsif ($cnt2>=500000) { 96 | print OUTPUT "\n$hap1_geno[$d]"; 97 | $hap1_line = ""; 98 | } #end elsif 99 | } #end for 100 | print OUTPUT "\n"; 101 | 102 | $hap2 = $names[$c] . "_2"; 103 | @hap2_geno = @{$genotypes{$hap2}}; 104 | $hap2_line = ""; 105 | 106 | for ($e=0; $e<=$#hap2_geno; $e++) { 107 | $hap2_line .= $hap2_geno[$e]; 108 | $cnt3 = length($hap2_line); 109 | if ($cnt3<500000) { 110 | print OUTPUT "$hap2_geno[$e]"; 111 | } elsif ($cnt3>=500000) { 112 | print OUTPUT "\n$hap2_geno[$e]"; 113 | $hap2_line = ""; 114 | } #end elsif 115 | } #end for 116 | } #end for 117 | 118 | print OUTPUT "\n"; 119 | 120 | print STDERR "done.\n"; 121 | -------------------------------------------------------------------------------- /vcf2fastPHASE_4males.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl

# Converts a VCF file to fastPHASE input format, keeping one genotype string
# per sample instead of splitting each call into two haplotypes.
# September 13th, 2012
#
# Usage: vcf2fastPHASE_4males.pl <input.vcf> <genotype output> <positions output> <sample size>
#
# NOTE: the positions filename is required on the command line, but this
# script never opens or writes that file.

use strict;
use warnings;

# Output lines are broken once they reach this many characters.
my $MAX_LINE_LENGTH = 500000;

unless (@ARGV == 4) {
    die "Please provide name of input vcf file, filename for fastPHASE formatted output, filename for positions file, and sample size on command line\n\n";
}

# The third argument (positions filename) is accepted but unused.
my ($vcf, $output, undef, $sample_size) = @ARGV;

# Genotype columns are read from zero-based fields 9 .. sample_size + 8.
my $first_sample_col = 9;
my $last_sample_col  = $sample_size + 8;

my @names;
my %genotypes;    # sample name => list of genotype strings, one per site

open(my $vcf_fh, '<', $vcf) or die "Cannot open input VCF file '$vcf': $!\n";

print STDERR "Reading in VCF file...";

while (my $line = <$vcf_fh>) {
    chomp $line;

    # Meta-information lines carry nothing this conversion needs.
    next if $line =~ /^##/;

    my @fields = split(/\s+/, $line);

    # The remaining header line supplies the sample names.
    if ($line =~ /^#/) {
        push @names, @fields[$first_sample_col .. $last_sample_col];
        next;
    }

    my $ref = $fields[3];
    my $alt = $fields[4];

    for my $col ($first_sample_col .. $last_sample_col) {
        my $name = $names[$col - $first_sample_col];

        # Only the first ":"-separated sub-field (the genotype call) is used.
        my ($call) = split(/:/, defined $fields[$col] ? $fields[$col] : '');
        $call = '' unless defined $call;

        # Translate allele indices into bases and missing calls into "?".
        $call =~ s/0/$ref/g;
        $call =~ s/1/$alt/g;
        $call =~ s/\./?/g;

        push @{ $genotypes{$name} }, $call;
    }
}

close($vcf_fh);

print STDERR "done.\nNow printing output...";

open(my $out_fh, '>', $output) or die "Cannot open output file '$output': $!\n";

# Only the sample size is written as a header.
print {$out_fh} "$sample_size";

for my $name (@names) {
    print {$out_fh} "\n# $name\n";
    print_wrapped($out_fh, $genotypes{$name});
}

print {$out_fh} "\n";
close($out_fh) or die "Cannot close output file '$output': $!\n";

print STDERR "done.\n";

# Prints the genotype strings of one sample to $fh, inserting a newline
# before the entry that brings the running line length up to
# $MAX_LINE_LENGTH. The running length restarts at zero after each break.
# An absent sample or an undefined entry contributes nothing.
sub print_wrapped {
    my ($fh, $alleles) = @_;
    my $length_so_far = 0;

    for my $allele (@{ $alleles || [] }) {
        my $text = defined $allele ? $allele : '';
        $length_so_far += length($text);

        if ($length_so_far < $MAX_LINE_LENGTH) {
            print {$fh} $text;
        }
        else {
            print {$fh} "\n$text";
            $length_so_far = 0;
        }
    }

    return;
}
--------------------------------------------------------------------------------
/vcf_merge.pl:
--------------------------------------------------------------------------------
#! /usr/bin/perl

# Combines two VCF files with different sets of individuals into one file.
# July 15, 2013
# Last Modified July 15, 2013
#
# Usage: vcf_merge.pl <first.vcf> <second.vcf> <output.vcf>
#
# Meta-information lines are copied from the first file only. Each data line
# of the first file is paired with the next line read from the second file,
# and the second file's columns from index 9 onward are appended to it.

use strict;
use warnings;

unless (@ARGV == 3) {
    die "Please provide input names of both VCF files and output filename on command line.\n\n";
}

my ($vcf1, $vcf2, $vcf_out) = @ARGV;

print STDERR "Now reading through two VCF input files and writing merged output...\n";

open(my $out_fh,  '>', $vcf_out) or die "Cannot open output file '$vcf_out': $!\n";
open(my $vcf1_fh, '<', $vcf1)    or die "Cannot open first VCF file '$vcf1': $!\n";
open(my $vcf2_fh, '<', $vcf2)    or die "Cannot open second VCF file '$vcf2': $!\n";

LINE:
while (my $line = <$vcf1_fh>) {
    chomp $line;

    if ($line =~ /^##/) {
        print {$out_fh} "$line\n";
        next LINE;
    }

    if ($line =~ /^#CHROM/) {
        print {$out_fh} $line;

        # Skip ahead in the second file to its own column header line.
        my @vcf2_header;
        while (my $vcf2_header_line = <$vcf2_fh>) {
            chomp $vcf2_header_line;
            if ($vcf2_header_line =~ /#CHROM/) {
                @vcf2_header = split(/\s+/, $vcf2_header_line);
                last;
            }
        }

        # Append the second file's sample names to the header.
        print {$out_fh} "\t$_" for @vcf2_header[9 .. $#vcf2_header];

        next LINE;
    }

    # Only lines mentioning "chr" are treated as data lines.
    next LINE if $line !~ /chr/;

    my @vcf1_fields   = split(/\s+/, $line);
    my $vcf1_position = $vcf1_fields[1];

    my $vcf2_line = <$vcf2_fh>;
    unless (defined $vcf2_line) {
        # Nothing left to pair with; no later line could be merged either.
        print STDERR "Error: second VCF file ended before position $vcf1_position in VCF file 1\n";
        last LINE;
    }
    chomp $vcf2_line;

    my @vcf2_fields   = split(/\s+/, $vcf2_line);
    my $vcf2_position = $vcf2_fields[1];

    # NOTE(review): a mismatch skips this pair of lines only; the files are
    # not re-synchronised by position afterwards.
    if ($vcf2_position != $vcf1_position) {
        print STDERR "Error: position in VCF file 1 ($vcf1_position) does not match position in second VCF file ($vcf2_position)\n";
        next LINE;
    }

    # Each record starts with a newline; the last one is closed after the loop.
    print {$out_fh} "\n$line";
    print {$out_fh} "\t$_" for @vcf2_fields[9 .. $#vcf2_fields];
}

print {$out_fh} "\n";

close($vcf1_fh);
close($vcf2_fh);
close($out_fh) or die "Cannot close output file '$vcf_out': $!\n";

print STDERR "done.\n";
--------------------------------------------------------------------------------