#!/usr/bin/perl # GB + CG july 2011 # RA Dec 2011 use warnings; use strict; use File::Basename; #use bignum; unless (@ARGV == 2) {die;} my $list = $ARGV[0]; my $out = $ARGV[1]; my $max_error_rate = 0.05; my $print_counts = "T"; # set to true if you want all the counts as well, set to nothing to not print out counts. Will print out, to a seperate file, the number of As Ts Cs Gs and their quality score for each site, all of which are seperated by a semicolon. my $skip_lrt = 1; # set to false if you want to perform the LRT (likelyhood ratio test) my $min_coverage = 4; # min read depth at a site, less than this will be NN, does not count N's towards depth if($skip_lrt) {$min_coverage=5;} # only positions with at least this much coverage will be scored if $skip_lrt my %base_calls; my %all_counts; my @samples; # list of samples my @bases = ("A","T","C","G"); sub genotype { # this subroutine is passed an array of read counts for each base, # plus the mean error rate at the site as the last element my @counts = @_; my %n; # $n{a} = sprintf("%09d", $counts[0]); # $n{t} = sprintf("%09d", $counts[1]); ### hash of 'BASE' => 'N_READS' $n{c} = sprintf("%09d", $counts[2]); # $n{g} = sprintf("%09d", $counts[3]); # my $E = $counts[4]; # the mean error rate my $nS = $n{a} + $n{t} + $n{g} + $n{c}; # total reads # sort the hash decreasing by values, return the keys my @sort_bases = reverse sort { $n{$a} cmp $n{$b} } keys %n; my $n1base = $sort_bases[0]; # my $n2base = $sort_bases[1]; ### the bases, so that we can call the genotype later my $n3base = $sort_bases[2]; # my $n4base = $sort_bases[3]; # my $n1 = int($n{$n1base}); ### n1 == number of reads for most frequent base my $n2 = int($n{$n2base}); ### n2 == number of reads for the second most frequent base my $n3 = int($n{$n3base}); # my $n4 = int($n{$n4base}); # my $L11; my $L12; my $sampling; if ($nS < 150){ # running factorials on numbers higher than 150 are slow and will crash without use::bignum # the factorials are expensive, so we consolidate them as one variable $sampling = fact($nS) / ( fact($n1) * fact($n2) * fact($n3) * fact($n4) ); # likelihood of a homozygote: eq (1a) $L11 = $sampling * ( ( 1 - ((3*$E)/4) )**$n1 ) * ( ($E/4)**($n2 + $n3 + $n4) ); # likelihood of a heterozygote: eq (1b) $L12 = $sampling * ( ( 0.5 - ($E/4) )**($n1 + $n2) ) * ( ($E/4)**($n3 + $n4) ); # the test statistic is the likelihood ratio of our two most likely hypotheses } else { if (($n2/$nS)>0.1){ #If more than 10% of the total reads are the second base, set the likelihood of heterozygote as higher. $L11 = 0; $L12 = 1; } else { # otherwise set the likelihood of the homozygote as higher $L11 = 1; $L12 = 0; } } my $D = 1; if($skip_lrt){ $D = 5; } else{ $D = abs(2 * log($L12/$L11) ); } # for a = 0.05, this number comes from a chi-squared distribution with 1 degree of freedom # print "D = $D\n"; my $sig = '3.841'; # no call is the default my $call = 'NN'; # if the test statistic is significant if($D > $sig && $nS >= $min_coverage){ # # if a heterozygote was more likely, then that was our alternative hypothesis if($L12>$L11){ $call = $n1base.$n2base; } # else, our hypothesis was homozygosity else{ $call = $n1base.$n1base; } } if($D > 3.814){ $call = uc($call); } else{ $call = lc($call); } return $call; } # factorial subroutine sub fact { no warnings 'recursion'; # THIS COULD BE DANGEROUS!!! my $n = shift; $n == 0 ? 1 : $n*fact($n-1); } open LIST, $list; # for thru the list of files, open each and add name to a array to print while (){ chomp; my $sample_file=$_; my $sample=basename($sample_file,".MQgt30.sam"); #print "On $sample...\n"; #print "File is $sample_file\n"; push (@samples,$sample); my %base_count; my %qual_count; open SAM, $sample_file; while (){ my @read; my @quals; my $cgr; my @cigar; my @a = split(/\s/, $_); my $go_on; my $done; my $indel; my $use_ins; unless ((/^@/)or($a[2]eq '*')or($a[4]eq 0)or($a[5]eq'*')){ $cgr = $a[5]; if ($cgr !~ m/(N|H|P|=|X)/){ $go_on="T"; } } if($go_on){ @cigar = split (/M|I|D|S/,$cgr); unless ($#cigar>2){ $cgr =~ s/\d//g; if ($cgr eq "MIM"){ $indel="I"; }elsif($cgr eq "MDM"){ $indel="D"; }elsif($cgr eq "SM"){ $indel="SS"; }elsif($cgr eq "MS"){ $indel="SE"; }elsif($cgr eq "M"){ $indel="no"; } } } if($indel){ if ($indel eq "SS"){ # skip the first bit of the read $a[9] =substr($a[9],$cigar[0]); $a[10] = substr($a[10],$cigar[0]); } @read = split ('',$a[9]); @quals = split ('',$a[10]); for (my $i = 0; $i<=$#read; $i++) { my $p = 10**(-((ord($quals[$i]))-33)/10); my $bp; #for indels if ($i == $cigar[0]){ if ($indel eq "D"){ #skip ahead on the reference $a[3]= $a[3]+$cigar[1]; }elsif ($indel eq "I"){ #skip ahead on the read $use_ins = "T"; }elsif ($indel eq "SE"){ $done = "T"; } } $bp = $a[3]+$i; unless (($p>$max_error_rate)||($read[$i]eq"N")or ($done)){ # add the base to the count if ($use_ins){ if ($read[($i+$cigar[1])]){ $base_count{"$a[2]\t".$bp}{$read[($i+$cigar[1])]}++; #needs to count the quality here if ($qual_count{"$a[2]\t".$bp}){ $qual_count{"$a[2]\t".$bp} += $p; }else{ $qual_count{"$a[2]\t".$bp} = $p; } } }else{ $base_count{"$a[2]\t".$bp}{$read[$i]}++; if ($qual_count{"$a[2]\t".$bp}){ $qual_count{"$a[2]\t".$bp} += $p; }else{ $qual_count{"$a[2]\t".$bp} = $p; } } } } } } close SAM; # now the sam is done and you have a hash with counts and quality if ($print_counts){ open OUT, ">$out"."$sample.MLcounts"; foreach my $loc (sort keys %base_count){ print OUT "$loc\t"; foreach my $base (@bases){ if ($base_count{$loc}{$base}) { print OUT "$base_count{$loc}{$base};"; }else{ print OUT "0;"; } } print OUT "$qual_count{$loc}\n"; } close OUT; } #otherwise loop thru the count hash and call the bases open OUT, ">$out"."$sample.MLcalls"; foreach my $loc (sort keys %base_count){ my @counts; if ($base_count{$loc}{"A"}){ $counts[0] = $base_count{$loc}{"A"}; }else{ $counts[0] = '0'; } if ($base_count{$loc}{"T"}){ $counts[1] = $base_count{$loc}{"T"}; }else{ $counts[1] = '0'; } if ($base_count{$loc}{"C"}){ $counts[2] = $base_count{$loc}{"C"}; }else{ $counts[2] = '0'; } if ($base_count{$loc}{"G"}){ $counts[3] = $base_count{$loc}{"G"}; }else{ $counts[3] = '0'; } #print "$loc\t$counts[0]\t$counts[1]\t$counts[2]\t$counts[3]\n"; my $sum_counts = ($counts[0]+$counts[1]+$counts[2]+$counts[3]); my $call; if($sum_counts<$min_coverage){ $call = 'NN'; } else{ $counts[4] = $qual_count{$loc}/$sum_counts; $call = genotype(@counts); } print OUT "$loc\t$call\n"; } %qual_count = (); %base_count = (); } close LIST;