#!/usr/bin/perl
# GB + CG july 2011
# RA Dec 2011

use warnings;
use strict;
use File::Basename;
#use bignum;

unless (@ARGV == 2) {die;}
my $list = $ARGV[0];
my $out = $ARGV[1];

my $max_error_rate = 0.05;
my $print_counts = "T"; # set to true if you want all the counts as well, set to nothing to not print out counts. Will print out, to a seperate file, the number of As Ts Cs Gs and their quality score for each site, all of which are seperated by a semicolon.
my $skip_lrt = 1; # set to false if you want to perform the LRT (likelyhood ratio test)
my $min_coverage = 4; # min read depth at a site, less than this will be NN, does not count N's towards depth
if($skip_lrt) {$min_coverage=5;} # only positions with at least this much coverage will be scored if $skip_lrt
my %base_calls;
my %all_counts;

my @samples; # list of samples
my @bases = ("A","T","C","G");

sub genotype {
	# this subroutine is passed an array of read counts for each base,
	# plus the mean error rate at the site as the last element
	my @counts = @_;
	my %n; #
	$n{a} = sprintf("%09d", $counts[0]); #
	$n{t} = sprintf("%09d", $counts[1]); ### hash of 'BASE' => 'N_READS'
	$n{c} = sprintf("%09d", $counts[2]); #
	$n{g} = sprintf("%09d", $counts[3]); #

	my $E = $counts[4]; # the mean error rate
	my $nS = $n{a} + $n{t} + $n{g} + $n{c}; # total reads
	# sort the hash decreasing by values, return the keys
	my @sort_bases = reverse sort { $n{$a} cmp $n{$b} } keys %n;
	my $n1base = $sort_bases[0]; #
	my $n2base = $sort_bases[1]; ### the bases, so that we can call the genotype later
	my $n3base = $sort_bases[2]; #
	my $n4base = $sort_bases[3]; #

	my $n1 = int($n{$n1base}); ### n1 == number of reads for most frequent base
	my $n2 = int($n{$n2base}); ### n2 == number of reads for the second most frequent base
	my $n3 = int($n{$n3base}); #
	my $n4 = int($n{$n4base}); #
	my $L11;
	my $L12;
	my $sampling;
	if ($nS < 150){
	
	# running factorials on numbers higher than 150 are slow and will crash without use::bignum

		# the factorials are expensive, so we consolidate them as one variable
		$sampling = fact($nS) / ( fact($n1) * fact($n2) * fact($n3) * fact($n4) );
		# likelihood of a homozygote: eq (1a)
		$L11 = $sampling * ( ( 1 - ((3*$E)/4) )**$n1 ) * ( ($E/4)**($n2 + $n3 + $n4) );
		# likelihood of a heterozygote: eq (1b)
		$L12 = $sampling * ( ( 0.5 - ($E/4) )**($n1 + $n2) ) * ( ($E/4)**($n3 + $n4) );
		# the test statistic is the likelihood ratio of our two most likely hypotheses
	}
	else {
		if (($n2/$nS)>0.1){ #If more than 10% of the total reads are the second base, set the likelihood of heterozygote as higher.
			$L11 = 0;
			$L12 = 1; 	
		}
		else { # otherwise set the likelihood of the homozygote as higher
			$L11 = 1;
			$L12 = 0;
		}
	}
	my $D = 1;
	if($skip_lrt){
		$D = 5;
	}
	else{
		$D = abs(2 * log($L12/$L11) );
	}
	# for a = 0.05, this number comes from a chi-squared distribution with 1 degree of freedom
#	print "D = $D\n";
	my $sig = '3.841';
	# no call is the default
	my $call = 'NN';
	# if the test statistic is significant
	if($D > $sig && $nS >= $min_coverage){ #
		# if a heterozygote was more likely, then that was our alternative hypothesis
		if($L12>$L11){
			$call = $n1base.$n2base;
		}
			# else, our hypothesis was homozygosity
			else{
			$call = $n1base.$n1base;
		}
	}
	if($D > 3.814){
		$call = uc($call);
	}
	else{
		$call = lc($call);
	}

	return $call;
}
# factorial subroutine
sub fact {
	no warnings 'recursion'; # THIS COULD BE DANGEROUS!!!
	my $n = shift;
	$n == 0 ? 1 : $n*fact($n-1);
}

open LIST, $list;

# for thru the list of files, open each and add name to a array to print  
while (<LIST>){  
	chomp;  
	my $sample_file=$_;  
	my $sample=basename($sample_file,".MQgt30.sam");  
	#print "On $sample...\n";  
	#print "File is $sample_file\n";  
	push (@samples,$sample);  
	my %base_count;  
	my %qual_count;  
	  
	open SAM, $sample_file;  
	while (<SAM>){  

		my @read;
		my @quals;
		my $cgr;
		my @cigar;
		my @a = split(/\s/, $_); 
		my $go_on;
		my $done;
		my $indel;
		my $use_ins;
		unless ((/^@/)or($a[2]eq '*')or($a[4]eq 0)or($a[5]eq'*')){ 
			$cgr = $a[5];
			if ($cgr !~ m/(N|H|P|=|X)/){
				$go_on="T";
			}
		}
		if($go_on){
	
			@cigar = split (/M|I|D|S/,$cgr);
			unless ($#cigar>2){
				$cgr =~ s/\d//g;
				if ($cgr eq "MIM"){
					$indel="I";
				}elsif($cgr eq "MDM"){
					$indel="D";
				}elsif($cgr eq "SM"){
					$indel="SS";
				}elsif($cgr eq "MS"){
					$indel="SE";				
				}elsif($cgr eq "M"){
					$indel="no";
				}
			}
		}
		if($indel){
			if ($indel eq "SS"){ # skip the first bit of the read
				$a[9] =substr($a[9],$cigar[0]);
				$a[10] = substr($a[10],$cigar[0]);
			}
			@read = split ('',$a[9]);  
			@quals = split ('',$a[10]);  

			for (my $i = 0; $i<=$#read; $i++) {
				my $p = 10**(-((ord($quals[$i]))-33)/10);
				my $bp;

				#for indels
				if ($i == $cigar[0]){
					if ($indel eq "D"){
						#skip ahead on the reference
						$a[3]= $a[3]+$cigar[1];
					}elsif 	($indel eq "I"){
						#skip ahead on the read
						$use_ins = "T";
					}elsif 	($indel eq "SE"){
						$done = "T";
					}
				}
				$bp = $a[3]+$i;
				unless (($p>$max_error_rate)||($read[$i]eq"N")or ($done)){
					# add the base to the count
					if ($use_ins){
						if ($read[($i+$cigar[1])]){
			 				$base_count{"$a[2]\t".$bp}{$read[($i+$cigar[1])]}++;
							#needs to count the quality here
							if ($qual_count{"$a[2]\t".$bp}){
								$qual_count{"$a[2]\t".$bp} += $p;
							}else{
								$qual_count{"$a[2]\t".$bp} = $p;
							}
						}
					}else{

						$base_count{"$a[2]\t".$bp}{$read[$i]}++;
						if ($qual_count{"$a[2]\t".$bp}){
							$qual_count{"$a[2]\t".$bp} += $p;
						}else{
							$qual_count{"$a[2]\t".$bp} = $p;
						}
					}	
				}
			}
		}		
	}
	close SAM;
	# now the sam is done and you have a hash with counts and quality
	if ($print_counts){	
		open OUT, ">$out"."$sample.MLcounts";
		foreach my $loc (sort keys %base_count){
			print OUT "$loc\t";
			foreach my $base (@bases){

				if ($base_count{$loc}{$base}) {
					print OUT "$base_count{$loc}{$base};";
				}else{
					print OUT "0;";
				}
			}

			print OUT "$qual_count{$loc}\n";
		}
		close OUT;
	}
	#otherwise loop thru the count hash and call the bases
	open OUT, ">$out"."$sample.MLcalls";
	foreach my $loc (sort keys %base_count){
		my @counts;

		if ($base_count{$loc}{"A"}){
			$counts[0] = $base_count{$loc}{"A"};
		}else{
			$counts[0] = '0';
		}
		if ($base_count{$loc}{"T"}){
			$counts[1] = $base_count{$loc}{"T"};
		}else{
			$counts[1] = '0';
		}
		if ($base_count{$loc}{"C"}){
			$counts[2] = $base_count{$loc}{"C"};
		}else{
			$counts[2] = '0';
		}
		if ($base_count{$loc}{"G"}){
			$counts[3] = $base_count{$loc}{"G"};
		}else{
			$counts[3] = '0';
		}
		#print "$loc\t$counts[0]\t$counts[1]\t$counts[2]\t$counts[3]\n";
		my $sum_counts = ($counts[0]+$counts[1]+$counts[2]+$counts[3]);
		my $call;
		if($sum_counts<$min_coverage){
			$call = 'NN';
		}
		else{
			$counts[4] = $qual_count{$loc}/$sum_counts;
			$call = genotype(@counts);
		}
		print OUT "$loc\t$call\n";
	}
	%qual_count = ();
	%base_count = ();	
}
close LIST;
