#!/usr/bin/perl -w

#===============================================================================
# Summarize the read length distribution (as a fraction of the total reads) and
# the match rate from the (<sample>_Processed.dist) file of all samples,
# with or without a sample order file.
# If no sample order file is provided, samples are ordered alphabetically.
# ARGV[0]:	sample order file (optional), e.g. sampleOrder,
# 	1	CCACTC_4
# 	2	AAGCTA_3
# 	3	AAGCTA_7
# Options:	--project NAME	prefix of the output file names (default: all_sample)
# Output:	read length distribution summary file (all_sample.distRate.sum) for all samples
#			match rate summary file (all_sample_matchRate.sum) on each step for all samples
# Usage:	sumRate.pl [--project NAME] [sampleOrder]
#===============================================================================

use strict;
use warnings;
use Getopt::Long qw(GetOptions);

# Command-line handling and script-wide settings.
# --project NAME sets the prefix of the output file names; all_sample is the default.
my $project_name = "all_sample";
# GetOptions returns false when it meets an unknown or malformed option; stop
# with a usage message instead of silently running with the defaults.
GetOptions("project=s" => \$project_name)
	or die "Usage: sumRate.pl [--project NAME] [sampleOrder]\n";

# GetOptions removes the options it recognised from @ARGV, so the first
# remaining argument (if any) is the optional sample order file.
my $sampleOrder = $ARGV[0];
my $dataDIR = "./";	# directory scanned for profiles when no sample order file is given
my @sampleList = ();	# sample names, in output order
my $sample;	# scratch variable shared by the sections below
my $suffix = "_Processed.dist";	# profile file name = sample name + this suffix

# get sample order
# Fill @sampleList either from the sample order file (second column of every
# line that has exactly two tab-separated columns) or, when no file was given,
# from the profile files found in $dataDIR, sorted by name.
my $i = 0;	# declared at file scope because it is reused as a counter further down
if ( $sampleOrder ) {
	open(my $orderFh, "<", $sampleOrder)
		or die "   can't open sample order file - $sampleOrder: $!\n";
	while ( my $line = <$orderFh> ) {
		# make sure to get rid of any kind of carriage return sign
		$line =~ s/[\r\n]//g;
		my @fields = split(/\t/, $line);
		# lines that do not have exactly two columns are ignored
		next unless scalar(@fields) == 2;
		push @sampleList, $fields[1];
	}
	close($orderFh);
	$i = scalar(@sampleList);
} else {
	print "no sample order file input, will retrieve and order all samples automatically.\n";
	opendir(my $dir, $dataDIR) or die "   Can't open directory: $dataDIR: $!";
	my @fileList = readdir $dir;
	closedir $dir;
	# \Q...\E makes the suffix match literally; without it the "." in the
	# suffix would match any character.
	my @profileList = grep { /\Q$suffix\E$/ } @fileList;
	foreach my $profile (@profileList) {
		my $name = $profile;
		$name =~ s/\Q$suffix\E$//;
		push @sampleList, $name;
	}
	@sampleList = sort @sampleList;
	$i = scalar(@sampleList);
}

# get distributions for all samples
# For every sample the profile <sample>$suffix is read (tab-separated, header
# line starting with "refName") and two things are kept:
#   $dist{sample}: columns 4..last of the first data row; element 1 of that
#                  list (column 5) is used as the total input read number and
#                  the elements after it as the per-length read numbers
#   $rate{sample}: for every row whose third column is "match", column 5
#                  divided by the total input read number
# @matchType is rebuilt for every sample, so after the loop it holds the
# "<column 1>.mis_<column 2>" labels of the last sample only.
my %dist = ();
my %rate = ();
my @matchType = ();
my $maxReadLength = 50; # default value is 50, but will be changed according to the output .dist file
my $widestProfile;	# largest number of read-length columns found in any sample
$i = 0;
foreach my $sample (@sampleList) {
	my $profile = $sample.$suffix;
	open(my $profileFh, "<", $profile) or die "cannot open profile for sample: $profile: $!";
	$i++;
	print "--- Read distribution infomation for #$i sample: $sample\n";
	my @inputDist = ();
	my @matchRate = ();
	@matchType = ();
	my $totalInput = 1;
	my $seenFirstRow = 0;
	while ( my $line = <$profileFh> ) {
		$line =~ s/[\r\n]+$//;	# drop LF as well as CRLF line ends
		next if $line =~ /^refName/;	# avoid head
		next if $line !~ /\S/;	# a blank line carries no data
		my @fields = split(/\t/, $line);
		my $num = scalar(@fields);
		if ( $num < 5 ) {
			# columns 3 and 5 are needed below; without them the row is unusable
			warn "skipping malformed line $. in $profile\n";
			next;
		}
		if ( !$seenFirstRow ) {	# get total input read number and read length distribution
			$totalInput = $fields[4];
			@inputDist = @fields[3..($num - 1)];
			# The first five columns are not per-length counts. Track the widest
			# sample so that no sample is cut short when the table is written.
			if ( !defined $widestProfile || $num - 5 > $widestProfile ) {
				$widestProfile = $num - 5;
			}
			$seenFirstRow = 1;
		}
		if ( $fields[2] eq "match" ) {	# get matched read number on each step
			push @matchType, $fields[0].".mis_".$fields[1];
			# a sample without input reads gets rate 0 instead of a fatal division by zero
			push @matchRate, ( $totalInput == 0 ? 0 : $fields[4] / $totalInput );
		}
	}
	close($profileFh);
	$dist{$sample} = \@inputDist;
	$rate{$sample} = \@matchRate;
}
# keep the default when no data row was read at all
$maxReadLength = $widestProfile if defined $widestProfile;

# output distributions for all samples
# Writes one row per sample: the sample name followed by every per-length read
# number divided by the sample's total read number (element 1 of $dist{sample}).
# NOTE(review): this file name joins the project name with ".", whereas the
# match rate file uses "_"; kept as is so existing consumers keep working.
my $outputFile = $project_name.".distRate.sum";
print "=== Write distributions for all samples to: $outputFile\n";
open(my $distFh, ">", $outputFile) or die "cannot open $outputFile for writing: $!";

# Number of read-length columns: at least $maxReadLength, and wide enough for
# the longest stored distribution (its first two elements are not per-length
# counts), so that every count is normalised and all rows have the same width.
my $lengthColumns = $maxReadLength;
foreach my $stored (values %dist) {
	my $available = scalar(@$stored) - 2;
	$lengthColumns = $available if $available > $lengthColumns;
}

# header row: no trailing tab, so it has as many columns as the data rows
print {$distFh} "sample_ID\t".join("\t", (1..$lengthColumns))."\n";
foreach my $sample (@sampleList) {
	my @counts = ();
	if ( exists $dist{$sample} ) {
		@counts = @{ $dist{$sample} };
	}
	my $readN = $counts[1];
	my $hasReads = ( defined $readN && $readN != 0 ) ? 1 : 0;
	if ( !$hasReads ) {
		warn "no input reads found for sample $sample, writing 0 for every read length\n";
	}
	# calculate distribution percentage
	my @fractions = ();
	foreach my $length (1..$lengthColumns) {
		my $count = $counts[$length + 1];
		# lengths missing from a shorter profile count as 0
		push @fractions, ( ( $hasReads && defined $count ) ? $count / $readN : 0 );
	}
	print {$distFh} $sample."\t".join("\t", @fractions)."\n";
}
# buffered write errors only show up when the handle is closed
close($distFh) or die "cannot close $outputFile: $!";

# output match rates on each step for all samples
# Header row: "sample_ID" followed by the labels in @matchType.
# Data rows:  the sample name followed by the rates stored in $rate{sample}
#             (nothing after the name when no rates are stored for it).
$outputFile = $project_name."_matchRate.sum";
print "=== Write match rates for all samples to: $outputFile\n";
open(my $rateFh, ">", $outputFile) or die "cannot open $outputFile for writing: $!";
print {$rateFh} "sample_ID\t";
print {$rateFh} join("\t", @matchType), "\n";
foreach my $sample (@sampleList) {
	my @rates = ();
	if ( exists $rate{$sample} ) {
		@rates = @{ $rate{$sample} };
	}
	print {$rateFh} $sample."\t".join("\t", @rates)."\n";
}
# buffered write errors only show up when the handle is closed
close($rateFh) or die "cannot close $outputFile: $!";