#!/usr/bin/perl -w

#===============================================================================
# summarize matched read sequences (.profile)
# for all samples with or without sample order file
# if no sample order file provided, samples will be ordered by alphabet automatically.
# ARGV[0]:	sample order file, e.g. sampleOrder,
# 	1	CCACTC_4
# 	2	AAGCTA_3
# 	3	AAGCTA_7
# Need:		"all_sample_matchCount.sum", generated by "sumDist.pl"
# Output:	matched read sequence file (all_sample.profile) for all samples
# Usage:	sumProfile.pl sampleOrder
# ------Note------
# read sequence matches are grouped and output by matchOrder
# (for each matchType, the "+" strand first and then the "-" strand);
# within a group, read sequences are written in alphabetical order of the
# read sequence. matchID could be single or multiple matches!!!
# matchOrder is an ordered array for matchType
# this version keep maxMismatch (0 - 2) and strand information in matchType
# matchType = refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+"
# ------Heavy Duty------
# using a heavy way to summarize profile matchType by matchType progressively 
# for large sample-size dataset ( >40 samples )
#===============================================================================

use strict;
use warnings;

# ------------------------------------------------------------------------------
# Build @sampleList, the ordered list of sample names.
#   - With a sample order file (ARGV[0]): every line that has exactly two
#     tab-separated columns contributes its second column, in file order.
#   - Without one: every file in $dataDIR whose name ends in $suffix
#     contributes its name minus the suffix, sorted alphabetically.
# ------------------------------------------------------------------------------
my $sampleOrder = $ARGV[0];
my $dataDIR = "./";
my @sampleList = ();
my $sample;	# file-scoped on purpose: the output section further down loops with it
my $suffix = "_Processed.profile";

# get sample order
my $i = 0;	# file-scoped counter, reset and reused by the profile-reading loop below
if ( defined $sampleOrder && length $sampleOrder ) {
	# A missing/unreadable order file is fatal: carrying on would silently
	# produce a profile table without any sample columns.
	open(my $orderFH, "<", $sampleOrder)
		or die "can't open sample order file - $sampleOrder: $!\n";
	while ( my $line = <$orderFH> ) {
		# make sure to get rid of any kind of carriage return sign
		$line =~ tr/\r\n//d;
		my @array = split(/\t/, $line);
		# only "<order>\t<sample>" lines are accepted; anything else is skipped
		push @sampleList, $array[1] if scalar(@array) == 2;
	}
	close($orderFH);
} else {
	print "no sample order file input, will retrieve and order all samples automatically.\n";
	opendir(my $dir, $dataDIR) or die "Cannot open directory: $dataDIR: $!";
	my @fileList = readdir $dir;
	closedir $dir;
	# \Q...\E keeps the "." in the suffix literal instead of "any character"
	foreach my $profile ( grep { /\Q$suffix\E$/ } @fileList ) {
		( my $name = $profile ) =~ s/\Q$suffix\E$//;
		push @sampleList, $name;
	}
	# sort the stripped names (not the file names) so the order is by sample name
	@sampleList = sort @sampleList;
}
$i = scalar(@sampleList);	# same final value the index-based bookkeeping used to leave behind
warn "no samples found - the output will contain no sample columns\n" unless @sampleList;

# get match order (ordered by matchType, e.g., "human_miRNA.mis_0")
# The match types are the header columns of "all_sample_matchCount.sum",
# excluding its first and its last column.
my @matchOrder = ();
my $inputFile = "all_sample_matchCount.sum";
open(my $sumFH, "<", $inputFile) or die "can't open $inputFile: $!";
my $firstLine = <$sumFH>;
close($sumFH);
# an empty summary file would otherwise yield an empty match order silently
die "$inputFile is empty, no header line to read the match order from\n"
	unless defined $firstLine;
# strip LF as well as CR so files with Windows line endings parse cleanly
$firstLine =~ tr/\r\n//d;
@matchOrder = split(/\t/, $firstLine);
shift @matchOrder;	# drop the first header column
pop @matchOrder;	# drop the last header column

# output formatted profiles (e.g., all_sample.profile) for all samples
my $outputFile = "all_sample.profile";
print "=== Write formated profiles for all samples to: $outputFile\n";
# Stop right here if the output cannot be created; otherwise every later
# "print Out" would fail without the run ever reporting an error.
# The bareword handle "Out" is kept because the rest of the script prints to it.
open(Out, ">", $outputFile) or die "can't open output file - $outputFile: $!";
# generate table head: six fixed columns followed by one column per sample
print Out join("\t", "matchType", "readSeq", "readLength", "matchID", "offset", "matchNum")
	."\t".join("\t", @sampleList)."\n";


# get profiles for all samples, matchType by matchType progressively for heavy duty
# (>40 samples, when running in a 36G memory machine)
#
# For every match type in @matchOrder, each sample's profile file is re-read
# and only the lines of that match type (either strand) are kept in memory.
# The collected reads are then appended to the output, "+" strand first and
# "-" strand second, before the next match type is processed.
#
# Profile line layout as consumed here (tab-separated):
#   matchType  readID  readSeq  matchID  offset  matchNum  [further columns ignored]
foreach my $type (@matchOrder) {
	print "--- Get matched sequences for ${type}+/-\n";

	# %match: {matchType}{readSeq}{sample} => summed read count
	# %info:  {matchType}{readSeq} => "matchID\toffset\tmatchNum", taken from
	#         the first line seen for that read across all samples
	my %match = ();
	my %info = ();

	my $sampleIndex = 0;
	foreach my $sample (@sampleList) {
		my $profile = $sample.$suffix;
		open(my $profileFH, "<", $profile)
			or die "cannot open profile for sample: $profile: $!";
		$sampleIndex++;

		print "   --- Read matched sequences for #$sampleIndex sample: $sample\n";
		while ( my $line = <$profileFH> ) {
			$line =~ tr/\r\n//d;
			next if $line eq "";

			my ($matchType, $readID, $readSeq, @fields) = split(/\t/, $line);
			# lines without a read sequence column cannot be tallied
			next unless defined $readSeq;

			# strip exactly one trailing strand sign; the character class avoids
			# the alternation pitfall where "\+|\-$" removes a "+" found anywhere
			( my $sampleMatchType = $matchType ) =~ s/[+-]$//;
			next unless $sampleMatchType eq $type;

			# keep matchID, offset, matchNum only; the multiMatch column is
			# dropped to save storage and memory space. Missing columns become
			# empty strings so the output always has the same number of fields.
			my @alignInfo = map { defined $_ ? $_ : "" } @fields[0..2];

			# decode collapsed read sequence head (>SN-countNumber, e.g., >6-128, readNum = 128)
			my (undef, $readNum) = split(/\-/, $readID);
			# if NOT specified, then count one alignment as one read count
			$readNum = 1 if !$readNum;

			# read count; the same read may show up more than once per sample
			# (just in case for BIOO trimming), so counts are accumulated
			$match{$matchType}{$readSeq}{$sample} += $readNum;

			# remember the alignment info from whichever sample reports the read first
			if ( !(exists $info{$matchType}{$readSeq}) ) {
				$info{$matchType}{$readSeq} = join("\t", @alignInfo);
			}
		}
		close($profileFH);
	}

	# write matched read sequences for all samples, matchType by matchType progressively
	foreach my $strand ("+", "-") {
		my $matchType = $type.$strand;
		next unless exists $match{$matchType};

		# Disabled alternative: one output file per matchType instead of a single file.
		# my $outputFile = "all_sample_".$matchType."\.profile";
		# print "=== Write formated profiles for all samples to: $outputFile\n";
		# open(Out, ">", $outputFile);
		# # generate table head
		# print Out "matchType\t"."readSeq\t"."readLength\t"."matchID\t"."offset\t"."matchNum\t".join("\t", @sampleList)."\n";

		# ordering: plain alphabetical order of the read sequences. A separate
		# sort by length beforehand would be discarded by this sort anyway,
		# because the keys are distinct strings.
		my @allReadSeq = sort keys %{ $match{$matchType} };

		# write matched sequences on each matchType.+/- for all samples
		foreach my $readSeq (@allReadSeq) {
			# "matchID\toffset\tmatchNum"; three empty fields if nothing was recorded,
			# so no value from a previous read can leak into this row
			my $alignInfo = exists $info{$matchType}{$readSeq}
				? $info{$matchType}{$readSeq}
				: "\t\t";

			# one column per sample; left empty when the sample lacks this read
			my @counts = map {
				exists $match{$matchType}{$readSeq}{$_} ? $match{$matchType}{$readSeq}{$_} : ""
			} @sampleList;

			# multiMatch is deliberately not written, to save storage space
			print Out join("\t", $matchType, $readSeq, length($readSeq), $alignInfo, @counts), "\n";
		}
	}
}
# buffered write errors (e.g. disk full) only surface when the handle is closed
close(Out) or die "error while closing output file: $!";