#!/usr/bin/perl -w

#===============================================================================
# [12/18/2015] fit to miRNA profiling by multiple assignment approach
# summarize matched read sequences (.profile) for multiple assignment approach
# for all samples with or without sample order file
# if no sample order file is provided, samples are ordered alphabetically by name automatically.
# ARGV[0]:	(optional) sample order file, e.g. sampleOrder,
# 	1	CCACTC_4
# 	2	AAGCTA_3
# 	3	AAGCTA_7
# Need:		"all_sample_matchCount.sum", generated by "sumDist.pl"
# Output:	matched read sequence file (all_sample.profile) for all samples
# Usage:	sumProfile.pl [--project NAME] [sampleOrder]
# ------Note------
# read sequence matches are grouped and output by matchOrder
# read sequences are ranked first by alphabet on matchID,
# and then by read sequence length. matchID could be single or multiple matches!!!
# matchOrder is an ordered array for matchType
# this version keeps maxMismatch (0 - 2) and strand information in matchType
# matchType = refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+"
#===============================================================================

use strict;
use warnings;

use Getopt::Long qw(GetOptions);

# Command-line handling and file-level settings.
#   --project NAME  prefix of the match-count summary that is read
#                   (NAME_matchCount.sum) and of the merged profile that is
#                   written (NAME.profile); defaults to "all_sample".
#   ARGV[0]         optional sample order file (what is left in @ARGV after
#                   option parsing).
my $project_name = "all_sample";
# Stop on a malformed command line (unknown option, --project without a value)
# instead of silently carrying on with the default project name.
GetOptions("project=s" => \$project_name)
	or die "Usage: $0 [--project NAME] [sampleOrder]\n";

my $sampleOrder = $ARGV[0];	# undef when no sample order file was given
my $dataDIR = "./";	# directory scanned for profiles when no order file is given
my @sampleList = ();	# sample names, in output column order
my $sample;	# file-level scratch variable for a sample name
my $suffix = "_Processed.profile";	# profile file name = sample name + this suffix

# Build @sampleList, the ordered list of sample names.
#   - With a sample order file: every line holding exactly two tab-separated
#     fields contributes its second field (the sample name), in file order;
#     all other lines are ignored.
#   - Without one: every entry of $dataDIR whose name ends in $suffix becomes
#     a sample (file name minus the suffix), sorted with Perl's default
#     string sort.
# $i stays declared here because code further down the file assigns to it.
my $i = 0;
if ( $sampleOrder ) {
	# An unreadable order file is fatal: carrying on would read from an
	# unopened handle and quietly produce a profile without sample columns.
	open(my $orderFH, "<", $sampleOrder)
		or die "can't open sample order file - $sampleOrder: $!\n";
	while ( my $line = <$orderFH> ) {
		# drop every CR/LF so files with Windows line ends parse the same way
		$line =~ tr/\r\n//d;
		my @fields = split(/\t/, $line);
		next unless scalar(@fields) == 2;
		push @sampleList, $fields[1];
	}
	close($orderFH);
} else {
	print "no sample order file input, will retrieve and order all samples automatically.\n";
	opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR: $!";
	my @fileList = readdir $dir;
	closedir $dir;
	# \Q...\E matches the suffix literally; unquoted, the "." in it would
	# match any character and accept names such as "x_ProcessedXprofile".
	foreach my $profile ( grep { /\Q$suffix\E$/ } @fileList ) {
		( my $name = $profile ) =~ s/\Q$suffix\E$//;
		push @sampleList, $name;
	}
	@sampleList = sort @sampleList;
}
# Not fatal, but worth knowing: the merged table will have no sample columns.
warn "warning: no samples found, the merged profile will contain no sample columns\n"
	unless @sampleList;

# Read the match order from the first line of <project>_matchCount.sum:
# the tab-separated column names of that line, with the first and the last
# column dropped. The remaining names are matchTypes without the strand
# sign, e.g. "human_miRNA.mis_0", in the order they will be written out.
my @matchOrder = ();
my $inputFile = $project_name."_matchCount.sum";
open(my $sumFH, "<", $inputFile) or die "cannot open $inputFile: $!";
my $firstLine = <$sumFH>;
close($sumFH);
# An empty summary file yields undef here; stop with a clear message rather
# than chomp/split an undefined value and continue with no match types.
defined $firstLine
	or die "$inputFile is empty: no header line to take the match order from";
chomp($firstLine);
@matchOrder = split(/\t/, $firstLine);
shift @matchOrder;	# drop the first column
pop @matchOrder;	# drop the last column

# Load the profile of every sample in @sampleList into %match:
#   $match{matchType}{readSeq}{"matchID\toffset\tmatchNum"}{sample} = read count
# A profile is the file <sample><suffix>; each line is tab-separated with the
# leading columns matchType, readID, readSeq, matchID, offset, matchNum.
# Any further columns are ignored.
my %match = ();
my %info = ();
# file-level scratch variables
my $matchType;
my $readID;
my $readSeq;
my $matchID;
my $offset;
my $matchNum;
my $multiMatch;
my $readNum;

$i = 0;
foreach my $sample (@sampleList) {
	my $profile = $sample.$suffix;
	open(my $profileFH, "<", $profile) or die "cannot open profile for sample: $profile: $!";
	$i++;

	print "--- Read matched sequences for #$i sample: $sample\n";
	while ( my $line = <$profileFH> ) {
		# strip CR as well as LF (as the sample order parser does), so a
		# CRLF file cannot leave "\r" glued to the last column
		$line =~ tr/\r\n//d;
		# a blank line carries no match; skipping it avoids junk keys and
		# "uninitialized value" warnings
		next if $line eq "";

		# only the first six columns are used
		my @fields = split(/\t/, $line);
		($matchType, $readID, $readSeq, $matchID, $offset, $matchNum) = @fields;

		# decode collapsed read sequence head (>SN-countNumber, e.g., >6-128, readNum = 128);
		# anything that is not exactly "<x>-<digits>" counts as a single read
		my @head = split(/\-/, $readID);
		$readNum = ( scalar(@head) == 2 && $head[1] =~ /^\d+$/ ) ? $head[1] : 1;

		# composite key: matchID, offset and matchNum joined by tabs
		my $matchIDComp = $matchID."\t".$offset."\t".$matchNum;

		# += creates the entry on first sight and accumulates when the same
		# read sequence shows up again in the same sample
		$match{$matchType}{$readSeq}{$matchIDComp}{$sample} += $readNum;
	}
	close($profileFH);
}

# Write the merged profile table <project>.profile (e.g., all_sample.profile).
# Columns: matchType, readSeq, readLength, matchID, offset, matchNum, then one
# read-count column per sample in @sampleList order (left empty when the
# sample has no count for that row).
# Row order: matchTypes in @matchOrder order, "+" strand before "-"; within a
# matchType read sequences in default string order; within a read sequence
# the composite matchID/offset/matchNum keys in default string order.
my $outputFile = $project_name.".profile";
print "=== Write formated profiles for all samples to: $outputFile\n";
# fail right away if the output cannot be created, instead of printing to a
# closed handle and ending without a result file
open(my $outFH, ">", $outputFile) or die "cannot open $outputFile for writing: $!";
# generate table head
print $outFH "matchType\t"."readSeq\t"."readLength\t"."matchID\t"."offset\t"."matchNum\t".join("\t", @sampleList)."\n";

# write matched read sequences for all samples
foreach my $type (@matchOrder) {
	foreach my $strand ("+", "-") {
		my $typeKey = $type.$strand;

		# nothing matched this matchType/strand in any sample: no rows
		# (and no empty entry gets created in %match by looking)
		my $seqTable = $match{$typeKey};
		next unless $seqTable;

		# One plain string sort gives the final order. Sorting by length
		# beforehand has no effect, because the string sort that follows
		# reorders every (distinct) key anyway.
		foreach my $seq ( sort keys %{$seqTable} ) {
			my $seqLength = length($seq);
			my $idTable = $seqTable->{$seq};

			foreach my $idKey ( sort keys %{$idTable} ) {
				my $countOf = $idTable->{$idKey};
				my @counts = map { exists $countOf->{$_} ? $countOf->{$_} : "" } @sampleList;

				# $idKey already is "matchID<TAB>offset<TAB>matchNum", so it
				# fills those three columns as it stands
				print $outFH join("\t", $typeKey, $seq, $seqLength, $idKey, @counts), "\n";
			}
		}
	}
}
# buffered write errors (e.g. disk full) only surface at close
close($outFH) or die "error while writing $outputFile: $!";