#!/usr/bin/perl -w

#===============================================================================
# summarize matched features (.feature) for all samples directly
# from matched read sequences (.profile) for all samples
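# ARGV[0]:	all_sample.profile
# Option:	--project NAME (prefix of the matchCount file; default "all_sample")
# Need:		"<project>_matchCount.sum" (e.g., "all_sample_matchCount.sum"),
#		generated by "sumDist.pl"
# Output:	matched feature file for all samples, named after the input with
#		".profile" replaced by ".feature" (e.g., all_sample.feature)
# Usage:	sumFeatureP.pl [--project NAME] all_sample.profile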
# ------Note------
# features are ranked by matchOrder and then alphabetically by matchID
# matchOrder is an ordered array of matchTypes (without the strand suffix)
# this version keeps maxMismatch (0 - 2) and strand information in matchType
# matchType = refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+"
#===============================================================================

use strict;
use warnings;
use Getopt::Long qw(GetOptions);

# Run the command-line program only when this file is executed directly, so
# the helper subs below can be loaded by another script without side effects.
main() unless caller;

# Entry point: read the match order and the per-read profile table, then write
# the per-feature count table next to the profile.
#   --project NAME  prefix of "<NAME>_matchCount.sum" (default "all_sample")
#   ARGV[0]         path of the .profile file
# Dies with a message on bad usage or on any file that cannot be opened,
# is empty, or cannot be written.
sub main {
	my $usage = "Usage: $0 [--project NAME] all_sample.profile\n";

	# get the project name from the command line, or use all_sample as the default
	my $project_name = "all_sample";
	GetOptions("project=s" => \$project_name) or die $usage;

	my $profile = $ARGV[0];
	die $usage unless defined $profile && length $profile;

	# get match order (ordered by matchType without strand, e.g., "human_miRNA.mis_0")
	my $sum_file = $project_name."_matchCount.sum";
	open(my $sum_fh, "<", $sum_file) or die "Can not open $sum_file: $! !!!\n";
	my $sum_header = <$sum_fh>;
	close($sum_fh);
	die "$sum_file is empty !!!\n" unless defined $sum_header;
	my @match_order = parse_match_order($sum_header);

	# read matched read sequences (.profile) for all samples
	open(my $in, "<", $profile) or die "Can not open $profile: $! !!!\n";
	print "--- Read matched read sequences for all samples from: $profile\n";
	my ($samples, $match) = read_profile($in, $profile);
	close($in);

	# output formatted features for all samples
	my $output_file = feature_path_for($profile);
	print "=== Write formated features for all samples to: $output_file\n";
	open(my $out, ">", $output_file) or die "Can not write $output_file: $! !!!\n";
	write_features($out, \@match_order, $samples, $match);
	# buffered write errors only surface at close, so check it
	close($out) or die "Can not close $output_file: $! !!!\n";
	return;
}

# Extract the ordered matchType names from the first line of the
# matchCount.sum table: every tab-separated column except the first and the
# last one (presumably a row label and a total column -- confirm against
# sumDist.pl).
sub parse_match_order {
	my ($header_line) = @_;
	chomp($header_line);
	my @columns = split(/\t/, $header_line);
	shift @columns;
	pop @columns;
	return @columns;
}

# Read a .profile table from an open filehandle.
#   $in    filehandle positioned at the table head
#   $name  label used in error messages (optional)
# The first six columns of every line are matchType, readSeq, readLength,
# matchID, offset and matchNum; the remaining columns are one per sample.
# Returns (\@samples, \%match) where $match{matchType}{matchID}{sample} is the
# summed count. Only non-zero, non-empty counts create an entry.
# Dies when the table has no head line.
sub read_profile {
	my ($in, $name) = @_;
	$name = "profile" unless defined $name;

	# read table head (first line); everything after column 6 is the sample list
	my $head = <$in>;
	die "$name is empty !!!\n" unless defined $head;
	chomp($head);
	my @samples = split(/\t/, $head);
	splice(@samples, 0, 6);

	my %match = ();
	while (my $line = <$in>) {
		chomp($line);
		my ($match_type, undef, undef, $match_id, undef, undef, @counts)
			= split(/\t/, $line);

		# sum count numbers for features
		for my $i (0 .. $#samples) {
			my $count = $counts[$i];
			next unless $count;    # skips missing, empty and zero counts
			my $sample = $samples[$i];
			my $by_sample = $match{$match_type}{$match_id} ||= {};
			if ( exists $by_sample->{$sample} ) {
				$by_sample->{$sample} += $count;
			} else {
				# first value is stored verbatim, not numified
				$by_sample->{$sample} = $count;
			}
		}
	}
	return (\@samples, \%match);
}

# Derive the output path from the profile path.
# A trailing ".profile" is replaced by ".feature"; failing that, the first
# ".profile" anywhere in the path is replaced (the historical behaviour);
# failing that, ".feature" is appended so that the output never has the same
# path as the input and cannot overwrite it.
sub feature_path_for {
	my ($profile) = @_;
	my $feature = $profile;
	$feature =~ s/\.profile\z/.feature/
		or $feature =~ s/\.profile/.feature/
		or $feature .= ".feature";
	return $feature;
}

# Write the feature table to an open filehandle.
#   $out          output filehandle
#   $match_order  array ref of matchTypes without strand, in output order
#   $samples      array ref of sample names (column order)
#   $match        hash ref as returned by read_profile
# Rows are grouped by matchType in the given order, "+" strand before "-",
# and sorted by matchID (string sort) inside each group. A sample without a
# count for a feature gets an empty cell.
sub write_features {
	my ($out, $match_order, $samples, $match) = @_;

	# generate table head
	print {$out} "matchType\t"."matchID\t".join("\t", @{$samples})."\n";

	foreach my $type (@{$match_order}) {
		foreach my $strand ("+", "-") {
			my $match_type = $type.$strand;
			my $by_id = $match->{$match_type} or next;
			foreach my $match_id (sort keys %{$by_id}) {
				my $by_sample = $by_id->{$match_id};
				my @cells = map {
					exists $by_sample->{$_} ? $by_sample->{$_} : ""
				} @{$samples};
				print {$out} join("\t", $match_type, $match_id, @cells)."\n";
			}
		}
	}
	return;
}
