#!/usr/bin/perl -w

#===============================================================================
# Summarize count numbers per matchID, merging all mismatch levels (mis_0, mis_1, and mis_2), for all samples in a specific annotated feature file
# ARGV[0]:	feature file (*_anno.feature) annotated with both description
# 			and taxonomy information
# Output:	one summary file per match type (<prefix>_<matchType>.sum)
# Usage:	sumMatchID.pl all_sample_human_anno.feature
#===============================================================================

use strict;
use warnings;

# --- command line and file-name handling --------------------------------------
# The script takes exactly one argument: the annotated feature file. Stop with
# a usage message when it is missing; otherwise every test below would operate
# on undef and only produce "uninitialized value" warnings.
if ( !defined $ARGV[0] or $ARGV[0] eq "" ) {
	die "   Usage: sumMatchID.pl <*_anno.feature>\n";
}
my $feature = $ARGV[0];

# Output files are named <prefix>_<matchType>.sum, where <prefix> is the input
# file name without its ".feature" extension. The dot is escaped directly in
# the regex: inside a double-quoted string "\.feature" collapses to ".feature",
# whose leading dot would then match any character.
my $prefix = $feature;
$prefix =~ s/\.feature$//;
my $suffix = ".sum";

# $match{matchType}{matchID}{sampleIndex} = summed count number
my %match = ();
# fields of the line that is currently being processed
my @array = ();

# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}
# test if input feature file has annotation information (both description and
# taxonomy information); this is judged from the file name only
if ( ($feature !~ /\_anno/) or ($feature !~ /\.feature$/) ) {
	print "   This is NOT a feature file with annotation information!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit;
}

# --- read the feature file ----------------------------------------------------
# Expected column layout (tab separated), as used below:
#   0: matchType   1: matchID   2: taxonomy   3: description   4..: one column per sample
open(my $in, "<", $feature) or die "Can not open $feature !!! ($!)";

# read feature file head
my $firstLine = <$in>;
if ( !defined $firstLine ) {
	die "the feature file $feature is empty!!!";
}
# strip the line ending, including a "\r" left over from CRLF files, so the
# last sample name stays clean
$firstLine =~ s/[\r\n]+\z//;
@array = split(/\t/, $firstLine);
if ( !defined $array[2] or $array[2] ne "taxonomy" ) {
	die "the feature file has no taxonomy information annotated!!!";
}

# read sample list
my $size = scalar(@array); # number of columns in the head line
my $j = 4; # first sample position in array, depends on feature file format
my @sampleList = @array[$j..($size - 1)];

# sum count numbers based on each matchID
while ( my $line = <$in> ) { # read annotated features
	$line =~ s/[\r\n]+\z//;
	next if $line !~ /\S/; # blank lines carry no counts

	@array = split(/\t/, $line);
	my $matchType = $array[0];
	my $matchID = $array[1];

	# skip RNA match types whose name ends with "-"
	if ( ($matchType =~ /RNA/) and ($matchType =~ /\-$/) ) {
		next;
	}

	# drop the mismatch suffix (".mis_N" / "_sub.mis_N") so that all mismatch
	# levels of one match type are summed together, then normalize the name
	my @string = split(/\.mis\_|\_sub\.mis\_/, $matchType);
	$matchType = defined $string[0] ? $string[0] : "";
	$matchType =~ s/\_subSeq/\_repSeq/;
	$matchType =~ s/\_sub//;

	# count numbers for all samples; columns missing from a short row are undef
	my @count = @array[$j..($size - 1)];

	for my $i (0..$#count) { # for each sample
		next if !$count[$i]; # nothing to add for empty, zero or missing cells
		if ( exists $match{$matchType}{$matchID}{$i} ) {
			$match{$matchType}{$matchID}{$i} += $count[$i];
		} else {
			# the first value is stored verbatim, so an ID that occurs on a
			# single row is written back exactly as it appeared in the input
			$match{$matchType}{$matchID}{$i} = $count[$i];
		}
	}
}
close($in);

# --- write one summary file per matchType -------------------------------------
# Each file is named <prefix>_<matchType>.sum. Its first line is the matchType
# followed by the sample names; every further line is a matchID followed by one
# tab-separated cell per sample (left empty when no count was recorded).
my $head = join("\t", @sampleList);
my $lastSample = $size - $j - 1; # index of the last sample column

foreach my $matchType ( sort keys %match ) { # for each matchType
	my $outputFile = $prefix."_".$matchType.$suffix;
	# fail loudly: an unchecked open would silently produce no summary file
	open(my $out, ">", $outputFile) or die "   Can not write $outputFile: $!\n";
	print "   === write summarization file - $outputFile\n";

	# write summary head
	print {$out} $matchType."\t".$head."\n";

	# write summary for each matchID
	foreach my $matchID ( sort keys %{ $match{$matchType} } ) {
		my @cell = map {
			exists $match{$matchType}{$matchID}{$_} ? $match{$matchType}{$matchID}{$_} : ""
		} 0..$lastSample;
		print {$out} join("\t", $matchID, @cell)."\n";
	}

	# buffered write errors (e.g. full disk) only surface when closing
	close($out) or die "   Can not close $outputFile: $!\n";
}
