#!/usr/bin/perl -w

#===============================================================================
# Add geneName to human_RNA features (matchType, matchID, taxonomy, geneName, description, readNum)
# ARGV[0]: 	any feature file (*.feature) for a single sample or all samples
# Output: 	gene feature file (*.gene) with columns matchType, geneName and, for each
#		sample, the maximum read count found for that matchType/gene pair
# Usage:	sumGene.pl all_sample_human_anno.human_RNA.feature
#===============================================================================

use strict;
use warnings;

# ------------------------------------------------------------------------------
# Collapse a tab-separated feature table to one row per (matchType, gene).
#
# Input layout, as read below: column 0 is the match type, column 3 is the
# description, and the columns from index $j onward hold one read count per
# sample; the first line is a header whose sample names are copied to the output.
# The gene name is the text inside the last "(...)" of the description.
# For every sample the largest read count seen for a (matchType, gene) pair is
# kept; samples without a non-zero count are written as empty cells.
#
# Dies when the argument is missing, the input does not exist or is empty, or
# either file cannot be opened / the output cannot be flushed.
# ------------------------------------------------------------------------------

my $inputFile = $ARGV[0];
# without this check a missing argument only shows up as "uninitialized" warnings
if ( !defined($inputFile) ) {
	die "   Usage: $0 <input.feature>\n";
}
# test if input file exists
if ( !-e $inputFile ) {
	die "   Can not open $inputFile !!!\n";
}

# Derive the output name up front so a bad name is caught before any parsing.
# The substitution is restricted to the file-name part of the path (no "/" may
# follow the match) so a directory that happens to contain ".feature" is left
# alone. When the name has no ".feature" at all, ".gene" is appended instead:
# keeping the name unchanged would make the output overwrite the input.
my $outputFile = $inputFile;
unless ( $outputFile =~ s{\.feature(?=[^/]*\z)}{.gene} ) {
	$outputFile .= ".gene";
}

# read head line

open(my $in, "<", $inputFile) or die "can not open file: $inputFile: $!\n";

my $firstLine = <$in>;
if ( !defined($firstLine) ) {
	die "   $inputFile is empty, no head line found !!!\n";
}
$firstLine =~ s/\r?\n\z//; # also strips the "\r" of CRLF files
my @array = split(/\t/, $firstLine);
# read sample list
my $size = scalar(@array);
my $j = 4; # first sample position in array, depends on feature file format
my @sampleList = @array[$j..($size - 1)];

# get gene name and max read count
my %match = (); # $match{matchType}{gene}{sample index} = max read count
while ( my $line = <$in> ) {
	$line =~ s/\r?\n\z//;
	my @field = split(/\t/, $line);
	# blank or truncated lines carry no description and no counts
	next if scalar(@field) <= 3;
	my $matchType = $field[0];
	my $description = $field[3];
	# greedy ".*" makes the capture the LAST parenthesised group, which also
	# works when the description ends right after the closing parenthesis
	my ($gene) = $description =~ /.*\(([^()]*)\)/s;
	if ( !defined($gene) or $gene eq "" ) {
		# no usable gene name: skip instead of pooling such rows under ""
		warn "   no gene name in description at $inputFile line $.: $description\n";
		next;
	}
	# find max read count numbers on uniq genes for each sample
	for my $i (0..($size - $j - 1)) { # for each sample
		my $count = $field[$j + $i];
		next if !$count; # missing, empty or zero count
		if ( !exists($match{$matchType}{$gene}{$i}) or $match{$matchType}{$gene}{$i} < $count ) {
			$match{$matchType}{$gene}{$i} = $count;
		}
	}
}
close($in);

# output geneName feature and max read count
open(my $out, ">", $outputFile) or die "can not write file: $outputFile: $!\n";
# output head
print {$out} "matchType\t"."geneName\t".join("\t", @sampleList)."\n";
# output content
foreach my $matchType (sort keys %match) {
	foreach my $gene (sort keys %{ $match{$matchType} }) {
		my $countOf = $match{$matchType}{$gene};
		# one cell per sample, left empty when no count was recorded
		my @cell = map { exists($countOf->{$_}) ? $countOf->{$_} : "" } 0..($size - $j - 1);
		print {$out} join("\t", $matchType, $gene, @cell)."\n";
	}
}
# buffered write errors (e.g. disk full) only surface at close
close($out) or die "can not finish writing $outputFile: $!\n";
