#!/usr/bin/perl -w

#===============================================================================
# Summarize taxonomy counts for all samples in an annotated feature file.
# ARGV[0]:	feature file (*_anno.feature) annotated with both description
# 			and taxonomy information
# Output:	one summary file (*_rankName.sum) per classification level of interest
#			(rankName = phylum, class, order, family, genus, species, sciname)
# Usage:	sumTax.pl all_sample_bacteria_anno.feature
#			sumTax.pl all_sample_fungi_anno.feature
#			sumTax.pl all_sample_virus_anno.feature
#			...
#===============================================================================

use strict;
use warnings;

# Rank names, indexed by position in the ";"-separated taxonomy string of the
# feature file. Only indexes 2..8 (phylum .. sciname) are summarized.
my @classification = ("superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species", "sciname");

# the feature file is the only command line argument
if ( !defined $ARGV[0] or $ARGV[0] eq "" ) {
	die "   Usage: $0 <prefix>_anno.feature\n";
}

my $feature = $ARGV[0];
# output file prefix = input file name without the "_anno.feature" suffix
my $prefix = $feature;
my $suffix = "_anno.feature";
$prefix =~ s/\Q$suffix\E$//; # \Q..\E so the "." in the suffix is matched literally
$suffix = ".sum"; # from here on $suffix is the extension of the summary files
my %match = (); # $match{rank index}{taxonomy name}{sample index} = summed count
my @array = (); # scratch array reused by the code below

# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}
# test if input feature file has annotation information (both description and taxonomy information)
if ( ($feature !~ /\_anno/) or ($feature !~ /\.feature$/) ) {
	# report on STDERR and exit non-zero so that calling scripts can detect the failure
	print STDERR "   This is NOT a feature file with annotation information!!!\n";
	print STDERR "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit 1;
}

# Open the feature file and read its tab-separated header line.
# The bareword handle In is left open on purpose: the read loop below consumes
# the remaining lines from it and closes it.
open(In, "<", $feature) or die "Can not open $feature: $!";
my $firstLine = <In>; # read feature file head
if ( !defined $firstLine ) {
	die "the feature file $feature is empty!!!";
}
$firstLine =~ s/\r?\n\z//; # strip LF or CRLF line ending
@array = split(/\t/, $firstLine);
# the third column must be the taxonomy annotation (it is missing on a short header)
if ( !defined $array[2] or $array[2] ne "taxonomy" ) {
	die "the feature file has no taxonomy information annotated!!!";
}
# read sample list: every header column from index $j onward is a sample name
my $size = scalar(@array);
my $j = 4; # first sample position in array, depends on feature file format
my @sampleList = @array[$j..($size - 1)];

# Sum the per-sample count numbers under each taxonomy name, for every rank
# index 2..8 of @classification ("phylum" .. "sciname").
# Input:  remaining lines of the feature file (handle In); column 3 holds the
#         ";"-separated taxonomy string, columns $j..($size - 1) hold the counts.
# Result: $match{rank index}{taxonomy name}{sample index} = summed count.
while ( my $line = <In> ) { # read annotated features
	$line =~ s/\r?\n\z//; # strip LF or CRLF line ending
	next if $line eq ""; # skip blank lines
	my @fields = split(/\t/, $line);
	# short lines have no taxonomy column; treat them as having no taxonomy at all
	my $taxonomy = defined $fields[2] ? $fields[2] : "";
	my @taxInfo = split(/\;/, $taxonomy);
	my @counts = @fields[$j..($size - 1)]; # count numbers for all samples (undef where the line is short)

	foreach my $rank (2..8) { # for each rank
		# read taxonomy name at specific rank
		my $taxName = "Unknown"; # if no taxonomy name obtained then assign to "Unknown"
		if ( defined $taxInfo[$rank] and length $taxInfo[$rank] ) {
			$taxName = $taxInfo[$rank];
		}

		# sum count numbers for specific rank
		for my $i (0..$#counts) { # for each sample
			next unless $counts[$i]; # missing, empty and zero counts create no entry
			$match{$rank}{$taxName}{$i} += $counts[$i]; # += creates the entry on first use
		}
	}
}
close (In);

my $head = join("\t", @sampleList);
# Write one tab-separated summary file per rank index 2..8, named
# <prefix>_<rankName><suffix>. Each file has a header line (rank name followed
# by the sample names) and one line per taxonomy name, sorted as strings.
# A sample with no summed count for a taxonomy name gets an empty cell.
# Dies if an output file cannot be opened or closed.
foreach my $rank (2..8) { # for each rank
	my $rankName = $classification[$rank];
	my $outputFile = $prefix."_".$rankName.$suffix;
	open(my $out, ">", $outputFile) or die "Can not write $outputFile: $!";
	print "   === write summarization file - $outputFile\n";
	# write summary head for each rank level
	print {$out} $rankName."\t".$head."\n";

	# write summary for each rank level
	my $countsFor = $match{$rank} || {}; # no entry for this rank means nothing was counted
	foreach my $taxName (sort keys %{$countsFor}) {
		my @cells = map {
			exists $countsFor->{$taxName}{$_} ? $countsFor->{$taxName}{$_} : ""
		} 0..($size - $j - 1);
		print {$out} join("\t", $taxName, @cells), "\n";
	}
	# buffered write errors (e.g. disk full) only show up at close
	close($out) or die "Can not close $outputFile: $!";
}
