#!/usr/bin/perl -w

#===============================================================================
# summarize all domains from annotated features
# ARGV[0]:	feature file (*_anno.feature) with both description information
# 			and taxonomy information annotated
# Output:	summary file (*.sum) of read counts on each domain for all samples
# Usage:	sumDomain.pl all_sample_anno.feature
#===============================================================================

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Command-line handling and input validation.
#   $feature    : path of the annotated feature file given as first argument
#   $suffix     : file-name suffix an annotated feature file must end with
#   $prefix     : $feature with the trailing $suffix removed; used to build
#                 the name of the output file
#   $fileName   : name of the output file (filled in when it is written)
#   %match      : read counts, keyed by domain index then sample index
#   %matchTotal : read counts summed over all domains, keyed by sample index
# ---------------------------------------------------------------------------
my $feature = $ARGV[0];
my $suffix = "_anno.feature";
my $fileName;
my %match = ();
my %matchTotal = ();

# a missing argument would otherwise reach -e and string interpolation as undef
if ( !defined $feature or $feature eq "" ) {
	die "   Usage: sumDomain.pl all_sample_anno.feature\n";
}
# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}
# test if input feature file has annotation information (both description and taxonomy information)
# The suffix is matched literally (\Q..\E) and at the very end of the name, so
# this test accepts exactly the names from which the prefix can be derived.
if ( $feature !~ /\Q$suffix\E\z/ ) {
	print "   This is NOT a feature file (*_anno.feature) with annotation information!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit 1;
}

# strip the suffix to obtain the prefix shared with the output file name
my $prefix = $feature;
$prefix =~ s/\Q$suffix\E\z//;

# Domain labels, in the order they are written to the summary file.
# The position of a label in this list is the numeric domain index under
# which its read counts are stored.
my @divide = (
	"human",
	"bacteria",
	"fungi",
	"virus",
	"plant",
	"mammal",
	"chordata",
	"bug",
	"worm",
	"other",
);

# get input feature file
open(In, "<", $feature) or die "Can not open $feature !!! ($!)";

# Read and validate the head line. Its first four columns are the annotation
# columns; every remaining column is the name of one sample.
my $firstLine = <In>;
# an empty file yields undef; treat it as an empty (and therefore invalid) head line
$firstLine = "" if !defined $firstLine;
chomp($firstLine);
my @array = split(/\t/, $firstLine);
my $matchType = shift @array;
my $matchID = shift @array;
my $taxonomy = shift @array;
my $description = shift @array;

# The original test compared every variable with itself and so could never
# fail. Compare against the literal column names instead.
# NOTE(review): the expected names are taken from the identifiers used in the
# original test -- confirm against the head line written by desFeature.pl and
# taxFeature.pl.
my @headerFound = ($matchType, $matchID, $taxonomy, $description);
my @headerExpected = ("matchType", "matchID", "taxonomy", "description");
my $headerOK = 1;
foreach my $col (0 .. $#headerExpected) {
	if ( !defined $headerFound[$col] or $headerFound[$col] ne $headerExpected[$col] ) {
		$headerOK = 0;
		last;
	}
}
if ( !$headerOK ) {
	print "   This a feature file (*_anno.feature) with wrong annotation format!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit 1;
}
my @sampleOrder = @array;
my $sampleSize = scalar(@sampleOrder);

# Classification rules, tried in this order. Each rule is
#   [ domain index, pattern for matchType, pattern for taxonomy ]
# and the first rule with either pattern matching decides the domain index
# (the position of the domain in @divide).
my @domainRules = (
	[ 0, qr/human/,                       qr/\;Homo sapiens/ ],			# human
	[ 1, qr/bacteria|archaea|microbiome/, qr/^Bacteria\;|Archaea\;/ ],	# bacteria
	[ 2, qr/fungi/,                       qr/^Eukaryota;Fungi;/ ],		# fungi
	[ 3, qr/virus/,                       qr/Viruses\;|Viroids\;/ ],		# virus
	[ 4, qr/plant/,                       qr/Viridiplantae\;/ ],			# plant
	[ 5, qr/nt_mouse|nt_mammal/,          qr/Mammalia\;/ ],				# mammal
	[ 6, qr/nt_chordata/,                 qr/Chordata\;/ ],				# chordata
	[ 7, qr/nt_bug/,                      qr/Arthropoda\;/ ],			# bug
	[ 8, qr/nt_worm/,                     qr/Nematoda\;/ ],				# worm
);

# Return the domain index (0..9) for one feature.
#   $type    : the matchType column of the feature
#   $lineage : the taxonomy column of the feature (may be undef on short rows)
# Vectors (matchType containing "nt_Vec", e.g. cloning, expression and transfer
# vectors) are never assigned to an organism domain: they are counted as
# "other" (index 9), as is anything no rule matches.
sub classifyDomain {
	my ($type, $lineage) = @_;
	$type = "" if !defined $type;
	$lineage = "" if !defined $lineage;

	return 9 if $type =~ /nt_Vec/;
	foreach my $rule (@domainRules) {
		my ($index, $typePattern, $lineagePattern) = @$rule;
		if ( ($type =~ $typePattern) or ($lineage =~ $lineagePattern) ) {
			return $index;
		}
	}
	return 9;
}

# Read every feature row and add its per-sample read counts to the counts of
# its domain (%match) and to the per-sample totals (%matchTotal).
while ( my $row = <In> ) {
	chomp $row;
	@array = split(/\t/, $row);
	$matchType = shift @array;
	$matchID = shift @array;
	$taxonomy = shift @array;
	$description = shift @array;

	# a blank row has no columns at all, so there is nothing to count
	next if !defined $matchType;

	# only count forward (+) strand alignments for RNA type refDB
	if ( ($matchType =~ /RNA/) and ($matchType =~ /\-$/) ) {
		next;
	}

	my $i = classifyDomain($matchType, $taxonomy);

	# summarize read counts on each divide (domain or field) for all samples
	foreach my $j (0 .. $#array) {
		my $num = $array[$j];
		# an empty cell counts as zero reads
		$num = 0 if $num eq "";
		$match{$i}{$j} = exists $match{$i}{$j} ? $match{$i}{$j} + $num : $num;
		$matchTotal{$j} = exists $matchTotal{$j} ? $matchTotal{$j} + $num : $num;
	}
}
close(In);

# output summary file (*.sum) of read counts on each divide for all samples
# The file has one head line ("domain" followed by the sample names), one row
# per domain in @divide order, and a final "Total" row.
$fileName = $prefix."_domain.sum";
# fail loudly instead of silently printing to an unopened handle
open(my $out, ">", $fileName) or die "Can not write $fileName !!! ($!)\n";
print $fileName."\n";
# add head line to domain summary file
print {$out} "domain\t".join("\t", @sampleOrder)."\n";

foreach my $domainIndex (0 .. $#divide) {
	# a domain/sample combination that was never seen has a count of 0
	my @counts = map {
		exists $match{$domainIndex}{$_} ? $match{$domainIndex}{$_} : 0
	} 0 .. ($sampleSize - 1);
	print {$out} join("\t", $divide[$domainIndex], @counts)."\n";
}

# A sample column for which no row supplied a value has no total yet; report
# 0 so the Total row agrees with the zero-filled domain rows above.
my @totals = map {
	defined $matchTotal{$_} ? $matchTotal{$_} : 0
} 0 .. ($sampleSize - 1);
print {$out} join("\t", "Total", @totals)."\n";

# buffered write errors (e.g. disk full) only surface when the handle is closed
close($out) or die "Can not finish writing $fileName !!! ($!)\n";
