#!/usr/bin/perl -w

#===============================================================================
# summarize molecular types from annotated features mainly designed for human
# ARGV[0]:	feature file (*_anno.feature) with both description information
# 			and taxonomy information annotated
# Output:	summary file (*.sum) of read counts on each molecular type for all samples
# Usage:	sumMolTypes.pl all_sample_human_anno.feature
#===============================================================================

use strict;
use warnings;

# Input feature file (first command-line argument) and the state shared by the
# rest of the script:
#   $prefix      input name with the trailing "_anno.feature" removed
#   $fileName    name of the summary file, filled in when it is written
#   %match       read counts, keyed by molecular-type index then sample index
#   %matchTotal  read counts over all molecular types, keyed by sample index
my $feature = $ARGV[0];
my $suffix = "_anno.feature";

# a missing argument would otherwise only show up as "uninitialized value" warnings
if ( !defined $feature or $feature eq "" ) {
	die "   Usage: $0 <name>_anno.feature\n";
}

# \Q...\E keeps the "." of the suffix literal instead of "any character"
my $prefix = $feature;
$prefix =~ s/\Q$suffix\E$//;
my $fileName;
my %match = ();
my %matchTotal = ();

# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}
# test if input feature file has annotation information (both description and taxonomy information);
# anchored at the end so that it accepts exactly the names whose suffix is stripped above
if ( $feature !~ /\Q$suffix\E$/ ) {
	print "   This is NOT a feature file (*_anno.feature) with annotation information!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	# non-zero status so that a calling pipeline can see the failure
	exit 1;
}

# count readN for each type of feature based on both refName and feature description
# Row labels of the summary, in output order. The position of a label is the
# index the classification step assigns to a feature, so the order must not change.
my @divide = (
	"miRNA",						#  0
	"piRNA",						#  1
	"lncRNA",						#  2
	"repSeq",						#  3
	"mRNA",							#  4
	"pseudogene",					#  5
	"snRNA",						#  6
	"snoRNA",						#  7
	"rRNA",							#  8
	"tRNA",							#  9
	"Mt_rRNA",						# 10
	"Mt_tRNA",						# 11
	"mtDNA",						# 12
	"Y_RNA",						# 13
	"vault_RNA",					# 14
	"SRP_RNA",						# 15
	"RNase_P_RNA",					# 16
	"RNase_MRP_RNA",				# 17
	"telomerase_RNA",				# 18
	"ncRNA",						# 19
	"sense_overlapping",			# 20
	"3prime_overlapping_ncrna",		# 21
	"misc_RNA",						# 22
	"antisense",					# 23
	"sense_intronic",				# 24
	"retained_intron",				# 25
	"guide_RNA",					# 26
	"processed_transcript",			# 27
	"CDS",							# 28
	"DNA",							# 29
	"other",						# 30
);

# get input feature file
open(In, "<", $feature) or die "Can not open $feature !!! ($!)\n";

# read the head line: four annotation columns followed by one column per sample
my $firstLine = <In>;
if ( !defined $firstLine ) {
	die "   $feature is empty !!!\n";
}
# strip the line ending, including the CR of a CRLF file, so that the last
# sample name stays clean
$firstLine =~ s/\r?\n$//;
my @array = split(/\t/, $firstLine);
my $matchType = shift @array;
my $matchID = shift @array;
my $taxonomy = shift @array;
my $description = shift @array;
# The head line must provide all four annotation columns, and its first column
# must be the matchType label (the same test the main loop uses to recognize
# head lines). The previous test compared every variable with itself and could
# therefore never fail.
if ( !( defined $description and $matchType =~ /matchType/ ) ) {
	print "   This a feature file (*_anno.feature) with wrong annotation format!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	# non-zero status so that a calling pipeline can see the failure
	exit 1;
}
# everything after the four annotation columns is a sample name
my @sampleOrder = @array;
my $sampleSize = scalar(@sampleOrder);

# Ordered classification rules: [ molecular-type index, refName pattern, description pattern ].
# A rule applies when its refName pattern matches the matchType column or its
# description pattern matches the description column (undef = not tested).
# The first applicable rule wins, so the order of the rules matters.
my @molTypeRules = (
	[  0, qr/miRNA/,         qr/miRNA|microRNA$/ ],											# miRNA
	[  1, qr/piRNA/,         qr/piwiRNA/ ],													# piRNA
	[  2, qr/lncRNA/,        qr/:lincRNA|\, long non-coding RNA$/ ],							# lncRNA
	[  3, qr/repSeq|subSeq/, undef ],														# repSeq
	[  4, undef,             qr/mRNA$/ ],													# mRNA
	[  5, undef,             qr/\:pseudogene$/ ],											# pseudogene
	[  6, undef,             qr/\:snRNA|\, small nuclear RNA$/ ],							# snRNA
	[  7, qr/snoRNA/,        qr/\:snoRNA|\, small nucleolar RNA$/ ],							# snoRNA
	[  8, undef,             qr/\:rRNA|\, ribosomal RNA$/ ],									# rRNA
	# Mt_tRNA has to be tested before tRNA: every "mitochondrial tRNA-..." description
	# also contains "tRNA-" and would otherwise always be counted as plain tRNA
	[ 11, undef,             qr/\:Mt_tRNA$|mitochondrial tRNA\-|mitochondrial transfer RNA/ ],	# Mt_tRNA
	[  9, undef,             qr/\:tRNA|tRNA\-|\, transfer RNA$/ ],							# tRNA
	[ 10, undef,             qr/\:Mt_rRNA$|mitochondrial rRNA\,|mitochondrial ribosomal RNA/ ],	# Mt_rRNA
	[ 12, undef,             qr/mtDNA|mitochondrial DNA/ ],									# mtDNA
	[ 13, undef,             qr/\, Y RNA$/ ],												# Y_RNA
	[ 14, undef,             qr/\, vault RNA$/ ],											# vault_RNA
	[ 15, undef,             qr/\, SRP RNA$/ ],												# SRP_RNA
	[ 16, undef,             qr/\, RNase P RNA$/ ],											# RNase_P_RNA
	[ 17, undef,             qr/\, RNase MRP RNA$/ ],										# RNase_MRP_RNA
	[ 18, undef,             qr/\, telomerase RNA$/ ],										# telomerase_RNA
	[ 19, undef,             qr/\, ncRNA|\, partial ncRNA|\, non-coding RNA$/ ],				# ncRNA
	[ 20, undef,             qr/\:sense_overlapping$/ ],										# sense_overlapping
	[ 21, undef,             qr/\:3prime_overlapping_ncrna$/ ],								# 3prime_overlapping_ncrna
	[ 22, undef,             qr/misc_RNA$/ ],												# misc_RNA
	[ 23, undef,             qr/:antisense|\, antisense RNA$/ ],								# antisense
	[ 24, undef,             qr/:sense_intronic$/ ],											# sense_intronic
	[ 25, undef,             qr/:retained_intron$/ ],										# retained_intron
	[ 26, undef,             qr/\, guide RNA$/ ],											# guide_RNA
	[ 27, undef,             qr/:processed_transcript$/ ],									# processed_transcript
	[ 28, qr/CDS/,           undef ],														# CDS
	[ 29, qr/DNA/,           undef ],														# DNA
);

# Return the molecular-type index for one feature.
#   $refType   value of the matchType column
#   $refDesc   value of the description column (may be undef on a short line)
# Returns the index of the first matching rule, or 30 ("other") if none matches.
sub molTypeIndex {
	my ($refType, $refDesc) = @_;
	$refType = "" if !defined $refType;
	$refDesc = "" if !defined $refDesc;
	foreach my $rule (@molTypeRules) {
		my ($index, $typeRe, $descRe) = @$rule;
		return $index if defined $typeRe and $refType =~ $typeRe;
		return $index if defined $descRe and $refDesc =~ $descRe;
	}
	return 30;																				# other
}

while (my $row = <In>) {
	# strip the line ending, including the CR of a CRLF file
	$row =~ s/\r?\n$//;
	# blank lines carry no feature
	next if $row eq "";

	# limit -1 keeps trailing empty count fields, which are counted as 0 below
	my @fields = split(/\t/, $row, -1);
	# the first four columns are matchType, matchID, taxonomy and description;
	# the remaining fields are the per-sample read counts
	my ($rowType, $rowID, $rowTaxonomy, $rowDesc) = splice(@fields, 0, 4);

	# ignore head lines
	next if $rowType =~ /matchType/;

	# only count forward (+) strand alignments for RNA type refDB
	next if ($rowType =~ /RNA/) and ($rowType =~ /\-$/);

	my $i = molTypeIndex($rowType, $rowDesc);

	# summarize read counts on each divide (domain or field) for all samples
	my $j = 0;
	foreach my $num (@fields) {
		$num = 0 if $num eq "";
		$match{$i}{$j} += $num;
		$matchTotal{$j} += $num;
		$j++;
	}
}
close(In);

# output summary file (*.sum) of read counts on each divide for all samples:
# one head line, one line per molecular type in @divide order, one "Total" line
$fileName = $prefix."_moltype.sum";
# fail loudly instead of printing to an unopened handle
open(my $out, ">", $fileName) or die "   Can not write $fileName !!! ($!)\n";
print $fileName."\n";
# add head line to domain summary file
print {$out} "molType\t".join("\t", @sampleOrder)."\n";
foreach my $typeIndex (0 .. $#divide) {
	my @cells = ($divide[$typeIndex]);
	foreach my $j (0 .. ($sampleSize - 1)) {
		# a type/sample pair that never occurred is reported as 0
		my $count = $match{$typeIndex}{$j};
		push @cells, (defined $count ? $count : 0);
	}
	print {$out} join("\t", @cells)."\n";
}
# a sample without any counted row has no total yet; report it as 0
my @totals = map { defined $matchTotal{$_} ? $matchTotal{$_} : 0 } 0 .. ($sampleSize - 1);
print {$out} join("\t", "Total", @totals)."\n";
# buffered write errors only surface when the handle is closed
close($out) or die "   Can not write $fileName !!! ($!)\n";
