#!/usr/bin/perl -w

#===============================================================================
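# ARGV[0]:	feature file (*_anno.feature) annotated with both description
# 			information and taxonomy information
# ARGV[1]:	optional divide option, --with-rtRNA or --without-rtRNA
# 			(default: --without-rtRNA, which collects rRNA/rtRNA matches in
# 			their own rtRNA file)
# Output:	one divided feature file per category, named
# 			<prefix>_<category>_anno.feature (categories: Vec, rtRNA, human,
# 			bacteria, fungi, virus, plant, mammal, chordata, bug, worm, other)
# Usage:	divideFeature.pl all_sample_anno.feature
#			divideFeature.pl all_sample_anno.feature --with-rtRNA
#			divideFeature.pl all_sample_anno.feature --without-rtRNA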
#===============================================================================

use strict;
use warnings;

# ---- command-line arguments and output naming --------------------------------
# $ARGV[0] is the annotated feature file, $ARGV[1] the optional divide option.
my $feature = $ARGV[0];
# When no (true) option is given, rRNA/rtRNA matches are separated out by default.
my $divideOption = $ARGV[1] ? $ARGV[1] : "--without-rtRNA";

# Output files are named <prefix>_<category><suffix>; the prefix is the input
# name with the trailing suffix removed.
my $suffix = "_anno.feature";
my $prefix = defined $feature ? $feature : "";	# avoid an undef warning when no argument was given
$prefix =~ s/\Q$suffix\E$//;	# \Q..\E: the "." of the suffix must match a literal dot only

my $fileName;			# name of the output file currently being opened
my @fileHandle = ();	# output handles, pushed in @divide order
my %match = ();			# NOTE(review): not referenced anywhere in the visible script
my %matchTotal = ();	# NOTE(review): not referenced anywhere in the visible script

# a feature file must be named on the command line
if ( !defined $feature ) {
	die "Usage: divideFeature.pl <sample>_anno.feature [--with-rtRNA|--without-rtRNA]\n";
}
# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}
# test if input feature file has annotation information (both description and taxonomy information).
# The dot is escaped and the pattern is anchored at the end, so only names that
# really end in "_anno.feature" pass -- the same suffix that is stripped from
# the name to build the output prefix.
if ( $feature !~ /_anno\.feature$/ ) {
	print "   This is NOT a feature file (*_anno.feature) with annotation information!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit 1;	# non-zero status so calling scripts can detect the failure
}

# Category names used in the output file names.  The position of a name in this
# list is also the index of its output handle in @fileHandle, so the order of
# the entries must not be changed.
my @divide = (
	"Vec",			#  0
	"rtRNA",		#  1
	"human",		#  2
	"bacteria",		#  3
	"fungi",		#  4
	"virus",		#  5
	"plant",		#  6
	"mammal",		#  7
	"chordata",		#  8
	"bug",			#  9
	"worm",			# 10
	"other",		# 11
);

# get input feature file
open(In, "<", $feature) or die "Can not open $feature !!! ($!)\n";
# open one divided feature file per category, in @divide order, so that
# $fileHandle[$n] always belongs to $divide[$n]
foreach my $name (@divide) {
	$fileName = $prefix."_".$name.$suffix;
	# stop right away if an output file cannot be created; otherwise the later
	# prints to this handle would fail and the data would be lost
	open(my $out, ">", $fileName) or die "Can not open $fileName for writing !!! ($!)\n";
	push @fileHandle, $out;
	print $fileName."\n";	# report the name of every output file on STDOUT
}

# read the head line; its first four columns are taken as match type, match ID,
# taxonomy and description, and every remaining column as one sample
my $firstLine = <In>;
my $headLine = defined $firstLine ? $firstLine : "";	# an empty input file has no head line
chomp($headLine);
my @array = split(/\t/, $headLine);
my $matchType = shift @array;
my $matchID = shift @array;
my $taxonomy = shift @array;
my $description = shift @array;
# The head line must provide all four annotation columns.
# NOTE(review): the previous test compared every field with itself and could
# never fail; if the head line is expected to carry specific column names,
# compare against those literal names here -- confirm them with the output of
# desFeature.pl / taxFeature.pl.
if ( !(defined $matchType and defined $matchID and defined $taxonomy and defined $description) ) {
	print "   This a feature file (*_anno.feature) with wrong annotation format!!!\n";
	print "   Please use desFeature.pl and taxFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit 1;	# non-zero status so calling scripts can detect the failure
}

# add head line to all the divide feature files
foreach my $file (@fileHandle) {
	print $file $firstLine;
}
chomp($firstLine);
my @sampleOrder = @array;
my $sampleSize = scalar(@sampleOrder);

# Map every category name to its position in @divide, which is also the index of
# its output handle in @fileHandle; this replaces hard-coded index numbers.
my %slotOf;
@slotOf{@divide} = (0 .. $#divide);

# divide features based on keywords appearing in matchType and taxonomy
while ( my $line = <In> ) {
	chomp($line);
	my ($matchType, $matchID, $taxonomy, $description) = split(/\t/, $line);
	# short or blank rows leave these fields undefined; treat them as empty so
	# the tests below run without warnings and the row ends up in "other"
	$matchType = "" if !defined $matchType;
	$taxonomy = "" if !defined $taxonomy;

	# only count forward (+) strand alignments for RNA type refDB:
	# skip RNA match types whose name ends in "-"
	next if ( ($matchType =~ /RNA/) and ($matchType =~ /\-$/) );

	# The tests are tried from top to bottom and the first one that matches wins.
	my $category;
	if ( $matchType =~ /nt_Vec/ ) {
		# filter out all vectors (e.g., cloning vector, expression vector, and transfer vector, etc.)
		$category = "Vec";
	} elsif ( ($matchType =~ /rRNA|rtRNA/) and ($divideOption eq "--without-rtRNA") ) {
		# filter out all ribosomal RNA (rRNA), transfer RNA (tRNA), and mitochondrial RNA
		# (disabled alternative that also looked at the description column:
		#  (($matchType =~ /rRNA|rtRNA/) or ($description =~ /gbkey=tRNA$|gbkey=rRNA$/)) and ($divideOption eq "--without-rtRNA") )
		$category = "rtRNA";
	} elsif ( ($matchType =~ /human/) or ($taxonomy =~ /\;Homo sapiens/) ) {
		$category = "human";
	} elsif ( ($matchType =~ /bacteria|archaea|microbiome/) or ($taxonomy =~ /^Bacteria\;|Archaea\;/) ) {
		# NOTE(review): "^" anchors only the Bacteria alternative; "Archaea;" is
		# accepted anywhere in the taxonomy string -- confirm this is intended
		$category = "bacteria";
	} elsif ( ($matchType =~ /fungi/) or ($taxonomy =~ /^Eukaryota;Fungi;/) ) {
		$category = "fungi";
	} elsif ( ($matchType =~ /virus/) or ($taxonomy =~ /Viruses\;|Viroids\;/) ) {
		$category = "virus";
	} elsif ( ($matchType =~ /plant/) or ($taxonomy =~ /Viridiplantae\;/) ) {
		$category = "plant";
	} elsif ( ($matchType =~ /nt_mouse|nt_mammal/) or ($taxonomy =~ /Mammalia\;/) ) {
		$category = "mammal";
	} elsif ( ($matchType =~ /nt_chordata/) or ($taxonomy =~ /Chordata\;/) ) {
		$category = "chordata";
	} elsif ( ($matchType =~ /nt_bug/) or ($taxonomy =~ /Arthropoda\;/) ) {
		$category = "bug";
	} elsif ( ($matchType =~ /nt_worm/) or ($taxonomy =~ /Nematoda\;/) ) {
		$category = "worm";
	} else {
		$category = "other";
	}

	# write the unchanged row to the file of the chosen category
	my $file = $fileHandle[ $slotOf{$category} ];
	print $file $line."\n";
}

close(In);
# Buffered write errors (e.g. a full disk) are only reported when the handle is
# closed, so every output close is checked.  $fileHandle[$n] belongs to
# $divide[$n], which lets the message name the affected file.
foreach my $n (0 .. $#fileHandle) {
	close($fileHandle[$n])
		or die "Can not close ".$prefix."_".$divide[$n].$suffix." !!! ($!)\n";
}
