#!/usr/bin/perl -w

#===============================================================================
# ARGV[0]: FASTA file downloaded or divided from NT database (e.g. nt or nt_bacteria.fa)
# divide NT database into ribosomal RNA (rRNA) database and non-rRNA database
# according to description information directly from sequence head in FASTA file
# divided NT databases (e.g. nt_bacteria.fa) can be generated by "divide_NT.pl"
# nucleotide sequence database (NT) is from blast/NCBI, which is Not non-redundant
# ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/
# Note:	rtRNA database here also includes transfer RNA (tRNA) and mitochondrial RNA!!!
#===============================================================================

use strict;
use warnings;

# for alpha test!!!
# The progress counter below is redrawn in place with backspaces, so STDOUT is
# switched to autoflush to make every update visible immediately.
use IO::Handle;
STDOUT->autoflush(1);

my $refDBFile = $ARGV[0];
if ( !defined $refDBFile || $refDBFile eq "" ) {
	die "usage: $0 <FASTA file from NT database>\n";
}

# output handles in the same order as @divide; read by desMultiplexer()
my @fileHandle = ();

# count the sequence heads in pure Perl; the file name is never handed to a
# shell, so names containing spaces or shell metacharacters are safe
open(my $in, "<", $refDBFile) or die "can't open file: $refDBFile: $!\n";
print "Loading sequence file ($refDBFile)...\n";
my $totalN = 0;
while (my $line = <$in>) {
	$totalN++ if $line =~ /^>/;
}
close($in);
print "Total sequences: $totalN\n";

my @divide = qw(
	rtRNA
	main
);

# strip a literal ".fa" extension only (the dot must be escaped, otherwise
# any character followed by "fa" would be removed)
my $prefix = $refDBFile;
$prefix =~ s/\.fa$//;
my $suffix = ".fa";

# open the list of divided FASTA files: <prefix>_<name>.fa
my @fileNames = map { $prefix."_".$_.$suffix } @divide;
foreach my $outName (@fileNames) {
	open(my $out, ">", $outName) or die "can't create file: $outName: $!\n";
	push @fileHandle, $out;
}

# open a warning file; the bareword handle Warn is also written by desMultiplexer()
my $warnName = $prefix.".warning";
open(Warn, ">", $warnName) or die "can't create file: $warnName: $!\n";
print Warn "Warning: sequence description over 32767 characters will be truncated to 32767 characters!!!\n\n";

# for alpha test!!!
my $seqCount = 0;
print "\n"."Reading sequence #";

# second pass: collect each record (head + sequence lines) and dispatch it.
# $seqHead and $seq are file-level on purpose: desMultiplexer() reads them.
my $seqHead = "";
my $seq = "";
open($in, "<", $refDBFile) or die "can't open file: $refDBFile: $!\n";
while (my $line = <$in>) {
	if ( $line =~ /^>/ ) { # read sequence head

		# for alpha test!!! show the running sequence number
		$seqCount++;
		print $seqCount;

		# flush the previous record; a head without any sequence line is
		# skipped, and so are lines that precede the first head
		if ( $seq ne "" && $seqHead ne "" ) {
			desMultiplexer();
		}
		$seq = "";
		$seqHead = $line; # keeps its line ending, it is written out verbatim

		# for alpha test!!! move the cursor back so the next number overwrites this one
		print "\b" x length($seqCount);

	} else { # read sequence
		$seq .= $line;
	}
}
close($in);
# get the last seq
if ( $seq ne "" && $seqHead ne "" ) {
	desMultiplexer();
}

print "\n\n=== write divided FASTA files:\n";
foreach my $outName (@fileNames) {
	print $outName."\n";
}
print "\n";

# buffered write errors (e.g. disk full) only surface at close, so check them
foreach my $idx (0 .. $#fileHandle) {
	close($fileHandle[$idx]) or die "can't close file: $fileNames[$idx]: $!\n";
}
close(Warn) or die "can't close file: $warnName: $!\n";

#================================ sub functions ================================
# use description information to decide which divide file is output
# Write the current record to one of the divided FASTA files.
#
# Takes no arguments; works on file-level state:
#   $seqHead     - head line of the record (">ID description", with line ending)
#   $seq         - sequence lines of the record
#   @fileHandle  - output handles: index 0 = rRNA/tRNA/mitochondrial, index 1 = main
#   Warn         - handle of the warning file
#
# The head is split at its first blank (tabs count as blanks) into the
# sequence ID and the description. Records whose description matches the
# rRNA/tRNA/mitochondrial pattern go to file 0, everything else to file 1.
# For file 1 only, a description longer than 32767 characters is reported
# (untruncated) to the warning file and written truncated to 32767 characters.
sub desMultiplexer {
	my $head = $seqHead;
	$head =~ s/\t/\ /g;
	my ($matchID, $description) = split(/\ /, $head, 2);
	# a head without any blank has no description; avoid undef warnings
	$matchID = "" unless defined $matchID;
	$description = "" unless defined $description;

	# ribosomal RNA, transfer RNA, and mitochondrial RNA
	my $target;
	my $outHead = $seqHead;
	if ( $description =~ /rRNA|ribosomal|\ 5S\ |\ 5\.8S\ |\ 16S\ |\ 23S\ |\ 18S\ |\ 28S\ |\ 45S\ |\ tRNA|\-tRNA|transfer\ RNA|mitochondrial/ ) {
		$target = 0;
	} else {						# remaining sequences
		$target = 1;
		# set the line ending aside so it is neither counted against the
		# limit nor cut off by the truncation (losing it would glue the
		# head and the first sequence line together in the output)
		my $eol = "";
		if ( $description =~ s/(\r?\n)\z// ) {
			$eol = $1;
		}
		# output warning for description over 32767 characters and truncate description to 32767 characters
		if ( length($description) > 32767 ) {
			print Warn $seqHead.$seq;
			$outHead = $matchID."\ ".substr($description, 0, 32767).$eol;
		}
	}

	print { $fileHandle[$target] } $outHead.$seq;
}
