#!/usr/bin/perl -w

#===============================================================================
# ARGV[0]: FASTA file downloaded or divided from the NT database (e.g. nt or nt_animal.fa)
# Divides the NT database according to the taxonomy information in gi_taxid_nucl.dmp
# and taxonomyAnno (built by buildTaxonomy.pl).
# The nucleotide sequence database (NT) comes from blast/NCBI and is NOT non-redundant:
# ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/
#===============================================================================

use strict;
use warnings;

# for alpha test!!!
# STDOUT is unbuffered so that the "\b"-based progress counters printed while
# reading show up immediately.
use IO::Handle;
STDOUT->autoflush(1);

# Input FASTA file (first command-line argument).
my $refDBFile = $ARGV[0];
if ( !defined($refDBFile) or $refDBFile eq "" ) {
	# fail early with a clear message instead of an "uninitialized value" warning later
	die "usage: $0 <NT FASTA file>\n";
}
my %gi_taxid = ();
my %seqBatch = ();		# gi => header line + sequence lines of the current batch
my %taxInfo = ();		# taxid => annotation string (3rd column of taxonomyAnno)
my $fileName;
my @fileHandle = ();	# output handles, filled in the same order as @divide
my $totalN;
my $comString;
my $memUsage;

# get taxonomy index and information
open(In, "<", "taxonomyAnno") or die "can't open file: taxonomyAnno ($!)\n";
print "Loading taxonomy file (taxonomyAnno)...\n";
$totalN = `cat taxonomyAnno | wc -l`;
print "Total taxid: $totalN"; # having "\n" already
my $firstline = <In>;	# the first line is read and not stored in %taxInfo
while (my $line = <In>) {
	chomp($line);
	my @array = split(/\t/, $line);
	# a usable line has at least three tab-separated columns and a non-empty
	# taxid; skip anything else rather than storing undef keys or values
	next if @array < 3;
	next if $array[0] eq "";
	$taxInfo{ $array[0] } = $array[2];
}
close(In);

# get sequence with head (sequence ID and annotation) from NT database
open(In, "<", $refDBFile) or die "can't open file: $refDBFile ($!)\n";
print "Loading sequence file ($refDBFile)...\n";
# Count the FASTA header lines with grep, started through a list-form pipe open:
# the file name is passed as a plain argument and never goes through a shell,
# so names containing spaces or shell metacharacters are handled safely.
$totalN = "";
if ( open(my $grep, "-|", "grep", "-c", "-e", "^>", "--", $refDBFile) ) {
	my $count = <$grep>;
	$totalN = $count if defined($count);
	close($grep);	# grep exits non-zero when the count is 0; not an error here
}
# the print below relies on the value ending with a newline
$totalN = "unknown\n" if $totalN !~ /\n\z/;
print "Total sequences: $totalN"; # having "\n" already
# from here on $totalN holds the line count of the gi-taxid mapping file,
# which taxMultiplexer prints on every call
$totalN = `cat gi_taxid_nucl.dmp | wc -l`;

# Names of the divisions. The position of each name is the index that
# taxMultiplexer uses to pick the matching output handle from @fileHandle.
my @divide = qw(
	virus
	bacteria
	fungi
	plant
	mouse
	human
	mammal
	chordata
	arthropod
	nematode
	other
);

# Output files are named <prefix>_<divide><suffix>, where <prefix> is the input
# file name without a trailing ".fa".
my $prefix = $refDBFile;
$prefix =~ s/\.fa$//;	# escaped dot: only a literal ".fa" extension is removed
my $suffix = ".fa";


# open a list of divide FASTA files
foreach my $name (@divide) {
	$fileName = $prefix."_".$name.$suffix;
	# stop immediately if an output file cannot be created; otherwise every
	# later print to this handle would fail without the data being written
	open(my $out, ">", $fileName) or die "can't open file for writing: $fileName ($!)\n";
	push @fileHandle, $out;
}

						# for alpha test!!!
						my $i = 0;
						print "\n"."Reading sequence #";

# Read the FASTA file record by record and collect the records in %seqBatch
# (keyed by gi). Every 10000 stored records the memory usage is checked; the
# first time it exceeds 60% the batch is written out through taxMultiplexer and
# its size (in 10000-record chunks) is remembered in $limit. Every later batch
# is written out as soon as it reaches that same size.
my $c1 = 1;		# records stored since the last memory check
my $c2 = 1;		# memory checks (10000-record chunks) in the current batch
my $limit = 0;	# chunks per batch; 0 until memory usage first exceeds 60%
my $seqHead = "";
my $gi = 0;
my $seq = "";
while (my $line = <In>) {
	if ( $line =~ /^>/ ) { # read sequence head

						# for alpha test!!!
						$i++;
						print $i;

		my $seqHead_next = $line;
		chomp($line);
		my @array = split(/\|/, $line);
		# the gi is the second "|"-separated field of the header; use an empty
		# key (which matches nothing in the mapping) when the header has none
		my $gi_next = defined($array[1]) ? $array[1] : "";
		if ( $seq ne "" ) { # if sequence is NOT empty
			$seqBatch{$gi} = $seqHead.$seq;
			if ($c1 >= 10000) { # check memory usage after reading every 10000 sequences!!!
				# check memory usage
				$memUsage = _memUsageFraction();
				my $string = sprintf("%2.1f%%", 100 * $memUsage);
				print "    Memory usage: ".$string;

				# if memory usage exceeds 60%, then output seqBatch and empty it;
				# once $limit is known, flush when the batch has reached that
				# many chunks (">=", so later batches are not one chunk larger
				# than the batch that hit the threshold)
				if ( (($memUsage > 0.60) and ($limit <= 0)) or (($limit > 0) and ($c2 >= $limit)) ) {
					if ( $limit <= 0 ) {
						$limit = $c2;
					}

					# output seqBatch and empty it
					taxMultiplexer();
					undef %seqBatch;
					%seqBatch = ();
					$c2 = 0;

					print "\n\n"."Reading sequence #".$i;
					print "    Memory usage: ".$string;

				}
				$c1 = 0;
				$c2++;
				# move the cursor back over "    Memory usage: " (18 characters)
				# and the percentage so the next counter overwrites them
				print "\b" x (18 + length($string));
			}
			$c1++;
		}
		$seq = "";
		$gi = $gi_next;
		$seqHead = $seqHead_next;

						# alpha test!!!
						print "\b" x length($i);

	} else { # read sequence
		$seq = $seq.$line;
	}
}
close(In);
# get the last seq
if ( $seq ne "" ) { # if sequence is NOT empty
	$seqBatch{$gi} = $seqHead.$seq;
}
# output the last seqBatch and empty it
taxMultiplexer();
undef %seqBatch;
%seqBatch = ();

# Helper for the reading loop above: returns the fraction (0..1) of physical
# memory in use, not counting buffers and page cache, computed from
# /proc/meminfo as (MemTotal - MemFree - Buffers - Cached) / MemTotal.
# Reading /proc/meminfo directly avoids depending on the output layout of the
# `free` command. Dies when the file cannot be read or holds no usable MemTotal,
# because without a measurement the batch size could never be bounded.
sub _memUsageFraction {
	open(my $mem, "<", "/proc/meminfo") or die "can't open file: /proc/meminfo ($!)\n";
	my %kb = ( MemTotal => 0, MemFree => 0, Buffers => 0, Cached => 0 );
	while (my $entry = <$mem>) {
		if ( $entry =~ /^(MemTotal|MemFree|Buffers|Cached):\s+(\d+)/ ) {
			$kb{$1} = $2;
		}
	}
	close($mem);
	if ( $kb{MemTotal} <= 0 ) {
		die "can't read MemTotal from /proc/meminfo\n";
	}
	my $used = $kb{MemTotal} - $kb{MemFree} - $kb{Buffers} - $kb{Cached};
	$used = 0 if $used < 0;
	return $used / $kb{MemTotal};
}

# list the names of the divided FASTA files that were written
print "\n\n=== write divided FASTA files:\n";
foreach my $name(@divide) {
	print $prefix."_".$name.$suffix."\n";
}
print "\n";

# Close every output handle. Buffered write errors (e.g. a full disk) only
# surface at close, so collect and report them instead of silently leaving
# truncated files behind. All handles are closed before the script dies.
my @closeFailed = ();
foreach my $n (0 .. $#fileHandle) {
	my $label = defined($divide[$n]) ? $prefix."_".$divide[$n].$suffix : "output #$n";
	close($fileHandle[$n]) or push @closeFailed, "$label ($!)";
}
if (@closeFailed) {
	die "can't close file(s): ".join(", ", @closeFailed)."\n";
}

#================================ sub functions ================================
# Map gi to taxid and use the taxonomy annotation to decide which divide file
# each record of the current batch is written to.
#
# Takes no arguments; works on file-level variables:
#   %seqBatch   (read)  gi => FASTA record (header + sequence) of the current batch
#   %taxInfo    (read)  taxid => taxonomy annotation string
#   @fileHandle (read)  output handles, indexed in the order of @divide
#   $totalN     (read)  printed as the number of gi_taxid pairs
# The whole gi_taxid_nucl.dmp file is scanned on every call. A record is written
# only when its gi is listed there and the listed taxid is a key of %taxInfo.
# Dies when the mapping file cannot be opened or an output file cannot be
# written.
sub taxMultiplexer {

	# Ordered rules: the first pattern that matches the annotation selects the
	# index into @fileHandle. The order matters (e.g. mouse and human are tested
	# before the more general mammal and chordata patterns).
	my @rule = (
		[ qr/Viruses;|Viroids;/,   0 ],	# viruses and viroids
		[ qr/Bacteria;|Archaea;/,  1 ],	# bacteria and archaea
		[ qr/Fungi;/,              2 ],	# fungi, including mushrooms
		[ qr/Viridiplantae;/,      3 ],	# plants
		[ qr/Mus;/,                4 ],	# mouse
		[ qr/Homo;/,               5 ],	# human
		[ qr/Mammalia;/,           6 ],	# other mammals
		[ qr/Chordata;/,           7 ],	# other chordata (fishes, frogs, lizards and birds etc.)
		[ qr/Arthropoda;/,         8 ],	# bugs (insects and spiders etc.)
		[ qr/Nematoda;/,           9 ],	# worms (e.g., C. elegans)
	);
	my $otherIndex = 10;				# others

	# get gi-taxid mapping
	open(my $giTax, "<", "gi_taxid_nucl.dmp") or die "can't open file: gi_taxid_nucl.dmp ($!)\n";
	print "\n"."Loading gi-taxid mapping file (gi_taxid_nucl.dmp)...";
	print "\n"."Total gi_taxid pairs: $totalN"; # having "\n" already

						# for alpha test!!!
						my $j = 0;
						print "Reading gi_taxid #";

	while (my $line = <$giTax>) {

						# for alpha test!!!
						$j++;
						print $j;

		chomp($line);
		my @array = split(/\t/, $line);
		my $gi = $array[0];
		my $taxid = $array[1];
		# lines without both columns cannot name a record; skip the lookup
		if ( defined($gi) and defined($taxid)
				and exists($seqBatch{$gi}) and exists($taxInfo{$taxid}) ) {
			my $info = defined($taxInfo{$taxid}) ? $taxInfo{$taxid} : "";
			my $i = $otherIndex;
			foreach my $r (@rule) {
				if ( $info =~ $r->[0] ) {
					$i = $r->[1];
					last;
				}
			}
			my $file = $fileHandle[$i];
			# a failed write (e.g. full disk) would otherwise go unnoticed
			print {$file} $seqBatch{$gi}
				or die "can't write to divide file #$i ($!)\n";
		}

						# alpha test!!!
						print "\b" x length($j);
	}
	close($giTax);
}
