#!/usr/bin/perl -w

#===============================================================================
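# retrieveFASTA.pl fastaFile Keyword
# e.g., retrieveFASTA.pl all.ncrna.frn ebola
# NOTE: only fastaFile ($ARGV[0]) is read by this script; the Keyword argument
# is ignored. Which sequences are extracted is decided by the pattern that is
# hard-coded in desMultiplexer().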
#===============================================================================

use strict;
use warnings;


# for alpha test!!!
# Unbuffered STDOUT so the in-place progress counter below is visible.
use IO::Handle;
STDOUT->autoflush(1);

# ------------------------------------------------------------------------------
# Main driver.
#   1. Count the records of the input FASTA file.
#   2. Open one output FASTA file per entry of @divide, plus warning.txt.
#   3. Stream the input record by record and hand each complete record
#      (header in $seqHead, sequence lines in $seq) to desMultiplexer().
#
# $seqHead, $seq, @fileHandle and the bareword handle Warn are deliberately
# file-scoped: desMultiplexer() reads them directly instead of taking arguments.
# ------------------------------------------------------------------------------

my $refDBFile = $ARGV[0];
( defined($refDBFile) && length($refDBFile) )
	or die "usage: $0 fastaFile\n";
my $fileName;
my @fileHandle = ();
my $totalN = 0;

# get sequence with head (sequence ID and annotation) from input database
open(my $in, "<", $refDBFile) or die "can't open file: $refDBFile ($!)\n";
print "Loading sequence file ($refDBFile)...\n";

# Count the header lines in Perl rather than through a shell pipeline: the file
# name is never handed to a shell, so spaces or metacharacters in it are safe.
while (my $line = <$in>) {
	$totalN++ if $line =~ /^>/;
}
seek($in, 0, 0) or die "can't rewind file: $refDBFile ($!)\n";
print "Total sequences: $totalN\n";

my @divide = qw(
	Serratia_liquefaciens
);

# Split the input name into prefix and extension. Only a dot inside the last
# path component counts as the start of the extension; a name without an
# extension keeps an empty suffix.
my ($prefix, $suffix) = ($refDBFile, "");
if ( $refDBFile =~ m{\A(.*)(\.[^./\\]+)\z}s ) {
	($prefix, $suffix) = ($1, $2);
}

# open a list of divide FASTA files (same order as @divide)
foreach my $name (@divide) {
	$fileName = $prefix."_".$name.$suffix;
	open(my $out, ">", $fileName) or die "can't create file: $fileName ($!)\n";
	push @fileHandle, $out;
}

# open a warning file
$fileName = "warning.txt";
open(Warn, ">", $fileName) or die "can't create file: $fileName ($!)\n";
print Warn "Warning: sequence description over 32767 characters will be truncated to 32767 characters!!!\n\n";

# for alpha test!!!
my $i = 0;
print "\n"."Reading sequence #";

my $seqHead = "";
my $seq = "";
while (my $line = <$in>) {
	if ( $line =~ /^>/ ) { # read sequence head

		# for alpha test!!! (show the running record number)
		$i++;
		print $i;

		# A new header terminates the previous record: dispatch it first.
		if ( $seq ne "" ) { # if sequence is NOT empty
			desMultiplexer();
		}
		$seq = "";
		$seqHead = $line;

		# alpha test!!! (back up so the next number overwrites this one)
		print "\b" x length($i);

	} elsif ( $seqHead ne "" ) { # read sequence
		# Lines that appear before the first header belong to no record and
		# are skipped.
		$seq .= $line;
	}
}
close($in);
# get the last seq
if ( $seq ne "" ) { # if sequence is NOT empty
	desMultiplexer();
}

print "\n\n=== write divided FASTA files:\n";
foreach my $name (@divide) {
	print $prefix."_".$name.$suffix."\n";
}
print "\n";

# Buffered write errors only surface at close, so check every write handle.
foreach my $file (@fileHandle) {
	close($file) or die "error while closing an output FASTA file ($!)\n";
}
close(Warn) or die "error while closing warning.txt ($!)\n";

#================================ sub functions ================================
# use description information to decide which divide file is output
sub desMultiplexer {
	# Routes the current FASTA record to its output file.
	#
	# Takes no arguments; it reads these file-level variables:
	#   $seqHead    - header line of the record (as read, with its line ending)
	#   $seq        - the record's sequence lines, concatenated
	#   @fileHandle - output handles; index 0 receives "Serratia liquefaciens"
	#   Warn        - handle of the warning file
	#
	# The header is split at its first blank (space or tab) into sequence ID
	# and description. Records whose description does not contain
	# "Serratia liquefaciens" are dropped. A description longer than 32767
	# characters is logged in full to Warn and written truncated.

	# Work on a copy without the line ending, so that the ending is neither
	# counted in the description length nor lost when the header is rebuilt.
	my $head = $seqHead;
	my $eol = ( $head =~ s/(\r?\n)\z// ) ? $1 : "\n";

	# Treat tabs like spaces, then split once at the first blank.
	$head =~ tr/\t/ /;
	my ($matchID, $description) = split(/ /, $head, 2);
	$matchID = "" unless defined $matchID;
	$description = "" unless defined $description;	# header without description
	$matchID =~ s/^>//;

	# remaining sequences are not written anywhere
	return unless $description =~ /Serratia liquefaciens/;

	my $i = 0;					# index of the Serratia_liquefaciens file
	my $outHead = $seqHead;		# untouched header unless truncation is needed

	# output warning for description over 32767 characters and truncate description to 32767 characters
	if ( length($description) > 32767 ) {
		print Warn $seqHead.$seq;
		$description = substr($description, 0, 32767);
		# Re-append the line ending; otherwise the header would run straight
		# into the first sequence line.
		$outHead = ">".$matchID." ".$description.$eol;
	}

	my $file = $fileHandle[$i];
	print {$file} $outHead.$seq;
	return;
}