#!/usr/bin/perl -w

#===============================================================================
# ARGV[0]: FASTA file downloaded or divided from NT database (e.g. nt or nt_bacteria.fa)
# ARGV[1]: maximum mismatch; sequence IDs listed in the *_nt_Vec_Processed.profile
#          files with a mismatch count <= this value are excluded from the output
# retrieve vector (Vec) sequences from NT database
# according to description information directly from sequence head in FASTA file
# divided NT databases (e.g. nt_bacteria.fa) can be generated by "divide_NT.pl"
# nucleotide sequence database (NT) is from blast/NCBI, which is NOT non-redundant
# ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/
# Note:	including cloning vector, expression vector, and transfer vector etc.
#===============================================================================

use strict;
use warnings;

# for alpha test!!!
use IO::Handle;
STDOUT->autoflush(1);

# Command line arguments:
#   ARGV[0]  FASTA file to scan (required)
#   ARGV[1]  maximum mismatch count used when loading the match profiles;
#            treated as 0 when omitted
my $refDBFile = $ARGV[0];
my $maxMismatch = $ARGV[1];
my $fileName;
my @fileHandle = ();
my $totalN;
my %matchVector = ();

# fail early with a usage message instead of "uninitialized value" warnings
unless ( defined $refDBFile and length $refDBFile ) {
	die "Usage: $0 <NT FASTA file> [max mismatch]\n";
}
$maxMismatch = 0 unless defined $maxMismatch;
unless ( $maxMismatch =~ /^\d+$/ ) {
	die "max mismatch must be a non-negative integer, got: $maxMismatch\n";
}

# get sequence with head (sequence ID and annotation) from NT database
open(In, "<", $refDBFile) or die "can't open file: $refDBFile\n";
print "Loading sequence file ($refDBFile)...\n";

# Count the FASTA headers in Perl on a separate handle, so the file name is
# never passed through a shell and the main handle "In" stays at the start
# of the file.
open(my $countFH, "<", $refDBFile) or die "can't open file: $refDBFile\n";
$totalN = 0;
while (my $line = <$countFH>) {
	$totalN++ if $line =~ /^>/;
}
close($countFH);
print "Total sequences: $totalN\n";

# categories of divided output; one output file is opened per entry
my @divide = qw(
	Vec
);

# Output files are named <prefix>_<category><suffix>. The prefix is the input
# file name without a trailing ".fa"; the dot is escaped so that only a
# literal ".fa" extension is stripped (not any character followed by "fa").
my $prefix = $refDBFile;
$prefix =~ s/\.fa$//;
my $suffix = ".fa";

# get matched vectors mapping from mature and precursor miRNAs with mismatch 0-1-2
#
# Each profile line is tab-separated: column 1 is the match type, whose part
# after ".mis_" is the mismatch count (an optional "+" or "-" is removed),
# and column 4 is the matched sequence ID. IDs whose mismatch count is
# <= $maxMismatch are recorded in %matchVector.
foreach my $profile ("mature_nt_Vec_Processed.profile", "hairpin_nt_Vec_Processed.profile") {
	my $vecFH;
	unless ( open($vecFH, "<", $profile) ) {
		# keep going as before, but say clearly why nothing was loaded
		warn "can't open profile: $profile ($!); no matches loaded from it\n";
		next;
	}
	while (my $line = <$vecFH>) {
		chomp $line;
		my @field = split(/\t/, $line);
		my ($matchType, $matchID) = @field[0, 3];
		# blank or short lines carry no usable sequence ID
		next unless defined $matchType and defined $matchID;
		my $mismatch = (split(/\.mis_/, $matchType))[1];
		# a match type without a ".mis_" part counts as 0 mismatches, which is
		# how the numeric comparison below already treated the missing value
		$mismatch = 0 unless defined $mismatch;
		$mismatch =~ s/\+|\-//;
		if ( $mismatch <= $maxMismatch ) {
			$matchVector{$matchID} = 1;
		}
	}
	close($vecFH);
}

# open a list of divide FASTA files
# (handles are stored in @fileHandle in the same order as @divide)
foreach my $name (@divide) {
	$fileName = $prefix."\_".$name.$suffix;
	# without this check a failed open would silently discard every record
	open(my $out, ">", $fileName) or die "can't open output file: $fileName ($!)\n";
	push @fileHandle, $out;
}

# open a warning file
$fileName = $prefix."\.warning";
open(Warn, ">", $fileName) or die "can't open warning file: $fileName ($!)\n";
print Warn "Warning: sequence description over 32767 characters will be truncated to 32767 characters!!!\n\n";

						# for alpha test!!! (running count of headers, shown as progress)
						my $seqCount = 0;
						print "\n"."Reading sequence #";

# $seqHead / $seq hold the record currently being assembled; desMultiplexer()
# reads both of them when a record is complete.
my $seqHead = "";
my $seq = "";
while (defined(my $line = <In>)) {
	# sequence line: append it to the current record and move on
	unless ( $line =~ /^>/ ) {
		$seq .= $line;
		next;
	}

	# header line: show the progress counter
						$seqCount++;
						print $seqCount;

	# hand over the previous record (if it has any sequence), then start a new one
	desMultiplexer() if $seq ne "";
	$seq = "";
	$seqHead = $line;

	# erase the counter so the next one overwrites it in place
						print "\b" x length($seqCount);
}
close(In);

# the last record has no following header to trigger its output
desMultiplexer() if $seq ne "";

# list the divided FASTA files that were written
print "\n\n=== write divided FASTA files:\n";
foreach my $name(@divide) {
	print $prefix."\_".$name.$suffix."\n";
}
print "\n";

# Close every output handle and check the result: buffered write errors
# (e.g. a full disk) are only reported when the handle is closed.
foreach my $idx (0 .. $#fileHandle) {
	my $outName = $prefix."\_".$divide[$idx].$suffix;
	close($fileHandle[$idx]) or warn "error closing output file: $outName ($!)\n";
}
close(Warn) or warn "error closing warning file: $prefix.warning ($!)\n";

#================================ sub functions ================================
# use description information to decide which divide file is output
#
# Reads the current record from the file-level $seqHead (header line) and
# $seq (sequence lines). The header is split at its first space into the
# sequence ID (leading ">" removed) and the description; tabs in the header
# are treated as spaces for this split.
#
# The record is written to the 'Vec' output file when the description
# contains "vector" (case-sensitive) and the ID is not in %matchVector;
# all other records are dropped.
#
# A description longer than 32767 characters is truncated to 32767 in the
# written header, and the original record is copied to the Warn file.
# Takes no arguments and returns nothing.
sub desMultiplexer {
	my $head = $seqHead;
	# drop the line ending so it is not counted as part of the description
	chomp $head;
	$head =~ s/\t/\ /g;
	$head =~ s/\ /\t/;
	my ($matchID, $description) = split(/\t/, $head, 2);
	$matchID = "" unless defined $matchID;
	$matchID =~ s/^>//;
	# a header without any description simply cannot match "vector"
	$description = "" unless defined $description;

	# remaining sequences are not written anywhere
	return unless $description =~ /vector/;
	return if exists $matchVector{$matchID};

	# cloning vector: index 0 is the 'Vec' entry of @divide
	my $target = 0;
	my $outHead = $seqHead;
	# output warning for description over 32767 characters and truncate description to 32767 characters
	if ( length($description) > 32767 ) {
		print Warn $seqHead.$seq;
		# re-append the newline, otherwise the header and the first sequence
		# line would end up on the same line of the output file
		$outHead = "\>".$matchID."\ ".substr($description, 0, 32767)."\n";
	}

	my $file = $fileHandle[$target];
	print {$file} $outHead.$seq;
	return;
}
