#!/usr/bin/perl -w

#===============================================================================
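# ARGV[0]: refDB FASTA file that is too big for bowtie-build and therefore
# needs to be divided into batches (chunks) of about 3.6 billion characters or
# less each, so that a bowtie index can be built for each chunk independently.
# NOTE: the chunk size actually in effect is the value of $limit in the code.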
# e.g., divideRefDB.pl nt
# e.g., divideRefDB.pl all.dna.fna
#===============================================================================

use strict;
use warnings;

# Open part file number $partIndex for writing, announce its name on STDOUT,
# and return the filehandle together with the file name (for error messages).
# Part files are named "<stem>_partNN.<ext>", NN being zero-padded to 2 digits.
sub open_part {
	my ($stem, $ext, $partIndex) = @_;
	my $name = sprintf("%s_part%02u.%s", $stem, $partIndex, $ext);
	open(my $fh, ">", $name)
		or die "Cannot open '$name' for writing: $!\n";
	print $name."\n";
	return ($fh, $name);
}

# Exactly one argument is required: the path of the FASTA file to divide.
my $refDBFile = $ARGV[0];
die "Usage: $0 <refDB FASTA file>\n"
	unless defined($refDBFile) && length($refDBFile);

# Derive the output name pieces from the input name: everything before the
# last '.' is the prefix and the rest is the suffix; without an extension the
# whole name is the prefix and the suffix defaults to "fa".
# The extension may not contain '/', so dots that belong to a directory
# component (e.g. "./nt", "../db/nt", "db.v2/nt") are not mistaken for an
# extension separator.
my $prefix = $refDBFile;
my $suffix = "fa";
if ($refDBFile =~ m{\A(.*)\.([^./]+)\z}s) {
	$prefix = $1;
	$suffix = $2;
}

# Size threshold per part, compared against the summed length() of the lines
# written so far. Currently set to 9.6G, can fit to bowtie index building on
# 36G memory machines.
my $limit = 9.6*(2**30);

open(my $in, "<", $refDBFile)
	or die "Cannot open '$refDBFile' for reading: $!\n";

my $i = 1; # number of the part currently being written
my $c = 0; # characters written to the current part so far
my ($out, $partFile) = open_part($prefix, $suffix, $i);

while (my $line = <$in>) {
	# Only switch to a new part on a FASTA header line ('>'), so a record is
	# never split across two files; a part may therefore exceed $limit by up
	# to the size of its last record.
	if ($c > $limit && $line =~ /^>/) {
		close($out) or die "Cannot close '$partFile': $!\n";
		$i++;
		($out, $partFile) = open_part($prefix, $suffix, $i);
		$c = 0;
	}
	print {$out} $line or die "Cannot write to '$partFile': $!\n";
	$c += length($line);
}

close($in) or die "Cannot close '$refDBFile': $!\n";
# Buffered write errors (e.g. disk full) surface at close, so check it.
close($out) or die "Cannot close '$partFile': $!\n";
