#!/usr/bin/perl -w

#===============================================================================
# Reformat FASTA files into a compact form in which identical read sequences
# are collapsed, usually after merging several FASTA files into one FASTA file
#===============================================================================

use strict;
use warnings;

# Directory scanned for input files. The names returned by readdir are used
# as-is further down, so this is expected to be the current working directory.
my $dataDIR = "./";

opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR: $!";
my @fileList = readdir $dir;
closedir $dir;

# Literal file-name suffix of the files to process. It is always wrapped in
# \Q...\E when used in a pattern so that the dot only matches a real dot
# (an unquoted "." would also accept names such as "alfa").
my $suffix = ".fa";
my @readFileList = sort grep { /\Q$suffix\E\z/ } @fileList;

my $num = scalar(@readFileList);
if ( $num == 0 ) {
	print "\nThere is NO FASTA file ($suffix) in this folder\n";
	exit;
} elsif ( $num == 1 ) {
	print "\nThere is ".$num." FASTA file ($suffix) in this folder\n";
} else {
	print "\nThere are ".$num." FASTA file ($suffix) in this folder\n";
}

# For every input file: expand it to one record per read, then let
# fastx_collapser collapse identical reads again, writing over the input file.
my $i = 0;
foreach my $readFile (@readFileList) {
	$i++;
	(my $prefix = $readFile) =~ s/\Q$suffix\E\z//;
	my $expandedReadFile = $prefix."_Expanded.fasta";

	print "\n====== Preprocess #".$i." read file: $prefix ======\n";

	# expand collapsed FASTA file (sequence head with read number 'NO-N', one sequence N reads)
	# to general FASTA file (sequence head without read number, one sequence one read)
	expandFASTA($readFile, $expandedReadFile);
	print "   --- collased read expanding for $prefix completed\n";

	# use FASTX_collapser for identical read collapsing; the list form of
	# system() bypasses the shell, so unusual file names are passed through safely
	my $status = system("fastx_collapser", "-i", $expandedReadFile, "-o", $readFile);
	if ( $status != 0 ) {
		# The input file may already have been overwritten, so keep the
		# expanded copy around instead of deleting the only intact data.
		warn "   !!! fastx_collapser failed for $readFile (status $status); keeping $expandedReadFile\n";
		next;
	}

	unlink($expandedReadFile) or warn "   can't remove $expandedReadFile: $!\n";
	print "   --- Identical read collapsing for $prefix completed\n";
}

# Expand a collapsed FASTA file into a general FASTA file.
#
# A header of the form '>NO-N' (exactly one '-' separated pair whose second
# part is all digits) marks a collapsed record: the sequence stands for N
# reads and is written N times with headers '>NO_1' .. '>NO_N'. Any other
# header is treated as a single read and written once as '<header>_1'.
#
# Sequences wrapped over several lines are joined into one line per record,
# blank lines are ignored, and CR/LF line endings are stripped.
#
# Parameters:
#   $readFile         - path of the FASTA file to read
#   $expandedReadFile - path of the expanded FASTA file to create/overwrite
# Dies if either file cannot be opened or the output cannot be fully written.
sub expandFASTA {
	my ($readFile, $expandedReadFile) = @_;

	open(my $in, "<", $readFile) or die "\n   can't find $readFile!!! ($!)\n";
	open(my $out, ">", $expandedReadFile) or die "\n   can't write $expandedReadFile!!! ($!)\n";

	# Fallback header used only if sequence data appears before any header line.
	my $readID = ">1-1";
	my $readNum = 1;
	my $readSeq = "";

	# Write the pending record $readNum times and reset the sequence buffer.
	# A header that was never followed by sequence data produces no output.
	my $flushRecord = sub {
		return if $readSeq eq "";
		for my $i (1..$readNum) {
			print {$out} $readID."_".$i."\n".$readSeq."\n"
				or die "\n   can't write $expandedReadFile!!! ($!)\n";
		}
		$readSeq = "";
	};

	while (my $line = <$in>) {
		$line =~ s/[\r\n]+\z//;			# handles both LF and CRLF input
		next if $line =~ /^\s*$/;		# skip blank lines
		if ( $line =~ /^>/ ) {	# sequence head: finish the previous record first
			$flushRecord->();
			my @head = split(/-/, $line);
			if ( (scalar(@head) == 2) and ($head[1] =~ /^\d+\z/) ) {	# collapsed FASTA format
				$readID = $head[0];
				$readNum = $head[1];
			} else {													# NOT a collapsed FASTA format
				$readID = $line;
				$readNum = 1;
			}
		} else {		# sequence line (possibly one of several for this record)
			$readSeq .= $line;
		}
	}
	$flushRecord->();	# last record has no following header to trigger the flush

	close($in);
	# buffered write errors only surface when the handle is closed
	close($out) or die "\n   can't write $expandedReadFile!!! ($!)\n";
	return;
}