#!/usr/bin/perl -w

#===============================================================================
# summarize collapsed read sequence files (.fa)
# for all samples with or without sample order file
# if no sample order file provided, samples will be ordered by alphabet automatically.
# ARGV[0]:	sample order file (optional), e.g. sampleOrder,
# 	1	CCACTC_4
# 	2	AAGCTA_3
# 	3	AAGCTA_7
# Need:		"all_sample_matchCount.sum", generated by "sumDist.pl"
# Output:	tab-delimited read sequence table (all_sample.reads) for all samples
# Usage:	sumProfile.pl sampleOrder
# ------Note------
# read sequences are ranked by read frequency (highest first);
# sequences with the same frequency are ordered alphabetically
#===============================================================================

use strict;
use warnings;

# ---- configuration and shared state -----------------------------------------
my $sampleOrder = $ARGV[0];		# optional: path to the tab-delimited sample order file
my $dataDIR = "./";				# directory scanned for input files when no order file is given
my @sampleList = ();			# sample names, in the order they appear as output columns
my $sampleSize;					# number of entries in @sampleList
my $presentRate = 0.5;			# multiplied by $sampleSize to get the output filter threshold
my $sample;						# scratch variable, also used by later sections of the script
my $suffix = "_Processed.fa";	# input file of a sample is "<sample>$suffix"

# get sample order
my $i = 0;						# counter, reset and reused by later sections of the script
if ( $sampleOrder ) {
	# An unreadable order file is fatal. Carrying on would silently yield an
	# empty sample list and a header-only output file.
	open(my $orderFH, "<", $sampleOrder)
		or die "can't open sample order file - $sampleOrder: $!\n";
	while ( my $line = <$orderFH> ) {
		# make sure to get rid of any kind of carriage return sign
		$line =~ s/[\r\n]//g;
		my @array = split(/\t/, $line);
		# only well-formed "<index>TAB<sample>" lines contribute a sample
		if ( scalar(@array) == 2 ) {
			$sampleList[$i] = $array[1];
			$i++;
		}
	}
	close($orderFH);
} else {
	print "no sample order file input, will retrieve and order all samples automatically.\n";
	opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR";
	my @fileList = readdir $dir;
	closedir $dir;
	# \Q...\E keeps the "." of the suffix literal; unquoted it would match
	# any character (e.g. a file ending in "_ProcessedXfa").
	my @profileList = grep { /\Q$suffix\E$/ } @fileList;
	foreach my $profile (@profileList) {
		# the sample name is the file name without the suffix
		$sample = $profile;
		$sample =~ s/\Q$suffix\E$//;
		$sampleList[$i] = $sample;
		$i++;
	}
	@sampleList = sort @sampleList;
}
$sampleSize = scalar(@sampleList);

# get read sequences for all samples
my %read = ();		# $read{sequence}{sample} = summed read count of the sequence in that sample
my %readFreq = ();	# $readFreq{sequence} = number of sequence lines seen over all input files
my $readID;			# header line of the record currently being parsed
my $readSeq;		# sequence line currently being parsed
my $readNum;		# read count carried by the current header

$i = 0;
foreach my $sample (@sampleList) {
	my $profile = $sample.$suffix;
	open(my $profileFH, "<", $profile)
		or die "cannot open profile for sample: $profile ($!)";
	$i++;

	print "--- Read sequence file for #$i sample: $sample\n";

	# A sequence line that shows up before any header counts as a single read,
	# instead of inheriting the count of the previous file's last record.
	$readNum = 1;

	# sum read sequences for all reads (collapsed or general)
	while ( my $line = <$profileFH> ) {
		# Strip LF and CR. A leftover "\r" would end up inside the sequence
		# key and would make the numeric test on the header count fail.
		$line =~ s/[\r\n]+\z//;
		# blank lines carry no record; do not tally them as an empty read
		next if $line eq "";

		if ( $line =~ /^>/ ) {
			$readID = $line;
			my @head = split(/\-/, $readID);
			if ( (scalar(@head) == 2) and ($head[1] =~ /^\d+$/) ) {
				# collapsed FASTA header: ">id-count"
				$readNum = $head[1];
			} else {
				# NOT a collapsed FASTA header: every record is one read
				$readNum = 1;
			}
		} else {
			$readSeq = $line;

			# "+=" starts from 0 for a new key, so this covers both unique
			# reads (collapsed FASTA) and duplicate reads (general FASTA)
			$read{$readSeq}{$sample} += $readNum;

			# NOTE(review): this counts every sequence line, so duplicates inside
			# one non-collapsed file are counted repeatedly -- confirm that this
			# is what the presence filter applied at output time expects.
			$readFreq{$readSeq}++;
		}
	}
	close($profileFH);
}

# output the formatted read table (all_sample.reads) for all samples:
# one row per read sequence, one count column per sample
my $outputFile = "all_sample.reads";
print "=== Write formated profiles for all samples to: $outputFile\n";
open(my $outFH, ">", $outputFile)
	or die "cannot open output file: $outputFile ($!)";
# generate table head
print {$outFH} "readSeq\t"."readLength\t"."readNum\t".join("\t", @sampleList)."\n";

# ordering: most frequent first, ties broken alphabetically.
# One explicit comparator gives the same order as sorting alphabetically and
# then re-sorting by frequency, without depending on sort() being stable.
my @allReadSeq = sort { $readFreq{$b} <=> $readFreq{$a} or $a cmp $b } keys %readFreq;

# write read sequences for all samples
my $presentFreq = $presentRate * $sampleSize;
print "--- Only consider read sequences presenting more than $presentFreq times!\n";
foreach my $seq (@allReadSeq) {
	my $freq = $readFreq{$seq};
	# the list is sorted by descending frequency, so once one sequence fails
	# the threshold every remaining one fails it too
	last if $freq <= $presentFreq;

	print {$outFH} $seq."\t".length($seq)."\t".$freq;
	foreach my $name (@sampleList) {
		# a sample that lacks this sequence gets an empty cell
		print {$outFH} "\t";
		print {$outFH} $read{$seq}{$name} if exists $read{$seq}{$name};
	}
	print {$outFH} "\n";
}
# buffered write errors (e.g. disk full) only surface at close time
close($outFH) or die "error while writing $outputFile: $!";
	