#!/usr/bin/perl -w

use strict;
use warnings;
use Getopt::Long qw(GetOptions);

# Get the project name from the command line (--project NAME), or use
# "all_sample" as the default.  The name becomes the prefix of the summary file.
# An unparsable command line is rejected instead of silently falling back to
# the default project name.
my $project_name = "all_sample";
GetOptions("project=s" => \$project_name)
	or die "Usage: $0 [--project NAME] [sample_order_file]\n";

# Optional first positional argument: a tab-separated sample order file whose
# second column holds the sample name.
my $sampleOrder = $ARGV[0];
my $dataDIR = "./";
my @sampleList = ();
my $sample;
# File name suffix of the per-sample input read files.
my $suffix = "_Processed.fa";

# Build the ordered list of sample names.
my $i = 0;	# sample counter; the counting loop further down resets and reuses it
if ( defined($sampleOrder) and length($sampleOrder) ) {
	# Without the order file there is nothing meaningful to summarise, so stop
	# here rather than continue with an empty sample list.
	open(my $order_fh, "<", $sampleOrder)
		or die "can't open sample order file - $sampleOrder: $!\n";
	while ( my $line = <$order_fh> ) {
		# get rid of any kind of line ending (LF, CRLF or stray CR)
		$line =~ s/[\r\n]//g;
		my @array = split(/\t/, $line);
		# only lines with exactly two columns are used; all others are skipped
		if ( scalar(@array) == 2 ) {
			push @sampleList, $array[1];
		}
	}
	close($order_fh);
} else {
	print "no sample order file input, will retrieve and order all samples automatically.\n";
	opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR ($!)";
	my @fileList = readdir $dir;
	closedir $dir;
	# Quote the suffix so that its "." only matches a literal dot, and anchor
	# the match at the very end of the file name.
	my $suffix_re = qr/\Q$suffix\E\z/;
	foreach my $profile (grep { $_ =~ $suffix_re } @fileList) {
		$sample = $profile;
		$sample =~ s/$suffix_re//;
		push @sampleList, $sample;
	}
	# automatic mode: order the samples alphabetically
	@sampleList = sort @sampleList;
}

# Read and count the read files of every sample, and write one tab-separated
# summary line per sample to "<project_name>_stp.sum".
# Columns: sample name, reads in the input file, then the reads in the
# stop-oligo files for 0, 1 and 2 mismatches.
$i = 0;
my $outputFile = $project_name."_stp.sum";
# Abort if the summary cannot be created; otherwise every line would be
# silently dropped and the final message would wrongly report success.
open(my $out_fh, ">", $outputFile)
	or die "can't open output file - $outputFile: $!\n";
print {$out_fh} join("\t", "sample_ID", "readN", "stp_mis_0", "stp_mis_1", "stp_mis_2")."\n";
foreach my $sample (@sampleList) {
	$i++;
	print "--- count readN for #$i sample: $sample\n";
	# first the input reads, then one count per allowed mismatch number
	my @counts = ( countReadN($sample.$suffix) );
	foreach my $mismatch (0 .. 2) {
		push @counts, countReadN($sample."_stp_mis_".$mismatch.".fa");
	}
	print {$out_fh} join("\t", $sample, @counts)."\n";
}
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($out_fh)
	or die "can't finish writing output file - $outputFile: $!\n";
print "=== Write count numbers of stop oligo for all samples to: $outputFile\n";

#================================ sub functions ================================
# count total number of reads in a collapsed or general FASTA file (.fa)
#
# Parameter:
#   $fastaFile - path of the FASTA file to read
# Returns:
#   the total read number.  A header made of exactly two "-" separated parts
#   whose second part is all digits (collapsed format, e.g. ">1-523") adds
#   that number; every other header line adds 1.
#   If the file cannot be opened, a message is printed and 0 is returned, so
#   the caller can carry on with the remaining files.
sub countReadN {
	my ($fastaFile) = @_;
	my $readNum = 0;
	my $fasta_fh;
	if ( !open($fasta_fh, "<", $fastaFile) ) {
		print "cannot open read file for sample: $fastaFile!!!\n";
		# nothing to read: return right away instead of reading a closed handle
		return $readNum;
	}
	# sum readN for all reads (collapsed or general)
	while ( my $line = <$fasta_fh> ) {
		next unless $line =~ /^>/;
		# Strip LF as well as CRLF line endings; a left-over "\r" would make the
		# count of a collapsed header look non-numeric.
		$line =~ s/[\r\n]+\z//;
		my @head = split(/-/, $line);
		if ( (scalar(@head) == 2) and ($head[1] =~ /^\d+\z/) ) {	# if it's collapsed FASTA format
			$readNum += $head[1];
		} else {													# if it's NOT a collapsed FASTA format
			$readNum += 1;
		}
	}
	close($fasta_fh);
	return $readNum;
}
