#!/usr/bin/perl -w

#===============================================================================
# summarize trimming preprocess (_Cutadapt.report)
# for all samples with or without sample order file
# if no sample order file provided, samples will be ordered by alphabet.
# ARGV[0]:	sample order file, e.g. sampleOrder,
# 	1	CCACTC_4
# 	2	AAGCTA_3
# 	3	AAGCTA_7
# Output:	trimming summary file for all samples, named <project>_trimRate.sum
# 	(all_sample_trimRate.sum unless --project is given)
# Usage:	sumTrim.pl [--project NAME] [sampleOrder]
#===============================================================================

use strict;
use warnings;
use Getopt::Long qw(GetOptions);

# Prefix of the summary file name. Defaults to "all_sample" and can be
# replaced on the command line with --project NAME.
my $project_name = 'all_sample';
GetOptions('project=s' => \$project_name);

# First positional argument remaining after option parsing: the optional
# sample order file (undefined when none was given).
my $sampleOrder = $ARGV[0];
# Directory that is scanned for report files when no order file is given.
my $dataDIR = './';
# Sample names, in the order they will be reported.
my @sampleList;
# Shared scalar holding the sample currently being handled.
my $sample;
# File name suffix of the per-sample Cutadapt report.
my $suffix = '_Cutadapt.report';

# get sample order
# Fill @sampleList with the sample names to report.
# - With a sample order file: every line made of exactly two tab-separated
#   fields contributes its second field, in file order; any other line is
#   skipped.
# - Without one: every entry of $dataDIR whose name ends with $suffix
#   contributes its name minus that suffix, sorted by string comparison.
# Dies when the order file or the data directory cannot be opened.
my $i = 0;
if ($sampleOrder) {
	# Stop on an unreadable order file: carrying on would read from an
	# unopened handle and produce a summary with no samples at all.
	open(my $order_fh, "<", $sampleOrder)
		or die "can't open sample order file - $sampleOrder: $!";
	while (my $line = <$order_fh>) {
		# make sure to get rid of any kind of carriage return sign
		$line =~ tr/\r\n//d;
		my @fields = split(/\t/, $line);
		next unless scalar(@fields) == 2;
		push @sampleList, $fields[1];
	}
	close($order_fh);
} else {
	print "no sample order file input, will retrieve and order all samples automatically.\n";
	opendir(my $dir, $dataDIR) or die "Cannot open directory: $dataDIR: $!";
	# \Q...\E keeps the "." of the suffix literal instead of "any character".
	my @reports = grep { /\Q$suffix\E$/ } readdir($dir);
	closedir($dir);
	my @names = map { my $name = $_; $name =~ s/\Q$suffix\E$//; $name } @reports;
	@sampleList = sort @names;
}
# Leave the counter equal to the number of samples collected.
$i = scalar(@sampleList);

# get input read count, trimmed read count and trimming rate for all samples

# Collect the values reported in one report file.
#   $path        - report file to read
#   $total_label - label of the lines carrying a single count after the
#                  colon, e.g. "Processed reads"
#   $kept_label  - label of the lines carrying "count (rate)" after the
#                  colon, e.g. "Trimmed reads"
# Returns a flat list, in the order the matching lines appear in the file:
# one value per $total_label line, two values (count, rate) per $kept_label
# line. Dies when the file cannot be opened.
sub _read_report_counts {
	my ($path, $total_label, $kept_label) = @_;
	my @values = ();
	open(my $report_fh, "<", $path) or die "cannot open profile for sample: $path: $!";
	while (my $line = <$report_fh>) {
		chomp $line;
		if ($line =~ /\Q$total_label\E:/) {
			my @fields = split(/:\s*/, $line);
			push @values, $fields[1];
		} elsif ($line =~ /\Q$kept_label\E:/) {
			# Split on "\s*\(" so the blank between the count and "(" is
			# consumed and does not end up glued to the count.
			my @fields = split(/:\s*|\s*\(|\)/, $line);
			push @values, $fields[1], $fields[2];
		}
	}
	close($report_fh);
	return @values;
}

# %trim maps each sample name to an array ref holding the values taken from
# its Cutadapt report followed by those from its Prinseq report.
my %trim = ();
$i = 0;
foreach my $sample (@sampleList) {
	$i++;
	print "--- Read preprocess report for #$i sample: $sample\n";
	my @preprocess = (
		_read_report_counts($sample."_Cutadapt.report", "Processed reads", "Trimmed reads"),
		_read_report_counts($sample."_Cutadapt_Prinseq.report", "Input sequences", "Good sequences"),
	);
	$trim{$sample} = \@preprocess;
}

# output original read count and trimming rate for all samples
# Writes one tab-separated header line and then one line per sample, in
# @sampleList order: the sample name followed by the values stored for it in
# %trim (nothing after the name when the sample has no entry).
# The header names twelve data columns; each row holds however many values
# were collected for that sample.
# Dies when the summary file cannot be created or fully written.
my $outputFile = $project_name."_trimRate.sum";
print "=== Write trimming rate for all samples to: $outputFile\n";
open(my $out_fh, ">", $outputFile) or die "cannot write trimming summary - $outputFile: $!";
my @columns = (
	"sample_ID",
	"input", "trim_3'", "trimRate",
	"input", "trim_5'", "trimRate",
	"input", "trim_Index'", "trimRate",
	"input", "good", "goodRate",
);
print {$out_fh} join("\t", @columns)."\n";
foreach my $sample_id (@sampleList) {
	my @values = exists($trim{$sample_id}) ? @{ $trim{$sample_id} } : ();
	print {$out_fh} $sample_id."\t".join("\t", @values)."\n";
}
# Buffered write errors only surface when the handle is closed.
close($out_fh) or die "cannot finish writing trimming summary - $outputFile: $!";