#!/usr/bin/perl -w

#===============================================================================
# Single-end (SE) read sequence preprocessing, specifically for the NO-barcode situation, including:
# 1) use Cutadapt for 3' adapter trimming and 5' adapter trimming
#    (note: only a 3' adapter is configured in this script)
# 2) use Cutadapt for empty read (only adapter but no insert sequence) discarding
# 3) use Prinseq for low-quality read filtering
# 4) use FASTX_collapser for identical read collapsing
# Mainly for the SOLiD RNA kit.
# 		3' Adapter: CGCCTTGGCCGTACAGCAG
#===============================================================================

use strict;
use warnings;

# Directory that is scanned for input read files.
my $dataDIR = "./";

# Include the OS error ($!) so a failure says *why* the directory is unreadable.
opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR: $!";
my @fileList = readdir $dir;
closedir $dir;

# Input files are recognised by this literal file-name suffix.
my $suffix = ".fastq.gz";

# \Q...\E makes the dots in the suffix match literally; without it
# "sampleXfastqYgz" would also be picked up. \z anchors at the true end
# of the name.
my @readFileList = sort grep { /\Q$suffix\E\z/ } @fileList;

my $num = scalar(@readFileList);
if ( $num == 0 ) {
	print "\nThere is NO FASTQ file ($suffix) in this folder\n";
	exit;
} elsif ( $num == 1 ) {
	print "\nThere is ".$num." FASTQ file ($suffix) in this folder\n";
} else {
	print "\nThere are ".$num." FASTQ file ($suffix) in this folder\n";
}

# parameters for cutadapt (read by preprocess())
my $adapter_3 = "CGCCTTGGCCGTACAGCAG";	# passed as -a (3' adapter sequence)
my $err_3 = 0.2;			# passed as -e
my $overlap_3 = 5;			# passed as -O
my $min_length = 1;			# passed as -m

# parameters for prinseq (read by preprocess())
my $lc_method = "entropy";		# passed as -lc_method
my $lc_threshold = 50;			# passed as -lc_threshold
my $min_len = 15;			# passed as -min_len
my $trim_tail_right = 5;		# passed as -trim_tail_right
my $line_width = 55;			# passed as -line_width

# Process every read file in sorted order; $i is only used for the
# progress message.
my $i = 0;
foreach my $readFile (@readFileList) {
	$i++;

	# Strip the (literal) suffix to get the sample prefix used for all
	# output file names.
	(my $prefix = $readFile) =~ s/\Q$suffix\E\z//;

	print "\n====== Preprocess #".$i." read file: $prefix ======\n";
	preprocess($prefix, $suffix, "");
}

#================================ sub functions ================================
# preprocess
# Run the preprocessing pipeline (Cutadapt -> Prinseq -> fastx_collapser)
# on a single read file.
#
# Parameters:
#   $prefix  - read file name without its suffix; also the stem of every
#              output file name
#   $suffix  - read file suffix; the input file is "$prefix$suffix"
#   $barcode - accepted for interface compatibility; not used by this sub
#
# Reads the file-level settings $adapter_3, $err_3, $overlap_3, $min_length,
# $lc_method, $lc_threshold, $min_len, $line_width and $trim_tail_right.
#
# Outputs (in the current directory):
#   ${prefix}_Cutadapt.report          - Cutadapt's standard output
#   ${prefix}_Cutadapt_Prinseq.report  - Prinseq's standard error
#   ${prefix}_Processed.fa             - collapsed reads
# Intermediate read files are deleted once the step consuming them has run.
#
# Returns 1 when every step succeeded. If an external tool fails, a warning
# is printed, the remaining steps are skipped and 0 is returned, so the
# caller can carry on with the next file.
sub preprocess {
	my ($prefix, $suffix, $barcode) = @_;

	# Quote a string so /bin/sh treats it as one literal word, keeping file
	# names with spaces or shell metacharacters intact.
	my $quote = sub {
		my ($word) = @_;
		$word =~ s/'/'\\''/g;
		return "'$word'";
	};

	# Run a shell command; on failure warn with the reason and return 0.
	my $run = sub {
		my ($step, $command) = @_;
		return 1 if system($command) == 0;
		my $why = $? == -1   ? "could not start: $!"
		        : ($? & 127) ? "killed by signal ".($? & 127)
		        :              "exit status ".($? >> 8);
		warn "   !!! $step for $prefix FAILED ($why)\n";
		return 0;
	};

	# Delete an intermediate file, reporting (but not aborting on) failure.
	my $remove = sub {
		my ($file) = @_;
		unlink $file or warn "   !!! Cannot remove $file: $!\n";
	};

	my $readFile = $prefix.$suffix;

	my $trimmedReadFile = $prefix."_Cutadapt.fastq";
	my $trimReport = $prefix."_Cutadapt.report";

	# Prinseq appends ".fasta" to the -out_good name for FASTA output
	# (-out_format 1), hence the separate $filteredFasta.
	my $filteredReadFile = $prefix."_Prinseq.fastq";
	my $filteredFasta = $filteredReadFile.".fasta";
	my $filterReport = $prefix."_Cutadapt_Prinseq.report";

	my $collapsedReadFile = $prefix."_Processed.fa";

	# use Cutadapt for 3' adapter trimming (only -a is given)
	$run->("Adapter trimming",
		"cutadapt -a $adapter_3 -n 1 -e $err_3 -O $overlap_3 -m $min_length --match-read-wildcards"
		." -o ".$quote->($trimmedReadFile)
		." ".$quote->($readFile)
		." > ".$quote->($trimReport))
		or return 0;
	print "   --- Adatper trmming for $prefix completed\n";

	# use Prinseq for low-quality read filtering
	my $filtered = $run->("Low-quality read filtering",
		"prinseq-lite.pl -fastq ".$quote->($trimmedReadFile)
		." -out_format 1 -out_good ".$quote->($filteredReadFile)
		." -out_bad null -lc_method $lc_method -lc_threshold $lc_threshold"
		." -min_len $min_len -line_width $line_width -trim_tail_right $trim_tail_right"
		." 2> ".$quote->($filterReport));

	# The trimmed reads are no longer needed whether or not filtering worked.
	$remove->($trimmedReadFile);
	return 0 unless $filtered;
	print "   --- Low-quality read filtering for $prefix completed\n";

	# use FASTX_collapser for identical read collapsing
	my $collapsed = $run->("Identical read collapsing",
		"fastx_collapser -i ".$quote->($filteredFasta)
		." -o ".$quote->($collapsedReadFile));

	$remove->($filteredFasta);
	return 0 unless $collapsed;
	print "   --- Identical read collapsing for $prefix completed\n";

	return 1;
}
