#!/usr/bin/perl -w

#===============================================================================
# Single-end (SE) read sequence preprocessing, including:
# 1) use Cutadapt for 3' adapter trimming and 5' adapter trimming
# 2) use Cutadapt for empty read (only adapter but no insert sequence) discarding
# 3) use Prinseq for low-quality read filtering
# 4) use FASTX_collapser for identical read collapsing
# 5) use Cutadapt for stop oligo filtering (optional)
# mainly for NEB Next small RNA kit and Illumina TruSeq small RNA kit. 
# 	NEB Next Small RNA Sample Prep Kit
# 		3’ Adapter: AGATCGGAAGAGCACACGTCT
#		e.g., AGATCGGAAGAGCACACGTCT GAACTCCAGTCAC XXXXXX ATCTCGTATGCC GTCTTCTGCTTG
#										barcode - XXXXXX
# 		5’ Adapter: GTTCAGAGTTCTACAGTCCGACGATC
# 	Illumina TruSeq Small RNA Sample Prep Kit
# 		3’ Adapter: TGGAATTCTCGGGTGCCAAG
#		e.g., TGGAATTCTCGGGTGCCAAG GAACTCCAGTCAC XXXXXX ATCTCGTATGCC GTCTTCTGCTTG
#									   barcode - XXXXXX
# 		5’ Adapter: GTTCAGAGTTCTACAGTCCGACGATC
#===============================================================================

use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use File::Basename;
use FindBin qw($Bin); # $Bin is now the directory where the script is

# parse the command line arguments
# -z or --gzip means reading .fastq.gz files
# --stp means use stop oligo
# --barcode BARCODE_FILE to specify barcode file, if left out, no barcode processing
# --3p  the 3 prime adapter specified by the user (required)
# --5p  the 5 prime adapter specified by the user (required)
# --empty3p  used for trimming empty reads (optional)
# --empty5p  used for trimming empty reads (optional)
# --min-length  minimum read length handed to prinseq as -min_len (optional, default 15)

my $usage = "Usage: $0 --3p ADAPTER --5p ADAPTER [--gzip|-z] [--stp] [--barcode BARCODE_FILE]"
	. " [--empty3p SEQ] [--empty5p SEQ] [--min-length N]\n";

my $gz_files;
my $stop_oligo;
my $barcodeFile;
my $user_3p;
my $user_5p;
my $emptyIn_3;
my $emptyIn_5;
my $min_len = 15;

# GetOptions returns false on unknown options or malformed values; stop
# instead of silently running with a partially parsed command line
GetOptions(
	"gzip|z"       => \$gz_files,
	"stp"          => \$stop_oligo,
	"barcode=s"    => \$barcodeFile,
	"3p=s"         => \$user_3p,
	"5p=s"         => \$user_5p,
	"empty3p=s"    => \$emptyIn_3,
	"empty5p=s"    => \$emptyIn_5,
	"min-length=i" => \$min_len,
) or die "\nInvalid command line arguments\n$usage";

# both adapters are interpolated into the cutadapt command lines, so an
# absent one would produce a malformed command for every read file
foreach my $required ( [ "--3p", $user_3p ], [ "--5p", $user_5p ] ) {
	my ($flag, $value) = @$required;
	unless ( defined $value && length $value ) {
		die "\nMissing required option $flag\n$usage";
	}
}

my $dataDIR = "./"; # get data from the current directory

opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR ($!)";
my @fileList = readdir $dir;
closedir $dir;

# literal file name suffix of the read files; it is also appended to the
# sample prefix to rebuild the file name, so it must stay a plain string
my $suffix = ".fastq";
if ($gz_files) {
	$suffix = ".fastq.gz";
}

# \Q...\E makes the dots literal; without it "." matches any character and
# names such as "myfastq" would be picked up as read files
my @readFileList = grep { /\Q$suffix\E$/ } @fileList;
@readFileList = sort @readFileList;

# report how many read files were found; nothing to do when there are none
my $num = scalar(@readFileList);
if ( $num == 0 ) {
	print "\nThere is NO FASTQ file ($suffix) in this folder\n";
	exit;
} elsif ( $num == 1 ) {
	print "\nThere is ".$num." FASTQ file ($suffix) in this folder\n";
} else {
	print "\nThere are ".$num." FASTQ file ($suffix) in this folder\n";
}

# ---- cutadapt settings ----
# adapter sequences exactly as given on the command line (--3p / --5p)
my ($adapter_3, $adapter_5) = ($user_3p, $user_5p);

# stop oligo sequences searched for when --stp is given
my ($stop_oligo_A, $stop_oligo_B) = ("CGTTCCCGTGG\$", "CCACGTTCCCG");

# values passed to cutadapt -e for the 3' adapter, the 5' adapter and the
# barcode / empty-read step
my ($err_3, $err_5, $err_BC) = (0.2, 0.125, 0.0625);

# values passed to cutadapt -O for the 3' adapter, the 5' adapter, the
# barcode / empty-read step and the stop oligo step
my ($overlap_3, $overlap_5, $overlap_BC, $overlap_STP) = (5, 8, 12, 11);

# value passed to cutadapt -m in the adapter and empty-read steps
my $min_length = 1;

# ---- prinseq settings ----
# passed as -lc_method / -lc_threshold
my ($lc_method, $lc_threshold) = ("entropy", 50);

# passed as -trim_tail_right / -line_width
my ($trim_tail_right, $line_width) = (5, 86);

# if the user provided a barcode file, read it
# expected layout: one sample per line, tab separated, column 1 = sample name
# (read file name without the suffix), column 2 = barcode sequence; lines with
# fewer than two columns are ignored
my %sampleBarcode = ();
if ($barcodeFile) {
	# lexical handle instead of a global bareword; report the real reason
	# (missing file, permissions, ...) together with the file name
	open(my $barcodeFH, "<", $barcodeFile)
		or die "\n   can't open sampleBarcode file '$barcodeFile': $!\n";
	while (my $line = <$barcodeFH>) {
		# strip LF and CRLF line endings; a stray "\r" would otherwise end up
		# inside the barcode sequence handed to cutadapt
		$line =~ s/[\r\n]+$//;
		my @fields = split(/\t/, $line);
		next unless scalar(@fields) > 1;
		$sampleBarcode{$fields[0]} = $fields[1];
	}
	close($barcodeFH);
}


# run the preprocessing pipeline on every read file found, in sorted order
my $i = 0;
foreach my $readFile (@readFileList) {
	$i++;

	# sample prefix = file name without the suffix; \Q...\E keeps the dots of
	# the suffix literal instead of letting them match any character
	my $prefix = $readFile;
	$prefix =~ s/\Q$suffix\E$//;

	# 0 tells preprocess() that there is no barcode to handle
	my $barcode = 0;
	if ($barcodeFile) {
		# a barcode file was given, so every sample needs an entry in it;
		# samples without one are reported and skipped
		unless ( exists $sampleBarcode{$prefix} ) {
			print "\n   there's no barcode for sample: $prefix!!!\n";
			next;
		}
		$barcode = $sampleBarcode{$prefix};
	}

	print "\n====== Preprocess #".$i." read file: $prefix ======\n";
	preprocess($prefix, $suffix, $barcode);
}

#================================ sub functions ================================
# preprocess($prefix, $suffix, $barcode)
#
# Run the whole preprocessing pipeline on one read file in the current
# directory:
#   1) cutadapt: trim the 3' adapter, then the 5' adapter
#   2) cutadapt: discard reads matching the barcode area and/or the
#      --empty3p / --empty5p sequences (--discard-trimmed)
#   3) prinseq-lite: low-quality read filtering, FASTA output
#   4) fastx_collapser: collapse identical reads
#   5) cutadapt (only with --stp): separate reads containing the stop oligo
#      with 0, 1 and 2 allowed mismatches
#
# Parameters:
#   $prefix  - read file name without its suffix; prefix of all output files
#   $suffix  - read file suffix, appended to $prefix to get the input file
#   $barcode - barcode sequence of the sample; if barcode=0 (or any false
#              value), then ignore it
#
# Uses the file-level cutadapt / prinseq settings and command line options.
#
# Output files: <prefix>_Processed.fa, <prefix>_Cutadapt.report,
# <prefix>_Cutadapt_Prinseq.report and, with --stp, <prefix>_stp_mis_0.fa,
# <prefix>_stp_mis_1.fa, <prefix>_stp_mis_2.fa and <prefix>_stp.report.
#
# Returns 1 when every step succeeded. When a step fails, a warning is
# printed, the intermediate files are removed and 0 is returned, so that the
# caller can carry on with the next read file.
sub preprocess {
	my ($prefix, $suffix, $barcode) = @_;

	# both adapters are interpolated into the cutadapt commands below; without
	# them those commands would be malformed
	unless ( defined $adapter_3 && length $adapter_3 && defined $adapter_5 && length $adapter_5 ) {
		warn "   !!! 3' and 5' adapter sequences are required (--3p / --5p), skipping $prefix\n";
		return 0;
	}

	my $readFile = $prefix.$suffix;

	my $trimedReadFile = $prefix."_Cutadapt.fastq";
	my $trimReport = $prefix."_Cutadapt.report";

	my $filteredReadFile = $prefix."_Prinseq";
	my $filteredFasta = $filteredReadFile.".fasta";	# prinseq appends ".fasta" to -out_good
	my $filterReport = $prefix."_Cutadapt_Prinseq.report";

	my $collapsedReadFile = $prefix."_Collapsed.fa";
	my $processedReadFile = $prefix."_Processed.fa";
	my $stpReadFile0 = $prefix."_stp_mis_0.fa";
	my $stpReadFile1 = $prefix."_stp_mis_1.fa";
	my $stpReadFile2 = $prefix."_stp_mis_2.fa";
	my $stpReport = $prefix."_stp.report";

	# fixed-name scratch files in the current directory
	my $tmpTrim3 = "tmp_Cutadapt3.fastq";
	my $tmpTrim = "tmp_Cutadapt.fastq";
	my $tmpStp0 = "stp0_Processed.fa";
	my $tmpStp1 = "stp1_Processed.fa";

	# shell-quoted forms of everything that goes into a command line, so that
	# file names with spaces or shell metacharacters are passed through intact
	my ($qRead, $qTrimmed, $qTrimReport, $qFilteredBase, $qFilteredFasta, $qFilterReport,
		$qCollapsed, $qProcessed, $qStp0, $qStp1, $qStp2, $qStpReport)
		= map { _shell_quote($_) } ($readFile, $trimedReadFile, $trimReport, $filteredReadFile,
			$filteredFasta, $filterReport, $collapsedReadFile, $processedReadFile,
			$stpReadFile0, $stpReadFile1, $stpReadFile2, $stpReport);
	my $qAdapter3 = _shell_quote($adapter_3);
	my $qAdapter5 = _shell_quote($adapter_5);

	# called when a step fails: drop whatever intermediates exist so that a
	# later run does not pick up half-finished files, then report failure
	my @scratch = ($tmpTrim3, $tmpTrim, $trimedReadFile, $filteredFasta,
		$collapsedReadFile, $tmpStp0, $tmpStp1);
	my $abort = sub {
		unlink(grep { -e $_ } @scratch);
		return 0;
	};

	# use Cutadapt for 3' adapter trimming and 5' adapter trimming
	# (the first command starts a fresh report, the following ones append)
	_run_step("3' adapter trimming for $prefix",
		"cutadapt -a $qAdapter3 -n 1 -e $err_3 -O $overlap_3 -m $min_length --match-read-wildcards -o $tmpTrim3 $qRead > $qTrimReport")
		or return $abort->();
	_run_step("5' adapter trimming for $prefix",
		"cutadapt -g $qAdapter5 -n 1 -e $err_5 -O $overlap_5 -m $min_length --match-read-wildcards -o $tmpTrim $tmpTrim3 >> $qTrimReport")
		or return $abort->();

	# sequences whose presence marks a read as empty (only adapter but no
	# insert sequence): the barcode area first, then --empty3p / --empty5p
	my @emptyMarkers;
	if ($barcode) {
		push @emptyMarkers,
			$barcode."ATCTCGTATGCCG",
			"CAGTCAC".$barcode."ATCTCGT",
			"GAACTCCAGTCAC".$barcode;
	}
	push @emptyMarkers, $emptyIn_3 if $emptyIn_3;
	push @emptyMarkers, $emptyIn_5 if $emptyIn_5;
	my $emptyFlags = join(" ", map { "-b "._shell_quote($_) } @emptyMarkers);

	# use Cutadapt for empty read (only adapter but no insert sequence) discarding
	_run_step("empty read discarding for $prefix",
		"cutadapt $emptyFlags -n 1 -e $err_BC -O $overlap_BC -m $min_length --match-read-wildcards --discard-trimmed -o $qTrimmed $tmpTrim >> $qTrimReport")
		or return $abort->();

	unlink($tmpTrim3, $tmpTrim);
	print "   --- Adatper trmming for $prefix completed\n";

	# use Prinseq for low-quality read filtering
	# $Bin is selfDev so $Bin/../Tools is where the prinseq-lite.pl script is located
	my $qPrinseq = _shell_quote("$Bin/../Tools/prinseq-lite.pl");
	_run_step("low-quality read filtering for $prefix",
		"$qPrinseq -fastq $qTrimmed -out_format 1 -out_good $qFilteredBase -out_bad null -lc_method $lc_method -lc_threshold $lc_threshold -min_len $min_len -line_width $line_width -trim_tail_right $trim_tail_right 2> $qFilterReport")
		or return $abort->();

	unlink($trimedReadFile);
	print "   --- Low-quality read filtering for $prefix completed\n";

	# use FASTX_collapser for identical read collapsing
	_run_step("identical read collapsing for $prefix",
		"fastx_collapser -i $qFilteredFasta -o $qCollapsed")
		or return $abort->();

	unlink($filteredFasta);
	print "   --- Identical read collapsing for $prefix completed\n";

	# use Cutadapt for stop oligo filtering
	if ($stop_oligo) {
		my $qOligoA = _shell_quote($stop_oligo_A);
		my $qOligoB = _shell_quote($stop_oligo_B);

		# each round keeps the reads matching the stop oligo in its own file
		# and hands the non-matching reads to the next, more tolerant round;
		# the first round starts a fresh report, like the trimming report
		_run_step("stop oligo filtering (0 mismatches) for $prefix",	# maxMismatch = 0
			"cutadapt -b $qOligoA -b $qOligoB -n 1 -e 0.000 -O $overlap_STP --no-trim --untrimmed-output $tmpStp0 -o $qStp0 $qCollapsed > $qStpReport")
			or return $abort->();
		_run_step("stop oligo filtering (1 mismatch) for $prefix",		# maxMismatch = 1
			"cutadapt -b $qOligoA -n 1 -e 0.091 -O $overlap_STP --no-trim --untrimmed-output $tmpStp1 -o $qStp1 $tmpStp0 >> $qStpReport")
			or return $abort->();
		_run_step("stop oligo filtering (2 mismatches) for $prefix",	# maxMismatch = 2
			"cutadapt -b $qOligoA -n 1 -e 0.182 -O $overlap_STP --no-trim --untrimmed-output $qProcessed -o $qStp2 $tmpStp1 >> $qStpReport")
			or return $abort->();

		unlink($collapsedReadFile, $tmpStp0, $tmpStp1);
		print "   --- Stop oligo trimming for $prefix completed\n";
	} else {
		# no stop oligo step: the collapsed reads are the final result
		unless ( rename($collapsedReadFile, $processedReadFile) ) {
			warn "   !!! cannot rename $collapsedReadFile to $processedReadFile: $!\n";
			return 0;
		}
	}

	return 1;
}

# _run_step($label, $command)
# Run $command through the shell. Returns 1 when it exited with status 0;
# otherwise warns (using $label to say which step failed) and returns 0.
sub _run_step {
	my ($label, $command) = @_;

	my $status = system($command);
	return 1 if $status == 0;

	if ( $status == -1 ) {
		warn "   !!! $label failed: could not start command: $!\n";
	} elsif ( $status & 127 ) {
		warn sprintf("   !!! %s failed: command killed by signal %d\n", $label, $status & 127);
	} else {
		warn sprintf("   !!! %s failed: command exited with status %d\n", $label, $status >> 8);
	}
	return 0;
}

# _shell_quote($text)
# Return $text wrapped in single quotes for a POSIX shell, with embedded
# single quotes escaped, so that it reaches the command as one literal word.
sub _shell_quote {
	my ($text) = @_;
	$text =~ s/'/'\\''/g;
	return "'".$text."'";
}