#!/usr/bin/perl -w

#===============================================================================
# Paired-end read sequence preprocessing, including:
# 1) use Cutadapt for 3' adapter trimming and 5' adapter trimming
# 2) use Cutadapt for empty read (only adapter but no insert sequence) discarding
# 3) use Prinseq for low-quality read filtering
# 4) use FASTX_collapser for identical read collapsing
# mainly for Rubicon Genomics Universal Primers (Pre-amplification), Illumina Paired-end (PE) DNA Adapters, Illumina TruSeq universal DNA/RNA kits. 
#	Rubicon Genomics Universal Primers for pre-amplification
#		FW: CCAAACACACCCAACACACCAC
#		RC: GTGGTGTGTTGGGTGTGTTTGG
# 	Illumina Paired-end (PE) DNA Adapters
# 		3’ Adapter: AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
# 		5’ Adapter: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
# 	Illumina TruSeq universal DNA/RNA kits
# 		3’ Adapter: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
#			e.g., AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC XXXXXX ATCTCGTATGCCGTCTTCTGCTTG
#											barcode: XXXXXX
# 		5’ Adapter: GTTCAGAGTTCTACAGTCCGACGATC
# 	Illumina PCR Primers
# 		P7_PCR_primer (ligated on the left of 5’ Adapter):	ATCTCGTATGCCGTCTTCTGCTTG
# 		P5_PCR_primer (ligated on the right of 3’ Adapter):	AGATCTCGGTGGTCGCCGTATCATT
#===============================================================================

use strict;
use warnings;

# Command-line option: "-z" (also the default when no argument is given)
# selects gzip-compressed input (*_R1.fastq.gz); any other value selects
# uncompressed *_R1.fastq files.
my $option_01 = $ARGV[0];
if ( !$ARGV[0] ) {
	$option_01 = "-z";
}

my $dataDIR = "./";

# List the working directory; report the OS error so a failure is diagnosable.
opendir my $dir, $dataDIR or die "Cannot open directory: $dataDIR: $!";
my @fileList = readdir $dir;
closedir $dir;

my $suffix = "_R1.fastq";
if ( $option_01 eq "-z" ) {
	$suffix = "_R1.fastq.gz";
}

# \Q...\E makes the suffix match literally: without it every "." in the suffix
# is a regex wildcard and names such as "x_R1-fastq" would also be accepted.
my @readFileList = sort grep { /\Q$suffix\E$/ } @fileList;

# Report how many Read_1 files were found; nothing to do when there are none.
my $num = scalar(@readFileList);
if ( $num == 0 ) {
	print "\nThere is NO FASTQ file ($suffix) in this folder\n";
	exit;
} elsif ( $num == 1 ) {
	print "\nThere is ".$num." FASTQ file ($suffix) in this folder\n";
} else {
	print "\nThere are ".$num." FASTQ file ($suffix) in this folder\n";
}

# ----- parameters for cutadapt -----

# Rubicon Genomics universal pre-amplification primer (forward) and its
# reverse complement.
my ($primer_FW, $primer_RC) = (
	'CCAAACACACCCAACACACCAC',
	'GTGGTGTGTTGGGTGTGTTTGG',
);

# Illumina Paired-end (PE) DNA adapters (3' and 5').
my ($adapter_PE_3, $adapter_PE_5) = (
	'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
	'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT',
);

# Overlapping fragments of the TruSeq universal 3' adapter
# (AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC); 3b is 3a without its first base.
my $adapter_Uni_3a = 'AGATCGGAAGAGCACACGTCT';
my $adapter_Uni_3b = 'GATCGGAAGAGCACACGTCT';
my $adapter_Uni_3c = 'AAGAGCACACGTCTGAACTCC';
my $adapter_Uni_3d = 'CACACGTCTGAACTCCAGTCAC';
my $adapter_Uni_3e = 'GAACTCCAGTCAC';

# TruSeq universal 5' adapter.
my $adapter_Uni_5 = 'GTTCAGAGTTCTACAGTCCGACGATC';

# Three mutually overlapping 3' adapter fragments, also trimmed with -a.
my $adapter_TruSeq_3a = 'TGGAATTCTCGGGTGCCAAG';
my $adapter_TruSeq_3b = 'TCTCGGGTGCCAAGGAACTCC';
my $adapter_TruSeq_3c = 'GGTGCCAAGGAACTCCAGTCAC';

# Illumina PCR primers, used when discarding adapter-only reads.
my ($P7_PCR_primer, $P5_PCR_primer) = (
	'ATCTCGTATGCCGTCTTCTGCTTG',
	'AGATCTCGGTGGTCGCCGTATCATT',
);

# Values passed to cutadapt's -e option for the 3' pass, the 5' pass and the
# barcode/primer (empty-read) pass.
my ($err_3, $err_5, $err_BC) = (0.2, 0.2, 0.1);

# Values passed to cutadapt's -O option for the same three passes.
my ($overlap_3, $overlap_5, $overlap_BC) = (5, 5, 10);

# Value passed to cutadapt's -m option.
my $min_length = 1;


# ----- parameters for prinseq -----
my ($lc_method, $lc_threshold) = ('entropy', 50);	# -lc_method / -lc_threshold
my $min_len = 15;					# -min_len
my $trim_tail_right = 5;				# -trim_tail_right
my $line_width = 55;					# -line_width

# read sample barcode file
# Each usable line of "sampleBarcode" has exactly two tab-separated fields:
# the sample prefix (read file name without its _R1 suffix) and the barcode.
# Lines with any other number of fields are ignored.
my %sampleBarcode = ();
open(my $barcodeFH, "<", "sampleBarcode") or die "\n   can't find sampleBarcode file!!! ($!)\n";
while ( my $line = <$barcodeFH> ) {
	chomp $line;
	# Tolerate Windows (CRLF) line endings: a trailing "\r" would otherwise
	# become part of the barcode that is embedded in adapter sequences.
	$line =~ s/\r$//;
	my @array = split(/\t/, $line);
	if ( scalar(@array) == 2 ) {
		$sampleBarcode{$array[0]} = $array[1];
	}
}
close($barcodeFH);

# Process every Read_1 file that has a barcode entry; samples without one are
# reported and skipped. $i counts all files seen, including skipped ones.
my $i = 0;
foreach my $readFile (@readFileList) {
	$i++;
	# Strip the Read_1 suffix to obtain the sample prefix. \Q...\E matches the
	# suffix literally (its "." characters must not act as regex wildcards).
	(my $prefix = $readFile) =~ s/\Q$suffix\E$//;
	
	if ( !exists $sampleBarcode{$prefix} ) {
		print "\n   there's no barcode for sample: $prefix!!!\n";
		next;
	}
	my $barcode = $sampleBarcode{$prefix};
	
	print "\n====== Preprocess #".$i." read file: $prefix ======\n";
	preprocess($prefix, $suffix, $barcode);
}

#================================ sub functions ================================
# _shellQuote: wrap a string in single quotes so it can be interpolated into a
# shell command line as one literal word (embedded ' becomes '\'').
sub _shellQuote {
	my ($text) = @_;
	$text =~ s/'/'\\''/g;
	return "'".$text."'";
}

# _runStep: run one shell command; return 1 on success, otherwise warn with the
# reason and return 0.
sub _runStep {
	my ($command) = @_;
	
	my $status = system($command);
	return 1 if $status == 0;
	
	if ( $status == -1 ) {
		warn "   !!! failed to execute ($!): $command\n";
	} elsif ( $status & 127 ) {
		warn "   !!! command died with signal ".($status & 127).": $command\n";
	} else {
		warn "   !!! command exited with status ".($status >> 8).": $command\n";
	}
	return 0;
}

# preprocess: run the full preprocessing pipeline on one paired-end sample.
#   $prefix  - sample name (read file name without the _R1/_R2 suffix)
#   $suffix  - read file suffix containing "_R1" or "_R2"; both mates are
#              derived from it
#   $barcode - sample barcode, embedded in the adapter-only ("empty read")
#              patterns
# Steps: (1) cutadapt 3' and 5' adapter trimming on Read_1, then on Read_2,
# (2) cutadapt discarding of reads matching primer/barcode patterns,
# (3) prinseq-lite low-quality filtering, (4) fastx_collapser collapsing.
# Outputs: <prefix>_R{1,2}_Processed.fa plus the Cutadapt / Prinseq report
# files; intermediate files are removed.
# Dies when either input read file is missing. When an external tool fails,
# a warning is printed, intermediates are cleaned up and the sub returns
# without a value, so the caller can go on to the next sample.
# Returns 1 on success.
sub preprocess {
	my ($prefix, $suffix, $barcode) = @_;
	
	# Derive both mate file names regardless of which mate the suffix names.
	$suffix =~ s/\_R2/\_R1/;
	my $readFile_R1 = $prefix.$suffix;
	$suffix =~ s/\_R1/\_R2/;
	my $readFile_R2 = $prefix.$suffix;
	
	if ( !-e $readFile_R1 ) {
		die "   Can't find paired-end file: $readFile_R1 !!!\n";
	}
	if ( !-e $readFile_R2 ) {
		die "   Can't find paired-end file: $readFile_R2 !!!\n";
	}
	
	# Barcode embedded in its flanking adapter sequence, at three offsets.
	my $barcodeArea_1 = $barcode."ATCTCGTATGCCG";
	my $barcodeArea_2 = "CAGTCAC".$barcode."ATCTCGT";
	my $barcodeArea_3 = "GAACTCCAGTCAC".$barcode;
	
	my $trimedReadFile_R1 = $prefix."_R1_Cutadapt.fastq";
	my $trimedReadFile_R2 = $prefix."_R2_Cutadapt.fastq";
	my $trimReport_R1 = $prefix."_R1_Cutadapt.report";
	my $trimReport_R2 = $prefix."_R2_Cutadapt.report";
	
	my $filteredReadFile_R1 = $prefix."_R1_Prinseq.fastq";
	my $filteredReadFile_R2 = $prefix."_R2_Prinseq.fastq";
	my $filterReport_R1 = $prefix."_R1_Cutadapt_Prinseq.report";
	my $filterReport_R2 = $prefix."_R2_Cutadapt_Prinseq.report";
	
	my $collapsedReadFile_R1 = $prefix."_R1_Processed.fa";
	my $collapsedReadFile_R2 = $prefix."_R2_Processed.fa";
	
	# prinseq (-out_format 1) writes its output to "<-out_good name>.fasta".
	my $fastaFile_R1 = $filteredReadFile_R1.".fasta";
	my $fastaFile_R2 = $filteredReadFile_R2.".fasta";
	
	# Everything derived from file names or the barcode file is shell-quoted,
	# so spaces or shell metacharacters cannot break or alter the commands.
	my ($qIn_R1, $qIn_R2)         = map { _shellQuote($_) } ($readFile_R1, $readFile_R2);
	my ($qTrim_R1, $qTrim_R2)     = map { _shellQuote($_) } ($trimedReadFile_R1, $trimedReadFile_R2);
	my ($qTrimRep_R1, $qTrimRep_R2) = map { _shellQuote($_) } ($trimReport_R1, $trimReport_R2);
	my ($qFilt_R1, $qFilt_R2)     = map { _shellQuote($_) } ($filteredReadFile_R1, $filteredReadFile_R2);
	my ($qFiltRep_R1, $qFiltRep_R2) = map { _shellQuote($_) } ($filterReport_R1, $filterReport_R2);
	my ($qFasta_R1, $qFasta_R2)   = map { _shellQuote($_) } ($fastaFile_R1, $fastaFile_R2);
	my ($qOut_R1, $qOut_R2)       = map { _shellQuote($_) } ($collapsedReadFile_R1, $collapsedReadFile_R2);
	my ($qArea_1, $qArea_2, $qArea_3) = map { _shellQuote($_) } ($barcodeArea_1, $barcodeArea_2, $barcodeArea_3);
	
	# Option sets shared by the Read_1 and Read_2 passes.
	my $opts_3 = "-a $primer_FW -a $adapter_PE_3 -a $adapter_Uni_3a -a $adapter_Uni_3b -a $adapter_Uni_3c -a $adapter_Uni_3d -a $adapter_Uni_3e -a $adapter_TruSeq_3a -a $adapter_TruSeq_3b -a $adapter_TruSeq_3c -n 9 -e $err_3 -O $overlap_3 -m $min_length --match-read-wildcards";
	# The Read_2 5' pass previously omitted "-n 3 -e $err_5 -O $overlap_5";
	# both mates now use the same 5' trimming settings.
	my $opts_5 = "-g $primer_RC -g $adapter_PE_5 -g $adapter_Uni_5 -n 3 -e $err_5 -O $overlap_5 -m $min_length --match-read-wildcards";
	my $opts_empty = "-b $primer_FW -b $primer_RC -b $P7_PCR_primer -b $P5_PCR_primer -b $qArea_1 -b $qArea_2 -b $qArea_3 -n 7 -e $err_BC -O $overlap_BC -m $min_length --match-read-wildcards --discard-trimmed";
	
	# Read_2 is handled by swapping the -o/-p outputs and the input file order.
	my @trimSteps = (
		# use Cutadapt for 3' adapter trimming and 5' adapter trimming on Read_1
		"cutadapt $opts_3 -o tmp_3FW_R1.fastq -p tmp_3FW_R2.fastq $qIn_R1 $qIn_R2 > $qTrimRep_R1",
		"cutadapt $opts_5 -o tmp_5FW_R1.fastq -p tmp_5FW_R2.fastq tmp_3FW_R1.fastq tmp_3FW_R2.fastq >> $qTrimRep_R1",
		# use Cutadapt for 3' adapter trimming and 5' adapter trimming on Read_2
		"cutadapt $opts_3 -o tmp_5RC_R2.fastq -p tmp_5RC_R1.fastq tmp_5FW_R2.fastq tmp_5FW_R1.fastq > $qTrimRep_R2",
		"cutadapt $opts_5 -o tmp_3RC_R2.fastq -p tmp_3RC_R1.fastq tmp_5RC_R2.fastq tmp_5RC_R1.fastq >> $qTrimRep_R2",
		# use Cutadapt for empty read (only adapter but no insert sequence) discarding
		"cutadapt $opts_empty -o tmp_R1.fastq -p tmp_R2.fastq tmp_3RC_R1.fastq tmp_3RC_R2.fastq >> $qTrimRep_R1",
		"cutadapt $opts_empty -o $qTrim_R2 -p $qTrim_R1 tmp_R2.fastq tmp_R1.fastq >> $qTrimRep_R2",
	);
	
	my $trimOK = 1;
	foreach my $command (@trimSteps) {
		if ( !_runStep($command) ) {
			$trimOK = 0;
			last;	# later steps would only read missing or stale files
		}
	}
	unlink glob("tmp_*.fastq");
	if ( !$trimOK ) {
		unlink $trimedReadFile_R1, $trimedReadFile_R2;
		warn "   !!! adapter trimming failed for $prefix; sample skipped\n";
		return;
	}
	print "   --- Multiple adatper trmming for $prefix completed\n";
	
	# Execution of Prinseq for Low-quality read filtering
	my $prinseqOpts = "-out_bad null -lc_method $lc_method -lc_threshold $lc_threshold -min_len $min_len -line_width $line_width -trim_tail_right $trim_tail_right";
	my $filterOK = _runStep("prinseq-lite.pl -fastq $qTrim_R1 -out_format 1 -out_good $qFilt_R1 $prinseqOpts 2> $qFiltRep_R1")
		&& _runStep("prinseq-lite.pl -fastq $qTrim_R2 -out_format 1 -out_good $qFilt_R2 $prinseqOpts 2> $qFiltRep_R2");
	
	unlink $trimedReadFile_R1, $trimedReadFile_R2;
	if ( !$filterOK ) {
		unlink $fastaFile_R1, $fastaFile_R2;
		warn "   !!! low-quality read filtering failed for $prefix; sample skipped\n";
		return;
	}
	print "   --- Low-quality read filtering for $prefix completed\n";

	# Execution of FASTX_collapser for Identical read collapsing
	my $collapseOK = _runStep("fastx_collapser -i $qFasta_R1 -o $qOut_R1")
		&& _runStep("fastx_collapser -i $qFasta_R2 -o $qOut_R2");
	
	unlink $fastaFile_R1, $fastaFile_R2;
	if ( !$collapseOK ) {
		warn "   !!! identical read collapsing failed for $prefix; sample skipped\n";
		return;
	}
	print "   --- Identical read collapsing for $prefix completed\n";
	
	return 1;
}

# reverseComplement: return the reverse complement of a nucleotide sequence.
#   $inputSeq - DNA sequence; upper or lower case, may contain IUPAC
#               ambiguity codes
#   returns   - the reversed sequence with every base complemented (case is
#               preserved). Self-complementary codes (S, W, N) and any other
#               character are left unchanged.
sub reverseComplement {
	my ($inputSeq) = @_;
	
	# Scalar assignment puts reverse() in scalar context: string reversal.
	my $outputSeq = reverse($inputSeq);
	
	# tr/// maps every character in one pass, so no placeholder letters are
	# needed. Chained substitutions through S/B/D/H would corrupt input that
	# already contains those IUPAC codes.
	$outputSeq =~ tr/ACGTRYKMBVDHacgtrykmbvdh/TGCAYRMKVBHDtgcayrmkvbhd/;
	
	return $outputSeq;
}
