#!/usr/bin/perl -w

#===============================================================================
# read barcodes from FASTQ files directly, mainly for
# NEB Next small RNA kit and Illumina TruSeq small RNA kit:
# 	NEB Next Small RNA Sample Prep Kit
# 		3’ Adapter: AGATCGGAAGAGCACACGTCT
#		e.g., AGATCGGAAGAGCACACGTCT GAACTCCAGTCAC XXXXXX ATCTCGTATGCC GTCTTCTGCTTG
#										barcode - XXXXXX
# 		5’ Adapter: GTTCAGAGTTCTACAGTCCGACGATC
# 	Illumina TruSeq Small RNA Sample Prep Kit
# 		3’ Adapter: TGGAATTCTCGGGTGCCAAG
#		e.g., TGGAATTCTCGGGTGCCAAG GAACTCCAGTCAC XXXXXX ATCTCGTATGCC GTCTTCTGCTTG
#									   barcode - XXXXXX
# 		5’ Adapter: GTTCAGAGTTCTACAGTCCGACGATC
#===============================================================================

use strict;

# Optional first command-line argument: "-z" selects gzip-compressed input
# (*.fastq.gz); any other value, or no argument at all, selects plain *.fastq.
# detectBarcode() reads this variable to choose between zcat and cat.
my $option_01 = @ARGV ? $ARGV[0] : "null";

# Read files are looked up in the current working directory.
my $dataDIR = "./";

opendir(my $dir, $dataDIR) or die "Cannot open directory: $dataDIR ($!)";
my @fileList = readdir $dir;
closedir $dir;

# Literal file-name suffix of the read files to process.
my $suffix = ".fastq";
if ( $option_01 eq "-z" ) {
	$suffix = ".fastq.gz";
}

# \Q...\E makes the dots in the suffix match literally, and \z anchors the
# match at the very end of the name, so e.g. "reads_fastq" is not picked up.
my @readFileList = sort grep { /\Q$suffix\E\z/ } @fileList;

my $num = scalar(@readFileList);
if ( $num == 0 ) {
	print "\nThere is NO FASTQ file ($suffix) in this folder\n";
	exit;
} elsif ( $num == 1 ) {
	print "\nThere is ".$num." FASTQ file ($suffix) in this folder\n";
} else {
	print "\nThere are ".$num." FASTQ file ($suffix) in this folder\n";
}

# parameters for cutadapt (file-level: detectBarcode() uses all of them)
my $barcodeLeft = "GAACTCCAGTCAC";   # sequence immediately left of the barcode
my $barcodeRight = "ATCTCGTATGCC";   # sequence immediately right of the barcode
my $err = 0.2;                       # passed to cutadapt -e
my $overlap = 5;                     # passed to cutadapt -O
my $min_length = 6; # barcode/index sequence length
my $max_length = 8; # barcode/index sequence length

# One tab-separated line per read file: prefix, barcode, index number.
my $outputFile = "sampleBarcode";
open(my $out, ">", $outputFile) or die "Cannot open output file $outputFile: $!";

my $fileNumber = 0;
foreach my $readFile (@readFileList) {
	$fileNumber++;

	# Strip the suffix literally (same quoting as in the grep above).
	my $prefix = $readFile;
	$prefix =~ s/\Q$suffix\E\z//;

	print "\n====== Detect barcode for #".$fileNumber." read file: $prefix ======\n";
	my $barcode = detectBarcode($prefix, $suffix);
	my $index = indexBarcode($barcode);

	print {$out} $prefix."\t".$barcode."\t".$index."\n";
}

# Buffered write errors only surface at close, so check it.
close($out) or die "Cannot write output file $outputFile: $!";

print "\n====== Output barcode information to $outputFile ======\n";

#================================ sub functions ================================
# Detect the barcode of a single FASTQ file by trimming the left and right
# primer sequences that flank the barcode.
#
# Arguments:
#   $prefix - read file name without its suffix
#   $suffix - suffix of the read file; the file read is $prefix.$suffix
#
# Returns the first sequence of length $min_length found in the collapsed
# output, or "n/a" when there is none.  We assume a barcode is the most
# frequent sequence of that length after trimming (this presumes that
# fastx_collapser lists sequences most-abundant first).
#
# Dies when the collapsed read file cannot be opened.  A failing external
# command (zcat/cat, head, cutadapt, fastx_collapser) is reported with a
# warning and processing continues.
#
# Uses the file-level settings $option_01, $barcodeLeft, $barcodeRight, $err,
# $overlap, $min_length and $max_length.  Creates and removes the temporary
# files temp_Read.fastq, <prefix>_Cutadapt.fastq, <prefix>_Cutadapt.report and
# <prefix>_Collapsed.fa in the current directory.
sub detectBarcode {
	my ($prefix, $suffix) = @_;

	my $readFile = $prefix.$suffix;

	my $barcode = "n/a";

	my $tempReadFile = "temp_Read.fastq";
	my $trimedReadFile = $prefix."_Cutadapt.fastq";
	my $trimReport = $prefix."_Cutadapt.report";
	my $collapsedReadFile = $prefix."_Collapsed.fa";

	# read first 2000 sequences from a FASTQ file (4 lines per record);
	# file names are shell-quoted so spaces and metacharacters are safe
	my $lineUsed = 4 * 2000;
	my $reader = ( $option_01 eq "-z" ) ? "zcat" : "cat";
	_runShell(
		$reader." "._shellQuote($readFile)
		." | head -n $lineUsed > "._shellQuote($tempReadFile)
	);

	# use Cutadapt for trimming left and right primer sequences close to barcode sequence
	_runShell(
		"cutadapt -a $barcodeRight -g $barcodeLeft -n 2 -e $err -O $overlap"
		." -m $min_length -M $max_length --match-read-wildcards"
		." -o "._shellQuote($trimedReadFile)
		." "._shellQuote($tempReadFile)
		." > "._shellQuote($trimReport)
	);
	unlink($tempReadFile, $trimReport);

	# use FASTX_collapser for identical read collapsing
	_runShell(
		"fastx_collapser -i "._shellQuote($trimedReadFile)
		." -o "._shellQuote($collapsedReadFile)
	);
	unlink($trimedReadFile);

	open(my $in, "<", $collapsedReadFile) or die "\n   can't find file - $collapsedReadFile!!!\n";
	while ( my $line = <$in> ) {
		chomp $line;
		next if $line =~ /^>/;                    # FASTA header line, not a sequence
		next if length($line) != $min_length;     # only barcode-length sequences qualify

		$barcode = $line;
		print "   --- find barcode: $barcode\n";
		last;
	}
	close($in);

	unlink($collapsedReadFile);

	return $barcode;
}

# Quote a string so that the shell treats it as one literal word.
sub _shellQuote {
	my ($text) = @_;
	$text =~ s/'/'\\''/g;
	return "'".$text."'";
}

# Run a shell command; warn (do not die) when it fails.  Returns 1 on
# success and 0 on failure.
sub _runShell {
	my ($command) = @_;
	my $status = system($command);
	if ( $status != 0 ) {
		warn "   !!! command failed (status $status): $command\n";
		return 0;
	}
	return 1;
}


# Translate a barcode (i.e. index) sequence into its index number.
#
# Argument:
#   $barcode - barcode sequence to look up (compared case-sensitively)
#
# Returns the 1-based position of the barcode in the list of known index
# sequences below, or 0 when the barcode is not in the list.
sub indexBarcode {
	my ($barcode) = @_;

	# Known index sequences; list position N (counting from 1) is index number N.
	my @knownBarcodes = qw(
		ATCACG CGATGT TTAGGC TGACCA ACAGTG GCCAAT
		CAGATC ACTTGA GATCAG TAGCTT GGCTAC CTTGTA
		AGTCAA AGTTCC ATGTCA CCGTCC GTAGAG GTCCGC
		GTGAAA GTGGCC GTTTCG CGTACG GAGTGG GGTAGC
		ACTGAT ATGAGC ATTCCT CAAAAG CAACTA CACCGG
		CACGAT CACTCA CAGGCG CATGGC CATTTT CCAACA
		CGGAAT CTAGCT CTATAC CTCAGA GACGAC TAATCG
		TACAGC TATAAT TCATTC TCCCGA TCGAAG TCGGCA
	);

	# Build a sequence -> index number table with a hash slice.
	my %numberOf;
	@numberOf{@knownBarcodes} = ( 1 .. scalar(@knownBarcodes) );

	return exists $numberOf{$barcode} ? $numberOf{$barcode} : 0;
}