#!/usr/bin/perl -w

#===============================================================================
# PisomiR [version 1.3]: Pipeline for small RNA sequencing (sRNA-Seq) data analysis
# Update [09/02/2014]: specific for isomiR discovery
#
# ARGV[0]: maxReadLength
# ARGV[1]: processed fasta file (adapter-trimmed and duplicate-read-collapsed)
#          read head: >SN-countNumber, e.g., >6-128
# this version uses bowtie's default formatted output (not SAM) as the alignment file
# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
#===============================================================================

use strict;
use warnings;

# Command-line arguments, in the order the code reads them:
#   $ARGV[0]  maxReadLength: number of read-length columns in the *.dist table
#   $ARGV[1]  readFile: input fasta file; a trailing ".fa" is stripped to build the output prefix
if (@ARGV < 2) {
	die "Usage: $0 maxReadLength readFile.fa\n";
}
my $maxReadLength = $ARGV[0];
my $readFile = $ARGV[1];

# $maxReadLength sizes the per-length count arrays and the *.dist header,
# so anything but a positive integer would silently corrupt the tables
if ($maxReadLength !~ /^[0-9]+$/ || $maxReadLength < 1) {
	die "maxReadLength must be a positive integer, got '$maxReadLength' !!!\n";
}

my $prefix = $readFile;
$prefix =~ s/\.fa$//;

# reference database for mapping
my $dataDIR = "/var/www/html/database/bowtie";

# create (or truncate) the output files; the subs below only ever append to them
{
	foreach my $suffix (".profile", ".feature") {
		my $outFile = $prefix.$suffix;
		open(my $outFh, ">", $outFile) or die "Can not open $outFile !!!\n";
		close($outFh);
	}

	# header row of the length-distribution table: 5 fixed columns + one column per read length
	my $distFile = $prefix.".dist";
	open(my $distFh, ">", $distFile) or die "Can not open $distFile !!!\n";
	print {$distFh} join("\t", "refName", "maxMismatch", "input/match", "uniqReadN", "readN", (1..$maxReadLength)), "\n";
	close($distFh) or die "Can not close $distFile !!!\n";
}

# human miRNA mapping
my @refDB = qw(
	human_miRNA_mature
	human_miRNA_precursor
);

# each round maps only what the previous round left over:
# match() returns the unmatched-read file, which becomes the next round's input
for my $maxMismatch (0..1) {
	foreach my $refName (@refDB) {
		$readFile = match($readFile, $refName, $maxMismatch, $prefix, "T");
	}
}

# $readFile now names the reads that matched nothing in any round
countTotalDist($readFile, $prefix);
print "\n";


#================================ sub functions ================================
# map reads to each reference database (refDB)
sub match {
	# Map the reads in $readFile against one reference database with bowtie and
	# append the results to the sample's *.profile, *.feature and *.dist files.
	#
	# Parameters:
	#   $readFile     fasta file to map
	#   $refName      reference database name understood by getBowtieIndex()
	#   $maxMismatch  value passed to bowtie's -v option
	#   $prefix       output file prefix
	#   $reportAll    "T" => bowtie -a (all valid alignments), otherwise -k 1
	# Returns: name of the file holding the reads bowtie did not align
	#          ($prefix."_unMatch.fa").
	# Exits the program (status 1) when no index is known for $refName.
	my ($readFile, $refName, $maxMismatch, $prefix, $reportAll) = @_;
	
	my $index = getBowtieIndex($refName);
	my $matchReads = $prefix."_Match_tmp.fa";
	my $mismatchReads = $prefix."_unMatch_tmp.fa";
	my $unmatchFile = $prefix."_unMatch.fa";
	my $formatFile = $prefix.".".$refName.".mis_".$maxMismatch.".format";
	
	print "\n--- readFile: ".$prefix."\t"."matchType: ".$refName.".mis_".$maxMismatch."\n";
	
	# open empty files in case that there are no match (or no unmatch) reads at all !!!
	# NOTE(review): presumably bowtie only creates the --al/--un files when it has
	# reads to write into them -- confirm. Without an (empty) unmatched file the
	# rename below would fail and a stale *_unMatch.fa from the previous round
	# would be mapped again.
	foreach my $file ($matchReads, $mismatchReads, $formatFile) {
		open(my $fh, ">", $file) or die "Can not open $file !!!\n";
		close($fh);
	}
	
	if (!$index) {
		print "No bowtie index for refDB: $refName !!!\n";
		# this is an error, so report it through a non-zero exit status
		exit(1);
	}
	
	# output bowtie alignment results to a formatted file (not SAM file)
	# with 6 fields: readID, strand, matchID, offset, readSeq, mismatch
	# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
	# use -p to require multiple processors/cores
	# use "-a" to report all (best to worst) valid alignments per read,
	# or "-k 1" to report the best valid alignment per read
	my @reportMode = ($reportAll eq "T") ? ("-a") : ("-k", "1");
	my @bowtie = (
		"bowtie", "-v", $maxMismatch, $index, "-f", $readFile,
		"--al", $matchReads, "--un", $mismatchReads,
		@reportMode,
		"-p", "16", "--best", "--quiet", "--suppress", "6,7",
	);
	
	# run bowtie without a shell (list-form pipe) so that file names containing
	# spaces or shell metacharacters are passed through untouched, and copy its
	# standard output into the format file
	open(my $format, ">", $formatFile) or die "Can not open $formatFile !!!\n";
	if (open(my $align, "-|", @bowtie)) {
		while (my $line = <$align>) {
			print {$format} $line;
		}
		# a failed bowtie run was never fatal here; keep going but say so
		close($align) or warn "bowtie exited with status ".($? >> 8)." for ".$refName.".mis_".$maxMismatch." !!!\n";
	} else {
		warn "Can not run bowtie: $! !!!\n";
	}
	close($format) or die "Can not close $formatFile !!!\n";
	
	# output matched read info into *.profile for each sample
	countRead($matchReads, $formatFile, $refName, $maxMismatch, $prefix);
	
	# output matched feature info into *.feature for each sample
	countFeature($formatFile, $refName, $maxMismatch, $prefix);
	
	# output input and matched read length distribution into *.dist for each sample
	countDist($readFile, $matchReads, $refName, $maxMismatch, $prefix);
	
	# the unmatched reads become the input of the next round; temp files are dropped
	rename($mismatchReads, $unmatchFile) or die "Can not rename $mismatchReads to $unmatchFile: $! !!!\n";
	unlink($matchReads, $formatFile);
	
	return $unmatchFile;
}

# summarize matched reads
sub countRead {
	# Append one line per alignment in $formatFile to $prefix.".profile".
	#
	# Parameters:
	#   $matchReads   not used by this sub (kept so existing callers keep working)
	#   $formatFile   bowtie output with the tab-separated fields
	#                 readID, strand, matchID, offset, readSeq[, mismatch]
	#   $refName      written to column 1 of the profile
	#   $maxMismatch  written to column 2 of the profile
	#   $prefix       output file prefix
	# Profile columns: refName, maxMismatch, encoded read, matchID, readNum.
	# The encoded read is built by maskMismatch() and carries strand, offset and
	# mismatch positions (for isomiR discovery!!!).
	# Dies when either file cannot be opened.
	my ($matchReads, $formatFile, $refName, $maxMismatch, $prefix) = @_;
	
	my $profileFile = $prefix.".profile";
	open(my $format, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	open(my $profile, ">>", $profileFile) or die "Can not open $profileFile !!!\n";
	
	while (my $line = <$format>) {
		chomp $line;
		my ($readID, $strand, $matchID, $offset, $readSeq, $descriptors) = split(/\t/, $line);
		
		# blank or truncated lines carry no alignment; do not pass them on
		next unless defined($readSeq);
		
		# read head is "SN-countNumber": the part after the first "-" is the read count
		my $readNum = (split(/-/, $readID))[1];
		
		# maskMismatch() expects "strand+offset" followed by the optional
		# comma-separated mismatch descriptors
		my $mismatch = $strand.$offset;
		if ($descriptors) {
			$mismatch = $mismatch.",".$descriptors;
		}
		
		my $readSeqCode = maskMismatch($readSeq, $mismatch);
		print {$profile} $refName."\t".$maxMismatch."\t".$readSeqCode."\t".$matchID."\t".$readNum."\n";
	}
	
	close($format);
	# buffered write errors only show up when the handle is closed
	close($profile) or die "Can not close $profileFile !!!\n";
}

# mask (soft mask) mismatched nucleotides by converting letters (A, T, G, C, or N) to lower case (a, t, g, c, or n)
# and encode match strand (+/-), offset, and mismatch positions into the read sequence (for isomiR discovery!!!)
# output format: strand, offset, one ",position" per masked base (0-based, counted on the sequence as printed), then "|" and the masked sequence
# NOTE(review): an offset-padding scheme ("if offsetN > 8, then offsetN-readSeq; if offsetN <= 8, then add offsetN space symbols",
# with 8 = 23 (miRNA length) - 15 (cutoff for too short reads)) is described here but is not implemented by maskMismatch
# e.g, "-	hsa-mir-486-2	38	TACTCGGGGCAGCTCAGTACAGGA	22:C>A,23:G>T"
# => "-38,0,1|taCTCGGGGCAGCTCAGTACAGGA"
# e.g, "+	hsa-miR-574-5p	2	TGAGTGTGTGTGTGAGTG	0:A>T,2:T>A"
# => "+2,0,2|tGaGTGTGTGTGTGAGTG"
sub maskMismatch {
	# Soft-mask the mismatched bases of a read and prefix the read with its
	# alignment information.
	#
	# Parameters:
	#   $readSeq   read sequence as reported in the alignment line
	#   $mismatch  "strand+offset" optionally followed by comma-separated
	#              mismatch descriptors, e.g. "+2,0:A>T,2:T>A"
	# Returns: strand . offset . (",position" per masked base) . "|" . masked read,
	#          e.g. "+2,0,2|tGaGTGTGTGTGTGAGTG"
	my ($readSeq, $mismatch) = @_;
	
	# first field is strand+offset, every further field is one mismatch descriptor
	my ($offset, @descriptor) = split(/\,/, $mismatch);
	my $seqLength = length($readSeq);
	
	# peel the strand sign off the offset
	my $strand;
	if ($offset =~ s/^([+-])//) {
		$strand = $1;
	}
	
	# lower-case every mismatched position and append that position to the offset
	foreach my $k (0 .. $#descriptor) {
		my $position = 0;
		if ($strand eq "+") {
			# descriptors are taken front to back, position used as given
			$position = (split(/\:/, $descriptor[$k]))[0];
		} elsif ($strand eq "-") {
			# descriptors are taken back to front and the position is
			# mirrored onto the sequence as it is stored in $readSeq
			$position = (split(/\:/, $descriptor[$#descriptor - $k]))[0];
			$position = $seqLength - $position - 1;
		}
		substr($readSeq, $position, 1, lc(substr($readSeq, $position, 1)));
		$offset = $offset.",".$position;
	}
	
	return $strand.$offset."|".$readSeq;
}

# summarize matched features
sub countFeature {
	# Sum the read counts per matched reference entry and append the totals
	# to $prefix.".feature".
	#
	# Parameters:
	#   $formatFile   bowtie output; field 1 is the read ID ("SN-countNumber"),
	#                 field 3 is the matched reference ID
	#   $refName      reference database name
	#   $maxMismatch  mismatch level of this mapping round
	#   $prefix       output file prefix
	# Feature columns: "<refName>.mis_<maxMismatch>+", matchID, total readNum.
	# Rows are written sorted by matchID so that repeated runs give identical files.
	# Dies when either file cannot be opened.
	my ($formatFile, $refName, $maxMismatch, $prefix) = @_;
	
	my %readNumOf = ();
	my $matchType = $refName.".mis_".$maxMismatch."+";
	
	# get matched features from the format file
	open(my $format, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	while (my $line = <$format>) {
		chomp $line;
		my @field = split(/\t/, $line);
		
		# blank or truncated lines have no matchID; counting them would
		# create a bogus row with an empty ID
		next if @field < 3;
		
		my $readNum = (split(/-/, $field[0]))[1];
		$readNumOf{$field[2]} += $readNum;
	}
	close($format);
	
	# output matchType, matchID, and total readNum
	# into a united file (*.feature) for each sample
	my $featureFile = $prefix.".feature";
	open(my $feature, ">>", $featureFile) or die "Can not open $featureFile !!!\n";
	foreach my $matchID (sort keys %readNumOf) {
		print {$feature} $matchType."\t".$matchID."\t".$readNumOf{$matchID}."\n";
	}
	# buffered write errors only show up when the handle is closed
	close($feature) or die "Can not close $featureFile !!!\n";
}


# get matched read length distribution
sub countDist {
	# Append the read-length distribution of the input reads and of the matched
	# reads of one mapping round to $prefix.".dist", and print the match rates.
	#
	# Parameters:
	#   $readFile     fasta file that was mapped in this round
	#   $matchReads   fasta file with the reads that matched
	#   $refName      reference database name (column 1 of both rows)
	#   $maxMismatch  mismatch level (column 2 of both rows)
	#   $prefix       output file prefix
	# Uses the file-level $maxReadLength for the number of length columns.
	# Each fasta header is expected as ">SN-countNumber"; the sequence must be
	# on a single line.
	# Dies when a file cannot be opened.
	my ($readFile, $matchReads, $refName, $maxMismatch, $prefix) = @_;
	
	# Tally one collapsed fasta file.
	# Returns (number of unique reads, total read count, per-length counts).
	# NOTE: a sequence longer than $maxReadLength grows the list past the
	# number of columns announced in the *.dist header.
	my $tally = sub {
		my ($fastaFile) = @_;
		
		my $uniqReadN = 0;
		my $readN = 0;
		my @readDist = (0) x $maxReadLength;
		my $count = 0;
		
		open(my $fasta, "<", $fastaFile) or die "Can not open $fastaFile !!!\n";
		while (my $line = <$fasta>) {
			chomp $line;
			if ($line =~ /^>/) {
				# the count after the "-" also applies to the sequence line that follows
				$count = (split(/-/, $line))[1];
				$uniqReadN++;
				$readN += $count;
			} else {
				my $length = length($line); # NOT allow multiple lines for readSeq
				# a blank line would otherwise be booked on index -1,
				# i.e. on the longest length column
				$readDist[$length-1] += $count if $length > 0;
			}
		}
		close($fasta);
		
		return ($uniqReadN, $readN, @readDist);
	};
	
	my ($inputUniqReadN, $inputReadN, @inputReadDist) = $tally->($readFile);
	my ($matchUniqReadN, $matchReadN, @matchReadDist) = $tally->($matchReads);
	
	# output input and matched read length distribution
	my $distFile = $prefix.".dist";
	open(my $dist, ">>", $distFile) or die "Can not open $distFile !!!\n";
	print {$dist} join("\t", $refName, $maxMismatch, "input", $inputUniqReadN, $inputReadN, @inputReadDist)."\n";
	print {$dist} join("\t", $refName, $maxMismatch, "match", $matchUniqReadN, $matchReadN, @matchReadDist)."\n";
	close($dist) or die "Can not close $distFile !!!\n";
	
	# output match rate; an empty input file (nothing left to map) counts as 0%
	# instead of aborting with a division by zero
	my $uniqRate = $inputUniqReadN ? 100*$matchUniqReadN/$inputUniqReadN : 0;
	my $realRate = $inputReadN ? 100*$matchReadN/$inputReadN : 0;
	printf("    unique matchRate: %.2f%%", $uniqRate);
	print "\t\t";
	printf("real matchRate: %.2f%%", $realRate);
	print "\n";
}

# get total match read length distribution
sub countTotalDist {
	# Append the overall ("Total") input and match rows to $prefix.".dist" and
	# print the overall match rates.
	#
	# Parameters:
	#   $readFile  fasta file with the reads that are still unmatched after the
	#              last mapping round
	#   $prefix    output file prefix
	# The overall input numbers are taken from the second line of the *.dist
	# file (the first row below the header); matched = input - unmatched.
	# Uses the file-level $maxReadLength for the number of length columns.
	# Dies when a file cannot be opened.
	my ($readFile, $prefix) = @_;
	
	my $distFile = $prefix.".dist";
	
	# get final unmatched original read
	my $unmatchUniqReadN = 0;
	my $unmatchReadN = 0;
	my @unmatchReadDist = (0) x $maxReadLength;
	my $count = 0;
	
	open(my $unmatch, "<", $readFile) or die "Can not open $readFile !!!\n";
	while (my $line = <$unmatch>) {
		chomp $line;
		if ($line =~ /^>/) {
			# header is ">SN-countNumber"; the count applies to the next sequence line
			$count = (split(/-/, $line))[1];
			$unmatchUniqReadN++;
			$unmatchReadN += $count;
		} else {
			my $length = length($line);
			# a blank line would otherwise be booked on index -1
			$unmatchReadDist[$length-1] += $count if $length > 0;
		}
	}
	close($unmatch);
	
	# get initial input original read: second line of the *.dist file
	my $inputUniqReadN = 0;
	my $inputReadN = 0;
	my @inputReadDist = (0) x $maxReadLength;
	
	open(my $distIn, "<", $distFile) or die "Can not open $distFile !!!\n";
	my $lineNo = 0;
	while (my $line = <$distIn>) {
		if ($lineNo == 1) {
			chomp $line;
			my @field = split(/\t/, $line);
			# a short row leaves holes in the slice; count them as 0
			$inputUniqReadN = defined($field[3]) ? $field[3] : 0;
			$inputReadN = defined($field[4]) ? $field[4] : 0;
			@inputReadDist = map { defined($_) ? $_ : 0 } @field[5..($maxReadLength+4)];
			last;
		}
		$lineNo++;
	}
	close($distIn);
	
	my $matchUniqReadN = $inputUniqReadN - $unmatchUniqReadN;
	my $matchReadN = $inputReadN - $unmatchReadN;
	my @matchReadDist = map { $inputReadDist[$_] - $unmatchReadDist[$_] } (0..($maxReadLength-1));
	
	# output total input and matched read length distribution
	open(my $distOut, ">>", $distFile) or die "Can not open $distFile !!!\n";
	print {$distOut} join("\t", "Total", "*", "input", $inputUniqReadN, $inputReadN, @inputReadDist)."\n";
	print {$distOut} join("\t", "Total", "*", "match", $matchUniqReadN, $matchReadN, @matchReadDist)."\n";
	close($distOut) or die "Can not close $distFile !!!\n";
	
	# output match rate; with no input reads report 0% instead of dividing by zero
	my $uniqRate = $inputUniqReadN ? 100*$matchUniqReadN/$inputUniqReadN : 0;
	my $realRate = $inputReadN ? 100*$matchReadN/$inputReadN : 0;
	printf("\n--- total unique matchRate: %.2f%%", $uniqRate);
	print "\t";
	printf("total real matchRate: %.2f%%", $realRate);
	print "\n";
}

# get bowtie index for each reference database 
sub getBowtieIndex {
	# Look up the bowtie index path of a reference database.
	#
	# Parameter: $refName  reference database name
	# Returns: the file-level $dataDIR followed by the database's
	#          sub-directory, or "" when $refName is not a known database.
	my ($refName) = @_;
	
	# human miRNA/lncRNA/mRNA/ncRNA/DNA
	my %subDirOf = (
		"human_miRNA_mature"    => "/miRBase/mature_hsa",
		"human_miRNA_precursor" => "/miRBase/hairpin_hsa",
	);
	
	return "" unless exists($subDirOf{$refName});
	return $dataDIR.$subDirOf{$refName};
}