#!/usr/bin/perl -w

#===============================================================================
# PisomiR [ver1.5]:		Pipeline for small RNA sequencing (sRNA-Seq) data analysis
# update [12/17/2014]:	specific for miRNA profiling and isomiR discovery for rat
# update [12/08/2014]:	for summarization/aggregation purpose, only count matchTypes with (+) 
# 						strand for RNAs and matchTypes with both (+/-) strands for non-RNAs
# Update [11/25/2014]:	If multiple alignments of one read are all reverse (-) strand,
#						do NOT filter out this read (keep it for next step) for RNAs
#						although count the first reverse (-) alignment and
#						report all these reverse (-) strand alignments
# Update [11/21/2014]:	"forward starnd preference" for RNAs. For multiple alignments
#						of each read, report all, while only count the first
#						alignment with forward (+) strand. If all the alignments
#						are mapped to reverse (-) strand of reference sequences,
#						then only count the first reverse (-) alignment.
# Update [11/13/2014]:	first, map reads directly to precursor miRNA sequences,
#						annotated with mature miRNA location info - |matchID|start:end|,
#						second, reassign reads to mature miRNAs using these location info
#						after alignment,
#						finally, summerize read counts based on mature miRNA matchID.
# Update [09/02/2014]:	specific for isomiR discovery
#
# ARGV[0]: maxReadLength
# ARGV[1]: processed fasta file (adapter-trimmed and duplicate-read-collapsed)
#          read head: >SN-countNumber, e.g., >6-128
# this version use formatted file as bowtie output
# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
#===============================================================================

use strict;
use warnings;

my $maxReadLength = $ARGV[0];
my $readFile = $ARGV[1];
my $prefix = $readFile;
$prefix =~ s/\.fa$//;
my $refName;
my $maxMismatch;

# reference database for mapping
my $dataDIR = "/var/www/html/database/bowtie";

# open or renew output files
open(Profile, ">", $prefix.".profile");
close(Profile);
open(Profile, ">", $prefix.".feature");
close(Profile);
open(Dist, ">", $prefix.".dist");
print Dist "refName\t"."maxMismatch\t"."input/match\t"."uniqReadN\t"."readN\t";
print Dist join("\t", (1...$maxReadLength)), "\n";
close(Dist);

# cloning sequence filtering
my @refDB = qw(
	nt_CV
);
$maxMismatch = 0;
foreach $refName (@refDB) {
	$readFile = match($readFile, $refName, $maxMismatch, $prefix, "T");
}

# rat miRNA mapping
@refDB = qw(
	rat_miRNA
	rat_miRNA_sub
);
for $maxMismatch (0..2) {
	foreach $refName (@refDB) {
		$readFile = match($readFile, $refName, $maxMismatch, $prefix, "T");
	}
}

# virus, plant, and all miRNA mapping
@refDB = qw(
	virus_miRNA
	plant_miRNA
	all_miRNA
	all_miRNA_sub
);
$maxMismatch = 0;
foreach $refName (@refDB) {
	$readFile = match($readFile, $refName, $maxMismatch, $prefix, "T");
}

countTotalDist($readFile, $prefix);
print "\n";


#================================ sub functions ================================
# map reads to each reference database (refDB)
sub match {
	my ($readFile, $refName, $maxMismatch, $prefix, $reportAll) = @_;
	
	my $matchType = $refName."\.mis_".$maxMismatch;
	my $matchReads = $prefix."\_Match_tmp.fa";
	my $mismatchReads = $prefix."\_unMatch_tmp.fa";
	my $readNext = $prefix."\_unMatch.fa";
	my $formatFile = $prefix.".".$matchType."\.format";
	
	print "\n--- readFile: ".$prefix."\t"."matchType: ".$matchType."\n";
	
	# check if bowtie index exists
	my $index = getBowtieIndex($refName);
	if ( !($index) ) {
		print "No bowtie index for refDB: $refName !!!\n";
		exit();
	}
	
	# check bowtie index type ("small" or "large") and index file integrity (total 6 files)
	my $indexType = "\ ";		# default is "small" index
	if ( (-e $index."\.1\.ebwt") and (-e $index."\.2\.ebwt") and (-e $index."\.3\.ebwt") and (-e $index."\.4\.ebwt") and (-e $index."\.rev\.1\.ebwt") and (-e $index."\.rev\.2\.ebwt")) {
		$indexType = "\ ";		# use "small" index for bowtie alignments
	} elsif ( (-e $index."\.1\.ebwtl") and (-e $index."\.2\.ebwtl") and (-e $index."\.3\.ebwtl") and (-e $index."\.4\.ebwtl") and (-e $index."\.rev\.1\.ebwtl") and (-e $index."\.rev\.2\.ebwtl")) {
		$indexType = "\--large-index";
	} else {					# use "large" index for bowtie alignments
		print "Incomplete or no bowtie index for refDB: $refName !!!\n";
		exit();
	}
	
	# check bowtie report type ("report the first one" or "report all")
	my $reportType = "-k 1";	# default is "report the first one"
	if ( $reportAll eq "T" ) {
		$reportType = "-a";		# use "-a" to report all valid alignments per read
	} else {
		$reportType = "-k 1";	# use "-k 1" to report the first valid alignment per read
	}
	
	# multiple threads for bowtie alignment
	my $multiThreads = "-p 15"; # e.g., for hex-core processors/CPUs (2 logical cores per physical), total 16 cores, always leave one core for communication
	
	# check refDB type (RNA, CDS or DNA) to decide whether to count reads with only FW alignments
	# "--norc" represent no reverse-complement (RC) alignment
	# "--nofw" represent no foward (FW) strand alignment
	my $orientation = "\ ";		# default is no orientation preference
	if ( $refName =~ /RNA/ ) {
		$orientation = "--norc";# prefer foward (+) strand alignments
	} else {
		$orientation = "\ ";	# no preference for alignment orientation (+/-)
	}
		
	# open an empty file in case that there are no match read at all !!!
	open(Match, ">", $matchReads);
	close(Match);
	open(Format, ">", $formatFile);
	close(Format);
		
	# output bowtie alignment results to a formatted file (not SAM file)
	# with 6 fields: readID, strand, matchID, offset, readSeq, mismatch
	# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
	system("bowtie -v $maxMismatch $orientation $indexType $index -f $readFile --al $matchReads --un $mismatchReads $reportType $multiThreads --quiet --suppress 6,7 > $formatFile");
		
	# reassign reads to mature miRNAs using their location info on precursor miRNA sequences
	if ( $refName =~ /miRNA/ ) {
		reassignRead($formatFile, $refName, $maxMismatch, $prefix);
	}
	
	# output matched read sequence profile (*.profile) and matched feature file (*.feature) for each sample
	countRead($matchReads, $formatFile, $refName, $maxMismatch, $prefix);
	
	# output input and matched read length distribution into *.dist for each sample
	countDist($readFile, $matchReads, $refName, $maxMismatch, $prefix);
	
	system("mv $mismatchReads $readNext");
		
	# only report (Not count) reverse strand alignments for RNAs for future discovery
	# This could be very important, like for transcriptional regulation!!!
	if ( $refName =~ /RNA/ ) {
		$orientation = "--nofw";	# prefer reverse complement (-) strand alignments
#		$reportType = "-k 1";		# only report the first alignment
		system("bowtie -v $maxMismatch $orientation $indexType $index -f $readNext --al $matchReads $reportType $multiThreads --quiet --suppress 6,7 > $formatFile");
		
		# reassign reads to mature miRNAs using their location info on precursor miRNA sequences
		if ( $refName =~ /miRNA/ ) {
			reassignRead($formatFile, $refName, $maxMismatch, $prefix);
		}
		
		# output matched read sequence profile (*.profile) and matched feature file (*.feature) for each sample
		countRead($matchReads, $formatFile, $refName, $maxMismatch, $prefix);
	}
	
	system("rm $matchReads");
	system("rm $formatFile");
	return $readNext;
}


# reassign reads to annotated matchIDs instead of mapped matchIDs
# format of annotated reference sequence headID:
#
# 'matchID_0|0:length||matchID_1|start_1:end_1||matchID_2|start_2:end_2|...'
#
# if over 60% sequence of a read overlaps with zone-(start_i, end_i), then assign this read to matchID_i
# otherwise, it'll be assigned to precursor miRNA - matchID_0
# annotated info will be removed after read reassigning, to keep consistency with traditional alignemnt results (.format)
sub reassignRead {
	my ($formatFile, $refName, $maxMismatch, $prefix) = @_;

	my $readID;
	my $strand;
	my $matchID;
	my $offset;
	my $readSeq;
	
	my $tempFormatFile = $prefix."\.".$refName."\.mis_".$maxMismatch."\_temp.format";
	
	open(Format, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	open(Temp, ">", $tempFormatFile);
	while (<Format>) {
		chomp;
		my @array = split(/\t/, $_);
		$readID = $array[0];
		$strand = $array[1];
		$matchID = $array[2];
		$offset = $array[3];
		$readSeq = $array[4];
		my $readLength = length($readSeq);
		my $readStart = $offset;
		my $readEnd = $offset + $readLength - 1;
		
		# judge if a matched read belongs to mature -5p, or mature -3p, based on location info
		my @matchHead = split(/\|\|/, $matchID);
		my @annoMatchID = split(/\|/, $matchHead[0]);
		$matchID = $annoMatchID[0];
		my $num = scalar(@matchHead);
		for my $i (1..($num - 1)) {
			@annoMatchID = split(/\|/, $matchHead[$i]);
			my $matchID_i = $annoMatchID[0];
			my @location = split(/\:/, $annoMatchID[1]);
			my $start_i = $location[0];
			my $end_i = $location[1];
			# basicaly, if over 60% read sequence mapped to -5p or -3p mature miRNA zone, then it'll be assigned to that mature miRNA 
			if ( seqOverlap($readStart, $readEnd, $start_i, $end_i) >= round(0.6 * $readLength) ) {
				$matchID = $matchID_i;
				last;
			}
		}
		$array[2] = $matchID;
		print Temp join("\t", @array)."\n";
	}
	close(Format);
	close(Temp);
	system("rm $formatFile");
	system("mv $tempFormatFile $formatFile");
}

# calculate overlap between two sequences defined by start_i and end_i
sub seqOverlap {
	my ($readStart, $readEnd, $start_i, $end_i) = @_;
	
	if ($start_i < $readStart) {
		$start_i = $readStart;
	}
	if ($end_i > $readEnd) {
		$end_i = $readEnd;
	}
	if ( $start_i <= $end_i ) {
		return ($end_i - $start_i + 1);
	} else {
		return 0;
	}
}

# round a positive number to integer
sub round {
	my ($number) = @_;
	my $firstNumAfterPoint = int(10 * ($number - int($number)));
	if ( $firstNumAfterPoint < 5 ) {
		return int($number);		
	} else {
		return int($number) + 1;	
	}
}

# summarize matched reads
sub countRead {
	my ($matchReads, $formatFile, $refName, $maxMismatch, $prefix) = @_;
	
	my %read = ();
	my %match = ();
	my %featurePro = ();
	my %feature = ();
	my $matchType = $refName."\.mis_".$maxMismatch;
	my $readID;
	my $readNum;
	my $strand;
	my $matchID;
	my $offset;
	my $readSeq;
	my $mismatch;
	my $matchNum;
	
	# get matched original reads from fasta file
	# bowtie will report original read sequence if orientation is forward (+) strand, while report reverse-complemented sequence if orientation is (-) strand
	# this is an extension port for 5'-end (left) or 3'-end (right) trimming
	# using bowtie "-5/--trim5 <int>" or "-3/--trim3 <int>"
	# http://bowtie-bio.sourceforge.net/manual.shtml#input
	open(Match, "<", $matchReads) or die "Can not open $matchReads !!!\n";
	while (<Match>) {
		chomp;
		if(/^>/){
			$readID = $_;
			$readID =~ s/^>//;
		}else{
			$readSeq = $_;
			$read{$readID} = $readSeq;
		}
	}
	close(Match);

	# get matched features (could be multiple mapping) from each format file
	open(Format, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	while (<Format>) {
		chomp;
		my @array = split(/\t/, $_);
		$readID = $array[0];	# ID-readNum
		$strand = $array[1];	# reference strand aligned to ("+" for forward, "-" for reverse)
		$matchID = $array[2];	# entry ID of reference sequence from refDB (e.g., hsa-miR-342-5p)
		$offset = $array[3];	# 0-based offset of read sequence on forward reference sequence
		$readSeq = $array[4];	# read sequence (or reverse-complemented if orientation is "-")
		if ( $array[5] ) {		# mismatch info - CIGAR string, e.g., "22:C>A,23:G>T" (1-based offsets of mismatches on forward read sequence)
			$offset = $offset."\,".$array[5];
		}
		
		# if an alignment has the situatuion ( mismatch < maxMismatch ), then pass it (since it should be already reported in previous steps)
		@array = split(/\,/, $offset);
		my $num = scalar(@array);
		if ( ($num - 1) < $maxMismatch) {
			next;
		}
		
		# alignment strand coding
		$matchID = $matchID."\|".$strand."\|";
		
		# if an alignment has the situatuion ( mismatch == maxMismatch ),  then report it
		$match{$readID}{$matchID} = $offset;
		
		# calculate feature frequency (multiple mapping - multiple counting) for probabilistic model
		my @head = split(/\-/, $readID);
		$readNum = $head[1];
		if ( exists $featurePro{$matchID} ) {
			$featurePro{$matchID} += $readNum;
		} else {
			$featurePro{$matchID} = $readNum;
		}
	}
	close(Format);
	
	# output matchType (refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+"), readID, readSeq, 
	# matchID (first mapped), offset (readStart + mismatch), matchNum, and multiMatch	
	# into a united a united file (*.profile) for each sample
	open(Profile, ">>", $prefix.".profile");
	foreach $readID (keys %match) {
		my @head = split(/\-/, $readID);
		$readNum = $head[1];
		$readSeq = $read{$readID}; # get matched original read
		
		# rank feature (matchID) by feature frequency (probability), and then by alphabet
		my @multiMatch = keys % { $match{$readID} };
		@multiMatch = sort @multiMatch;
		@multiMatch = sort { $featurePro{$b} <=> $featurePro{$a} } @multiMatch;
		
		# only report top-ranked ten matchIDs if multiple matches > 10
		$matchNum = scalar(@multiMatch);
		if ( $matchNum > 10) {
			@multiMatch = @multiMatch[0..9];
		}
		
		# only count the first-ranked matchID, while report other top-ranked nine matchIDs
		$matchID = shift @multiMatch;
		$offset = $match{$readID}{$matchID};
		
		# alignment strand decoding
		if ( $matchID =~ /\|\+\|$/ ) {
			$strand = "+";
		} elsif ( $matchID =~ /\|\-\|$/ ) {
			$strand = "\-";
		} else {
			die "No strand info warning: $matchID !!!";
		}
		$matchID =~ s/\|(?:\+|\-)\|//;

		# report other top-ranked nine matchIDs
		my $num = scalar(@multiMatch);
		foreach my $i (0..($num - 1)) {
			my $otherOffset = $match{$readID}{$multiMatch[$i]};
			$multiMatch[$i] = $multiMatch[$i].$otherOffset; # offset coding for other matches
		}
		
		print Profile $matchType.$strand."\t".$readID."\t".$readSeq."\t".$matchID."\t".$offset."\t".$matchNum."\t".join("\|\.\|", @multiMatch)."\n";
		
		# calculate feature count (multiple mapping - best one counting) based on probabilistic model
		if ( exists $feature{$strand}{$matchID} ) {
			$feature{$strand}{$matchID} += $readNum;
		} else {
				$feature{$strand}{$matchID} = $readNum;
		}
	}
	close(Profile);
	
	# output matchType (refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+")
	# matchID (first mapped), and subtotal readNum	
	# into a united file (*.feature) for each sample
	open(Feature, ">>", $prefix.".feature");
	foreach my $strand ("\+", "\-") {
		my @allMatchID = keys % { $feature{$strand} };
		@allMatchID = sort @allMatchID;
		foreach $matchID (@allMatchID){
			$readNum = $feature{$strand}{$matchID};
			print Feature $matchType.$strand."\t".$matchID."\t".$readNum."\n";;
		}
	}
	close(Feature);
}

# get matched read length distribution
sub countDist {
	my ($readFile, $matchReads, $refName, $maxMismatch, $prefix) = @_;
	
	my $inputUniqReadN = 0;
	my $inputReadN = 0;
	my $matchUniqReadN = 0;
	my $matchReadN = 0;

	my @inputReadDist = (0) x $maxReadLength;
	my @matchReadDist = (0) x $maxReadLength;
	
	my @read = ();
	my $i;
	
	# get input original read
	open(Input, "<", $readFile) or die "Can not open $readFile !!!\n";
	while (<Input>) {
		chomp;
		if ( /^>/ ) {
			@read = split(/-/, $_);
			$inputUniqReadN++;
			$inputReadN += $read[1];
		} else {
			$i = length($_); # NOT allow multiple lines for readSeq
			if ( $i > 0 ) {
				$inputReadDist[$i-1] += $read[1];
			}
		}
	}
	close(Input);
	
	# get matched original read
	open(Match, "<", $matchReads) or die "Can not open $matchReads !!!\n";
	while (<Match>) {
		chomp;
		if ( /^>/ ) {
			@read = split(/-/, $_);
			$matchUniqReadN++;
			$matchReadN += $read[1];
		} else {
			$i = length($_);
			$matchReadDist[$i-1] += $read[1];
		}
	}
	close(Match);
	
	# output input and matched read length distribution
	open(Dist, ">>", $prefix.".dist");
	print Dist $refName."\t".$maxMismatch."\t"."input\t".$inputUniqReadN."\t".$inputReadN."\t";
	print Dist join("\t", @inputReadDist)."\n";
	print Dist $refName."\t".$maxMismatch."\t"."match\t".$matchUniqReadN."\t".$matchReadN."\t";
	print Dist join("\t", @matchReadDist)."\n";
	close(Dist);
	
	# output match rate
	printf("    unique matchRate: %.2f%%", 100*$matchUniqReadN/$inputUniqReadN);
	print "\t\t";
	printf("real matchRate: %.2f%%", 100*$matchReadN/$inputReadN);
	print "\n";
}

# get total match read length distribution
sub countTotalDist {
	my ($readFile, $prefix) = @_;
	
	my $inputUniqReadN = 0;
	my $inputReadN = 0;
	my $unmatchUniqReadN = 0;
	my $unmatchReadN = 0;
	my $matchUniqReadN = 0;
	my $matchReadN = 0;

	my @inputReadDist = (0) x $maxReadLength;
	my @unmatchReadDist = (0) x $maxReadLength;
	my @matchReadDist = (0) x $maxReadLength;
	
	my @read = ();
	my $i;
	
	# get final unmatched original read
	open(Unmatch, "<", $readFile) or die "Can not open $readFile !!!\n";
	while (<Unmatch>) {
		chomp;
		if ( /^>/ ) {
			@read = split(/-/, $_);
			$unmatchUniqReadN++;
			$unmatchReadN += $read[1];
		} else {
			$i = length($_);
			$unmatchReadDist[$i-1] += $read[1];
		}
	}
	close(Unmatch);
	
	# get initial input original read
	open(Dist, "<", $prefix.".dist");
	$i = 0;
	while (<Dist>) {
		chomp;
		my @array = split(/\t/, $_);
		if ( $i == 1 ) {
			$inputUniqReadN = $array[3];
			$inputReadN = $array[4];
			@inputReadDist = @array[5..($maxReadLength+4)];
		}
		$i++;
	}
	close(Dist);
	
	$matchUniqReadN = $inputUniqReadN - $unmatchUniqReadN;
	$matchReadN = $inputReadN - $unmatchReadN;
	@matchReadDist = map { $inputReadDist[$_] - $unmatchReadDist[$_] } (0..($maxReadLength-1));
	
	# output total input and matched read length distribution
	open(Dist, ">>", $prefix.".dist");
	print Dist "Total\t"."*\t"."input\t".$inputUniqReadN."\t".$inputReadN."\t";
	print Dist join("\t", @inputReadDist)."\n";
	print Dist "Total\t"."*\t"."match\t".$matchUniqReadN."\t".$matchReadN."\t";
	print Dist join("\t", @matchReadDist)."\n";
	close(Dist);
	
	# output match rate
	printf("\n--- total unique matchRate: %.2f%%", 100*$matchUniqReadN/$inputUniqReadN);
	print "\t";
	printf("total real matchRate: %.2f%%", 100*$matchReadN/$inputReadN);
	print "\n";
}

# get bowtie index for each reference database 
sub getBowtieIndex {
	my ($refName) = @_;
	
	my $subDIR = "";
	# human miRNA/lncRNA/RNA/ncRNA/DNA
	if ( $refName eq "rat_miRNA" ) {
		$subDIR = "/miRBase/hairpin_rno_anno";
	} elsif ( $refName eq "rat_miRNA_sub" ) {
		$subDIR = "/miRBase/hairpin_rno_sub_anno";
	}
	
	if ( $subDIR eq "" ) {
		return "";
	} else {
		return $dataDIR.$subDIR;
	}
}