#!/usr/bin/perl -w

#===============================================================================
# map mature miRNA sequences to precursor miRNA sequences in order to
# get macth location and matchID information of mature miRNAs on
# corresponding miRNA precursors, and then assemble those information into
# an annotated miRNA hairpin reference database (hairpin_anno.fna).
#
# ARGV[0]: species (e.g., 'hsa', 'mmu', or 'rno') or 'all'
# ARGV[1]: "--compatibleName" or "--unifiedName" (default)
# also output warning info for inconsistency in the file (*.warning)
# orginal FATSA files (.fa) are downloaded from:
# http://www.mirbase.org/
#===============================================================================

use strict;
use warnings;

my $suffix = $ARGV[0];
my $option_01 = $ARGV[1];
if ( !$option_01 ) {
	$option_01 = "--unifiedName";
}
my $index = "hairpin";
my $prefix = "mature";

if ($suffix) {
	print "=== assemble annotated miRNA hairpin reference database for $suffix ...\n\n";
} else {
	die "   need a parameter for species (e.g., 'hsa', 'mmu', or 'rno') or for 'all' !!!\n\n";
}

if ( $suffix eq "all" ) {
	# transform 'U' to 'T' on RNA sequences
	system("rna2dna.pl hairpin.fa hairpin.fna");
	system("rna2dna.pl mature.fa mature.fna");
} else {
	# retrive miRNAs from mature and hairpin miRNAs for specific species
	system("selectSpecies.pl hairpin.fna $suffix");
	system("selectSpecies.pl mature.fna $suffix");
	$index = $index."\_".$suffix;
	$prefix = $prefix."\_".$suffix;
}

# build bowtie index for hairpin reference database
system("bowtie-build $index\.fna $index > bowtieIndexBuild.log");

my $maxMismatch = 0;
my $readFile = $prefix."\.fna";
my $formatFile = $prefix."\.format";
my %match;
my %matchSeq;
my %matchDes;
my %matchReadSeq;

# output bowtie alignment results to a formatted file (not SAM file)
# with 6 fields: readID, strand, matchID, offset, readSeq, mismatch
# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
# use -p to require multiple processors/cores
# use "-a" to report all (best to worst) valid alignments per read
system("bowtie -v $maxMismatch $index -f $readFile -a -p 16 --best --quiet --suppress 6,7 > $formatFile");

# read mature miRNA alignment file (map to precursor miRNAs)
open(In, "<", $formatFile) or die "can't open file - $formatFile";
while (<In>) {
	chomp;
	my @array = split(/\t/, $_);
	my $readID = $array[0];
	my @head = split(/\ /, $readID);
	$readID = $head[0];
	
	my $strand = $array[1];
	my $matchID = $array[2];
	my $start = $array[3];
	my $readSeq = $array[4];
	my $end = $start + length($readSeq) - 1;
	
	my $readIDFam = $readID;
	$readIDFam =~ s/-let-//;
	$readIDFam =~ s/-miR-//;
	$readIDFam =~ s/-miR//;
	@head = split(/\-/, $readIDFam);
	$readIDFam = $head[0];
	
	my $matchIDFam = $matchID;
	$matchIDFam =~ s/-let-//;
	$matchIDFam =~ s/-mir-//;
	$matchIDFam =~ s/-MIR//;
	@head = split(/\-/, $matchIDFam);
	$matchIDFam = $head[0];
	
	if ( ( $strand eq "\+") and ($readIDFam eq $matchIDFam) ) {
		$match{$matchID}{$start} = $readID."\|".$start."\:".$end."\|";
		$matchReadSeq{$matchID}{$start} = $readSeq;
	} 
}
close(In);

# read precursor micorRNA sequence file
my $inputFile = $index."\.fna";
open(In, "<", $inputFile) or die "can't open file - $inputFile";
my $seq = "";
my $lastMatchID = "";
while (<In>) {
	chomp;
	if (/^>/) { # read sequence head
		my @head = split(/\ /, $_);
		my $matchID = shift @head;
		$matchID =~ s/^>//;
		my $description = join("\ ", @head);
		$matchDes{$matchID} = $description;
		if ($seq ne "") {
			my $length = length($seq);
			$matchSeq{$lastMatchID} = $seq;
			$seq = "";
		}
		$lastMatchID = $matchID;
	} else {
		$seq = $seq.$_;
	}
}
if ($seq ne "") {
	my $length = length($seq);
	$matchSeq{$lastMatchID} = $seq;
	$seq = "";
}
close(In);

# write precursor miRNA and annotate the corresponding macthID and match location of mature miRNAs
my $outputFile = $index."\_anno.fna";
my $outputSubFile = $index."\_sub_anno.fna"; # seperate out "-1,-2, -12" precursor miRNAs to aviod multiple mapping
my $warningFile = $index."\.warning";
open(Out, ">", $outputFile);
open(Outsub, ">", $outputSubFile);
open(Warn, ">", $warningFile);
my @allMatchID = keys %matchSeq;
@allMatchID = sort @allMatchID;
my $description;
foreach my $matchID (@allMatchID) {
	my $sign = 0;
	$seq = $matchSeq{$matchID};
	my $length = length($seq);
	my $headID = $matchID."\|0:".($length-1)."\|";
	my $readSeq = "\-" x $length;
	if ( exists($match{$matchID}) ) {
		my @allStart = keys % { $match{$matchID} };
		@allStart = sort {$a <=> $b} @allStart;
		
		if ( scalar(@allStart) == 2 ) { # has both -5p and -3p mature miRNAs
			foreach my $start (@allStart) {
				$headID = $headID."\|".$match{$matchID}{$start};
			}
		} elsif ( scalar(@allStart) == 1 ) { # has either -5p or -3p mature miRNAs
			my $start = $allStart[0];
			my @array = split(/\|/, $match{$matchID}{$start});
			my @location = split(/\:/, $array[1]);
			my $end = $location[1];
			
			# calculate half point
			my $half;
			if ( ($length % 2) == 0 ) {
				$half = int($length / 2) - 1;
			} else {
				$half = int($length / 2);
			}
			
			# calculate mirror points
			my $newStart = $length - $end - 1;
			my $newEnd = $length - $start - 1;
			my $matureID = $matchID;
			# change precusor miRNA ID to mature miRNA ID
			$matureID =~ s/mir/miR/;
			$matureID =~ s/MIR/miR/;
			
			# complement another (potential) mature miRNA, based on "Mirroring Principle" - mature -5p and -3p miRNAs normally are on mirroring positions
			if ( $end <= $half ) { # only has -5p mature miRNA
				$headID = $headID."\|".$match{$matchID}{$start}."\|".$matureID."-3P"."\|".$newStart."\:".$newEnd."\|";
			} elsif ( $start > $half ) { # only has -3p mature miRNA
				$headID = $headID."\|".$matureID."-5P"."\|".$newStart."\:".$newEnd."\|\|".$match{$matchID}{$start};
			} else { # difficult to judge if it has -5p or -3p mature miRNA 
				$headID = $headID."\|".$match{$matchID}{$start};
				$sign = 1;
			}
		} else { # abnormal situations, never happened for 'hsa'
			foreach my $start (@allStart) {
				$headID = $headID."\|".$match{$matchID}{$start};
			}
			$sign = 3;
		}
		
		foreach my $start (@allStart) { # align mapped mature miRNAs to precursor mRNA sequence
			my @array = split(/\|/, $match{$matchID}{$start});
			my @location = split(/\:/, $array[1]);
			my $start = $location[0];
			my $end = $location[1];
			substr($readSeq, $start, ($end - $start + 1)) = $matchReadSeq{$matchID}{$start};
		}
	}
	
	
	# use FUTURE mature miRNA naming system (miR-XXX-1-5p and miR-XXX-2-5p), especially for isomiR discovery
	# other than miRBase current mature miRNA naming system (miR-XXX-5p and miR-XXX-2-5p)
	if ( $option_01 eq "--unifiedName" ) {
		$headID = changeID($headID);
	} elsif ( $option_01 eq "--compatibleName" ) {
		# do NOT change microRNA name (ID)!!!
	}
	
	# use description information for precusor miRNA
	$description = $matchDes{$matchID};
	
	
	# seperate sub precursor miRNAs (i.e., -2, -3, ... , -99) out from main precursor miRNAs (i.e., -1 or no)
	if ( ($headID =~ /^[a-zA-Z]+\-[a-zA-Z]+\-[0-9]+[a-z]?\-[2-9]/) or ($headID =~ /^[a-zA-Z]+\-[a-zA-Z]+\-[0-9]+[a-z]?\-[1-9][0-9]+/) ) {
		print Outsub "\>".$headID."\ ".$description."\n";
#		print Outsub $readSeq."\n";
		print Outsub $seq."\n";
	} else {
		print Out "\>".$headID."\ ".$description."\n";
#		print Out $readSeq."\n";
		print Out $seq."\n";
	}

	# output warning info for specific species (e.g., 'hsa'), while only output 3-level warning info for 'all'
	if ( ($sign > 0) and ($suffix ne "all") or ($sign == 3) and ($suffix eq "all") ) {
		print Warn "\>".$headID."\n";
		print Warn $readSeq."\n";
		print Warn $seq."\n";
	}
}
close(Out);
close(Outsub);
close(Warn);
system("rm $index\*.ebwt");
system("rm $prefix\*.format");


#================================ sub functions ================================
# change ID for mature miRNAs corresponding to sub precursor miRNAs (i.e., -2, -3, ... , -99)
# enable to identify/differentiate isomiRs from different sub precursor miRNAs
sub changeID {
	my ($headID) = @_;
	
	my @matchHead = split(/\|\|/, $headID);
		my $num = scalar(@matchHead);
		if ( $num == 3 ) {  # two annotated matchIDs (-3p and -5p mature miRNAs)
			my @annoMatchID = split(/\|/, $matchHead[0]);
			my $matchID_0 = $annoMatchID[0];
			my $location_0 = $annoMatchID[1];
			
			@annoMatchID = split(/\|/, $matchHead[1]);
			my $matchID_1 = $annoMatchID[0];
			my $location_1 = $annoMatchID[1];
			
			@annoMatchID = split(/\|/, $matchHead[2]);
			my $matchID_2 = $annoMatchID[0];
			my $location_2 = $annoMatchID[1];
			
			my $matureID = $matchID_0;
			$matureID =~ s/mir/miR/;
			$matureID =~ s/MIR/miR/;
			$matchID_1 = $matureID."\-5p";
			$matchID_2 = $matureID."\-3p";
			$headID = $matchID_0."\|".$location_0."\|\|".$matchID_1."\|".$location_1."\|\|".$matchID_2."\|".$location_2."\|";
		} 
	return $headID;
}
