#!/usr/bin/perl -w

#===============================================================================
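# Summarize matchIDs over all mismatch levels up to a limit (mis_0, mis_1, and mis_2)
# and/or both strands (+/-) for all samples in a feature file; specific to miRNA profiling.
# ARGV[0]:	feature file (*.feature) without any annotation (description and taxonomy) information
# ARGV[1]:	strand option: --FW (default), --RC, or --both
# ARGV[2]:	maximum mismatch level to include: --mis_N (default: --mis_2)
# ARGV[3]:	ID option: --mirID (default) or --mirFamily
# ARGV[4]:	weighting option: --unweightMismatch (default) or --weightMismatch
# Output:	one summary file (*.sum) per match type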
# Usage:	sumMatchID_miR.pl all_sample.feature --FW --mis_0 --mirID --weightMismatch
#			sumMatchID_miR.pl all_sample.feature --both --mis_0 --mirFamily --unweightMismatch
#			sumMatchID_miR.pl all_sample.feature --FW --mis_1 --mirID
#			sumMatchID_miR.pl all_sample.feature --RC --mis_2 --mirID
#===============================================================================

use strict;
use warnings;

my $feature = $ARGV[0];
my $strandOption = $ARGV[1];
if ( !$ARGV[1] ) {
	$strandOption = "--FW";
}
my $maxMismatch_limit = $ARGV[2];
if ( !$ARGV[2] ) {
	$maxMismatch_limit = "--mis_2";
}
$maxMismatch_limit =~ s/\--mis\_//;
my $mirOption = $ARGV[3];
if ( !$ARGV[3] ) {
	$mirOption = "--mirID";
}
my $weightMismatchOption = $ARGV[4];
if ( !$ARGV[4] ) {
	$weightMismatchOption = "--unweightMismatch";
}
my $prefix = $feature;
my $suffix = "\.feature";
$prefix =~ s/$suffix$//;
$suffix = "\.sum";
my $strand;
my %match = ();
my @array = ();
my @string = ();

# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}

# read feature file head
open(In, "<", $feature) or die "Can not open $feature !!!";
my $firstLine = <In>; # read feature file head
chomp($firstLine);
@array = split(/\t/, $firstLine);

# read sample list
my $size = scalar(@array);
my $j = 2; # first sample position in array, depends on feature file format
my @sampleList = @array[$j..($size - 1)];

# sum count numbers based on each macthID
while (<In>) { # read annotated features
	chomp;
	@array = split(/\t/, $_);
	my $matchType = $array[0];
	my $matchID = $array[1];
	my $maxMismatch;
	my $mismatchWeight = 1;
	
	# miRNA family filtering
	if ( $mirOption eq "--mirFamily" ) {	# tanslate to miRNA family ID, e.g., hsa-miR-122a-1-3p => hsa-mir-122a
		$matchID =~ s/miR/mir/g;	# treat precursor and mature miRNAs as the same, and counting together
		$matchID =~ s/\*//g;		# ignore special miRNA labeling in the old version of miRBase 
		@string = split(/\-/, $matchID);
		if ( scalar(@string) >= 3 ) {
			@string = @string[0..2];
		}
		$matchID = join("\-", @string);
	} elsif ( $mirOption eq "--mirID" ) {
		# keep matchID
	}
	
	# strand filtering
	if  ( ($strandOption eq "--FW") and ($matchType =~ /RNA/) and ($matchType =~ /\-$/) ) {
		next;	# only consider forward (FW) strand
	} elsif ( ($strandOption eq "--RC") and ($matchType =~ /RNA/) and ($matchType =~ /\+$/) ) {
		next;	# only consider reverse complement (RC) strand
	} elsif ( $strandOption eq "--both") {
				# pass through!!!
	}
	
	# mismatch filtering
	@string = split(/\.mis\_|\_sub\.mis\_/, $matchType);
	$matchType = $string[0];
	$matchType =~ s/\_subSeq/\_repSeq/;
	$matchType =~ s/\_sub//;
	$matchType =~ s/\_mature//;
	$matchType =~ s/\_precursor//;
	$maxMismatch = $string[1];
	$maxMismatch =~ s/\+|\-$//;
	if ( $maxMismatch > $maxMismatch_limit ) {
		next;	# only consider maxMismatch <= maxMismatch_limit (default: 2)
	}
	
	# decide weight values for each mismatch type
	if ( $weightMismatchOption eq "--weightMismatch" ) {
		
		if ( $maxMismatch == 0 ) {		# weight value for mis_0
			$mismatchWeight = 1.0;
		} elsif ( $maxMismatch == 1 ) {	# weight value for mis_1
			$mismatchWeight = 0.8;
		} elsif ( $maxMismatch == 2 ) {	# weight value for mis_2
			$mismatchWeight = 0.6;
		} elsif ( $maxMismatch == 3 ) {	# weight value for mis_3
			$mismatchWeight = 0.4;
		} else {						# weight value for others
			$mismatchWeight = 0.2;
		}
	} elsif ( $weightMismatchOption eq "--unweightMismatch" ) {
		$mismatchWeight = 1;
	}
	
	@array = @array[$j..($size - 1)]; # read count numbers for all samples

	# sum count numbers for specific rank
	for my $i (0..($size - $j - 1)) { # for each sample
		if ( $array[$i] ) {
			$array[$i] = $array[$i] * $mismatchWeight;	# multiply weight values
			if ( exists $match{$matchType}{$matchID}{$i} ) {
				$match{$matchType}{$matchID}{$i} += $array[$i];
			} else {
				$match{$matchType}{$matchID}{$i} = $array[$i];
			}
		}
	}
}
close (In);

if  ( $strandOption eq "--FW" ) {
	$strand = "+";
} elsif ( $strandOption eq "--RC" ) {
	$strand = "-";
} elsif ( $strandOption eq "--both") {
	$strand = "";
}

my $head = join("\t", @sampleList);
# write summary file for each matchType
my @allMatchType = keys %match;
@allMatchType = sort @allMatchType;
foreach my $matchType ( @allMatchType ) { # for each matchType
	my $outputFile = $prefix."\_".$matchType."\.mis_".$maxMismatch_limit.$strand.$suffix;
	open(Out, ">", $outputFile);
	print "   === write summarization file - $outputFile\n";
	# write summary head for each rank level
	print Out $matchType."\t".$head."\n"; 
	
	# write summary for each macthID
	my @allMatchID = keys % { $match{$matchType} } ;
	@allMatchID = sort @allMatchID;
	foreach my $matchID (@allMatchID) {
		print Out $matchID;
		for my $i (0..($size - $j - 1)) {
			print Out "\t";
			if ( exists $match{$matchType}{$matchID}{$i} ) {
				print Out $match{$matchType}{$matchID}{$i};
			}
		}
		print Out "\n";
	}
	close(Out);
}
