#!/usr/bin/perl -w

#===============================================================================
# [01/12/2016] fit to miRNA profiling by multiple assignment approach
# summarize read count distribution along offsets (*.dist_offset.sum) for all samples
# from mapped read sequence file (.profile) by multiple assignment approach
# ARGV[0]:	mapped read sequence file (.profile) by multiple assignment approach
# ARGV[1]:	weightedCountOption (optional) - "--unweightedCount" (default) or "--weightedCount"
# Output:	read count distribution along offsets (*.dist_offset.sum) for all samples
# Usage:	sumMismatch_offset.pl *.profile [--unweightedCount|--weightedCount]
# Example:	sumMismatch_offset.pl all_sample.human_miRNA_all.mis_1+.profile --weightedCount
#===============================================================================

use strict;
use warnings;

# ---- command line handling ----
# ARGV[0]: input profile file (*.profile), required
# ARGV[1]: counting mode, "--unweightedCount" (default) or "--weightedCount"
if ( !defined $ARGV[0] ) {
	die "Usage: $0 <file.profile> [--unweightedCount|--weightedCount]\n";
}
my $profile = $ARGV[0];
my $weightedCountOption = $ARGV[1];
if ( !$ARGV[1] ) {
	$weightedCountOption = "--unweightedCount";
} elsif ( $weightedCountOption ne "--unweightedCount" and $weightedCountOption ne "--weightedCount" ) {
	# An unrecognized option has always resulted in unweighted counting; keep that
	# behaviour, but report it so that a misspelled option does not pass unnoticed.
	warn "   Unknown option $weightedCountOption, using --unweightedCount !!!\n";
	$weightedCountOption = "--unweightedCount";
}

# per-record fields of the profile file (filled while reading the file)
my $matchType;
my $readSeq;
my $readLength;
my $matchID;
my $offset;
my $matchNum;
my $orientation;
my $suffix = "\.profile";
# sample names, in the column order of the header line
my @sampleList = ();
# accumulated counts: {orientation}{offset}{mismatch type}{sample}
my %offsetDist = ();

# test if input profile file exists
if ( !-e $profile ) {
	die "   Can not open $profile !!!\n";
}
# test if the input file name contains ".profile"; the output file name is derived
# by replacing that part, so a name without it would make the output overwrite the
# input. The dot is escaped so that names such as "myprofile.txt" are rejected, and
# the failure is reported on STDERR with a non-zero exit status.
if ( $profile !~ /\.profile/ ) {
	die "   This is NOT a profile file (*.profile) !!!\n";
}

# mismatch types (including 'N'): the pooled "mismatch" class comes first, followed
# by every X>Y substitution in the order A>T A>G A>C A>N T>A ... C>T C>G C>N
my @allMismatchType = ("mismatch");
foreach my $fromBase (qw(A T G C)) {
	foreach my $toBase (qw(A T G C N)) {
		# a base is never listed as a mismatch to itself
		next if $fromBase eq $toBase;
		push(@allMismatchType, $fromBase.">".$toBase);
	}
}

# ---- open input/output files and write the header line ----
# The output name is the input name with its first ".profile" replaced.
my $outputFile = $profile;
$outputFile =~ s/\.profile/\.dist_offset.sum/;
# If nothing was replaced, opening the output would truncate the input file.
if ( $outputFile eq $profile ) {
	die "   Can not derive an output file name from $profile !!!\n";
}

open(In, "<", $profile) or die "can not open $profile: $! !!!\n";

# get sample order and output head (first line); read it before the output file is
# created so that an empty input does not leave an empty output file behind
my $firstLine = <In>;
if ( !defined $firstLine ) {
	die "   $profile is empty (no header line) !!!\n";
}
# remove the line terminator, including the "\r" of CRLF files, so that the last
# sample name stays clean
$firstLine =~ s/[\r\n]+\z//;

open(Out, ">", $outputFile) or die "can not write $outputFile: $! !!!\n";
print "Output summarization file: $outputFile\n";

# the first six columns are annotation columns, the remaining ones are sample names
my @array = split(/\t/, $firstLine);
($matchType, $readSeq, $readLength, $matchID, $offset, $matchNum) = splice(@array, 0, 6);
@sampleList = @array;

print Out "orientation\t"."offset\t"."mismatchType\t".join("\t", @array)."\n";

my $maxReadLength = 0;
# Summarize read counts along offsets for each mismatch type (e.g., A>T) and for
# all mismatch types pooled together ("mismatch").
while (<In>) {
	# remove the line terminator; unlike chomp this also drops the "\r" of CRLF
	# files, which would otherwise turn an empty or zero last count into a true
	# value that is not skipped below
	s/[\r\n]+\z//;
	# skip blank lines (e.g., a trailing empty line) instead of parsing them as records
	next if !/\S/;

	# six annotation columns, then one read count per sample (same order as @sampleList)
	@array = split(/\t/, $_);
	($matchType, $readSeq, $readLength, $matchID, $offset, $matchNum) = splice(@array, 0, 6);

	# longest read seen so far; it bounds the offsets written to the output
	if ( $readLength > $maxReadLength) {
		$maxReadLength = $readLength
	}

	# comma-separated offset column: element 0 is not used here, every following
	# element is one mismatch written as "offset:type" (0-based offset on the
	# forward read sequence)
	my @string = defined($offset) ? split(/\,/, $offset) : ();

	# the weight is the same for every count on this line:
	# 1 for unweighted counting, the match number (multiple mapping times) otherwise
	my $weight = 1;
	if ( $weightedCountOption eq "--weightedCount" ) {
		$weight = $matchNum;
	}

	foreach my $j (1..$#string) {
		my ($mismatchPos, $mismatchType) = split(/\:/, $string[$j]);
		my $mismacthOffset_L = $mismatchPos;				# mismatch offset along 5'-end forward (left  |---> ) of read sequence
		my $mismacthOffset_R = $readLength - $mismatchPos - 1;		# mismatch offset along 3'-end reverse (right <---| ) of read sequence

		foreach my $i (0..$#sampleList) {
			# ignore no count
			next if !$array[$i];

			my $sample = $sampleList[$i];
			my $weightedCount = $array[$i] / $weight;

			# count once under the specific mismatch type and once under the pooled
			# "mismatch" class, for both orientations
			foreach my $type ($mismatchType, "mismatch") {
				$offsetDist{"L"}{$mismacthOffset_L}{$type}{$sample} += $weightedCount;
				$offsetDist{"R"}{$mismacthOffset_R}{$type}{$sample} += $weightedCount;
			}
		}
	}
}
close (In);

# write mismatch read counts along offsets for all samples:
# one row per orientation / offset / mismatch type, one column per sample;
# a sample without any count for that row gets an empty cell
foreach my $orientation ("L", "R") {
	foreach my $offset (0..($maxReadLength - 1)) {
		foreach my $mismatchType (@allMismatchType) {
			# counts of this row keyed by sample name (empty if nothing was counted)
			my $countOf = $offsetDist{$orientation}{$offset}{$mismatchType} || {};
			my @cells = map { exists($countOf->{$_}) ? round($countOf->{$_}) : "" } @sampleList;
			print Out join("\t", $orientation, $offset, $mismatchType, @cells)."\n";
		}
	}
}
# buffered write errors (e.g., a full disk) only show up when the handle is closed
close(Out) or die "can not close output file: $! !!!\n";


#================================ sub functions ================================
# round a number to the nearest integer, halves rounded away from zero
# (e.g., 2.4 -> 2, 2.5 -> 3, -2.6 -> -3)
#   $number: the number to round
#   returns: the rounded integer
# The decision is based on the first digit after the decimal point of the
# absolute value, so results for non-negative numbers are unchanged.
sub round {
	my ($number) = @_;
	my $magnitude = abs($number);
	my $whole = int($magnitude);
	my $firstNumAfterPoint = int(10 * ($magnitude - $whole));
	if ( $firstNumAfterPoint >= 5 ) {
		$whole++;
	}
	# restore the sign that was removed by abs()
	return $number < 0 ? -$whole : $whole;
}