#!/usr/bin/perl -w

#===============================================================================
# Summarize and identify significant mismatch events (potentially isomiR, RNA editing, or SNP)
# on each nucleotide along every mapped precursor miRNA across all samples, with or without a sample order file.
# If no sample order file is provided, samples are ordered alphabetically.
# ARGV[0]:	sample order file (tab-separated: index, sample name), e.g. sampleOrder,
# 	1	CCACTC_4
# 	2	AAGCTA_3
# 	3	AAGCTA_7
# Output:	summary file of read count distribution on each nucleotide (*_miR.sum) for each individual sample
#			read length distribution summary file (all_sample_dist_miR.sum) across all samples
#			maximum value of read counts on all nucleotides along a single precursor miRNA across all samples
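# Usage:	sumDist_miR_v1.6.pl sampleOrder dbConfig --FW --mis_1 (default setting)
#			sumDist_miR_v1.6.pl sampleOrder dbConfig --both --mis_1
#			sumDist_miR_v1.6.pl sampleOrder dbConfig --FW --mis_2
#			sumDist_miR_v1.6.pl sampleOrder dbConfig --RC --mis_3
#			(dbConfig is the DB config file read from ARGV[1] and passed to ReadDBConfig::getBasePath)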
#===============================================================================

use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use ReadDBConfig;

# ---- command-line arguments ----
# ARGV[0]: sample order file (may be empty; samples are then discovered automatically)
# ARGV[1]: DB config file, handed to ReadDBConfig::getBasePath
# ARGV[2]: strand option (--FW, --RC or --both); defaults to --FW
# ARGV[3]: mismatch limit written as --mis_N; defaults to --mis_1
my $sampleOrder = $ARGV[0];
my $suffix = "_Processed.profile";
my @sampleList = getSampleOrder($sampleOrder, $suffix);
my $sampleSize = scalar(@sampleList);

# $base_path is used further down as the directory prefix of the miRBase reference file
my $db_config_file = $ARGV[1];
my $base_path = ReadDBConfig::getBasePath($db_config_file);
print($base_path."\n");
# NOTE: an unconditional "exit 0;" used to follow the print above, which made
# the whole analysis below unreachable; it has been removed.

# each default is decided from the same argument slot the value is read from
my $strandOption = $ARGV[2];
if ( !$strandOption ) {
	$strandOption = "--FW";
}
my $maxMismatch_limit = $ARGV[3];
if ( !$maxMismatch_limit ) {
	$maxMismatch_limit = "--mis_1";
}
# keep only the number, e.g. "--mis_2" becomes "2"
$maxMismatch_limit =~ s/--mis_//;


# %refSeqDist holds read counts in a four-level hash, keyed in this order:
#   sample ID -> composite sequence head (one per precursor miRNA)
#             -> mismatch type ("all", "mismatch", "A>T", "G>C", ...)
#             -> 0-based coordinate on the precursor miRNA forward sequence (0 .. seqLength - 1)
my %refSeqDist = ();

# Every mismatch category that gets counted: the two aggregate categories
# first, then each base (A, T, G, C) paired with every other base and with 'N'.
my @allMismatchType = ("all", "mismatch");
foreach my $fromBase (qw(A T G C)) {
	foreach my $toBase ((grep { $_ ne $fromBase } qw(A T G C)), "N") {
		push @allMismatchType, $fromBase.">".$toBase;
	}
}

# reference database for mapping
my $dataDIR = $base_path;

my %refSeqHead = ();	# index from precursor or mature miRNA names/IDs to composite sequence heads (annotated miRNA names/IDs with coordinates)
my %refSeq = ();		# index from composite sequence heads (annotated miRNA names/IDs with coordinates) to precursor miRNA sequences

# Load the annotated precursor miRNA FASTA file. Each ">" header line is a
# composite sequence head: the part before "| " is a "||"-separated list of
# annotated entries, and the first "|"-separated field of every entry is a
# miRNA name/ID that gets indexed to the full header in %refSeqHead.
# The sequence lines that follow a header are concatenated into %refSeq.
my $refDBFile = "$base_path/miRBase/hairpin_hsa_all_anno.fna";
# lexical handle, so this read cannot collide with the handles used by the subs
open(my $refDBHandle, "<", $refDBFile) or die "can not open file: $refDBFile ($!)\n";
print "Loading miRBase human annotated precursor miRNA sequence file ...\n";
# NOTE: $seq, $seqID and @array are file-scoped on purpose; code further down reuses them
my $seqSign = 0;
my $seq = "n/a";
my $seqID = "n/a";
my @array = ();
while ( my $line = <$refDBHandle> ) {
	# strip LF as well as CRLF line endings so IDs and sequences stay clean
	$line =~ s/[\r\n]+$//;
	# blank lines carry no data; skipping them also avoids a bogus "n/a" entry
	next if $line eq "";
	if ( $line =~ /^>/ ) {
		$seqID = $line;
		$seqID =~ s/^\>//;
		@array = split(/\|\ /, $seqID);
		@array = split(/\|\|/, $array[0]);
		foreach my $annoMatchID (@array) {
			my @head = split(/\|/, $annoMatchID);
			my $matchID = $head[0];
			$refSeqHead{$matchID} = $seqID;
		}
		$seqSign = 0;
	} else {
		$seq = $line;
		if ( $seqSign == 0 ) {	# first sequence line of this record
			$refSeq{$seqID} = $seq;
			$seqSign = 1;
		} else {				# continuation line (sequence wrapped over several lines)
			$refSeq{$seqID} .= $seq;
		}
	}
}
close($refDBHandle);

# Build the per-nucleotide read count distribution for every sample.
# All precursor miRNAs from the reference are listed in sorted order
# (the original author notes a total of 1881 for miRBase 21).
my @allSeqID = sort keys %refSeq;
my $i = 0;
for my $sample (@sampleList) {
	$i += 1;
	print "\n====== Begin to summarize miRNAs on #${i} profile ======\n";
	sumEachDist_miR($sample, $suffix, $strandOption, $maxMismatch_limit);
}

# For every sample, find the highest "all" read count over all nucleotides of
# each precursor miRNA. Stored as $maxReadN{$seqID}{$sample}; a pair with no
# counts (or a maximum of 0) gets no entry at all.
my %maxReadN = ();
foreach my $sample (@sampleList) {
	# look before descending, so no empty hashes are created for unmapped data
	my $bySeq = $refSeqDist{$sample};
	next unless $bySeq;
	foreach my $seqID (@allSeqID) {
		my $coverage = $bySeq->{$seqID} ? $bySeq->{$seqID}{"all"} : undef;
		next unless $coverage;
		# single pass for the maximum instead of sorting every count
		my $peak;
		foreach my $count (values %{ $coverage }) {
			$peak = $count if ( !defined($peak) or $count > $peak );
		}
		if ( $peak ) {
			$maxReadN{$seqID}{$sample} = $peak;
		}
	}
}

# Write one row per precursor miRNA holding, for every sample, the maximum
# read count found on any nucleotide of that precursor. Samples without a
# maximum get an empty cell; precursors with no value in any sample are skipped.
my $outputFile = "all_sample_max_miR.sum";
# fail loudly instead of printing into an unopened handle
open(my $maxOut, ">", $outputFile) or die "can not write file: $outputFile ($!)\n";
# generate table head
print {$maxOut} "matchID\t".join("\t", @sampleList)."\n";
print "\nsummarizing maximum readN for every sample onto: $outputFile ...\n";
foreach my $seqID (@allSeqID) {
	# get precursor miRNA name
	my $seqIDHead = (split(/\|\ /, $seqID))[0];		# composite sequence head with coordinates, no annotation
	my $matchID = (split(/\|/, $seqIDHead))[0];		# precursor miRNA name

	my @maxValues = map { exists($maxReadN{$seqID}{$_}) ? $maxReadN{$seqID}{$_} : "" } @sampleList;
	if ( join("", @maxValues) ne "" ) {		# only report precursors that have a maximum in at least one sample
		print {$maxOut} $matchID."\t".join("\t", @maxValues)."\n";
	}
}
# buffered write errors only surface when the handle is closed
close($maxOut) or die "can not finish writing file: $outputFile ($!)\n";

# Write the mismatch events across all samples into two tables with identical
# layout: one with read counts, one with ratios. A ratio is the mismatch read
# count divided by the "all" read count at the same coordinate. A sample only
# contributes to a precursor when its maximum read count there is >= 10.
my $outputFile1 = "all_sample_distCount_miR.sum";
my $outputFile2 = "all_sample_distRatio_miR.sum";
# fail loudly instead of printing into unopened handles
open(my $countOut, ">", $outputFile1) or die "can not write file: $outputFile1 ($!)\n";
open(my $ratioOut, ">", $outputFile2) or die "can not write file: $outputFile2 ($!)\n";
# generate table head (the same for both files)
my $tableHead = join("\t", qw(matchID averageMaxReadN mismatchType offset n-2 n-1 n n+1 n+2))."\t".join("\t", @sampleList)."\n";
print {$countOut} $tableHead;
print {$ratioOut} $tableHead;
print "\nsummarizing mismatch events for every samples onto: $outputFile1 ...\n";
print "\nsummarizing mismatch events for every samples onto: $outputFile2 ...\n";
# ignore "all" and "mismatch" here; dropped by name so the result does not depend on list order
@allMismatchType = grep { $_ ne "all" and $_ ne "mismatch" } @allMismatchType;
# check each mismatch event located at each coordinate on each precursor miRNA for each sample
foreach my $seqID (@allSeqID) {

	# decide once per precursor which samples pass the maximum readN >= 10 filter
	my @passes = map { ( exists($maxReadN{$seqID}{$_}) and $maxReadN{$seqID}{$_} >= 10 ) ? 1 : 0 } @sampleList;
	# no passing sample means no row can be produced, so skip the full scan
	next unless grep { $_ } @passes;

	# calculate average maxReadN for individual precursor miRNA (matchID) across all samples
	my @sampleMax = values %{ $maxReadN{$seqID} };
	my $averageMaxReadN = round(sum(@sampleMax) / $sampleSize);

	# get precursor miRNA sequence, its length and its single nucleotides
	my $preSeq = exists($refSeq{$seqID}) ? $refSeq{$seqID} : "n/a";
	my $seqLength = length($preSeq);
	my @pre_miRNA = split("", $preSeq);

	# get precursor miRNA name
	my $seqIDHead = (split(/\|\ /, $seqID))[0];		# composite sequence head with coordinates, no annotation
	my $matchID = (split(/\|/, $seqIDHead))[0];		# precursor miRNA name

	foreach my $mismatchType (@allMismatchType) {
		foreach my $i (0..($seqLength - 1)) {
			my @mismatchCount = ();
			my @mismatchRatio = ();
			foreach my $s (0..$#sampleList) {
				my $sample = $sampleList[$s];
				if ( $passes[$s] and exists($refSeqDist{$sample}{$seqID}{$mismatchType}{$i}) ) {
					my $count = $refSeqDist{$sample}{$seqID}{$mismatchType}{$i};
					my $depth = $refSeqDist{$sample}{$seqID}{"all"}{$i};
					push @mismatchCount, $count;
					# a missing or zero "all" count leaves the ratio cell empty instead of dividing by zero
					push @mismatchRatio, ( $depth ? sprintf('%.05f', $count / $depth) : "" );
				} else {
					push @mismatchCount, "";
					push @mismatchRatio, "";
				}
			}

			if ( join("", @mismatchCount) ne "" ) {		# only report events seen in at least one sample
				# letters of the nucleotides at offsets -2 .. +2; empty outside the sequence
				my @neighbor = map {
					my $pos = $i + $_;
					( $pos < 0 or $pos >= $seqLength ) ? "" : $pre_miRNA[$pos];
				} (-2, -1, 0, 1, 2);

				my $rowHead = $matchID."\t".$averageMaxReadN."\t".$mismatchType."\t".$i."\t".join("\t", @neighbor)."\t";
				print {$countOut} $rowHead.join("\t", @mismatchCount)."\n";
				print {$ratioOut} $rowHead.join("\t", @mismatchRatio)."\n";
			}
		}
	}
}
# buffered write errors only surface when the handles are closed
close($countOut) or die "can not finish writing file: $outputFile1 ($!)\n";
close($ratioOut) or die "can not finish writing file: $outputFile2 ($!)\n";

# Collect every *_miR.sum file of the current directory in a result directory
# named after the mismatch limit and the strand ("+" for --FW, "-" for --RC,
# nothing for any other option).
my $strandSymbol = "";
if ( $strandOption eq "--FW" ) {
	$strandSymbol = "+";
} elsif ( $strandOption eq "--RC" ) {
	$strandSymbol = "-";
}
my $dir = "Dist_miR_mis_".$maxMismatch_limit.$strandSymbol;
# builtins instead of shell commands: nothing derived from the command line is
# ever interpreted by a shell, and failures are reported instead of ignored
if ( !-d $dir ) {
	mkdir($dir) or die "can not create directory: $dir ($!)\n";
}
foreach my $sumFile (glob("*_miR.sum")) {
	rename($sumFile, $dir."/".$sumFile) or warn "can not move $sumFile into $dir ($!)\n";
}


#================================ sub functions ================================
# Return the list of sample names to process.
#   $sampleOrder - path of a sample order file; when false, the samples are
#                  discovered from the current directory instead
#   $suffix      - file name suffix of the processed profiles
# With an order file: every line that has exactly two tab-separated fields
# contributes its second field, in file order; all other lines are ignored.
# Without one: every file in "./" whose name ends with $suffix contributes its
# name minus the suffix, and the names are sorted as strings.
# Prints the list and a count message; exits the program when no sample is found.
sub getSampleOrder {
	my ($sampleOrder, $suffix) = @_;
	my @sampleList = ();
	my $dataDIR = "./";
	if ( $sampleOrder ) {	# sample order file is provided
		open(my $orderHandle, "<", $sampleOrder) or die "   can't open sample order file - $sampleOrder ($!)\n";
		while ( my $line = <$orderHandle> ) {
			# make sure to get rid of any kind of line ending
			$line =~ s/\r|\n//g;
			my @fields = split(/\t/, $line);
			if ( scalar(@fields) == 2 ) {
				push @sampleList, $fields[1];
			}
		}
		close($orderHandle);
	} else {				# sample order file is NOT provided
		print "no sample order file input, will retrieve and order all samples automatically.\n";
		opendir(my $dirHandle, $dataDIR) or die "   Can't open directory: $dataDIR";
		my @fileList = readdir $dirHandle;
		closedir $dirHandle;
		# \Q...\E matches the suffix literally, so the "." in it cannot match any other character
		my $suffixPattern = qr/\Q$suffix\E$/;
		foreach my $fileName (@fileList) {
			next unless $fileName =~ $suffixPattern;
			(my $sampleName = $fileName) =~ s/$suffixPattern//;
			push @sampleList, $sampleName;
		}
		@sampleList = sort @sampleList;
	}

	print join("\t", @sampleList);
	my $num = scalar(@sampleList);

	if ( $num == 0 ) {
		print "\n\nThere is NO processed profile ($suffix) in this folder\n";
		exit;
	} elsif ( $num == 1 ) {
		print "\n\nThere is ".$num." processed profiles ($suffix) in this folder\n";
	} else {
		print "\n\nThere are ".$num." processed profiles ($suffix) in this folder\n";
	}

	return @sampleList;
}

# Summarize the read count distribution on each nucleotide of the mapped
# precursor miRNAs for one sample.
#   $sample            - sample name; the input file is $sample.$suffix
#   $suffix            - profile file suffix, e.g. "_Processed.profile"
#   $strandOption      - "--FW" skips match types ending in "-", "--RC" skips
#                        match types ending in "+", anything else keeps both
#   $maxMismatch_limit - reads whose match type carries a larger ".mis_N" number are skipped
# Input columns used (tab-separated): match type, read ID, read sequence,
# match ID, offset. The offset column is a comma-separated list: the read start
# coordinate, followed by one "position:type" item per mismatch.
# Side effects: adds the counts to the file-level %refSeqDist and writes
# <sample>_dist_miR.sum with one block of rows per entry of @allSeqID.
sub sumEachDist_miR {
	my ($sample, $suffix, $strandOption, $maxMismatch_limit) = @_;

	my $inputFile = $sample.$suffix;				# e.g., Sample_1_Processed.profile
	my $outputFile = $sample."_dist_miR.sum";		# e.g., Sample_1_dist_miR.sum

	# input individual profile, e.g., Sample_1_Processed.profile
	open(my $profileHandle, "<", $inputFile) or die "   Can't open $inputFile !!! ($!)\n";
	print "--- Read matched sequences from: $inputFile ...\n";
	while ( my $line = <$profileHandle> ) {
		$line =~ s/[\r\n]+$//;
		my ($matchType, $readID, $readSeq, $matchID, $offset) = split(/\t/, $line);

		# matchType filtering: only consider miRNA mapping profile (also skips blank lines)
		next unless ( defined($matchType) and $matchType =~ /miRNA/ );

		# strand filtering
		next if ( $strandOption eq "--FW" and $matchType =~ /\-$/ );	# only consider forward (FW) strand
		next if ( $strandOption eq "--RC" and $matchType =~ /\+$/ );	# only consider reverse complement (RC) strand

		# a miRNA line without the columns needed below cannot be counted
		if ( !defined($readID) or !defined($readSeq) or !defined($matchID) or !defined($offset) or $offset eq "" ) {
			warn "   Skipping malformed line in $inputFile: $line\n";
			next;
		}

		# mismatch filtering: the number after ".mis_", without the strand sign
		my $maxMismatch = (split(/\.mis\_/, $matchType))[1];
		$maxMismatch = 0 unless defined($maxMismatch);	# no ".mis_" part is treated as 0
		$maxMismatch =~ s/\+|\-$//;
		if ( $maxMismatch > $maxMismatch_limit ) {
			next;	# only consider maxMismatch <= maxMismatch_limit (default: 1)
		}

		# get read count number from readID (collapsed FASTA format, "name-count")
		my @idParts = split(/\-/, $readID);
		my $readNum = ( scalar(@idParts) == 2 ) ? $idParts[1] : 1;

		# get read length
		$readSeq =~ s/^\s+//;	# trim left-end spaces for head-aligned read sequences
		$readSeq =~ s/\s+$//;	# trim right-end spaces for tail-aligned read sequences
		my $readLength = length($readSeq);

		# get composite sequence head from a mature or precursor miRNA ID
		my $seqID = exists($refSeqHead{$matchID}) ? $refSeqHead{$matchID} : "n/a";

		# get read mapping coordinates (0-based offsets on forward reference sequence)
		my @offsetParts = split(/\,/, $offset);
		my $misMatch = scalar(@offsetParts) - 1;
		if ( $misMatch != $maxMismatch ) {	# check inconsistency!!!
			print "Inconsistency Warning: $matchType\t$readID\t$readSeq\t$matchID\t$offset\n";
		}
		my $readStart = $offsetParts[0];
		my $readEnd = $readStart + $readLength - 1;

		# count read number for each nucleotide covered by the read
		foreach my $pos ($readStart..$readEnd) {
			$refSeqDist{$sample}{$seqID}{"all"}{$pos} += $readNum;
		}

		# count each mismatch, both under its own type and under "mismatch"
		foreach my $j (1..$misMatch) {
			my ($relativePos, $mismatchType) = split(/\:/, $offsetParts[$j]);
			my $x = $readStart + $relativePos;		# mismatch coordinate

			# filter out mismatch events on the start or end nucleotide of a read
			next if ( $x == $readStart or $x == $readEnd );

			$refSeqDist{$sample}{$seqID}{$mismatchType}{$x} += $readNum;
			$refSeqDist{$sample}{$seqID}{"mismatch"}{$x} += $readNum;
		}
	}
	close($profileHandle);

	# output read count distribution on each nucleotide with different mismatch type
	# (e.g., "all", "mismatch", "A>T", "G>C", etc.) for every precursor miRNA
	open(my $sumHandle, ">", $outputFile) or die "   Can't write $outputFile !!! ($!)\n";
	print "=== Write read count distribution onto: $outputFile ...\n";
	foreach my $seqID (@allSeqID) {

		# get precursor miRNA sequence and its length
		my $seq = exists($refSeq{$seqID}) ? $refSeq{$seqID} : "n/a";
		my $seqLength = length($seq);

		# composite sequence head (no annotation), precursor name, single nucleotides
		my $seqIDHead = (split(/\|\ /, $seqID))[0];
		my $matchID = (split(/\|/, $seqIDHead))[0];
		my @seqString = split(//, $seq);

		print {$sumHandle} $matchID."\t".$seq."\t".join("\t", @seqString)."\n";
		print {$sumHandle} $matchID."\t".$seqIDHead."\t".join("\t", (0..($seqLength-1)))."\n";

		# one row per mismatch type: a cell per coordinate, empty where nothing was counted
		my $bySeq = $refSeqDist{$sample}{$seqID};
		foreach my $mismatchType (@allMismatchType) {
			my $byPos = ( $bySeq and $bySeq->{$mismatchType} ) ? $bySeq->{$mismatchType} : {};
			my @cells = map { exists($byPos->{$_}) ? $byPos->{$_} : "" } (0..($seqLength - 1));
			print {$sumHandle} join("\t", $matchID, $mismatchType, @cells)."\n";
		}
	}
	# buffered write errors only surface when the handle is closed
	close($sumHandle) or die "   Can't finish writing $outputFile !!! ($!)\n";
}

# Add up all arguments and return the total; an empty argument list gives 0.
sub sum {
	my $total = 0;
	foreach my $value (@_) {
		$total += $value;
	}
	return $total;
}

# Round a positive number to an integer by looking at its first decimal
# digit: 5 or more rounds up, anything lower rounds down.
sub round {
	my ($number) = @_;
	my $wholePart = int($number);
	my $firstDecimal = int(10 * ($number - $wholePart));
	return ( $firstDecimal >= 5 ) ? $wholePart + 1 : $wholePart;
}

