#!/usr/bin/perl -w

#===============================================================================
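# Verify significant mismatch events (potential SNPs) by using a database with
# known SNPs (variant name, chromosome, strand, and position) on precursor miRNAs
# retrieved from Ensembl.
# --project:	project name used as prefix of the input files (default: all_sample)
# ARGV[0]:	argument handed to ReadDBConfig::getBasePath; the returned base path
#		is the directory that holds the miRBase/ reference files
# Input:	<project>_distCount_miR.sum and <project>_distRatio_miR.sum with
#		significant mismatch events across all the samples
# Output:	known SNPs mapped to mismatch events across all the samples
#		(<project>_distCount_miR_SNP.sum and <project>_distRatio_miR_SNP.sum)
# Usage:	veryfyDist_miR.pl [--project NAME] ARGV[0]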
#===============================================================================

use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use FindBin qw($Bin);
use lib "$Bin";
use ReadDBConfig;

# The project name is the common prefix of all input/output file names;
# the default can be overridden with --project NAME.
my $project_name = "all_sample";
GetOptions("project=s" => \$project_name);

# GetOptions removes the options it recognises from @ARGV, so $ARGV[0] is the
# first remaining positional argument. It is passed to ReadDBConfig::getBasePath.
my $base_path = ReadDBConfig::getBasePath($ARGV[0]);

# Both input files must exist before any work is started.
my $inputFile1 = $project_name."_distCount_miR.sum";
if ( !-e $inputFile1 ) {
	die "   Can not open $inputFile1 !!!\n";
}
my $inputFile2 = $project_name."_distRatio_miR.sum";
if ( !-e $inputFile2 ) {
	die "   Can not open $inputFile2 !!!\n";
}

# Derive the output names by rewriting only the trailing "_miR.sum" suffix.
# The pattern is anchored at the end of the string and the dot is escaped, so
# a project name that itself contains something like "_miR_sum" is left intact.
(my $outputFile1 = $inputFile1) =~ s/_miR\.sum\z/_miR_SNP.sum/;
(my $outputFile2 = $inputFile2) =~ s/_miR\.sum\z/_miR_SNP.sum/;


# reference database for mapping
my $dataDIR = $base_path;

# File-level state shared with the subs below (they are filled by getKnownSNP
# and read by verifyMismatch).
my %hsa_SNP_miR = ();	# global index from chromosome and position to known SNP information
my $annoLine;			# global string of annotation head for known SNPs

# The somatic file is loaded first; for loci present in both files the stored
# record is replaced by a later one unless it already contains "dbSNP".
print "Loading human miRNA-related SNP data file ...\n";
getKnownSNP("$dataDIR/miRBase/hsa_somatic_SNP_miR_20160325.txt");
getKnownSNP("$dataDIR/miRBase/hsa_SNP_miR_20160325.txt");

print "Verify mismatch events by mapping them to known SNPs ...\n";
verifyMismatch($inputFile1, $outputFile1);
verifyMismatch($inputFile2, $outputFile2);


#================================ sub functions ================================
# Load known SNPs located on precursor miRNAs from a tab-separated reference file.
#
# Parameters:
#   $refDBFile - path of the reference file to read.
# Side effects:
#   - fills the file-level hash %hsa_SNP_miR, keyed by chromosome (column 2)
#     and start position (column 3), with the first seven columns of a record;
#   - sets the file-level $annoLine to the first seven columns of the header
#     row (the row whose first column is "Variant Name");
#   - prints the number of newly indexed loci to STDOUT.
# Dies if the file cannot be opened.
#
# Rows with fewer than 11 fields are ignored. split() drops trailing empty
# fields, so a row whose last columns are empty can fall below that limit.
# When several records share a locus, the stored record keeps being replaced
# by later ones until the stored text contains "dbSNP".
sub getKnownSNP {
	my ($refDBFile) = @_;
	# Lexical handle and loop variable: nothing is shared with other subs and
	# the caller's $_ is left untouched.
	open(my $in, "<", $refDBFile) or die "can not open file: $refDBFile ($!)\n";

	my $count = 0;
	while (my $line = <$in>) {
		chomp $line;
		my @fields = split(/\t/, $line);
		next if scalar(@fields) < 11;

		my $record = join("\t", @fields[0..6]);
		if ( $fields[0] eq "Variant Name" ) {
			$annoLine = $record;	# header row: remember the column titles
			next;
		}

		my ($chromosome, $start) = @fields[2, 3];
		if ( !exists($hsa_SNP_miR{$chromosome}{$start}) ) {
			$hsa_SNP_miR{$chromosome}{$start} = $record;	# import new record
			$count++;
		} elsif ( $hsa_SNP_miR{$chromosome}{$start} !~ /dbSNP/ ) {
			$hsa_SNP_miR{$chromosome}{$start} = $record;	# replace record till getting dbSNP
		}
	}
	close($in);
	print "--- $count known miRNA-related SNPs loaded!!!\n";
}

# Verify significant mismatch events by mapping them to known SNPs.
#
# Parameters:
#   $inputFile  - tab-separated file of mismatch events; the first line is a
#                 header. Columns read per row: 1 = mismatch type ("A>G"),
#                 4 = chromosome, 5 = strand, 6 = start position.
#   $outputFile - file to (over)write with the events that hit a known SNP.
# Reads the file-level %hsa_SNP_miR and $annoLine filled by getKnownSNP.
# Dies if either file cannot be opened or the output cannot be closed.
#
# Output layout: the input columns with a "Consistency" column inserted at
# index 3 and the stored SNP columns inserted in front of the start position.
# Only rows whose chromosome/start pair is present in %hsa_SNP_miR are written.
# Consistency flag: 'V' = translated mismatch equals the SNP column at index 5
# of the stored record, 'v' = that column contains '-' or 'MUTATION',
# 'o' = anything else.
sub verifyMismatch {
	my ($inputFile, $outputFile) = @_;
	open(my $in, "<", $inputFile) or die "can not open file: $inputFile ($!)\n";
	open(my $out, ">", $outputFile) or die "can not open file: $outputFile ($!)\n";

	# Header line; an empty input file is treated as an empty header.
	my $firstLine = <$in>;
	$firstLine = '' if !defined $firstLine;
	chomp $firstLine;
	my @array = split(/\t/, $firstLine);
	# Limit -1 keeps empty trailing titles so the header stays seven columns wide.
	my @inArray = split(/\t/, (defined $annoLine ? $annoLine : ''), -1);
	splice @array, 6, 0, @inArray;
	splice @array, 3, 0, "Consistency";
	print {$out} join("\t", @array)."\n";
	print "=== output verified mismatch events to $outputFile\n";

	while (my $line = <$in>) {
		chomp $line;
		@array = split(/\t/, $line);
		# Blank or truncated rows cannot carry a locus (columns 0..6 are needed).
		next if scalar(@array) < 7;

		my $Chromosome = $array[4];
		my $Strend = $array[5];
		my $Start = $array[6];
		my $mismatchType = $array[1];

		# translate mismatch type to SNP format (e.g., A>G => A/G if '+' strend, and A>G => T/C if '-' strend)
		# NOTE(review): the strand is compared numerically, so the column is
		# presumably encoded as 1 / -1 — confirm against the input format.
		my @mismatchEvent = split(/\>/, $mismatchType);
		if ( $Strend == -1 ) {
			$mismatchEvent[0] = complement($mismatchEvent[0]);
			$mismatchEvent[1] = complement($mismatchEvent[1]);
		}
		$array[1] = join("\/", @mismatchEvent);

		# Two-step lookup so that unknown chromosomes are not autovivified
		# into the shared index.
		next if !exists($hsa_SNP_miR{$Chromosome});
		next if !exists($hsa_SNP_miR{$Chromosome}{$Start});

		# Limit -1 keeps empty trailing fields: the record always expands to
		# the same number of columns as the header, and index 5 is always the
		# column it was stored as.
		@inArray = split(/\t/, $hsa_SNP_miR{$Chromosome}{$Start}, -1);
		my $snpType = defined $inArray[5] ? $inArray[5] : '';

		# check if mismatch type is consistent with SNP type on specific loci
		my $consistency;
		if ( $array[1] eq $snpType ) {
			$consistency = 'V';
		} elsif ( $snpType =~ /\-|MUTATION/ ) {
			$consistency = 'v';
		} else {
			$consistency = 'o';
		}

		splice @array, 6, 0, @inArray;
		splice @array, 3, 0, $consistency;
		print {$out} join("\t", @array)."\n";
	}
	close($in);
	# Buffered write errors only surface when the handle is closed.
	close($out) or die "can not close file: $outputFile ($!)\n";
}

# Return the complementary base of a single upper-case nucleotide code
# (A <=> T, G <=> C). Any other input yields 'N'.
sub complement {
	my ($inCode) = @_;
	my %partnerOf = (
		'A' => 'T',
		'T' => 'A',
		'G' => 'C',
		'C' => 'G',
	);
	return exists($partnerOf{$inCode}) ? $partnerOf{$inCode} : 'N';
}
