#!/usr/bin/perl -w

#===============================================================================
# Fusion [version 1.5]:	Pipeline for identifying gene fusion events from
#						large RNA sequencing (RNA-Seq) data
#
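# Input: processed (adapter-trimmed) FASTA files with paired-end reads
# ARGV[0]: mate 1 FASTA file (*_1.fasta); the mate 2 file name is derived by
#          replacing the "_1.fasta" suffix with "_2.fasta"
# This version uses bowtie's default (formatted) output rather than SAM output: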
# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
#===============================================================================

use strict;
use warnings;

# ------------------------------- main program ---------------------------------
# ARGV[0] is the mate 1 FASTA file. Its name must end in "_1.fasta" so that the
# mate 2 file name can be derived by swapping that suffix for "_2.fasta".
my $readFile1 = $ARGV[0];
if ( !defined($readFile1) or $readFile1 eq "" ) {
	die "Usage: $0 <mate 1 FASTA file, named *_1.fasta>\n";
}
if ( !(-e $readFile1) ) {
	die "Can not find mate 1 FASTA file: $readFile1 !!!";
}

# The dot is escaped so that only a literal "_1.fasta" suffix is accepted.
# If the suffix is absent the substitution fails; without this check mate 2
# would silently be the very same file as mate 1.
my $readFile2 = $readFile1;
if ( !($readFile2 =~ s/_1\.fasta$/_2.fasta/) ) {
	die "Mate 1 FASTA file name must end with _1.fasta: $readFile1 !!!\n";
}
if ( !(-e $readFile2) ) {
	die "Can not find mate 2 FASTA file: $readFile2 !!!";
}

my $refName;
my $maxMismatch;

# reference database for mapping (read by getBowtieIndex)
my $dataDIR = "/var/www/html/database/bowtie";

# human RNA mapping: mate 1 is aligned to the forward strand only, mate 2 to
# the reverse-complement strand only, each reporting a single alignment
$maxMismatch = 3;
$refName = "human_RNA";
match($readFile1, $refName, $maxMismatch, "F", "FW");
match($readFile2, $refName, $maxMismatch, "F", "RC");
countFusion($readFile1, $readFile2, $refName, $maxMismatch);

print "\n";

#================================ sub functions ================================
# Map the reads of one FASTA file to a reference database (refDB) with bowtie.
#
# Arguments:
#   $readFile     - FASTA file of reads; a trailing ".fasta" is stripped to
#                   form the output prefix
#   $refName      - reference database name, resolved by getBowtieIndex()
#   $maxMismatch  - maximum number of mismatches (bowtie "-v")
#   $reportAll    - "T" reports all alignments ("-a --best"); any other value
#                   reports one alignment per read ("-k 1 --best")
#   $fowardPrefer - "FW" aligns to the forward strand only ("--norc"), "RC" to
#                   the reverse-complement strand only ("--nofw"); any other
#                   value aligns to both strands
#
# Side effect: writes <prefix>.<refName>.mis_<maxMismatch>.format holding the
# bowtie output with columns 6 and 7 suppressed.
#
# Returns the name "<prefix>_unMatch.fa" (this sub does not create that file).
#
# Dies when the refDB has no (complete) bowtie index, or when bowtie can not
# be started or exits with a non-zero status.
sub match {
	my ($readFile, $refName, $maxMismatch, $reportAll, $fowardPrefer) = @_;
	
	my $prefix = $readFile;
	$prefix =~ s/\.fasta$//;
	my $formatFile = $prefix.".".$refName.".mis_".$maxMismatch.".format";
	
	my $index = getBowtieIndex($refName);
	if ( !$index ) {
		die "No bowtie index for refDB: $refName !!!\n";
	}
	
	# check bowtie index type ("small" or "large") and index file integrity (total 6 files)
	my @indexPart = (".1", ".2", ".3", ".4", ".rev.1", ".rev.2");
	my @smallIndex = map { $index.$_.".ebwt" } @indexPart;
	my @largeIndex = map { $index.$_.".ebwtl" } @indexPart;
	my @indexOpt = (); # a "small" index needs no extra option
	if ( scalar(grep { -e $_ } @smallIndex) == scalar(@smallIndex) ) {
		@indexOpt = ();
	} elsif ( scalar(grep { -e $_ } @largeIndex) == scalar(@largeIndex) ) {
		@indexOpt = ("--large-index");
	} else {
		die "Incomplete or no bowtie index for refDB: $refName !!!\n";
	}
	
	# bowtie report type ("report the first one" or "report all from best to worst")
	my @reportOpt = ("-k", "1", "--best");
	if ( $reportAll eq "T" ) {
		@reportOpt = ("-a", "--best");
	}
	
	# bowtie alignment orientation ("forward" or "reverse-complement"); default is both
	my @strandOpt = ();
	my $strand = "+/-";
	if ( $fowardPrefer eq "FW" ) {
		@strandOpt = ("--norc"); # no reverse-complement alignment
		$strand = "+";
	} elsif ( $fowardPrefer eq "RC" ) {
		@strandOpt = ("--nofw"); # no forward strand alignment
		$strand = "-";
	}
	
	print "\n--- readFile: ".$prefix."\t"."matchType: ".$refName.".mis_".$maxMismatch.$strand."\n";
	
	# output bowtie alignment results to a formatted file (not SAM file)
	# with 6 fields: readID, strand, matchID, offset, readSeq, mismatch
	# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
	# "-p 15": 15 parallel search threads
	my @bowtieCmd = (
		"bowtie", "-v", $maxMismatch, @strandOpt, "-5", "20", "-3", "20",
		@indexOpt, $index, "-f", $readFile, @reportOpt, "-p", "15",
		"--quiet", "--suppress", "6,7",
	);
	
	# Run bowtie without a shell so that file names containing spaces or shell
	# metacharacters are passed through untouched. The child redirects its own
	# STDOUT to the format file; opening with ">" also guarantees that an
	# (empty) format file exists when no read matches at all.
	my $pid = fork();
	if ( !defined($pid) ) {
		die "Can not fork to run bowtie: $!\n";
	}
	if ( $pid == 0 ) {
		open(STDOUT, ">", $formatFile) or die "Can not write $formatFile: $!\n";
		exec { $bowtieCmd[0] } @bowtieCmd or die "Can not run bowtie: $!\n";
	}
	waitpid($pid, 0);
	if ( $? != 0 ) {
		die "bowtie failed for $readFile on refDB $refName (wait status: $?) !!!\n";
	}
	
	return $prefix."_unMatch.fa";
}

# Helper for countFusion: load the bowtie format file of one mate.
#
# Builds the format file name from $readFile (trailing ".fasta" stripped),
# $refName and $maxMismatch, reads it, then deletes it. Read IDs are cut at
# the first space. When a read has several alignments, their reference IDs
# are concatenated with "|.|" in file order.
#
# Returns ($prefix, \%match) where %match maps read ID -> reference ID(s).
# Dies when the format file can not be opened.
sub _loadMateMatches {
	my ($readFile, $refName, $maxMismatch) = @_;
	
	my $prefix = $readFile;
	$prefix =~ s/\.fasta$//;
	my $formatFile = $prefix.".".$refName.".mis_".$maxMismatch.".format";
	
	my %match = ();
	open(my $format, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	while ( my $line = <$format> ) {
		chomp $line;
		# fields: readID, strand, matchID, offset, readSeq, mismatch
		my @field = split(/\t/, $line);
		next if scalar(@field) < 3; # blank or truncated line: no reference ID
		my ($readID) = split(/\ /, $field[0]);
		next if !defined($readID) or $readID eq "";
		my $matchID = $field[2];
		if ( exists($match{$readID}) ) {
			$match{$readID} = $match{$readID}."|.|".$matchID;
		} else {
			$match{$readID} = $matchID;
		}
	}
	close($format);
	
	# the format file is only an intermediate result
	unlink($formatFile) or warn "Can not remove $formatFile: $!\n";
	
	return ($prefix, \%match);
}

# Detect fusion genes and count fusion gene expressions.
#
# Arguments:
#   $readFile1, $readFile2 - mate 1 / mate 2 FASTA files previously mapped by match()
#   $refName               - reference database name, resolved by getBowtieIndex()
#   $maxMismatch           - mismatch setting used by match() (part of the format file name)
#
# A read pair is counted only when both mates have an alignment under the same
# read ID. Pairs whose mates map to the same reference ID(s) are counted as a
# normal gene; pairs whose mates map to different reference IDs are counted as
# a fusion.
#
# Side effects:
#   - reads and deletes both format files
#   - writes <prefix>.gene   (readNum, matchID, description)
#   - writes <prefix>.fusion (readNum, matchID1, description1, matchID2, description2)
#   where <prefix> is the mate 2 file name without ".fasta" and without a
#   trailing "_prinseq_2". Descriptions come from "<bowtie index>.description"
#   (two tab-separated columns); a missing description is written as an empty field.
#
# Dies when a format file can not be opened or an output file can not be written.
sub countFusion {
	my ($readFile1, $readFile2, $refName, $maxMismatch) = @_;
	
	# read alignments for mate 1 and mate 2
	my (undef, $match1) = _loadMateMatches($readFile1, $refName, $maxMismatch);
	my ($prefix, $match2) = _loadMateMatches($readFile2, $refName, $maxMismatch);
	
	# detect fusion genes and count paired-end reads
	my %gene = ();
	my %fusion = ();
	foreach my $readID (keys %{$match1}) {
		next if !exists($match2->{$readID});
		my $matchID1 = $match1->{$readID};
		my $matchID2 = $match2->{$readID};
		if ( $matchID1 eq $matchID2 ) {
			$gene{$matchID1}++;
		} else {
			$fusion{$matchID1}{$matchID2}++;
		}
	}
	
	# read description file
	print "\n--- Loading description file for ".$refName." ...\n";
	my %description = ();
	my $desFile = getBowtieIndex($refName).".description";
	if ( open(my $in, "<", $desFile) ) {
		while ( my $line = <$in> ) {
			chomp $line;
			my @head = split(/\t/, $line);
			if ( scalar(@head) == 2 ) {
				$description{$head[0]} = $head[1];
			}
		}
		close($in);
	} else {
		# best effort: keep going, descriptions are simply left empty
		print "need to build description index file: $desFile\n";
	}
	# look up a description, falling back to an empty field
	my $describe = sub {
		my ($matchID) = @_;
		return exists($description{$matchID}) ? $description{$matchID} : "";
	};
	
	$prefix =~ s/\_prinseq_2$//;
	# write normal gene expression file
	my $geneFile = $prefix.".gene";
	print "\n--- write normal gene file: ".$geneFile."\n";
	open(my $geneOut, ">", $geneFile) or die "Can not write $geneFile: $!\n";
	foreach my $matchID1 (sort keys %gene) {
		print $geneOut join("\t", $gene{$matchID1}, $matchID1, $describe->($matchID1))."\n";
	}
	close($geneOut) or die "Can not write $geneFile: $!\n";
	
	# write fusion gene expression file
	my $fusionFile = $prefix.".fusion";
	print "\n--- write fusion gene file: ".$fusionFile."\n";
	open(my $fusionOut, ">", $fusionFile) or die "Can not write $fusionFile: $!\n";
	foreach my $matchID1 (sort keys %fusion) {
		my $description1 = $describe->($matchID1);
		foreach my $matchID2 (sort keys %{ $fusion{$matchID1} }) {
			my @column = (
				$fusion{$matchID1}{$matchID2},
				$matchID1, $description1,
				$matchID2, $describe->($matchID2),
			);
			print $fusionOut join("\t", @column)."\n";
		}
	}
	close($fusionOut) or die "Can not write $fusionFile: $!\n";
}

# Resolve a reference database name to its bowtie index path.
#
# Argument: $refName - reference database name (e.g. "human_RNA")
# Returns:  $dataDIR joined with the database's sub-path, or "" when the
#           name is not in the table below.
sub getBowtieIndex {
	my ($refName) = @_;
	
	# The table lives inside the sub so that it is populated on every call,
	# including calls made before execution reaches this point of the file.
	my %subDirFor = (
		# human miRNA/lncRNA/RNA/ncRNA/DNA
		"human_miRNA"     => "/miRBase/hairpin_hsa_anno",
		"human_miRNA_sub" => "/miRBase/hairpin_hsa_sub_anno",
		"human_repSeq"    => "/RepBase/humrep",
		"human_subSeq"    => "/RepBase/humsub",
		"human_lncRNA"    => "/LNCipedia/lncipedia_3_0",
		"human_RNA"       => "/RefSeq/human.rna",
		"human_ncRNA"     => "/Ensembl/Homo_sapiens.GRCh38.ncrna",
		"human_DNA"       => "/UCSC/hg38",
		# exogenous ribosomal RNA
		"bacteria_rRNA"   => "/RDP/release11_3_Bacteria_unaligned",
		"archaea_rRNA"    => "/RDP/release11_3_Archaea_unaligned",
		"fungi_rRNA"      => "/RDP/release11_3_Fungi_unaligned",
		"all_rRNA_SSU"    => "/SILVA/SILVA_119_SSURef_tax_silva",
		"all_rRNA_LSU"    => "/SILVA/SILVA_119_LSURef_tax_silva",
		# bacteria small regulatory RNA
		"bacteria_sRNA"   => "/BSRD/BSRD_sRNA_sequences",
		# human microbiome CDS
		"microbiome_CDS_blood"            => "/HMP/Blood.cds",
		"microbiome_CDS_heart"            => "/HMP/Heart.cds",
		"microbiome_CDS_lymph"            => "/HMP/Lymph_Node.cds",
		"microbiome_CDS_gastrointestinal" => "/HMP/Gastrointestinal_tract.cds",
		"microbiome_CDS_urogenital"       => "/HMP/Urogenital_tract.cds",
		"microbiome_CDS_oral"             => "/HMP/Oral.cds",
		"microbiome_CDS_airways"          => "/HMP/Airways.cds",
		"microbiome_CDS_skin"             => "/HMP/Skin.cds",
		"microbiome_CDS_unknown"          => "/HMP/Unknown.cds",
		# human microbiome DNA
		"microbiome_DNA_blood"            => "/HMP/Blood.nuc",
		"microbiome_DNA_heart"            => "/HMP/Heart.nuc",
		"microbiome_DNA_lymph"            => "/HMP/Lymph_Node.nuc",
		"microbiome_DNA_gastrointestinal" => "/HMP/Gastrointestinal_tract.nuc",
		"microbiome_DNA_urogenital"       => "/HMP/Urogenital_tract.nuc",
		"microbiome_DNA_oral"             => "/HMP/Oral.nuc",
		"microbiome_DNA_airways"          => "/HMP/Airways.nuc",
		"microbiome_DNA_skin"             => "/HMP/Skin.nuc",
		"microbiome_DNA_unknown"          => "/HMP/Unknown.nuc",
		# bacteria ncRNA/CDS/DNA
		"bacteria_ncRNA"  => "/Bacteria/all.ncrna",
		"bacteria_CDS"    => "/Bacteria/all.cds",
		"bacteria_DNA"    => "/Bacteria/all.dna",
		# all miRNA
		"all_miRNA"       => "/miRBase/hairpin_anno",
		"all_miRNA_sub"   => "/miRBase/hairpin_sub_anno",
		# all RNA/DNA (nt)
		"nt_bacteria_1"   => "/NCBI/nt_bacteria_part1",
		"nt_bacteria_2"   => "/NCBI/nt_bacteria_part2",
		"nt_fungi"        => "/NCBI/nt_fungi",
		"nt_virus"        => "/NCBI/nt_virus",
		"nt_plant"        => "/NCBI/nt_plant",
		"nt_human"        => "/NCBI/nt_human",
		"nt_mouse"        => "/NCBI/nt_mouse",
		"nt_mammal"       => "/NCBI/nt_mammal",
		"nt_chordata"     => "/NCBI/nt_chordata",
		"nt_bug"          => "/NCBI/nt_arthropod",
		"nt_worm"         => "/NCBI/nt_nematode",
		"nt_other"        => "/NCBI/nt_other",
	);
	
	# unknown database name: signal "no index" with an empty string
	return "" if !exists($subDirFor{$refName});
	
	return $dataDIR.$subDirFor{$refName};
}