#!/usr/bin/perl -w

#===============================================================================
# Designed specifically for horse (Equus caballus) studies.
# Adds description information to the matchID of a matched read sequence file (.profile);
# the description information derives from corresponding refDB (built by buildDescrition.pl)
# ARGV[0]: 	any matched read sequence file (*.profile) for single sample or all samples
# Output: 	matched read sequence file (*_des.profile) with description information annotated 
# Usage:	desProfile.pl all_sample.profile
#===============================================================================

use strict;
use warnings;

# Path of the matched read sequence file to annotate (first command-line argument).
my $profile = $ARGV[0];
# Description lookup filled from the *.description files:
#   $indexDes{refName}{matchID} = description
my %indexDes = ();
# Working variables shared by the top-level loading and annotation code.
my $refName;
my $matchType;
my $readSeq;
my $readLength;
my $matchID;
# $offset and $matchNum are declared here but not referenced by the code below.
my $offset;
my $matchNum;
my $description;

# Without this check a missing argument reaches the -e test as undef and only
# yields "uninitialized value" warnings plus an error message with no file name.
if ( !defined $profile || $profile eq "" ) {
	die "Usage: desProfile.pl all_sample.profile\n";
}

# test profile file first!!
if ( !-e $profile ) {
	die "   Can not open $profile !!!\n";
}

# Root directory of the bowtie reference databases; getBowtieIndex() prepends
# it to the sub path of each database.
my $dataDIR = "/var/www/html/database/bowtie";

# Names of the reference databases whose description files are loaded.
# The list is written as consecutive groups; the overall order is the order
# in which the description files are read.
my @refDB = (
	# vector sequences
	qw( nt_Vec ),
	# miRNA hairpins
	qw( horse_miRNA horse_miRNA_sub virus_miRNA plant_miRNA all_miRNA all_miRNA_sub ),
	# human rtRNA subset of nt
	qw( nt_human_rtRNA ),
	# mammalian repeat sequences
	qw( mammal_repSeq ),
	# horse ncRNA / CDS / DNA
	qw( horse_ncRNA horse_CDS horse_DNA ),
	# rRNA (SSU / LSU)
	qw( all_rRNA_SSU all_rRNA_LSU ),
	# remaining rtRNA subsets of nt
	qw(
		nt_bacteria_rtRNA nt_fungi_rtRNA nt_virus_rtRNA nt_plant_rtRNA
		nt_mouse_rtRNA nt_mammal_rtRNA nt_chordata_rtRNA nt_bug_rtRNA
		nt_worm_rtRNA nt_other_rtRNA
	),
	# bacteria small RNA
	qw( bacteria_sRNA ),
	# microbiome CDS
	qw(
		microbiome_CDS_blood microbiome_CDS_heart microbiome_CDS_lymph
		microbiome_CDS_gastrointestinal microbiome_CDS_urogenital
		microbiome_CDS_oral microbiome_CDS_airways microbiome_CDS_skin
		microbiome_CDS_unknown
	),
	# microbiome DNA
	qw(
		microbiome_DNA_blood microbiome_DNA_heart microbiome_DNA_lymph
		microbiome_DNA_gastrointestinal microbiome_DNA_urogenital
		microbiome_DNA_oral microbiome_DNA_airways microbiome_DNA_skin
		microbiome_DNA_unknown
	),
	# bacteria ncRNA / CDS / DNA
	qw( bacteria_ncRNA bacteria_CDS bacteria_DNA ),
	# main subsets of nt
	qw(
		nt_bacteria_1 nt_bacteria_2 nt_fungi nt_virus nt_plant nt_human
		nt_mouse nt_mammal nt_chordata nt_bug nt_worm nt_other
	),
);

# Load the description file of every reference database into %indexDes,
# stored as $indexDes{refName}{matchID} = description.
foreach my $dbName (@refDB) {
	print "Loading description file for ".$dbName." ...\n";
	my $index = getBowtieIndex($dbName).".description";

	# A database whose description file cannot be opened is reported and
	# skipped, instead of falling through and reading from an unopened handle.
	my $desFH;
	if ( !open($desFH, "<", $index) ) {
		print "need to build description index file: $index\n";
		next;
	}

	while ( my $line = <$desFH> ) {
		chomp $line;
		# only lines made of exactly two tab-separated fields
		# (matchID, description) are indexed; anything else is ignored
		my @head = split(/\t/, $line);
		if ( scalar(@head) == 2 ) {
			$indexDes{$dbName}{ $head[0] } = $head[1];
		}
	}
	close($desFH);
}

# Annotate the profile: every row of $profile is copied to the *_des.profile
# file with a "description" column inserted right after the matchID column.
open(my $inFH, "<", $profile) or die "Can not open $profile !!!\n";

# Only a trailing ".profile" is rewritten. When the name does not end in
# ".profile" the suffix is appended instead, so the output name can never be
# equal to the input name (which would truncate the file while it is read).
my $outputFile = $profile;
unless ( $outputFile =~ s/\.profile\z/_des.profile/ ) {
	$outputFile = $profile."_des.profile";
}
open(my $outFH, ">", $outputFile) or die "Can not write $outputFile: $!\n";
print "Output described profile file: $outputFile\n";

# read and output head (first line); nothing is written for an empty input file
my $firstLine = <$inFH>;
if ( defined $firstLine ) {
	# strip the newline first so that it can not end up inside a column name
	chomp $firstLine;
	my @headCols = split(/\t/, $firstLine);
	# pad short headers so the four leading columns are always defined
	push(@headCols, "") while scalar(@headCols) < 4;
	my @keyCols = splice(@headCols, 0, 4);
	print {$outFH} join("\t", @keyCols, "description")."\t".join("\t", @headCols)."\n";
}

# read and output data
while ( my $line = <$inFH> ) {
	chomp $line;
	next if $line eq "";	# blank lines carry no read and are not copied

	my @cols = split(/\t/, $line);
	# pad short rows so the four leading columns are always defined
	push(@cols, "") while scalar(@cols) < 4;
	my ($type, $seq, $len, $id) = splice(@cols, 0, 4);

	# the text before the first ".mis_" of matchType is used as the name of
	# the reference database the read was matched against
	my ($dbName) = split(/\.mis_/, $type);

	my $des = "n/a"; # annotated with 'n/a' - representing not available!!!
	if ( defined $dbName && exists $indexDes{$dbName} && exists $indexDes{$dbName}{$id} ) {
		$des = $indexDes{$dbName}{$id};
	}

	# output matchType, readSeq, readLength, matchID, description, offset, matchNum, ...
	print {$outFH} join("\t", $type, $seq, $len, $id, $des)."\t".join("\t", @cols)."\n";
}
close($inFH);
# buffered write errors (e.g. full disk) only surface when the handle is closed
close($outFH) or die "Can not finish writing $outputFile: $!\n";

#================================ sub functions ================================
# Map a reference database name to the path prefix of its bowtie index.
#   $refName : name of the reference database, e.g. "horse_miRNA"
#   returns  : $dataDIR followed by the sub path of the database, or an
#              empty string when the name is not listed in the table below
sub getBowtieIndex {
	my ($refName) = @_;

	# The table lives inside the sub on purpose: the sub is called from
	# top-level code located above this definition, so a file-level hash
	# assigned at this position would still be empty at call time.
	my %subDirOf = (
		# horse miRNA
		"horse_miRNA"     => "/miRBase/hairpin_eca_anno",
		"horse_miRNA_sub" => "/miRBase/hairpin_eca_sub_anno",
		# virus miRNA
		"virus_miRNA"     => "/miRBase/hairpin_virus_anno",
		# plant miRNA
		"plant_miRNA"     => "/miRBase/hairpin_plant_anno",
		# all miRNA
		"all_miRNA"       => "/miRBase/hairpin_anno",
		"all_miRNA_sub"   => "/miRBase/hairpin_sub_anno",
		# mammal repSeq
		"mammal_repSeq"   => "/RepBase/mammal_rep",
		# horse ncRNA/CDS/DNA
		"horse_ncRNA"     => "/Ensembl/Equus_caballus.EquCab2.ncrna",
		"horse_CDS"       => "/Ensembl/Equus_caballus.EquCab2.cds.all",
		"horse_DNA"       => "/Ensembl/Equus_caballus.EquCab2.dna.toplevel",
		# exogenous ribosomal RNA
		"bacteria_rRNA"   => "/RDP/release11_3_Bacteria_unaligned",
		"archaea_rRNA"    => "/RDP/release11_3_Archaea_unaligned",
		"fungi_rRNA"      => "/RDP/release11_3_Fungi_unaligned",
		"all_rRNA_SSU"    => "/SILVA/SILVA_119_SSURef_tax_silva",
		"all_rRNA_LSU"    => "/SILVA/SILVA_119_LSURef_tax_silva",
		# bacteria small regulatory RNA
		"bacteria_sRNA"   => "/BSRD/BSRD_sRNA_sequences",
		# microbiome CDS
		"microbiome_CDS_blood"            => "/HMP/Blood.cds",
		"microbiome_CDS_heart"            => "/HMP/Heart.cds",
		"microbiome_CDS_lymph"            => "/HMP/Lymph_Node.cds",
		"microbiome_CDS_gastrointestinal" => "/HMP/Gastrointestinal_tract.cds",
		"microbiome_CDS_urogenital"       => "/HMP/Urogenital_tract.cds",
		"microbiome_CDS_oral"             => "/HMP/Oral.cds",
		"microbiome_CDS_airways"          => "/HMP/Airways.cds",
		"microbiome_CDS_skin"             => "/HMP/Skin.cds",
		"microbiome_CDS_unknown"          => "/HMP/Unknown.cds",
		# microbiome DNA
		"microbiome_DNA_blood"            => "/HMP/Blood.nuc",
		"microbiome_DNA_heart"            => "/HMP/Heart.nuc",
		"microbiome_DNA_lymph"            => "/HMP/Lymph_Node.nuc",
		"microbiome_DNA_gastrointestinal" => "/HMP/Gastrointestinal_tract.nuc",
		"microbiome_DNA_urogenital"       => "/HMP/Urogenital_tract.nuc",
		"microbiome_DNA_oral"             => "/HMP/Oral.nuc",
		"microbiome_DNA_airways"          => "/HMP/Airways.nuc",
		"microbiome_DNA_skin"             => "/HMP/Skin.nuc",
		"microbiome_DNA_unknown"          => "/HMP/Unknown.nuc",
		# bacteria ncRNA/CDS/DNA
		"bacteria_ncRNA"  => "/Bacteria/all.ncrna",
		"bacteria_CDS"    => "/Bacteria/all.cds",
		"bacteria_DNA"    => "/Bacteria/all.dna",
		# virus RNA/CDS/DNA
		"virus_RNA"       => "/Viruses/all.ncrna",
		"virus_CDS"       => "/Viruses/all.cds",
		"virus_DNA"       => "/Viruses/all.dna",
		# all RNA/DNA (nt)
		"nt_Vec"            => "/NCBI/nt_Vec",
		"nt_virus"          => "/NCBI/nt_virus_main",
		"nt_virus_rtRNA"    => "/NCBI/nt_virus_rtRNA",
		"nt_bacteria_1"     => "/NCBI/nt_bacteria_main_part01",
		"nt_bacteria_2"     => "/NCBI/nt_bacteria_main_part02",
		"nt_bacteria_rtRNA" => "/NCBI/nt_bacteria_rtRNA",
		"nt_fungi"          => "/NCBI/nt_fungi_main",
		"nt_fungi_rtRNA"    => "/NCBI/nt_fungi_rtRNA",
		"nt_plant"          => "/NCBI/nt_plant_main",
		"nt_plant_rtRNA"    => "/NCBI/nt_plant_rtRNA",
		"nt_human"          => "/NCBI/nt_human_main",
		"nt_human_rtRNA"    => "/NCBI/nt_human_rtRNA",
		"nt_mouse"          => "/NCBI/nt_mouse_main",
		"nt_mouse_rtRNA"    => "/NCBI/nt_mouse_rtRNA",
		"nt_mammal"         => "/NCBI/nt_mammal_main",
		"nt_mammal_rtRNA"   => "/NCBI/nt_mammal_rtRNA",
		"nt_chordata"       => "/NCBI/nt_chordata_main",
		"nt_chordata_rtRNA" => "/NCBI/nt_chordata_rtRNA",
		"nt_bug"            => "/NCBI/nt_arthropod_main",
		"nt_bug_rtRNA"      => "/NCBI/nt_arthropod_rtRNA",
		"nt_worm"           => "/NCBI/nt_nematode_main",
		"nt_worm_rtRNA"     => "/NCBI/nt_nematode_rtRNA",
		"nt_other"          => "/NCBI/nt_other_main",
		"nt_other_rtRNA"    => "/NCBI/nt_other_rtRNA",
	);

	# unknown database name: no index path
	return "" unless exists $subDirOf{$refName};

	return $dataDIR.$subDirOf{$refName};
}