#!/usr/bin/perl -w

#===============================================================================
# add taxonomy information to features (matchType, matchID, description, taxonomy, readNum)
# mainly using taxonomy annotation file - "taxonomyAnno" (built by buildTaxonamy.pl)
#
# need miRBase organism taxid file - "organisms.txt", downloaded from miRBase at:
# ftp://mirbase.org/pub/mirbase/CURRENT/
#
# need gi-taxid mapping file - "gi_taxid_nucl.dmp", downloaded from NCBI at:
# ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
#
# also need SILVA accid - NCBI taxid mapping files - "taxmap_embl_ssu_ref_119.txt" and "taxmap_embl_lsu_ref_119.txt"
# downloaded from SILVA ribosomal RNA database at:
# http://www.arb-silva.de/no_cache/download/archive/release_119/Exports/taxonomy/
# NOTE: beware that the ".acc_taxid" file only contains SILVA's own taxid, NOT the NCBI taxid - the two are totally different!!!
#
# ARGV[0]:	feature file (*_des.feature) with description information annotated
# Output:	feature file (*_anno.feature) with taxonomy information annotated
# Usage:	taxFeature.pl all_sample_des.feature
#===============================================================================

use strict;
use warnings;

# Path of the input feature file (first command-line argument).
my $feature = $ARGV[0];

# Lookup tables filled from the reference database files further down.
my %taxName = ();			# taxid -> name (taxonomyAnno)
my %taxInfoByID = ();		# taxid -> info string (taxonomyAnno)
my %taxInfoByName = ();		# name -> info string (taxonomyAnno)
my %org_taxid = ();			# organism (first column of organisms.txt) -> taxid
my %gi_taxid = ();			# gi number -> taxid, only for gi numbers seen in the feature file
my %silva_acc_taxid = ();	# SILVA "acc.start.stop" key -> taxid

# Fields of the feature row currently being processed.
my $matchType;
my $matchID;
my $description;
my $taxonomy;

# reference database folder, also for taxonomy file
my $dataDIR = "/var/www/html/database";

# a feature file must be given on the command line; without this check an
# undefined value would reach the file test and the error message below
if ( !defined $feature or $feature eq "" ) {
	die "   Usage: taxFeature.pl all_sample_des.feature\n";
}
# test if input feature file exists
if ( !-e $feature ) {
	die "   Can not open $feature !!!\n";
}
# test if input feature file has description information
# (the dot is escaped so that only a literal "_des.feature" is accepted)
if ( $feature !~ /_des\.feature/ ) {
	print "   This is NOT a feature file (*_des.feature) with descrition information!!!\n";
	print "   Please use desFeature.pl to annotate a feature (.feature) file first!\n\n";
	exit 1;	# non-zero status: nothing was annotated
}

# get taxonomy index and information in the file - "taxonomyAnno", built by buildTaxonamy.pl
#
# Every line after the first is read as tab-separated "taxid, name, info" and
# stored in %taxName (taxid -> name), %taxInfoByID (taxid -> info) and
# %taxInfoByName (name -> info). Lines without a taxid or a name are skipped,
# and a missing info column is stored as an empty string, so that no undefined
# value ends up as a hash key or in the output later on.
my $taxonomyFile = "$dataDIR/NCBI/taxonomyAnno";
open(my $taxonomyIn, "<", $taxonomyFile) or die "can not open file: $taxonomyFile ($!)\n";
print "Loading taxonomy file ...\n";
my $firstLine = <$taxonomyIn>;	# first line is skipped; $firstLine is reused for other files below
while ( my $line = <$taxonomyIn> ) {
	$line =~ s/\r?\n\z//;	# like chomp, but also tolerates CRLF line endings
	my ($taxid, $name, $info) = split(/\t/, $line);
	next if ( !defined $taxid or $taxid eq "" );
	next if ( !defined $name or $name eq "" );
	$info = "" if ( !defined $info );
	$taxName{$taxid} = $name;
	$taxInfoByID{$taxid} = $info;
	$taxInfoByName{$name} = $info;
}
close($taxonomyIn);

# get miRBase organism taxid in "organisms.txt" downloaded from ftp://mirbase.org/pub/mirbase/CURRENT/
#
# Column 1 (organism) is mapped to column 5 (taxid) in %org_taxid.
# Rows with fewer than five columns are skipped instead of storing an
# undefined taxid.
my $inputFile = "$dataDIR/miRBase/organisms.txt";
open(my $organismIn, "<", $inputFile) or die "can not open file: $inputFile ($!)\n";
print "Loading miRBase organism taxid mapping file ...\n";
$firstLine = <$organismIn>;	# first line is skipped
while ( my $line = <$organismIn> ) {
	# the taxid is read from the fifth column; strip a CR as well so a CRLF
	# file cannot leave "\r" glued to it
	$line =~ s/\r?\n\z//;
	my @field = split(/\t/, $line);
	next if ( @field < 5 or $field[0] eq "" );
	$org_taxid{$field[0]} = $field[4];
}
close($organismIn);

# get gi number from feature file
#
# First pass over the feature file: remember every gi number it mentions in
# %match_gi, so that only those entries have to be kept when the gi-taxid
# mapping file is read. Rows are tab-separated: matchType, matchID,
# description, ...
open(my $giScanIn, "<", $feature) or die "Can't open $feature: $!\n";
print "Loading gi numbers from feature file ...\n";
my %match_gi = ();
while ( my $line = <$giScanIn> ) {
	chomp $line;
	my ($rowType, $rowID, $rowDes) = split(/\t/, $line);
	# short rows have no matchID / description column; treat them as empty
	$rowID = "" if ( !defined $rowID );
	$rowDes = "" if ( !defined $rowDes );

	my $giSource;
	if ( $rowID =~ /^gi\|/ ) { # mainly for NCBI databases
		$giSource = $rowID;
	} elsif ( $rowDes =~ /^gi\|/ ) {
		# NOTE(review): this case was labelled "HMP" here, but the annotation
		# pass labels the same test "BSRD" - confirm which one is meant
		$giSource = $rowDes;
	}
	next if ( !defined $giSource );

	# "gi|<number>|..." - the number is the second "|"-separated field
	my @head = split(/\|/, $giSource);
	if ( defined $head[1] and $head[1] ne "" ) {
		$match_gi{$head[1]} = 1;
	}
}
close($giScanIn);

# get gi-taxid mapping in "gi_taxid_nucl.dmp" downloaded from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
#
# Only gi numbers collected in %match_gi are stored in %gi_taxid. Reading
# stops as soon as every requested gi number has been found, so the rest of
# the mapping file is not scanned for nothing.
# NOTE(review): the early stop assumes each gi number occurs once in the dump
# - confirm against the NCBI file format.
$inputFile = "$dataDIR/NCBI/gi_taxid_nucl.dmp";
open(my $giTaxidIn, "<", $inputFile) or die "can not open file: $inputFile ($!)\n";
print "Loading gi-taxid mapping file ...\n";
my $giWanted = scalar(keys %match_gi);
my $giFound = 0;
if ( $giWanted > 0 ) {
	while ( my $line = <$giTaxidIn> ) {
		$line =~ s/\r?\n\z//;	# like chomp, but also tolerates CRLF line endings
		my ($gi, $taxid) = split(/\t/, $line);
		next if ( !defined $gi or !defined $taxid );	# malformed line
		next if ( !exists $match_gi{$gi} );
		$giFound++ if ( !exists $gi_taxid{$gi} );
		$gi_taxid{$gi} = $taxid;
		last if ( $giFound >= $giWanted );
	}
}
close($giTaxidIn);

# get SILVA accid - NCBI taxid mapping for SILVA databases
#
# The SSU and the LSU mapping file are read the same way, one after the other:
# columns 1-3 joined with "." form the key of %silva_acc_taxid and column 6 is
# the taxid (columns 4 and 5 are not used). Lines with fewer than six columns
# are skipped.
foreach my $subunit ( "SSU", "LSU" ) {
	$inputFile = "$dataDIR/SILVA/taxmap_embl_".lc($subunit)."_ref_119.txt";
	open(my $silvaIn, "<", $inputFile) or die "can not open file: $inputFile ($!)\n";
	print "Loading SILVA rRNA $subunit acc-taxid mapping file ...\n";
	while ( my $line = <$silvaIn> ) {
		$line =~ s/\r?\n\z//;	# like chomp, but also tolerates CRLF line endings
		my @field = split(/\t/, $line);
		next if ( @field < 6 );
		my $acc = join(".", @field[0..2]);
		my $taxid = $field[5];
		$taxid =~ tr/ //d;					# remove all spaces!!!
		$silva_acc_taxid{$acc} = $taxid;
	}
	close($silvaIn);
}

# add taxonomy information to features
#
# Second pass over the feature file: every row is copied to the output file
# with a "taxonomy" column inserted after matchType and matchID. The taxonomy
# is looked up according to the kind of match (see the branches below) and
# stays "n/a" when nothing is found.
my $outputFile = $feature;
$outputFile =~ s/\_des.feature/_anno.feature/;
# opening the output for writing truncates it, so never let it be the input
if ( $outputFile eq $feature ) {
	die "   Can not derive an output file name from $feature (expected *_des.feature) !!!\n";
}
open(my $featureIn, "<", $feature) or die "Can't open $feature: $!\n";

# output head (first line)
$firstLine = <$featureIn>;
if ( !defined $firstLine ) {
	die "   $feature is empty, no head line found !!!\n";
}
chomp $firstLine;

open(my $annoOut, ">", $outputFile) or die "Can't write $outputFile: $!\n";
print "Output annotated feature file: $outputFile\n";

# limit -1 keeps trailing empty columns of the head line; the newline is
# written explicitly so the head line is terminated even if the input's was not
my @array = split(/\t/, $firstLine, -1);
$matchType = shift(@array);
$matchID = shift(@array);
$matchType = "" if ( !defined $matchType );
$matchID = "" if ( !defined $matchID );
print {$annoOut} $matchType."\t".$matchID."\t"."taxonomy"."\t".join("\t", @array)."\n";

while ( my $line = <$featureIn> ) {
	chomp $line;
	@array = split(/\t/, $line);
	next if ( !@array );	# blank line: nothing to annotate
	$matchType = shift(@array);
	$matchID = shift(@array);
	$matchID = "" if ( !defined $matchID );		# short row without matchID
	$description = defined $array[0] ? $array[0] : "";
	$taxonomy = "n/a"; # annotated with 'n/a' - representing not avalable!!!

	# most branches only determine a taxid; it is resolved once after the chain
	my $taxid;
	if ( $matchType =~ /miRNA/ ) { # mainly for miRBase databases
		# the organism is the part of matchID before the first "-"
		my @head = split(/\-/, $matchID);
		my $organism = $head[0];
		if ( defined $organism and exists $org_taxid{$organism} ) {
			$taxid = $org_taxid{$organism};
		}
	} elsif ( $matchID =~ /^gi\|/ ) { # mainly for NCBI databases
		my @head = split(/\|/, $matchID);
		my $gi = $head[1];
		if ( defined $gi and exists $gi_taxid{$gi} ) {
			$taxid = $gi_taxid{$gi};
		}
	} elsif ( $description =~ /^gi\|/ ) { # mainly for BSRD database
		my @head = split(/\|/, $description);
		my $gi = $head[1];
		if ( defined $gi and exists $gi_taxid{$gi} ) {
			$taxid = $gi_taxid{$gi};
		}
	} elsif ( $matchType =~ /^microbiome/ ) { # mainly for HMP databases
		# the name is the last piece of the description when split at "[" and "]"
		my @head = split(/\[|\]/, $description);
		my $name = pop @head;
		if ( defined $name and exists $taxInfoByName{$name} ) {
			my $info = $taxInfoByName{$name};
			$info = "" if ( !defined $info );
			$taxonomy = $info.$name;
		}
	} elsif ( $matchType =~ /^all_rRNA_(?:SSU|LSU)/ ) { # mainly for SILVA databases
		if ( exists $silva_acc_taxid{$matchID} ) {
			$taxid = $silva_acc_taxid{$matchID};
		}
	} elsif ( ($matchType =~ /^(?:bacteria|archaea|fungi)_rRNA/) ) { # mainly for RDP databases
		# description is "<taxonomy>--<description>"; split at the first "--"
		# only, so a description that itself contains "--" is kept whole
		my ($rdpTaxonomy, $rdpDescription) = split(/\-\-/, $description, 2);
		if ( defined $rdpTaxonomy and $rdpTaxonomy ne "" ) {
			$taxonomy = $rdpTaxonomy;
		}
		if ( !defined $rdpDescription or $rdpDescription eq "" ) {
			$rdpDescription = "n/a"; # annotated with 'n/a' - representing not avalable!!!
		}
		$description = $rdpDescription;
		$array[0] = $description;
	}

	if ( defined $taxid ) {
		my $found = taxonomyByTaxid($taxid);
		$taxonomy = $found if ( defined $found );
	}

	# output matchType, matchID, taxonomy, description ...
	print {$annoOut} $matchType."\t".$matchID."\t".$taxonomy."\t".join("\t", @array)."\n";
}
close($featureIn);
# buffered write errors only show up when the file is closed
close($annoOut) or die "Can't finish writing $outputFile: $!\n";

# Return the taxonomy string for a taxid: the info string from %taxInfoByID
# followed by the name from %taxName. Returns undef when the taxid is missing
# from either table, so the caller can keep its "n/a" default.
sub taxonomyByTaxid {
	my ($taxid) = @_;
	return if ( !defined $taxid );
	return if ( !exists $taxName{$taxid} or !exists $taxInfoByID{$taxid} );
	my $info = defined $taxInfoByID{$taxid} ? $taxInfoByID{$taxid} : "";
	my $name = defined $taxName{$taxid} ? $taxName{$taxid} : "";
	return $info.$name;
}
