#!/usr/bin/perl -w

#===============================================================================
# Build the taxonomy annotation file "taxonomyAnno", which holds the
# classes/ranks of interest, for use with taxFeature.pl.
# NCBI files to download (the download commands below are commented out):
#   taxdump.tar.gz        from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
#   gi_taxid_nucl.dmp.gz  from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
#===============================================================================

use strict;
use warnings;

###### download and extract taxonomy files from NCBI #####
#system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz");
#system("tar -zxf taxdump.tar.gz");
#system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz");
#system("gunzip gi_taxid_nucl.dmp.gz");


# Ranks written to the annotation file, in output column order.
my @classification = qw(superkingdom kingdom phylum class order family genus species);

# get name of species
# Fill %name with tax_id => scientific name, read from names.dmp in the
# current directory. Each record is split on "|" (with any surrounding
# whitespace); field 0 is used as the tax_id, field 1 as the name and
# field 3 as the name class. Dies if names.dmp cannot be opened.
my %name = ();
open(my $names_fh, "<", "names.dmp") or die "can't open file - names.dmp: $!";
while (my $line = <$names_fh>) {
	chomp $line;
	my @array = split(/\s*\|\s*/, $line);
	# Test the name-class column itself instead of the whole line, so a record
	# whose name text merely contains the phrase is not taken as a match.
	next unless defined $array[3] and $array[3] eq "scientific name";
	$name{$array[0]} = $array[1];
}
close($names_fh);

# get taxonomy linkage/classification
# Read nodes.dmp from the current directory. Each record is split on "|"
# (with any surrounding whitespace); field 0 is the tax_id, field 1 is stored
# in %parent and field 2 (the rank) in %class. Dies if nodes.dmp cannot be
# opened.
my %parent = ();
my %class = ();
open(my $nodes_fh, "<", "nodes.dmp") or die "can't open file - nodes.dmp: $!";
while (my $line = <$nodes_fh>) {
	chomp $line;
	my @array = split(/\s*\|\s*/, $line);
	# Skip blank or truncated records: they would otherwise create an
	# empty-key entry with undefined parent/rank and trigger warnings.
	next if @array < 3;
	$parent{$array[0]} = $array[1];
	$class{$array[0]} = $array[2];
}
close($nodes_fh);

# Lookup set of the taxonomy classes/ranks of interest: every entry of
# @classification becomes a key with value 1.
my %interestedClass = map { $_ => 1 } @classification;

# build taxonomy linkage
# For every tax_id in %parent, walk up the parent chain and record, for each
# rank of interest, the nearest ancestor carrying that rank:
#   $linkage{tax_id}{rank} = ancestor tax_id
# The walk steps to the parent before any rank is checked, so a node's own
# rank is never recorded for the node itself.
# NOTE(review): confirm that leaving out a node's own rank is intended.
my @all_taxid = keys %parent;
my %linkage = ();
foreach my $taxid (@all_taxid) {
	next if $taxid eq "";
	my $taxid_now = $taxid;
	# Stop at tax_id "1" or as soon as the chain leaves the known nodes.
	while ( ($taxid_now ne "1") and (exists $parent{$taxid_now}) ) {
		$taxid_now = $parent{$taxid_now};
		my $rank = $class{$taxid_now};
		# A parent id without a node record has no rank; skip it rather than
		# using an undefined value as a hash key.
		next unless defined $rank;
		next unless exists $interestedClass{$rank};
		# Keep only the first (closest) ancestor found for each rank.
		if ( !exists $linkage{$taxid}{$rank} ) {
			$linkage{$taxid}{$rank} = $taxid_now;
		}
	}
}

# Write the annotation table to the file "taxonomyAnno" in the current
# directory. The header line holds "#tax_id", "sci_name" and the rank names
# (each followed by ";"), tab-separated. Every following row holds a tax_id,
# its scientific name, and for each rank the name of the linked ancestor
# (empty when none is recorded), each followed by ";".
# Dies if the file cannot be opened or closed.
open(my $out_fh, ">", "taxonomyAnno") or die "can't open file - taxonomyAnno: $!";
print $out_fh "#tax_id"."\t"."sci_name"."\t".join(";", @classification).";\n";
# Default sort: tax_ids are ordered as strings, not numerically.
@all_taxid = sort @all_taxid;
foreach my $taxid (@all_taxid) {
	# A tax_id without a scientific name gets an empty name column.
	my $sci_name = defined $name{$taxid} ? $name{$taxid} : "";
	print $out_fh $taxid."\t".$sci_name."\t";
	foreach my $c (@classification) {
		if ( exists $linkage{$taxid}{$c} ) {
			my $ancestor_name = $name{$linkage{$taxid}{$c}};
			print $out_fh $ancestor_name if defined $ancestor_name;
		}
		print $out_fh ";";
	}
	print $out_fh "\n";
}
# Buffered write errors only surface at close, so check it.
close($out_fh) or die "can't close file - taxonomyAnno: $!";
