#!/usr/bin/perl -w

#===============================================================================
# build description index file for refDB
# ARGV[0]: input reference sequence file (FASTA file, e.g. *.fa)
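# ARGV[1]: (optional) code for a particular type of refDB; when omitted, each
#          sequence ID and its description are written out unchanged. Codes:
#
# "--miRNA"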
# annotated precursor miRNA from miRBase (containing self-defined composite structure in sequence head)
# e.g., ">hsa-mir-101-1|0:74||hsa-miR-101-1-5p|10:31||hsa-miR-101-1-3p|46:66| MI0000103 Homo sapiens miR-101-1 stem-loop"
#
# "--RDP"
# rRNA from RDP (containing unified taxonomy information in sequence description)
# e.g., "uncultured archaeon; SK442 Lineage=Root;rootrank;Archaea;domain;"Crenarchaeota";phylum;Thermoprotei;class;Acidilobales;order;Acidilobaceae;family;Acidilobus;genus"
#
# output: description index file (*.description)
# e.g., buildDescription.pl hg38.fa
# e.g., buildDescription.pl hairpin_hsa_anno.fna --miRNA
# e.g., buildDescription.pl hairpin_hsa_sub_anno.fna --miRNA
# e.g., buildDescription.pl hairpin_anno.fna --miRNA
# e.g., buildDescription.pl hairpin_sub_anno.fna --miRNA
# e.g., buildDescription.pl release11_3_Bacteria_unaligned.fa --RDP
# e.g., buildDescription.pl release11_3_Archaea_unaligned.fa --RDP
# e.g., buildDescription.pl release11_3_Fungi_unaligned.fa --RDP
#===============================================================================

use strict;
use warnings;

#-------------------------------------------------------------------------------
# Main script body.
#
# Reads the FASTA file named by ARGV[0] and, for every header line (">..."),
# writes one or more "ID<TAB>description" lines to "<input minus extension>.description".
#
# ARGV[1] selects how the header is interpreted:
#   (absent)   the header is written as-is, with the first space turned into a TAB
#   --miRNA    composite IDs ("pre|0:74||mat-5p|10:31||mat-3p|46:66|") are expanded
#              into one line per precursor / mature miRNA
#   --RDP      the "Lineage=" part of the description is converted into
#              "superkingdom;kingdom;phylum;class;order;family;genus;name--description"
#
# Dies on a missing argument, an unknown type code, or a file that cannot be opened.
# Exits with status 1 if --RDP is given but a header carries no " Lineage=" part.
#-------------------------------------------------------------------------------
my $refSeq = $ARGV[0];
my $index = $ARGV[1];

die "Usage: $0 <reference.fa> [--miRNA|--RDP]\n"
	unless defined $refSeq and length $refSeq;

# An unknown type code would otherwise match neither branch below and silently
# produce an empty description file.
if ( $index and $index ne "--miRNA" and $index ne "--RDP" ) {
	die "Unknown refDB type '$index' (expected --miRNA or --RDP) !!!\n";
}

# Strip only the final extension of the file name itself. The lookbehind keeps
# dot-files and "./dir/name" intact, and "[^.\/]*" refuses to cut at a dot that
# belongs to a directory component.
(my $prefix = $refSeq) =~ s/(?<=[^\/])\.[^.\/]*$//;
my $desFile = $prefix.".description";

open(my $in, "<", $refSeq) or die "Can not open $refSeq !!!\n";
open(my $out, ">", $desFile) or die "Can not write $desFile: $! !!!\n";

while ( my $line = <$in> ) {
	chomp $line;
	$line =~ s/\r$//;	# tolerate CRLF files, otherwise "\r" ends up in the description
	next unless $line =~ /^>/;	# only header lines carry descriptions

	# Normalise the header to "ID<TAB>description": existing TABs become spaces,
	# then the first space (end of the sequence ID) becomes the single TAB.
	my $head = $line;
	$head =~ s/^>//;
	$head =~ s/\t/\ /g;
	$head =~ s/\ /\t/;

	# Default mode: no type code, write ID and description directly.
	if ( !$index ) {
		print {$out} $head."\n";
		next;
	}

	# LIMIT 2 keeps an empty description instead of dropping it; both parts
	# default to "" so headers without a description do not raise warnings.
	my ($matchID, $description) = split(/\t/, $head, 2);
	$matchID = "" unless defined $matchID;
	$description = "" unless defined $description;

	if ( $index eq "--miRNA" ) {
		# Annotated precursor miRNA header, e.g.
		# ">hsa-mir-101-1|0:74||hsa-miR-101-1-5p|10:31||hsa-miR-101-1-3p|46:66| MI0000103 Homo sapiens miR-101-1 stem-loop"
		# Each "||"-separated entry is "ID|start:end"; only the ID is kept.
		foreach my $annoMatchID ( split(/\|\|/, $matchID) ) {
			my ($id) = split(/\|/, $annoMatchID);
			$id = "" unless defined $id;
			print {$out} $id."\t".$description."\n";
			# The first entry is the precursor and keeps " stem-loop";
			# the mature miRNAs that follow get the description without it.
			$description =~ s/\ stem-loop$//;
		}
	} else {
		# --RDP: rRNA header with unified taxonomy, e.g.
		# "uncultured archaeon; SK442 Lineage=Root;rootrank;Archaea;domain;"Crenarchaeota";phylum;...;Acidilobus;genus"
		if ( $description !~ /\ Lineage\=/ ) {
			close($out);
			print STDERR "   Not a reference sequence file from RDP!!!\n\n";
			exit 1;
		}

		my ($nameField, $lineage) = split(/\ Lineage\=/, $description, 2);
		$lineage = "" unless defined $lineage;

		# The organism name is the text before the first ";" of the description.
		my ($taxName) = split(/\;/, $nameField);
		$taxName = "n/a" unless defined $taxName and length $taxName;

		print {$out} $matchID."\t".rdp_tax_info($lineage).$taxName."\-\-".$nameField."\n";
	}
}
close($in);
close($out) or die "Can not finish writing $desFile: $! !!!\n";

# Convert an RDP lineage string ("Root;rootrank;Name;rank;Name;rank;...") into
# "superkingdom;kingdom;phylum;class;order;family;genus;" (always seven fields,
# each terminated by ";"; ranks missing from the lineage are left empty).
# "domain" is the superkingdom for Bacteria and Archaea; for anything else the
# superkingdom is "Eukaryota" and the domain name is used as the kingdom.
sub rdp_tax_info {
	my ($lineage) = @_;

	$lineage =~ s/\"//g;			# RDP quotes some names, e.g. "Crenarchaeota"
	$lineage =~ s/^.*?\;rootrank\;//;	# drop the leading "Root;rootrank;" pair

	# The lineage is a flat list of (name, rank) pairs. Looking ranks up by
	# pair keeps intermediate ranks such as "subclass" or "suborder" out of
	# the names of the ranks that are reported.
	my @fields = split(/\;/, $lineage);
	my %nameOf;
	while ( @fields >= 2 ) {
		my $name = shift @fields;
		my $rank = shift @fields;
		$nameOf{$rank} = $name unless exists $nameOf{$rank};
	}

	my $taxInfo = "Eukaryota"."\;";
	foreach my $rank ("domain", "phylum", "class", "order", "family", "genus") {
		my $name = defined $nameOf{$rank} ? $nameOf{$rank} : "";
		if ( $rank eq "domain" and ($name eq "Bacteria" or $name eq "Archaea") ) {
			# prokaryotes: the domain is the superkingdom, the kingdom field stays empty
			$taxInfo = $name."\;";
		} else {
			$taxInfo = $taxInfo.$name;
		}
		$taxInfo = $taxInfo."\;";
	}
	return $taxInfo;
}
