#!/usr/bin/perl -w

#===============================================================================
# PisaR [ver1.6.3]:		Pipeline for small RNA sequencing (sRNA-Seq) data analysis
# update [03/03/2016]:	Name changed to sRNAnalyzer; this version is designed specifically for mitochondrial DNA sequence mapping
# update [03/19/2015]:	highlight exogenous miRNA and filter more reads mapped to nt_human (0-1-2)
# update [02/18/2015]:	filter out nc_Vec reference sequences containing human
# 						mature miRNA sequences, only update database, not affect codes
# update [01/21/2015]:	add probabilistic model for ranking multiple matches by frequency
# update [01/09/2015]:	filter out rtRNA (ribosomal RNA, transfer RNA, and mitochondrial RNA/DNA)
# update [12/18/2014]:	only report first ten matchIDs if other multiple mapping > 10
# update [12/16/2014]:	removed RDP from the pipeline
# update [12/10/2014]:	report all for nt series databases (time-consuming)
# update [12/08/2014]:	for summarization/aggregation purpose, only count matchTypes with (+) 
# 						strand for RNAs and matchTypes with both (+/-) strands for non-RNAs
# Update [11/25/2014]:	If multiple alignments of one read are all reverse (-) strand,
#						do NOT filter out this read (keep it for next step) for RNAs
#						although count the first reverse (-) alignment and
#						report all these reverse (-) strand alignments,
# Update [11/21/2014]:	"forward strand preference" for RNAs. For multiple alignments
#						of each read, report all, while only count the first
#						alignment with forward (+) strand. If all the alignments
#						are mapped to reverse (-) strand of reference sequences,
#						then only count the first reverse (-) alignment.
# Update [11/13/2014]:	first, map reads directly to precursor miRNA sequences,
#						annotated with mature miRNA location info - |matchID|start:end|,
#						second, reassign reads to mature miRNAs using these location info
#						after alignment,
#						finally, summarize read counts based on mature miRNA matchID.
# Update [09/02/2014]:	specific for isomiR discovery
#
# ARGV[0]: maxReadLength
# ARGV[1]: processed fasta file (adapter-trimmed and duplicate-read-collapsed)
#          read head: >SN-countNumber, e.g., >6-128
# this version use formatted file as bowtie output
# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
#===============================================================================

use strict;
use warnings;

# command-line arguments
#   ARGV[0]: maximum read length (number of length columns written to *.dist)
#   ARGV[1]: processed FASTA read file; a trailing ".fa" is dropped to build
#            the prefix shared by all output files
if ( scalar(@ARGV) < 2 ) {
	die "Usage: $0 <maxReadLength> <reads.fa>\n";
}
my $maxReadLength = $ARGV[0];
my $readFile = $ARGV[1];
if ( ($maxReadLength !~ /^\d+$/) or ($maxReadLength < 1) ) {
	die "maxReadLength must be a positive integer (got \"$maxReadLength\") !!!\n";
}
if ( !-e $readFile ) {
	die "Can not find read file $readFile !!!\n";
}
my $prefix = $readFile;
$prefix =~ s/\.fa$//;
my $refName;
my $maxMismatch;

# reference database for mapping
my $dataDIR = "/var/www/html/database/bowtie";

# open or renew output files (the mapping steps append to them later on)
foreach my $suffix (".profile", ".feature") {
	open(my $renew, ">", $prefix.$suffix) or die "Can not create ".$prefix.$suffix.": $! !!!\n";
	close($renew);
}
open(my $distHead, ">", $prefix.".dist") or die "Can not create ".$prefix.".dist: $! !!!\n";
print {$distHead} "refName\t"."maxMismatch\t"."input/match\t"."uniqReadN\t"."readN\t";
print {$distHead} join("\t", (1..$maxReadLength)), "\n";
close($distHead) or die "Can not write ".$prefix.".dist: $! !!!\n";


# Mapping cascade. Every stage lists its reference databases, the mismatch
# levels to run (outer loop) and the "report all alignments" flag handed to
# match(). Each call consumes the reads left unmatched by the previous call.
my @stages = (
	# Vector sequence (including cloning/expression/transfer vector sequences) filtering (nt/Blast/NCBI)
	# (put UniVec in front of nt_Vec to filter against UniVec/NCBI as well)
	[ [ qw(nt_Vec) ], [ 0 ], "F" ],

	# human miRNA mapping (miRBase, piRBase, and snoRNABase)
	[ [ qw(human_miRNA human_miRNA_sub human_piRNA human_snoRNA) ], [ 0 .. 2 ], "T" ],

	# virus miRNA mapping (miRBase)
	[ [ qw(virus_miRNA) ], [ 0 ], "T" ],

	# human repetitive sequence mapping (RepBase)
	[ [ qw(human_repSeq human_subSeq) ], [ 0 .. 2 ], "F" ],

	# human ribosomal RNA (rRNA), transfer RNA (tRNA), and mitochondrial RNA/DNA mapping (nt/Blast/NCBI)
	[ [ qw(nt_human_rtRNA) ], [ 0 .. 2 ], "T" ],

	# human lncRNA/RNA/ncRNA/DNA mapping (LNCipedia, RefSeq/NCBI, Ensembl, UCSC, and nt/Blast/NCBI)
	[ [ qw(human_lncRNA human_RNA human_ncRNA human_DNA nt_human) ], [ 0 .. 2 ], "T" ],
);

foreach my $stage (@stages) {
	my ($databases, $mismatchLevels, $reportAll) = @{ $stage };
	foreach my $mismatchLevel (@{ $mismatchLevels }) {
		foreach my $database (@{ $databases }) {
			$readFile = match($readFile, $database, $mismatchLevel, $prefix, $reportAll);
		}
	}
}

# overall summary based on the reads that never matched any database
countTotalDist($readFile, $prefix);
print "\n";


#================================ sub functions ================================
# map reads to one reference database (refDB) with bowtie
#
# Arguments:
#   $readFile    - FASTA file with the reads to align
#   $refName     - reference database key, resolved through getBowtieIndex()
#   $maxMismatch - number of mismatches handed to bowtie "-v"
#   $prefix      - sample prefix used for all output and temporary file names
#   $reportAll   - "T": report all valid alignments per read (bowtie "-a"),
#                  anything else: report only the first one (bowtie "-k 1")
#
# Side effects: appends to $prefix.profile / .feature / .dist through
# countRead() and countDist(), and (re)writes $prefix."_unMatch.fa".
#
# Returns the name of the FASTA file holding the reads that were not aligned,
# i.e. the input of the next mapping step.
#
# Dies if the bowtie index is missing or incomplete, if bowtie can not be
# started, or if a temporary file can not be created or renamed.
sub match {
	my ($readFile, $refName, $maxMismatch, $prefix, $reportAll) = @_;
	
	my $matchType = $refName.".mis_".$maxMismatch;
	my $matchReads = $prefix."_Match_tmp.fa";
	my $mismatchReads = $prefix."_unMatch_tmp.fa";
	my $readNext = $prefix."_unMatch.fa";
	my $formatFile = $prefix.".".$matchType.".format";
	
	print "\n--- readFile: ".$prefix."\t"."matchType: ".$matchType."\n";
	
	# check if bowtie index exists (a failure must not look like a successful run)
	my $index = getBowtieIndex($refName);
	if ( !$index ) {
		die "No bowtie index for refDB: $refName !!!\n";
	}
	
	# check bowtie index type ("small" or "large") and index file integrity (total 6 files)
	my $indexComplete = sub {
		my ($extension) = @_;
		foreach my $part ("1", "2", "3", "4", "rev.1", "rev.2") {
			if ( !-e ($index.".".$part.".".$extension) ) {
				return 0;
			}
		}
		return 1;
	};
	my $indexType = "";						# "small" index needs no extra option
	if ( $indexComplete->("ebwt") ) {
		$indexType = "";					# use "small" index for bowtie alignments
	} elsif ( $indexComplete->("ebwtl") ) {
		$indexType = "--large-index";		# use "large" index for bowtie alignments
	} else {
		die "Incomplete or no bowtie index for refDB: $refName !!!\n";
	}
	
	# check bowtie report type ("report the first one" or "report all")
	my $reportType = "-k 1";	# use "-k 1" to report the first valid alignment per read
	if ( $reportAll eq "T" ) {
		$reportType = "-a";		# use "-a" to report all valid alignments per read
	}
	
	# multiple threads for bowtie alignment
	my $multiThreads = "-p 15"; # e.g., for hex-core processors/CPUs (2 logical cores per physical), total 16 cores, always leave one core for communication
	
	# check refDB type (RNA, CDS or DNA) to decide whether to count reads with only FW alignments
	# "--norc" means no reverse-complement (RC) alignment
	# "--nofw" means no forward (FW) strand alignment
	my $orientation = "";		# no preference for alignment orientation (+/-)
	if ( $refName =~ /RNA/ ) {
		$orientation = "--norc";# prefer forward (+) strand alignments
	}
	
	# single-quote a file name for the shell, so that spaces or shell
	# metacharacters in a path can not split or alter the bowtie command
	my $quote = sub {
		my ($text) = @_;
		$text =~ s/'/'\\''/g;
		return "'".$text."'";
	};
	
	# run one bowtie command; an impossible launch is fatal, any other
	# non-zero status is reported instead of being silently ignored
	my $runBowtie = sub {
		my ($command) = @_;
		my $status = system($command);
		if ( $status == -1 ) {
			die "Can not run bowtie: $! !!!\n";
		}
		my $exitCode = $status >> 8;
		if ( ($exitCode == 126) or ($exitCode == 127) ) {	# shell could not execute/find bowtie
			die "Can not run bowtie (is it installed and in PATH?) !!!\n";
		}
		if ( $status != 0 ) {
			warn "Warning: bowtie returned status $status for matchType: $matchType !!!\n";
		}
	};
	
	# start from empty files in case bowtie writes no matchReads or mismatchReads
	# output at all; truncating also keeps leftovers of an interrupted earlier
	# run from being taken for results of this step
	foreach my $file ($matchReads, $mismatchReads, $formatFile) {
		open(my $empty, ">", $file) or die "Can not create $file: $! !!!\n";
		close($empty);
	}

	# output bowtie alignment results to a formatted file (not SAM file)
	# with 6 fields: readID, strand, matchID, offset, readSeq, mismatch
	# http://bowtie-bio.sourceforge.net/manual.shtml#default-bowtie-output
	$runBowtie->(
		"bowtie -v $maxMismatch $orientation $indexType ".$quote->($index)
		." -f ".$quote->($readFile)
		." --al ".$quote->($matchReads)
		." --un ".$quote->($mismatchReads)
		." $reportType $multiThreads --quiet --suppress 6,7 > ".$quote->($formatFile)
	);
	
	# reassign reads to mature miRNAs using their location info on precursor miRNA sequences
	if ( $refName =~ /miRNA/ ) {
		reassignRead($formatFile, $refName, $maxMismatch, $prefix);
	}
	
	# output matched read sequence profile (*.profile) and matched feature file (*.feature) for each sample
	countRead($matchReads, $formatFile, $refName, $maxMismatch, $prefix);
	
	# output input and matched read length distribution into *.dist for each sample
	countDist($readFile, $matchReads, $refName, $maxMismatch, $prefix);
	
	# the unmatched reads become the input of the next mapping step
	if ( !-e $mismatchReads ) {
		print "no such file - $mismatchReads\n";
	}
	rename($mismatchReads, $readNext) or die "Can not move $mismatchReads to $readNext: $! !!!\n";
	
	# only report (Not count) reverse strand alignments for RNAs for future discovery
	# This could be very important, like for transcriptional regulation!!!
	# (no "--un" here: the reads stay in $readNext for the next step)
	if ( $refName =~ /RNA/ ) {
		$orientation = "--nofw";	# prefer reverse complement (-) strand alignments
		$runBowtie->(
			"bowtie -v $maxMismatch $orientation $indexType ".$quote->($index)
			." -f ".$quote->($readNext)
			." --al ".$quote->($matchReads)
			." $reportType $multiThreads --quiet --suppress 6,7 > ".$quote->($formatFile)
		);
		
		# reassign reads to mature miRNAs using their location info on precursor miRNA sequences
		if ( $refName =~ /miRNA/ ) {
			reassignRead($formatFile, $refName, $maxMismatch, $prefix);
		}
		
		# output matched read sequence profile (*.profile) and matched feature file (*.feature) for each sample
		countRead($matchReads, $formatFile, $refName, $maxMismatch, $prefix);
	}
	
	# remove the temporary files of this step
	foreach my $file ($matchReads, $formatFile) {
		unlink($file) or warn "Can not remove $file: $! !!!\n";
	}
	return $readNext;
}


# reassign reads to annotated matchIDs instead of mapped matchIDs
# format of annotated reference sequence headID:
#
# 'matchID_0|0:length||matchID_1|start_1:end_1||matchID_2|start_2:end_2|...'
#
# if at least 60% (rounded) of a read overlaps with zone-(start_i, end_i), then assign this read to matchID_i
# (the first such zone wins); otherwise, it'll be assigned to precursor miRNA - matchID_0
# annotated info is removed after read reassigning, to keep consistency with traditional alignment results (.format)
#
# Arguments:
#   $formatFile  - bowtie formatted output file, rewritten in place
#   $refName, $maxMismatch, $prefix - only used to name the temporary file
#
# Dies if the format file can not be read or the rewritten file can not be
# written or moved into place; the original file is left untouched then.
sub reassignRead {
	my ($formatFile, $refName, $maxMismatch, $prefix) = @_;

	my $tempFormatFile = $prefix.".".$refName.".mis_".$maxMismatch."_temp.format";
	
	open(my $formatIn, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	open(my $tempOut, ">", $tempFormatFile) or die "Can not create $tempFormatFile: $! !!!\n";
	while (my $line = <$formatIn>) {
		chomp $line;
		my @array = split(/\t/, $line);
		my $matchID = $array[2];
		my $offset = $array[3];
		my $readSeq = $array[4];
		my $readLength = length($readSeq);
		my $readStart = $offset;
		my $readEnd = $offset + $readLength - 1;
		
		# judge if a matched read belongs to mature -5p, or mature -3p, based on location info
		my @matchHead = split(/\|\|/, $matchID);
		my @annoMatchID = split(/\|/, $matchHead[0]);
		$matchID = $annoMatchID[0];			# default: precursor miRNA (matchID_0)
		foreach my $i (1..$#matchHead) {
			@annoMatchID = split(/\|/, $matchHead[$i]);
			my $matchID_i = $annoMatchID[0];
			# skip an annotation entry that carries no usable "start:end" location
			if ( !defined $annoMatchID[1] ) {
				next;
			}
			my ($start_i, $end_i) = split(/:/, $annoMatchID[1]);
			if ( (!defined $start_i) or (!defined $end_i) ) {
				next;
			}
			# basically, if at least 60% of the read sequence maps to a -5p or -3p mature miRNA zone, it is assigned to that mature miRNA
			if ( seqOverlap($readStart, $readEnd, $start_i, $end_i) >= round(0.6 * $readLength) ) {
				$matchID = $matchID_i;
				last;
			}
		}
		$array[2] = $matchID;
		print {$tempOut} join("\t", @array)."\n";
	}
	close($formatIn);
	close($tempOut) or die "Can not write $tempFormatFile: $! !!!\n";
	
	# replace the original file only after the rewritten one is complete
	rename($tempFormatFile, $formatFile) or die "Can not move $tempFormatFile to $formatFile: $! !!!\n";
}

# number of positions shared by the read interval (readStart..readEnd) and
# the zone interval (start_i..end_i), both ends inclusive; 0 if they are disjoint
sub seqOverlap {
	my ($readStart, $readEnd, $start_i, $end_i) = @_;
	
	# clip the zone to the read
	my $left = ($start_i < $readStart) ? $readStart : $start_i;
	my $right = ($end_i > $readEnd) ? $readEnd : $end_i;
	
	return 0 if $left > $right;
	return $right - $left + 1;
}

# round a positive number to integer: look at the first digit after the
# decimal point only, 5 or more rounds up, less rounds down
sub round {
	my ($number) = @_;
	my $whole = int($number);
	my $tenths = int(10 * ($number - $whole));
	return ($tenths >= 5) ? $whole + 1 : $whole;
}

# summarize matched reads of one mapping step
#
# Arguments:
#   $matchReads  - FASTA file with the original sequences of the matched reads
#   $formatFile  - bowtie formatted output (readID, strand, matchID, offset,
#                  readSeq and optional mismatch descriptor, tab separated)
#   $refName, $maxMismatch - build the matchType label "refName.mis_N"
#   $prefix      - sample prefix; results are appended to $prefix.profile
#                  (one line per read) and $prefix.feature (one line per
#                  strand and matchID)
#
# Only alignments with exactly $maxMismatch mismatches are reported. A read
# with several matchIDs is counted for the matchID with the highest summed
# copy number (ties broken alphabetically); up to nine further matchIDs are
# listed in the profile.
#
# Dies if an input file can not be read or an output file can not be written.
sub countRead {
	my ($matchReads, $formatFile, $refName, $maxMismatch, $prefix) = @_;
	
	my %read = ();			# readID => original read sequence
	my %match = ();			# readID => { "matchID|strand|" => offset[,mismatch info] }
	my %featurePro = ();	# "matchID|strand|" => copy numbers summed over all reported alignments
	my %feature = ();		# strand => { matchID => copy numbers of the reads counted for it }
	my $matchType = $refName.".mis_".$maxMismatch;
	
	# copy number of a read: "SN-count" for collapsed FASTA format, otherwise 1
	my $copyNumber = sub {
		my ($id) = @_;
		my @head = split(/-/, $id);
		if ( (scalar(@head) == 2) and ($head[1] =~ /^\d+$/) ) {
			return $head[1];
		}
		return 1;
	};
	
	# get matched original reads from fasta file
	# bowtie will report original read sequence if orientation is forward (+) strand, while report reverse-complemented sequence if orientation is (-) strand
	# this is an extension port for 5'-end (left) or 3'-end (right) trimming
	# using bowtie "-5/--trim5 <int>" or "-3/--trim3 <int>"
	# http://bowtie-bio.sourceforge.net/manual.shtml#input
	open(my $matchIn, "<", $matchReads) or die "Can not open $matchReads !!!\n";
	my $currentID;
	while (my $line = <$matchIn>) {
		chomp $line;
		if ( $line =~ /^>/ ) {
			$currentID = $line;
			$currentID =~ s/^>//;
		} elsif ( defined $currentID ) {
			if ( !exists $read{$currentID} ) {
				$read{$currentID} = "";
			}
			$read{$currentID} .= $line;		# allow multiple lines for readSeq
		}
	}
	close($matchIn);

	# get matched features (could be multiple mapping) from each format file
	open(my $formatIn, "<", $formatFile) or die "Can not open $formatFile !!!\n";
	while (my $line = <$formatIn>) {
		chomp $line;
		# readID:   ID-readNum
		# strand:   reference strand aligned to ("+" for forward, "-" for reverse)
		# matchID:  entry ID of reference sequence from refDB (e.g., hsa-miR-342-5p)
		# offset:   0-based offset of read sequence on forward reference sequence
		# mismatch: mismatch descriptors, e.g., "22:C>A,23:G>T"
		my ($readID, $strand, $matchID, $offset, $readSeq, $mismatch) = split(/\t/, $line);
		if ( !defined $offset ) {		# not an alignment line
			next;
		}
		if ( $mismatch ) {
			$offset = $offset.",".$mismatch;
		}
		
		# if an alignment has fewer mismatches than maxMismatch, then pass it (since it should be already reported in previous steps)
		my @offsetParts = split(/,/, $offset);
		if ( (scalar(@offsetParts) - 1) < $maxMismatch ) {
			next;
		}
		
		# alignment strand coding
		$matchID = $matchID."|".$strand."|";
		
		# the alignment has exactly maxMismatch mismatches, so report it
		$match{$readID}{$matchID} = $offset;
		
		# calculate feature frequency (multiple mapping - multiple counting) for probabilistic model
		$featurePro{$matchID} += $copyNumber->($readID);
	}
	close($formatIn);
	
	# output matchType (refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+"), readID, readSeq, 
	# matchID (first mapped), offset (readStart + mismatch), matchNum, and multiMatch
	# into a united file (*.profile) for each sample
	my $profileFile = $prefix.".profile";
	open(my $profileOut, ">>", $profileFile) or die "Can not open $profileFile: $! !!!\n";
	foreach my $readID (keys %match) {
		my $readNum = $copyNumber->($readID);
		my $readSeq = exists $read{$readID} ? $read{$readID} : "";	# matched original read
		
		# rank feature (matchID) by feature frequency (probability), and then by alphabet;
		# one comparator, so the result does not depend on the sort being stable
		my @multiMatch = sort {
			$featurePro{$b} <=> $featurePro{$a} or $a cmp $b
		} keys %{ $match{$readID} };
		
		# only report top-ranked ten matchIDs if multiple matches > 10
		my $matchNum = scalar(@multiMatch);
		if ( $matchNum > 10 ) {
			@multiMatch = @multiMatch[0..9];
		}
		
		# only count the first-ranked matchID, while report other top-ranked nine matchIDs
		my $matchID = shift @multiMatch;
		my $offset = $match{$readID}{$matchID};
		
		# alignment strand decoding: strip the code appended above, which is always
		# at the very end (a reference ID may itself contain "|+|" or "|-|")
		my $strand;
		if ( $matchID =~ s/\|([+-])\|$// ) {
			$strand = $1;
		} else {
			die "No strand info warning: $matchID !!!";
		}

		# report other top-ranked nine matchIDs, each followed by its offset
		my @otherMatch = map { $_.$match{$readID}{$_} } @multiMatch;
		
		print {$profileOut} join("\t", $matchType.$strand, $readID, $readSeq, $matchID, $offset, $matchNum, join("|.|", @otherMatch))."\n";
		
		# calculate feature count (multiple mapping - best one counting) based on probabilistic model
		$feature{$strand}{$matchID} += $readNum;
	}
	close($profileOut) or die "Can not write $profileFile: $! !!!\n";
	
	# output matchType (refName + maxMismatch + strand, e.g., "human_miRNA.mis_0+")
	# matchID (first mapped), and subtotal readNum
	# into a united file (*.feature) for each sample
	my $featureFile = $prefix.".feature";
	open(my $featureOut, ">>", $featureFile) or die "Can not open $featureFile: $! !!!\n";
	foreach my $strand ("+", "-") {
		if ( !exists $feature{$strand} ) {
			next;
		}
		foreach my $matchID (sort keys %{ $feature{$strand} }) {
			print {$featureOut} $matchType.$strand."\t".$matchID."\t".$feature{$strand}{$matchID}."\n";
		}
	}
	close($featureOut) or die "Can not write $featureFile: $! !!!\n";
}

# get input and matched read length distribution for one mapping step
#
# Appends two rows ("input" and "match") for $refName/$maxMismatch to
# $prefix.dist and prints the unique and real (copy number weighted) match
# rates. The number of length bins comes from the file-level $maxReadLength.
#
# Dies if a read file can not be read or $prefix.dist can not be written.
sub countDist {
	my ($readFile, $matchReads, $refName, $maxMismatch, $prefix) = @_;
	
	# getReadDist() returns (uniqReadN, readN, counts per read length)
	my ($inputUniqReadN, $inputReadN, @inputReadDist) = getReadDist($readFile, $maxReadLength);
	my ($matchUniqReadN, $matchReadN, @matchReadDist) = getReadDist($matchReads, $maxReadLength);
	
	# output input and matched read length distribution
	my $distFile = $prefix.".dist";
	open(my $distOut, ">>", $distFile) or die "Can not open $distFile: $! !!!\n";
	print {$distOut} $refName."\t".$maxMismatch."\t"."input\t".$inputUniqReadN."\t".$inputReadN."\t";
	print {$distOut} join("\t", @inputReadDist)."\n";
	print {$distOut} $refName."\t".$maxMismatch."\t"."match\t".$matchUniqReadN."\t".$matchReadN."\t";
	print {$distOut} join("\t", @matchReadDist)."\n";
	close($distOut) or die "Can not write $distFile: $! !!!\n";
	
	# output match rate; each denominator is checked on its own, because the
	# copy number total can be 0 while there are reads (e.g., headers like ">5-0")
	my $uniqueMatchRate = 0;
	my $matchRate = 0;
	if ( $inputUniqReadN != 0 ) {
		$uniqueMatchRate = 100*$matchUniqReadN/$inputUniqReadN;
	}
	if ( $inputReadN != 0 ) {
		$matchRate = 100*$matchReadN/$inputReadN;
	}
	printf("    unique matchRate: %.2f%%\t\treal matchRate: %.2f%%\n", $uniqueMatchRate, $matchRate);
}


# get total matched read length distribution from final unmatched reads
#
# The initial input is taken from the first data row of $prefix.dist (the
# "input" row written by the first mapping step); total matched = initial
# input - final unmatched. Appends the two "Total" rows to $prefix.dist and
# prints the total match rates. The number of length bins comes from the
# file-level $maxReadLength.
#
# Dies if $readFile or $prefix.dist can not be read, if $prefix.dist holds no
# usable data row, or if it can not be written.
sub countTotalDist {
	my ($readFile, $prefix) = @_;
	
	# get final unmatched original read length distribution
	my ($unmatchUniqReadN, $unmatchReadN, @unmatchReadDist) = getReadDist($readFile, $maxReadLength);
	
	# get initial input original reads
	my $distFile = $prefix.".dist";
	open(my $distIn, "<", $distFile) or die "Can not open $distFile !!!\n";
	my $firstLine = <$distIn>;		# column titles
	my $line = <$distIn>;
	close($distIn);
	if ( !defined $line ) {
		die "No mapping step recorded in $distFile !!!\n";
	}
	chomp($line);
	my ($refName, $maxMismatch, $input, $inputUniqReadN, $inputReadN, @inputReadDist) = split(/\t/, $line);
	if ( !defined $inputReadN ) {
		die "Unexpected line in $distFile: $line !!!\n";
	}
	
	my $matchUniqReadN = $inputUniqReadN - $unmatchUniqReadN;
	my $matchReadN = $inputReadN - $unmatchReadN;
	# a missing bin counts as 0
	my @matchReadDist = map {
		(defined $inputReadDist[$_] ? $inputReadDist[$_] : 0) - (defined $unmatchReadDist[$_] ? $unmatchReadDist[$_] : 0)
	} (0..($maxReadLength-1));
	
	# output total input and matched read length distribution
	open(my $distOut, ">>", $distFile) or die "Can not open $distFile: $! !!!\n";
	print {$distOut} "Total\t"."*\t"."input\t".$inputUniqReadN."\t".$inputReadN."\t";
	print {$distOut} join("\t", @inputReadDist)."\n";
	print {$distOut} "Total\t"."*\t"."match\t".$matchUniqReadN."\t".$matchReadN."\t";
	print {$distOut} join("\t", @matchReadDist)."\n";
	close($distOut) or die "Can not write $distFile: $! !!!\n";
	
	# output match rate; each denominator is checked on its own, because the
	# copy number total can be 0 while there are reads (e.g., headers like ">5-0")
	my $uniqueMatchRate = 0;
	my $matchRate = 0;
	if ( $inputUniqReadN != 0 ) {
		$uniqueMatchRate = 100*$matchUniqReadN/$inputUniqReadN;
	}
	if ( $inputReadN != 0 ) {
		$matchRate = 100*$matchReadN/$inputReadN;
	}
	printf("\n--- total unique matchRate: %.2f%%\ttotal real matchRate: %.2f%%\n", $uniqueMatchRate, $matchRate);
}

# get read length distribution of a FASTA file
#
# Arguments:
#   $readFile      - FASTA file; a header of the form ">SN-count" (collapsed
#                    FASTA format) gives the copy number of the read,
#                    any other header counts as one read
#   $maxReadLength - number of length bins
#
# Returns the list (uniqReadN, readN, count for length 1, ..., count for
# length $maxReadLength), where uniqReadN is the number of FASTA records and
# readN the sum of their copy numbers. Reads longer than $maxReadLength are
# reported with a warning on STDOUT and left out of the length bins (they are
# still part of uniqReadN and readN); the list therefore always has
# $maxReadLength + 2 elements.
#
# Dies if the file can not be opened.
sub getReadDist {
	my ($readFile, $maxReadLength) = @_;
	
	my $uniqReadN = 0;
	my $readN = 0;
	my @readDist = (0) x $maxReadLength;
	
	my $readID;			# header line of the record being read
	my $readNum = 1;	# copy number of the record being read
	my $length = 0;		# sequence length of the record being read
	
	# put the record read so far into its length bin; used for every record,
	# including the last one of the file
	my $countRecord = sub {
		if ( $length > $maxReadLength ) {
			print "\n---Warning: \"".(defined $readID ? $readID : "")."\" - Length: ".$length."\n";
		} elsif ( $length > 0 ) {
			$readDist[$length - 1] += $readNum;
		}
	};

	open(my $in, "<", $readFile) or die "Can not open $readFile !!!\n";
	while (my $line = <$in>) {
		chomp $line;
		if ( $line =~ /^>/ ) {
			$countRecord->();										# previous record is complete
			$readID = $line;
			my @head = split(/-/, $readID);
			if ( (scalar(@head) == 2) and ($head[1] =~ /^\d+$/) ) {	# if it's collapsed FASTA format
				$readNum = $head[1];
			} else {												# if it's NOT a collapsed FASTA format
				$readNum = 1;
			}
			$uniqReadN++;
			$readN += $readNum;
			$length = 0;
		} else {
			$length += length($line);								# allow multiple lines for readSeq
		}
	}
	$countRecord->();												# last record of the file
	close($in);
	
	return ($uniqReadN, $readN, @readDist);
}


# get bowtie index (path without file extension) for a reference database
# returns "" if $refName is not a known reference database
sub getBowtieIndex {
	my ($refName) = @_;
	
	# reference database name => index location below $dataDIR
	# (built inside the sub: the sub is called before the statements at this
	# point of the file would be executed)
	my %indexOf = (
		# human small RNA
		'human_miRNA'     => '/miRBase/hairpin_hsa_anno',
		'human_miRNA_sub' => '/miRBase/hairpin_hsa_sub_anno',
		'human_piRNA'     => '/piRBase/piR_human_v1.0',
		'human_snoRNA'    => '/snoRNABase/snoRNABase',
		# virus miRNA
		'virus_miRNA'     => '/miRBase/hairpin_virus_anno',
		# plant miRNA
		'plant_miRNA'     => '/miRBase/hairpin_plant_anno',
		# all miRNA
		'all_miRNA'       => '/miRBase/hairpin_anno',
		'all_miRNA_sub'   => '/miRBase/hairpin_sub_anno',
		# human repSeq/lncRNA/RNA/ncRNA/DNA
		'human_repSeq'    => '/RepBase/humrep',
		'human_subSeq'    => '/RepBase/humsub',
		'human_lncRNA'    => '/LNCipedia/lncipedia_3_0',
		'human_RNA'       => '/RefSeq/human.rna',
		'human_ncRNA'     => '/Ensembl/Homo_sapiens.GRCh38.ncrna',
		'human_DNA'       => '/UCSC/hg38',
		# exogenous ribosomal RNA
		'bacteria_rRNA'   => '/RDP/release11_3_Bacteria_unaligned',
		'archaea_rRNA'    => '/RDP/release11_3_Archaea_unaligned',
		'fungi_rRNA'      => '/RDP/release11_3_Fungi_unaligned',
		'all_rRNA_SSU'    => '/SILVA/SILVA_119_SSURef_tax_silva',
		'all_rRNA_LSU'    => '/SILVA/SILVA_119_LSURef_tax_silva',
		# bacteria small regulatory RNA
		'bacteria_sRNA'   => '/BSRD/BSRD_sRNA_sequences',
		# human microbiome CDS/DNA
		'microbiome_CDS_blood'            => '/HMP/Blood.cds',
		'microbiome_CDS_heart'            => '/HMP/Heart.cds',
		'microbiome_CDS_lymph'            => '/HMP/Lymph_Node.cds',
		'microbiome_CDS_gastrointestinal' => '/HMP/Gastrointestinal_tract.cds',
		'microbiome_CDS_urogenital'       => '/HMP/Urogenital_tract.cds',
		'microbiome_CDS_oral'             => '/HMP/Oral.cds',
		'microbiome_CDS_airways'          => '/HMP/Airways.cds',
		'microbiome_CDS_skin'             => '/HMP/Skin.cds',
		'microbiome_CDS_unknown'          => '/HMP/Unknown.cds',
		'microbiome_DNA_blood'            => '/HMP/Blood.nuc',
		'microbiome_DNA_heart'            => '/HMP/Heart.nuc',
		'microbiome_DNA_lymph'            => '/HMP/Lymph_Node.nuc',
		'microbiome_DNA_gastrointestinal' => '/HMP/Gastrointestinal_tract.nuc',
		'microbiome_DNA_urogenital'       => '/HMP/Urogenital_tract.nuc',
		'microbiome_DNA_oral'             => '/HMP/Oral.nuc',
		'microbiome_DNA_airways'          => '/HMP/Airways.nuc',
		'microbiome_DNA_skin'             => '/HMP/Skin.nuc',
		'microbiome_DNA_unknown'          => '/HMP/Unknown.nuc',
		# bacteria ncRNA/CDS/DNA
		'bacteria_ncRNA'  => '/Bacteria/all.ncrna',
		'bacteria_CDS'    => '/Bacteria/all.cds',
		'bacteria_DNA'    => '/Bacteria/all.dna',
		# virus RNA/CDS/DNA
		'virus_RNA'       => '/Viruses/all.ncrna',
		'virus_CDS'       => '/Viruses/all.cds',
		'virus_DNA'       => '/Viruses/all.dna',
		# all RNA/DNA (nt)
		'nt_Vec'            => '/NCBI/nt_Vec',
		'nt_virus'          => '/NCBI/nt_virus_main',
		'nt_virus_rtRNA'    => '/NCBI/nt_virus_rtRNA',
		'nt_bacteria_1'     => '/NCBI/nt_bacteria_main_part01',
		'nt_bacteria_2'     => '/NCBI/nt_bacteria_main_part02',
		'nt_bacteria_rtRNA' => '/NCBI/nt_bacteria_rtRNA',
		'nt_fungi'          => '/NCBI/nt_fungi_main',
		'nt_fungi_rtRNA'    => '/NCBI/nt_fungi_rtRNA',
		'nt_plant'          => '/NCBI/nt_plant_main',
		'nt_plant_rtRNA'    => '/NCBI/nt_plant_rtRNA',
		'nt_human'          => '/NCBI/nt_human_main',
		'nt_human_rtRNA'    => '/NCBI/nt_human_rtRNA',
		'nt_mouse'          => '/NCBI/nt_mouse_main',
		'nt_mouse_rtRNA'    => '/NCBI/nt_mouse_rtRNA',
		'nt_mammal'         => '/NCBI/nt_mammal_main',
		'nt_mammal_rtRNA'   => '/NCBI/nt_mammal_rtRNA',
		'nt_chordata'       => '/NCBI/nt_chordata_main',
		'nt_chordata_rtRNA' => '/NCBI/nt_chordata_rtRNA',
		'nt_bug'            => '/NCBI/nt_arthropod_main',
		'nt_bug_rtRNA'      => '/NCBI/nt_arthropod_rtRNA',
		'nt_worm'           => '/NCBI/nt_nematode_main',
		'nt_worm_rtRNA'     => '/NCBI/nt_nematode_rtRNA',
		'nt_other'          => '/NCBI/nt_other_main',
		'nt_other_rtRNA'    => '/NCBI/nt_other_rtRNA',
		# vector sequences
		'UniVec'            => '/UniVec/UniVec',
	);
	
	return "" if !exists $indexOf{$refName};
	return $dataDIR.$indexOf{$refName};
}