#!/usr/bin/perl -w

#===============================================================================
# download, organize and assemble all types of viral reference sequences from NCBI:
# ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/
# all.fna file - DNA (complete genome) sequences for every viral species/strains
# all.rpt file - report (summary)
# all.ffn file - CDS (coding sequences)
# all.gff file - annotation for both CDS and gene, having URL Encoding!!!
# all.frn file - ncRNA (non-coding RNA, mainly structural RNA) sequences
# all.rnt file - annotation for ncRNA
#===============================================================================

use strict;
use warnings;

# for alpha test!!!
use IO::Handle;
STDOUT->autoflush(1);


###### download and extract all types of viral reference sequences from NCBI #####
mkdir("fna", 0775);
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/all.fna.tar.gz");
system("tar -zxf all.fna.tar.gz -C ./fna");

mkdir("rpt", 0775);
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/all.rpt.tar.gz");
system("tar -zxf all.rpt.tar.gz -C ./rpt");

mkdir("ffn", 0775);
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/all.ffn.tar.gz");
system("tar -zxf all.ffn.tar.gz -C ./ffn");

mkdir("gff", 0775);
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/all.gff.tar.gz");
system("tar -zxf all.gff.tar.gz -C ./gff");

mkdir("frn", 0775);
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/all.frn.tar.gz");
system("tar -zxf all.frn.tar.gz -C ./frn");

mkdir("rnt", 0775);
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/all.rnt.tar.gz");
system("tar -zxf all.rnt.tar.gz -C ./rnt");


my @fileList = ();
my %gffFileList = ();
my %giNum = ();
my %accessNum = ();
my %virusName = ();
my %virus = ();
my $i;


###### assemble all viral report information into 'all.sum.rpt' ######

print "\n------ assemble all viral report (summary) information into 'all.sum.rpt' ------\n";

# find rpt file names in all subdirectories and output to a sorted list
@fileList = getFileList("rpt");

						# for alpha test!!!
						$i = 0;
						print "Total: ".scalar(@fileList)." rpt files\n";
						print "Handling #";

# read gi, access, length, taxID, and taxName information from all rpt files
foreach my $fileName (@fileList) {
	
						# for alpha test!!!
						$i++;
						print $i;
	
	my $gi = "n\/a";
	my $access = "n\/a";
	my $length = "n\/a";
	my $taxName;
	my $taxID;
	my $index;
	my @array;
	
	open(In, "<",  $fileName);
	while (<In>) {
		chomp;
		if ($_ =~ /^GI: /) {
			$gi = $_;
			$gi =~ s/^GI: //;
			next;
		} elsif ($_ =~ /^Accession: /) {
			$access = $_;
			$access =~ s/^Accession: //;
			next;
		} elsif ($_ =~ /^DNA  length = /) {
			$length = $_;
			$length =~ s/^DNA  length = //;
			next;
		} elsif ($_ =~ /^Taxid: /) {
			$taxID = $_;
			$taxID =~ s/^Taxid: //;
			next;
		} elsif ($_ =~ /^Taxname: /) {
			$taxName = $_;
			$taxName =~ s/^Taxname: //;
			next;
		}
	}
	close(In);
	# some viral report file does NOT have "GI", "Accession", and "DNA  length" information!!!
	if ( $access ne "n\/a") {
		@array = split(/\./, $access);
		$index = $array[0];
	} else {
		@array = split(/\/|\./, $fileName);
		pop @array;
		$index = pop @array;
	}
	$array[0] = $gi;
	$array[1] = $access;
	$array[2] = $length;
	$array[3] = $taxID;
	$array[4] = $taxName;
	$virus{$index} = \@array;
	
						# alpha test!!!
						print "\b" x length($i);
	
}
print "\n";

# initialize output summary file - "all.sum.rpt"
open(Sum, ">", "all.sum.rpt");
print Sum "GI\t"."Accession\t"."Length\t"."Taxid\t"."Taxname\t"."Refname\t"."Replicon\n";
close(Sum);

# find frn file names in all subdirectories and output to a sorted list
getFileList("rnt");



###### assemble all viral DNA (complete genome) sequences into 'all.dna.fna' ######

print "\n------ assemble all viral DNA (complete genome) sequences into 'all.dna.fna' ------\n";

# find fna file names in all subdirectories and output to a sorted list
@fileList = getFileList("fna");

open(Out, ">", "all.dna.fna");
close(Out);

						# for alpha test!!!
						$i = 0;
						print "Total: ".scalar(@fileList)." fna files\n";
						print "Handling #";

foreach my $fileName (@fileList) {
	
						# for alpha test!!!
						$i++;
						print $i;
	my $line;
	my $gi;
	my $access;
	my $length = 0;
	my $name;
	my $replicon;
	my $index;
	my @array;
	
	# read gi, ref, name information from fasta file head
	open(In, "<",  $fileName);
	while (<In>) {
		chomp;
		$line = $_;
		if ($line =~ /^>/) {
			@array = split(/\,\ /, $line);
			$line = $array[0];
			@array = split(/\|\ /, $line);
			$name = $array[1];
			@array = split(/\|/, $array[0]);
			$gi = $array[1];
			$access = $array[3];
			@array = split(/\./, $array[3]);
			$index = $array[0];
			
			$giNum{$index} = $gi;
			$accessNum{$index} = $access;
			$virusName{$index} = $name;
			
			# check coherence!!!
			$name =~ s/\ DNA//;
			$name =~ s/complete\ chromosome\ //; # NOT for viruses
			$name =~ s/\ complete\ genome//;
			$name =~ s/\ draft\ genome//;
			$name =~ s/\ genome//;
			$name =~ s/\ genome\ sequence//;
			$name =~ s/\ complete\ sequence//;
			$name =~ s/\ partial\ sequence//;
			
			# get "Replicon" information from DNA sequence head
			$name =~ s/\ chromosome/\-\-chromosome/;
			$name =~ s/\ plasmid/\-\-plasmid/;
			$name =~ s/\ megaplasmid/\-\-megaplasmid/;
			my @taxWord = split(/\-\-/, $name);
			$name = $taxWord[0];
			$replicon = "";
			if ($taxWord[1]) {
				$replicon = $taxWord[1];
			}
		} else {
			$length += length($line);
		}
	}
	
	# some viral report file does NOT have "GI", "Accession", and "DNA  length" information!!!
	@array = @ { $virus{$index} };
	if ( $array[0] eq "n\/a" ) {
		$array[0] = $gi;
	}
	if ( $array[1] eq "n\/a" ) {
		$array[1] = $access;
	}
	if ( $array[2] eq "n\/a" ) {
		$array[2] = $length;
	}
	if ( $array[2] eq "n\/a" ) {
		$array[2] = $length;
	}
	push @array, $name;
	push @array, $replicon;
	$virus{$index} = \@array;
	
	close(In);
	
	# assemble all fasta files
	system("cat $fileName >> 'all.dna.fna'");
	
						# alpha test!!!
						print "\b" x length($i);

}
print "\n";



# find gff file names in all subdirectories and output to a sorted list
@fileList = getFileList("gff");

# make index for gff file list
foreach my $fileName (@fileList) {
	my @array = split(/\//, $fileName);
	my $index = pop(@array);
	$index =~ s/.gff$//;
	$gffFileList{$index} = $fileName;
}

# initialize output warning file for wrong/ambiguous information in gff files - "gff.warning"
open(Warn, ">", "gff.warning");
close(Warn);



###### assemble all viral CDS (coding sequences) sequences into 'all.cds.ffn' ######

print "\n------ assemble all viral CDS (coding sequences) sequences into 'all.cds.ffn' ------\n";

# find ffn file names in all subdirectories and output to a sorted list
@fileList = getFileList("ffn");

open(Out, ">", "all.cds.ffn");

						# for alpha test!!!
						$i = 0;
						print "Total: ".scalar(@fileList)." ffn files\n";
						print "Handling #";

foreach my $fileName (@fileList) {
	
						# for alpha test!!!
						$i++;
						print $i;
	my $gi;
	my $access;
	my $seqID;
	my $name;
	my $index;
	my @array;

	# read annotation information from corresponding gff file
	my %annoInfo = % { getGffAnnotation($fileName, "CDS") };
	
	# read location information from ffn file
	open(In, "<",  $fileName);
	while (<In>) {
		chomp;
		if ($_ =~ /^>/) {
			my $line = $_;
			
			# get location string
			@array = split(/\, /, $line);
			$line = $array[0];
			$line =~ s/ /\t/;
			@array = split(/\t/, $line);
			$seqID = $array[0];
			$name = $array[1];
			$name =~ s/ complete genome//;
			@array = split(/\|:/, $array[0]);
			my $location = $array[1];
			
			# get gi, ref, name information
			@array = split(/\|/, $seqID);
			$gi = $array[1];
			$access = $array[3];
			@array = split(/\./, $array[3]);
			$index = $array[0];
			$name =~ s/ complete genome//;
			
			# get start position (default: 1) from location string
			my $start = getStartPosition($location);
			
			# get information (gene=;product=;gbkey=) from annotation index
			my $annotation = "gene=;product=;gbkey=";
			if (exists($annoInfo{$start})) {
				$annotation = $annoInfo{$start};
			}
			print Out $seqID."\| ".$name.", ".$annotation."\n";
		}else{
			print Out $_."\n";
		}
	}
	
	# some viral report file does NOT have "DNA Name" information!!!
	@array = @ { $virus{$index} };
	if ( $array[0] eq "n\/a" ) {
		$array[0] = $gi;
	}
	if ( $array[1] eq "n\/a" ) {
		$array[1] = $access;
	}
	if ( !($array[5]) ) {
		$array[5] = $name;
	}
	$virus{$index} = \@array;
	
	close(In);
	
						# for alpha test!!!
						print "\b" x length($i);

}
close(Out);
print "\n";



# output gi, access, length, taxID, taxName, name, and replicon information into summary file
open(Sum, ">>", "all.sum.rpt");
my @allIndex = keys %virus;
@allIndex = sort @allIndex;
foreach my $index (@allIndex) {
	my @array = @ { $virus{$index} };
	print Sum join("\t", @array)."\n";
} 
close(Sum);



###### assemble all viral ncRNA (non-coding RNA) sequences into 'all.ncrna.frn' ######

print "\n------ assemble all viral ncRNA (non-coding RNA) sequences into 'all.ncrna.frn' ------\n";
# find frn file names in all subdirectories and output to a sorted list
@fileList = getFileList("frn");

open(Out, ">", "all.ncrna.frn");

						# for alpha test!!!
						$i = 0;
						print "Total: ".scalar(@fileList)." frn files\n";
						print "Handling #";

foreach my $fileName (@fileList) {
	
						# for alpha test!!!
						$i++;
						print $i;

	# read annotation information from corresponding gff file
	my %annoInfo = % { getGffAnnotation($fileName, "RNA") };
	
	# read location information from ffn file
	open(In, "<",  $fileName);
	while (<In>) {
		chomp;
		if ($_ =~ /^>/) {
			my $line = $_;
			
			# get location string
			my @array = split(/\|/, $line);
			my $index = $array[1];
			my $gi = "-";
			my $access = $index;
			my $name = "-";
			if (exists($giNum{$index})) {
				$gi = $giNum{$index};
			}
			if (exists($accessNum{$index})) {
				$access = $accessNum{$index};
			}
			if (exists($virusName{$index})) {
				$name = $virusName{$index};
			}
			my $location = $array[2];
			my $seqID = ">gi|".$gi."|ref|".$access."\|".$location;
			$location =~ s/\://;
			
			# get start position (default: 1) from location string
			my $start = getStartPosition($location);
			
			# get information (gene=;product=;gbkey=) from annotation index
			my $annotation = "gene=;product=;gbkey=";
			if (exists($annoInfo{$start})) {
				$annotation = $annoInfo{$start};
			} else {
				
			}
			
			# get full seqID (>gi|giNum|ref|refNum|:location|) from head index
			
			print Out $seqID."\| ".$name.", ".$annotation."\n";
		}else{
			print Out $_."\n";
		}
	}
	close(In);
	
						# for alpha test!!!
						print "\b" x length($i);

}
close(Out);
print "\n\n";



#================================ sub functions ================================
# find specific file (e.g., "rpt", "fna", "gff", "ffn", "rnt", or "frn") names in all subdirectories and output to a sorted list
sub getFileList {
	my ($suffix) = @_;
	
	system("find ./".$suffix." -type f -name '*.".$suffix."' | cat > ".$suffix."FileList.txt");
	my @fileList = ();
	open(FileList, "<", $suffix."FileList.txt");
	while (<FileList>) {
		chomp;
		push @fileList, $_;
	}
	close(FileList);
	@fileList = sort @fileList;
	open(FileList, ">", $suffix."FileList.txt");
	foreach my $fileName (@fileList) {
		print FileList $fileName."\n";
	}
	close(FileList);
	return @fileList;
}



# get start position (default: 1) from location string (e.g., "120-125" => "120", "120-125,126-129" => "120",
# "c125-120" => "120", or "c129-126,c125-120" => "120")
# if minimum value is '1', then chose second minimum!!!
sub getStartPosition {
	my ($location) = @_;
	
	my $start = 1;
	my $small;
	$location  =~ s/c//g;
	my @array = split(/\,/, $location);
	my $pairNum = scalar(@array);
	
	if ($pairNum >= 1) {
		my @pairArray = split(/-/, $array[0]);
		if ($pairArray[0] <= $pairArray[1]) {
			$small = $pairArray[0];
		} else {
			$small = $pairArray[1];
		}
		$start = $small;
	}
	
	if ($pairNum > 1) {
		foreach my $pair (@array[1..($pairNum-1)]) {
			my @pairArray = split(/-/, $pair);
			if ($pairArray[0] <= $pairArray[1]) {
				$small = $pairArray[0];
			} else {
				$small = $pairArray[1];
			}
			
			if (($small > $start) and ($start == 1)) {
				$start = $small;
			} elsif (($small <= $start) and ($small != 1)) {
				$start = $small;
			}
		}
	}
	
	return $start;
}



# read annotation information (geneName=;geneProduct=;gbkey=) from corresponding gff file
# if there are something wrong or ambiguous in gff files, will record them all in an output warning file - "gff.warning"
sub getGffAnnotation {
	my ($fileName, $typeWanted) = @_;
	
	my %annoInfo = ();
	my @array = ();
	my $index;
	my $gffFileName;
	my $geneStart = 0;
	my $geneEnd =0;
	my $geneID = "ID=";
	my $geneName = "gene=";
	my $pseudo = "false";
	
	# find corresponding gff file name from gff file list
	@array = split(/\//, $fileName);
	$index = pop(@array);
	if ($typeWanted eq "CDS") {
		$index =~ s/.ffn$//;
	} elsif ($typeWanted eq "RNA") {
		$index =~ s/.frn$//;
	}
	if (exists($gffFileList{$index})) {
		$gffFileName = $gffFileList{$index};
	} else {
		print "\nThere's no gff file for $index on the gff file list!!!\n";
		return;
	}
	
	open(GFF, "<", $gffFileName) or die "\n$gffFileName does NOT exist!!!\n";
	open(Warn, ">>", "gff.warning");
	while (<GFF>) {
		chomp;
		if ($_ !~ /^#/) {
			@array = split(/\t/, $_);
			my $type = $array[2];
			my $start = $array[3];
			my $end =  $array[4];
			my $annotation = $array[8];
			
			# URL decoding (e.g., %28 => (, %2F => /)
			$annotation = decodeURL($annotation);
			
			my $ID = "ID=";
			my $parent = "Parent=";
			my $note = "Note=";
			my $product = "product=";
			my $gbkey = "gbkey=";
			
			# for all types, read "ID=", "Parent=", "note=", "product=", and "gbkey=" from annotation
			@array = split(/\;/, $annotation);
			foreach my $record (@array) {
				if ($record =~ /^ID=/) {
						$ID = $record;
				} elsif ($record =~ /^Parent=/) {
					$parent = $record;
					$parent =~ s/^Parent=//;
				} elsif ($record =~ /^Note=/) {
					$note = $record;
				} elsif ($record =~ /^product=/) {
					$product = $record;
				} elsif ($record =~ /^gbkey=/) {
					$gbkey = $record;
				}
			}
			
			if ($type eq "gene") {
			# if it's a gene, then keep $start, $end, $geneID, $geneName, and $pseudo information
				$geneStart = $start;
				$geneEnd = $end;
				foreach my $record (@array) {
					if ($record =~ /^ID=/) {
						$geneID = $record;
						$geneID =~ s/^ID=//;
					} elsif ($record =~ /^Name=/) {
						$geneName = $record;
						$geneName =~ s/^Name=/gene=/;
					} elsif ($record =~ /^pseudo=true/) {
						$pseudo = "true";
					}
				}
			
			} elsif (($type eq "CDS") and ($typeWanted eq "CDS")) {
			# if it's a CDS, then compare with the latest gene
			# almost all CDS have parents, but only a few CDS do NOT have parents!!!
				if (($geneStart <= $start) and ($end <= $geneEnd)) {
					# location mapped, while ID NOT mapped 
					if ($parent ne $geneID) {
						print Warn $gffFileName."\t".$type."\t".$start."\t".$end."\t".$parent." ~> ".$geneID."\t"."[Wrong parent!!!]\n";
					}
				} else {
					# location NOT mapped, parent ID NOT mapped either 
					if (($parent ne $geneID) and ($parent ne "Parent=")) {
						print Warn $gffFileName."\t".$type."\t".$start."\t".$end."\t".$parent." ~> ".$geneID."\t"."[Wrong location!!!]\n";
					} else {
						$geneName = $ID;
					}
				}
				# if the latest gene is a pseudogene, then keep "product=pseudogene;Note=$note" in product
				if ($pseudo eq "true") {
					if ($note ne "Note=") {
						$product = "product=pseudogene;".$note;
					}
					$pseudo = "false";
				}
				# if there is no gbkey information, then keep "gbkey=CDS" here
				if ($gbkey eq "gbkey=") {
					$gbkey = "gbkey=CDS";
				}
				# keep "gene=$geneName;product=$product;gbkey=$gbkey" in annotation
				$annoInfo{$start} = $geneName."\;".$product."\;".$gbkey;
				
			} elsif (  ( ($type =~ /RNA$/) or (($type eq "transcript") and ($gbkey =~ /RNA$/)) ) and ($typeWanted eq "RNA")  ) {
			# if it's a RNA (rRNA, tRNA, ncRNA, tmRNA or mRNA, etc.) or transcript with gbkey='*RNA' (e.g., misc_RNA), then compare with the latest gene, but neglect pseudo information
			# most tRNA and a few rRNA do NOT have parents!!!
				if (($geneStart <= $start) and ($end <= $geneEnd)) {
					# location mapped, while ID NOT mapped 
					if (($parent ne $geneID) and ($parent ne "Parent=")) {
						print Warn $gffFileName."\t".$type."\t".$start."\t".$end."\t".$parent." ~> ".$geneID."\t"."[Wrong parent!!!]\n";
					}
				} else {
					# location NOT mapped, parent ID NOT mapped either 
					if (($parent ne $geneID) and ($parent ne "Parent=")) {
						print Warn $gffFileName."\t".$type."\t".$start."\t".$end."\t".$parent." ~> ".$geneID."\t"."[Wrong location!!!]\n";
					} else {
						$geneName = $ID;
					}
				}
				# change "product=tRNA-***" to "product=*** tRNA" 
				if ($product =~ /^product=tRNA-/) {
					$product =~ s/^product=tRNA-//;
					$product = "product=".$product." tRNA";
				}
				# if there is no product information, then keep "product=;Note=$note" in product
				if ($product eq "product=") {
					if ($note ne "Note=") {
						$product = $note;
					}
				}
				# if there is no gbkey information, then keep "gbkey=misc_RNA" here
				if ($gbkey eq "gbkey=") {
					$gbkey = "gbkey=misc_RNA";
				}
				# keep "gene=$geneName;product=$product;gbkey=$gbkey" in annotation
				$annoInfo{$start} = $geneName."\;".$product."\;".$gbkey;
			}
		
		}
	}
	close(GFF);
	close(Warn);

	return \%annoInfo;
}

# decode URL encoding string (e.g., %28 => (, %2F => /)
sub decodeURL {
	my ($string) = @_;
	
	$string =~ s/%20/\ /g;
	$string =~ s/%21/\!/g;
	$string =~ s/%22/\"/g;
	$string =~ s/%23/\#/g;
	$string =~ s/%24/\$/g;
	$string =~ s/%25/\%/g;
	$string =~ s/%26/\&/g;
	$string =~ s/%27/\'/g;
	$string =~ s/%28/\(/g;
	$string =~ s/%29/\)/g;
	$string =~ s/%2A/\*/g;
	$string =~ s/%2B/\+/g;
	$string =~ s/%2C/\'/g;	# change "\," to "\'" to  avoid parsing problem for annotation/description
	$string =~ s/%2D/\-/g;
	$string =~ s/%2E/\./g;
	$string =~ s/%2F/\//g;
	$string =~ s/%3A/\:/g;
	$string =~ s/%3B/\:/g;	# change "\;" to "\:" to avoid parsing problem for annotation/description
	$string =~ s/%3C/\</g;
	$string =~ s/%3D/\=/g;
	$string =~ s/%3E/\>/g;
	$string =~ s/%3F/\?/g;
	$string =~ s/%40/\@/g;
	$string =~ s/%5B/\[/g;
	$string =~ s/%5C/\\/g;
	$string =~ s/%5D/\]/g;
	$string =~ s/%5E/\^/g;
	$string =~ s/%5F/\_/g;
	$string =~ s/%60/\`/g;
	$string =~ s/%7B/\{/g;
	$string =~ s/%7C/\|/g;
	$string =~ s/%7D/\}/g;
	$string =~ s/%7E/\~/g;
	
	return $string;
}