#!/usr/bin/perl -w

#===============================================================================
# Parse the reference sequence file from the Bacterial Small Regulatory RNA
# Database (BSRD) and add annotation/description information to its header lines.
# http://www.bac-srna.org/BSRD
# Input -> output: BSRD_sRNA_sequences.txt -> BSRD_sRNA_sequences.frn
# Names that cannot be mapped are also written to nameWarning.txt
#===============================================================================

use strict;
use warnings;

# Turn on autoflush for STDOUT so that anything printed to it is written out
# immediately instead of being buffered (added for alpha testing).
use IO::Handle;
STDOUT->autoflush(1);

# Lookup table filled while reading the summary file:
#   $bacteria{<organism name>}{<replicon>} = "gi|<gi>|ref|<accession>|"
my %bacteria;

# The reference sequence file to annotate is the first command-line argument.
# Fail early with a usage message instead of running into undef warnings and
# a late "can't find file" error when it is missing.
my $refDBFile = $ARGV[0];
if (!defined($refDBFile) || $refDBFile eq '') {
	die "usage: $0 <reference sequence file, e.g. BSRD_sRNA_sequences.txt>\n";
}

# Split the path into $prefix (everything before the final extension) and
# $suffix (the extension, defaulting to "fa" when there is none).
# Only a dot inside the last path component counts as an extension separator,
# so dots in directory names ("data.v2/BSRD", "../BSRD") leave $prefix intact.
my $prefix = $refDBFile;
my $suffix = "fa";
# File-scoped scratch array; kept declared here so that any later code that
# assigns to @array still compiles under "use strict".
my @array;
if ($refDBFile =~ m{\A(.*[^./\\])\.([^./\\]+)\z}s) {
	$prefix = $1;
	$suffix = $2;
}

# Replicon labels that count as an organism's main/first replicon, in order of
# preference. Only summary records carrying one of these labels are indexed,
# and when a Taxname/DNA Name maps to several records (Accession IDs) the
# first label in this list that has a record is the one chosen.
my @mainReplicon = (
	'chromosome',
	'chromosome 1',
	'chromosome circular',
	'chromosome linear',
	'-',
	'plasmid',
	'plasmid 1',
);

# Manual bacteria-name mapping table (crude, but it works).
# Each entry is a two-element array: [ name as found in the sequence file
# => name to look up in the summary table ].
my @mannualMapping = (
	[ 'Agrobacterium tumefaciens str. C58' => 'Agrobacterium fabrum str. C58' ],
	[ 'Burkholderia cenocepacia AU 1056' => 'Burkholderia cenocepacia AU 1054' ],
	[ 'Caldicellulosiruptor kristjanssonii 177R1B' => 'Caldicellulosiruptor kristjanssonii I77R1B' ],
	[ 'Clostridium acetobutylicum ATCC 824 plasmid' => 'Clostridium acetobutylicum ATCC 824' ],
	[ 'Clostridium acetobutylicum ATCC 825 plasmid' => 'Clostridium acetobutylicum ATCC 824' ],
	[ 'Clostridium acetobutylicum ATCC 826 plasmid' => 'Clostridium acetobutylicum ATCC 824' ],
	[ 'Clostridium acetobutylicum ATCC 827 plasmid' => 'Clostridium acetobutylicum ATCC 824' ],
	[ 'Clostridium acetobutylicum ATCC 828 plasmid' => 'Clostridium acetobutylicum ATCC 824' ],
	[ 'Clostridium acetobutylicum ATCC 829 plasmid' => 'Clostridium acetobutylicum ATCC 824' ],
	[ 'Clostridium difficile 630' => 'Peptoclostridium difficile 630' ],
	[ 'Escherichia coli K12 MG1655' => 'Escherichia coli str. K-12 substr. MG1655' ],
	[ 'Geobacter sp. FRC-32' => 'Geobacter daltonii FRC-32' ],
	[ 'Leptospira borgpetersenii serovar Hardjo-bovis JB197' => 'Leptospira borgpetersenii serovar Hardjo-bovis str. JB197' ],
	[ 'Leptospira borgpetersenii serovar Hardjo-bovis L550' => 'Leptospira borgpetersenii serovar Hardjo-bovis str. L550' ],
	[ 'Magnetococcus sp. MC-1' => 'Magnetococcus marinus MC-1' ],
	[ 'Methylobacterium chloromethanicum CM4' => 'Methylobacterium extorquens CM4' ],
	[ 'Mycobacterium abscessus ATCC 19977' => 'Mycobacterium abscessus' ],
	[ 'Pseudomonas fluorescens Pf-5' => 'Pseudomonas protegens Pf-5' ],
	[ 'Salmonella enterica subsp. arizonae serovar 62:z4,z23:-- str. RSK2980' => 'Salmonella enterica subsp. arizonae serovar 62:z4,z23:- str. RSK2980' ],
	[ 'Salmonella enterica subsp. enterica serovar Typhimurium SL1344' => 'Salmonella enterica subsp. enterica serovar Typhimurium str. SL1344' ],
	[ 'Serratia sp. AS9' => 'Serratia plymuthica AS9' ],
	[ 'Sphaerochaeta globus str. Buddy' => 'Sphaerochaeta globosa str. Buddy' ],
	[ 'Syntrophomonas wolfei subsp. wolfei str. Goettingen' => 'Syntrophomonas wolfei subsp. wolfei str. Goettingen G311' ],
	[ 'Thermodesulfobacterium sp. OPB45' => 'Thermodesulfobacterium geofontis OPF15' ],
	[ 'Thermoproteus neutrophilus V24Sta' => 'Pyrobaculum neutrophilum V24Sta' ],
);

# Read GI and accession information from the bacteria summary file
# all.sum.rpt and index it in %bacteria as
#   $bacteria{<name>}{<replicon>} = "gi|<gi>|ref|<accession>|"
# Tab-separated columns used (0-based): 0 = GI, 1 = accession, 4 = tax name,
# 5 = name, 6 = replicon. The first line is skipped (presumably a column
# header). A record is stored under its name and, when different, also under
# its tax name. If several records share a name and replicon, the one read
# last overwrites the earlier ones.
open(my $sumFH, "<", "all.sum.rpt") or die "can't open file - all.sum.rpt: $!";

# Hash view of @mainReplicon so the "is this a main replicon?" test is a
# single lookup per record instead of a scan of the whole list.
my %isMainReplicon = map { $_ => 1 } @mainReplicon;

while (my $line = <$sumFH>) {
	next if $. == 1;

	# Strip the line ending, including a CR left over from CRLF files.
	$line =~ s/\r?\n\z//;

	my @fields = split(/\t/, $line);
	# Short/blank lines carry no replicon column; skip them rather than
	# working on undefined values.
	next if scalar(@fields) < 7;

	my ($gi, $access, $taxName, $name, $replicon) = @fields[0, 1, 4, 5, 6];

	# Normalise replicon labels to the spellings used in @mainReplicon.
	# Roman numerals are handled longest-first so that "chromosome I" cannot
	# clobber the start of "chromosome II", "III" or "IV".
	$replicon =~ s/chromosome chromosome 1/chromosome 1/;
	$replicon =~ s/chromosome IV/chromosome 4/;
	$replicon =~ s/chromosome III/chromosome 3/;
	$replicon =~ s/chromosome II/chromosome 2/;
	$replicon =~ s/chromosome I/chromosome 1/;
	$replicon =~ s/chromosome chANA01/chromosome 1/;
	$replicon =~ s/chromosome gsn\.131/chromosome/;

	# only index records with the main/first Replicon
	next unless $isMainReplicon{$replicon};

	my $head = "gi|".$gi."|ref|".$access."|";
	$bacteria{$name}{$replicon} = $head;
	if ($name ne $taxName) {
		$bacteria{$taxName}{$replicon} = $head;
	}
}
close($sumFH);

# Annotate the reference sequence file.
#
# Every header line is expected to look like
#   >ID|organism name|gene|start|end|strand
# and is rewritten to
#   >ID gi|GI|ref|ACCESSION|:LOCATION| NAME, Name=GENE;gbkey=sRNA
# where LOCATION is "start-end" for a forward strand, "c<end>-<start>" for a
# reverse strand and "-" otherwise. All other lines are copied unchanged.
# Output goes to "<prefix>.frn"; headers whose organism name cannot be mapped
# to a GI/accession get the placeholder "gi|-|ref|-|" and are listed
# (ID, name, gene, tab-separated) in nameWarning.txt.
open(my $inFH, "<", $refDBFile) or die "can't open file - $refDBFile: $!";
my $outputFile = $prefix.".frn";
open(my $outFH, ">", $outputFile) or die "can't write file - $outputFile: $!";
my $warningFile = "nameWarning.txt";
open(my $warnFH, ">", $warningFile) or die "can't write file - $warningFile: $!";

while (my $line = <$inFH>) {
	# Sequence (non-header) lines pass through untouched.
	if ($line !~ /^>/) {
		print $outFH $line;
		next;
	}

	chomp($line);
	my ($ID, $name, $gene, $start, $end, $strand) = split(/\|/, $line);

	# A malformed header may have fewer than six fields. Report it once and
	# treat the missing fields as empty instead of tripping over undef.
	my $missing = 0;
	foreach my $field ($ID, $name, $gene, $start, $end, $strand) {
		if (!defined($field)) {
			$field = "";
			$missing++;
		}
	}
	if ($missing > 0) {
		warn "$refDBFile line $.: header has fewer than 6 '|'-separated fields: $line\n";
	}
	$ID =~ s/^>//;

	# Placeholder used when the name cannot be mapped. It has no leading ">"
	# so that it has the same shape as the mapped heads stored in %bacteria;
	# the ">" of the header line is added once when the line is printed.
	my $head = "gi|-|ref|-|";
	my $location = "-";

	# make sure that $start is NOT greater than $end
	# (only when both are plain integers, so the comparison is meaningful)
	if ($start =~ /^\s*\d+\s*$/ && $end =~ /^\s*\d+\s*$/ && $start > $end) {
		($start, $end) = ($end, $start);
	}

	# combine start, end, and strand to location
	if ($strand =~ /forward/) {
		$location = $start."-".$end;
	} elsif ($strand =~ /reverse/) {
		$location = "c".$end."-".$start;
	}

	# apply the manual mapping table first; entries are applied in table order
	foreach my $pair (@mannualMapping) {
		if ($name eq $pair->[0]) {
			$name = $pair->[1];
		}
	}

	# map name to gi and access head (always choose the main/first Replicon)
	my $mapped = 0;
	foreach my $refReplicon (@mainReplicon) {
		# Test the outer key first so that unknown names are not
		# autovivified into %bacteria by the nested exists().
		if (exists($bacteria{$name}) && exists($bacteria{$name}{$refReplicon})) {
			$head = $bacteria{$name}{$refReplicon};
			$mapped = 1;
			last;
		}
	}
	if (!$mapped) {
		print $warnFH $ID."\t".$name."\t".$gene."\n";
	}

	print $outFH ">".$ID." ".$head.":".$location."| ".$name.", "."Name=".$gene.";gbkey=sRNA\n";
}
close($inFH);
# Buffered write errors only surface at close, so check the output handles.
close($outFH) or die "error writing file - $outputFile: $!";
close($warnFH) or die "error writing file - $warningFile: $!";
