#!/usr/bin/perl -w

#===============================================================================
# Retrieve, format, and merge miRNA reference sequences for a specific domain
# (e.g. plant). Mature miRNA sequences are mapped to precursor miRNA sequences
# in order to get the match location and matchID information of mature miRNAs
# on the corresponding miRNA precursors, and that information is then assembled
# into an annotated miRNA hairpin reference database (hairpin_anno.fna).
#
# Needs: the "organisms.txt" file downloaded from miRBase.
# Also outputs warning information about inconsistencies in the file (*.warning).
# The original FASTA files (.fa) are downloaded from miRBase:
# http://www.mirbase.org/
#===============================================================================

use strict;
use warnings;

# Domain label used in the progress messages and in the composite file names.
my $domain = "plant";
# Organism codes (first column of organisms.txt) selected for this domain.
my @organismList = ();

# Retrieve the organism codes for the domain from "organisms.txt" (downloaded
# from miRBase). Each line is split on tabs; column 1 is taken as the organism
# code and column 4 as the taxonomy string that is matched against the domain.
my $filename = "organisms.txt";
open(my $in_fh, "<", $filename) or die "can not open file: $filename: $!\n";
while (my $line = <$in_fh>) {
	chomp $line;
	my ($organism, $taxInfo) = (split /\t/, $line)[0, 3];
	# Blank or truncated lines have no taxonomy column; skip them instead of
	# matching against undef (which would emit an "uninitialized" warning).
	next unless defined $organism && length $organism && defined $taxInfo;
	if ( $taxInfo =~ /Viridiplantae;/ ) {	# use taxonomy term
		push @organismList, $organism;
	}
}
close($in_fh);

# Report how many organisms were selected; stop when there is nothing to do.
if ( @organismList ) {
	my $num = scalar(@organismList);
	print "There are $num organisms for $domain in miRBase.\n";
} else {
	print "There is no organism for $domain in miRBase.\n";
	exit;
}

# Build the composite miRNA database for the domain.
#
# For every organism in @organismList, assembleAll_miR.pl is run and the five
# per-organism files (presumably written by that script into the current
# directory -- confirm against assembleAll_miR.pl) are appended to the matching
# per-domain composite file and then deleted.
my $filename0 = "hairpin_".$domain."_anno.fna";
my $filename1 = "hairpin_".$domain."_sub_anno.fna";
my $filename2 = "hairpin_".$domain.".fna";
my $filename3 = "mature_".$domain.".fna";
my $filename4 = "hairpin_".$domain.".warning";

# Each entry pairs a composite file with the sprintf template of the
# per-organism file that feeds it (%s is replaced by the organism code).
my @mergePlan = (
	[ $filename0, "hairpin_%s_anno.fna" ],
	[ $filename1, "hairpin_%s_sub_anno.fna" ],
	[ $filename2, "hairpin_%s.fna" ],
	[ $filename3, "mature_%s.fna" ],
	[ $filename4, "hairpin_%s.warning" ],
);

# Create (or truncate) every composite file up front, so that each one exists
# even if it stays empty and no content from an earlier run survives.
foreach my $entry (@mergePlan) {
	my $composite = $entry->[0];
	open(my $out_fh, ">", $composite) or die "can not create file: $composite: $!\n";
	close($out_fh) or die "can not close file: $composite: $!\n";
}

foreach my $organism (@organismList) {
	# List form of system() bypasses the shell, so an organism code read from
	# the input file can never be interpreted as shell syntax.
	my $status = system("assembleAll_miR.pl", $organism);
	if ( $status == -1 ) {
		warn "can not run assembleAll_miR.pl for $organism: $!\n";
	} elsif ( $status != 0 ) {
		warn "assembleAll_miR.pl failed for $organism (wait status $status)\n";
	}
	# Still attempt the merge after a failure: files that do exist are
	# collected and removed, missing ones are reported and skipped.
	foreach my $entry (@mergePlan) {
		my ($composite, $template) = @{$entry};
		_append_and_remove(sprintf($template, $organism), $composite);
	}
}

# Append the contents of $source to $target, then delete $source.
# Returns 1 when the content was appended, 0 (after a warning) when $source
# could not be opened. Dies if $target cannot be opened or closed, because
# the composite output would otherwise be silently incomplete.
sub _append_and_remove {
	my ($source, $target) = @_;
	my $src_fh;
	if ( !open($src_fh, "<", $source) ) {
		warn "can not open file: $source: $!\n";
		return 0;
	}
	open(my $dst_fh, ">>", $target) or die "can not open file: $target: $!\n";
	while (my $line = <$src_fh>) {
		print {$dst_fh} $line;
	}
	close($src_fh);
	# Buffered write errors only surface at close, so check it.
	close($dst_fh) or die "can not close file: $target: $!\n";
	unlink($source) or warn "can not remove file: $source: $!\n";
	return 1;
}
