#!/usr/bin/env python

import glob
import os.path
import subprocess
import sys

def cutadaptTrim(inputFile, adapterSeq3, adapterSeq5, vender):
	"""Trim adapter sequences from a FASTQ file using the external tool "cutadapt".

	Background (Illumina TruSeq SmallRNA-Seq):
	  1. The adapters to be trimmed are ligated to the 3'-end of the inserts.
	  2. The ligated adapter sequence is an RPI (RNA PCR Primer, indexed) and
	     it is shown as its reverse complement in the actual reads.
	  3. In the file name convention used in ISB, the index is added as its
	     reverse complement.
	     E.g.) If the file name is NFB_GTAGAG_XXX.fastq, RPI17 was used:
	           5'-CAAGCAGAAGACGGCATACGAGAT-CTCTAC-GTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
	           The adapter to be trimmed is the reverse complement of RPI17, i.e.
	           TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC-GTAGAG-ATCTCGTATGCCGTCTTCTGCTTG

	Parameters:
	  inputFile   : path of the FASTQ file to be trimmed
	  adapterSeq3 : 3'-adapter sequence (passed to cutadapt with -a)
	  adapterSeq5 : 5'-adapter sequence (passed to cutadapt with -g);
	                only used when vender == 1
	  vender      : 1 = trim both the 3'- and 5'-adapters,
	                any other value = trim the 3'-adapter only

	Returns:
	  (cutadaptOutputFile, cutadaptReportFile) - the trimmed FASTQ file
	  (cutadapt's stdout) and the report file (cutadapt's stderr).

	The script is terminated through sys.exit() when either output file
	already exists, when cutadapt cannot be started, or when cutadapt
	returns a non-zero exit status.
	"""

	# Only the file name (not the directory part) is cut at its first dot, so
	# that paths such as "./sample.fastq" or "run.1/sample.fastq" keep their
	# directory instead of being truncated at the first dot of the whole path
	directory, fileName = os.path.split(inputFile)
	outputPrefix = os.path.join(directory, fileName.split('.')[0])

	# Specification of output file names; existing files are never overwritten
	cutadaptOutputFile = outputPrefix + "_Cutadapt.fastq"
	if os.path.isfile(cutadaptOutputFile):
		sys.exit("The Cutadapt output file already exists\n")
	cutadaptReportFile = outputPrefix + "_Cutadapt.report"
	if os.path.isfile(cutadaptReportFile):
		sys.exit("The Cutadapt report file already exists\n")

	# The command is built as an argument list (no shell involved), so file
	# names containing spaces or shell metacharacters are passed through safely
	cutadaptCommand = ['cutadapt', '-a', adapterSeq3]
	if vender == 1:
		# Additionally trim the 5'-adapter
		cutadaptCommand += ['-g', adapterSeq5]
	cutadaptCommand += ['-e', '0.2', '-O', '5', '-m', '1', '--match-read-wildcards', inputFile]

	# stdout (trimmed reads) and stderr (report) are redirected into the two output files
	try:
		with open(cutadaptOutputFile, 'w') as outputHandle, open(cutadaptReportFile, 'w') as reportHandle:
			exitStatus = subprocess.call(cutadaptCommand, stdout=outputHandle, stderr=reportHandle)
	except OSError as error:
		sys.exit("Cutadapt could not be run on %s: %s\n" % (inputFile, error))

	# Do not report success (and do not let the caller continue) after a failed run
	if exitStatus != 0:
		sys.exit("Cutadapt failed on %s (exit status %d), see %s\n" % (inputFile, exitStatus, cutadaptReportFile))

	print("Adapter trimming of %s by Cutadapt is finished" %inputFile)
	return(cutadaptOutputFile, cutadaptReportFile)

def prinseqTrim(inputFile):
	"""Filter and further trim a FASTQ file using the external tool "Prinseq".

	prinseq-lite.pl is called with the following parameters:
	  -fastq           : input file (FASTQ format)
	  -out_format      : 1 = FASTA output
	  -out_good        : output file name (header only, without extension) for
	                     the reads that passed the filter/trimming conditions
	  -out_bad         : null = discard the reads screened out by the
	                     filter/trimming conditions
	  -lc_method       : entropy or dust, a filter for low complexity sequences
	                     (homopolymers, simple repeats, etc.)
	  -lc_threshold    : threshold value for lc_method
	                     dust    - homopolymer = 100, dinucleotide repeats = 49,
	                               trinucleotide repeats = 32
	                     entropy - homopolymer = 0, dinucleotide repeats = 16,
	                               trinucleotide repeats = 26
	  -min_len         : minimum length of reads to be retained
	  -line_width      : value of the line_width setting below
	  -trim_tail_right : trim poly-A/T tails longer than 5 nts
	No -max_len (maximum read length) is passed.

	Parameters:
	  inputFile : path of the FASTQ file to be filtered

	Returns:
	  (prinseqOutputFile, prinseqReportFile) - the expected FASTA output name
	  (-out_good header + ".fasta") and the report file (Prinseq's stderr).

	The script is terminated through sys.exit() when either output file
	already exists, when prinseq-lite.pl cannot be started, or when it
	returns a non-zero exit status.
	"""

	# Specification of parameters
	lc_method = 'entropy'
	lc_threshold = 50
	min_len = 16
	trim_tail_right = 5
	line_width = 205

	# Only the file name (not the directory part) is cut at its first dot, so
	# that paths such as "./sample.fastq" or "run.1/sample.fastq" keep their
	# directory instead of being truncated at the first dot of the whole path
	directory, fileName = os.path.split(inputFile)
	outputPrefix = os.path.join(directory, fileName.split('.')[0])

	# Specification of file names; existing files are never overwritten
	prinseqOutputHeader = outputPrefix + "_Prinseq"	# only header name
	prinseqOutputFile = prinseqOutputHeader + ".fasta"
	if os.path.isfile(prinseqOutputFile):
		sys.exit("The Prinseq output file already exists\n")
	prinseqReportFile = outputPrefix + "_Prinseq.report"
	if os.path.isfile(prinseqReportFile):
		sys.exit("The Prinseq report file already exists\n")

	# The command is built as an argument list (no shell involved), so file
	# names containing spaces or shell metacharacters are passed through safely
	prinseqCommand = [
		'prinseq-lite.pl',
		'-fastq', inputFile,
		'-out_format', '1',
		'-out_good', prinseqOutputHeader,
		'-out_bad', 'null',
		'-lc_method', lc_method,
		'-lc_threshold', str(lc_threshold),
		'-min_len', str(min_len),
		'-line_width', str(line_width),
		'-trim_tail_right', str(trim_tail_right),
	]

	# Execution of Prinseq; stderr is redirected into the report file
	try:
		with open(prinseqReportFile, 'w') as reportHandle:
			exitStatus = subprocess.call(prinseqCommand, stderr=reportHandle)
	except OSError as error:
		sys.exit("Prinseq could not be run on %s: %s\n" % (inputFile, error))

	# Do not report success (and do not let the caller continue) after a failed run
	if exitStatus != 0:
		sys.exit("Prinseq failed on %s (exit status %d), see %s\n" % (inputFile, exitStatus, prinseqReportFile))

	print ("Additional filtering/trimming of %s by Prinseq is finished" %inputFile)
	return(prinseqOutputFile, prinseqReportFile)

def fastqCollapse(inputFile):
	"""Collapse duplicate reads using the external tool "fastx_collapser".

	Identical sequences in a FASTQ/A file are collapsed into a single
	sequence (while maintaining read counts).
	fastx_collapser is a part of FASTX_TOOLKIT.

	Parameters:
	  inputFile : path of the FASTQ/A file to be collapsed

	Returns:
	  The name of the collapsed output file (input prefix + "_Processed.fasta").

	The script is terminated through sys.exit() when the output file already
	exists, when fastx_collapser cannot be started, or when it returns a
	non-zero exit status.
	"""

	# Only the file name (not the directory part) is cut at its first dot, so
	# that paths such as "./sample.fasta" or "run.1/sample.fasta" keep their
	# directory instead of being truncated at the first dot of the whole path
	directory, fileName = os.path.split(inputFile)
	outputPrefix = os.path.join(directory, fileName.split('.')[0])

	# An existing output file is never overwritten
	fastxOutputFile = outputPrefix + "_Processed.fasta"
	if os.path.isfile(fastxOutputFile):
		sys.exit("The processed (Cutadapt+Prinseq+Fastx_collapser) file already exists\n")

	# Execution of fastx_collapser; the command is an argument list (no shell
	# involved), so file names containing spaces or shell metacharacters are safe
	fastxCollapserCommand = ['fastx_collapser', '-i', inputFile, '-o', fastxOutputFile]
	try:
		exitStatus = subprocess.call(fastxCollapserCommand)
	except OSError as error:
		sys.exit("Fastx_collapser could not be run on %s: %s\n" % (inputFile, error))

	# Do not report success (and do not let the caller continue) after a failed run
	if exitStatus != 0:
		sys.exit("Fastx_collapser failed on %s (exit status %d)\n" % (inputFile, exitStatus))

	print ("Preprocessing of FASTQ file (Cutadapt + Prinseq + Fastx_collapser) is finished")
	return(fastxOutputFile)

if __name__ == '__main__':
	# Run the whole preprocessing pipeline (Cutadapt -> Prinseq -> fastx_collapser)
	# on every gzipped FASTQ file found in the current working directory
	for ifastq in glob.glob('*.fastq.gz'):
		# Abort when a matched path is not a regular file
		if not os.path.isfile(ifastq):
			sys.exit('%s does not exist\n' %ifastq)

		# Simplified name of the fully processed file; an input that already
		# has one is skipped
		processedName = ifastq.split('.')[0]+'_Processed.fa'
		if os.path.isfile(processedName):
			continue

		# Adapter trimming using cutadapt (NEB small RNA libraries):
		# both the 3'- and the 5'-adapter are trimmed (vender = 1).
		# For Illumina small RNA v1.5 libraries only the 3'-adapter is trimmed, i.e.
		#   cutadaptTrim(ifastq, 'ATCTCGTATGCCGTCTTCTGCTTG', '', 2)
		threePrimeAdapter = 'AGATCGGAAGAGCACACGTCT'		# 3'-adapter sequence
		fivePrimeAdapter = 'GTTCAGACTTCTACAGTCCGACGATC'	# 5'-adapter sequence
		trimmedFastq, trimReport = cutadaptTrim(ifastq, threePrimeAdapter, fivePrimeAdapter, 1)

		# Further filtering/trimming using Prinseq; the Cutadapt output is
		# deleted afterwards to save storage
		filteredFasta, filterReport = prinseqTrim(trimmedFastq)
		os.remove(trimmedFastq)

		# Collapsing identical reads using fastx_collapser; the Prinseq output
		# is deleted afterwards to save storage
		collapsedFasta = fastqCollapse(filteredFasta)
		os.remove(filteredFasta)

		# Give the processed file its simplified name
		os.rename(collapsedFasta, processedName)

