#!/usr/bin/env python3
# Copyright (c) 2014 Lenwood S. Heath. All rights reserved.
# This program or module is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. It is provided for educational
# purposes and is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

"""
This program reads a FASTA file of sequences and tries to identify
a shared motif using random sampling.
"""

import argparse
import logging
import math
from random import randrange,random
import re
import sys
import time

"""
RandomMotifFinder does the work of random sampling of loci in a list
of sequences to find the best motif.  It uses log likelihood to score
a set of loci.
"""
def RandomMotifFinder(FASTAfile,k,d,runtime):
    first_time = time.time()        # Current time in seconds
    S = ReadFASTA(FASTAfile)
    # Accumulate and log basic statistics
    count = {nucleotide:0.0 for nucleotide in ["A","C","G","T"]}
    for sequence in S:
        for nucleotide in sequence:
            count[nucleotide] += 1.0
    total = 0.0
    for nucleotide in ["A","C","G","T"]:
        logging.info("Count of {0}'s is ".format(nucleotide)
                    + str(int(count[nucleotide])))
        total += count[nucleotide]
    logging.info("Total nucleotide count is " + str(int(total)))
    probability = {x:count[x]/total for x in ["A","C","G","T"]}
    for nucleotide in ["A","C","G","T"]:
        logging.info("Probability of A is ".format(nucleotide)
                    + str(probability[nucleotide]))
    # Generate random samples and check log likelihoods of corresponding
    # motifs
    motif = None
    loci = [0 for x in range(len(S))]
    log_likelihood = -10.0
    while time.time() - first_time < runtime:    # Run for runtime seconds
        new_loci = RandomSample(S,k)
        (new_motif,new_profile) = ProfileMotif(S,new_loci,k,d)
        new_log_likelihood = LogLikelihood(S,
                                           new_loci,
                                           new_motif,
                                           new_profile,
                                           probability)
        if new_log_likelihood > log_likelihood:
            logging.info("Log likelihood is now " + str(new_log_likelihood))
            logging.info("\t\tfor motif " + new_motif)
            loci = new_loci
            motif = new_motif
            log_likelihood = new_log_likelihood
    return (motif,loci,log_likelihood)

"""
Find one random sample of loci.
"""
def RandomSample(S,k):
    loci = list()
    for sequence in S:
        loci.append(randrange(0,len(sequence)-k+1))
    return loci

"""
The job of ProfileMotif is to determine the best motif and the corresponding
profile for given loci in S.
"""
def ProfileMotif(S,loci,k,d):
    # First generate best motif of length k without don't cares
    motif = ""
    # Profile matrix is just count of nucleotides in each position
    # of alignment corresponding to loci.
    profile = {x:{y:0 for y in range(k)} for x in ["A","C","G","T"]}
    for i in range(len(S)):
        substring = S[i][loci[i]:loci[i]+k]
        for j in range(k):
            profile[substring[j]][j] += 1
    # The best nucleotide in each position is the one with the highest
    # count.
    for j in range(k):
        best_nucleotide = "A"
        best_count = profile["A"][j]
        for nucleotide in ["C","G","T"]:
            count = profile[nucleotide][j]
            if count > best_count:
                best_count = count
                best_nucleotide = nucleotide
        motif += best_nucleotide
    # Now find best loci for d don't cares by counts in profile matrix
    best_list = [[profile[motif[j]][j],j,motif[j]] for j in range(k)]
    best_list.sort()   # Primary sort is by profile matrix count
    dont_cares = 0
    i = 0
    while dont_cares < d:
        if best_list[i][1] not in {0,k-1}:
            best_list[i][2] = "*"    # Lowest counts get the don't cares
            dont_cares += 1
        i += 1
    best_list = [[best_list[i][1],best_list[i][2]] for i in range(k)]
    best_list.sort()    # Sort is now by position
    motif_list = [best_list[j][1] for j in range(k)]    # Select nucleotides
                                                        # and don't cares
    motif = "".join(motif_list)
    return (motif,profile)

"""
LogLikelihood computes the log likelihood of the given motif in the given
loci.
"""
def LogLikelihood(S,loci,motif,profile,probability):
    n = len(S)
    log_likelihood = 0.0
    for j in range(len(motif)):
        nucleotide = motif[j]
        if nucleotide == "*":
            continue
        log_likelihood += math.log2((profile[nucleotide][j]/n)/
                                    probability[nucleotide])
    logging.info("Evaluated motif " + motif)
    logging.info("Log likelihood is " + str(log_likelihood))
    return log_likelihood

"""
Read a FASTA file and return the sequences as a list.
"""
def ReadFASTA(FASTAfile):
    header = ""         # A header line starts with ">"
    sequence = ""       # Accumulated sequence is initially empty
    list_of_sequences = list()  # Initial empty list of sequences
    for line in open(FASTAfile):
        line = line.rstrip()    # Eliminate new line and trailing white space
        if len(line) == 0:      # Ignore blank lines
            continue
        elif line[0] == ">":    # Process a header line
            if header:
                if sequence:
                    list_of_sequences.append(sequence)
                else:
                    raise ValueError("FASTA header without sequence\n" +
                           header)
            header = line
            logging.info("header line found:\n\t" + header)
            sequence = ""
        elif re.match(r"^[ACGT]*$",line):    # Process a sequence line
            if not header:
                raise ValueError("FASTA sequence without header\n" + line)
            sequence += line
        else:                   # Line not valid FASTA format
            raise ValueError("Unrecognized FASTA format line\n" + line)
    if sequence:
        list_of_sequences.append(sequence)
    else:
        raise ValueError("FASTA file does not end with sequence")
    return list_of_sequences

def main():
    """
    The main program.
    Uses argparse to parse the command line arguments.
    Uses logging to record progress.
    Then call RandomMotifFinder to find the best motif by random sampling.
    Finally, send best motif and loci to standard output.
    """

    # Set up basic logging
    logging.basicConfig(filename="RandomMotifFinder.log",filemode="w",
        level=logging.INFO,format="%(levelname)s: %(asctime)s: %(message)s")
    logging.info("Starting RandomMotifFinder")

    # Process command line arguments
    argparser = argparse.ArgumentParser(
        description="Use random sampling to find a motif.")
    argparser.add_argument("FASTAfile",type=str,help="Input FASTA file")
    argparser.add_argument("-k",dest="k",type=int,default=6,
        help="Motif length")
    argparser.add_argument("-d",dest="d",type=int,default=0,
        help="Number of don't cares")
    argparser.add_argument("-t","--time",dest="runtime",type=int,
                           default=2, help="Maximum run time in seconds")
    args = argparser.parse_args()
    logging.info("Arguments sucessfully parsed")
    FASTAfile = args.FASTAfile
    logging.info("Input FASTA file is " + FASTAfile)
    k = args.k
    logging.info("Motif length is " + str(k))
    if k <= 1:
        raise ValueError("k must be at least 2")
    d = args.d
    logging.info("Number of don't cares is " + str(d))
    if d < 0 or d > k-2:
        raise ValueError("d must be between 0 and k-2")
    runtime = args.runtime
    logging.info("Maximum run time is " + str(runtime) + " seconds")
    if runtime <= 1:
        raise ValueError("Run time must be at least 2")

    # Randomly sample loci; return best motif, loci, and log likelihood
    (motif,loci,log_likelihood) = RandomMotifFinder(FASTAfile,k,d,runtime)

    # Print results to standard output
    print("Best motif of length {0} with {1} don't cares is {2}".format(
        k,d,motif))
    print("Log likelihood is {0}".format(log_likelihood))
    print("Loci of the best motif are here:")
    for locus in loci:
          print(locus+1)

if __name__ == "__main__":
    main()