#!/usr/bin/env python3
# Copyright (c) 2014 Lenwood S. Heath. All rights reserved.
# This program or module is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. It is provided for educational
# purposes and is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

"""
This program reads a FASTA file of sequences and the output from a
motif finder program to compute and verify the log likelihood reported.
"""

import argparse
import logging
import math
import re
import sys

"""
EvaluateLogLikelihood does the work of reading the reported log likelihood
and reporting whether it matches the computed log likelihood.
"""
def EvaluateLogLikelihood(FASTAfile,k,d):
    S = ReadFASTA(FASTAfile)
    # Accumulate and log basic statistics
    count = {nucleotide:0.0 for nucleotide in ["A","C","G","T"]}
    for sequence in S:
        for nucleotide in sequence:
            count[nucleotide] += 1.0
    total = 0.0
    for nucleotide in ["A","C","G","T"]:
        logging.info("Count of {0}'s is ".format(nucleotide)
                    + str(int(count[nucleotide])))
        total += count[nucleotide]
    logging.info("Total nucleotide count is " + str(int(total)))
    probability = {x:count[x]/total for x in ["A","C","G","T"]}
    for nucleotide in ["A","C","G","T"]:
        logging.info("Probability of {0} is ".format(nucleotide)
                    + str(probability[nucleotide]))
    # Read and parse standard input
    line = sys.stdin.readline()
    re_match = re.match(r"Best motif of length (\d+) with (\d+) "
                        + "don't cares is ([ACGT*]+)",
                        line)
    if re_match is None:
        raise ValueError("Standard input has incorrectly formatted line\n"
                         + line)
    (reported_k,reported_d,reported_motif) = re_match.group(1,2,3)
    reported_k = int(reported_k)
    reported_d = int(reported_d)
    logging.info("Reported k is " + str(reported_k))
    logging.info("Reported d is " + str(reported_d))
    logging.info("Reported motif is " + reported_motif)
    line = sys.stdin.readline()
    re_match = re.match(r"Log likelihood is (.+)",line)
    if re_match is None:
        raise ValueError("Standard input has incorrectly formatted line\n"
                         + line)
    reported_log_likelihood = float(re_match.group(1))
    line = sys.stdin.readline()
    re_match = re.match(r"Loci of the best motif are here:",line)
    if re_match is None:
        raise ValueError("Standard input has incorrectly formatted line\n"
                         + line)
    reported_loci = []
    for line in sys.stdin:
        line = line.rstrip()
        reported_locus = int(line)
        logging.info("Reported locus is " + str(reported_locus))
        reported_loci.append(reported_locus-1)
    # For the reported loci, find the best motif and log likelihood
    loci = reported_loci
    (motif,profile) = ProfileMotif(S,loci,k,d)
    log_likelihood = LogLikelihood(S,
                                    loci,
                                    motif,
                                    profile,
                                    probability)
    if reported_motif == motif:
        print("Reported motif {0} matches computed motif".format(motif))
    else:
        print("Reported motif {0} does not match computed motif {1}".format(
            reported_motif,motif))
    if abs(reported_log_likelihood-log_likelihood) < 0.0001:
        print("Reported log likelihood {0} matches\n".format(
            reported_log_likelihood) +
              "\tcomputed log likelihood {0}".format(log_likelihood))
    else:
        print("Reported log likelihood {0} does not match\n".format(
            reported_log_likelihood) +
              "\tcomputed log likelihood {0}".format(log_likelihood))
    return (motif,loci,log_likelihood)

"""
The job of ProfileMotif is to determine the best motif and the corresponding
profile for given loci in S.
"""
def ProfileMotif(S,loci,k,d):
    # First generate best motif of length k without don't cares
    motif = ""
    # Profile matrix is just count of nucleotides in each position
    # of alignment corresponding to loci.
    profile = {x:{y:0 for y in range(k)} for x in ["A","C","G","T"]}
    for i in range(len(S)):
        substring = S[i][loci[i]:loci[i]+k]
        for j in range(k):
            profile[substring[j]][j] += 1
    # The best nucleotide in each position is the one with the highest
    # count.
    for j in range(k):
        best_nucleotide = "A"
        best_count = profile["A"][j]
        for nucleotide in ["C","G","T"]:
            count = profile[nucleotide][j]
            if count > best_count:
                best_count = count
                best_nucleotide = nucleotide
        motif += best_nucleotide
    # Now find best loci for d don't cares by counts in profile matrix
    best_list = [[profile[motif[j]][j],j,motif[j]] for j in range(k)]
    best_list.sort()   # Primary sort is by profile matrix count
    dont_cares = 0
    i = 0
    while dont_cares < d:
        if best_list[i][1] not in {0,k-1}:
            best_list[i][2] = "*"    # Lowest counts get the don't cares
            dont_cares += 1
        i += 1
    best_list = [[best_list[i][1],best_list[i][2]] for i in range(k)]
    best_list.sort()    # Sort is now by position
    motif_list = [best_list[j][1] for j in range(k)]    # Select nucleotides
                                                        # and don't cares
    motif = "".join(motif_list)
    return (motif,profile)

"""
LogLikelihood computes the log likelihood of the given motif in the given
loci.
"""
def LogLikelihood(S,loci,motif,profile,probability):
    n = len(S)
    log_likelihood = 0.0
    for j in range(len(motif)):
        nucleotide = motif[j]
        if nucleotide == "*":
            continue
        log_likelihood += math.log2((profile[nucleotide][j]/n)/
                                    probability[nucleotide])
    logging.info("Evaluated motif " + motif)
    logging.info("Log likelihood is " + str(log_likelihood))
    return log_likelihood

"""
Read a FASTA file and return the sequences as a list.
"""
def ReadFASTA(FASTAfile):
    header = ""         # A header line starts with ">"
    sequence = ""       # Accumulated sequence is initially empty
    list_of_sequences = list()  # Initial empty list of sequences
    for line in open(FASTAfile):
        line = line.rstrip()    # Eliminate new line and trailing white space
        if len(line) == 0:      # Ignore blank lines
            continue
        elif line[0] == ">":    # Process a header line
            if header:
                if sequence:
                    list_of_sequences.append(sequence)
                else:
                    raise ValueError("FASTA header without sequence\n" +
                           header)
            header = line
            logging.info("header line found:\n\t" + header)
            sequence = ""
        elif re.match(r"^[ACGT]*$",line):    # Process a sequence line
            if not header:
                raise ValueError("FASTA sequence without header\n" + line)
            sequence += line
        else:                   # Line not valid FASTA format
            raise ValueError("Unrecognized FASTA format line\n" + line)
    if sequence:
        list_of_sequences.append(sequence)
    else:
        raise ValueError("FASTA file does not end with sequence")
    return list_of_sequences

def main():
    """
    The main program.
    Uses argparse to parse the command line arguments.
    Uses logging to record progress.
    Then call EvaluateLogLikelihood to evaluate the motif and log likelihood
    reported in standard input.
    """

    # Set up basic logging
    logging.basicConfig(filename="EvaluateLogLikelihood.log",filemode="w",
        level=logging.INFO,format="%(levelname)s: %(asctime)s: %(message)s")
    logging.info("Starting EvaluateLogLikelihood")

    # Process command line arguments
    argparser = argparse.ArgumentParser(
        description="Evaluate reported log likelihood from standard input.")
    argparser.add_argument("FASTAfile",type=str,help="Input FASTA file")
    argparser.add_argument("-k",dest="k",type=int,default=6,
        help="Motif length")
    argparser.add_argument("-d",dest="d",type=int,default=0,
        help="Number of don't cares")
    args = argparser.parse_args()
    logging.info("Arguments sucessfully parsed")
    FASTAfile = args.FASTAfile
    logging.info("Input FASTA file is " + FASTAfile)
    k = args.k
    logging.info("Motif length is " + str(k))
    if k <= 1:
        raise ValueError("k must be at least 2")
    d = args.d
    logging.info("Number of don't cares is " + str(d))
    if d < 0 or d > k-2:
        raise ValueError("d must be between 0 and k-2")

    # Evaluate the log likelihood of the motif in standard input
    (motif,loci,log_likelihood) = EvaluateLogLikelihood(FASTAfile,k,d)

    # Print results to standard output
    print("Best motif of length {0} with {1} don't cares is {2}".format(
        k,d,motif))
    print("Log likelihood is {0}".format(log_likelihood))
    print("Loci of the best motif are here:")
    for locus in loci:
          print(locus+1)
    logging.info("Ending EvaluateLogLikelihood")

if __name__ == "__main__":
    main()