#!/usr/bin/env python
import sys
from scipy import optimize
from math import pow, log, exp
from random import random

# we are going to use a summary that is a pair of numbers for each pair of males
# it will represent:
#   (the number of bouts won by male 0, the number of bouts won by male 1)
real_data = [(0, 2), (1, 1), (1, 1), (0, 2), (2, 0),
             (1, 1), (0, 2), (2, 0), (1, 1), (0, 2),
             ]

## Data from 6 sampling schemes: 2 bouts/pair, 3 bouts/pair, ... 7 bouts/pair
##   for each sampling scheme we have 20 observations
# real_data = [(0, 2), (0, 2), (0, 2), (0, 2), (0, 2), (2, 0), (2, 0), (2, 0), (1, 1), (0, 2),
#              (0, 2), (2, 0), (0, 2), (1, 1), (1, 1), (2, 0), (0, 2), (1, 1), (1, 1), (1, 1),
#              (3, 0), (2, 1), (0, 3), (1, 2), (1, 2), (0, 3), (0, 3), (2, 1), (0, 3), (1, 2),
#              (2, 1), (1, 2), (2, 1), (1, 2), (1, 2), (2, 1), (2, 1), (2, 1), (0, 3), (1, 2),
#              (0, 4), (3, 1), (0, 4), (3, 1), (1, 3), (2, 2), (1, 3), (4, 0), (1, 3), (2, 2),
#              (1, 3), (3, 1), (0, 4), (2, 2), (1, 3), (0, 4), (1, 3), (4, 0), (2, 2), (3, 1),
#              (2, 3), (4, 1), (4, 1), (4, 1), (2, 3), (2, 3), (2, 3), (0, 5), (1, 4), (2, 3),
#              (4, 1), (4, 1), (5, 0), (1, 4), (1, 4), (2, 3), (3, 2), (2, 3), (2, 3), (5, 0),
#              (3, 3), (1, 5), (0, 6), (4, 2), (4, 2), (4, 2), (0, 6), (4, 2), (1, 5), (3, 3),
#              (1, 5), (3, 3), (4, 2), (5, 1), (3, 3), (2, 4), (1, 5), (5, 1), (5, 1), (3, 3),
#              (4, 3), (7, 0), (6, 1), (6, 1), (3, 4), (3, 4), (4, 3), (1, 6), (4, 3), (5, 2),
#              (2, 5), (3, 4), (2, 5), (4, 3), (4, 3), (4, 3), (4, 3), (4, 3), (2, 5), (1, 6)]

############################################################################
# Begin model-specific initialization code
############################################################################
# This is a list of values to try for the numerical optimizer.  The length of
#   this list is also used by some functions to determine the dimensionality
#   of the model
INITIAL_PARAMETER_GUESS = [.75, .75]
############################################################################
# End model-specific initialization code
############################################################################

verbose = False

def ln_likelihood(the_data, param_list):
    '''Calculates the log-likelihood of the parameter values in `param_list`
    based on `the_data`
    '''
    if verbose:
        sys.stderr.write('param = ' + str(param_list) + '\n')
    ############################################################################
    # Begin model-specific log-likelihood code
    ############################################################################
    p_strong, w = param_list
    ############################################################################
    # Return -inf if the parameters are out of range.  This will keep the
    #   optimizer from returning "illegal" values of the parameters.
    #
    if w != w or p_strong != p_strong:  # a NaN is the only value unequal to itself
        return float('-inf')
    if (w < .5) or (w > 1.0) or (p_strong < 0.0) or (p_strong > 1.0):
        return float('-inf')

    p_SS = p_strong * p_strong
    p_SW = p_strong * (1 - p_strong)
    p_WS = p_SW
    p_WW = (1 - p_strong) * (1 - p_strong)

    # Calculate the probability that the males are evenly matched
    #
    p_even = p_WW + p_SS

    # Calculate the probability of an outcome of a bout (0 or 1) conditional
    #   on what type of pairing we have:
    #
    prob0_even = 0.5
    prob1_even = 1 - prob0_even
    prob0_SW = w
    prob1_SW = 1 - prob0_SW
    prob0_WS = 1 - w
    prob1_WS = 1 - prob0_WS
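    # The likelihood of a single datum is a finite mixture over the three
    #   pairing types (law of total probability):
    #       P(datum) = p_even*P(datum|even) + p_SW*P(datum|SW) + p_WS*P(datum|WS)
    #   where each conditional probability treats the bouts as independent
    #   Bernoulli trials.  Note that the binomial coefficient is omitted
    #   throughout; it is constant with respect to the parameters, so it does
    #   not affect the MLEs or any likelihood ratio (only the absolute
    #   magnitude of the reported likelihood).
    #
    # Illustrative arithmetic (an added example, not part of the analysis):
    #   with p_strong = 0.75 and w = 0.75, the datum (0, 2) gives
    #       p_even = 0.5625 + 0.0625 = 0.625,   p_SW = p_WS = 0.1875
    #       P(datum|even) = 0.5**0  * 0.5**2  = 0.25
    #       P(datum|SW)   = 0.75**0 * 0.25**2 = 0.0625
    #       P(datum|WS)   = 0.25**0 * 0.75**2 = 0.5625
    #       P(datum) = 0.625*0.25 + 0.1875*0.0625 + 0.1875*0.5625 = 0.2734375
    #   so this datum contributes log(0.2734375), about -1.2967, to ln_l.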
    # initialize the ln_l to 0, we want this to be the sum of the log-likelihood
    #   for each datum, so we'll start at 0 and add to it.
    #
    ln_l = 0.0
    for datum in the_data:
        # Each datum is a count of the number of bouts won by male 0 (the 0-th
        #   element of the datum object), and the number of bouts won by
        #   male #1 (the element of the datum object at index 1).
        #
        n0_wins, n1_wins = datum

        # Now we calculate the probability of seeing the data conditional on
        #   each of the pairing scenarios (even, strong/weak, and weak/strong).
        #
        p_datum_if_even = pow(prob0_even, n0_wins) * pow(prob1_even, n1_wins)
        p_datum_and_even = p_even * p_datum_if_even

        p_datum_if_SW = pow(prob0_SW, n0_wins) * pow(prob1_SW, n1_wins)
        p_datum_and_SW = p_SW * p_datum_if_SW

        p_datum_if_WS = pow(prob0_WS, n0_wins) * pow(prob1_WS, n1_wins)
        p_datum_and_WS = p_WS * p_datum_if_WS

        # By the law of total probability, the probability of this datum is
        #   simply the sum of the probability under each scenario multiplied by
        #   the probability that the scenario would occur
        #
        p_datum = p_datum_and_even + p_datum_and_SW + p_datum_and_WS

        # To avoid taking the log of 0.0, we'll return -infinity for the
        #   log-likelihood of any scenario that is incompatible with a datum
        #
        if p_datum <= 0.0:
            return float('-inf')

        # This is where we add the log-likelihood for this datum to the total
        #   for the whole data set.
        #
        ln_l = ln_l + log(p_datum)

    # This is how we send the result back to the function that called this
    #   function.  SciPy's numerical optimization code will use the values
    #   we return to try to find optimal values of p_strong and w by
    #   repeatedly trying out a lot of plausible combinations.
    #
    if verbose:
        sys.stderr.write('ln_l = ' + str(ln_l) + '\n')
    return ln_l
    ############################################################################
    # End model-specific log-likelihood code
    ############################################################################

def simulate_data(template_data, params):
    '''Simulate a data set of the same size as `template_data` but under a
    model described by the parameter values in `params`
    '''
    ############################################################################
    # Begin model-specific simulation code
    ############################################################################
    # For our model the params are s and w, so we can unpack those from
    #   the list of parameters
    s, w = params
    p_SS = s * s
    p_SW = s * (1 - s)
    p_WS = (1 - s) * s
    p_WW = (1 - s) * (1 - s)
    p_even = p_SS + p_WW
    sim_data_set = []
    for datum in template_data:
        n_bouts = sum(datum)

        # randomly pick the type of pairing that we have: 'EVEN', 'SW' or 'WS'
        rand_match_p = random()
        if rand_match_p < p_even:
            match_type = 'EVEN'
        elif rand_match_p < p_even + p_SW:
            match_type = 'SW'
        else:
            match_type = 'WS'

        # determine the probability that male 0 wins each bout, using the same
        #   convention as ln_likelihood: in an 'SW' pairing male 0 is the
        #   strong male, so he wins each bout with probability w.
        if match_type == 'EVEN':
            p_zero_wins = 0.5
        elif match_type == 'SW':
            p_zero_wins = w
        else:
            p_zero_wins = 1 - w

        # start out with no bouts won by either, then we are going to
        #   simulate the result of n_bouts
        n0_won = 0
        n1_won = 0
        for bout in range(n_bouts):
            if random() < p_zero_wins:
                n0_won = n0_won + 1
            else:
                n1_won = n1_won + 1

        # add this simulated outcome to our simulated data set
        sim_datum = (n0_won, n1_won)
        sim_data_set.append(sim_datum)
    return sim_data_set
    ############################################################################
    # End model-specific simulation code
    ############################################################################
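# A quick doctest-style check of simulate_data (illustrative usage, assumed to
#   be run from an interactive session; it is not executed by this script).
#   Each simulated datum preserves the bout count of the corresponding template
#   datum; only the split of wins between the two males varies:
#
#       >>> sim = simulate_data(real_data, [0.75, 0.75])
#       >>> [sum(d) for d in sim] == [sum(d) for d in real_data]
#       True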
def calc_global_ml_solution(data):
    '''Uses SciPy's optimize.fmin to find the MLE.

    Starts the search at s = 0.75, and w = 0.75

    Returns the (s_mle, w_mle, ln_L)
    '''

    def scipy_ln_likelihood(x):
        '''SciPy minimizes functions.  We want to maximize the likelihood.  This
        function adapts our ln_likelihood function to the minimization context
        by returning the negative log-likelihood.

        We use this function with SciPy's minimization routine (minimizing the
        negative log-likelihood will maximize the log-likelihood).
        '''
        return -ln_likelihood(data, x)

    x0 = INITIAL_PARAMETER_GUESS
    solution = optimize.fmin(scipy_ln_likelihood, x0, xtol=1e-8, disp=False)
    solution = list(solution)
    ln_l = -scipy_ln_likelihood(solution)
    solution.append(ln_l)
    return solution

def calc_null_ml_solution(data, param_constraints):
    '''This function allows us to optimize those parameters that are set to
    None in the list `param_constraints`.  Other parameters are forced to
    assume the value listed in param_constraints.
    '''
    x0 = []
    adaptor_list = []
    for i, val in enumerate(INITIAL_PARAMETER_GUESS):
        if i >= len(param_constraints) or param_constraints[i] is None:
            x0.append(val)
            adaptor_list.append(None)
        else:
            adaptor_list.append(param_constraints[i])

    def intercalate_constraints(x):
        '''Builds a full parameter list by filling the free (None) slots of
        `adaptor_list` with the optimizer's current values from `x`.
        '''
        p = []
        x_index = 0
        for el in adaptor_list:
            if el is None:
                p.append(x[x_index])
                x_index = x_index + 1
            else:
                p.append(el)
        return p

    def constrained_scipy_ln_likelihood(x):
        all_params = intercalate_constraints(x)
        return -ln_likelihood(data, all_params)

    if len(x0) > 0:
        solution = optimize.fmin(constrained_scipy_ln_likelihood,
                                 x0,
                                 xtol=1e-8,
                                 disp=False)
        all_params = intercalate_constraints(solution)
    else:
        all_params = list(param_constraints)
    ln_l = ln_likelihood(data, all_params)
    all_params = list(all_params)
    all_params.append(ln_l)
    return all_params

def calc_lrt_statistic(data, null_params):
    '''Returns (log-likelihood ratio test statistic, list of MLEs of all
    parameters and lnL at the global ML point, a list of the MLEs of all
    parameters and lnL under the null) for `data` with the null hypothesis
    being the parameters constrained to be at the values specified in the
    list `null_params`
    '''
    # First we calculate the global and null solutions
    #
    global_mle = calc_global_ml_solution(data)
    null_mle = calc_null_ml_solution(data, null_params)

    # the log-likelihood is returned as the last element of the list by these
    #   functions.  We can access this element by referring to element -1 using
    #   the list indexing syntax - which is [] braces:
    #
    global_max_ln_l = global_mle[-1]
    null_max_ln_l = null_mle[-1]

    # Now we can calculate the likelihood ratio test statistic, and return
    #   it as well as the global and null solutions
    #
    lrt = 2*(null_max_ln_l - global_max_ln_l)
    return lrt, global_mle, null_mle
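# Note on the sign convention used here: calc_lrt_statistic returns
#     2 * (lnL_null - lnL_global)
# which is always <= 0 (the null model is nested in the global model), so a
# "more extreme" statistic below means a more *negative* one.  The textbook
# statistic is the negation, 2 * (lnL_global - lnL_null), which under standard
# regularity conditions is approximately chi-square distributed with 1 degree
# of freedom, e.g. (an illustrative check, not used by this script, since the
# bound w >= 0.5 can make the chi-square approximation unreliable):
#
#       >>> from scipy import stats
#       >>> stats.chi2.sf(-lrt, 1)   # lrt as computed in the main block below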
if __name__ == '__main__':
    # user-interface and sanity checking...
    # (sys was imported at the top of the file; we need it here for the
    #   command-line arguments)
    w_null = float(sys.argv[1])
    n_sims = int(sys.argv[2])

    # In my ln_likelihood, I decided that w will be the second parameter listed
    #
    null_params = [None, w_null]

    # Call a function to maximize the log-likelihood under the unconstrained
    #   and null conditions.  This returns the LRT statistic and the
    #   parameter estimates as lists
    #
    lrt, mle_list, null_mle_list = calc_lrt_statistic(real_data, null_params)

    # We can "unpack" the list into 3 separate variables to make it easier to
    #   report
    p_strong_mle, w_mle, ln_l = mle_list
    p_strong_null, w_null, ln_l_null = null_mle_list

    print "MLE of p_strong =", p_strong_mle
    print "MLE of w =", w_mle
    print "lnL at MLEs =", ln_l
    print "L at MLEs =", exp(ln_l)
    print
    print "MLE of p_strong at null w =", p_strong_null
    print "null value of w =", w_null
    print "lnL at null =", ln_l_null
    print "L at null =", exp(ln_l_null)
    print
    print "2* log-likelihood ratio = ", lrt
    print

    # Do parametric bootstrapping to produce the null distribution of the LRT
    #   statistic
    #
    if n_sims < 1:
        sys.exit(0)
    print "Generating null distribution of LRT..."

    # We'll write the simulated LRT values to the "standard error stream" which
    #   is written to the screen when we run the program from a terminal.
    #   We could use:
    #       pboot_out = open("param_boot.txt", "w")
    #   if we wanted to write the results to a file called "param_boot.txt"
    #
    pboot_out = sys.stderr
    pboot_out.write("rep\tlrt\n")

    # null_dist will be a list that holds all of the simulated LRT values.  We
    #   use [] to create an empty list.
    #
    null_dist = []

    # a "for loop" will repeat the following instructions n_sims times
    #
    for i in range(n_sims):
        # This simulates a data set assuming that the parameters are at the
        #   values that they take under the null hypothesis (since we want the
        #   null distribution of the test statistic).
        #
        sim_params = [p_strong_null, w_null]
        sim_data = simulate_data(real_data, sim_params)

        # Calculate the LRT on the simulated data using the same functions that
        #   we used when we analyzed the real data
        #
        sim_lrt, sim_mle_list, sim_null = calc_lrt_statistic(sim_data, null_params)

        # Add the simulated LRT to our null distribution
        #
        null_dist.append(sim_lrt)

        # Write the value to the output stream.  The str() function converts
        #   numbers to strings so that they can be written to the output stream
        #
        pboot_out.write(str(i + 1))
        pboot_out.write('\t')
        pboot_out.write(str(sim_lrt))
        pboot_out.write('\n')

    # We want the most extreme (negative) values of the LRT from the
    #   simulations.  If we sort the list, then these values will be at the
    #   front of the list (available by indexing the list with small numbers)
    #
    null_dist.sort()

    # We can report the value of the LRT that is smaller than 95% of the
    #   simulated values...
    #
    n_for_p_point05 = int(0.05*n_sims)
    print "5% critical value is approx =", null_dist[n_for_p_point05]

    # And we can calculate the P-value for our data by counting the number of
    #   simulated replicates that are more extreme than our observed data.
    #   We do this by starting a counter at 0, walking through the sorted
    #   null distribution, and adding 1 to our counter every time we see a
    #   simulated LRT that is at least as extreme as the "real" lrt.
    #
    n_more_extreme = 0
    for v in null_dist:
        if v <= lrt:
            n_more_extreme = n_more_extreme + 1
        else:
            break

    # Then we express this count of the number more extreme as a probability
    #   that a simulated value would be more extreme than the real data.
    #
    print "Approx P-value =", n_more_extreme/float(n_sims)
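# Example invocation (the file name "pairs.py" is hypothetical; substitute
#   whatever name this script is saved under):
#
#       python pairs.py 0.8 100
#
#   This tests the null hypothesis w = 0.8, estimates p_strong under both the
#   constrained and unconstrained models, and then generates 100
#   parametric-bootstrap replicates, echoing each replicate's LRT to stderr
#   as "rep<TAB>lrt".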