#!/usr/bin/env python from __future__ import print_function from scipy import optimize from math import log, exp from time import time from random import Random import sys WORST_LN_L = float('-inf') def calc_p_success(parameters, explanatory_vars): # Use the model to calculate a z response for every explanatory variable z_list = [] if len(parameters) == 2: # linear model a, b = parameters for x in explanatory_vars: z = a + b*x z_list.append(z) else: try: a, b, c = parameters except: sys.exit('Expecting a 2 parameter or 3 parameter model. Found {} parameters\n'.format(len(parameters))) # parabolic model for x in explanatory_vars: z = a + b*x + c*x*x z_list.append(z) # logistic link function maps z -> probability of success p_list = [] for z in z_list: try: p = 1.0 / (1.0 + exp(-z)) except: if -z < -300: p = 1.0 else: assert -z > 300 p = 0.0 p_list.append(p) return p_list def ln_likelihood(parameters): ln_l = 0.0 p_list = calc_p_success(parameters, X) for i, y in enumerate(Y): p = p_list[i] if y: # the datum was a success, so we use log(p_success) ln_l += log(p) else: # the datum was a failure, so we use log(p_success) ln_l += log(1.0 - p) return ln_l def estimate_global_MLE(params0): param_opt = optimize.fmin(scipy_ln_likelihood, list(params0), xtol=1e-8, disp=False) return param_opt, -scipy_ln_likelihood(param_opt) def scipy_ln_likelihood(parameters): negLnL = -ln_likelihood(parameters) return negLnL # Simulate data to derive a null distribution for the LRT test stat def simulate_data(params, explanatory_vars, rng): p_list = calc_p_success(params, explanatory_vars) sim_y = [] for p in p_list: if rng.random() < p: sim_y.append(True) else: sim_y.append(False) return sim_y def do_param_boot_rep(null_param_values, rng): global Y, X sim_y = simulate_data(null_param_values, X, rng) # because we use globals. We hold onto the real data, and swap it in later. real_data_Y = Y Y = sim_y est_simple_mle, simple_lnL = estimate_global_MLE(null_param_values) full_model_params = [0.0]*(1 + len(null_param_values)) est_rich_mle, rich_lnL = estimate_global_MLE(full_model_params) sim_lrt = 2*(rich_lnL - simple_lnL) Y = real_data_Y return sim_lrt def main(): # Calculate the MLEs and LRT linear_mles, linear_lnL = estimate_global_MLE([0.0, 0.0]) print('Linear model with logistic link function (two free parameters):') print(' The max lnL = ', linear_lnL) print(' MLEs = ', linear_mles) parabolic_mles, parabolic_lnL = estimate_global_MLE([0.0, 0.0, 0.0]) print('Parabolic model with logistic link function (three free parameters):') print(' The max lnL = ', parabolic_lnL) print(' MLEs = ', parabolic_mles) # lrt lrt = 2*(parabolic_lnL - linear_lnL) print('\nThe LRT test statistic =', lrt) if lrt < 3.84: word = 'NOT' else: word = '' print(' This is', word ,'significant using the chi-square with 1 degree of freedom to assess significance.') #print('First draw from the rng would be', rng.random()) if num_sims > 0: sim_lrts = [] for i in range(num_sims): if i % 50 == 0: sys.stderr.write(' sim replicate {}\n'.format(i)) null_true_lrt = do_param_boot_rep(linear_mles, rng) sim_lrts.append(null_true_lrt) sim_lrts.sort() five_percent_count = num_sims / 20 p = 0.95 print('Based on a simulated null distribution of for LRT:') index_cutoff = five_percent_count fmt = ' for P of about {:3.2f} the critical value is {:6.3f} (based on the {} out of {} values)' while p > 0.0: cutoff = sim_lrts[index_cutoff] print(fmt.format(p, cutoff, index_cutoff, num_sims)) index_cutoff += five_percent_count p -= 0.05 p_value = len([i for i in sim_lrts if i > lrt])/float(len(sim_lrts)) print('The approximation of the P-value is', p_value) if __name__ == '__main__': # User interface try: data_filename = sys.argv[1] num_sims = int(sys.argv[2]) assert num_sims >= 0 except: sys.exit('Expecting 2 arguments: a path to a data file and the # of parameteric bootstrap reps to run') if len(sys.argv) > 3: rn_seed = int(sys.argv[3]) else: rn_seed = int(time()) if num_sims > 0: sys.stderr.write('The random number seed is {}\n'.format(rn_seed)) rng = Random() rng.seed(rn_seed) if (num_sims % 20) != 0: sys.exit('Please choose a number of simulations that is a multiple of 20, so I do not have to think too hard about the summary of critical values.\n''') # read in the data filling the globals X and Y with open(data_filename, 'rU') as data_file: X = [] # data stored as global lists Y = [] for idx, row in enumerate(data_file): cols = row.replace('\n', '').split('\t') X.append(float(cols[0])) r = int(cols[1]) assert r in (0, 1) Y.append(r == 1) main()