#!/usr/bin/env python from scipy import optimize from math import pow, log, exp, sqrt from random import random, normalvariate from copy import copy import numpy verbose = False _use_fmin = False ############################################################################ # Begin model-specific initialization code ############################################################################ ############################################################################ # Here declare a "class" that allows us to bundle together different pieces # of information about our data in a convenient bundle. The read_data # function and simulate data function will create "Gekko" objects, and # store them in a list. # # Specifically the data_list will have an element for each mating group. # # Each of these elements will consist of three lists - one for each diet # treatment. Specifically, if group_data is the data for one mating group # (family), then: # group_data[0] will be a list of individuals exposed to the "Small" # insect diet, # group_data[1] will be a list of individuals exposed to the "Medium" # insect diet, and # group_data[2] will be a list of individuals exposed to the "Large" # insect diet. # Each of this lists will be a list of "Gekko objects" # # If s1 and s2 refer to two different Gekko objects then you can see if they # share a mother using code like this: # # if s1.family == s2.family: # here is a block of code to execute for sibs # else: # here is a block of code to execute for distinct families # # Note that we use the "." operator to examine an attribute inside the "Gekko" # objects. The objects are just a convenient way to keep info on family, # treatment, and y-value (the dependent variable, growth rate) bundled # together. ############################################################################ class Gekko(object): def __init__(self, family, treatment, y): self.family = family self.treatment = treatment self.y = y def __repr__(self): return "Gekko(family=" + str(self.family) + ", treatment=" + str(self.treatment) + ", y=" + str(self.y) + ")" def __str__(self): return self.repr() class ExponentialDistribution(object): def __init__(self, mean): self.mean = mean def calc_ln_prob_ratio(self, numerator_variate, denominator_variate): ''' f(x) = (mean^-1) e^(-x/mean) So: ln[f(x)] = -x/mean - ln[mean] So: ln[f(numerator)] - ln[f(denominator)] = (denominator - numerator)/mean ''' return (denominator_variate - numerator_variate)/self.mean class NormalDistribution(object): def __init__(self, mean, sd): self.mean = mean self.var = sd*sd def calc_ln_prob_ratio(self, numerator_variate, denominator_variate): ''' f(x) = (2 pi sd^2) ^ -0.5 e^(-(x-mean)^2/(2sd^2)) So: ln[f(x)] = -0.5 ln[2 pi sd^2] - (x-mean)^2/(2sd^2) So: ln[f(numerator)] - ln[f(denominator)] = ((denominator-mean)^2 - (numerator-mean)^2)/(2sd^2) = (denominator^2 - numerator^2-mean)^2 - (numerator-mean)^2)/(2sd^2) ''' d_sq = denominator_variate*denominator_variate n_sq = numerator_variate*numerator_variate third_term = 2*self.mean*(numerator_variate - denominator_variate) return (d_sq - n_sq + third_term)/(2*self.var) class Parameter(object): '''This class will keep information about each parameter bundled together for easy reference ''' def __init__(self, name, initial_value, prior=None, proposal_window=None, min_bound=None, max_bound=None): self.name = name self.initial_value = initial_value self.prior = prior self.proposal_window = proposal_window self.min_bound = min_bound if min_bound is not None else float('-inf') self.max_bound = max_bound if max_bound is not None else float('inf') global_parameter_list = [ Parameter(name='mu_S', initial_value=10.0, prior=NormalDistribution(mean=10.0, sd=5), proposal_window=1.0, min_bound=None, max_bound=None), Parameter(name='effect_L', initial_value=10.0, prior=NormalDistribution(mean=1.0, sd=10.0), proposal_window=1.0, min_bound=None, max_bound=None), Parameter(name='var_G', initial_value=1.0, prior=ExponentialDistribution(mean=10.0), proposal_window=1.0, min_bound=0.0, max_bound=None), Parameter(name='var_L_interaction', initial_value=0.5, prior=ExponentialDistribution(mean=10.0), proposal_window=1.0, min_bound=0.0, max_bound=None), Parameter(name='var_Error', initial_value=5.0, prior=ExponentialDistribution(mean=10.0), proposal_window=1.0, min_bound=0.0, max_bound=None), ] ############################################################################ # End model-specific initialization code ############################################################################ def calculate_incidence_row_for_indiv(gekko): '''This function should return a python list. This list will be a single row in the incidence matrix. It should contain a coefficient (e.g. 1 or 0) for each of the parameters that appear in the equation of the linear predictor for this particular gekko. The incidence matrix will be multiplied by a parameter values. So the length of each row that you return here should be equal to the number of parameters ''' treatment = gekko.treatment INCIDENCE_MATRIX_ROW = [] if treatment == 0: INCIDENCE_MATRIX_ROW = [1, 0] elif treatment == 1: INCIDENCE_MATRIX_ROW = [1, 1] return INCIDENCE_MATRIX_ROW # you should not need to alter this next line. the_incidence_matrix = None def ln_likelihood(the_data, param_list): '''Calculates the log-likelihood of the parameter values in `param_list` based on `the_data` ''' global the_incidence_matrix if verbose: sys.stderr.write('param = ' + str(param_list) + '\n') ############################################################################ # Begin model-specific log-likelihood code ############################################################################ first_p, second_p, third_p, fourth_p, fifth_p = param_list mu_S = first_p effect_L = second_p var_g = third_p var_L_interaction = fourth_p var_error = fifth_p # The data is stored in multiple forms. They will be explained below as needed. # You should not have to change the next 4 lines... blocked_by_family = the_data[0] observed_values = the_data[1] family_sizes = the_data[2] flattened_data = the_data[3] # do our "sanity-checking" to make sure that we are in the legal range of # the parameters. # for p in param_list: if p != p: return float('-inf') if var_g < 0.0: return float('-inf') if var_error < 0.0: return float('-inf') FIXED_EFFECT_LIST = [mu_S, effect_L] # this next line creates a column vector, to be multiplied by the incidence # matrix (you should not have to change this) fixed_effects = numpy.matrix(FIXED_EFFECT_LIST).transpose() # this calculates a vector of expected values from the fixed effects and # incidence matrix (you should not have to change this) expected_values = the_incidence_matrix*fixed_effects # Because the values for the gekkos from different familys will have a # covariance of 0, most of the covariance matrix will be 0's and we can # create it in block-diagonal form that is easier to work with (and results # in much faster calculations. # Here will create a list of matrices. This will be the # non-zero blocks along the diagonal of the covariance matrix. # You won't have to change this next line. non_zero_covariance_blocks = list() var_individ_s = var_g + var_error var_individ_l = var_g + var_L_interaction + var_error covar_fam_s = var_g covar_fam_l = var_g + var_L_interaction # Here we walk over the data set one family at a time. for family_index, family_data in enumerate(flattened_data): # num_in_family will hold the number of gekkos from this family # from all parental combinations and treatments. num_in_family = family_sizes[family_index] # make an empty covariance matrix for this family (no need to change this). empty_row = [None]*num_in_family # make an empty maxtrix by copying the empty row num_in_family times # (you don't have to change this line). family_cov = [copy(empty_row) for i in xrange(num_in_family)] for i, gekko_i in enumerate(family_data): for j in range(i, num_in_family): gekko_j = family_data[j] ################################################################ # here we need to fill in family_cov[i][j] with the # covariance for gekko_i and gekko_j # # We'll do this by examining the attributes (such as gekko_i.mom # and gekko_i.dad) in gekko_i and gekko_j to fill in cov_element # # and then storing cov_element ################################################################ if i == j: if gekko_i.treatment == 0: cov_element = var_individ_s else: cov_element = var_individ_l else: if gekko_i.treatment == 0: cov_element = covar_fam_s else: cov_element = covar_fam_l ################################################################ # this is where we store the covariance element (the cov matrix # is symmetric, so we store it in (i, j) and (j, i) # If you calculated the covariance and stored it in the variable # called cov_element, then you should not have to change this. ################################################################ family_cov[i][j] = cov_element family_cov[j][i] = cov_element # append this non-zero part of the covariance matrix to a list # that will be used to represent a block diagonal matrix for all # measurements. # first we'll convert it from a python list of lists to a numpy matrix # (you should not have to change this). numpy_fam_cov = numpy.matrix(family_cov) non_zero_covariance_blocks.append(numpy_fam_cov) # here we calculate the inverse (exploiting the block diagonal structure) inverse_var = invert_block_diagonal(non_zero_covariance_blocks) # here we calculate the log of the determinant (exploiting the block diagonal structure) ln_determinant = ln_determinant_block_diagonal(non_zero_covariance_blocks) # We can calculate residuals by substracting observed_values from # expected_values. When we convert the python lists of values # to numpy.matrix objects, we get a column matrix. residuals_column = expected_values - observed_values # We can transpose the column vector of residuals to get a row... residuals_row = residuals_column.transpose() scaled_dev_sq = residuals_row*inverse_var*residuals_column ln_l = -0.5*(ln_determinant + float(scaled_dev_sq)) if verbose: sys.stderr.write('ln_l = ' + str(ln_l) + '\n') # This is how we send the result back to the function that called this # function. return ln_l ############################################################################ # End model-specific log-likelihood code ############################################################################ def calculate_incidence_matrix(the_data): '''This function is called to fill in the incidence matrix for the model. You should not need to touch this code. It calls calculate_incidence_row_for_indiv repeatedly to actually fill the matrix. ''' global the_incidence_matrix the_incidence_matrix_row_list = [] for family_index, family_data in enumerate(the_data): for treatment_index, indiv_data in enumerate(family_data): for indiv in indiv_data: the_incidence_matrix_row = calculate_incidence_row_for_indiv(indiv) the_incidence_matrix_row_list.append(the_incidence_matrix_row) the_incidence_matrix = numpy.matrix(the_incidence_matrix_row_list) def block_diag(*arrs): """Create a new diagonal matrix from the provided arrays. Parameters ---------- a, b, c, ... : ndarray Input arrays. Returns ------- D : ndarray Array with a, b, c, ... on the diagonal. Code from http://mail.scipy.org/pipermail/scipy-user/attachments/20090520/f30c0928/attachment.obj posted by Stefan van der Walt http://mail.scipy.org/pipermail/scipy-user/2009-May/021101.html """ arrs = [numpy.asarray(a) for a in arrs] shapes = numpy.array([a.shape for a in arrs]) out = numpy.zeros(numpy.sum(shapes, axis=0)) r, c = 0, 0 for i, (rr, cc) in enumerate(shapes): out[r:r + rr, c:c + cc] = arrs[i] r += rr c += cc return out def invert_block_diagonal(block_list): inv_block_list = [] for block in block_list: inv_block = numpy.linalg.inv(block) inv_block_list.append(inv_block) return block_diag(*inv_block_list) def ln_determinant_block_diagonal(block_list): det = 0.0 for block in block_list: det = det + log(numpy.linalg.det(block)) return det def propose_parameter(old_p, param_obj): ''' Returns a new, proposed value that is drawn from U(old_p - window_width/2, old_p + window_width/2) ''' # draw a U(-0.5, 0.5) variable u = random() - 0.5 window_width = param_obj.proposal_window offset = u*window_width new_p = old_p + offset min_b = param_obj.min_bound max_b = param_obj.max_bound while new_p < min_b or new_p > max_b: if new_p < min_b: diff = min_b - new_p new_p = diff + min_b if new_p > max_b: diff = new_p - max_b new_p = max_b - diff # this a symmetric proposal, so the ln_hastings_ratio is always 0.0 return new_p, 0.0 def calc_ln_prior_ratio(old_p, new_p, full_param_value_list, param_obj): '''Calls the prior_distribution's calc_ln_prob_ratio method to determin the log of: the prior density of new_p / the prior density of old_p ''' return param_obj.prior.calc_ln_prob_ratio(new_p, old_p) def do_mcmc(data, num_iterations, param_output_stream): curr_params = [] proposed_params = [] for p in global_parameter_list: curr_params.append(p.initial_value) proposed_params.append(p.initial_value) curr_lnL = ln_likelihood(data, curr_params) num_params = len(curr_params) num_accepted = 0 for iteration in xrange(num_iterations): for m, param_obj in enumerate(global_parameter_list): curr_p = curr_params[m] new_p, ln_hastings_ratio = propose_parameter(curr_p, param_obj) proposed_params[m] = new_p proposed_lnL = ln_likelihood(data, proposed_params) ln_prior_ratio = calc_ln_prior_ratio(curr_p, new_p, proposed_params, param_obj) ln_like_ratio = proposed_lnL - curr_lnL ln_acceptance_prob = ln_like_ratio + ln_prior_ratio + ln_hastings_ratio if ln_acceptance_prob > 0.0 or log(random()) < ln_acceptance_prob: num_accepted = num_accepted + 1 curr_lnL = proposed_lnL curr_params[m] = new_p else: # reject the move, set the appropriate element of the proposed_params # list back to its previous value proposed_params[m] = curr_p # Record the parameter values to an output stream, so that we can # analyze the samples params_as_str_list = [str(i) for i in curr_params] params_tab_separated = '\t'.join(params_as_str_list) param_output_stream.write(str(iteration) + '\t' + str(curr_lnL) + '\t' + params_tab_separated + '\n') return num_accepted def process_data(data_set): '''To make it easier to deal with the data, we'll calculate a few summaries of the data. Three items will be returned: the data_set, the observations, the number of individuals in each mating group. ''' block_sizes = [] observed_values = [] flattened_data = [] for group_data in data_set: sz = 0 flat = [] for group_treatment_data in group_data: sz = sz + len(group_treatment_data) flat.extend(group_treatment_data) for gekko in group_treatment_data: observed_values.append(gekko.y) flattened_data.append(flat) block_sizes.append(sz) observed_values = numpy.matrix(observed_values).transpose() return data_set, observed_values, block_sizes, flattened_data def read_data(filepath): '''Reads filepath as a tab-separated csv file and returns a 3dimensional data matrix.''' import os import csv import itertools if not os.path.exists(filepath): raise ValueError('The file "' + filepath + '" does not exist') TREATMENT_CODES = 'SL' MAX_TREATMENT_VALUE = len(TREATMENT_CODES) - 1 # Here we create a csv reader and tell it that a tab (\t) is the column delimiter entries = csv.reader(open(filepath, 'rbU'), delimiter=',') # Here we check that file has the headers that we exect first_row = entries.next() expected_headers = ['family', 'treatment', 'y (growth rate)'] for got, expected in itertools.izip(first_row, expected_headers): if got.lower().strip() != expected: raise ValueError('Error reading "' + filepath + '": expecting a column labelled "' + expected + '", but found "' + got + '" instead.') # It is not too hard to have this loop put the data in the right spot in the # matrix, so that the data file can be somwhat flexible. by_family = {} for n, row in enumerate(entries): fam_id, treatment_code, value = row try: fam_id = int(fam_id) except: raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting an integer for family ID, but got ' + str(fam_id)) try: treatment_id = TREATMENT_CODES.index(treatment_code.upper()) assert(treatment_id >= 0 and treatment_id <= MAX_TREATMENT_VALUE) except: raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting a single letter code (one of "' + TREATMENT_CODES + '") for the treatment, but got ' + str(treatment_code)) try: value = float(value) except: raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting an number for the trait value, but got ' + str(value)) # a new family corresponds to an empty dictionary of individuals # for the 0, and 1 treatment. So we can create a list with two empty # dictionaries as the "blank" entry. empty_family_entry = [[] for i in range(MAX_TREATMENT_VALUE + 1)] fam_array = by_family.setdefault(fam_id, empty_family_entry) # now we grab the appropriate one for this treatment list_for_fam_treatment = fam_array[treatment_id] list_for_fam_treatment.append(Gekko(family=fam_id, treatment=treatment_id, y=value)) fam_keys = by_family.keys() fam_keys.sort() # Now we'll sort the data matrix and print some status information status_stream = sys.stdout status_stream.write("Data read for " + str(len(fam_keys)) + " families...\n") full_data = [] for i, fam_id in enumerate(fam_keys): treatments_list = by_family[fam_id] status_stream.write(" Mating group index=" + str(i) + " (id in datafile = " + str(fam_id) + "):\n") full_data.append(treatments_list) for treat_ind, indiv_list in enumerate(treatments_list): treatment_code = TREATMENT_CODES[treat_ind] status_stream.write(' ' + str(len(indiv_list)) + ' individuals in with treatment code "' + treatment_code + '" (numerical code ' + str(treat_ind) + ')\n') return process_data(full_data) def print_help(): num_args_expected = 2 + len(global_parameter_list) output_stream = sys.stdout output_stream.write('Expecting ' + str(num_args_expected) + ''' arguments. The last argument should be the number MCMC iterations, The first argument should be the path (filename) for the datafile (which should be a csv file with tab as the column separator. The intervening arguments should be starting values for each of the parameters The order of the parameters is in this constraint statement is: ''') for p in global_parameter_list: output_stream.write(p.name + ' ') output_stream.write('\n\n') if __name__ == '__main__': # user-interface and sanity checking... # we import sys try: filepath = sys.argv[1] arguments = sys.argv[2:] if len(arguments) < 1 + len(global_parameter_list): print len(arguments) print_help() sys.exit(1) except Exception, e: print 'Error:', e, '\n' print_help() sys.exit(1) real_data = read_data(filepath) calculate_incidence_matrix(real_data[0]) # The number of simulations is the last parameter... # num_iterations = int(arguments[-1]) for n, v in enumerate(arguments[:-1]): param = global_parameter_list[n] try: param.initial_value = float(v) except: sys.exit('Expecting an initial value for the parameter "' + param.name + '" but got: "' + v + '"') if param.min_bound is not None and param.initial_value < param.min_bound: sys.exit('Expecting an initial value for the parameter "' + param.name + '" to be >=' + param.min_bound + ' but got "' + v + '"') if param.max_bound is not None and param.initial_value > param.max_bound: sys.exit('Expecting an initial value for the parameter "' + param.name + '" to be <=' + param.max_bound + ' but got "' + v + '"') param_output_stream = sys.stderr param_output_stream.write("Iteration\tlnL") for p in global_parameter_list: param_output_stream.write('\t' + p.name) param_output_stream.write('\n') num_accepted = do_mcmc(real_data, num_iterations, param_output_stream) print "Accepted " + str(num_accepted) + " updates\n" print "Ran " + str(num_iterations) + " iterations over all " + str(len(global_parameter_list)) + " parameters.\n"