#!/usr/bin/env python
'''
I opted for a slower, but (hopefully) easier system than the data structure
used in the latent_gekko example. Either style works (and all of the theory
is the same in both cases). I just thought this template might be easier to
work from.

I've tried to use ALL_CAPS to flag things that you will definitely have to
change.

Unlike previous assignments, you'll need to look a bit at the data
processing (see the process_data function).
'''
from scipy import optimize
from math import pow, log, exp, sqrt
from random import random, normalvariate
from copy import copy

verbose = False

class Coconut(object):
    '''This object models an individual plant.'''
    def __init__(self, field, year, plus_n, plus_p, mass):
        self.field = field
        self.year = year
        self.plus_n = plus_n
        self.plus_p = plus_p
        self.mass = mass

    def __repr__(self):
        attributes_str = ", ".join([i + "=" + str(getattr(self, i)) for i in ["field", "year", "plus_n", "plus_p", "mass"]])
        return "Coconut(" + attributes_str + ")"

    def __str__(self):
        return repr(self)

class Dataset(object):
    pass

class SameFieldTreatmentYearGroup(list):
    '''This is basically a list of Coconut objects designed to hold
    individuals from the same field, treatment, and year. The length of this
    list will be determined by the number of individuals in that
    treatment x field x year combination.

    By putting individuals into "homogeneous" groups like this we can
    quickly sum effects by multiplication (num*effect) rather than walking
    over every individual in the group.
    '''
    def __init__(self):
        self.num = 0
        self.sum_mass = 0.0
        self.treatment_variable = []
        self.field_variable = None
        self.year_variable = None

    def calc_sum_treatment(self):
        '''Returns the sum of the fertilization treatment effects across the group'''
        if not self.treatment_variable:
            return 0.0
        return self.num * sum([i.value for i in self.treatment_variable])

    def calc_sum_field(self):
        '''Returns the sum of the field effects across the group'''
        return self.num * self.field_variable.value

    def calc_sum_year(self):
        '''Returns the sum of the year effects across the group'''
        return self.num * self.year_variable.value

    def calc_sum_field_treatment(self):
        '''Returns the sum of the field and treatment effects across the group'''
        return self.calc_sum_treatment() + self.calc_sum_field()

    def calc_sum_year_treatment(self):
        '''Returns the sum of the year and treatment effects across the group'''
        return self.calc_sum_treatment() + self.calc_sum_year()

    def calc_sum_field_year(self):
        '''Returns the sum of the field and year effects across the group'''
        return self.calc_sum_field() + self.calc_sum_year()

    def calc_sum_effects(self):
        '''Returns the sum of the treatment, field, and year effects across the group'''
        return self.calc_sum_treatment() + self.calc_sum_field() + self.calc_sum_year()
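# The sketch below is a small, self-contained illustration of the claim in the
# docstring above: the num*effect shortcut in calc_sum_effects() gives the same
# answer as walking over every individual. It is never called by the template
# (delete it freely), and all of the names and values in it are hypothetical.
def _example_group_effect_sum():
    group = SameFieldTreatmentYearGroup()
    for mass in [2.0, 2.5, 3.0]:
        group.append(Coconut(field=1, year=1990, plus_n=1, plus_p=0, mass=mass))
    group.num = len(group)
    group.sum_mass = sum([c.mass for c in group])
    # LatentVariable is defined further down in this file; that is fine because
    # the name is only looked up if this function is actually called.
    group.year_variable = LatentVariable(name="year_1990", value=0.25)
    group.field_variable = LatentVariable(name="field_1", value=-0.1)
    group.treatment_variable = [LatentVariable(name="field_1_nitro", value=0.5)]
    # the num*effect shortcut...
    fast = group.calc_sum_effects()
    # ...agrees with summing the per-individual effect over every individual:
    per_individual = 0.25 - 0.1 + 0.5
    slow = sum([per_individual for c in group])
    assert abs(fast - slow) < 1e-9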
class SharedEffectGroupList(list):
    '''This list will hold SameFieldTreatmentYearGroup objects.

    The intent is that you can put SameFieldTreatmentYearGroup objects that
    share an effect into this list. Then when you need to update that effect
    it will be easy to do calculations only over the groups of individuals
    that "use" that effect. This saves wasted calculations.

    Note that the num attribute and the sum_mass attribute depend on the
    data and not the parameters, so they only have to be updated when the
    data is read.

    It would be possible to make a more efficient implementation by
    "caching" previous sums of effects and recalculating only those parts of
    the sum that have changed rather than calling
    calculate_sum_over_effects(), but calculate_sum_over_effects() will work
    fine for this exercise.
    '''
    def __init__(self):
        self.num = 0
        self.sum_mass = 0.0

    def recalculate_data_sums(self):
        n = 0
        mass = 0.0
        for element in self:
            n = n + element.num
            mass = mass + element.sum_mass
        self.num = n
        self.sum_mass = mass

    def recalculate_all_sums(self):
        self.recalculate_data_sums()
        return self.calculate_sum_over_effects()

    def calculate_sum_over_effects(self):
        ft = 0.0
        for group in self:
            ft = ft + group.calc_sum_effects()
        return ft

################################################################################
# The next few classes bundle together the information that the MCMC code
# needs about each parameter (or latent variable).
################################################################################
class Parameter(object):
    '''This class will keep information about each parameter bundled
    together for easy reference.
    '''
    def __init__(self, name='', value=0.0, prior=None, proposal_window=None, min_bound=None, max_bound=None):
        self.name = name
        self.value = value
        self.prior = prior
        self.proposal_window = proposal_window
        self.min_bound = min_bound if min_bound is not None else float('-inf')
        self.max_bound = max_bound if max_bound is not None else float('inf')

class LatentVariable(Parameter):
    '''For the purposes of MCMC a latent variable acts just like a parameter.
    So here we create a class for LatentVariable objects, but the word "pass"
    tells python that we have nothing to add. The "LatentVariable(Parameter)"
    syntax means that a LatentVariable object can be treated like a Parameter
    object. In computing jargon we would say that LatentVariable is derived
    from Parameter.
    '''
    pass

class ParamIndex:
    '''This "class" is just a mechanism for associating a parameter name
    with an index. It is useful because during the MCMC run you'll need to
    update each parameter (in other programming languages this is done with
    a facility called an "enum").

    The parameters can be stored in a global list of parameters (called
    global_parameter_list in this code), but it can be hard to remember what
    index into that list will give you the variance_e parameter, for
    instance. By writing
        var_e_param = global_parameter_list[ParamIndex.VAR_E]
    you can use ParamIndex.VAR_E to represent the number.

    You have to keep the argument to "range" equal to the number of
    parameter names that you want to register. And you have to keep the
    order of the parameter names identical to the order of the parameter
    objects in the global_parameter_list.

    You should not use this class to register names for the latent variables
    (the number of those depends on the dataset size, so you can't write the
    code in a generic way).
    '''
    SOME_PARAM_NAME, ANOTHER_PARAM_NAME, VAR_E = range(3)
################################################################################
# The following four classes encapsulate probability distributions that can
# be used as priors in MCMC calculations.
# To create an object that corresponds to a specific instance from one of these
# classes of distributions use syntax like this:
#
#   x = NormalDistribution(mean=1.0, variance=1.0)
#   y = ExponentialDistribution(mean=1.0)
#   z = GammaDistribution(mean=2.2, variance=4.0)
#
# Note that the mean and variance arguments can be Parameter objects or
# LatentVariable objects, if these properties of the distribution can
# change. So
#
#   m = Parameter(value=5)
#   y = NormalDistribution(mean=m, variance=2)
#   # the mean of y is now 5
#   m.value = 4
#   # the mean of y is now 4
#
################################################################################
class ContinuousDistribution(object):
    '''The continuous distribution class just has some helpers that make it
    easier to write code when the key qualities of a distribution (mean and
    variance) could either be specified by fixed numbers or Parameter
    objects. These methods are used in the classes derived from
    ContinuousDistribution.
    '''
    def get_mean(self):
        try:
            return self.mean.value
        except:
            return self.mean

    def get_variance(self):
        try:
            return self.var.value
        except:
            if self.var is None:
                try:
                    sd = self.sd.value
                except:
                    sd = self.sd
                return sd*sd
            else:
                return self.var

class GammaDistribution(ContinuousDistribution):
    def __init__(self, mean, variance):
        '''
        mean = shape*scale
        var  = shape*scale*scale
        '''
        self.var = variance
        self.mean = mean

    def get_scale_shape(self):
        '''Returns the scale and shape parameters of the Gamma distribution
        (calculated from the mean and variance).
        '''
        mean = self.get_mean()
        variance = self.get_variance()
        scale = variance/mean
        shape = mean/scale
        return scale, shape

    def calc_ln_prob_ratio(self, numerator_variate, denominator_variate):
        '''Returns the log of the ratio of the probability density evaluated
        at `numerator_variate` over the density evaluated at the
        `denominator_variate`.
            f(x) = x^{shape-1}e^{-x/scale}/[some constant based on the Gamma function]
        So:
            ln[f(x)] = (shape-1)ln(x) - x/scale - ln[some constant based on the Gamma function]
        So:
            ln[f(numerator)] - ln[f(denominator)] =
                (shape-1)(ln[numerator] - ln[denominator]) + (denominator - numerator)/scale
        '''
        scale, shape = self.get_scale_shape()
        ln_numerator = (shape-1)*log(numerator_variate) - numerator_variate/scale
        ln_denominator = (shape-1)*log(denominator_variate) - denominator_variate/scale
        return ln_numerator - ln_denominator

class ExponentialDistribution(ContinuousDistribution):
    def __init__(self, mean):
        assert(mean > 0.0)
        self.mean = mean

    def calc_ln_prob_ratio(self, numerator_variate, denominator_variate):
        '''Returns the log of the ratio of the probability density evaluated
        at `numerator_variate` over the density evaluated at the
        `denominator_variate`.
            f(x) = (mean^-1) e^(-x/mean)
        So:
            ln[f(x)] = -x/mean - ln[mean]
        So:
            ln[f(numerator)] - ln[f(denominator)] = (denominator - numerator)/mean
        '''
        return (denominator_variate - numerator_variate)/self.get_mean()

class NormalDistribution(ContinuousDistribution):
    def __init__(self, mean, sd=None, variance=None):
        '''Takes the mean and either the standard deviation or the variance.'''
        self.mean = mean
        self.sd = sd
        self.var = variance

    def calc_ln_prob_ratio(self, numerator_variate, denominator_variate):
        '''Returns the log of the ratio of the probability density evaluated
        at `numerator_variate` over the density evaluated at the
        `denominator_variate`.
            f(x) = (2 pi sd^2)^-0.5 e^(-(x-mean)^2/(2sd^2))
        So:
            ln[f(x)] = -0.5 ln[2 pi sd^2] - (x-mean)^2/(2sd^2)
        So:
            ln[f(numerator)] - ln[f(denominator)]
                = ((denominator - mean)^2 - (numerator - mean)^2)/(2sd^2)
                = (denominator^2 - numerator^2 + 2*(numerator - denominator)*mean)/(2sd^2)
        '''
        mean = self.get_mean()
        variance = self.get_variance()
        d_sq = denominator_variate*denominator_variate
        n_sq = numerator_variate*numerator_variate
        third_term = 2*mean*(numerator_variate - denominator_variate)
        return (d_sq - n_sq + third_term)/(2*variance)
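# Another optional, never-called sketch (hypothetical values): the shortcut in
# NormalDistribution.calc_ln_prob_ratio should agree with the difference of the
# full log densities, because the normalizing constant cancels in the ratio.
def _example_check_normal_ratio():
    from math import pi
    dist = NormalDistribution(mean=1.0, sd=2.0)
    def full_ln_pdf(x):
        # ln of the normal density with mean 1.0 and variance 4.0
        return -0.5*log(2*pi*4.0) - (x - 1.0)**2/(2*4.0)
    fast = dist.calc_ln_prob_ratio(0.3, 2.0)
    slow = full_ln_pdf(0.3) - full_ln_pdf(2.0)
    assert abs(fast - slow) < 1e-9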
################################################################################
# Here we create the parameter objects. Our parameters are associated
# with a prior and a proposal, in addition to having a value, a name, and
# possibly minimum or maximum bounds. Thus, it is easier to bundle
# all of that info in one class (see Parameter above).
################################################################################
global_parameter_list = [
    # YOU WILL NEED TO CREATE YOUR PARAMETERS (BUT NOT LATENT VARIABLES) HERE.
    # THE SYNTAX IS LIKE THIS:
    Parameter(name='some_descriptive_name_here',
              value=10.0, # or whatever value
              prior=ExponentialDistribution(mean=-5), # USE A REAL PRIOR HERE, -5 is not a valid mean
              proposal_window=1.0, # this controls how drastic the proposals are.
              min_bound=None, # set to 0.0 if the parameter cannot be negative
              max_bound=None), # put a number as the value if the parameter has a maximum
    Parameter(name='next_parameter_', ...),
    # REMEMBER TO KEEP THE ORDER OF THE PARAMETERS IDENTICAL TO THE ORDER OF NAMES
    # IN ParamIndex, IF YOU ARE GOING TO USE ParamIndex IN YOUR CODE
]

def propose_parameter(old_p, param_obj):
    '''Sliding window move which uses the param_obj.proposal_window to
    determine the window around old_p to use. The move also keeps the value
    within the bounds on param_obj.

    The proposed value is drawn from:
        U(old_p - window_width/2, old_p + window_width/2)
    Out-of-bounds proposals are reflected back into the legal range.

    Returns the proposed value and the log of the Hastings ratio.
    '''
    # draw a U(-0.5, 0.5) variable
    u = random() - 0.5
    window_width = param_obj.proposal_window
    offset = u*window_width
    new_p = old_p + offset
    min_b = param_obj.min_bound
    max_b = param_obj.max_bound
    while new_p < min_b or new_p > max_b:
        if new_p < min_b:
            diff = min_b - new_p
            new_p = diff + min_b
        if new_p > max_b:
            diff = new_p - max_b
            new_p = max_b - diff
    # this is a symmetric proposal, so the ln_hastings_ratio is always 0.0
    return new_p, 0.0

def metrop_hastings(ln_like_ratio, ln_prior_ratio, ln_hastings_ratio):
    '''Uses the Metropolis-Hastings algorithm to decide whether the move
    should be accepted or not. Returns True to indicate acceptance
    (False otherwise).
    '''
    ln_acceptance = ln_like_ratio + ln_prior_ratio + ln_hastings_ratio
    if ln_acceptance > 0.0:
        return True
    if log(random()) < ln_acceptance:
        return True
    return False

def update_param_var_e(relevant_slice_of_data, curr_param_list):
    '''This is just an example (which is not valid) to demonstrate syntax.
    You'll need some function like this for every parameter. do_mcmc is
    written to assume that the function will return the number of updates
    that are accepted.
    '''
    param_obj = curr_param_list[ParamIndex.VAR_E]
    current_value = param_obj.value
    proposed_value, ln_hastings_ratio = propose_parameter(current_value, param_obj)
    ln_prior_ratio = param_obj.prior.calc_ln_prob_ratio(proposed_value, current_value)
    ln_like_ratio = YOU_WILL_HAVE_TO_USE_THE_DATA_AND_PARAMETER_VALUES_TO_CALCULATE_THE_LN_OF_THE_LIKELIHOOD_RATIO
    if metrop_hastings(ln_like_ratio, ln_prior_ratio, ln_hastings_ratio):
        param_obj.value = proposed_value
        return 1
    return 0
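# A hedged sketch of the kind of calculation the ALL_CAPS placeholder above
# stands in for, ASSUMING the model is mass_i ~ Normal(sum of effects for i,
# var_e). Under that assumption, changing the variance from v_old to v_new
# changes the log likelihood over N individuals by
#     -0.5*N*ln(v_new/v_old) - SS*(1/(2*v_new) - 1/(2*v_old))
# where SS is the sum of squared residuals. The function name and arguments
# are hypothetical; this walks every individual (via the homogeneous groups),
# so it shows the math rather than the most efficient bookkeeping.
def _example_ln_like_ratio_var_e(homogeneous_groups, v_old, v_new):
    n = 0
    ss = 0.0
    for group in homogeneous_groups:
        if group.num == 0:
            continue
        # each individual's expected mass is the sum of the group's effect values
        mu = group.calc_sum_effects() / group.num
        for coconut in group:
            dev = coconut.mass - mu
            ss += dev * dev
        n += group.num
    return -0.5 * n * log(v_new / v_old) - ss * (0.5 / v_new - 0.5 / v_old)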
def update_latent_effects_for_nitrogen(relevant_groups_of_data, curr_param_list, latent_effects):
    '''It is easiest to walk through the list of each type of latent effect,
    so this function receives the list of latent_effects (a list of
    LatentVariable objects) and should be sent a corresponding list of the
    relevant data slices.

    The idea is that for every latent variable, you can get a
    SharedEffectGroupList that will contain all of the
    SameFieldTreatmentYearGroup items that "use" this latent effect. So if
    you base the likelihood calculations on just that SharedEffectGroupList,
    then you will not be examining data that is irrelevant (you won't be
    calculating terms that would cancel out in the likelihood ratio
    calculation).
    '''
    # the following assert assures that you have the right number of
    # SharedEffectGroupList objects (one for each latent_effect).
    assert(len(relevant_groups_of_data) == len(latent_effects))
    var_error = curr_param_list[ParamIndex.VAR_E]
    num_accepted = 0
    for effect_index, data_for_effect in enumerate(relevant_groups_of_data):
        latent_variable = latent_effects[effect_index]
        current_value = latent_variable.value
        # We need to make sure that the sums over the other parameter/latent
        # variables are up to date before we use them...
        # There are more efficient ways to do this (namely by only updating
        # the parts of the sum that change), but it won't make a huge
        # difference for this application.
        # Here we'll use a trick. We'll set the effect to 0, sum the effects
        # for each group of plants, and then restore the effect to its
        # previous value. This lets us get the sum of all of the other
        # effects (and they will be included the appropriate number of times).
        latent_variable.value = 0.0
        sum_other_effects = data_for_effect.calculate_sum_over_effects()
        latent_variable.value = current_value
        # At this point sum_other_effects will hold the summation of all of
        # the effects other than `latent_variable` over all of the
        # individuals that latent_variable applies to. If you did your
        # calculation of the ln_likelihood_ratio in step 2.5 of the homework
        # correctly (hint, hint), then at least some updates can be
        # simplified by summing over effects. This allows us to avoid
        # iterating over every individual.
        proposed_value, ln_hastings_ratio = propose_parameter(current_value, latent_variable)
        # num will be the number of relevant individuals...
        num = data_for_effect.num
        # sum_mass will be the sum of the mass for all relevant individuals...
        sum_mass = data_for_effect.sum_mass
        ln_like_ratio = SOMETHING_GOES_HERE
        ln_prior_ratio = SOMETHING_ELSE_GOES_HERE
        if metrop_hastings(ln_like_ratio, ln_prior_ratio, ln_hastings_ratio):
            num_accepted += 1
            latent_variable.value = proposed_value
    return num_accepted
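# Under the same hypothetical Normal(sum-of-effects, var_e) model as the sketch
# above, the num/sum_mass/sum_other_effects bookkeeping really does collapse
# the log-likelihood ratio for a shared effect into a closed form: expand the
# squared residuals for theta and theta_new, and every term not involving the
# effect cancels, leaving
#     [2*(sum_mass - sum_other_effects)*(theta_new - theta)
#        + num*(theta^2 - theta_new^2)] / (2*var_e)
# This helper is illustrative only (derive your own version for step 2.5); all
# argument names are hypothetical stand-ins for the locals in the function above.
def _example_ln_like_ratio_shared_effect(num, sum_mass, sum_other_effects, theta, theta_new, var_e):
    resid_sum = sum_mass - sum_other_effects
    return (2.0 * resid_sum * (theta_new - theta)
            + num * (theta * theta - theta_new * theta_new)) / (2.0 * var_e)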
def write_sampled_parameters(param_output_stream, iteration, all_variables_list):
    # We convert every parameter or latent variable value to a string of characters to be written
    params_as_str_list = [str(i.value) for i in all_variables_list]
    # We introduce tabs between each element
    params_tab_separated = '\t'.join(params_as_str_list)
    # We write a line to the parameter output stream.
    param_output_stream.write(str(iteration) + '\t' + params_tab_separated + '\n')

def do_mcmc(data_and_latent_variables, num_iterations, param_output_stream, sample_freq):
    data = data_and_latent_variables[0]
    # These are some useful "slices" of the data (see process_data)
    by_homogenous_group = data.by_same_year_field_treatment
    all_individuals = data.individuals
    by_year = data.by_year
    by_field = data.by_field
    by_field_nitro = data.by_field_nitro
    by_field_phosph = data.by_field_phosph
    by_field_both = data.by_field_both
    # Each of the following "lv_list"s holds LatentVariable objects that will
    # need to be updated during MCMC (just like the parameters need to be updated)
    year_lv_list = data_and_latent_variables[1]
    field_lv_list = data_and_latent_variables[2]
    field_nitro_lv_list = data_and_latent_variables[3]
    field_phospho_lv_list = data_and_latent_variables[4]
    field_both_lv_list = data_and_latent_variables[5]
    num_accepted = 0
    # make a huge list of all of the parameters and latent variables (this will make it
    # easier to print out the state at each iteration of the MCMC)
    all_variables_list = global_parameter_list + year_lv_list + field_lv_list + field_nitro_lv_list + field_phospho_lv_list + field_both_lv_list
    for iteration in xrange(num_iterations):
        accepted = update_param_var_e(SLICE_OF_DATA_OR_OTHER_INFO_YOU_NEED_TO_UPDATE_THIS_PARAMETER, global_parameter_list)
        num_accepted += accepted
        accepted = update_param_SOME_OTHER_PARAMETER(DATA_SLICE_YOU_NEED_TO_UPDATE_THIS_PARAMETER, global_parameter_list)
        num_accepted += accepted
        accepted = update_latent_effects_for_nitrogen(RELEVANT_SLICE_OF_DATA, global_parameter_list, field_nitro_lv_list)
        num_accepted += accepted
        accepted = update_SOME_OTHER_latent_effects(RELEVANT_SLICE_OF_DATA, global_parameter_list, THE_LIST_OF_LATENT_VARIABLES)
        num_accepted += accepted
        # YOU WILL HAVE TO WRITE UPDATE FUNCTIONS FOR ALL YOUR PARAMETERS AND LATENT VARIABLES
        # the funky % syntax means that we will only call write_sampled_parameters
        # if iteration is a multiple of sample_freq
        if iteration % sample_freq == 0:
            print("iter=" + str(iteration))
            write_sampled_parameters(param_output_stream, iteration, all_variables_list)
    return num_accepted
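# Optional bookkeeping sketch (hypothetical; do_mcmc does not call it):
# proposal windows are usually tuned so that a healthy fraction of proposals
# is accepted, and that is much easier if you track acceptances per updater
# rather than only the grand total that do_mcmc returns.
def _example_report_acceptance(name, num_accepted, num_tried, output_stream):
    rate = float(num_accepted) / max(1, num_tried)
    output_stream.write(name + ' acceptance rate = ' + str(rate) + '\n')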
def process_data(no_fert_by_yf, with_nitrogen_by_yf, with_phosphorus_by_yf, with_both_by_yf, all_years, all_fields):
    '''This function will create the LatentVariable instances, and it will
    also create groupings of the Coconut objects that will make it easy to
    traverse the entire data set in a variety of ways. You can think of
    these groupings as different slices of the full matrix of data.

    The function returns:
        0. the data set;
        1. a list of "year effect" latent variables;
        2. a list of "field effect" latent variables;
        3. a list of "field x nitrogen effect" latent variables;
        4. a list of "field x phosphorus effect" latent variables;
        5. a list of "field x both fertilizers effect" latent variables.

    The data set object has the following attributes:
        by_same_year_field_treatment
            a list containing all the data grouped into
            SameFieldTreatmentYearGroup objects (each individual in a
            SameFieldTreatmentYearGroup shares the same year, field, and
            treatment)
        individuals (a list of all Coconut objects)
        by_year (a list of SharedEffectGroupList objects)
        by_field (a list of SharedEffectGroupList objects)
        by_field_nitro (a list of SharedEffectGroupList objects)
        by_field_phosph (a list of SharedEffectGroupList objects)
        by_field_both (a list of SharedEffectGroupList objects)

    The arguments are dictionaries that contain all of the individuals.
    There are four dictionaries (one for each treatment), and also a set of
    all years and a set of field numbers. Python dictionaries store
    associations between arbitrary keys and objects. They are convenient
    when you don't know how many items you'll be dealing with. You won't
    need to understand how dictionaries work to complete the homework. You
    can just take my word that loops over things like sorted_years and
    sorted_fields are indeed letting us iterate over the data for those
    fields and years.
    '''
    sorted_years = list(all_years)
    sorted_years.sort()
    sorted_fields = list(all_fields)
    sorted_fields.sort()
    # create latent variables for each year
    year_lv_list = []
    for year in sorted_years:
        # here, we create a latent variable for each year...
        # We create a new LatentVariable in the same way that we create
        # a Parameter object, so refer to the notes above (near the
        # global_parameter_list creation) for further info.
        year_lv = LatentVariable(name="year_" + str(year),
                                 value=SOMETHING,
                                 prior=SOMETHING,
                                 proposal_window=SOMETHING)
        year_lv_list.append(year_lv)
    field_lv_list = []
    field_nitro_lv_list = []
    field_phospho_lv_list = []
    field_both_lv_list = []
    for field in sorted_fields:
        # here we create latent variables for each field and treatment
        # combination and store them in the proper list...
        field_lv = LatentVariable(name="field_" + str(field), SOME_STUFF_HERE)
        field_lv_list.append(field_lv)
        nitro_lv = LatentVariable(name="field_" + str(field) + "_nitro", SOME_STUFF_HERE)
        field_nitro_lv_list.append(nitro_lv)
        phospho_lv = LatentVariable(name="field_" + str(field) + "_phospho", SOME_STUFF_HERE)
        field_phospho_lv_list.append(phospho_lv)
        both_lv = LatentVariable(name="field_" + str(field) + "_both", SOME_STUFF_HERE)
        field_both_lv_list.append(both_lv)
    # it is possible that you won't need to modify the rest of this function,
    # but you may. It depends on the parameterization that you chose.
    # Check out the comments below.
    data = Dataset()
    data.by_same_year_field_treatment = []
    data.individuals = []
    data.by_year = []
    data.by_field = []
    data.by_field_nitro = []
    data.by_field_phosph = []
    data.by_field_both = []
    for year_index, year in enumerate(sorted_years):
        # same_year_group will be a "slice" of the data that was collected
        # for this year only. This will be useful when we want to
        # update the latent variable associated with this year (because
        # the data contained in same_year_group will be the only bits of
        # data that are relevant to that latent variable).
        same_year_group = SharedEffectGroupList()
        # year_lv is the LatentVariable for this year.
        year_lv = year_lv_list[year_index]
        for field_index, field in enumerate(sorted_fields):
            # field_lv is the LatentVariable for this field.
            field_lv = field_lv_list[field_index]
            key = (year, field)
            # The next few lines get a SameFieldTreatmentYearGroup for each
            # of the four treatments for this year x field combination, and
            # perform some basic initialization of those objects (such as
            # counting the number of individuals and summing the masses
            # within a SameFieldTreatmentYearGroup object).
            no_fertilizer = no_fert_by_yf.setdefault(key, SameFieldTreatmentYearGroup())
            no_fertilizer.num = len(no_fertilizer)
            no_fertilizer.sum_mass = sum([el.mass for el in no_fertilizer])
            nitro = with_nitrogen_by_yf.setdefault(key, SameFieldTreatmentYearGroup())
            nitro.num = len(nitro)
            nitro.sum_mass = sum([el.mass for el in nitro])
            phospho = with_phosphorus_by_yf.setdefault(key, SameFieldTreatmentYearGroup())
            phospho.num = len(phospho)
            phospho.sum_mass = sum([el.mass for el in phospho])
            both = with_both_by_yf.setdefault(key, SameFieldTreatmentYearGroup())
            both.num = len(both)
            both.sum_mass = sum([el.mass for el in both])
            # This is where we associate the groups of objects that are exposed
            # to the same conditions (the SameFieldTreatmentYearGroup lists of
            # Coconut objects) with the latent variables that are thought to
            # influence them. The SameFieldTreatmentYearGroup class uses:
            #   year_variable       the latent variable for this year
            #   field_variable      the latent variable for this field
            #   treatment_variable  list of latent variables pertaining to fertilization
            # to allow it to sum over the effects for the group.
            # So we have to "tell" each object what its latent variables are.
            #
            # Depending on what parameterization you use for latent variables
            # you may need to modify the following lines.
            # First we get the treatment variables for this field.
            nitro_treatment = field_nitro_lv_list[field_index]
            phospho_treatment = field_phospho_lv_list[field_index]
            interaction_effect = field_both_lv_list[field_index]
            # Now for each SameFieldTreatmentYearGroup we set the appropriate
            # attributes so that it "knows" which latent variables apply to it.
            no_fertilizer.treatment_variable = []
            no_fertilizer.field_variable = field_lv
            no_fertilizer.year_variable = year_lv
            nitro.treatment_variable = [nitro_treatment]
            nitro.field_variable = field_lv
            nitro.year_variable = year_lv
            phospho.treatment_variable = [phospho_treatment]
            phospho.field_variable = field_lv
            phospho.year_variable = year_lv
            both.treatment_variable = [nitro_treatment, phospho_treatment, interaction_effect]
            both.field_variable = field_lv
            both.year_variable = year_lv
            # Finally we store the individuals and the SameFieldTreatmentYearGroup
            # in the appropriate slots in our `data` object.
            # Every individual should go in data.individuals, so we extend that list...
            data.individuals.extend(no_fertilizer)
            data.individuals.extend(nitro)
            data.individuals.extend(phospho)
            data.individuals.extend(both)
            # data.by_same_year_field_treatment is a list, but it holds
            # SameFieldTreatmentYearGroup objects rather than the individual
            # Coconut objects. So we just add each of the SameFieldTreatmentYearGroup
            # objects to the end of the by_same_year_field_treatment slice.
            data.by_same_year_field_treatment.append(no_fertilizer)
            data.by_same_year_field_treatment.append(nitro)
            data.by_same_year_field_treatment.append(phospho)
            data.by_same_year_field_treatment.append(both)
            # same_year_group is also a list of SameFieldTreatmentYearGroup objects,
            # but it only pertains to this year.
            same_year_group.append(no_fertilizer)
            same_year_group.append(nitro)
            same_year_group.append(phospho)
            same_year_group.append(both)
        # After we have considered every field in this year, we add all of the
        # accumulated data to the by_year group. recalculate_all_sums()
        # calculates the sum of the mass and other quantities for
        # the same_year_group.
        same_year_group.recalculate_all_sums()
        data.by_year.append(same_year_group)
    # This loop creates the slices that correspond to different treatments.
    # See notes below.
    for field_index, field in enumerate(sorted_fields):
        same_field_group = SharedEffectGroupList()
        same_field_nitro_group = SharedEffectGroupList()
        same_field_phospho_group = SharedEffectGroupList()
        same_field_both_group = SharedEffectGroupList()
        for year_index, year in enumerate(sorted_years):
            key = (year, field)
            # the following four lines get the SameFieldTreatmentYearGroup
            # objects for each of the four treatments for this field x year combination.
            no_fertilizer = no_fert_by_yf[key]
            nitro = with_nitrogen_by_yf[key]
            phospho = with_phosphorus_by_yf[key]
            both = with_both_by_yf[key]
            # In one possible parameterization you have:
            #   - a field effect that applies to all four treatments
            #   - a nitrogen effect that applies to two treatments
            #   - a phosphorus effect that applies to two treatments
            #   - an interaction effect that applies to only one
            # The following lines add the groups of data to the relevant
            # lists assuming this parameterization. If you are using an
            # alternative, then you'll have to change which
            # lists a particular SameFieldTreatmentYearGroup
            # gets added to:
            same_field_group.append(no_fertilizer)
            same_field_group.append(nitro)
            same_field_group.append(phospho)
            same_field_group.append(both)
            same_field_nitro_group.append(nitro)
            same_field_nitro_group.append(both)
            same_field_phospho_group.append(phospho)
            same_field_phospho_group.append(both)
            same_field_both_group.append(both)
        # Now that we have the data in groups we pre-calculate things, and add
        # the groups to the Dataset object.
        same_field_group.recalculate_all_sums()
        data.by_field.append(same_field_group)
        same_field_nitro_group.recalculate_all_sums()
        data.by_field_nitro.append(same_field_nitro_group)
        same_field_phospho_group.recalculate_all_sums()
        data.by_field_phosph.append(same_field_phospho_group)
        same_field_both_group.recalculate_all_sums()
        data.by_field_both.append(same_field_both_group)
    # now we can do some "sanity checks" to make sure that we didn't miss anything.
    # we will "assert" some statements that will be true (if we don't have any
    # bugs).
    data.num = len(data.individuals)
    data.sum_mass = sum([i.mass for i in data.individuals])
    assert(data.num == sum([i.num for i in data.by_year]))
    assert(data.num == sum([i.num for i in data.by_field]))
    assert(abs(data.sum_mass - sum([i.sum_mass for i in data.by_year])) < 1e-5)
    assert(abs(data.sum_mass - sum([i.sum_mass for i in data.by_field])) < 1e-5)
    return data, year_lv_list, field_lv_list, field_nitro_lv_list, field_phospho_lv_list, field_both_lv_list
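# A hedged consistency check you could run after filling in the SOMETHING
# placeholders above (hypothetical helper; nothing calls it). Under the
# parameterization described in the comments, the nitrogen slice for a field
# should cover exactly the individuals that received nitrogen in that field.
def _example_check_nitro_slice(data, sorted_fields):
    for field_index, field in enumerate(sorted_fields):
        nitro_slice = data.by_field_nitro[field_index]
        expected = len([c for c in data.individuals
                        if c.field == field and c.plus_n == 1])
        assert nitro_slice.num == expected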
    '''
    import os
    import csv
    import itertools
    if not os.path.exists(filepath):
        raise ValueError('The file "' + filepath + '" does not exist')
    # Here we create a csv reader and tell it that a tab (\t) is the column delimiter
    entries = csv.reader(open(filepath, 'rbU'), delimiter='\t')
    # Here we check that the file has the headers that we expect
    first_row = entries.next()
    expected_headers = ['year', 'field', 'nitrogen', 'phosphorus', 'individual', 'mass']
    for got, expected in itertools.izip(first_row, expected_headers):
        if got.lower().strip() != expected:
            raise ValueError('Error reading "' + filepath + '": expecting a column labelled "' + expected + '", but found "' + got + '" instead.')
    with_nitrogen_by_yf = {}
    with_phosphorus_by_yf = {}
    with_both_by_yf = {}
    no_fert_by_yf = {}
    all_years = set()
    all_fields = set()
    for n, row in enumerate(entries):
        year, field, nitrogen, phosphorus, individual, mass = row
        try:
            year = int(year)
        except:
            raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting an integer for year, but got ' + str(year))
        try:
            field = int(field)
            assert(field >= 0)
        except:
            raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting an integer for field, but got ' + str(field))
        try:
            nitrogen = int(nitrogen)
            assert(nitrogen == 0 or nitrogen == 1)
        except:
            raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting a 0 or 1 for nitrogen, but got ' + str(nitrogen))
        try:
            phosphorus = int(phosphorus)
            assert(phosphorus == 0 or phosphorus == 1)
        except:
            raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting a 0 or 1 for phosphorus, but got ' + str(phosphorus))
        try:
            mass = float(mass)
            assert(mass >= 0.0)
        except:
            raise ValueError("Error reading data row " + str(1 + n) + ' of "' + filepath + '": expecting a positive number for mass, but got ' + str(mass))
        all_years.add(year)
        all_fields.add(field)
        coconut = Coconut(field=field, year=year, plus_n=nitrogen, plus_p=phosphorus, mass=mass)
        # Get the group with the same year, field, and treatment
        if nitrogen == 1:
            if phosphorus == 1:
                coconut.same_yft_group = with_both_by_yf.setdefault((year, field), SameFieldTreatmentYearGroup())
            else:
                coconut.same_yft_group = with_nitrogen_by_yf.setdefault((year, field), SameFieldTreatmentYearGroup())
        else:
            if phosphorus == 1:
                coconut.same_yft_group = with_phosphorus_by_yf.setdefault((year, field), SameFieldTreatmentYearGroup())
            else:
                coconut.same_yft_group = no_fert_by_yf.setdefault((year, field), SameFieldTreatmentYearGroup())
        coconut.same_yft_group.append(coconut)
    return process_data(no_fert_by_yf, with_nitrogen_by_yf, with_phosphorus_by_yf, with_both_by_yf, all_years, all_fields)

def print_help():
    num_args_expected = 3 + len(global_parameter_list)
    output_stream = sys.stdout
    output_stream.write('Expecting ' + str(num_args_expected) + ''' arguments.

The first argument should be the path (filename) for the data file (which
should be a csv file with tab as the column separator).
The intervening arguments should be starting values for each of the
parameters.
The second-to-last argument should be the number of MCMC iterations.
The last argument should be the number of MCMC iterations between samples
of the chain (the periodicity for thinning).

The order of the parameters is:
''')
    for p in global_parameter_list:
        output_stream.write(p.name + ' ')
    output_stream.write('\n\n')
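# A hypothetical invocation, just to make the argument order concrete (the
# file name, starting values, and counts below are made up):
#
#   python this_script.py coconuts.txt 1.0 0.5 2.0 50000 100
#
# i.e. the data file, then one starting value per parameter in
# global_parameter_list (in ParamIndex order), then the number of MCMC
# iterations, then the sampling (thinning) frequency.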
if __name__ == '__main__':
    # user-interface and sanity checking...
    # we import sys
    import sys
    try:
        filepath = sys.argv[1]
        arguments = sys.argv[2:]
        if len(arguments) < 2 + len(global_parameter_list):
            print len(arguments)
            print_help()
            sys.exit(1)
    except Exception, e:
        print 'Error:', e, '\n'
        print_help()
        sys.exit(1)
    # The number of iterations and the sampling frequency are the last two arguments...
    num_iterations = int(arguments[-2])
    sample_freq = int(arguments[-1])
    for n, v in enumerate(arguments[:-2]):
        param = global_parameter_list[n]
        try:
            param.value = float(v)
        except:
            sys.exit('Expecting an initial value for the parameter "' + param.name + '" but got: "' + v + '"')
        if param.value < param.min_bound:
            sys.exit('Expecting the initial value for the parameter "' + param.name + '" to be >= ' + str(param.min_bound) + ' but got "' + v + '"')
        if param.value > param.max_bound:
            sys.exit('Expecting the initial value for the parameter "' + param.name + '" to be <= ' + str(param.max_bound) + ' but got "' + v + '"')
        print "param #", n, param.name, "=", param.value
    data_and_latent_variables = read_data(filepath)
    # the observed Dataset object is the first thing returned...
    obs_data = data_and_latent_variables[0]
    # next come the lists of latent variables.
    year_lv_list = data_and_latent_variables[1]
    field_lv_list = data_and_latent_variables[2]
    field_nitro_lv_list = data_and_latent_variables[3]
    field_phospho_lv_list = data_and_latent_variables[4]
    field_both_lv_list = data_and_latent_variables[5]
    param_output_stream = sys.stderr
    param_output_stream.write("Iteration")
    for p in global_parameter_list:
        param_output_stream.write('\t' + p.name)
    for p in year_lv_list:
        param_output_stream.write('\t' + p.name)
    for p in field_lv_list:
        param_output_stream.write('\t' + p.name)
    for p in field_nitro_lv_list:
        param_output_stream.write('\t' + p.name)
    for p in field_phospho_lv_list:
        param_output_stream.write('\t' + p.name)
    for p in field_both_lv_list:
        param_output_stream.write('\t' + p.name)
    param_output_stream.write('\n')
    num_accepted = do_mcmc(data_and_latent_variables, num_iterations, param_output_stream, sample_freq)
    print "Accepted " + str(num_accepted) + " updates\n"
    print "Ran " + str(num_iterations) + " iterations over all " + str(len(global_parameter_list)) + " parameters.\n"