Note: The writing of this vignette is currently in progress. Please, contact the author for assistance, in case you need to use PCMFit immediately and cannot find your way through the coding examples. Thanks for your understanding.

Simulating parametric bootstrap datasets

# make results reproducible
set.seed(12, kind = "Mersenne-Twister", normal.kind = "Inversion")

numBootstraps <- 10

bestModel <- 
  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel

metaI <- PCMInfo(
  X = NULL, tree = attr(bestModel, "tree"), model = bestModel)

# Simulate data with the best fit model from the mammal data with SEs
valuesBootstrapMGPM_A_F_BC2_RR <- data.table(
  IdGlob = seq_len(numBootstraps),
  X = lapply(seq_len(numBootstraps), function(id) {
    PCMSim(
      tree = attr(bestModel, "tree"), 
      model = bestModel, 
      X0 = bestModel$X0,
      metaI = metaI)
  })
)

options(PCMBase.Value.NA = -1e20)
options(PCMBase.Lmr.mode = 11)
options(PCMBase.Threshold.EV = 1e-7)

valuesBootstrapMGPM_A_F_BC2_RR[
  , c("df", "logLik", "score") := {
    resLists <- lapply(.I, function(i) {
      attr(bestModel, "X") <- X[[i]]
      ll <- logLik(bestModel)
      aic <- AIC(bestModel)
      df <- attr(ll, "df")
      c(df, ll, aic)
    })
    list(df = sapply(resLists, function(.) .[1]),
         logLik = sapply(resLists, function(.) .[2]),
         score = sapply(resLists, function(.) .[3]))
  }]

PCMFitDemoObjects$valuesBootstrapMGPM_A_F_BC2_RR <- valuesBootstrapMGPM_A_F_BC2_RR

Model inference on the parametric bootstrap datasets

To make the following example work, create a directory where several files and execution results should be stored. Here, we call this directory TEST.

Writing a parametric bootstrap inference R-script

  • Copy the file vignettes/DefineParameterLimits.R from the PCMFit R-package source-code to the TEST directory.
  • Save the following code to a file named FitBootstrap_MGPM_A_F_BC2_RR.R in the TEST directory.
# File: FitBootstrap_MGPM_A_F_BC2_RR.R
# Usage: R --vanilla --slave -f ../../FitBootstrap_MGPM_A_F_BC2_RR.R --args 1
library(PCMBase)
library(PCMBaseCpp)
library(PCMFit)
library(data.table)

# extract dataset identifier and possibly other parameters from the command line:
args <- commandArgs(trailingOnly = TRUE)
if(length(args) > 0) {
  data_id <- as.integer(args[1])
} else {
  data_id <- 1L
}

# A character string used in filenames for a model inference on a given data:
prefixFiles = paste0("MGPM_A_F_BC2_RR_BSID_", data_id)

# Creating the cluster for this PCMFit run.
# Uncomment the follwing code only if doMPI package is available and working.
# if(!exists("cluster") || is.null(cluster)) {
#   if(require(doMPI)) {
#     # using MPI cluster as distributed node cluster (possibly running on a
#     # cluster of multiple nodes)
#     # Get the number of cores. Assume this is run in a batch job.
#     p = strtoi(Sys.getenv('LSB_DJOB_NUMPROC'))
#     cluster <- startMPIcluster(count = p-1, verbose = TRUE)
#     doMPI::registerDoMPI(cluster)
#   } else {
#     # possibly running on personal computer without mpi installation
#     cluster <- parallel::makeCluster(
#       parallel::detectCores(logical = TRUE),
#       outfile = paste0("log_", prefixFiles, ".txt"))
#     doParallel::registerDoParallel(cluster)
#   }
#}

# This function is going to be executed on each worker node.
# Hence, it is the convenient place to define global settings, 
# in particular:
# - set global option values, 
# - call the function PCMGenerateModelTypes to generate the default PCM model types;
# - write custom S3 methods for PCMBase::PCMParentClasses and PCMBase::PCMSpecify 
# for custom PCM model types (see corresponding man pages in the PCMBase package), 
# - define S3 methods for the PCMBase::PCMParamUpperLimit and PCMParamLowerLimit in the 
# global environment (see R-code below). 
generatePCMModelsFunction <- function() {
  # make results reproducible but keep in mind that data_id is unique for this specific 
  # worker node, and will be used to generate a unique parametric boostrap set of trait values.
  set.seed(data_id, kind = "Mersenne-Twister", normal.kind = "Inversion")

  PCMGenerateModelTypes()
  
  # An example DefineParameterLimits.R file can be found in 
  # vignettes/DefineParameterLimits.R of the PCMFit package source-code. 
  # Note that the path here is relative to the working directory of
  # the worker node R-process. 
  source('../../DefineParameterLimits.R', local = FALSE)
}

# Inferred best model (
# replace this with the result from a previous call to  RetrieveBestFitScore(fitObject)$inferredModel)
bestModel <- 
  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel
# Inferred tree
tree <- attr(bestModel, "tree")
# Simulate parameteric bootstrap trait values
X <- PCMSim(tree, bestModel, bestModel$X0)[, seq_len(PCMTreeNumTips(tree))]

currentResultFile <- paste0("CurrentResults_fits_", prefixFiles, ".RData")
if(file.exists(currentResultFile)) {
  load(currentResultFile)
  tableFitsPrev <- listResults$tableFits
} else {
  tableFitsPrev <- NULL
}

fitMGPM_A_F_BC2_RR <- PCMFitMixed(
    X = X, tree = tree, metaIFun = PCMInfoCpp,
    generatePCMModelsFun = generatePCMModelsFunction, 
    maxNumRoundRobins = 2, maxNumPartitionsInRoundRobins = 2,
    tableFitsPrev = tableFitsPrev,
    prefixFiles = prefixFiles,
    doParallel = TRUE)

save(fitMGPM_A_F_BC2_RR, file = paste0("Result_", prefixFiles, ".RData"))

Running the parametric bootstrap inference script on a cluster

  • Copy the following unix shell script code to a file FitBootstrap_MGPM_A_F_BC2_RR_bsub.sh in the TEST directory:
# File: FitBootstrap_MGPM_A_F_BC2_RR_bsub.sh
# Usage: sh FitBootstrap_MGPM_A_F_BC2_RR_bsub.sh
for id in `seq 1 1 10`
do
mkdir -p ResultsBootstrap/MGPM_A_F_BC2_RR_BSID_$id
cd ResultsBootstrap/MGPM_A_F_BC2_RR_BSID_$id
if [ -f "Result_MGPM_A_F_BC2_RR_BSID_"$id".RData" ]
then
rm MPI*.log
rm CurrentResults*.RData
else
# The following would execute only if the bsub command is available on your system
# bsub -M 10000 -n 8 -W 3:59 -R ib R --vanilla --slave -f ../../FitBootstrap_MGPM_A_F_BC2_RR.R --args $id
# Should run on unix systems. Comment or remove this line if uncommenting the above.
R --vanilla --slave -f ../../FitBootstrap_MGPM_A_F_BC2_RR.R --args $id
fi
cd ../..
done
  • Run the script by calling the command sh FitBootstrap_MGPM_A_F_BC2_RR_bsub.sh from the command prompt in the TEST directory.

Collect bootstrap results

inferredModel <- 
  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel
inferredTree <- PCMTree(attr(inferredModel, "tree"))

epochsHD <- seq(.2, max(PCMTreeNodeTimes(inferredTree)), 
                by = 2)

bs_MGPM_A_F_BC2_RR <- CollectBootstrapResults(
  resultDir = "ResultsBootstrap",
  prefixFiles = "MGPM_A_F_BC2_RR_BSID_",
  nameFitObject = 'fitMGPM_A_F_BC2_RR',
  ids = seq_len(10),
  inferredTree = inferredTree,
  epochs = epochsHD,
  minLength = 0.2,
  verbose = FALSE)

References