Diffstat (limited to 'MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c')
-rw-r--r--  MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c  1107
1 files changed, 0 insertions, 1107 deletions
diff --git a/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c b/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
deleted file mode 100644
index 5c2263f2..00000000
--- a/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
+++ /dev/null
@@ -1,1107 +0,0 @@
-/// \file
-/// Main program
-///
-/// \mainpage CoMD: A Classical Molecular Dynamics Mini-app
-///
-/// CoMD is a reference implementation of typical classical molecular
-/// dynamics algorithms and workloads. It is created and maintained by
-/// The Exascale Co-Design Center for Materials in Extreme Environments
-/// (ExMatEx). http://codesign.lanl.gov/projects/exmatex. The
-/// code is intended to serve as a vehicle for co-design by allowing
-/// others to extend and/or reimplement it as needed to test performance of
-/// new architectures, programming models, etc.
-///
-/// The current version of CoMD is available from:
-/// http://exmatex.github.io/CoMD
-///
-/// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov.
-///
-/// Table of Contents
-/// =================
-///
-/// Click on the links below to browse the CoMD documentation.
-///
-/// \subpage pg_md_basics
-///
-/// \subpage pg_building_comd
-///
-/// \subpage pg_running_comd
-///
-/// \subpage pg_measuring_performance
-///
-/// \subpage pg_problem_selection_and_scaling
-///
-/// \subpage pg_verifying_correctness
-///
-/// \subpage pg_comd_architecture
-///
-/// \subpage pg_optimization_targets
-///
-/// \subpage pg_whats_new
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <unistd.h>
-#include <assert.h>
-
-#include "CoMDTypes.h"
-#include "decomposition.h"
-#include "linkCells.h"
-#include "eam.h"
-#include "ljForce.h"
-#include "initAtoms.h"
-#include "memUtils.h"
-#include "yamlOutput.h"
-#include "parallel.h"
-#include "performanceTimers.h"
-#include "mycommand.h"
-#include "timestep.h"
-#include "constants.h"
-
-#define REDIRECT_OUTPUT 0
-#define MIN(A,B) ((A) < (B) ? (A) : (B))
-
-static SimFlat* initSimulation(Command cmd);
-static void destroySimulation(SimFlat** ps);
-
-static void initSubsystems(void);
-static void finalizeSubsystems(void);
-
-static BasePotential* initPotential(
- int doeam, const char* potDir, const char* potName, const char* potType);
-static SpeciesData* initSpecies(BasePotential* pot);
-static Validate* initValidate(SimFlat* s);
-static void validateResult(const Validate* val, SimFlat *sim);
-
-static void sumAtoms(SimFlat* s);
-static void printThings(SimFlat* s, int iStep, double elapsedTime);
-static void printSimulationDataYaml(FILE* file, SimFlat* s);
-static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]);
-
-
-int main(int argc, char** argv)
-{
- // Prolog
- initParallel(&argc, &argv);
- profileStart(totalTimer);
- initSubsystems();
- timestampBarrier("Starting Initialization\n");
-
-#ifdef PRINT_YAML
- yamlAppInfo(yamlFile);
-#endif
- yamlAppInfo(screenOut);
-
- Command cmd = parseCommandLine(argc, argv);
-#ifdef PRINT_YAML
- printCmdYaml(yamlFile, &cmd);
-#endif
- printCmdYaml(screenOut, &cmd);
-
- SimFlat* sim = initSimulation(cmd);
-#ifdef PRINT_YAML
- printSimulationDataYaml(yamlFile, sim);
-#endif
- printSimulationDataYaml(screenOut, sim);
-
- Validate* validate = initValidate(sim); // atom counts, energy
- timestampBarrier("Initialization Finished\n");
-
- timestampBarrier("Starting simulation\n");
-
- // This is the CoMD main loop
- const int nSteps = sim->nSteps;
- const int printRate = sim->printRate;
- int iStep = 0;
- profileStart(loopTimer);
- for (; iStep<nSteps;)
- {
- startTimer(commReduceTimer);
- sumAtoms(sim);
- stopTimer(commReduceTimer);
-
- printThings(sim, iStep, getElapsedTime(timestepTimer));
-
- startTimer(timestepTimer);
- timestep(sim, printRate, sim->dt);
- stopTimer(timestepTimer);
-
- iStep += printRate;
- }
- profileStop(loopTimer);
-
- sumAtoms(sim);
- printThings(sim, iStep, getElapsedTime(timestepTimer));
- timestampBarrier("Ending simulation\n");
-
- // Epilog
- validateResult(validate, sim);
- profileStop(totalTimer);
-
-#ifdef PRINT_TIMING
- printPerformanceResults(sim->atoms->nGlobal, sim->printRate);
- printPerformanceResultsYaml(yamlFile);
-#endif
-
- destroySimulation(&sim);
- comdFree(validate);
- finalizeSubsystems();
-
- timestampBarrier("CoMD Ending\n");
- destroyParallel();
-
- return 0;
-}
-
-/// Initializes the main CoMD data structure, SimFlat, based on command
-/// line input from the user. Also performs sanity checks on the input
-/// to screen out nonsensical combinations of parameters.
-///
-/// Simple data members such as the time step dt are initialized
-/// directly; substructures such as the potential, the link cells, the
-/// atoms, etc., are initialized by calling additional initialization
-/// functions (initPotential(), initLinkCells(), initAtoms(), etc.).
-/// Initialization order is set by the natural dependencies of the
-/// substructures: for example, the atoms need the link cells, so the
-/// link cells must be initialized before the atoms.
-SimFlat* initSimulation(Command cmd)
-{
- SimFlat* sim = comdMalloc(sizeof(SimFlat));
- sim->nSteps = cmd.nSteps;
- sim->printRate = cmd.printRate;
- sim->dt = cmd.dt;
- sim->domain = NULL;
- sim->boxes = NULL;
- sim->atoms = NULL;
- sim->ePotential = 0.0;
- sim->eKinetic = 0.0;
- sim->atomExchange = NULL;
-
- sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType);
- real_t latticeConstant = cmd.lat;
- if (cmd.lat < 0.0)
- latticeConstant = sim->pot->lat;
-
- // ensure input parameters make sense.
- sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType);
-
- sim->species = initSpecies(sim->pot);
-
- real3 globalExtent;
- globalExtent[0] = cmd.nx * latticeConstant;
- globalExtent[1] = cmd.ny * latticeConstant;
- globalExtent[2] = cmd.nz * latticeConstant;
-
- sim->domain = initDecomposition(
- cmd.xproc, cmd.yproc, cmd.zproc, globalExtent);
-
- sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff);
- sim->atoms = initAtoms(sim->boxes);
-
- // create lattice with desired temperature and displacement.
- createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim);
- setTemperature(sim, cmd.temperature);
- randomDisplacements(sim, cmd.initialDelta);
-
- sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes);
-
- // Forces must be computed before we call the time stepper.
- startTimer(redistributeTimer);
- redistributeAtoms(sim);
- stopTimer(redistributeTimer);
-
- startTimer(computeForceTimer);
- computeForce(sim);
- stopTimer(computeForceTimer);
-
- kineticEnergy(sim);
-
- return sim;
-}
-
-/// frees all data associated with *ps and frees *ps
-void destroySimulation(SimFlat** ps)
-{
- if ( ! ps ) return;
-
- SimFlat* s = *ps;
- if ( ! s ) return;
-
- BasePotential* pot = s->pot;
- if ( pot) pot->destroy(&pot);
- destroyLinkCells(&(s->boxes));
- destroyAtoms(s->atoms);
- destroyHaloExchange(&(s->atomExchange));
- comdFree(s->species);
- comdFree(s->domain);
- comdFree(s);
- *ps = NULL;
-
- return;
-}
-
-void initSubsystems(void)
-{
-#if REDIRECT_OUTPUT
- freopen("testOut.txt","w",screenOut);
-#endif
-
-#ifdef PRINT_YAML
- yamlBegin();
-#endif
-}
-
-void finalizeSubsystems(void)
-{
-#if REDIRECT_OUTPUT
- fclose(screenOut);
-#endif
-
-#ifdef PRINT_YAML
- yamlEnd();
-#endif
-}
-
-/// decide whether to get LJ or EAM potentials
-BasePotential* initPotential(
- int doeam, const char* potDir, const char* potName, const char* potType)
-{
- BasePotential* pot = NULL;
-
- if (doeam)
- pot = initEamPot(potDir, potName, potType);
- else
- pot = initLjPot();
- assert(pot);
- return pot;
-}
-
-SpeciesData* initSpecies(BasePotential* pot)
-{
- SpeciesData* species = comdMalloc(sizeof(SpeciesData));
-
- strcpy(species->name, pot->name);
- species->atomicNo = pot->atomicNo;
- species->mass = pot->mass;
-
- return species;
-}
-
-Validate* initValidate(SimFlat* sim)
-{
- sumAtoms(sim);
- Validate* val = comdMalloc(sizeof(Validate));
- val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
- val->nAtoms0 = sim->atoms->nGlobal;
-
- if (printRank())
- {
- fprintf(screenOut, "\n");
- printSeparator(screenOut);
- fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n",
- val->eTot0, val->nAtoms0);
- fprintf(screenOut, "\n");
- }
- return val;
-}
-
-void validateResult(const Validate* val, SimFlat* sim)
-{
- if (printRank())
- {
- real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
-
- int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0);
-
- fprintf(screenOut, "\n");
- fprintf(screenOut, "\n");
- fprintf(screenOut, "Simulation Validation:\n");
-
- fprintf(screenOut, " Initial energy : %14.12f\n", val->eTot0);
- fprintf(screenOut, " Final energy : %14.12f\n", eFinal);
- fprintf(screenOut, " eFinal/eInitial : %f\n", eFinal/val->eTot0);
- if ( nAtomsDelta == 0)
- {
- fprintf(screenOut, " Final atom count : %d, no atoms lost\n",
- sim->atoms->nGlobal);
- }
- else
- {
- fprintf(screenOut, "#############################\n");
- fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta);
- fprintf(screenOut, "#############################\n");
- }
- }
-}
-
-void sumAtoms(SimFlat* s)
-{
-   // sum atoms across all processors
- s->atoms->nLocal = 0;
- for (int i = 0; i < s->boxes->nLocalBoxes; i++)
- {
- s->atoms->nLocal += s->boxes->nAtoms[i];
- }
-
- startTimer(commReduceTimer);
- addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1);
- stopTimer(commReduceTimer);
-}
-
-/// Prints the current time, energy, performance, etc. to monitor the
-/// state of the running simulation. Performance per atom is scaled by
-/// the number of local atoms per process; this should give consistent
-/// timing assuming reasonable load balance.
-void printThings(SimFlat* s, int iStep, double elapsedTime)
-{
-   // keep track of the previous value of iStep so we can calculate the number of steps.
- static int iStepPrev = -1;
- static int firstCall = 1;
-
- int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step.
- iStepPrev = iStep;
-
- if (! printRank() )
- return;
-
- if (firstCall)
- {
- firstCall = 0;
- fprintf(screenOut,
- "# Performance\n"
- "# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms\n");
- fflush(screenOut);
- }
-
- real_t time = iStep*s->dt;
- real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal;
- real_t eK = s->eKinetic / s->atoms->nGlobal;
- real_t eU = s->ePotential / s->atoms->nGlobal;
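-   // Temperature from equipartition: eKinetic per atom = (3/2) kB T, so T = (eK/nGlobal) / (1.5 kB).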
- real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5);
-
- double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal);
-#ifndef PRINT_TIMING
- timePerAtom = 0.0;
-#endif
-
- fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n",
- iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal);
-}
-
-/// Print information about the simulation in a format that is (mostly)
-/// YAML compliant.
-void printSimulationDataYaml(FILE* file, SimFlat* s)
-{
- // All ranks get maxOccupancy
- int maxOcc = maxOccupancy(s->boxes);
-
- // Only rank 0 prints
- if (! printRank())
- return;
-
- fprintf(file,"Simulation data: \n");
- fprintf(file," Total atoms : %d\n",
- s->atoms->nGlobal);
- fprintf(file," Min global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
- s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]);
- fprintf(file," Max global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
- s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]);
- printSeparator(file);
- fprintf(file,"Decomposition data: \n");
- fprintf(file," Processors : %6d,%6d,%6d\n",
- s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]);
- fprintf(file," Local boxes : %6d,%6d,%6d = %8d\n",
- s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2],
- s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]);
- fprintf(file," Box size : [ %14.10f, %14.10f, %14.10f ]\n",
- s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]);
- fprintf(file," Box factor : [ %14.10f, %14.10f, %14.10f ] \n",
- s->boxes->boxSize[0]/s->pot->cutoff,
- s->boxes->boxSize[1]/s->pot->cutoff,
- s->boxes->boxSize[2]/s->pot->cutoff);
- fprintf(file, " Max Link Cell Occupancy: %d of %d\n",
- maxOcc, MAXATOMS);
- printSeparator(file);
- fprintf(file,"Potential data: \n");
- s->pot->print(file, s->pot);
-
- // Memory footprint diagnostics
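-   // perAtomSize assumes 10 real_t per atom (e.g. three 3-vectors such as
-   // positions, momenta, and forces, plus a scalar energy) and 2 ints
-   // (e.g. a global id and a species index); see the atom arrays for the
-   // authoritative layout.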
- int perAtomSize = 10*sizeof(real_t)+2*sizeof(int);
-   float mbPerAtom = (float)perAtomSize/1024/1024;
- float totalMemLocal = (float)(perAtomSize*s->atoms->nLocal)/1024/1024;
- float totalMemGlobal = (float)(perAtomSize*s->atoms->nGlobal)/1024/1024;
-
- int nLocalBoxes = s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2];
- int nTotalBoxes = (s->boxes->gridSize[0]+2)*(s->boxes->gridSize[1]+2)*(s->boxes->gridSize[2]+2);
- float paddedMemLocal = (float) nLocalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
- float paddedMemTotal = (float) nTotalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
-
- printSeparator(file);
- fprintf(file,"Memory data: \n");
- fprintf(file, " Intrinsic atom footprint = %4d B/atom \n", perAtomSize);
- fprintf(file, " Total atom footprint = %7.3f MB (%6.2f MB/node)\n", totalMemGlobal, totalMemLocal);
- fprintf(file, " Link cell atom footprint = %7.3f MB/node\n", paddedMemLocal);
- fprintf(file, " Link cell atom footprint = %7.3f MB/node (including halo cell data\n", paddedMemTotal);
-
- fflush(file);
-}
-
-/// Check that the user input meets certain criteria.
-void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8])
-{
- int failCode = 0;
-
- // Check that domain grid matches number of ranks. (fail code 1)
- int nProcs = cmd.xproc * cmd.yproc * cmd.zproc;
- if (nProcs != getNRanks())
- {
- failCode |= 1;
- if (printRank() )
- fprintf(screenOut,
- "\nNumber of MPI ranks must match xproc * yproc * zproc\n");
- }
-
-   // Check whether simulation is too small (fail code 2)
- double minx = 2*cutoff*cmd.xproc;
- double miny = 2*cutoff*cmd.yproc;
- double minz = 2*cutoff*cmd.zproc;
- double sizex = cmd.nx*latticeConst;
- double sizey = cmd.ny*latticeConst;
- double sizez = cmd.nz*latticeConst;
-
- if ( sizex < minx || sizey < miny || sizez < minz)
- {
- failCode |= 2;
- if (printRank())
- fprintf(screenOut,"\nSimulation too small.\n"
- " Increase the number of unit cells to make the simulation\n"
-                 "  at least (%3.2f, %3.2f, %3.2f) Angstroms in size\n",
- minx, miny, minz);
- }
-
- // Check for supported lattice structure (fail code 4)
- if (strcasecmp(latticeType, "FCC") != 0)
- {
- failCode |= 4;
- if ( printRank() )
- fprintf(screenOut,
- "\nOnly FCC Lattice type supported, not %s. Fatal Error.\n",
- latticeType);
- }
- int checkCode = failCode;
- bcastParallel(&checkCode, sizeof(int), 0);
- // This assertion can only fail if different tasks failed different
- // sanity checks. That should not be possible.
- assert(checkCode == failCode);
-
- if (failCode != 0)
- exit(failCode);
-}
-
-// --------------------------------------------------------------
-
-
-/// \page pg_building_comd Building CoMD
-///
-/// CoMD is written with portability in mind and should compile using
-/// practically any compiler that implements the C99 standard. You will
-/// need to create a Makefile by copying the sample provided with the
-/// distribution (Makefile.vanilla).
-///
-/// $ cp Makefile.vanilla Makefile
-///
-/// and use the make command to build the code
-///
-/// $ make
-///
-/// The sample Makefile will compile the code on many platforms. See
-/// comments in Makefile.vanilla for information about specifying the
-/// name of the C compiler, and/or additional compiler switches that
-/// might be necessary for your platform.
-///
-/// The main options available in the Makefile are toggling single/double
-/// precision and enabling/disabling MPI. In the event MPI is not
-/// available, setting the DO_MPI flag to OFF will create a purely
-/// serial build (you will likely also need to change the setting of
-/// CC).
-///
-/// The makefile should handle all the dependency checking needed, via
-/// makedepend.
-///
-/// 'make clean' removes the object and dependency files.
-///
-/// 'make distclean' additionally removes the executable file and the
-/// documentation files.
-///
-/// Other build options
-/// -------------------
-///
-/// Various other options are made available by \#define arguments within
-/// some of the source files.
-///
-/// #REDIRECT_OUTPUT in CoMD.c
-///
-/// Setting this to 1 will redirect all screen output to a file,
-/// currently set to 'testOut.txt'.
-///
-/// #POT_SHIFT in ljForce.c
-///
-/// This is set to 1.0 by default, and shifts the values of the cohesive
-/// energy given by the Lennard-Jones potential so it is zero at the
-/// cutoff radius. This setting improves energy conservation
-/// step-to-step as it reduces the noise generated by atoms crossing the
-/// cutoff threshold. However, it does not affect the long-term energy
-/// conservation of the code.
-///
-/// #MAXATOMS in linkCells.h
-///
-/// The default value is 64, which allows ample padding of the linkCell
-/// structure to allow for density fluctuations. Reducing it may improve
-/// the efficiency of the code via improved thread utilization and
-/// reduced memory footprint.
-
-// --------------------------------------------------------------
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_measuring_performance Measuring Performance
-///
-/// CoMD implements a simple and extensible system of internal timers to
-/// measure the performance profile of the code. As explained in
-/// performanceTimers.c, it is easy to create additional timers and
-/// associate them with code regions of specific interest. In addition,
-/// the getTime() and getTick() functions can be easily reimplemented to
-/// take advantage of platform specific timing resources.
-///
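-/// As an illustration only (the authoritative definitions live in
-/// performanceTimers.c; this sketch merely assumes getTime() returns an
-/// integer tick count and getTick() returns the length of one tick in
-/// seconds), a POSIX clock_gettime() based replacement might look like:
-///
-/// ~~~~
-/// #include <stdint.h>
-/// #include <time.h>
-///
-/// // Monotonic wall-clock ticks; here one tick is one nanosecond.
-/// static uint64_t getTime(void)
-/// {
-///    struct timespec ts;
-///    clock_gettime(CLOCK_MONOTONIC, &ts);
-///    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
-/// }
-///
-/// // Seconds per tick, used to convert tick counts to seconds.
-/// static double getTick(void)
-/// {
-///    return 1.0e-9;
-/// }
-/// ~~~~
-///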
-/// A timing report is printed at the end of each simulation.
-///
-/// ~~~~
-/// Timings for Rank 0
-/// Timer # Calls Avg/Call (s) Total (s) % Loop
-/// ___________________________________________________________________
-/// total 1 50.6701 50.6701 100.04
-/// loop 1 50.6505 50.6505 100.00
-/// timestep 1 50.6505 50.6505 100.00
-/// position 10000 0.0000 0.0441 0.09
-/// velocity 20000 0.0000 0.0388 0.08
-/// redistribute 10001 0.0003 3.4842 6.88
-/// atomHalo 10001 0.0002 2.4577 4.85
-/// force 10001 0.0047 47.0856 92.96
-/// eamHalo 10001 0.0001 1.0592 2.09
-/// commHalo 60006 0.0000 1.7550 3.46
-/// commReduce 12 0.0000 0.0003 0.00
-///
-/// Timing Statistics Across 8 Ranks:
-/// Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s)
-/// _____________________________________________________________________________
-/// total 3: 50.6697 0: 50.6701 50.6699 0.0001
-/// loop 0: 50.6505 4: 50.6505 50.6505 0.0000
-/// timestep 0: 50.6505 4: 50.6505 50.6505 0.0000
-/// position 2: 0.0437 0: 0.0441 0.0439 0.0001
-/// velocity 2: 0.0380 4: 0.0392 0.0385 0.0004
-/// redistribute 0: 3.4842 1: 3.7085 3.6015 0.0622
-/// atomHalo 0: 2.4577 7: 2.6441 2.5780 0.0549
-/// force 1: 46.8624 0: 47.0856 46.9689 0.0619
-/// eamHalo 3: 0.2269 6: 1.2936 1.0951 0.3344
-/// commHalo 3: 1.0803 6: 2.1856 1.9363 0.3462
-/// commReduce 6: 0.0002 2: 0.0003 0.0003 0.0000
-///
-/// ---------------------------------------------------
-/// Average atom update rate: 9.39 us/atom/task
-/// ---------------------------------------------------
-///
-/// ~~~~
-/// This report consists of two blocks. The upper block lists the absolute
-/// wall clock time spent in each timer on rank 0 of the job. The lower
-/// block reports minimum, maximum, average, and standard deviation of
-/// times across all tasks.
-/// The ranks where the minimum and maximum values occurred are also reported
-/// to aid in identifying hotspots or load imbalances.
-///
-/// The last line of the report gives the atom update rate in
-/// microseconds/atom/task. Since this quantity is normalized by both
-/// the number of atoms and the number of tasks, it provides a simple
-/// figure of merit to compare performance between runs with different
-/// numbers of atoms and different numbers of tasks.  Any increase in
-/// this number, relative to a run with a large number of atoms on a
-/// single task, represents a loss of parallel efficiency.
-///
-/// Choosing the problem size correctly has important implications for the
-/// reported performance. Small problem sizes may run entirely in the cache
-/// of some architectures, leading to very good performance results.
-/// For general characterization of performance, it is probably best to
-/// choose problem sizes which force the code to access main memory, even
-/// though there may be strong scaling scenarios where the code is indeed
-/// running mainly in cache.
-///
-/// *** Architecture/Configuration for above timing numbers:
-/// SGI XE1300 cluster with dual-socket Intel quad-core Nehalem processors.
-/// Each node has 2 Quad-Core Xeon X5550 processors running at 2.66 GHz
-/// with 3 GB of memory per core.
-
-// --------------------------------------------------------------
-
-
-/// \page pg_problem_selection_and_scaling Problem Selection and Scaling
-///
-/// CoMD is a reference molecular dynamics simulation code as used in
-/// materials science.
-///
-/// Problem Specification {#sec_problem_spec}
-/// ======================
-///
-/// The reference problem is solid Copper starting from a face-centered
-/// cubic (FCC) lattice. The initial thermodynamic conditions
-/// (Temperature and Volume, via the lattice spacing, lat) can be specified
-/// from the command line input. The default is 600 K and standard
-/// volume (lat = 3.615 Angstroms).
-/// Different temperatures (e.g. T = 3000 K) and volumes can be
-/// specified to melt the system and enhance the interchange of atoms
-/// between domains.
-///
-/// The dynamics is micro-canonical (NVE = constant Number of atoms,
-/// constant total system Volume, and constant total system Energy). As
-/// a result, the temperature is not fixed. Rather, the temperature will
-/// adjust from the initial temperature (as specified on the command line)
-/// to a final temperature as the total system kinetic energy comes into
-/// equilibrium with the total system potential energy.
-///
-/// The total size of the problem (number of atoms) is specified by the
-/// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms
-/// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms.
-///
-/// The simulation models bulk copper by replicating the simulation cell
-/// in every direction using periodic boundary conditions.
-///
-/// Two interatomic force models are available: the Lennard-Jones (LJ)
-/// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM)
-/// potential (eam.c). The LJ potential is included for comparison and
-/// is a valid approximation for constant volume and uniform
-/// density. The EAM potential is a more accurate model of cohesion in
-/// simple metals like Copper and includes the energetics necessary to
-/// model non-uniform density and free surfaces.
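-///
-/// For reference, the two models take the standard textbook forms shown
-/// below (the parameter values and cutoffs come from the potential
-/// initialization and are not shown here): the Lennard-Jones pair energy
-/// and the EAM total energy are
-///
-/// ~~~~
-/// U_LJ  = \sum_{i<j} 4 \epsilon [ (\sigma/r_{ij})^{12} - (\sigma/r_{ij})^{6} ]
-///
-/// U_EAM = \sum_i F(\bar\rho_i) + (1/2) \sum_{i \ne j} \phi(r_{ij}),
-///         where \bar\rho_i = \sum_{j \ne i} \rho(r_{ij})
-/// ~~~~
-///
-/// Here F is the embedding function, \phi the pair interaction, and \rho
-/// the electron density contribution, all of which are tabulated in the
-/// EAM potential files.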
-///
-/// Scaling Studies in CoMD {#sec_scaling_studies}
-/// =======================
-///
-/// CoMD implements a simple geometric domain decomposition to divide
-/// the total problem space into domains, which are owned by MPI
-/// ranks. Each domain is a single-program multiple data (SPMD)
-/// partition of the larger problem.
-///
-/// Caution: When doing scaling studies, it is important to distinguish
-/// between the problem setup phase and the problem execution phase. Both
-/// are important to the workflow of doing molecular dynamics, but it
-/// is the execution phase we want to quantify in the scaling studies
-/// described below, since that phase dominates the execution time for
-/// long runs (millions of time steps). The problem setup can be an
-/// appreciable fraction of the execution time for short runs (the
-/// default is 100 time steps), which can lead to erroneous conclusions.
-///
-/// This code is configured with timers. The times are reported per
-/// particle, and the timers for the force calculation, timestep, etc.
-/// start after the initialization phase is done.
-///
-/// Weak Scaling {#ssec_weak_scaling}
-/// -----------
-///
-/// A weak scaling test fixes the amount of work per processor and
-/// compares the execution time as the number of processors grows. Weak scaling
-/// keeps the ratio of inter-processor communication (surface) to
-/// intra-processor work (volume) fixed. The amount of inter-processor
-/// work scales with the number of processors in the domain and O(1000)
-/// atoms per domain are needed for reasonable performance.
-///
-/// Examples,
-///
-/// - Increase in processor count by 8: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40)
-///
-/// - Increase in processor count by 2: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40)
-///
-/// In general, it is wise to keep the ratio of processor count to
-/// system size in each direction fixed (i.e. cubic domains): xproc_0 / nx_0 = xproc_1 /
-/// nx_1, since this minimizes surface area to volume.
-/// Feel free to experiment; you might learn something about
-/// algorithms to optimize communication relative to work.
-///
-/// Strong Scaling {#ssec_strong_scaling}
-/// ---------------
-///
-/// A strong scaling test fixes the total problem size and compares the
-/// execution time for different numbers of processors. Strong scaling
-/// increases the ratio of inter-processor communication (surface) to
-/// intra-processor work (volume).
-///
-/// Examples,
-///
-/// - Increase in processor count by 8: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20)
-///
-/// - Increase in processor count by 2: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20)
-///
-/// The domain decomposition requires O(1000) atoms per domain and
-/// begins to scale poorly for small numbers of atoms per domain.
-/// Again, feel free to experiment; you might learn something here as
-/// well. For example, when molecular dynamics codes were written for
-/// vector supercomputers, large lists of force pairs were created for
-/// the vector processor. These force lists provide a natural force
-/// decomposition for early parallel computers (Fast Parallel Algorithms
-/// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys,
-/// 117, 1-19 (1995).) Using replicated data, force decomposition can
-/// scale to fewer than one atom per processor and is a natural
-/// mechanism to exploit intra-processor parallelism.
-///
-/// For further details see for example:
-/// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_verifying_correctness Verifying Correctness
-///
-/// Verifying the correctness of an MD simulation is challenging.
-/// Because MD is Lyapunov unstable, any small errors, even harmless
-/// round-off errors, will lead to a long-term divergence in the atom
-/// trajectories. Hence, comparing atom positions at the end of a run
-/// is not always a useful verification technique. (Such divergences
-/// are not a problem for science applications of MD since they do not
-/// alter the statistical physics.) Small, single-particle errors can
-/// also be difficult to detect in system-wide quantities such as the
-/// kinetic or potential energy that are averaged over a large number of
-/// particles.
-///
-/// In spite of these challenges, there are several methods which are
-/// likely to catch significant errors.
-///
-/// Cohesive Energy {#sec_ver_cohesive_energy}
-/// ===============
-///
-/// With a perfect lattice as the initial structure (this is the
-/// default), the potential energy per atom is the cohesive energy.
-/// This value should be computed correctly to many decimal places. Any
-/// variation beyond the last 1 or 2 decimal places is cause for
-/// investigation. The correct values for the cohesive energy are
-///
-/// | Potential | Cohesive Energy |
-/// | :------------- | :-------------- |
-/// | Lennard-Jones | -1.243619295058 |
-/// | EAM (Adams) | -3.538079224691 |
-/// | EAM (Mishin) | -3.539999969176 |
-///
-/// The \link sec_command_line_options command
-/// line options \endlink documentation explains the switches used to
-/// select the potential used in the simulation.
-///
-/// Note that the cohesive energy calculation is not sensitive to errors
-/// in forces. It is also performed on a highly symmetric structure so
-/// there are many errors this will not catch. Still, it is a good
-/// first check.
-///
-/// Energy Conservation {#sec_ver_energy_conservation}
-/// ===================
-///
-/// A correctly implemented force kernel, with an appropriate time step
-/// (the default value of 1 fs is conservative for temperatures under
-/// 10,000K) will conserve total energy over long times to 5 or more
-/// digits. Any long term systematic drift in the total energy is a
-/// cause for concern.
-///
-/// To facilitate checking energy conservation CoMD prints the final and
-/// initial values of the total energy. When comparing these values, pay
-/// careful attention to these details:
-///
-/// - It is common to observe an initial transient change in the total
-/// energy. Differences in the total energy of 2-3% can be expected in
-/// the first 10-100 time steps.
-/// - The best way to check energy conservation is to run at least
-/// several thousand steps and look at the slope of the total energy
-/// ignoring at least the first one or two thousand steps. More steps
-/// are even better.
-/// - Set the temperature to at least several hundred K. This ensures
-/// that atoms will sample a large range of configurations and expose
-/// possible errors.
-/// - Fluctuations in the energy can make it difficult to tell if
-/// conservation is observed. Increasing the number of atoms will reduce
-/// the fluctuations.
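-///
-/// As a minimal, illustrative post-processing check (not part of CoMD
-/// itself; the function name and tolerance are only examples), one could
-/// compare per-atom total energies sampled at two steps well after the
-/// initial transient:
-///
-/// ~~~~
-/// #include <math.h>
-/// #include <stdio.h>
-///
-/// // Returns 1 if the relative drift between two sampled total energies
-/// // is within the tolerance (e.g. tol = 1.0e-5 over a few thousand steps).
-/// static int energyConserved(double eEarly, double eLate, double tol)
-/// {
-///    double drift = fabs(eLate - eEarly) / fabs(eEarly);
-///    printf("relative energy drift = %g\n", drift);
-///    return drift < tol;
-/// }
-/// ~~~~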
-///
-///
-/// Particle Conservation {#sec_ver_particle_conservation}
-/// =====================
-///
-/// The simulation should always end with the same number of particles
-/// it started with. Any change is a bug. CoMD checks the initial and
-/// final number of particles and prints a warning at the end of the
-/// simulation if they are not equal.
-///
-/// Reproducibility {#sec_ver_reproducibility}
-/// ===============
-///
-/// The same simulation run repeatedly on the same hardware should
-/// produce the same result. Because parallel computing can add
-/// elements of non-determinism, we do not expect perfect long term
-/// reproducibility; however, over a few hundred to a few thousand time
-/// steps the energies should not exhibit run-to-run differences outside
-/// the last 1 or 2 decimal places. Larger differences are a sign of
-/// trouble and should be investigated. This kind of test is
-/// practically the only way to detect race conditions in shared memory
-/// parallelism.
-///
-/// Portability {#sec_ver_portability}
-/// ===========
-///
-/// In our experience, simulations that start from the same initial
-/// condition tend to produce very similar trajectories over short terms
-/// (100 to 1000 time steps), even on different hardware platforms.
-/// Short term differences beyond the last 1 or 2 decimal places should
-/// likely be investigated.
-///
-/// General Principles {#sec_ver_general}
-/// =======================
-///
-/// - Simulations run at 0K are too trivial for verification; set
-/// the initial temperature to at least several hundred K.
-/// - Longer runs are better to check conservation. Compare
-/// energies after initial transients are damped out.
-/// - Larger runs are better to check conservation. Fluctuations in the
-/// energy are averaged out.
-/// - Short term (order 100 time steps) discrepancies from run-to-run
-///   or platform-to-platform beyond the last one or two decimal places
-///   are reason for concern.  A difference in the 4th or 5th decimal
-///   place is almost certainly a bug.
-/// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have
-/// questions about validation.
-///
-
-// --------------------------------------------------------------
-
-
-/// \page pg_comd_architecture CoMD Architecture
-///
-/// Program Flow {#sec_program_flow}
-/// ============
-///
-/// We have attempted to make the program flow in CoMD 1.1 as simple and
-/// transparent as possible. The main program consists of three blocks:
-/// prolog, main loop, and epilog.
-///
-/// Prolog {#ssec_flow_prolog}
-/// -------
-///
-/// The job of the prolog is to initialize the simulation and prepare
-/// for the main loop. Notable tasks in the prolog include calling
-/// - initParallel() to start MPI
-/// - parseCommandLine() to read the command line options
-/// - initSimulation() to initialize the main data structure, SimFlatSt.
-/// This includes tasks such as
-/// - initEamPot() to read tabular data for the potential function
-/// - initDecomposition() to set up the domain decomposition
-/// - createFccLattice() to generate an initial structure for the atoms
-/// - initValidate() to store initial data for a simple validation check
-///
-/// In CoMD 1.1 all atomic structures are internally generated so
-/// there is no need to read large files with atom coordinate data.
-///
-/// Main Loop {#ssec_flow_main_loop}
-/// ---------
-///
-/// The main loop calls
-/// - timestep(), the integrator to update particle positions,
-/// - printThings() to periodically print simulation information
-///
-/// The timestep() function is the heart of the code as it choreographs
-/// updating the particle positions, along with computing forces
-/// (computeForce()) and communicating atoms between ranks
-/// (redistributeAtoms()).
-///
-/// Epilog {#ssec_flow_epilog}
-/// -------
-///
-/// The epilog code handles end of run bookkeeping such as
-/// - validateResult() to check validation
-/// - printPerformanceResults() to print a performance summary
-/// - destroySimulation() to free memory
-///
-/// Key Data Structures {#sec_key_data_structures}
-/// ==================
-///
-/// Practically all data in CoMD belongs to the SimFlatSt structure.
-/// This includes:
-/// - BasePotentialSt A polymorphic structure for the potential model
-/// - HaloExchangeSt A polymorphic structure for communicating halo data
-///   (this C-style polymorphism via function pointers is sketched below)
-/// - DomainSt The parallel domain decomposition
-/// - LinkCellSt The link cells
-/// - AtomsSt The atom coordinates and velocities
-/// - SpeciesDataSt Properties of the atomic species being simulated.
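-///
-/// The "polymorphism" here is plain C with function pointers.  A minimal
-/// sketch of the pattern (illustrative names only; the real definitions
-/// are in CoMDTypes.h and include the members used in this file, such as
-/// cutoff, print, and destroy):
-///
-/// ~~~~
-/// #include <stdio.h>
-///
-/// typedef struct ExamplePotentialSt
-/// {
-///    double cutoff;                                  // interaction cutoff
-///    void (*print)(FILE* file, struct ExamplePotentialSt* pot);
-///    void (*destroy)(struct ExamplePotentialSt** pot);
-/// } ExamplePotential;
-///
-/// // Callers invoke behavior through the pointers, so the LJ and EAM
-/// // "subclasses" supply their own implementations:
-/// //    pot->print(file, pot);
-/// //    pot->destroy(&pot);
-/// ~~~~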
-///
-/// Consult the individual pages for each of these structures to learn
-/// more. The descriptions in haloExchange.c and initLinkCells() are
-/// especially useful to understand how the atoms are communicated
-/// between tasks and stored in link cells for fast pair finding.
-
-// --------------------------------------------------------------
-
-
-/// \page pg_optimization_targets Optimization Targets
-///
-/// Computation {#sec_computation}
-/// ============
-///
-/// The computational effort of classical MD is usually highly focused
-/// in the force kernel. The two force kernels supplied by CoMD are
-/// eamForce() and ljForce(). Both kernels are fundamentally loops over
-/// pairs of atoms with significant opportunity to exploit high levels
-/// of concurrency. One potential challenge when reordering or
-/// parallelizing the pair loop structure is preventing race conditions
-/// that result if two concurrent pair evaluations try to simultaneously
-/// increment the forces and energies on the same atom.
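-///
-/// As an illustration of the hazard (this is not CoMD's kernel; the array
-/// names, pair list, and OpenMP pragmas are assumptions made for the
-/// sketch), a naive parallel pair loop and one simple mitigation:
-///
-/// ~~~~
-/// // Each pair (i,j) updates both atoms, so two threads working on pairs
-/// // that share an atom can race on fx[i] / fx[j].
-/// #pragma omp parallel for
-/// for (int p = 0; p < nPairs; p++)
-/// {
-///    int i = pairI[p], j = pairJ[p];
-///    double f = pairForce(i, j);   // hypothetical pair force helper
-///    #pragma omp atomic
-///    fx[i] += f;                   // atomic updates avoid the race, at a cost
-///    #pragma omp atomic
-///    fx[j] -= f;
-/// }
-/// // Alternatives: per-thread force buffers reduced after the loop, or a
-/// // colored/blocked pair ordering so concurrent pairs never share an atom.
-/// ~~~~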
-///
-/// The supplied EAM kernel uses interpolation from tabular data to
-/// evaluate functions. Hence the interpolate() function is another
-/// potential optimization target. Note that the two potential files
-/// distributed with CoMD have very different sizes. The Adams
-/// potential (Cu_u6.eam) has 500 points per function in the table while
-/// the Mishin potential (Cu01.eam.alloy) has 10,000 points per
-/// function. This difference could potentially impact important
-/// details such as cache miss rates.
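-///
-/// The essential access pattern is a table lookup followed by a low-order
-/// polynomial evaluation.  A simplified linear-interpolation sketch on a
-/// uniformly spaced table (the actual interpolate() in eam.c may differ
-/// in interpolation order and boundary handling; this only shows the
-/// memory access pattern):
-///
-/// ~~~~
-/// // tbl holds n samples of f(x) at x0, x0+dx, ..., x0+(n-1)*dx.
-/// static double interpolateLinear(const double* tbl, int n,
-///                                 double x0, double dx, double x)
-/// {
-///    double t = (x - x0) / dx;
-///    int i = (int)t;
-///    if (i < 0) i = 0;              // clamp to the first interval
-///    if (i > n - 2) i = n - 2;      // clamp to the last interval
-///    double frac = t - i;
-///    return tbl[i]*(1.0 - frac) + tbl[i+1]*frac;
-/// }
-/// ~~~~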
-///
-/// Communication {#sec_communication}
-/// =============
-///
-/// As the number of atoms per MPI rank decreases, the communication
-/// routines will start to require a significant fraction of the
-/// run time. The main communication routine in CoMD is haloExchange().
-/// The halo exchange is simple nearest neighbor, point-to-point
-/// communication so it should scale well to practically any number of
-/// nodes.
-///
-/// The halo exchange in CoMD 1.1 is a very simple 6-direction
-/// structured halo exchange (see haloExchange.c). Other exchange
-/// algorithms can be implemented without much difficulty.
-///
-/// The halo exchange function is called in two very different contexts.
-/// The main usage is to exchange halo particle information (see
-/// initAtomHaloExchange()). This process is coordinated by the
-/// redistributeAtoms() function.
-///
-/// In addition to the atom exchange, when using the EAM potential, a
-/// halo exchange is performed in the force routine (see
-/// initForceHaloExchange()).
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_whats_new New Features and Changes in CoMD 1.1
-///
-/// The main goals of the 1.1 release were to add support for MPI and to
-/// improve the structure and clarity of the code. Achieving these
-/// goals required considerable changes compared to the 1.0 release.
-/// However, the core structure of the most computationally intensive
-/// kernels (the force routines) is mostly unchanged. We believe that
-/// lessons learned from optimizing 1.0 force kernels to specific
-/// hardware or programming models can be quickly transferred to kernels
-/// in the 1.1 release.
-///
-/// Significant changes in CoMD 1.1 include:
-///
-/// - MPI support. Both MPI and single node serial executables can be
-/// built from the same source files.
-///
-/// - Improved modularity and code clarity. Major data structures are
-/// now organized with their own structs and initialization routines.
-///
-/// - The build system has been simplified to use only standard
-/// Makefiles instead of CMake.
-///
-/// - The halo exchange operation needed to communicate remote particle
-/// data between MPI ranks also creates "image" particles in the
-/// serial build.
-///
-/// - Unified force kernels for both serial and MPI builds.
-///
-/// - The addition of remote/image atoms allows periodic boundary
-/// conditions to be handled outside the force loop.
-///
-/// - An additional communication/data copy step to handle electron
-/// density on remote/image atoms has been added to the EAM force
-/// loop.
-///
-/// - The coordinate system has been simplified to a single global
-/// coordinate system for all particles.
-///
-/// - Evaluation of energies and forces using Chebyshev polynomial
-///   fits has been removed.  Polynomial approximation of energies and
-/// forces will return in a future CoMD version.
-///
-/// - Atomic structures are now generated internally, eliminating the
-/// requirement to read, write, and distribute large atom
-/// configuration files. Arbitrarily large initial structures can
-/// be generated with specified initial temperature and random
-/// displacements from lattice positions. Code to read/write atomic
-/// positions has been removed.
-///
-/// - EAM potentials are now read from standard funcfl and setfl format
-/// files. Voter style files are no longer supported.
-///
-/// - Collection of performance metrics is significantly improved.
-/// Developers can easily add new timers to regions of interest. The
-/// system is also designed to allow easy integration with platform
-/// specific API's to high resolution timers, cycle counters,
-/// hardware counters, etc.
-///
-///
-/// - Hooks to in-situ analysis and visualization have been removed.
-/// In-situ analysis capabilities will return in a future CoMD release.
-///
-/// Please contact the CoMD developers (exmatex-comd@llnl.gov) if
-/// any of the deleted features negatively impacts your work.  We
-/// may be able to help produce a custom version that includes the code
-/// you need.
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_md_basics MD Basics
-///
-/// The molecular dynamics (MD) computer simulation method is a well
-/// established and important tool for the study of the dynamical
-/// properties of liquids, solids, and other systems of interest in
-/// Materials Science and Engineering, Chemistry and Biology. A material
-/// is represented in terms of atoms and molecules. The method of MD
-/// simulation involves the evaluation of the force acting on each atom
-/// due to all other atoms in the system and the numerical integration
-/// of the Newtonian equations of motion. Though MD was initially
-/// developed to compute the equilibrium thermodynamic behavior of
-/// materials (equation of state), most recent applications have used MD
-/// to study non-equilibrium processes.
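-///
-/// In symbols, each atom i obeys Newton's second law, and an explicit
-/// integrator advances positions and velocities step by step.  The
-/// velocity Verlet scheme shown here is a common choice (see timestep.c
-/// for the update this code actually uses):
-///
-/// ~~~~
-/// m_i \ddot{r}_i = F_i = -\nabla_i U(r_1, \dots, r_N)
-///
-/// v_i(t + \Delta t/2) = v_i(t)                + (\Delta t/2) F_i(t) / m_i
-/// r_i(t + \Delta t)   = r_i(t)                +  \Delta t    v_i(t + \Delta t/2)
-/// v_i(t + \Delta t)   = v_i(t + \Delta t/2)   + (\Delta t/2) F_i(t + \Delta t) / m_i
-/// ~~~~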
-///
-/// Wikipedia offers a basic introduction to molecular dynamics with
-/// many references:
-///
-/// http://en.wikipedia.org/wiki/Molecular_dynamics
-///
-/// For a thorough treatment of MD methods, see:
-/// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley
-/// (Oxford, 1989)
-/// ISBN-10: 0198556454 | ISBN-13: 978-0198556459.
-///
-/// For an understanding of MD simulations and application to statistical mechanics:
-/// - "Understanding Molecular Simulation, Second Edition: From Algorithms
-/// to Applications," by D. Frenkel and B. Smit (Academic Press, 2001)
-/// ISBN-10: 0122673514 | ISBN-13: 978-0122673511
-/// - "Statistical and Thermal Physics: With Computer Applications," by
-/// H. Gould and J. Tobochnik (Princeton, 2010)
-/// ISBN-10: 0691137447 | ISBN-13: 978-0691137445
-///
-/// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the
-/// Embedded Atom Method Potential (eam.c).
-///