Diffstat (limited to 'MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c')
-rw-r--r--  MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c  1107
1 files changed, 0 insertions, 1107 deletions
diff --git a/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c b/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
deleted file mode 100644
index 5c2263f2..00000000
--- a/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
+++ /dev/null
@@ -1,1107 +0,0 @@
-/// \file
-/// Main program
-///
-/// \mainpage CoMD: A Classical Molecular Dynamics Mini-app
-///
-/// CoMD is a reference implementation of typical classical molecular
-/// dynamics algorithms and workloads. It is created and maintained by
-/// The Exascale Co-Design Center for Materials in Extreme Environments
-/// (ExMatEx). http://codesign.lanl.gov/projects/exmatex. The
-/// code is intended to serve as a vehicle for co-design by allowing
-/// others to extend and/or reimplement it as needed to test performance of
-/// new architectures, programming models, etc.
-///
-/// The current version of CoMD is available from:
-/// http://exmatex.github.io/CoMD
-///
-/// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov.
-///
-/// Table of Contents
-/// =================
-///
-/// Click on the links below to browse the CoMD documentation.
-///
-/// \subpage pg_md_basics
-///
-/// \subpage pg_building_comd
-///
-/// \subpage pg_running_comd
-///
-/// \subpage pg_measuring_performance
-///
-/// \subpage pg_problem_selection_and_scaling
-///
-/// \subpage pg_verifying_correctness
-///
-/// \subpage pg_comd_architecture
-///
-/// \subpage pg_optimization_targets
-///
-/// \subpage pg_whats_new
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <unistd.h>
-#include <assert.h>
-
-#include "CoMDTypes.h"
-#include "decomposition.h"
-#include "linkCells.h"
-#include "eam.h"
-#include "ljForce.h"
-#include "initAtoms.h"
-#include "memUtils.h"
-#include "yamlOutput.h"
-#include "parallel.h"
-#include "performanceTimers.h"
-#include "mycommand.h"
-#include "timestep.h"
-#include "constants.h"
-
-#define REDIRECT_OUTPUT 0
-#define MIN(A,B) ((A) < (B) ? (A) : (B))
-
-static SimFlat* initSimulation(Command cmd);
-static void destroySimulation(SimFlat** ps);
-
-static void initSubsystems(void);
-static void finalizeSubsystems(void);
-
-static BasePotential* initPotential(
- int doeam, const char* potDir, const char* potName, const char* potType);
-static SpeciesData* initSpecies(BasePotential* pot);
-static Validate* initValidate(SimFlat* s);
-static void validateResult(const Validate* val, SimFlat *sim);
-
-static void sumAtoms(SimFlat* s);
-static void printThings(SimFlat* s, int iStep, double elapsedTime);
-static void printSimulationDataYaml(FILE* file, SimFlat* s);
-static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]);
-
-
-int main(int argc, char** argv)
-{
- // Prolog
- initParallel(&argc, &argv);
- profileStart(totalTimer);
- initSubsystems();
- timestampBarrier("Starting Initialization\n");
-
-#ifdef PRINT_YAML
- yamlAppInfo(yamlFile);
-#endif
- yamlAppInfo(screenOut);
-
- Command cmd = parseCommandLine(argc, argv);
-#ifdef PRINT_YAML
- printCmdYaml(yamlFile, &cmd);
-#endif
- printCmdYaml(screenOut, &cmd);
-
- SimFlat* sim = initSimulation(cmd);
-#ifdef PRINT_YAML
- printSimulationDataYaml(yamlFile, sim);
-#endif
- printSimulationDataYaml(screenOut, sim);
-
- Validate* validate = initValidate(sim); // atom counts, energy
- timestampBarrier("Initialization Finished\n");
-
- timestampBarrier("Starting simulation\n");
-
- // This is the CoMD main loop
- const int nSteps = sim->nSteps;
- const int printRate = sim->printRate;
- int iStep = 0;
- profileStart(loopTimer);
- for (; iStep<nSteps;)
- {
- startTimer(commReduceTimer);
- sumAtoms(sim);
- stopTimer(commReduceTimer);
-
- printThings(sim, iStep, getElapsedTime(timestepTimer));
-
- startTimer(timestepTimer);
- timestep(sim, printRate, sim->dt);
- stopTimer(timestepTimer);
-
- iStep += printRate;
- }
- profileStop(loopTimer);
-
- sumAtoms(sim);
- printThings(sim, iStep, getElapsedTime(timestepTimer));
- timestampBarrier("Ending simulation\n");
-
- // Epilog
- validateResult(validate, sim);
- profileStop(totalTimer);
-
-#ifdef PRINT_TIMING
- printPerformanceResults(sim->atoms->nGlobal, sim->printRate);
- printPerformanceResultsYaml(yamlFile);
-#endif
-
- destroySimulation(&sim);
- comdFree(validate);
- finalizeSubsystems();
-
- timestampBarrier("CoMD Ending\n");
- destroyParallel();
-
- return 0;
-}
-
-/// Initializes the main CoMD data structure, SimFlat, based on command
-/// line input from the user. Also performs sanity checks on the input
-/// to screen out nonsensical combinations of parameters.
-///
-/// Simple data members such as the time step dt are initialized
-/// directly; substructures such as the potential, the link cells, the
-/// atoms, etc., are initialized by calling additional initialization
-/// functions (initPotential(), initLinkCells(), initAtoms(), etc.).
-/// Initialization order is set by the natural dependencies of the
-/// substructures: for example, the atoms need the link cells, so the
-/// link cells must be initialized before the atoms.
-SimFlat* initSimulation(Command cmd)
-{
- SimFlat* sim = comdMalloc(sizeof(SimFlat));
- sim->nSteps = cmd.nSteps;
- sim->printRate = cmd.printRate;
- sim->dt = cmd.dt;
- sim->domain = NULL;
- sim->boxes = NULL;
- sim->atoms = NULL;
- sim->ePotential = 0.0;
- sim->eKinetic = 0.0;
- sim->atomExchange = NULL;
-
- sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType);
- real_t latticeConstant = cmd.lat;
- if (cmd.lat < 0.0)
- latticeConstant = sim->pot->lat;
-
- // ensure input parameters make sense.
- sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType);
-
- sim->species = initSpecies(sim->pot);
-
- real3 globalExtent;
- globalExtent[0] = cmd.nx * latticeConstant;
- globalExtent[1] = cmd.ny * latticeConstant;
- globalExtent[2] = cmd.nz * latticeConstant;
-
- sim->domain = initDecomposition(
- cmd.xproc, cmd.yproc, cmd.zproc, globalExtent);
-
- sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff);
- sim->atoms = initAtoms(sim->boxes);
-
- // create lattice with desired temperature and displacement.
- createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim);
- setTemperature(sim, cmd.temperature);
- randomDisplacements(sim, cmd.initialDelta);
-
- sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes);
-
- // Forces must be computed before we call the time stepper.
- startTimer(redistributeTimer);
- redistributeAtoms(sim);
- stopTimer(redistributeTimer);
-
- startTimer(computeForceTimer);
- computeForce(sim);
- stopTimer(computeForceTimer);
-
- kineticEnergy(sim);
-
- return sim;
-}
-
-/// frees all data associated with *ps and frees *ps
-void destroySimulation(SimFlat** ps)
-{
- if ( ! ps ) return;
-
- SimFlat* s = *ps;
- if ( ! s ) return;
-
- BasePotential* pot = s->pot;
- if ( pot) pot->destroy(&pot);
- destroyLinkCells(&(s->boxes));
- destroyAtoms(s->atoms);
- destroyHaloExchange(&(s->atomExchange));
- comdFree(s->species);
- comdFree(s->domain);
- comdFree(s);
- *ps = NULL;
-
- return;
-}
-
-void initSubsystems(void)
-{
-#if REDIRECT_OUTPUT
- freopen("testOut.txt","w",screenOut);
-#endif
-
-#ifdef PRINT_YAML
- yamlBegin();
-#endif
-}
-
-void finalizeSubsystems(void)
-{
-#if REDIRECT_OUTPUT
- fclose(screenOut);
-#endif
-
-#ifdef PRINT_YAML
- yamlEnd();
-#endif
-}
-
-/// decide whether to get LJ or EAM potentials
-BasePotential* initPotential(
- int doeam, const char* potDir, const char* potName, const char* potType)
-{
- BasePotential* pot = NULL;
-
- if (doeam)
- pot = initEamPot(potDir, potName, potType);
- else
- pot = initLjPot();
- assert(pot);
- return pot;
-}
-
-SpeciesData* initSpecies(BasePotential* pot)
-{
- SpeciesData* species = comdMalloc(sizeof(SpeciesData));
-
- strcpy(species->name, pot->name);
- species->atomicNo = pot->atomicNo;
- species->mass = pot->mass;
-
- return species;
-}
-
-Validate* initValidate(SimFlat* sim)
-{
- sumAtoms(sim);
- Validate* val = comdMalloc(sizeof(Validate));
- val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
- val->nAtoms0 = sim->atoms->nGlobal;
-
- if (printRank())
- {
- fprintf(screenOut, "\n");
- printSeparator(screenOut);
- fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n",
- val->eTot0, val->nAtoms0);
- fprintf(screenOut, "\n");
- }
- return val;
-}
-
-void validateResult(const Validate* val, SimFlat* sim)
-{
- if (printRank())
- {
- real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
-
- int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0);
-
- fprintf(screenOut, "\n");
- fprintf(screenOut, "\n");
- fprintf(screenOut, "Simulation Validation:\n");
-
- fprintf(screenOut, " Initial energy : %14.12f\n", val->eTot0);
- fprintf(screenOut, " Final energy : %14.12f\n", eFinal);
- fprintf(screenOut, " eFinal/eInitial : %f\n", eFinal/val->eTot0);
- if ( nAtomsDelta == 0)
- {
- fprintf(screenOut, " Final atom count : %d, no atoms lost\n",
- sim->atoms->nGlobal);
- }
- else
- {
- fprintf(screenOut, "#############################\n");
- fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta);
- fprintf(screenOut, "#############################\n");
- }
- }
-}
-
-void sumAtoms(SimFlat* s)
-{
-   // sum atoms across all processors
- s->atoms->nLocal = 0;
- for (int i = 0; i < s->boxes->nLocalBoxes; i++)
- {
- s->atoms->nLocal += s->boxes->nAtoms[i];
- }
-
- startTimer(commReduceTimer);
- addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1);
- stopTimer(commReduceTimer);
-}
-
-/// Prints the current time, energy, performance, etc. to monitor the
-/// state of the running simulation. Performance per atom is scaled by
-/// the number of local atoms per process; this should give consistent
-/// timing assuming reasonable load balance.
-void printThings(SimFlat* s, int iStep, double elapsedTime)
-{
-   // keep track of the previous value of iStep so we can calculate the number of steps.
- static int iStepPrev = -1;
- static int firstCall = 1;
-
- int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step.
- iStepPrev = iStep;
-
- if (! printRank() )
- return;
-
- if (firstCall)
- {
- firstCall = 0;
- fprintf(screenOut,
- "# Performance\n"
- "# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms\n");
- fflush(screenOut);
- }
-
- real_t time = iStep*s->dt;
- real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal;
- real_t eK = s->eKinetic / s->atoms->nGlobal;
- real_t eU = s->ePotential / s->atoms->nGlobal;
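-   // Temperature from equipartition: eKinetic per atom = (3/2) kB T, so T = (eK/nGlobal) / (1.5 kB).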
- real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5);
-
- double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal);
-#ifndef PRINT_TIMING
- timePerAtom = 0.0;
-#endif
-
- fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n",
- iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal);
-}
-
-/// Print information about the simulation in a format that is (mostly)
-/// YAML compliant.
-void printSimulationDataYaml(FILE* file, SimFlat* s)
-{
- // All ranks get maxOccupancy
- int maxOcc = maxOccupancy(s->boxes);
-
- // Only rank 0 prints
- if (! printRank())
- return;
-
- fprintf(file,"Simulation data: \n");
- fprintf(file," Total atoms : %d\n",
- s->atoms->nGlobal);
- fprintf(file," Min global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
- s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]);
- fprintf(file," Max global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
- s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]);
- printSeparator(file);
- fprintf(file,"Decomposition data: \n");
- fprintf(file," Processors : %6d,%6d,%6d\n",
- s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]);
- fprintf(file," Local boxes : %6d,%6d,%6d = %8d\n",
- s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2],
- s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]);
- fprintf(file," Box size : [ %14.10f, %14.10f, %14.10f ]\n",
- s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]);
- fprintf(file," Box factor : [ %14.10f, %14.10f, %14.10f ] \n",
- s->boxes->boxSize[0]/s->pot->cutoff,
- s->boxes->boxSize[1]/s->pot->cutoff,
- s->boxes->boxSize[2]/s->pot->cutoff);
- fprintf(file, " Max Link Cell Occupancy: %d of %d\n",
- maxOcc, MAXATOMS);
- printSeparator(file);
- fprintf(file,"Potential data: \n");
- s->pot->print(file, s->pot);
-
- // Memory footprint diagnostics
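-   // perAtomSize assumes 10 real_t per atom (e.g. three 3-vectors such as
-   // positions, momenta, and forces, plus a scalar energy) and 2 ints
-   // (e.g. a global id and a species index); see the atom arrays for the
-   // authoritative layout.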
- int perAtomSize = 10*sizeof(real_t)+2*sizeof(int);
-   float mbPerAtom = (float)perAtomSize/1024/1024;
- float totalMemLocal = (float)(perAtomSize*s->atoms->nLocal)/1024/1024;
- float totalMemGlobal = (float)(perAtomSize*s->atoms->nGlobal)/1024/1024;
-
- int nLocalBoxes = s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2];
- int nTotalBoxes = (s->boxes->gridSize[0]+2)*(s->boxes->gridSize[1]+2)*(s->boxes->gridSize[2]+2);
- float paddedMemLocal = (float) nLocalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
- float paddedMemTotal = (float) nTotalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
-
- printSeparator(file);
- fprintf(file,"Memory data: \n");
- fprintf(file, " Intrinsic atom footprint = %4d B/atom \n", perAtomSize);
- fprintf(file, " Total atom footprint = %7.3f MB (%6.2f MB/node)\n", totalMemGlobal, totalMemLocal);
- fprintf(file, " Link cell atom footprint = %7.3f MB/node\n", paddedMemLocal);
- fprintf(file, " Link cell atom footprint = %7.3f MB/node (including halo cell data\n", paddedMemTotal);
-
- fflush(file);
-}
-
-/// Check that the user input meets certain criteria.
-void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8])
-{
- int failCode = 0;
-
- // Check that domain grid matches number of ranks. (fail code 1)
- int nProcs = cmd.xproc * cmd.yproc * cmd.zproc;
- if (nProcs != getNRanks())
- {
- failCode |= 1;
- if (printRank() )
- fprintf(screenOut,
- "\nNumber of MPI ranks must match xproc * yproc * zproc\n");
- }
-
-   // Check whether simulation is too small (fail code 2)
- double minx = 2*cutoff*cmd.xproc;
- double miny = 2*cutoff*cmd.yproc;
- double minz = 2*cutoff*cmd.zproc;
- double sizex = cmd.nx*latticeConst;
- double sizey = cmd.ny*latticeConst;
- double sizez = cmd.nz*latticeConst;
-
- if ( sizex < minx || sizey < miny || sizez < minz)
- {
- failCode |= 2;
- if (printRank())
- fprintf(screenOut,"\nSimulation too small.\n"
- " Increase the number of unit cells to make the simulation\n"
-                 "  at least (%3.2f, %3.2f, %3.2f) Angstroms in size\n",
- minx, miny, minz);
- }
-
- // Check for supported lattice structure (fail code 4)
- if (strcasecmp(latticeType, "FCC") != 0)
- {
- failCode |= 4;
- if ( printRank() )
- fprintf(screenOut,
- "\nOnly FCC Lattice type supported, not %s. Fatal Error.\n",
- latticeType);
- }
- int checkCode = failCode;
- bcastParallel(&checkCode, sizeof(int), 0);
- // This assertion can only fail if different tasks failed different
- // sanity checks. That should not be possible.
- assert(checkCode == failCode);
-
- if (failCode != 0)
- exit(failCode);
-}
-
-// --------------------------------------------------------------
-
-
-/// \page pg_building_comd Building CoMD
-///
-/// CoMD is written with portability in mind and should compile using
-/// practically any compiler that implements the C99 standard. You will
-/// need to create a Makefile by copying the sample provided with the
-/// distribution (Makefile.vanilla).
-///
-/// $ cp Makefile.vanilla Makefile
-///
-/// and use the make command to build the code
-///
-/// $ make
-///
-/// The sample Makefile will compile the code on many platforms. See
-/// comments in Makefile.vanilla for information about specifying the
-/// name of the C compiler, and/or additional compiler switches that
-/// might be necessary for your platform.
-///
-/// The main options available in the Makefile are toggling single/double
-/// precision and enabling/disabling MPI. In the event MPI is not
-/// available, setting the DO_MPI flag to OFF will create a purely
-/// serial build (you will likely also need to change the setting of
-/// CC).
-///
-/// The makefile should handle all the dependency checking needed, via
-/// makedepend.
-///
-/// 'make clean' removes the object and dependency files.
-///
-/// 'make distclean' additionally removes the executable file and the
-/// documentation files.
-///
-/// Other build options
-/// -------------------
-///
-/// Various other options are made available by \#define arguments within
-/// some of the source files.
-///
-/// #REDIRECT_OUTPUT in CoMD.c
-///
-/// Setting this to 1 will redirect all screen output to a file,
-/// currently set to 'testOut.txt'.
-///
-/// #POT_SHIFT in ljForce.c
-///
-/// This is set to 1.0 by default, and shifts the values of the cohesive
-/// energy given by the Lennard-Jones potential so it is zero at the
-/// cutoff radius. This setting improves energy conservation
-/// step-to-step as it reduces the noise generated by atoms crossing the
-/// cutoff threshold. However, it does not affect the long-term energy
-/// conservation of the code.
-///
-/// #MAXATOMS in linkCells.h
-///
-/// The default value is 64, which allows ample padding of the linkCell
-/// structure to allow for density fluctuations. Reducing it may improve
-/// the efficiency of the code via improved thread utilization and
-/// reduced memory footprint.
-
-// --------------------------------------------------------------
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_measuring_performance Measuring Performance
-///
-/// CoMD implements a simple and extensible system of internal timers to
-/// measure the performance profile of the code. As explained in
-/// performanceTimers.c, it is easy to create additional timers and
-/// associate them with code regions of specific interest. In addition,
-/// the getTime() and getTick() functions can be easily reimplemented to
-/// take advantage of platform specific timing resources.
-///
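-/// As an illustration only (the authoritative definitions live in
-/// performanceTimers.c; this sketch merely assumes getTime() returns an
-/// integer tick count and getTick() returns the length of one tick in
-/// seconds), a POSIX clock_gettime() based replacement might look like:
-///
-/// ~~~~
-/// #include <stdint.h>
-/// #include <time.h>
-///
-/// // Monotonic wall-clock ticks; here one tick is one nanosecond.
-/// static uint64_t getTime(void)
-/// {
-///    struct timespec ts;
-///    clock_gettime(CLOCK_MONOTONIC, &ts);
-///    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
-/// }
-///
-/// // Seconds per tick, used to convert tick counts to seconds.
-/// static double getTick(void)
-/// {
-///    return 1.0e-9;
-/// }
-/// ~~~~
-///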
-/// A timing report is printed at the end of each simulation.
-///
-/// ~~~~
-/// Timings for Rank 0
-/// Timer # Calls Avg/Call (s) Total (s) % Loop
-/// ___________________________________________________________________
-/// total 1 50.6701 50.6701 100.04
-/// loop 1 50.6505 50.6505 100.00
-/// timestep 1 50.6505 50.6505 100.00
-/// position 10000 0.0000 0.0441 0.09
-/// velocity 20000 0.0000 0.0388 0.08
-/// redistribute 10001 0.0003 3.4842 6.88
-/// atomHalo 10001 0.0002 2.4577 4.85
-/// force 10001 0.0047 47.0856 92.96
-/// eamHalo 10001 0.0001 1.0592 2.09
-/// commHalo 60006 0.0000 1.7550 3.46
-/// commReduce 12 0.0000 0.0003 0.00
-///
-/// Timing Statistics Across 8 Ranks:
-/// Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s)
-/// _____________________________________________________________________________
-/// total 3: 50.6697 0: 50.6701 50.6699 0.0001
-/// loop 0: 50.6505 4: 50.6505 50.6505 0.0000
-/// timestep 0: 50.6505 4: 50.6505 50.6505 0.0000
-/// position 2: 0.0437 0: 0.0441 0.0439 0.0001
-/// velocity 2: 0.0380 4: 0.0392 0.0385 0.0004
-/// redistribute 0: 3.4842 1: 3.7085 3.6015 0.0622
-/// atomHalo 0: 2.4577 7: 2.6441 2.5780 0.0549
-/// force 1: 46.8624 0: 47.0856 46.9689 0.0619
-/// eamHalo 3: 0.2269 6: 1.2936 1.0951 0.3344
-/// commHalo 3: 1.0803 6: 2.1856 1.9363 0.3462
-/// commReduce 6: 0.0002 2: 0.0003 0.0003 0.0000
-///
-/// ---------------------------------------------------
-/// Average atom update rate: 9.39 us/atom/task
-/// ---------------------------------------------------
-///
-/// ~~~~
-/// This report consists of two blocks. The upper block lists the absolute
-/// wall clock time spent in each timer on rank 0 of the job. The lower
-/// block reports minimum, maximum, average, and standard deviation of
-/// times across all tasks.
-/// The ranks where the minimum and maximum values occurred are also reported
-/// to aid in identifying hotspots or load imbalances.
-///
-/// The last line of the report gives the atom update rate in
-/// microseconds/atom/task. Since this quantity is normalized by both
-/// the number of atoms and the number of tasks, it provides a simple
-/// figure of merit to compare performance between runs with different
-/// numbers of atoms and different numbers of tasks.  Any increase in
-/// this number, relative to a run with a large number of atoms on a
-/// single task, represents a loss of parallel efficiency.
-///
-/// Choosing the problem size correctly has important implications for the
-/// reported performance. Small problem sizes may run entirely in the cache
-/// of some architectures, leading to very good performance results.
-/// For general characterization of performance, it is probably best to
-/// choose problem sizes which force the code to access main memory, even
-/// though there may be strong scaling scenarios where the code is indeed
-/// running mainly in cache.
-///
-/// *** Architecture/Configuration for above timing numbers:
-/// SGI XE1300 cluster with dual-socket Intel quad-core Nehalem processors.
-/// Each node has 2 Quad-Core Xeon X5550 processors running at 2.66 GHz
-/// with 3 GB of memory per core.
-
-// --------------------------------------------------------------
-
-
-/// \page pg_problem_selection_and_scaling Problem Selection and Scaling
-///
-/// CoMD is a reference molecular dynamics simulation code as used in
-/// materials science.
-///
-/// Problem Specification {#sec_problem_spec}
-/// ======================
-///
-/// The reference problem is solid Copper starting from a face-centered
-/// cubic (FCC) lattice. The initial thermodynamic conditions
-/// (Temperature and Volume, via the lattice spacing, lat) can be specified
-/// from the command line input. The default is 600 K and standard
-/// volume (lat = 3.615 Angstroms).
-/// Different temperatures (e.g. T = 3000 K) and volumes can be
-/// specified to melt the system and enhance the interchange of atoms
-/// between domains.
-///
-/// The dynamics is micro-canonical (NVE = constant Number of atoms,
-/// constant total system Volume, and constant total system Energy). As
-/// a result, the temperature is not fixed. Rather, the temperature will
-/// adjust from the initial temperature (as specified on the command line)
-/// to a final temperature as the total system kinetic energy comes into
-/// equilibrium with the total system potential energy.
-///
-/// The total size of the problem (number of atoms) is specified by the
-/// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms
-/// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms.
-///
-/// The simulation models bulk copper by replicating the simulation cell
-/// in every direction using periodic boundary conditions.
-///
-/// Two interatomic force models are available: the Lennard-Jones (LJ)
-/// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM)
-/// potential (eam.c). The LJ potential is included for comparison and
-/// is a valid approximation for constant volume and uniform
-/// density. The EAM potential is a more accurate model of cohesion in
-/// simple metals like Copper and includes the energetics necessary to
-/// model non-uniform density and free surfaces.
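-///
-/// For reference, the two models take the standard textbook forms shown
-/// below (the parameter values and cutoffs come from the potential
-/// initialization and are not shown here): the Lennard-Jones pair energy
-/// and the EAM total energy are
-///
-/// ~~~~
-/// U_LJ  = \sum_{i<j} 4 \epsilon [ (\sigma/r_{ij})^{12} - (\sigma/r_{ij})^{6} ]
-///
-/// U_EAM = \sum_i F(\bar\rho_i) + (1/2) \sum_{i \ne j} \phi(r_{ij}),
-///         where \bar\rho_i = \sum_{j \ne i} \rho(r_{ij})
-/// ~~~~
-///
-/// Here F is the embedding function, \phi the pair interaction, and \rho
-/// the electron density contribution, all of which are tabulated in the
-/// EAM potential files.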
-///
-/// Scaling Studies in CoMD {#sec_scaling_studies}
-/// =======================
-///
-/// CoMD implements a simple geometric domain decomposition to divide
-/// the total problem space into domains, which are owned by MPI
-/// ranks. Each domain is a single-program multiple data (SPMD)
-/// partition of the larger problem.
-///
-/// Caution: When doing scaling studies, it is important to distinguish
-/// between the problem setup phase and the problem execution phase. Both
-/// are important to the workflow of doing molecular dynamics, but it
-/// is the execution phase we want to quantify in the scaling studies
-/// described below, since that phase dominates the execution time for
-/// long runs (millions of time steps). The problem setup can be an
-/// appreciable fraction of the execution time for short runs (the
-/// default is 100 time steps), which can lead to erroneous conclusions.
-///
-/// This code is configured with timers. The times are reported per
-/// particle, and the timers for the force calculation, timestep, etc.
-/// start after the initialization phase is done.
-///
-/// Weak Scaling {#ssec_weak_scaling}
-/// -----------
-///
-/// A weak scaling test fixes the amount of work per processor and
-/// compares the execution time as the number of processors grows. Weak scaling
-/// keeps the ratio of inter-processor communication (surface) to
-/// intra-processor work (volume) fixed. The amount of inter-processor
-/// work scales with the number of processors in the domain and O(1000)
-/// atoms per domain are needed for reasonable performance.
-///
-/// Examples,
-///
-/// - Increase in processor count by 8: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40)
-///
-/// - Increase in processor count by 2: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40)
-///
-/// In general, it is wise to keep the ratio of processor count to
-/// system size in each direction fixed (i.e. cubic domains): xproc_0 / nx_0 = xproc_1 /
-/// nx_1, since this minimizes surface area to volume.
-/// Feel free to experiment; you might learn something about
-/// algorithms to optimize communication relative to work.
-///
-/// Strong Scaling {#ssec_strong_scaling}
-/// ---------------
-///
-/// A strong scaling test fixes the total problem size and compares the
-/// execution time for different numbers of processors. Strong scaling
-/// increases the ratio of inter-processor communication (surface) to
-/// intra-processor work (volume).
-///
-/// Examples,
-///
-/// - Increase in processor count by 8: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20)
-///
-/// - Increase in processor count by 2: <br>
-/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20)
-///
-/// The domain decomposition requires O(1000) atoms per domain and
-/// begins to scale poorly for small numbers of atoms per domain.
-/// Again, feel free to experiment; you might learn something here as
-/// well. For example, when molecular dynamics codes were written for
-/// vector supercomputers, large lists of force pairs were created for
-/// the vector processor. These force lists provide a natural force
-/// decomposition for early parallel computers (Fast Parallel Algorithms
-/// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys,
-/// 117, 1-19 (1995).) Using replicated data, force decomposition can
-/// scale to fewer than one atom per processor and is a natural
-/// mechanism to exploit intra-processor parallelism.
-///
-/// For further details see for example:
-/// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_verifying_correctness Verifying Correctness
-///
-/// Verifying the correctness of an MD simulation is challenging.
-/// Because MD is Lyapunov unstable, any small errors, even harmless
-/// round-off errors, will lead to a long-term divergence in the atom
-/// trajectories. Hence, comparing atom positions at the end of a run
-/// is not always a useful verification technique. (Such divergences
-/// are not a problem for science applications of MD since they do not
-/// alter the statistical physics.) Small, single-particle errors can
-/// also be difficult to detect in system-wide quantities such as the
-/// kinetic or potential energy that are averaged over a large number of
-/// particles.
-///
-/// In spite of these challenges, there are several methods which are
-/// likely to catch significant errors.
-///
-/// Cohesive Energy {#sec_ver_cohesive_energy}
-/// ===============
-///
-/// With a perfect lattice as the initial structure (this is the
-/// default), the potential energy per atom is the cohesive energy.
-/// This value should be computed correctly to many decimal places. Any
-/// variation beyond the last 1 or 2 decimal places is cause for
-/// investigation. The correct values for the cohesive energy are
-///
-/// | Potential | Cohesive Energy |
-/// | :------------- | :-------------- |
-/// | Lennard-Jones | -1.243619295058 |
-/// | EAM (Adams) | -3.538079224691 |
-/// | EAM (Mishin) | -3.539999969176 |
-///
-/// The \link sec_command_line_options command
-/// line options \endlink documentation explains the switches used to
-/// select the potential used in the simulation.
-///
-/// Note that the cohesive energy calculation is not sensitive to errors
-/// in forces. It is also performed on a highly symmetric structure so
-/// there are many errors this will not catch. Still, it is a good
-/// first check.
-///
-/// Energy Conservation {#sec_ver_energy_conservation}
-/// ===================
-///
-/// A correctly implemented force kernel, with an appropriate time step
-/// (the default value of 1 fs is conservative for temperatures under
-/// 10,000K) will conserve total energy over long times to 5 or more
-/// digits. Any long term systematic drift in the total energy is a
-/// cause for concern.
-///
-/// To facilitate checking energy conservation CoMD prints the final and
-/// initial values of the total energy. When comparing these values, pay
-/// careful attention to these details:
-///
-/// - It is common to observe an initial transient change in the total
-/// energy. Differences in the total energy of 2-3% can be expected in
-/// the first 10-100 time steps.
-/// - The best way to check energy conservation is to run at least
-/// several thousand steps and look at the slope of the total energy
-/// ignoring at least the first one or two thousand steps. More steps
-/// are even better.
-/// - Set the temperature to at least several hundred K. This ensures
-/// that atoms will sample a large range of configurations and expose
-/// possible errors.
-/// - Fluctuations in the energy can make it difficult to tell if
-/// conservation is observed. Increasing the number of atoms will reduce
-/// the fluctuations.
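-///
-/// As a minimal, illustrative post-processing check (not part of CoMD
-/// itself; the function name and tolerance are only examples), one could
-/// compare per-atom total energies sampled at two steps well after the
-/// initial transient:
-///
-/// ~~~~
-/// #include <math.h>
-/// #include <stdio.h>
-///
-/// // Returns 1 if the relative drift between two sampled total energies
-/// // is within the tolerance (e.g. tol = 1.0e-5 over a few thousand steps).
-/// static int energyConserved(double eEarly, double eLate, double tol)
-/// {
-///    double drift = fabs(eLate - eEarly) / fabs(eEarly);
-///    printf("relative energy drift = %g\n", drift);
-///    return drift < tol;
-/// }
-/// ~~~~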
-///
-///
-/// Particle Conservation {#sec_ver_particle_conservation}
-/// =====================
-///
-/// The simulation should always end with the same number of particles
-/// it started with. Any change is a bug. CoMD checks the initial and
-/// final number of particles and prints a warning at the end of the
-/// simulation if they are not equal.
-///
-/// Reproducibility {#sec_ver_reproducibility}
-/// ===============
-///
-/// The same simulation run repeatedly on the same hardware should
-/// produce the same result. Because parallel computing can add
-/// elements of non-determinism, we do not expect perfect long term
-/// reproducibility; however, over a few hundred to a few thousand time
-/// steps the energies should not exhibit run-to-run differences outside
-/// the last 1 or 2 decimal places. Larger differences are a sign of
-/// trouble and should be investigated. This kind of test is
-/// practically the only way to detect race conditions in shared memory
-/// parallelism.
-///
-/// Portability {#sec_ver_portability}
-/// ===========
-///
-/// In our experience, simulations that start from the same initial
-/// condition tend to produce very similar trajectories over short terms
-/// (100 to 1000 time steps), even on different hardware platforms.
-/// Short term differences beyond the last 1 or 2 decimal places should
-/// likely be investigated.
-///
-/// General Principles {#sec_ver_general}
-/// =======================
-///
-/// - Simulations run at 0K are too trivial for verification; set
-/// the initial temperature to at least several hundred K.
-/// - Longer runs are better to check conservation. Compare
-/// energies after initial transients are damped out.
-/// - Larger runs are better to check conservation. Fluctuations in the
-/// energy are averaged out.
-/// - Short term (order 100 time steps) discrepancies from run-to-run
-///   or platform-to-platform beyond the last one or two decimal places
-///   are reason for concern.  A difference in the 4th or 5th decimal
-///   place is almost certainly a bug.
-/// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have
-/// questions about validation.
-///
-
-// --------------------------------------------------------------
-
-
-/// \page pg_comd_architecture CoMD Architecture
-///
-/// Program Flow {#sec_program_flow}
-/// ============
-///
-/// We have attempted to make the program flow in CoMD 1.1 as simple and
-/// transparent as possible. The main program consists of three blocks:
-/// prolog, main loop, and epilog.
-///
-/// Prolog {#ssec_flow_prolog}
-/// -------
-///
-/// The job of the prolog is to initialize the simulation and prepare
-/// for the main loop. Notable tasks in the prolog include calling
-/// - initParallel() to start MPI
-/// - parseCommandLine() to read the command line options
-/// - initSimulation() to initialize the main data structure, SimFlatSt.
-/// This includes tasks such as
-/// - initEamPot() to read tabular data for the potential function
-/// - initDecomposition() to set up the domain decomposition
-/// - createFccLattice() to generate an initial structure for the atoms
-/// - initValidate() to store initial data for a simple validation check
-///
-/// In CoMD 1.1 all atomic structures are internally generated so
-/// there is no need to read large files with atom coordinate data.
-///
-/// Main Loop {#ssec_flow_main_loop}
-/// ---------
-///
-/// The main loop calls
-/// - timestep(), the integrator to update particle positions,
-/// - printThings() to periodically print simulation information
-///
-/// The timestep() function is the heart of the code as it choreographs
-/// updating the particle positions, along with computing forces
-/// (computeForce()) and communicating atoms between ranks
-/// (redistributeAtoms()).
-///
-/// Epilog {#ssec_flow_epilog}
-/// -------
-///
-/// The epilog code handles end of run bookkeeping such as
-/// - validateResult() to check validation
-/// - printPerformanceResults() to print a performance summary
-/// - destroySimulation() to free memory
-///
-/// Key Data Structures {#sec_key_data_structures}
-/// ==================
-///
-/// Practically all data in CoMD belongs to the SimFlatSt structure.
-/// This includes:
-/// - BasePotentialSt A polymorphic structure for the potential model
-/// - HaloExchangeSt A polymorphic structure for communicating halo data
-///   (this C-style polymorphism via function pointers is sketched below)
-/// - DomainSt The parallel domain decomposition
-/// - LinkCellSt The link cells
-/// - AtomsSt The atom coordinates and velocities
-/// - SpeciesDataSt Properties of the atomic species being simulated.
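-///
-/// The "polymorphism" here is plain C with function pointers.  A minimal
-/// sketch of the pattern (illustrative names only; the real definitions
-/// are in CoMDTypes.h and include the members used in this file, such as
-/// cutoff, print, and destroy):
-///
-/// ~~~~
-/// #include <stdio.h>
-///
-/// typedef struct ExamplePotentialSt
-/// {
-///    double cutoff;                                  // interaction cutoff
-///    void (*print)(FILE* file, struct ExamplePotentialSt* pot);
-///    void (*destroy)(struct ExamplePotentialSt** pot);
-/// } ExamplePotential;
-///
-/// // Callers invoke behavior through the pointers, so the LJ and EAM
-/// // "subclasses" supply their own implementations:
-/// //    pot->print(file, pot);
-/// //    pot->destroy(&pot);
-/// ~~~~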
-///
-/// Consult the individual pages for each of these structures to learn
-/// more. The descriptions in haloExchange.c and initLinkCells() are
-/// especially useful to understand how the atoms are communicated
-/// between tasks and stored in link cells for fast pair finding.
-
-// --------------------------------------------------------------
-
-
-/// \page pg_optimization_targets Optimization Targets
-///
-/// Computation {#sec_computation}
-/// ============
-///
-/// The computational effort of classical MD is usually highly focused
-/// in the force kernel. The two force kernels supplied by CoMD are
-/// eamForce() and ljForce(). Both kernels are fundamentally loops over
-/// pairs of atoms with significant opportunity to exploit high levels
-/// of concurrency. One potential challenge when reordering or
-/// parallelizing the pair loop structure is preventing race conditions
-/// that result if two concurrent pair evaluations try to simultaneously
-/// increment the forces and energies on the same atom.
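-///
-/// As an illustration of the hazard (this is not CoMD's kernel; the array
-/// names, pair list, and OpenMP pragmas are assumptions made for the
-/// sketch), a naive parallel pair loop and one simple mitigation:
-///
-/// ~~~~
-/// // Each pair (i,j) updates both atoms, so two threads working on pairs
-/// // that share an atom can race on fx[i] / fx[j].
-/// #pragma omp parallel for
-/// for (int p = 0; p < nPairs; p++)
-/// {
-///    int i = pairI[p], j = pairJ[p];
-///    double f = pairForce(i, j);   // hypothetical pair force helper
-///    #pragma omp atomic
-///    fx[i] += f;                   // atomic updates avoid the race, at a cost
-///    #pragma omp atomic
-///    fx[j] -= f;
-/// }
-/// // Alternatives: per-thread force buffers reduced after the loop, or a
-/// // colored/blocked pair ordering so concurrent pairs never share an atom.
-/// ~~~~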
-///
-/// The supplied EAM kernel uses interpolation from tabular data to
-/// evaluate functions. Hence the interpolate() function is another
-/// potential optimization target. Note that the two potential files
-/// distributed with CoMD have very different sizes. The Adams
-/// potential (Cu_u6.eam) has 500 points per function in the table while
-/// the Mishin potential (Cu01.eam.alloy) has 10,000 points per
-/// function. This difference could potentially impact important
-/// details such as cache miss rates.
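-///
-/// The essential access pattern is a table lookup followed by a low-order
-/// polynomial evaluation.  A simplified linear-interpolation sketch on a
-/// uniformly spaced table (the actual interpolate() in eam.c may differ
-/// in interpolation order and boundary handling; this only shows the
-/// memory access pattern):
-///
-/// ~~~~
-/// // tbl holds n samples of f(x) at x0, x0+dx, ..., x0+(n-1)*dx.
-/// static double interpolateLinear(const double* tbl, int n,
-///                                 double x0, double dx, double x)
-/// {
-///    double t = (x - x0) / dx;
-///    int i = (int)t;
-///    if (i < 0) i = 0;              // clamp to the first interval
-///    if (i > n - 2) i = n - 2;      // clamp to the last interval
-///    double frac = t - i;
-///    return tbl[i]*(1.0 - frac) + tbl[i+1]*frac;
-/// }
-/// ~~~~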
-///
-/// Communication {#sec_communication}
-/// =============
-///
-/// As the number of atoms per MPI rank decreases, the communication
-/// routines will start to require a significant fraction of the
-/// run time. The main communication routine in CoMD is haloExchange().
-/// The halo exchange is simple nearest neighbor, point-to-point
-/// communication so it should scale well to practically any number of
-/// nodes.
-///
-/// The halo exchange in CoMD 1.1 is a very simple 6-direction
-/// structured halo exchange (see haloExchange.c). Other exchange
-/// algorithms can be implemented without much difficulty.
-///
-/// The halo exchange function is called in two very different contexts.
-/// The main usage is to exchange halo particle information (see
-/// initAtomHaloExchange()). This process is coordinated by the
-/// redistributeAtoms() function.
-///
-/// In addition to the atom exchange, when using the EAM potential, a
-/// halo exchange is performed in the force routine (see
-/// initForceHaloExchange()).
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_whats_new New Features and Changes in CoMD 1.1
-///
-/// The main goals of the 1.1 release were to add support for MPI and to
-/// improve the structure and clarity of the code. Achieving these
-/// goals required considerable changes compared to the 1.0 release.
-/// However, the core structure of the most computationally intensive
-/// kernels (the force routines) is mostly unchanged. We believe that
-/// lessons learned from optimizing 1.0 force kernels to specific
-/// hardware or programming models can be quickly transferred to kernels
-/// in the 1.1 release.
-///
-/// Significant changes in CoMD 1.1 include:
-///
-/// - MPI support. Both MPI and single node serial executables can be
-/// built from the same source files.
-///
-/// - Improved modularity and code clarity. Major data structures are
-/// now organized with their own structs and initialization routines.
-///
-/// - The build system has been simplified to use only standard
-/// Makefiles instead of CMake.
-///
-/// - The halo exchange operation needed to communicate remote particle
-/// data between MPI ranks also creates "image" particles in the
-/// serial build.
-///
-/// - Unified force kernels for both serial and MPI builds.
-///
-/// - The addition of remote/image atoms allows periodic boundary
-/// conditions to be handled outside the force loop.
-///
-/// - An additional communication/data copy step to handle electron
-/// density on remote/image atoms has been added to the EAM force
-/// loop.
-///
-/// - The coordinate system has been simplified to a single global
-/// coordinate system for all particles.
-///
-/// - Evaluation of energies and forces using Chebyshev polynomial
-///   fits has been removed.  Polynomial approximation of energies and
-/// forces will return in a future CoMD version.
-///
-/// - Atomic structures are now generated internally, eliminating the
-/// requirement to read, write, and distribute large atom
-/// configuration files. Arbitrarily large initial structures can
-/// be generated with specified initial temperature and random
-/// displacements from lattice positions. Code to read/write atomic
-/// positions has been removed.
-///
-/// - EAM potentials are now read from standard funcfl and setfl format
-/// files. Voter style files are no longer supported.
-///
-/// - Collection of performance metrics is significantly improved.
-/// Developers can easily add new timers to regions of interest. The
-/// system is also designed to allow easy integration with platform
-/// specific API's to high resolution timers, cycle counters,
-/// hardware counters, etc.
-///
-///
-/// - Hooks to in-situ analysis and visualization have been removed.
-/// In-situ analysis capabilities will return in a future CoMD release.
-///
-/// Please contact the CoMD developers (exmatex-comd@llnl.gov) if
-/// any of the deleted features negatively impacts your work.  We
-/// may be able to help produce a custom version that includes the code
-/// you need.
-
-
-// --------------------------------------------------------------
-
-
-/// \page pg_md_basics MD Basics
-///
-/// The molecular dynamics (MD) computer simulation method is a well
-/// established and important tool for the study of the dynamical
-/// properties of liquids, solids, and other systems of interest in
-/// Materials Science and Engineering, Chemistry and Biology. A material
-/// is represented in terms of atoms and molecules. The method of MD
-/// simulation involves the evaluation of the force acting on each atom
-/// due to all other atoms in the system and the numerical integration
-/// of the Newtonian equations of motion. Though MD was initially
-/// developed to compute the equilibrium thermodynamic behavior of
-/// materials (equation of state), most recent applications have used MD
-/// to study non-equilibrium processes.
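-///
-/// In symbols, each atom i obeys Newton's second law, and an explicit
-/// integrator advances positions and velocities step by step.  The
-/// velocity Verlet scheme shown here is a common choice (see timestep.c
-/// for the update this code actually uses):
-///
-/// ~~~~
-/// m_i \ddot{r}_i = F_i = -\nabla_i U(r_1, \dots, r_N)
-///
-/// v_i(t + \Delta t/2) = v_i(t)                + (\Delta t/2) F_i(t) / m_i
-/// r_i(t + \Delta t)   = r_i(t)                +  \Delta t    v_i(t + \Delta t/2)
-/// v_i(t + \Delta t)   = v_i(t + \Delta t/2)   + (\Delta t/2) F_i(t + \Delta t) / m_i
-/// ~~~~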
-///
-/// Wikipedia offers a basic introduction to molecular dynamics with
-/// many references:
-///
-/// http://en.wikipedia.org/wiki/Molecular_dynamics
-///
-/// For a thorough treatment of MD methods, see:
-/// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley
-/// (Oxford, 1989)
-/// ISBN-10: 0198556454 | ISBN-13: 978-0198556459.
-///
-/// For an understanding of MD simulations and application to statistical mechanics:
-/// - "Understanding Molecular Simulation, Second Edition: From Algorithms
-/// to Applications," by D. Frenkel and B. Smit (Academic Press, 2001)
-/// ISBN-10: 0122673514 | ISBN-13: 978-0122673511
-/// - "Statistical and Thermal Physics: With Computer Applications," by
-/// H. Gould and J. Tobochnik (Princeton, 2010)
-/// ISBN-10: 0691137447 | ISBN-13: 978-0691137445
-///
-/// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the
-/// Embedded Atom Method Potential (eam.c).
-///