5.6. Executing Multicore / Multithreaded ComputeUnits

5.6.1. Multithreaded Applications

5.6.2. MPI Applications

To define an MPI ComputeUnit, set the `cores` and `mpi` attributes on the ComputeUnitDescription:

pdesc = radical.pilot.ComputeUnitDescription()
[...]
pdesc.mpi      = True
pdesc.cores    = 32
import os
import sys
import radical.pilot as rp

# READ: The RADICAL-Pilot documentation: 
#   http://radicalpilot.readthedocs.org/en/latest
#
# Try running this example with RADICAL_PILOT_VERBOSE=debug set if 
# you want to see what happens behind the scenes!


#------------------------------------------------------------------------------
#
def pilot_state_cb (pilot, state) :
    """Callback invoked on every ComputePilot state change.

    Prints the pilot's uid and new state; aborts the whole script with
    exit status 1 if the pilot enters the FAILED state.
    """

    # Single-argument parenthesized print is valid (and identical) under
    # both Python 2 and Python 3, so the example works with either.
    print("[Callback]: ComputePilot '%s' state: %s." % (pilot.uid, state))

    if  state == rp.FAILED :
        sys.exit (1)


#------------------------------------------------------------------------------
#
def unit_state_cb (unit, state) :
    """Callback invoked on every ComputeUnit state change.

    Prints the unit's uid and new state; aborts the whole script with
    exit status 1 if the unit enters the FAILED state.
    """

    # Single-argument parenthesized print is valid (and identical) under
    # both Python 2 and Python 3, so the example works with either.
    print("[Callback]: ComputeUnit  '%s' state: %s." % (unit.uid, state))

    if  state == rp.FAILED :
        sys.exit (1)


# ------------------------------------------------------------------------------
#
if __name__ == "__main__":

    # Create a new session. A session is the 'root' object for all other
    # RADICAL-Pilot objects. It encapsulates the MongoDB connection(s).
    session = rp.Session()

    # Close the session even when pilot/unit submission raises, so the
    # MongoDB connection(s) are not leaked on failure.
    try:
        # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
        pmgr = rp.PilotManager(session=session)

        # Register our callback with the PilotManager. This callback will get
        # called every time any of the pilots managed by the PilotManager
        # change their state.
        pmgr.register_callback(pilot_state_cb)

        # Define an X-core pilot on stampede that runs for N minutes and
        # uses $HOME/radical.pilot.sandbox as sandbox directory.
        pdesc = rp.ComputePilotDescription()
        pdesc.resource = "xsede.stampede"
        pdesc.runtime  = 15 # N minutes
        pdesc.cores    = 16 # X cores
        pdesc.project  = "TG-MCB090174"

        # Launch the pilot.
        pilot = pmgr.submit_pilots(pdesc)

        cud_list = []

        for unit_count in range(0, 4):
            cu = rp.ComputeUnitDescription()
            cu.pre_exec      = ["module load python intel mvapich2 mpi4py"]
            cu.executable    = "python"
            cu.arguments     = ["helloworld_mpi.py"]
            cu.input_staging = ["helloworld_mpi.py"]

            # These two parameters are relevant to MPI execution:
            #   'cores' sets the number of cores required by the task
            #   'mpi' identifies the task as an MPI task
            cu.cores         = 8
            cu.mpi           = True

            cud_list.append(cu)

        # Combine the ComputePilot, the ComputeUnits and a scheduler via
        # a UnitManager object.
        umgr = rp.UnitManager(
            session=session,
            scheduler=rp.SCHED_DIRECT_SUBMISSION)

        # Register our callback with the UnitManager. This callback will get
        # called every time any of the units managed by the UnitManager
        # change their state.
        umgr.register_callback(unit_state_cb)

        # Add the previously created ComputePilot to the UnitManager.
        umgr.add_pilots(pilot)

        # Submit the previously created ComputeUnit descriptions to the
        # PilotManager. This will trigger the selected scheduler to start
        # assigning ComputeUnits to the ComputePilots.
        units = umgr.submit_units(cud_list)

        # Wait for all compute units to reach a terminal state (DONE or FAILED).
        umgr.wait_units()

        # submit_units() returns a single object for a single description;
        # normalize to a list before reporting.
        if not isinstance(units, list):
            units = [units]
        for unit in units:
            # Single-argument parenthesized print works under both
            # Python 2 and Python 3.
            print("* Task %s - state: %s, exit code: %s, started: %s, "
                  "finished: %s, stdout: %s"
                  % (unit.uid, unit.state, unit.exit_code,
                     unit.start_time, unit.stop_time, unit.stdout))

    finally:
        session.close()

# ------------------------------------------------------------------------------
This example uses the following simple MPI4Py program as its MPI executable
(MPI4Py must be installed on the remote cluster):
#!/usr/bin/env python

# This is an example MPI4Py program that is used
# by different examples and tests.

import sys
import time
import traceback
from   mpi4py import MPI

try :
    print("start")
    SLEEP = 10
    name  = MPI.Get_processor_name()
    comm  = MPI.COMM_WORLD

    # Report this rank's position in the communicator and its host name.
    print("mpi rank %d/%d/%s" % (comm.rank, comm.size, name))

    time.sleep(SLEEP)

    comm.Barrier()   # wait for everybody to synchronize here

except Exception as e :
    traceback.print_exc ()
    # BUG FIX: the original formatted the undefined name 's', which raised
    # a NameError inside the handler; format the caught exception 'e'.
    print("error : %s" % e)
    sys.exit (1)

finally :
    # NOTE: no sys.exit(0) here. A sys.exit() in 'finally' would replace
    # the SystemExit(1) raised by the except path, so failures would
    # (wrongly) report exit status 0. Normal fall-through exits with 0.
    print("done")