Source code for radical.pilot.unit_manager

    #pylint: disable=C0301, C0103, W0212

"""
.. module:: radical.pilot.unit_manager
   :platform: Unix
   :synopsis: Implementation of the UnitManager class.

.. moduleauthor:: Ole Weidner <ole.weidner@rutgers.edu>
"""

__copyright__ = "Copyright 2013-2014, http://radical.rutgers.edu"
__license__ = "MIT"

import os
import time
import weakref

from radical.pilot.compute_unit import ComputeUnit
from radical.pilot.utils.logger import logger

from radical.pilot.controller   import UnitManagerController
from radical.pilot.scheduler    import get_scheduler

from radical.pilot.types        import *
from radical.pilot.states       import *
from radical.pilot.exceptions   import *

# -----------------------------------------------------------------------------
#
[docs]class UnitManager(object):
    """A UnitManager manages :class:`radical.pilot.ComputeUnit` instances which
    represent the **executable** workload in RADICAL-Pilot. A UnitManager connects
    the ComputeUnits with one or more :class:`Pilot` instances (which represent
    the workload **executors** in RADICAL-Pilot) and a **scheduler** which
    determines which :class:`ComputeUnit` gets executed on which
    :class:`Pilot`.

    Each UnitManager has a unique identifier :data:`radical.pilot.UnitManager.uid`
    that can be used to re-connect to previoulsy created UnitManager in a
    given :class:`radical.pilot.Session`.

    **Example**::

        s = radical.pilot.Session(database_url=DBURL)

        pm = radical.pilot.PilotManager(session=s)

        pd = radical.pilot.ComputePilotDescription()
        pd.resource = "futuregrid.alamo"
        pd.cores = 16

        p1 = pm.submit_pilots(pd) # create first pilot with 16 cores
        p2 = pm.submit_pilots(pd) # create second pilot with 16 cores

        # Create a workload of 128 '/bin/sleep' compute units
        compute_units = []
        for unit_count in range(0, 128):
            cu = radical.pilot.ComputeUnitDescription()
            cu.executable = "/bin/sleep"
            cu.arguments = ['60']
            compute_units.append(cu)

        # Combine the two pilots, the workload and a scheduler via
        # a UnitManager.
        um = radical.pilot.UnitManager(session=session,
                                   scheduler=radical.pilot.SCHED_ROUND_ROBIN)
        um.add_pilot(p1)
        um.submit_units(compute_units)
    """

    # -------------------------------------------------------------------------
    #
[docs]    def __init__(self, session, scheduler=None, input_transfer_workers=2,
                 output_transfer_workers=2, _reconnect=False):
        """Creates a new UnitManager and attaches it to the session.

        **Args:**

            * session (`string`): The session instance to use.

            * scheduler (`string`): The name of the scheduler plug-in to use.

            * input_transfer_workers (`int`): The number of input file transfer 
              worker processes to launch in the background. 

            * output_transfer_workers (`int`): The number of output file transfer 
              worker processes to launch in the background. 

        .. note:: `input_transfer_workers` and `output_transfer_workers` can be
                  used to tune RADICAL-Pilot's file transfer performance. 
                  However, you should only change the default values if you 
                  know what you are doing.

        **Raises:**
            * :class:`radical.pilot.PilotException`
        """
        self._session = session
        self._worker  = None 
        self._pilots  = list()

        # keep track of some changing metrics
        self.wait_queue_size = 0

        if _reconnect is False:
            # Start a worker process fo this UnitManager instance. The worker
            # process encapsulates database access et al.
            self._worker = UnitManagerController(
                unit_manager_uid=None, 
                scheduler=scheduler,
                input_transfer_workers=input_transfer_workers,
                output_transfer_workers=output_transfer_workers, 
                session=self._session,
                db_connection=session._dbs,
                db_connection_info=session._connection_info)
            self._worker.start()

            self._uid = self._worker.unit_manager_uid
            self._scheduler = get_scheduler(name=scheduler, 
                                            manager=self, 
                                            session=self._session)

            # Each unit manager has a worker thread associated with it.
            # The task of the worker thread is to check and update the state
            # of units, fire callbacks and so on.
            self._session._unit_manager_objects.append(self)
            self._session._process_registry.register(self._uid, self._worker)

        else:
            # re-connect. do nothing
            pass


    #--------------------------------------------------------------------------
    #
[docs]    def close(self):
        """Shuts down the UnitManager and its background workers in a 
        coordinated fashion.
        """
        if not self._uid:
            logger.warning("UnitManager object already closed.")
            return

        if self._worker is not None:
            self._worker.stop()
            # Remove worker from registry
            self._session._process_registry.remove(self._uid)

        logger.info("Closed UnitManager %s." % str(self._uid))
        self._uid = None

    #--------------------------------------------------------------------------
    #
    @classmethod
    def _reconnect(cls, session, unit_manager_id):
        """PRIVATE: Reconnect to an existing UnitManager.
        """
        uid_exists = UnitManagerController.uid_exists(
            db_connection=session._dbs,
            unit_manager_uid=unit_manager_id)

        if not uid_exists:
            raise BadParameter(
                "UnitManager with id '%s' not in database." % unit_manager_id)

        # The UnitManager object
        obj = cls(session=session, scheduler=None, _reconnect=True)

        # Retrieve or start a worker process fo this PilotManager instance.
        worker = session._process_registry.retrieve(unit_manager_id)
        if worker is not None:
            obj._worker = worker
        else:
            obj._worker = UnitManagerController(
                unit_manager_uid=unit_manager_id,
                session=session,
                db_connection=session._dbs,
                db_connection_info=session._connection_info)
            session._process_registry.register(unit_manager_id, obj._worker)

        # start the worker if it's not already running
        if obj._worker.is_alive() is False:
            obj._worker.start()

        # Now that the worker is running (again), we can get more information
        # about the UnitManager
        um_data = obj._worker.get_unit_manager_data()

        obj._scheduler = get_scheduler(name=um_data['scheduler'], 
                                       manager=obj,
                                       session=obj._session)
        # FIXME: we need to tell the scheduler about all the pilots...

        obj._uid = unit_manager_id

        logger.info("Reconnected to existing UnitManager %s." % str(obj))
        return obj

    # -------------------------------------------------------------------------
    #
[docs]    def as_dict(self):
        """Returns a Python dictionary representation of the UnitManager
        object.
        """
        obj_dict = {
            'uid':               self.uid,
            'scheduler':         self.scheduler,
            'scheduler_details': self.scheduler_details
        }
        return obj_dict

    # -------------------------------------------------------------------------
    #
    def __str__(self):
        """Returns a string representation of the UnitManager object.
        """
        return str(self.as_dict())

    #--------------------------------------------------------------------------
    #
    @property
    def uid(self):
        """Returns the unique id.
        """
        return self._uid

    #--------------------------------------------------------------------------
    #
    @property
    def scheduler(self):
        """Returns the scheduler name.
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return self._scheduler.name

    #--------------------------------------------------------------------------
    #
    @property
    def scheduler_details(self):
        """Returns the scheduler logs.
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return "NO SCHEDULER DETAILS (Not Implemented)"

    # -------------------------------------------------------------------------
    #
[docs]    def add_pilots(self, pilots):
        """Associates one or more pilots with the unit manager.

        **Arguments:**

            * **pilots** [:class:`radical.pilot.ComputePilot` or list of
              :class:`radical.pilot.ComputePilot`]: The pilot objects that will be
              added to the unit manager.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(pilots, list):
            pilots = [pilots]

        pilot_ids = self.list_pilots()

        for pilot in pilots :
            if  pilot.uid in pilot_ids :
                logger.warning ('adding the same pilot twice (%s)' % pilot.uid)

        self._worker.add_pilots(pilots)

        # let the scheduler know...
        for pilot in pilots :
            self._scheduler.pilot_added (pilot)

        # also keep the instances around
        for pilot in pilots :
            self._pilots.append (pilot)


    # -------------------------------------------------------------------------
    #
[docs]    def list_pilots(self):
        """Lists the UIDs of the pilots currently associated with
        the unit manager.

        **Returns:**

              * A list of :class:`radical.pilot.ComputePilot` UIDs [`string`].

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return self._worker.get_pilot_uids()


    # -------------------------------------------------------------------------
    #
[docs]    def get_pilots(self):
        """get the pilots instances currently associated with
        the unit manager.

        **Returns:**

              * A list of :class:`radical.pilot.ComputePilot` instances.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return self._pilots

    # -------------------------------------------------------------------------
    #
[docs]    def remove_pilots(self, pilot_ids, drain=True):
        """Disassociates one or more pilots from the unit manager.

        TODO: Implement 'drain'.

        After a pilot has been removed from a unit manager, it won't process
        any of the unit manager's units anymore. Calling `remove_pilots`
        doesn't stop the pilot itself.

        **Arguments:**

            * **drain** [`boolean`]: Drain determines what happens to the units
              which are managed by the removed pilot(s). If `True`, all units
              currently assigned to the pilot are allowed to finish execution.
              If `False` (the default), then `ACTIVE` units will be canceled.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(pilot_ids, list):
            pilot_ids = [pilot_ids]

        self._worker.remove_pilots(pilot_ids)


        # FIXME:
        # if a pilot gets removed, we need to re-assign all its units to other
        # pilots.   We thus move them all into the wait queue, and call the
        # global rescheduler.  Some of the CUs might already be in final state
        # -- those are ignored (they'll be in the done_queue anyways).  We leave
        # it to the scheduling policy what happens to non-NEW CUs in the
        # wait_queue, i.e. if they get rescheduled, or if they'll raise an
        # error.

        # let the scheduler know...
        for pilot_id in pilot_ids :
            self._scheduler.pilot_removed (pilot_id)

        # update instance list
        for pilot_id in pilot_ids :
            for pilot in self._pilots[:] :
                if  pilot_id == pilot.uid :
                    self._pilots.remove (pilot)

    # -------------------------------------------------------------------------
    #
[docs]    def list_units(self):
        """Returns the UIDs of the :class:`radical.pilot.ComputeUnit` managed by
        this unit manager.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` UIDs [`string`].

        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return self._worker.get_compute_unit_uids()


    # -------------------------------------------------------------------------
    #
[docs]    def submit_units(self, unit_descriptions):
        """Submits on or more :class:`radical.pilot.ComputeUnit` instances to the
        unit manager.

        **Arguments:**

            * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` objects.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """

        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return_list_type = True
        if not isinstance(unit_descriptions, list):
            return_list_type  = False
            unit_descriptions = [unit_descriptions]

        # we return a list of compute units
        ret = list()

        # the scheduler will return a dictionary of the form:
        #   {
        #     ud_1 : pilot_id_a,
        #     ud_2 : pilot_id_b
        #     ...
        #   }
        #
        # The scheduler may not be able to schedule some units - those will
        # have 'None' as pilot ID.

        units = list()
        for ud in unit_descriptions :

            units.append (ComputeUnit._create (unit_description=ud,
                                               unit_manager_obj=self, 
                                               local_state=NEW))

        self._worker.publish_compute_units (units=units)

        schedule = None
        try:
            schedule = self._scheduler.schedule (units=units)
       
        except Exception as e:
            logger.exception ("Internal error - unit scheduler failed")
            raise 

        self.handle_schedule (schedule)

        if  return_list_type :
            return units
        else :
            return units[0]


    # -------------------------------------------------------------------------
    #
    def handle_schedule (self, schedule) :

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.
        
        if  not schedule :
            logger.debug ('skipping empty unit schedule')
            return

      # print 'handle schedule:'
      # import pprint
      # pprint.pprint (schedule)
      #
        pilot_cu_map = dict()
        unscheduled  = list()

        pilot_ids = self.list_pilots ()

        for unit in schedule['units'].keys() :

            pid = schedule['units'][unit]

            if  None == pid :
                unscheduled.append (unit)
                continue

            else :

                if  pid not in pilot_ids :
                    raise RuntimeError ("schedule points to unknown pilot %s" % pid)

                if  pid not in pilot_cu_map :
                    pilot_cu_map[pid] = list()

                pilot_cu_map[pid].append (unit)


        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in pilot_cu_map[pid] :

                if  not pid in schedule['pilots'] :
                    # lost pilot, do not schedule unit
                    logger.warn ("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                    continue

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/unit-" + str(unit.uid)

                ud = unit.description

                if  'kernel' in ud and ud['kernel'] :

                    try :
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex :
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                              "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state (unit._uid, FAILED, 
                                ["kernel expansion failed"])
                        continue

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd           = MDTaskDescription ()
                    mdtd.kernel    = ud.kernel
                    mdtd_bound     = mdtd.bind (resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec    = mdtd_bound.pre_exec
                    ud.executable  = mdtd_bound.executable
                    ud.mpi         = mdtd_bound.mpi


                units_to_schedule.append (unit)

            if  len(units_to_schedule) :
                self._worker.schedule_compute_units (pilot_uid=pid,
                                                     units=units_to_schedule)


        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if  old_wait_queue_size != self.wait_queue_size :
            self._worker.fire_manager_callback (WAIT_QUEUE_SIZE, self,
                                                self.wait_queue_size)

        if  len(unscheduled) :
            self._worker.unschedule_compute_units (units=unscheduled)

        logger.info ('%s units remain unscheduled' % len(unscheduled))


    # -------------------------------------------------------------------------
    #
[docs]    def get_units(self, unit_ids=None):
        """Returns one or more compute units identified by their IDs.

        **Arguments:**

            * **unit_ids** [`string` or `list of strings`]: The IDs of the
              compute unit objects to return.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` objects.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        return_list_type = True
        if (not isinstance(unit_ids, list)) and (unit_ids is not None):
            return_list_type = False
            unit_ids = [unit_ids]

        units = ComputeUnit._get(unit_ids=unit_ids, unit_manager_obj=self)

        if  return_list_type :
            return units
        else :
            return units[0]

    # -------------------------------------------------------------------------
    #
[docs]    def wait_units(self, unit_ids=None,
                   state=[DONE, FAILED, CANCELED],
                   timeout=None):
        """Returns when one or more :class:`radical.pilot.ComputeUnits` reach a
        specific state.

        If `unit_uids` is `None`, `wait_units` returns when **all**
        ComputeUnits reach the state defined in `state`.

        **Example**::

            # TODO -- add example

        **Arguments:**

            * **unit_uids** [`string` or `list of strings`]
              If unit_uids is set, only the ComputeUnits with the specified
              uids are considered. If unit_uids is `None` (default), all
              ComputeUnits are considered.

            * **state** [`string`]
              The state that ComputeUnits have to reach in order for the call
              to return.

              By default `wait_units` waits for the ComputeUnits to
              reach a terminal state, which can be one of the following:

              * :data:`radical.pilot.DONE`
              * :data:`radical.pilot.FAILED`
              * :data:`radical.pilot.CANCELED`

            * **timeout** [`float`]
              Timeout in seconds before the call returns regardless of Pilot
              state changes. The default value **None** waits forever.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if  not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if not isinstance(state, list):
            state = [state]

        return_list_type = True
        if (not isinstance(unit_ids, list)) and (unit_ids is not None):
            return_list_type = False
            unit_ids = [unit_ids]

        units  = self.get_units (unit_ids)
        start  = time.time()
        all_ok = False
        states = list()

        while not all_ok :

            all_ok = True
            states = list()

            for unit in units :
                if  unit.state not in state :
                    all_ok = False

                states.append (unit.state)

            # check timeout
            if  (None != timeout) and (timeout <= (time.time() - start)):
                if  not all_ok :
                    logger.debug ("wait timed out: %s" % states)
                break

            # sleep a little if this cycle was idle
            if  not all_ok :
                time.sleep (0.1)

        # done waiting
        if  return_list_type :
            return states
        else :
            return states[0]


    # -------------------------------------------------------------------------
    #
[docs]    def cancel_units(self, unit_ids=None):
        """Cancel one or more :class:`radical.pilot.ComputeUnits`.

        **Arguments:**

            * **unit_ids** [`string` or `list of strings`]: The IDs of the
              compute unit objects to cancel.

        **Raises:**

            * :class:`radical.pilot.PilotException`
        """
        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        if (not isinstance(unit_ids, list)) and (unit_ids is not None):
            unit_ids = [unit_ids]

        cus = self.get_units(unit_ids)
        for cu in cus:
            cu.cancel()


    # -------------------------------------------------------------------------
    #
[docs]    def register_callback(self, callback_function, metric=UNIT_STATE, callback_data=None):

        """
        Registers a new callback function with the UnitManager.  Manager-level
        callbacks get called if the specified metric changes.  The default
        metric `UNIT_STATE` fires the callback if any of the ComputeUnits
        managed by the PilotManager change their state.

        All callback functions need to have the same signature::

            def callback_func(obj, value, data)

        where ``object`` is a handle to the object that triggered the callback,
        ``value`` is the metric, and ``data`` is the data provided on
        callback registration..  In the example of `UNIT_STATE` above, the
        object would be the unit in question, and the value would be the new
        state of the unit.

        Available metrics are:

          * `UNIT_STATE`: fires when the state of any of the units which are
            managed by this unit manager instance is changing.  It communicates
            the unit object instance and the units new state.

          * `WAIT_QUEUE_SIZE`: fires when the number of unscheduled units (i.e.
            of units which have not been assigned to a pilot for execution)
            changes.
        """

        if  metric not in UNIT_MANAGER_METRICS :
            raise ValueError ("Metric '%s' is not available on the unit manager" % metric)

        self._worker.register_manager_callback(callback_function, metric, callback_data)