Source code for radical.pilot.session


__copyright__ = "Copyright 2013-2016, http://radical.rutgers.edu"
__license__   = "MIT"

# the session needs to get rid of child process handles after forks, as Python's
# multiprocessing module does not allow to check for child process health from
# processes which did not originally spawn the children.  For this we use
# `at_fork`, which monkeypatches `os.fork()` to support prepare, parent and
# child hooks.  We then register a child hook during session initialization.
#
# Since the monkeypatch needs to be applied before `os` is imported, we do that
# right here, in the (probably vain) hope that os was not imported before.  If
# it was, at_fork will raise an error.
#


import os
import sys
import copy
import time
import glob
import copy
import pprint
import threading

import radical.utils        as ru
import saga                 as rs
import saga.utils.pty_shell as rsup

from . import utils         as rpu
from . import states        as rps
from . import constants     as rpc
from . import types         as rpt

from .unit_manager    import UnitManager
from .pilot_manager   import PilotManager
from .resource_config import ResourceConfig
from .db              import DBSession

from .utils import version_detail as rp_version_detail


# ------------------------------------------------------------------------------
#
[docs]class Session(rs.Session):
    """
    A Session encapsulates a RADICAL-Pilot instance and is the *root* object
    of the RADICAL-Pilot object hierarchy.

    A Session holds :class:`radical.pilot.PilotManager` and
    :class:`radical.pilot.UnitManager` instances which in turn hold
    :class:`radical.pilot.ComputePilot` and :class:`radical.pilot.ComputeUnit`
    instances.
    """

    # the reporter is an application-level singleton
    _reporter = None

    # We keep a static typemap for component startup. If we ever want to
    # become really fancy, we can derive that typemap from rp module
    # inspection.
    #
    # --------------------------------------------------------------------------
    #
[docs]    def __init__(self, dburl=None, uid=None, cfg=None, _connect=True):
        """
        Creates a new session.  A new Session instance is created and 
        stored in the database.

        **Arguments:**
            * **dburl** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raises.

            * **uid** (`string`): Create a session with this UID.  
              *Only use this when you know what you are doing!*

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        if os.uname()[0] == 'Darwin':
            # on MacOS, we are running out of file descriptors soon.  The code
            # below attempts to increase the limit of open files - but any error
            # is silently ignored, so this is an best-effort, no guarantee.  We
            # leave responsibility for system limits with the user.
            try:
                import resource
                limits    = list(resource.getrlimit(resource.RLIMIT_NOFILE))
                limits[0] = 512
                resource.setrlimit(resource.RLIMIT_NOFILE, limits)
            except:
                pass

        self._dh          = ru.DebugHelper()
        self._valid       = True
        self._closed      = False
        self._valid_iter  = 0  # detect recursive calls of `is_valid()`

        # class state
        self._dbs         = None
        self._uid         = None
        self._dburl       = None
        self._reconnected = False

        self._cache       = dict()  # cache sandboxes etc.
        self._cache_lock  = threading.RLock()

        self._cache['resource_sandbox'] = dict()
        self._cache['session_sandbox']  = dict()
        self._cache['pilot_sandbox']    = dict()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        # NOTE: should this also include agents?
        self._pmgrs      = dict()
        self._umgrs      = dict()
        self._bridges    = list()
        self._components = list()

        # FIXME: we work around some garbage collection issues we don't yet
        #        understand: instead of relying on the GC to eventually collect
        #        some stuff, we actively free those on `session.close()`, at
        #        least for the current process.  Usually, all resources get
        #        nicely collected on process termination - but not when we
        #        create many sessions (one after the other) in the same
        #        application instance (ie. the same process).  This workarounf
        #        takes care of that use case.
        #        The clean solution would be to ensure clean termination
        #        sequence, something which I seem to be unable to implement...
        #        :/
        self._to_close   = list()
        self._to_stop    = list()
        self._to_destroy = list()

        # cache the client sandbox
        # FIXME: this needs to be overwritten if configured differently in the
        #        session config, as should be the case for any agent side
        #        session instance.
        self._client_sandbox = os.getcwd()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        # if a config is given, us its values:
        if cfg:
            self._cfg = copy.deepcopy(cfg)
        else:
            # otherwise we need a config
            self._cfg = ru.read_json("%s/configs/session_%s.json" \
                    % (os.path.dirname(__file__),
                       os.environ.get('RADICAL_PILOT_SESSION_CFG', 'default')))

        # fall back to config data where possible
        # sanity check on parameters
        if not uid : 
            uid = self._cfg.get('session_id')

        if uid:
            self._uid         = uid
            self._reconnected = True
        else:
            # generate new uid, reset all other ID counters
            # FIXME: this will screw up counters for *concurrent* sessions, 
            #        as the ID generation is managed in a process singleton.
            self._uid = ru.generate_id('rp.session',  mode=ru.ID_PRIVATE)
            ru.reset_id_counters(prefix='rp.session', reset_all_others=True)

        if not self._cfg.get('session_id'): self._cfg['session_id'] = self._uid 
        if not self._cfg.get('owner')     : self._cfg['owner']      = self._uid 
        if not self._cfg.get('logdir')    : self._cfg['logdir']     = '%s/%s' \
                                                     % (os.getcwd(), self._uid)
        self._logdir = self._cfg['logdir']
        self._prof   = self._get_profiler(name=self._cfg['owner'])
        self._rep    = self._get_reporter(name=self._cfg['owner'])
        self._log    = self._get_logger  (name=self._cfg['owner'],
                                          level=self._cfg.get('debug'))

        if _connect:

            # we need a dburl to connect to.
            if not dburl:
                dburl = os.environ.get("RADICAL_PILOT_DBURL")

            if not dburl:
                dburl = self._cfg.get('default_dburl')

            if not dburl:
                dburl = self._cfg.get('dburl')

            if not dburl:
                # we forgive missing dburl on reconnect, but not otherwise
                raise RuntimeError("no database URL (set RADICAL_PILOT_DBURL)")  


        self._dburl = ru.Url(dburl)
        self._cfg['dburl'] = str(self._dburl)

        # now we have config and uid - initialize base class (saga session)
        rs.Session.__init__(self, uid=self._uid)


        # ----------------------------------------------------------------------
        # create new session
        if _connect:
            self._log.info("using database %s" % self._dburl)

            # if the database url contains a path element, we interpret that as
            # database name (without the leading slash)
            if  not self._dburl.path         or \
                self._dburl.path[0]   != '/' or \
                len(self._dburl.path) <=  1  :
                if not uid:
                    # we fake reconnnect if no DB is available -- but otherwise we
                    # really really need a db connection...
                    raise ValueError("incomplete DBURL '%s' no db name!" % self._dburl)

        if not self._reconnected:
            self._prof.prof('session_start', uid=self._uid)
            self._rep.info ('<<new session: ')
            self._rep.plain('[%s]' % self._uid)
            self._rep.info ('<<database   : ')
            self._rep.plain('[%s]' % self._dburl)

        self._load_resource_configs()

        self._rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if self._rec:
            # NOTE: Session recording cannot handle reconnected sessions, yet.
            #       We thus turn it off here with a warning
            if self._reconnected:
                self._log.warn("no session recording on reconnected session")

            else:
                # append session ID to recording path
                self._rec = "%s/%s" % (self._rec, self._uid)

                # create recording path and record session
                os.system('mkdir -p %s' % self._rec)
                ru.write_json({'dburl': str(self.dburl)}, 
                              "%s/session.json" % self._rec)
                self._log.info("recording session in %s" % self._rec)


        # create/connect database handle
        try:
            self._dbs = DBSession(sid=self.uid, dburl=str(self._dburl),
                                  cfg=self._cfg, logger=self._log, 
                                  connect=_connect)

            # from here on we should be able to close the session again
            self._log.info("New Session created: %s." % self.uid)

        except Exception, ex:
            self._rep.error(">>err\n")
            self._log.exception('session create failed')
            raise RuntimeError("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (dburl, ex))  

        # the session must not carry bridge and component handles across forks
        ru.atfork(self._atfork_prepare, self._atfork_parent, self._atfork_child)

        # if bridges and components are specified in the config, start them
        ruc = rpu.Component
        self._bridges    = ruc.start_bridges   (self._cfg, self, self._log)
        self._components = ruc.start_components(self._cfg, self, self._log)
        self.is_valid()

        # at this point we have a DB connection, logger, etc, and can record
        # some metadata
        self._log.info('radical.pilot version: %s' % rp_version_detail)
        self._log.info('radical.saga  version: %s' % rs.version_detail)
        self._log.info('radical.utils version: %s' % ru.version_detail)

        py_version_detail = sys.version.replace("\n", " ")
        self.inject_metadata({'radical_stack' : {'rp': rp_version_detail,
                                                 'rs': rs.version_detail,
                                                 'ru': ru.version_detail,
                                                 'py': py_version_detail}})


        # FIXME: make sure the above code results in a usable session on
        #        reconnect
        self._rep.ok('>>ok\n')


    # --------------------------------------------------------------------------
    #
    def _atfork_prepare(self): 
        pass

    def _atfork_parent(self) :
        pass

    def _atfork_child(self)  : 
        self._components = list()
        self._bridges    = list()
        self._to_close   = list()
        self._to_stop    = list()
        self._to_destroy = list()

    
    # --------------------------------------------------------------------------
    # Allow Session to function as a context manager in a `with` clause
    def __enter__(self):
        return self


    # --------------------------------------------------------------------------
    # Allow Session to function as a context manager in a `with` clause
    def __exit__(self, type, value, traceback):

        # FIXME: use cleanup_on_close, terminate_on_close attributes
        self.close()


    # --------------------------------------------------------------------------
    #
    def is_valid(self, term=True):

        # don't check validity during termination
        if self._closed:
            return True

        # if we check any manager or agent, it will likely also check the
        # session in turn.  We break that loop here.
        self._valid_iter += 1

        try:
            if self._valid_iter >= 2:
                # we are too deep - abort this line or tests
                return True

            if self._valid:
                for _,umgr in self._umgrs.iteritems():
                    if not umgr.is_valid(term):
                        self._valid = False
                        break

            if self._valid:
                for _,pmgr in self._pmgrs.iteritems():
                    if not pmgr.is_valid(term):
                        self._valid = False
                        break

            if self._valid:
                for bridge in self._bridges:
                    if not bridge.is_valid(term):
                        self._valid = False
                        break

            if self._valid:
                for component in self._components:
                    if not component.is_valid(term):
                        self._valid = False
                        break

        finally:
            pass

        if not self._valid and term:
            self._log.warn("session %s is invalid" % self.uid)
            self.close()
          # raise RuntimeError("session %s is invalid" % self.uid)

        return self._valid


    # --------------------------------------------------------------------------
    #
    def _load_resource_configs(self):
        """
        Fill `self._resource_configs` and `self._resource_aliases`.

        Resource configs are read from `configs/resource_*.json` next to this
        module first, and then from `$HOME/.radical/pilot/configs/`, where
        user entries are merged over (or added to) the default ones.  The
        alias map is read from `resource_aliases.json` in the same two
        locations, again with user entries overwriting defaults.

        **Raises:**
            * `RuntimeError` if any resource config file cannot be loaded.
        """

        self.is_valid()

        self._prof.prof('config_parser_start', uid=self._uid)

        # Loading all "default" resource configurations
        pkg_dir = os.path.dirname(os.path.abspath(__file__))

        for fname in glob.glob("%s/configs/resource_*.json" % pkg_dir):

            try:
                self._log.info("Load resource configurations from %s" % fname)
                loaded = ResourceConfig.from_file(fname)
            except Exception as e:
                self._log.exception("skip config file %s: %s" % (fname, e))
                raise RuntimeError('config error (%s) - abort' % e)

            for label in loaded:
                self._log.info("Load resource configurations for %s" % label)
                default_cfg = loaded[label].as_dict()
                self._resource_configs[label] = default_cfg
                self._log.debug('read rcfg for %s (%s)',
                        label, default_cfg.get('cores_per_node'))

        # user level configurations amend or extend the defaults
        home = os.environ.get('HOME', '')

        for fname in glob.glob("%s/.radical/pilot/configs/resource_*.json" % home):

            try:
                loaded = ResourceConfig.from_file(fname)
            except Exception as e:
                self._log.exception("skip config file %s: %s" % (fname, e))
                raise RuntimeError('config error (%s) - abort' % e)

            for label in loaded:
                self._log.info("Load resource configurations for %s" % label)

                user_cfg = loaded[label].as_dict()

                if label not in self._resource_configs:
                    # new config -- add as is
                    self._resource_configs[label] = user_cfg
                else:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[label], user_cfg,
                                  policy='overwrite')

                self._log.debug('fix  rcfg for %s (%s)',
                        label, self._resource_configs[label].get('cores_per_node'))

        alias_file = "%s/configs/resource_aliases.json" % pkg_dir
        self._resource_aliases = ru.read_json_str(alias_file)['aliases']

        # check if we have aliases to merge
        user_alias_file = '%s/.radical/pilot/configs/resource_aliases.json' % home
        if os.path.isfile(user_alias_file):
            user_aliases = ru.read_json_str(user_alias_file).get('aliases', {})
            ru.dict_merge(self._resource_aliases, user_aliases,
                          policy='overwrite')

        self._prof.prof('config_parser_stop', uid=self._uid)


    # --------------------------------------------------------------------------
    #
[docs]    def close(self, cleanup=False, terminate=True, download=False):
        """Closes the session.

        All subsequent attempts access objects attached to the session will 
        result in an error. If cleanup is set to True (default) the session
        data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove session from MongoDB (implies * terminate)
            * **terminate** (`bool`): Shut down all pilots associated with the session. 

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        # close only once
        if self._closed:
            return

        self._rep.info('closing session %s' % self._uid)
        self._log.debug("session %s closing", self._uid)
        self._prof.prof("session_close", uid=self._uid)

        # set defaults
        if cleanup   == None: cleanup   = True
        if terminate == None: terminate = True

        if  cleanup:
            # cleanup implies terminate
            terminate = True

        for umgr_uid,umgr in self._umgrs.iteritems():
            self._log.debug("session %s closes umgr   %s", self._uid, umgr_uid)
            umgr.close()
            self._log.debug("session %s closed umgr   %s", self._uid, umgr_uid)

        for pmgr_uid,pmgr in self._pmgrs.iteritems():
            self._log.debug("session %s closes pmgr   %s", self._uid, pmgr_uid)
            pmgr.close(terminate=terminate)
            self._log.debug("session %s closed pmgr   %s", self._uid, pmgr_uid)

        for comp in self._components:
            self._log.debug("session %s closes comp   %s", self._uid, comp.uid)
            comp.stop()
            comp.join()
            self._log.debug("session %s closed comp   %s", self._uid, comp.uid)

        for bridge in self._bridges:
            self._log.debug("session %s closes bridge %s", self._uid, bridge.uid)
            bridge.stop()
            bridge.join()
            self._log.debug("session %s closed bridge %s", self._uid, bridge.uid)

        if self._dbs:
            self._log.debug("session %s closes db (%s)", self._uid, cleanup)
            self._dbs.close(delete=cleanup)

        self._log.debug("session %s closed (delete=%s)", self._uid, cleanup)
        self._prof.prof("session_stop", uid=self._uid)
        self._prof.close()

        # support GC
        for x in self._to_close: 
            try:    x.close()
            except: pass
        for x in self._to_stop:
            try:    x.stop()
            except: pass
        for x in self._to_destroy:
            try:    x.destroy()
            except: pass

        self._closed = True
        self._valid = False

        # after all is said and done, we attempt to download the pilot log- and
        # profiles, if so wanted
        if download:

            self._prof.prof("session_fetch_start", uid=self._uid)
            self._log.debug('start download')
            tgt = os.getcwd()
            self.fetch_json    (tgt='%s/%s' % (tgt, self.uid))
            self.fetch_profiles(tgt=tgt)
            self.fetch_logfiles(tgt=tgt)

            self._prof.prof("session_fetch_stop", uid=self._uid)

        self._rep.info('<<session lifetime: %.1fs' % (self.closed - self.created))
        self._rep.ok('>>ok\n')


    # --------------------------------------------------------------------------
    #
[docs]    def as_dict(self):
        """Returns a Python dictionary representation of the object.
        """

        self.is_valid()

        object_dict = {
            "uid"       : self._uid,
            "created"   : self.created,
            "connected" : self.connected,
            "closed"    : self.closed,
            "dburl"     : str(self.dburl),
            "cfg"       : copy.deepcopy(self._cfg)
        }
        return object_dict


    # --------------------------------------------------------------------------
    #
    def __str__(self):
        """Returns a string representation of the object.
        """
        return str(self.as_dict())


    # --------------------------------------------------------------------------
    #
    @property
    def uid(self):
        return self._uid


    # --------------------------------------------------------------------------
    #
    @property
    def logdir(self):
        return self._logdir


    # --------------------------------------------------------------------------
    #
    @property
    def dburl(self):
        return self._dburl


    # --------------------------------------------------------------------------
    #
    def get_db(self):

        self.is_valid()

        if self._dbs: return self._dbs.get_db()
        else        : return None


    
    # --------------------------------------------------------------------------
    #
    @property
    def created(self):
        """Returns the UTC date and time the session was created.
        """
        if self._dbs: return self._dbs.created
        else        : return None


    # --------------------------------------------------------------------------
    #
    @property
    def connected(self):
        """Returns the most recent UTC date and time the session was
        reconnected to.
        """
        if self._dbs: return self._dbs.connected
        else        : return None


    # -------------------------------------------------------------------------
    #
    @property
    def is_connected(self):

        self.is_valid()

        return self._dbs.is_connected


    # --------------------------------------------------------------------------
    #
    @property
    def closed(self):
        """
        Returns the time of closing
        """
        if self._dbs: return self._dbs.closed
        else        : return None


    # --------------------------------------------------------------------------
    #
    def _get_logger(self, name, level=None):
        """
        Thin wrapper around `ru.Logger()`: creates a logger in the
        `radical.pilot` namespace, and passes the session's log directory as
        `path`.
        """
        settings = {'name'    : name,
                    'ns'      : 'radical.pilot',
                    'targets' : ['.'],
                    'path'    : self._logdir,
                    'level'   : level}

        return ru.Logger(**settings)


    # --------------------------------------------------------------------------
    #
    def _get_reporter(self, name):
        """
        This is a thin wrapper around `ru.Reporter()` which makes sure that
        log files end up in a separate directory with the name of `session.uid`.
        """

        return ru.Reporter(name=name, ns='radical.pilot', targets=['stdout'],
                           path=self._logdir)


    # --------------------------------------------------------------------------
    #
    def _get_profiler(self, name):
        """
        Thin wrapper around `ru.Profiler()`: creates a profiler in the
        `radical.pilot` namespace, with the session's log directory as
        `path`.

        NOTE: this class defines `_get_profiler()` a second time further
              down; that later definition is the one which ends up on the
              class.
        """
        return ru.Profiler(name=name, ns='radical.pilot', path=self._logdir)


    # --------------------------------------------------------------------------
    #
    def _get_reporter(self, name):
        """
        This is a thin wrapper around `ru.Reporter()` which makes sure that
        log files end up in a separate directory with the name of `session.uid`.
        """

        if not self._reporter:
            self._reporter = ru.Reporter(name=name, ns='radical.pilot',
                                         targets=['stdout'], path=self._logdir)
        return self._reporter


    # --------------------------------------------------------------------------
    #
    def _get_profiler(self, name):
        """
        Thin wrapper around `ru.Profiler()`: creates a profiler in the
        `radical.pilot` namespace, with the session's log directory as
        `path`.
        """
        return ru.Profiler(name=name, ns='radical.pilot', path=self._logdir)


    # --------------------------------------------------------------------------
    #
[docs]    def inject_metadata(self, metadata):
        """
        Insert (experiment) metadata into an active session
        RP stack version info always get added.
        """

        self.is_valid()

        if not isinstance(metadata, dict):
            raise Exception("Session metadata should be a dict!")

        if self._dbs and self._dbs._c:
            self._dbs._c.update({'type'  : 'session',
                                 "uid"   : self.uid},
                                {"$push" : {"metadata": metadata}})


    # --------------------------------------------------------------------------
    #
    def _register_pmgr(self, pmgr):

        self.is_valid()

        self._dbs.insert_pmgr(pmgr.as_dict())
        self._pmgrs[pmgr.uid] = pmgr


    # --------------------------------------------------------------------------
    #
[docs]    def list_pilot_managers(self):
        """
        Lists the unique identifiers of all :class:`radical.pilot.PilotManager` 
        instances associated with this session.

        **Returns:**
            * A list of :class:`radical.pilot.PilotManager` uids (`list` of `strings`).
        """

        self.is_valid()
        return self._pmgrs.keys()


    # --------------------------------------------------------------------------
    #
[docs]    def get_pilot_managers(self, pmgr_uids=None):
        """ 
        returns known PilotManager(s).

        **Arguments:**

            * **pmgr_uids** [`string`]: 
              unique identifier of the PilotManager we want

        **Returns:**
            * One or more [:class:`radical.pilot.PilotManager`] objects.
        """

        self.is_valid()

        return_scalar = False
        if not isinstance(pmgr_uids, list):
            pmgr_uids     = [pmgr_uids]
            return_scalar = True

        if pmgr_uids: pmgrs = [self._pmgrs[uid] for uid in pmgr_uids]
        else        : pmgrs =  self._pmgrs.values()

        if return_scalar: return pmgrs[0]
        else            : return pmgrs


    # --------------------------------------------------------------------------
    #
    def _register_umgr(self, umgr):

        self.is_valid()

        self._dbs.insert_umgr(umgr.as_dict())
        self._umgrs[umgr.uid] = umgr


    # --------------------------------------------------------------------------
    #
[docs]    def list_unit_managers(self):
        """
        Lists the unique identifiers of all :class:`radical.pilot.UnitManager` 
        instances associated with this session.

        **Returns:**
            * A list of :class:`radical.pilot.UnitManager` uids (`list` of `strings`).
        """

        self.is_valid()
        return self._umgrs.keys()


    # --------------------------------------------------------------------------
    #
[docs]    def get_unit_managers(self, umgr_uids=None):
        """ 
        returns known UnitManager(s).

        **Arguments:**

            * **umgr_uids** [`string`]: 
              unique identifier of the UnitManager we want

        **Returns:**
            * One or more [:class:`radical.pilot.UnitManager`] objects.
        """

        self.is_valid()

        return_scalar = False
        if not isinstance(umgr_uids, list):
            umgr_uids     = [umgr_uids]
            return_scalar = True

        if umgr_uids: umgrs = [self._umgrs[uid] for uid in umgr_uids]
        else        : umgrs =  self._umgrs.values()

        if return_scalar: return umgrs[0]
        else            : return umgrs


    # -------------------------------------------------------------------------
    #
[docs]    def list_resources(self):
        '''
        Returns a list of known resource labels which can be used in a pilot
        description.  Not that resource aliases won't be listed.
        '''

        return sorted(self._resource_configs.keys())


    # -------------------------------------------------------------------------
    #
[docs]    def add_resource_config(self, resource_config):
        """Adds a new :class:`radical.pilot.ResourceConfig` to the PilotManager's 
           dictionary of known resources, or accept a string which points to
           a configuration file.

           For example::

                  rc = radical.pilot.ResourceConfig(label="mycluster")
                  rc.job_manager_endpoint = "ssh+pbs://mycluster
                  rc.filesystem_endpoint  = "sftp://mycluster
                  rc.default_queue        = "private"
                  rc.bootstrapper         = "default_bootstrapper.sh"

                  pm = radical.pilot.PilotManager(session=s)
                  pm.add_resource_config(rc)

                  pd = radical.pilot.ComputePilotDescription()
                  pd.resource = "mycluster"
                  pd.cores    = 16
                  pd.runtime  = 5 # minutes

                  pilot = pm.submit_pilots(pd)
        """

        self.is_valid()

        if isinstance(resource_config, basestring):

            # let exceptions fall through
            rcs = ResourceConfig.from_file(resource_config)

            for rc in rcs:
                self._log.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict() 
                self._log.debug('add  rcfg for %s (%s)', 
                        rc, self._resource_configs[rc].get('cores_per_node'))

        else:
            self._resource_configs[resource_config.label] = resource_config.as_dict()
            self._log.debug('Add  rcfg for %s (%s)', 
                    resource_config.label, 
                    self._resource_configs[resource_config.label].get('cores_per_node'))

    # -------------------------------------------------------------------------
    #
[docs]    def get_resource_config(self, resource, schema=None):
        """
        Returns a dictionary of the requested resource config
        """

        self.is_valid()

        if  resource in self._resource_aliases:
            self._log.warning("using alias '%s' for deprecated resource key '%s'" \
                              % (self._resource_aliases[resource], resource))
            resource = self._resource_aliases[resource]

        if  resource not in self._resource_configs:
            raise RuntimeError("Resource '%s' is not known." % resource)

        resource_cfg = copy.deepcopy(self._resource_configs[resource])
        self._log.debug('get rcfg 1 for %s (%s)',  resource, resource_cfg.get('cores_per_node'))

        if  not schema:
            if 'schemas' in resource_cfg:
                schema = resource_cfg['schemas'][0]

        if  schema:
            if  schema not in resource_cfg:
                raise RuntimeError("schema %s unknown for resource %s" \
                                  % (schema, resource))

            for key in resource_cfg[schema]:
                # merge schema specific resource keys into the
                # resource config
                resource_cfg[key] = resource_cfg[schema][key]

        self._log.debug('get rcfg 2 for %s (%s)',  resource, resource_cfg.get('cores_per_node'))

        return resource_cfg


    # -------------------------------------------------------------------------
    #
    def fetch_profiles(self, tgt=None, fetch_client=False):
        """
        Fetch the profiles of this session via `rpu.fetch_profiles`.

        The session uid, the session's `dburl` and the session itself are
        passed on along with `tgt`.  `fetch_client` is accepted, but is not
        used by this method.  Returns whatever `rpu.fetch_profiles` returns.
        """

        fetch  = rpu.fetch_profiles
        kwargs = {'dburl'  : self.dburl,
                  'tgt'    : tgt,
                  'session': self}

        return fetch(self._uid, **kwargs)


    # -------------------------------------------------------------------------
    #
    def fetch_logfiles(self, tgt=None, fetch_client=False):
        """
        Fetch the log files of this session via `rpu.fetch_logfiles`.

        The session uid, the session's `dburl` and the session itself are
        passed on along with `tgt`.  `fetch_client` is accepted, but is not
        used by this method.  Returns whatever `rpu.fetch_logfiles` returns.
        """

        fetch  = rpu.fetch_logfiles
        kwargs = {'dburl'  : self.dburl,
                  'tgt'    : tgt,
                  'session': self}

        return fetch(self._uid, **kwargs)


    # -------------------------------------------------------------------------
    #
    def fetch_json(self, tgt=None, fetch_client=False):
        """
        Fetch the json data of this session via `rpu.fetch_json`.

        The session uid, the session's `dburl` and the session itself are
        passed on along with `tgt`.  `fetch_client` is accepted, but is not
        used by this method.  Returns whatever `rpu.fetch_json` returns.
        """

        fetch  = rpu.fetch_json
        kwargs = {'dburl'  : self.dburl,
                  'tgt'    : tgt,
                  'session': self}

        return fetch(self._uid, **kwargs)



    # -------------------------------------------------------------------------
    #
    def _get_client_sandbox(self):
        """
        For the session in the client application, this is os.getcwd().  For the
        session in any other component, specifically in pilot components, the
        client sandbox needs to be read from the session config (or pilot
        config).  The latter is not yet implemented, so the pilot can not yet
        interpret client sandboxes.  Since pilot-side stagting to and from the
        client sandbox is not yet supported anyway, this seems acceptable
        (FIXME).
        """

        return self._client_sandbox


    # -------------------------------------------------------------------------
    #
    def _get_resource_sandbox(self, pilot):
        """
        for a given pilot dict, determine the global RP sandbox, based on the
        pilot's 'resource' attribute.
        """

        self.is_valid()

        # FIXME: this should get 'resource, schema=None' as parameters

        resource = pilot['description'].get('resource')
        schema   = pilot['description'].get('access_schema')

        if not resource:
            raise ValueError('Cannot get pilot sandbox w/o resource target')

        # the global sandbox will be the same for all pilots on any resource, so
        # we cache it
        with self._cache_lock:

            if resource not in self._cache['resource_sandbox']:

                # cache miss -- determine sandbox and fill cache
                rcfg   = self.get_resource_config(resource, schema)
                fs_url = rs.Url(rcfg['filesystem_endpoint'])

                # Get the sandbox from either the pilot_desc or resource conf
                sandbox_raw = pilot['description'].get('sandbox')
                if not sandbox_raw:
                    sandbox_raw = rcfg.get('default_remote_workdir', "$PWD")
        
                # If the sandbox contains expandables, we need to resolve those remotely.
                # NOTE: Note that this will only work for (gsi)ssh or shell based access mechanisms
                if '$' not in sandbox_raw and '`' not in sandbox_raw:
                    # no need to expand further
                    sandbox_base = sandbox_raw

                else:
                    js_url = rs.Url(rcfg['job_manager_endpoint'])
        
                    if 'ssh' in js_url.schema.split('+'):
                        js_url.schema = 'ssh'
                    elif 'gsissh' in js_url.schema.split('+'):
                        js_url.schema = 'gsissh'
                    elif 'fork' in js_url.schema.split('+'):
                        js_url.schema = 'fork'
                    elif '+' not in js_url.schema:
                        # For local access to queueing systems use fork
                        js_url.schema = 'fork'
                    else:
                        raise Exception("unsupported access schema: %s" % js_url.schema)
        
                    self._log.debug("rsup.PTYShell('%s')", js_url)
                    shell = rsup.PTYShell(js_url, self)
        
                    ret, out, err = shell.run_sync(' echo "WORKDIR: %s"' % sandbox_raw)
                    if ret == 0 and 'WORKDIR:' in out:
                        sandbox_base = out.split(":")[1].strip()
                        self._log.debug("sandbox base %s: '%s'", js_url, sandbox_base)
                    else:
                        raise RuntimeError("Couldn't get remote working directory.")
        
                # at this point we have determined the remote 'pwd' - the global sandbox
                # is relative to it.
                fs_url.path = "%s/radical.pilot.sandbox" % sandbox_base
        
                # before returning, keep the URL string in cache
                self._cache['resource_sandbox'][resource] = fs_url

            return self._cache['resource_sandbox'][resource]


    # --------------------------------------------------------------------------
    #
    def _get_session_sandbox(self, pilot):

        self.is_valid()

        # FIXME: this should get 'resource, schema=None' as parameters

        resource = pilot['description'].get('resource')

        if not resource:
            raise ValueError('Cannot get session sandbox w/o resource target')

        with self._cache_lock:

            if resource not in self._cache['session_sandbox']:

                # cache miss
                resource_sandbox      = self._get_resource_sandbox(pilot)
                session_sandbox       = rs.Url(resource_sandbox)
                session_sandbox.path += '/%s' % self.uid

                self._cache['session_sandbox'][resource] = session_sandbox

            return self._cache['session_sandbox'][resource]


    # --------------------------------------------------------------------------
    #
    def _get_pilot_sandbox(self, pilot):

        self.is_valid()

        # FIXME: this should get 'pid, resource, schema=None' as parameters

        self.is_valid()

        pilot_sandbox = pilot.get('pilot_sandbox')
        if str(pilot_sandbox):
            return rs.Url(pilot_sandbox)

        pid = pilot['uid']
        with self._cache_lock:
            if  pid in self._cache['pilot_sandbox']:
                return self._cache['pilot_sandbox'][pid]

        # cache miss
        session_sandbox     = self._get_session_sandbox(pilot)
        pilot_sandbox       = rs.Url(session_sandbox)
        pilot_sandbox.path += '/%s/' % pilot['uid']

        with self._cache_lock:
            self._cache['pilot_sandbox'][pid] = pilot_sandbox

        return pilot_sandbox


    # --------------------------------------------------------------------------
    #
    def _get_unit_sandbox(self, unit, pilot):

        self.is_valid()

        # we don't cache unit sandboxes, they are just a string concat.
        pilot_sandbox = self._get_pilot_sandbox(pilot)
        return "%s/%s/" % (pilot_sandbox, unit['uid'])


    # --------------------------------------------------------------------------
    #
    def _get_jsurl(self, pilot):
        '''
        get job service endpoint and hop URL for the pilot's target resource.

        The pilot description's 'resource' and 'access_schema' entries select
        the resource config.  The job service URL is the config's
        'job_manager_endpoint'.  The hop URL is the config's
        'job_manager_hop' (default: the job service URL), with its schema
        reduced to one of 'gsissh', 'ssh' or 'fork'.

        Returns the tuple `(js_url, js_hop)` of `rs.Url` instances.
        '''

        self.is_valid()

        resrc   = pilot['description']['resource']
        schema  = pilot['description']['access_schema']
        rcfg    = self.get_resource_config(resrc, schema)

        js_url  = rs.Url(rcfg.get('job_manager_endpoint'))
        js_hop  = rs.Url(rcfg.get('job_manager_hop', js_url))

        # make sure the js_hop url points to an interactive access
        # TODO: this is an unreliable heuristics - we should require the js_hop
        #       URL to be specified in the resource configs.
        #
        # The schema is matched on its '+'-separated elements, so that a plain
        # 'ssh' or 'gsissh' schema is kept, and is not downgraded to 'fork'.
        # An unset schema is handled like a local one.
        schema_elems = (js_hop.schema or '').split('+')

        if   'gsissh' in schema_elems: js_hop.schema = 'gsissh'
        elif 'ssh'    in schema_elems: js_hop.schema = 'ssh'
        else                         : js_hop.schema = 'fork'

        return js_url, js_hop


    # --------------------------------------------------------------------------
    #
    @staticmethod
    def autopilot(user, passwd):
        """
        Comment on, or create, an 'autopilot' issue on GitHub.

        Logs into GitHub with the given `user` and `passwd`, and picks one of
        a fixed set of issue titles at random.  If the
        'radical-cybertools/radical.pilot' repository has an open issue with
        that title and the label 'type:autopilot', a comment is added to that
        issue.  Otherwise a new issue with that title and label is created and
        assigned to `user`.  Comment and issue texts are obtained by calling
        `telnet bofh.jeffballard.us 666` in a shell.

        A short report is printed to stdout.  Nothing is returned.
        """

        import github3
        import random

        labels = 'type:autopilot'
        titles = ['+++ Out of Cheese Error +++',
                  '+++ Redo From Start! +++',
                  '+++ Mr. Jelly! Mr. Jelly! +++',
                  '+++ Melon melon melon',
                  '+++ Wahhhhhhh! Mine! +++',
                  '+++ Divide By Cucumber Error +++',
                  '+++ Please Reinstall Universe And Reboot +++',
                  '+++ Whoops! Here comes the cheese! +++',
                  '+++ End of Cheese Error +++',
                  '+++ Can Not Find Drive Z: +++',
                  '+++ Unknown Application Error +++',
                  '+++ Please Reboot Universe +++',
                  '+++ Year Of The Sloth +++',
                  '+++ error of type 5307 has occured +++',
                  '+++ Eternal domain error +++',
                  '+++ Error at Address Number 6, Treacle Mine Road +++']

        def excuse():
            # fetch the excuse text from the remote service, and strip
            # everything but the text after 'Your excuse is:'
            cmd_fetch  = "telnet bofh.jeffballard.us 666 2>&1 "
            cmd_filter = "grep 'Your excuse is:' | cut -f 2- -d :"
            out        = ru.sh_callout("%s | %s" % (cmd_fetch, cmd_filter),
                                       shell=True)[0]
            return out.strip()

        github = github3.login(user, passwd)
        repo   = github.repository("radical-cybertools", "radical.pilot")

        title = 'autopilot: %s' % random.choice(titles)

        # the single-argument `print(...)` form yields the same output under
        # Python 2 and Python 3
        separator = '----------------------------------------------------'

        print(separator)
        print('autopilot')

        for issue in repo.issues(labels=labels, state='open'):
            if issue.title == title:
                # an open issue with that title exists - comment on it
                reply = 'excuse: %s' % excuse()
                issue.create_comment(reply)
                print('  resolve: %s' % reply)
                break

        else:
            # issue not found - create
            body = 'problem: %s' % excuse()
            repo.create_issue(title=title, body=body, labels=[labels],
                              assignee=user)
            print('  issue  : %s' % title)
            print('  problem: %s' % body)

        # close the report on both paths
        print(separator)


# -----------------------------------------------------------------------------