# Source code for opalalgorithms.utils.algorithmrunner

"""Given an algorithm object, run the algorithm."""
from __future__ import division, print_function

import signal
import sys
import multiprocessing as mp
import os
import textwrap
import json

import requests
import six
import codejail
from codejail.safe_exec import not_safe_exec
from codejail.limits import set_limit

# Names exported by a star-import of this module; only the runner class is
# listed as public.
__all__ = ["AlgorithmRunner"]

class GracefulExit(Exception):
    """Exception used to request an orderly shutdown of a running job."""

def sigint_handler(signum, thread):
    """Turn a delivered signal into a ``GracefulExit`` exception.

    Args:
        signum: Number of the signal that was delivered (unused).
        thread: Second argument passed by the ``signal`` module to a
            handler (unused).

    Raises:
        GracefulExit: Always.
    """
    raise GracefulExit

def check_environ():
    """Check that all required environment variables exist.

    Note:
        - Required environment variables are `OPALALGO_SANDBOX_VENV` and
          `OPALALGO_SANDBOX_USER`.

    Raises:
        RuntimeError: If a required environment variable is not set. The
            first missing variable (in the order listed above) is named in
            the message.
    """
    req_environ_vars = ('OPALALGO_SANDBOX_VENV', 'OPALALGO_SANDBOX_USER')
    for environ_var in req_environ_vars:
        if environ_var not in os.environ:
            raise RuntimeError(
                'Environment variable {} not set'.format(environ_var))

def get_jail(python_version=sys.version_info[0]):
    """Configure codejail and return the jail matching ``python_version``.

    Note:
        - The environment variables `OPALALGO_SANDBOX_VENV` and
          `OPALALGO_SANDBOX_USER` must be set before calling this function.
        - `OPALALGO_SANDBOX_VENV` is the path of the sandbox virtual
          environment.
        - `OPALALGO_SANDBOX_USER` is the user running the sandboxed
          algorithms.

    Args:
        python_version (int): Major Python version. Values below 3 select
            the jail registered as ``python``; anything else selects
            ``python3``. Defaults to the major version of the running
            interpreter.

    Returns:
        The object returned by ``codejail.get_codejail``.
    """
    venv_path = os.environ.get('OPALALGO_SANDBOX_VENV')
    run_as_user = os.environ.get('OPALALGO_SANDBOX_USER')

    # Limits handed to codejail: REALTIME is set to None, CPU to 15.
    set_limit("REALTIME", None)
    set_limit("CPU", 15)

    # Both command names are registered against the same interpreter of the
    # sandbox virtual environment.
    interpreter = os.path.join(venv_path, 'bin', 'python')
    for command in ('python', 'python3'):
        codejail.configure(command, interpreter, user=run_as_user)

    selected = 'python' if python_version < 3 else 'python3'
    return codejail.get_codejail(selected)
def process_user_csv(params, user_csv_file, algorithm, dev_mode, sandboxing,
                     jail):
    """Process a single user csv file.

    The algorithm source is concatenated with a small driver snippet that
    instantiates the algorithm class, loads the user with bandicoot and
    stores the outcome in the global ``result``; that program is then
    executed either inside the jail or unsandboxed.

    Args:
        params (dict): Parameters for the request.
        user_csv_file (string): Path to user csv file.
        algorithm (dict): Dictionary with keys `code` and `className`
            specifying algorithm code and className.
        dev_mode (bool): Should the algorithm run in development mode or
            production mode.
        sandboxing (bool): Should sandboxing be used or not.
        jail (codejail.Jail): Jail object.

    Returns:
        Result of the execution.

    Raises:
        SafeExecException: If the execution wasn't successful.

    """
    # The user name is the csv file name without directory and extension.
    username = os.path.splitext(os.path.basename(user_csv_file))[0]
    globals_dict = {
        'params': params,
    }
    # NOTE(review): className and username are interpolated into source code
    # verbatim; confirm both come from trusted input.
    user_specific_code = textwrap.dedent(
        """
        def run_code():
            import bandicoot
            algorithmobj = {}()
            bandicoot_user = bandicoot.read_csv(
                '{}', '', describe={}, warnings={})
            return algorithmobj.map(params, bandicoot_user)
        result = run_code()
        """.format(
            algorithm['className'], username,
            str(dev_mode), str(dev_mode)))
    code = "{}\n{}".format(algorithm['code'], user_specific_code)
    if sandboxing:
        jail.safe_exec(
            code, globals_dict, files=[user_csv_file])
    else:
        not_safe_exec(
            code, globals_dict, files=[user_csv_file])
    # The executed snippet publishes its outcome under the `result` global.
    result = globals_dict['result']
    return result
def mapper(writing_queue, params, file_queue, algorithm,
           dev_mode=False, sandboxing=True, python_version=2):
    """Run the algorithm on queued csv files and enqueue valid results.

    Entries are pulled from ``file_queue`` until it reports empty or a
    ``get`` fails; each entry is a ``(csv path, scaler)`` pair.

    Args:
        writing_queue (mp.manager.Queue): Queue for inserting results.
        params (dict): Parameters to be used by each map of the algorithm.
        file_queue (mp.manager.Queue): Queue of ``(csv path, scaler)``
            pairs still to be processed.
        algorithm (dict): Dictionary with keys `code` and `className`
            specifying algorithm code and className.
        dev_mode (bool): Should the algorithm run in development mode or
            production mode.
        sandboxing (bool): Should sandboxing be used or not.
        python_version (int): Python version being used for sandboxing.

    """
    jail = get_jail(python_version)
    while not file_queue.empty():
        try:
            csv_path, weight = file_queue.get(timeout=1)
        except Exception as exc:
            # Another worker may have drained the queue in the meantime;
            # report the problem and stop this worker.
            print(exc)
            break
        output = process_user_csv(
            params, csv_path, algorithm, dev_mode, sandboxing, jail)
        if not output:
            continue
        if is_valid_result(output):
            writing_queue.put((output, weight))
        elif dev_mode:
            print("Error in result {}".format(output))
def scale_result(result, scaler):
    """Multiply every value of ``result`` by ``scaler``.

    Args:
        result (dict): Result.
        scaler (number): Factor by which results need to be scaled.

    Returns:
        dict: New dictionary with the same keys and scaled values.

    """
    return {key: scaler * val for key, val in six.iteritems(result)}
def collector(writing_queue, params, dev_mode=False):
    """Drain the writing queue and hand every result to a ResultProcessor.

    Args:
        writing_queue (mp.manager.Queue): Queue from which collect results.
            Items are ``(result, scaler)`` pairs; the string ``'kill'``
            ends the collection.
        params (dict): Parameters passed on to the result processor.
        dev_mode (bool): Whether to run algorithm in development mode.

    Returns:
        Whatever ``ResultProcessor.get_result()`` returns: the list of
        collected results when `dev_mode` is true, otherwise ``True``.

    """
    processor = ResultProcessor(params, dev_mode)
    # Block on the queue for each item and stop once the 'kill' marker
    # arrives.
    for result, scaler in iter(writing_queue.get, 'kill'):
        processor(result, scaler=scaler)
    return processor.get_result()
def is_valid_result(result):
    """Tell whether an algorithm output has the accepted shape.

    Args:
        result: Output of the algorithm.

    Note:
        A result is valid if it is a dict whose keys are all strings and
        whose values are all integers or floats. These results are sent to
        the reducer, which will sum, count, mean, median, mode the values
        belonging to the same key.

    Example:
        - {"alpha1": 1, "ant199": 1, ..}

    Returns:
        bool: Specifying if the result is valid or not.

    Todo:
        * Define what is valid with privacy and other concerns

    """
    if not isinstance(result, dict):
        return False

    # Values first: every one must be an integer or a float.
    numeric_types = six.integer_types + (float,)
    if not all(isinstance(value, numeric_types)
               for value in six.itervalues(result)):
        return False

    # Then keys: every one must be a string.
    return all(isinstance(key, six.string_types)
               for key in six.iterkeys(result))
class ResultProcessor(object):
    """Process results.

    Args:
        params (dict): Dictionary of parameters. Outside dev mode the key
            `aggregationServiceUrl` is read when a result is sent.
        dev_mode (bool): Specify if dev_mode is on.

    """

    def __init__(self, params, dev_mode):
        """Initialize result processor."""
        self.params = params
        self.dev_mode = dev_mode
        # Scaled results accumulated while running in dev mode.
        self.result_list = []

    def __call__(self, result, scaler=1):
        """Process the result.

        If dev_mode is set to true, it appends the result to a list.
        Else it sends the post request to `aggregationServiceUrl`.

        Args:
            result (dict): Result of the processed algorithm.
            scaler (int): Scale results by what value.

        """
        result = scale_result(result, scaler)
        if self.dev_mode:
            self.result_list.append(result)
        else:
            self._send_request(result)

    def _send_request(self, result):
        """Send request to aggregationServiceUrl.

        Args:
            result (dict): Result to be sent as an update.

        Raises:
            RuntimeError: If the service answers with a status code other
                than 200.

        """
        response = requests.post(
            self.params['aggregationServiceUrl'], json={'update': result})
        if response.status_code != 200:
            raise RuntimeError(
                'Aggregation service returned {}'.format(
                    response.status_code))

    def get_result(self):
        """Return the result after processing.

        Returns:
            list: The collected results if dev_mode is set to true,
            otherwise `True`.

        """
        if self.dev_mode:
            return self.result_list
        return True
class AlgorithmRunner(object):
    """Algorithm runner.

    Args:
        algorithm (dict): Dictionary containing `code` and `className`.
        dev_mode (bool): Development mode switch
        multiprocess (bool): Use multiprocessing or single process for
            complete execution.
        sandboxing (bool): Use sandboxing for execution or execute in unsafe
            environment.

    """

    def __init__(self, algorithm, dev_mode=False, multiprocess=True,
                 sandboxing=True):
        """Initialize class."""
        self.algorithm = algorithm
        self.dev_mode = dev_mode
        self.multiprocess = multiprocess
        self.sandboxing = sandboxing

    def __call__(self, params, data_dir, num_threads, weights_file=None):
        """Run algorithm.

        Selects the csv files from the data directory. In multiprocess mode
        the files are put on a shared queue that `num_threads` worker
        processes consume; each worker runs the algorithm on a csv file and
        a separate collector process handles the results. Otherwise the
        files are processed one after another in the current process.

        Args:
            params (dict): Dictionary containing all the parameters for the
                algorithm
            data_dir (str): Data directory with csv files.
            num_threads (int): Number of worker processes (only used in
                multiprocess mode).
            weights_file (str): Path to the json file containing weights.

        Returns:
            The list of scaled results when `dev_mode` is true, otherwise
            `True`.

        Raises:
            RuntimeError: If a required environment variable is missing or
                the run is interrupted.

        """
        check_environ()
        data_dir_abs = os.path.abspath(data_dir)
        csv_files = [
            os.path.join(data_dir_abs, fname)
            for fname in os.listdir(data_dir) if fname.endswith('.csv')]
        csv2weights = self._get_weights(csv_files, weights_file)
        if self.multiprocess:
            return self._multiprocess(
                params, num_threads, csv_files, csv2weights)
        return self._singleprocess(params, csv_files, csv2weights)

    def _get_weights(self, csv_files, weights_file):
        """Return weights for each user if available, else return 1.

        Args:
            csv_files (list): Paths of the user csv files.
            weights_file (str): Path to a json file mapping user name (csv
                file name without extension) to weight, or a falsy value.

        Returns:
            dict: Maps every path in `csv_files` to its weight.

        """
        weights = None
        if weights_file:
            with open(weights_file) as weights_fp:
                weights = json.load(weights_fp)
        csv2weights = {}
        for file_path in csv_files:
            csv_weight = 1  # default weight
            user = os.path.splitext(os.path.basename(file_path))[0]
            if weights and user in weights:
                csv_weight = weights[user]
            csv2weights[file_path] = csv_weight
        return csv2weights

    def _multiprocess(self, params, num_threads, csv_files, csv2weights):
        """Run the algorithm with a pool of worker processes.

        Returns:
            The value returned by the collector job.

        Raises:
            RuntimeError: If an interrupt signal is received.

        """
        # set up parallel processing
        manager = mp.Manager()
        writing_queue = manager.Queue()
        file_queue = manager.Queue()
        for fpath in csv_files:
            file_queue.put((fpath, csv2weights[fpath]))

        jobs = []

        # Ignore SIGINT while the pool is created, then install the handler
        # that raises GracefulExit in this process.
        # additional 1 process for writer
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=num_threads + 1)
        signal.signal(signal.SIGINT, sigint_handler)
        try:
            collector_job = pool.apply_async(
                collector, (writing_queue, params, self.dev_mode))

            # Start one mapper job per requested worker.
            # NOTE(review): python_version is not forwarded, so mapper uses
            # its default of 2 - confirm this is intended.
            for _ in range(num_threads):
                jobs.append(pool.apply_async(mapper, (
                    writing_queue, params, file_queue, self.algorithm,
                    self.dev_mode, self.sandboxing)))

            # Clean up parallel processing (close pool, wait for processes
            # to finish, kill writing_queue, wait for queue to be killed)
            pool.close()
            for job in jobs:
                job.get()
            writing_queue.put('kill')  # stop collection
            result = collector_job.get()
            pool.join()
            return result
        except GracefulExit:
            pool.terminate()
            print("Exiting")
            pool.join()
            raise RuntimeError("Received interrupt signal, exiting. Bye.")
        except Exception:
            # A failing job would otherwise leave the collector waiting on
            # the queue and the pool processes alive.
            pool.terminate()
            pool.join()
            raise

    def _singleprocess(self, params, csv_files, csv2weights):
        """Run the algorithm on every csv file in the current process.

        Results are validated the same way `mapper` does: empty or invalid
        results are skipped (and reported in dev mode) instead of being
        forwarded.

        Returns:
            The value returned by ``ResultProcessor.get_result()``.

        """
        result_processor = ResultProcessor(params, self.dev_mode)
        # NOTE(review): the jail is always requested for Python 2 here -
        # confirm this is intended.
        jail = get_jail(python_version=2)
        for fpath in csv_files:
            scaler = csv2weights[fpath]
            result = process_user_csv(
                params, fpath, self.algorithm, self.dev_mode,
                self.sandboxing, jail)
            if result and is_valid_result(result):
                result_processor(result, scaler=scaler)
            elif result and self.dev_mode:
                print("Error in result {}".format(result))
        return result_processor.get_result()