Source code for deepobs.tensorflow.runners.runner

"""Module implementing StandardRunner."""

from __future__ import print_function
import numpy as np
import importlib
import tensorflow as tf
import abc
from .. import config
from .. import testproblems
from . import runner_utils
from copy import deepcopy
from deepobs import config as global_config
from deepobs.abstract_runner.abstract_runner import Runner


[docs]class TFRunner(Runner):
[docs]    def __init__(self, optimizer_class, hyperparameter_names):

        super(TFRunner, self).__init__(optimizer_class, hyperparameter_names)

[docs]    @staticmethod
    def init_summary(loss,
                     learning_rate_var,
                     batch_size,
                     tb_log_dir):
        """Initializes the tensorboard summaries"""
        # per iteration
        mb_train_loss_summary = tf.summary.scalar(
            "training/minibatch_train_losses",
            loss,
            collections=[tf.GraphKeys.SUMMARIES, "per_iteration"])
        # per epoch
        lr_summary = tf.summary.scalar(
            "hyperparams/learning_rate",
            learning_rate_var,
            collections=[tf.GraphKeys.SUMMARIES, "per_epoch"])
        batch_summary = tf.summary.scalar(
            "hyperparams/batch_size",
            batch_size,
            collections=[tf.GraphKeys.SUMMARIES, "per_epoch"])

        per_iter_summaries = tf.summary.merge_all(key="per_iteration")
        per_epoch_summaries = tf.summary.merge_all(key="per_epoch")
        summary_writer = tf.summary.FileWriter(tb_log_dir)
        return per_iter_summaries, per_epoch_summaries, summary_writer

[docs]    @staticmethod
    def write_per_epoch_summary(
                                sess,
                                loss_,
                                acc_,
                                current_step,
                                per_epoch_summaries,
                                summary_writer,
                                phase):
        """Writes the tensorboard epoch summary"""
        if phase == "TEST":
            tag = "epoch/test_"
        elif phase == "TRAIN":
            tag = "epoch/train_"
        elif phase == "VALID":
            tag = "epoch/valid_"
        else:
            raise NotImplementedError("Phase " + phase + ' not implemented for write_epoch_summary().')
        summary = tf.Summary()
        summary.value.add(tag=tag + "loss_", simple_value=loss_)
        summary.value.add(tag=tag + "acc_", simple_value=acc_)
        per_epoch_summary_ = sess.run(per_epoch_summaries)
        summary_writer.add_summary(per_epoch_summary_,
                                   current_step)
        summary_writer.add_summary(summary, current_step)
        summary_writer.flush()
        return

[docs]    @staticmethod
    def write_per_iter_summary(
                               sess,
                               per_iter_summaries,
                               summary_writer,
                               current_step):
        """Writes the tensorboard iteration summary"""
        per_iter_summary_ = sess.run(per_iter_summaries)
        summary_writer.add_summary(per_iter_summary_, current_step)

[docs]    @staticmethod
    def create_testproblem(testproblem, batch_size, weight_decay, random_seed):
        """Sets up the deepobs.tensorflow.testproblems.testproblem instance.

        Args:
            testproblem (str): The name of the testproblem.
            batch_size (int): Batch size that is used for training
            weight_decay (float): Regularization factor
            random_seed (int): The random seed of the framework

        Returns:
            deepobs.tensorflow.testproblems.testproblem: An instance of deepobs.pytorch.testproblems.testproblem
        """
        # Find testproblem by name and instantiate with batch size and weight decay.
        try:
            testproblem_mod = importlib.import_module(testproblem)
            testproblem_cls = getattr(testproblem_mod, testproblem)
            print("Loading local testproblem.")
        except:
            testproblem_cls = getattr(testproblems, testproblem)
        if weight_decay is not None:
            tproblem = testproblem_cls(batch_size, weight_decay)
        else:
            tproblem = testproblem_cls(batch_size)

        # Set up the testproblem.
        tf.reset_default_graph()
        tf.set_random_seed(random_seed)
        tproblem.set_up()
        return tproblem

    # Wrapper functions for the evaluation phase.
[docs]    @staticmethod
    def evaluate(tproblem, sess, loss, phase):
        """Computes average loss and accuracy in the evaluation phase.
        Args:
            tproblem (deepobs.tensorflow.testproblems.testproblem): The testproblem instance.
            sess (tensorflow.Session): The current TensorFlow Session.
            loss: The TensorFlow operation that computes the loss.
            phase (str): The phase of the evaluation. Muste be one of 'TRAIN', 'VALID' or 'TEST'
        """
        if phase == 'TEST':
            sess.run(tproblem.test_init_op)
            msg = "TEST:"
        elif phase == 'TRAIN':
            sess.run(tproblem.train_eval_init_op)
            msg = "TRAIN:"
        elif phase == 'VALID':
            # TODO !!!!!! IS CURRENTLY USING TEST_INIT_OP. CHANGE THIS TO VALID_INIT_OP ONCE THEY ARE IMPLEMENTED !!!!!!
            # sess.run(tproblem.valid_init_op)
            sess.run(tproblem.test_init_op)
            msg = "VALID: WARNING: THIS IS CURRENTLY ALSO THE EVALUATION ON THE TEST DATA SET!!: "
        else:
            raise NotImplementedError("Phase " + phase + ' not implemented for evaluate().')
        # Compute average loss and (if applicable) accuracy.
        loss_ = 0.0
        num_iters = 0.0
        acc_ = 0.0
        if tproblem.accuracy is not None:
            while True:
                try:
                    l_, a_ = sess.run([loss, tproblem.accuracy])
                    loss_ += l_
                    acc_ += a_
                    num_iters += 1.0
                except tf.errors.OutOfRangeError:
                    break
        else:  # accuracy is None
            acc_ = 0.0
            while True:
                try:
                    l_ = sess.run(loss)
                    loss_ += l_
                    num_iters += 1.0
                except tf.errors.OutOfRangeError:
                    break
        loss_ /= num_iters
        acc_ /= num_iters
#
        if acc_ != 0.0:
            print("{0:s} loss {1:g}, acc {2:f}".format(msg, loss_, acc_))
        else:
            print("{0:s} loss {1:g}".format(msg, loss_))

        # Print and return the results.
        return loss_, acc_

    def evaluate_all(self, n,
                     num_epochs,
                     tproblem,
                     sess,
                     loss,
                     tb_log,
                     per_epoch_summaries,
                     summary_writer,
                     train_losses,
                     valid_losses,
                     test_losses,
                     train_accuracies,
                     valid_accuracies,
                     test_accuracies):
        print("********************************")
        print("Evaluating after {0:d} of {1:d} epochs...".format(n, num_epochs))

        loss_, acc_ = self.evaluate(tproblem, sess, loss, phase='TRAIN')
        if tb_log:
            current_step = len(train_losses)
            self.write_per_epoch_summary(sess,
                                         loss_,
                                         acc_,
                                         current_step,
                                         per_epoch_summaries,
                                         summary_writer,
                                         phase='TRAIN')
        train_losses.append(loss_)
        train_accuracies.append(acc_)

        loss_, acc_ = self.evaluate(tproblem, sess, loss, phase='VALID')
        if tb_log:
            current_step = len(train_losses)
            self.write_per_epoch_summary(sess,
                                         loss_,
                                         acc_,
                                         current_step,
                                         per_epoch_summaries,
                                         summary_writer,
                                         phase='VALID')
        valid_losses.append(loss_)
        valid_accuracies.append(acc_)

        loss_, acc_ = self.evaluate(tproblem, sess, loss, phase='TEST')
        if tb_log:
            current_step = len(test_losses)
            self.write_per_epoch_summary(sess,
                                         loss_,
                                         acc_,
                                         current_step,
                                         per_epoch_summaries,
                                         summary_writer,
                                         phase='TEST')
        test_losses.append(loss_)
        test_accuracies.append(acc_)
        print("********************************")

[docs]    @abc.abstractmethod
    def training(self, tproblem, hyperparams, num_epochs, print_train_iter, train_log_interval, tb_log, tb_log_dir, **training_params):
        return


[docs]class StandardRunner(TFRunner):

[docs]    def __init__(self, optimizer_class, hyperparameter_names):

        super(StandardRunner, self).__init__(optimizer_class, hyperparameter_names)

[docs]    def training(self,
                 tproblem, hyperparams, num_epochs, print_train_iter, train_log_interval, tb_log, tb_log_dir):

        loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer

        # Set up the optimizer and create learning rate schedule.
        global_step = tf.Variable(0, trainable=False)

        # this is neccesary to apply the lr_sched later.
        learning_rate = hyperparams['learning_rate']
        learning_rate_var = tf.Variable(learning_rate, trainable=False)
        hyperparams_ = deepcopy(hyperparams)
        hyperparams_.pop('learning_rate')

        opt = self._optimizer_class(learning_rate_var, **hyperparams_)

        # Call optimizer's minimize on loss to update all variables in the
        # TRAINABLE_VARIABLES collection (with a dependency on performing all ops
        # in the collection UPDATE_OPS collection for batch norm, etc).
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            step = opt.minimize(loss, global_step=global_step)

        # Lists to track train/test loss and accuracy.
        train_losses = []
        valid_losses = []
        test_losses = []
        minibatch_train_losses = []
        train_accuracies = []
        valid_accuracies = []
        test_accuracies = []

        # Tensorboard summaries
        if tb_log:
            batch_size = tproblem._batch_size
            per_iter_summaries, per_epoch_summaries, summary_writer = self.init_summary(loss,
                                                                                        learning_rate_var,
                                                                                        batch_size,
                                                                                        tb_log_dir)
        else:    # make sure that they are assigned for evaluate_all()
            per_epoch_summaries = None,
            summary_writer = None

        # Start tensorflow session and initialize variables.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Start of training loop.
        for n in range(num_epochs + 1):
            # Evaluate at beginning of epoch.
            self.evaluate_all(n,
                              num_epochs,
                              tproblem,
                              sess,
                              loss,
                              tb_log,
                              per_epoch_summaries,
                              summary_writer,
                              train_losses,
                              valid_losses,
                              test_losses,
                              train_accuracies,
                              valid_accuracies,
                              test_accuracies)

            # Break from train loop after the last round of evaluation
            if n == num_epochs:
                break

            # Training
            sess.run(tproblem.train_init_op)
            s = 0
            while True:
                try:
                    if s % train_log_interval == 0:
                        # Training step, with logging if we hit the train_log_interval
                        _, loss_ = sess.run([step, loss])
                        minibatch_train_losses.append(loss_.astype(float))

                        if tb_log:
                            current_step = sess.run(global_step)
                            self.write_per_iter_summary(sess,
                                                        per_iter_summaries,
                                                        summary_writer,
                                                        current_step)

                        minibatch_train_losses.append(loss_.astype(float))
                        if print_train_iter:
                            print("Epoch {0:d}, step {1:d}: loss {2:g}".format(
                                n, s, loss_))
                    else:
                        sess.run(step)
                    s += 1
                except tf.errors.OutOfRangeError:
                    break

            # break from training if it goes wrong
            if not np.isfinite(loss_):
                self._abort_routine(n,
                                    num_epochs,
                                    train_losses,
                                    valid_losses,
                                    test_losses,
                                    train_accuracies,
                                    valid_accuracies,
                                    test_accuracies,
                                    minibatch_train_losses)
                break
            else:
                continue

        sess.close()
        # --- End of training loop.

        # Put results into output dictionary.
        output = {
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "test_losses": test_losses,
            "train_accuracies": train_accuracies,
            "valid_accuracies": valid_accuracies,
            "test_accuracies": test_accuracies,
            "minibatch_train_losses": minibatch_train_losses,
        }
            
        return output


[docs]class LearningRateScheduleRunner(TFRunner):

[docs]    def __init__(self, optimizer_class, hyperparameter_names):

        super(LearningRateScheduleRunner, self).__init__(optimizer_class, hyperparameter_names)

    def _add_training_params_to_argparse(self, parser, args, training_params):
        try:
            args['lr_sched_epochs'] = training_params['lr_sched_epochs']
        except KeyError:
            parser.add_argument(
                "--lr_sched_epochs",
                nargs="+",
                type=int,
                help="""One or more epoch numbers (positive integers) that mark
          learning rate changes. The base learning rate has to be passed via
          '--learing_rate' and the factors by which to change have to be passed
          via '--lr_sched_factors'. Example: '--lr 0.3 --lr_sched_epochs 50 100
          --lr_sched_factors 0.1 0.01' will start with a learning rate of 0.3,
          then decrease to 0.1*0.3=0.03 after training for 50 epochs, and
          decrease to 0.01*0.3=0.003' after training for 100 epochs.""")

        try:
            args['lr_sched_factors'] = training_params['lr_sched_factors']
        except KeyError:
            parser.add_argument(
                "--lr_sched_factors",
                nargs="+",
                type=float,
                help=
                """One or more factors (floats) by which to change the learning
          rate. The base learning rate has to be passed via '--learing_rate' and
          the epochs at which to change the learning rate have to be passed via
          '--lr_sched_factors'. Example: '--lr 0.3 --lr_sched_epochs 50 100
          --lr_sched_factors 0.1 0.01' will start with a learning rate of 0.3,
          then decrease to 0.1*0.3=0.03 after training for 50 epochs, and
          decrease to 0.01*0.3=0.003' after training for 100 epochs.""")

[docs]    def training(self,
                 tproblem,
                 hyperparams,
                 num_epochs,
                 print_train_iter,
                 train_log_interval,
                 tb_log,
                 tb_log_dir,
                 # the following are the training_params
                 lr_sched_epochs=None,
                 lr_sched_factors=None):
        """Performs the training and stores the metrices.

        Args:
            tproblem (deepobs.[tensorflow/pytorch].testproblems.testproblem): The testproblem instance to train on.
            hyperparams (dict): The optimizer hyperparameters to use for the training.
            num_epochs (int): The number of training epochs.
            print_train_iter (bool): Whether to print the training progress at every train_log_interval
            train_log_interval (int): Mini-batch interval for logging.
            tb_log (bool): Whether to use tensorboard logging or not
            tb_log_dir (str): The path where to save tensorboard events.
            lr_sched_epochs (list): The epochs where to adjust the learning rate.
            lr_sched_factors (list): The corresponding factors by which to adjust the learning rate.

        Returns:
            dict: The logged metrices. Is of the form: \
                {'test_losses' : [...], \
                'valid_losses': [...], \
                 'train_losses': [...],  \
                 'test_accuracies': [...], \
                 'valid_accuracies': [...], \
                 'train_accuracies': [...] \
                 } \
            where the metrices values are lists that were filled during training.
    """

        loss = tf.reduce_mean(tproblem.losses) + tproblem.regularizer

        # Set up the optimizer and create learning rate schedule.
        global_step = tf.Variable(0, trainable=False)

        # this is neccesary to apply the lr_sched later.
        learning_rate = hyperparams['learning_rate']
        learning_rate_var = tf.Variable(learning_rate, trainable=False)
        hyperparams_ = deepcopy(hyperparams)
        hyperparams_.pop('learning_rate')

        opt = self._optimizer_class(learning_rate_var, **hyperparams_)
        lr_schedule = runner_utils.make_lr_schedule(
            learning_rate, lr_sched_epochs, lr_sched_factors)

        # Call optimizer's minimize on loss to update all variables in the
        # TRAINABLE_VARIABLES collection (with a dependency on performing all ops
        # in the collection UPDATE_OPS collection for batch norm, etc).
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            step = opt.minimize(loss, global_step=global_step)

        # Lists to track train/test loss and accuracy.
        train_losses = []
        valid_losses = []
        test_losses = []
        minibatch_train_losses = []
        train_accuracies = []
        valid_accuracies = []
        test_accuracies = []

        # Tensorboard summaries
        if tb_log:
            batch_size = tproblem._batch_size
            per_iter_summaries, per_epoch_summaries, summary_writer = self.init_summary(loss,
                                                                                        learning_rate_var,
                                                                                        batch_size,
                                                                                        tb_log_dir)
        else:    # make sure that they are assigned for evaluate_all()
            per_epoch_summaries = None,
            summary_writer = None

        # Start tensorflow session and initialize variables.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Start of training loop.
        for n in range(num_epochs + 1):
            # Evaluate at beginning of epoch.
            self.evaluate_all(n,
                              num_epochs,
                              tproblem,
                              sess,
                              loss,
                              tb_log,
                              per_epoch_summaries,
                              summary_writer,
                              train_losses,
                              valid_losses,
                              test_losses,
                              train_accuracies,
                              valid_accuracies,
                              test_accuracies)

            # Break from train loop after the last round of evaluation
            if n == num_epochs:
                break

            # Training
            if n in lr_schedule:
                sess.run(learning_rate_var.assign(lr_schedule[n]))
                print("Setting learning rate to {0:f}".format(lr_schedule[n]))
            sess.run(tproblem.train_init_op)
            s = 0
            while True:
                try:
                    if s % train_log_interval == 0:
                        # Training step, with logging if we hit the train_log_interval
                        _, loss_ = sess.run([step, loss])
                        minibatch_train_losses.append(loss_.astype(float))

                        if tb_log:
                            current_step = sess.run(global_step)
                            self.write_per_iter_summary(sess,
                                                        per_iter_summaries,
                                                        summary_writer,
                                                        current_step)

                        minibatch_train_losses.append(loss_.astype(float))
                        if print_train_iter:
                            print("Epoch {0:d}, step {1:d}: loss {2:g}".format(
                                n, s, loss_))
                    else:
                        sess.run(step)
                    s += 1
                except tf.errors.OutOfRangeError:
                    break

            # break from training if it goes wrong
            if not np.isfinite(loss_):
                self._abort_routine(n,
                                    num_epochs,
                                    train_losses,
                                    valid_losses,
                                    test_losses,
                                    train_accuracies,
                                    valid_accuracies,
                                    test_accuracies,
                                    minibatch_train_losses)

                break
            else:
                continue

        sess.close()
        # --- End of training loop.

        # Put results into output dictionary.
        output = {
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "test_losses": test_losses,
            "train_accuracies": train_accuracies,
            "valid_accuracies": valid_accuracies,
            "test_accuracies": test_accuracies,
            "minibatch_train_losses": minibatch_train_losses,
        }

        return output