/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.matrix.data;

import jcuda.Pointer;
import jcuda.jcublas.cublasHandle;
import jcuda.jcusparse.cusparseHandle;
import jcuda.runtime.JCuda;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.And;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.Divide;
import org.apache.sysml.runtime.functionobjects.Equals;
import org.apache.sysml.runtime.functionobjects.GreaterThan;
import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
import org.apache.sysml.runtime.functionobjects.IndexFunction;
import org.apache.sysml.runtime.functionobjects.IntegerDivide;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
import org.apache.sysml.runtime.functionobjects.LessThan;
import org.apache.sysml.runtime.functionobjects.LessThanEquals;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.functionobjects.Minus;
import org.apache.sysml.runtime.functionobjects.Minus1Multiply;
import org.apache.sysml.runtime.functionobjects.MinusNz;
import org.apache.sysml.runtime.functionobjects.Modulus;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.Multiply2;
import org.apache.sysml.runtime.functionobjects.NotEquals;
import org.apache.sysml.runtime.functionobjects.Or;
import org.apache.sysml.runtime.functionobjects.Plus;
import org.apache.sysml.runtime.functionobjects.Power;
import org.apache.sysml.runtime.functionobjects.Power2;
import org.apache.sysml.runtime.functionobjects.ReduceAll;
import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.functionobjects.ReduceDiag;
import org.apache.sysml.runtime.functionobjects.ReduceRow;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.instructions.cp.DoubleObject;
import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
import org.apache.sysml.runtime.matrix.data.CudaSupportFunctions;
import org.apache.sysml.runtime.matrix.data.DoublePrecisionCudaSupportFunctions;
import org.apache.sysml.runtime.matrix.data.LibMatrixCuMatMult;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.matrix.data.SinglePrecisionCudaSupportFunctions;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
import org.apache.sysml.runtime.util.IndexRange;
import org.apache.sysml.utils.GPUStatistics;

public class LibMatrixCUDA {
    private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
    protected static int CUDNN_DATA_TYPE = 1;
    public static CudaSupportFunctions cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
    public static int sizeOfDataType = 8;
    public static String customKernelSuffix = "_d";
    private static int _MAX_THREADS = -1;
    private static int _MAX_BLOCKS = -1;
    private static int _WARP_SIZE = -1;
    protected static long maxNumElementsOfCuDNNTensor = 2000000000L;
    private static Pointer _one;
    private static Pointer _zero;
    private static int oldDataTypeSize;

    public static void resetFloatingPointPrecision() throws DMLRuntimeException {
        if (DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("double")) {
            CUDNN_DATA_TYPE = 1;
            cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
            sizeOfDataType = 8;
            customKernelSuffix = "_d";
        } else if (DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("single")) {
            CUDNN_DATA_TYPE = 0;
            cudaSupportFunctions = new SinglePrecisionCudaSupportFunctions();
            sizeOfDataType = 4;
            customKernelSuffix = "_f";
        } else {
            throw new DMLRuntimeException("Unsupported floating point precision: " + DMLScript.FLOATING_POINT_PRECISION);
        }
    }

    static int getMaxThreads(GPUContext gCtx) throws DMLRuntimeException {
        if (_MAX_THREADS == -1) {
            _MAX_THREADS = gCtx.getMaxThreadsPerBlock();
        }
        return _MAX_THREADS;
    }

    static int getMaxBlocks(GPUContext gCtx) throws DMLRuntimeException {
        if (_MAX_BLOCKS == -1) {
            _MAX_BLOCKS = gCtx.getMaxBlocks();
        }
        return _MAX_BLOCKS;
    }

    static int getWarpSize(GPUContext gCtx) throws DMLRuntimeException {
        if (_WARP_SIZE == -1) {
            _WARP_SIZE = gCtx.getWarpSize();
        }
        return _WARP_SIZE;
    }

    public static boolean isInSparseFormat(GPUContext gCtx, MatrixObject mo) {
        if (mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated()) {
            return mo.getGPUObject(gCtx).isSparse();
        }
        return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
    }

    public static long getNnz(GPUContext gCtx, String instName, MatrixObject mo, boolean recomputeDenseNNZ) throws DMLRuntimeException {
        if (mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated()) {
            return mo.getGPUObject(gCtx).getNnz(instName, recomputeDenseNNZ);
        }
        return mo.getNnz();
    }

    protected static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException {
        return gCtx.getCusparseHandle();
    }

    protected static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
        return gCtx.getCublasHandle();
    }

    protected static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
        return gCtx.getKernels();
    }

    public static Pointer double2float(GPUContext gCtx, Pointer A, Pointer ret, int numElems) throws DMLRuntimeException {
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("double2float", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), A, ret, numElems);
        return ret;
    }

    public static Pointer float2double(GPUContext gCtx, Pointer A, Pointer ret, int numElems) throws DMLRuntimeException {
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("float2double", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), A, ret, numElems);
        return ret;
    }

    public static Pointer one() {
        if (_one == null || oldDataTypeSize != sizeOfDataType) {
            _one = LibMatrixCUDA.dataTypePointerTo(1.0);
            oldDataTypeSize = sizeOfDataType;
        }
        return _one;
    }

    public static Pointer zero() {
        if (_zero == null || oldDataTypeSize != sizeOfDataType) {
            _zero = LibMatrixCUDA.dataTypePointerTo(0.0);
            oldDataTypeSize = sizeOfDataType;
        }
        return _zero;
    }

    protected static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
        if (LibMatrixCUDA.isInSparseFormat(gCtx, input)) {
            input.getGPUObject(gCtx).sparseToDense(instName);
        }
        return input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
    }

    protected static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
        if (!LibMatrixCUDA.isInSparseFormat(gCtx, input)) {
            input.getGPUObject(gCtx).denseToSparse();
        }
        return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
    }

    protected static Pointer dataTypePointerTo(double value) {
        if (sizeOfDataType == 8) {
            return Pointer.to((double[])new double[]{value});
        }
        if (sizeOfDataType == 4) {
            return Pointer.to((float[])new float[]{(float)value});
        }
        throw new RuntimeException("Unsupported datatype with size " + sizeOfDataType);
    }

    public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : reluBackward, GPUContext=" + gCtx);
        }
        long rows = input.getNumRows();
        long cols = input.getNumColumns();
        Pointer imagePointer = LibMatrixCUDA.getDensePointer(gCtx, input, instName);
        Pointer doutPointer = LibMatrixCUDA.getDensePointer(gCtx, dout, instName);
        Pointer outputPointer = LibMatrixCUDA.getDensePointer(gCtx, outputBlock, instName);
        long t1 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations(LibMatrixCUDA.toInt(rows), LibMatrixCUDA.toInt(cols)), imagePointer, doutPointer, outputPointer, LibMatrixCUDA.toInt(rows), LibMatrixCUDA.toInt(cols));
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnrbk", System.nanoTime() - t1);
        }
    }

    public static void channelSums(GPUContext gCtx, String instName, MatrixObject input, MatrixObject outputBlock, long C, long HW) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : channelSums, GPUContext=" + gCtx);
        }
        int N = LibMatrixCUDA.toInt(input.getNumRows());
        int cols = LibMatrixCUDA.toInt(input.getNumColumns());
        if ((long)cols != C * HW) {
            throw new DMLRuntimeException("Incorrect parameters, number of columns " + cols + " != " + C + "*" + HW);
        }
        Pointer imagePointer = LibMatrixCUDA.getDensePointer(gCtx, input, instName);
        Pointer outputPointer = LibMatrixCUDA.getDensePointer(gCtx, outputBlock, instName);
        Pointer tmp = gCtx.allocate(instName, cols * sizeOfDataType);
        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, LibMatrixCUDA.toInt(C), LibMatrixCUDA.toInt(HW));
        gCtx.cudaFreeHelper(tmp);
    }

    public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : biasMultiply, GPUContext=" + gCtx);
        }
        if (LibMatrixCUDA.isInSparseFormat(gCtx, input)) {
            input.getGPUObject(gCtx).sparseToDense(instName);
        }
        if (LibMatrixCUDA.isInSparseFormat(gCtx, bias)) {
            bias.getGPUObject(gCtx).sparseToDense(instName);
        }
        long rows = input.getNumRows();
        long cols = input.getNumColumns();
        long K = bias.getNumRows();
        long PQ = cols / K;
        if (bias.getNumColumns() != 1L || cols % K != 0L) {
            throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
        long t1 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations(LibMatrixCUDA.toInt(rows), LibMatrixCUDA.toInt(cols)), imagePointer, biasPointer, outputPointer, LibMatrixCUDA.toInt(rows), LibMatrixCUDA.toInt(cols), LibMatrixCUDA.toInt(PQ));
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnba", System.nanoTime() - t1);
        }
    }

    public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
        Pointer imagePointer = LibMatrixCUDA.getDensePointer(gCtx, input, instName);
        Pointer biasPointer = LibMatrixCUDA.getDensePointer(gCtx, bias, instName);
        Pointer outputPointer = LibMatrixCUDA.getDensePointer(gCtx, outputBlock, instName);
        int rows = LibMatrixCUDA.toInt(input.getNumRows());
        int cols = LibMatrixCUDA.toInt(input.getNumColumns());
        int K = LibMatrixCUDA.toInt(bias.getNumRows());
        if (bias.getNumColumns() != 1L || cols % K != 0) {
            throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        LibMatrixCUDA.biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K);
    }

    private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : biasAdd, GPUContext=" + gCtx);
        }
        int PQ = cols / k;
        long t1 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("bias_add", ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), image, bias, output, rows, cols, PQ);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "nnba", System.nanoTime() - t1);
        }
    }

    public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, boolean isLeftTransposed) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : matmultTSMM, GPUContext=" + gCtx);
        }
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LibMatrixCUDA.isInSparseFormat(gCtx, left)) {
            LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
            return;
        }
        int transa = isLeftTransposed ? 0 : 1;
        int m = LibMatrixCUDA.toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
        int k = LibMatrixCUDA.toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
        MatrixObject output = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m);
        if (m == -1) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }
        int lda = LibMatrixCUDA.toInt(isLeftTransposed ? (long)m : (long)k);
        int ldc = m;
        if (!left.getGPUObject(gCtx).isAllocated()) {
            throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
        }
        if (!output.getGPUObject(gCtx).isAllocated()) {
            throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated());
        }
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, left, instName);
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, output, instName);
        long t0 = 0L;
        long t1 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        cudaSupportFunctions.cublassyrk(LibMatrixCUDA.getCublasHandle(gCtx), 0, transa, m, k, LibMatrixCUDA.one(), A, lda, LibMatrixCUDA.zero(), C, ldc);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "Msyrk", System.nanoTime() - t0);
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.copyUpperToLowerTriangle(gCtx, instName, output);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "u2lk", System.nanoTime() - t1);
        }
    }

    private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : copyUpperToLowerTriangle, GPUContext=" + gCtx);
        }
        if (LibMatrixCUDA.isInSparseFormat(gCtx, ret)) {
            throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
        }
        if (ret.getNumRows() != ret.getNumColumns()) {
            throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
        }
        int dim = LibMatrixCUDA.toInt(ret.getNumRows());
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("copy_u2l_dense", ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), LibMatrixCUDA.getDensePointer(gCtx, ret, instName), dim, dim * dim);
    }

    public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : unaryAggregate, GPUContext=" + gCtx);
        }
        boolean REDUCTION_ALL = true;
        int REDUCTION_ROW = 2;
        int REDUCTION_COL = 3;
        int REDUCTION_DIAG = 4;
        boolean OP_PLUS = true;
        int OP_PLUS_SQ = 2;
        int OP_MEAN = 3;
        int OP_VARIANCE = 4;
        int OP_MULTIPLY = 5;
        int OP_MAX = 6;
        int OP_MIN = 7;
        int OP_MAXINDEX = 8;
        int OP_MININDEX = 9;
        if (!in1.getGPUObject(gCtx).isAllocated()) {
            throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());
        }
        boolean isSparse = in1.getGPUObject(gCtx).isSparse();
        IndexFunction indexFn = op.indexFn;
        AggregateOperator aggOp = op.aggOp;
        int reductionDirection = -1;
        if (indexFn instanceof ReduceAll) {
            reductionDirection = 1;
        } else if (indexFn instanceof ReduceRow) {
            reductionDirection = 2;
        } else if (indexFn instanceof ReduceCol) {
            reductionDirection = 3;
        } else if (indexFn instanceof ReduceDiag) {
            reductionDirection = 4;
        } else {
            throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
        }
        if (reductionDirection == -1) {
            throw new DMLRuntimeException("Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction");
        }
        int opIndex = -1;
        if (aggOp.increOp.fn instanceof KahanPlus) {
            opIndex = 1;
        } else if (aggOp.increOp.fn instanceof KahanPlusSq) {
            opIndex = 2;
        } else if (aggOp.increOp.fn instanceof Mean) {
            opIndex = 3;
        } else if (aggOp.increOp.fn instanceof CM) {
            if (((CM)aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE) {
                throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU");
            }
            opIndex = 4;
        } else if (aggOp.increOp.fn instanceof Plus) {
            opIndex = 1;
        } else if (aggOp.increOp.fn instanceof Multiply) {
            opIndex = 5;
        } else if (aggOp.increOp.fn instanceof Builtin) {
            Builtin b = (Builtin)aggOp.increOp.fn;
            switch (b.bFunc) {
                case MAX: {
                    opIndex = 6;
                    break;
                }
                case MIN: {
                    opIndex = 7;
                    break;
                }
                case MAXINDEX: {
                    opIndex = 8;
                    break;
                }
                case MININDEX: {
                    opIndex = 9;
                    break;
                }
                default: {
                    new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
                    break;
                }
            }
        } else {
            throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
        }
        if (opIndex == -1) {
            throw new DMLRuntimeException("Internal Error - Incorrect type of operation set for aggregate unary GPU instruction");
        }
        int rlen = (int)in1.getNumRows();
        int clen = (int)in1.getNumColumns();
        if (isSparse) {
            in1.getGPUObject(gCtx).sparseToDense(instName);
        }
        long outRLen = -1L;
        long outCLen = -1L;
        if (indexFn instanceof ReduceRow) {
            outRLen = 1L;
            outCLen = clen;
        } else if (indexFn instanceof ReduceCol) {
            outRLen = rlen;
            outCLen = 1L;
        }
        Pointer out = null;
        if (reductionDirection == 3 || reductionDirection == 2) {
            MatrixObject out1 = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
            out = LibMatrixCUDA.getDensePointer(gCtx, out1, instName);
        }
        Pointer in = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        int size = rlen * clen;
        block6 : switch (opIndex) {
            case 1: {
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break block6;
                    }
                    case 3: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
                        break block6;
                    }
                    case 2: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
                        break block6;
                    }
                    case 4: {
                        throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
                    }
                }
                break;
            }
            case 2: {
                Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
                LibMatrixCUDA.squareMatrix(gCtx, instName, in, tmp, rlen, clen);
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", tmp, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break;
                    }
                    case 3: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
                        break;
                    }
                    case 2: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                    }
                }
                gCtx.cudaFreeHelper(instName, tmp);
                break;
            }
            case 3: {
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", in, size);
                        double mean = result / (double)size;
                        ec.setScalarOutput(output, new DoubleObject(mean));
                        break block6;
                    }
                    case 3: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                        break block6;
                    }
                    case 2: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                        break block6;
                    }
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
            }
            case 5: {
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_prod", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break block6;
                    }
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
            }
            case 6: {
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_max", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break block6;
                    }
                    case 3: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
                        break block6;
                    }
                    case 2: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
                        break block6;
                    }
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
            }
            case 7: {
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_min", in, size);
                        ec.setScalarOutput(output, new DoubleObject(result));
                        break block6;
                    }
                    case 3: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
                        break block6;
                    }
                    case 2: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
                        break block6;
                    }
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
            }
            case 4: {
                Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
                Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
                switch (reductionDirection) {
                    case 1: {
                        double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", in, size);
                        double mean = result / (double)size;
                        RightScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
                        LibMatrixCUDA.matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
                        LibMatrixCUDA.squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                        double result2 = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
                        double variance = result2 / (double)(size - 1);
                        ec.setScalarOutput(output, new DoubleObject(variance));
                        break;
                    }
                    case 3: {
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
                        BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                        LibMatrixCUDA.matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
                        LibMatrixCUDA.squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                        Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
                        LibMatrixCUDA.reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
                        RightScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
                        LibMatrixCUDA.matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
                        gCtx.cudaFreeHelper(instName, tmpRow);
                        break;
                    }
                    case 2: {
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
                        BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
                        LibMatrixCUDA.matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
                        LibMatrixCUDA.squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
                        Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
                        LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
                        RightScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
                        LibMatrixCUDA.matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
                        gCtx.cudaFreeHelper(instName, tmpCol);
                        break;
                    }
                    default: {
                        throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                    }
                }
                gCtx.cudaFreeHelper(instName, tmp);
                gCtx.cudaFreeHelper(instName, tmp2);
                break;
            }
            case 8: {
                switch (reductionDirection) {
                    case 3: {
                        throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
                    }
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
            }
            case 9: {
                switch (reductionDirection) {
                    case 3: {
                        throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
                    }
                }
                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
            }
            default: {
                throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
            }
        }
    }

    private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
        RightScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2.0);
        LibMatrixCUDA.matrixScalarOp(gCtx, instName, in, 2.0, rlen, clen, out, power2op);
    }

    private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
        }
        int[] tmp = LibMatrixCUDA.getKernelParamsForReduceAll(gCtx, n);
        int blocks = tmp[0];
        int threads = tmp[1];
        int sharedMem = tmp[2];
        Pointer tempOut = gCtx.allocate(instName, n * sizeOfDataType);
        long t1 = 0L;
        long t2 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rallk", System.nanoTime() - t1);
        }
        int s = blocks;
        while (s > 1) {
            tmp = LibMatrixCUDA.getKernelParamsForReduceAll(gCtx, s);
            blocks = tmp[0];
            threads = tmp[1];
            sharedMem = tmp[2];
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t2 = System.nanoTime();
            }
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "rallk", System.nanoTime() - t2);
            }
            s = (s + (threads * 2 - 1)) / (threads * 2);
        }
        double[] result = new double[]{-1.0};
        cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false);
        gCtx.cudaFreeHelper(instName, tempOut);
        return result[0];
    }

    private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
        }
        int[] tmp = LibMatrixCUDA.getKernelParamsForReduceByRow(gCtx, rows, cols);
        int blocks = tmp[0];
        int threads = tmp[1];
        int sharedMem = tmp[2];
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rrowk", System.nanoTime() - t0);
        }
    }

    private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
        }
        int[] tmp = LibMatrixCUDA.getKernelParamsForReduceByCol(gCtx, rows, cols);
        int blocks = tmp[0];
        int threads = tmp[1];
        int sharedMem = tmp[2];
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rcolk", System.nanoTime() - t0);
        }
    }

    private static int[] getKernelParamsForReduceAll(GPUContext gCtx, int n) throws DMLRuntimeException {
        int MAX_THREADS = LibMatrixCUDA.getMaxThreads(gCtx);
        int MAX_BLOCKS = LibMatrixCUDA.getMaxBlocks(gCtx);
        int WARP_SIZE = LibMatrixCUDA.getWarpSize(gCtx);
        int threads = n < MAX_THREADS * 2 ? LibMatrixCUDA.nextPow2((n + 1) / 2) : MAX_THREADS;
        int blocks = (n + (threads * 2 - 1)) / (threads * 2);
        blocks = Math.min(MAX_BLOCKS, blocks);
        int sharedMemSize = threads * sizeOfDataType;
        if (threads <= WARP_SIZE) {
            sharedMemSize *= 2;
        }
        return new int[]{blocks, threads, sharedMemSize};
    }

    private static int[] getKernelParamsForReduceByRow(GPUContext gCtx, int rows, int cols) throws DMLRuntimeException {
        int WARP_SIZE = LibMatrixCUDA.getWarpSize(gCtx);
        int MAX_THREADS = LibMatrixCUDA.getMaxThreads(gCtx);
        int threads = cols < MAX_THREADS * 2 ? LibMatrixCUDA.nextPow2((cols + 1) / 2) : MAX_THREADS;
        int blocks = rows;
        int sharedMemSize = threads * sizeOfDataType;
        if (threads <= WARP_SIZE) {
            sharedMemSize *= 2;
        }
        return new int[]{blocks, threads, sharedMemSize};
    }

    private static int[] getKernelParamsForReduceByCol(GPUContext gCtx, int rows, int cols) throws DMLRuntimeException {
        int MAX_THREADS = LibMatrixCUDA.getMaxThreads(gCtx);
        int MAX_BLOCKS = LibMatrixCUDA.getMaxBlocks(gCtx);
        int WARP_SIZE = LibMatrixCUDA.getWarpSize(gCtx);
        int threads = Math.min(cols, MAX_THREADS);
        int blocks = Math.min(cols / MAX_THREADS, MAX_BLOCKS);
        if (cols % MAX_THREADS != 0) {
            ++blocks;
        }
        int sharedMemSize = threads * sizeOfDataType;
        if (threads <= WARP_SIZE) {
            sharedMemSize *= 2;
        }
        return new int[]{blocks, threads, sharedMemSize};
    }

    private static int nextPow2(int x) {
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return ++x;
    }

    public static void matrixScalarRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, ScalarOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        double constant = op.getConstant();
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + ", GPUContext=" + gCtx);
        }
        if (LibMatrixCUDA.isSparseAndEmpty(gCtx, in)) {
            LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, op.executeScalar(0.0), outputName, in.getNumRows(), in.getNumColumns());
            return;
        }
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in, instName);
        MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns());
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        int rlenA = LibMatrixCUDA.toInt(in.getNumRows());
        int clenA = LibMatrixCUDA.toInt(in.getNumColumns());
        LibMatrixCUDA.matrixScalarOp(gCtx, instName, A, constant, rlenA, clenA, C, op);
    }

    public static void matrixScalarArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
        int outCLen;
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        double constant = op.getConstant();
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx);
        }
        int outRLen = isInputTransposed ? (int)in.getNumColumns() : (int)in.getNumRows();
        int n = outCLen = isInputTransposed ? (int)in.getNumRows() : (int)in.getNumColumns();
        if (constant == 0.0) {
            if (op.fn instanceof Plus || op.fn instanceof Minus && op instanceof RightScalarOperator || op.fn instanceof Or) {
                LibMatrixCUDA.deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
            } else if (op.fn instanceof Multiply || op.fn instanceof And) {
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
            } else if (op.fn instanceof Power) {
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
            } else {
                LibMatrixCUDA.matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
            }
        } else if (constant == 1.0 && op.fn instanceof Or) {
            LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
        } else if (constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
            LibMatrixCUDA.deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
        } else {
            LibMatrixCUDA.matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
        }
    }

    public static void matrixMatrixRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, BinaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        boolean in1SparseAndEmpty = LibMatrixCUDA.isSparseAndEmpty(gCtx, in1);
        boolean in2SparseAndEmpty = LibMatrixCUDA.isSparseAndEmpty(gCtx, in2);
        if (in1SparseAndEmpty && in2SparseAndEmpty) {
            if (op.fn instanceof LessThan || op.fn instanceof GreaterThan || op.fn instanceof NotEquals) {
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 0.0, outputName, in1.getNumRows(), in1.getNumColumns());
            } else if (op.fn instanceof LessThanEquals || op.fn instanceof GreaterThanEquals || op.fn instanceof Equals) {
                LibMatrixCUDA.setOutputToConstant(ec, gCtx, instName, 1.0, outputName, in1.getNumRows(), in1.getNumColumns());
            }
        } else if (in1SparseAndEmpty) {
            LibMatrixCUDA.matrixScalarRelational(ec, gCtx, instName, in2, outputName, new LeftScalarOperator(op.fn, 0.0));
        } else if (in2SparseAndEmpty) {
            LibMatrixCUDA.matrixScalarRelational(ec, gCtx, instName, in1, outputName, new RightScalarOperator(op.fn, 0.0));
        } else {
            LibMatrixCUDA.matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, false, false, op);
        }
    }

    public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
        boolean isCUDALibAvailable;
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        boolean bl = isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !LibMatrixCUDA.isSparseAndEmpty(gCtx, in1) && !LibMatrixCUDA.isSparseAndEmpty(gCtx, in2) && !LibMatrixCUDA.isVector(in1) && !LibMatrixCUDA.isVector(in2);
        if (!isCUDALibAvailable) {
            LibMatrixCUDA.matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
        } else {
            double beta;
            double alpha;
            if (op.fn instanceof Plus) {
                alpha = 1.0;
                beta = 1.0;
            } else if (op.fn instanceof Minus) {
                alpha = 1.0;
                beta = -1.0;
            } else {
                throw new DMLRuntimeException("Unsupported op");
            }
            LibMatrixCUDA.dgeam(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, alpha, beta);
        }
    }

    public static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (isInputTransposed) {
            throw new DMLRuntimeException("Transposing the input is not supported");
        }
        int rlenA = LibMatrixCUDA.toInt(in.getNumRows());
        int clenA = LibMatrixCUDA.toInt(in.getNumColumns());
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in, instName);
        double scalar = op.getConstant();
        MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlenA, clenA);
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        LibMatrixCUDA.matrixScalarOp(gCtx, instName, A, scalar, rlenA, clenA, C, op);
    }

    private static void matrixScalarOp(GPUContext gCtx, String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : matrix_scalar_op, GPUContext=" + gCtx);
        }
        int isLeftScalar = op instanceof LeftScalarOperator ? 1 : 0;
        int size = rlenA * clenA;
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("matrix_scalar_op", ExecutionConfig.getConfigForSimpleVectorOperations(size), a, scalar, c, size, LibMatrixCUDA.getBinaryOp(op.fn), isLeftScalar);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "msk", System.nanoTime() - t0);
        }
    }

    private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        boolean isEmpty1 = LibMatrixCUDA.isSparseAndEmpty(gCtx, in1);
        boolean isEmpty2 = LibMatrixCUDA.isSparseAndEmpty(gCtx, in2);
        int rlenA = LibMatrixCUDA.toInt(in1.getNumRows());
        int rlenB = LibMatrixCUDA.toInt(in2.getNumRows());
        int clenA = LibMatrixCUDA.toInt(in1.getNumColumns());
        int clenB = LibMatrixCUDA.toInt(in2.getNumColumns());
        int vecStatusA = LibMatrixCUDA.getVectorStatus(rlenA, clenA).code();
        int vecStatusB = LibMatrixCUDA.getVectorStatus(rlenB, clenB).code();
        if (isLeftTransposed || isRightTransposed) {
            throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
        }
        long outRLen = Math.max(rlenA, rlenB);
        long outCLen = Math.max(clenA, clenB);
        if (isEmpty1 && isEmpty2) {
            MatrixObject out = ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
            if (op.fn instanceof Divide || op.fn instanceof IntegerDivide || op.fn instanceof Modulus) {
                out.getGPUObject(gCtx).allocateAndFillDense(Double.NaN);
            } else if (op.fn instanceof Minus1Multiply) {
                out.getGPUObject(gCtx).allocateAndFillDense(1.0);
            } else {
                out.getGPUObject(gCtx).allocateSparseAndEmpty();
            }
        } else if (isEmpty1 && clenB != 1 && rlenB != 1) {
            LibMatrixCUDA.matrixScalarArithmetic(ec, gCtx, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
        } else if (isEmpty2 && clenA != 1 && rlenA != 1) {
            LibMatrixCUDA.matrixScalarArithmetic(ec, gCtx, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
        } else {
            Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
            Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
            MatrixObject out = null;
            try {
                out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
            }
            catch (DMLRuntimeException e) {
                throw new DMLRuntimeException("Incorrect dimensions: dimA:[" + rlenA + "," + clenA + "] dimB:[" + rlenB + "," + clenB + "] out:[" + outRLen + "," + outCLen + "]", e);
            }
            Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            int maxRlen = Math.max(rlenA, rlenB);
            int maxClen = Math.max(clenA, clenB);
            LibMatrixCUDA.matrixMatrixOp(gCtx, instName, A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op);
        }
    }

    private static void matrixMatrixOp(GPUContext gCtx, String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : matrix_matrix_cellwise_op, GPUContext=" + gCtx);
        }
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, LibMatrixCUDA.getBinaryOp(op.fn));
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "mmck", System.nanoTime() - t0);
        }
    }

    private static VectorShape getVectorStatus(long rows, long cols) {
        if (cols == 1L) {
            return VectorShape.COLUMN;
        }
        if (rows == 1L) {
            return VectorShape.ROW;
        }
        return VectorShape.NONE;
    }

    private static boolean isVector(MatrixObject in) {
        return in.getNumRows() == 1L || in.getNumColumns() == 1L;
    }

    private static boolean isSparseAndEmpty(GPUContext gCtx, MatrixObject in1) {
        boolean isSparse1 = LibMatrixCUDA.isInSparseFormat(gCtx, in1);
        boolean isEmpty1 = isSparse1 && in1.getGPUObject((GPUContext)gCtx).getJcudaSparseMatrixPtr().nnz == 0L;
        return isEmpty1;
    }

    private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName, boolean isInputTransposed) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (!isInputTransposed) {
            LibMatrixCUDA.deviceCopy(ec, gCtx, instName, src, outputName);
        } else {
            LibMatrixCUDA.transpose(ec, gCtx, instName, src, outputName);
        }
    }

    private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName) throws DMLRuntimeException {
        Pointer srcPtr = LibMatrixCUDA.getDensePointer(gCtx, src, instName);
        MatrixObject out = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, LibMatrixCUDA.toInt(src.getNumRows()), LibMatrixCUDA.toInt(src.getNumColumns()));
        Pointer destPtr = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        LibMatrixCUDA.deviceCopy(instName, srcPtr, destPtr, (int)src.getNumRows(), (int)src.getNumColumns());
    }

    private static void setOutputToConstant(ExecutionContext ec, GPUContext gCtx, String instName, double constant, String outputName, long numRows, long numCols) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (constant == 0.0) {
            LibMatrixCUDA.getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, 0L, instName, outputName);
        } else {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols);
            Pointer A = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            int rlen = LibMatrixCUDA.toInt(out.getNumRows());
            int clen = LibMatrixCUDA.toInt(out.getNumColumns());
            long t0 = 0L;
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t0 = System.nanoTime();
            }
            int size = rlen * clen;
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "fillk", System.nanoTime() - t0);
            }
        }
    }

    private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException {
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        int size = rlen * clen * sizeOfDataType;
        JCuda.cudaMemcpy((Pointer)dest, (Pointer)src, (long)size, (int)3);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "D2D", System.nanoTime() - t0);
        }
    }

    private static int getBinaryOp(ValueFunction fn) throws DMLRuntimeException {
        if (fn instanceof Plus) {
            return 0;
        }
        if (fn instanceof Minus) {
            return 1;
        }
        if (fn instanceof Multiply) {
            return 2;
        }
        if (fn instanceof Divide) {
            return 3;
        }
        if (fn instanceof Power) {
            return 4;
        }
        if (fn instanceof LessThan) {
            return 5;
        }
        if (fn instanceof LessThanEquals) {
            return 6;
        }
        if (fn instanceof GreaterThan) {
            return 7;
        }
        if (fn instanceof GreaterThanEquals) {
            return 8;
        }
        if (fn instanceof Equals) {
            return 9;
        }
        if (fn instanceof NotEquals) {
            return 10;
        }
        if (fn instanceof And) {
            return 13;
        }
        if (fn instanceof Or) {
            return 14;
        }
        if (fn instanceof Multiply2) {
            return 2;
        }
        if (fn instanceof Power2) {
            return 4;
        }
        if (fn instanceof Minus1Multiply) {
            return 15;
        }
        if (fn instanceof MinusNz) {
            return 16;
        }
        if (fn instanceof Modulus) {
            return 17;
        }
        if (fn instanceof IntegerDivide) {
            return 18;
        }
        if (fn instanceof Builtin && ((Builtin)fn).getBuiltinCode() == Builtin.BuiltinCode.MIN) {
            return 11;
        }
        if (fn instanceof Builtin && ((Builtin)fn).getBuiltinCode() == Builtin.BuiltinCode.MAX) {
            return 12;
        }
        throw new DMLRuntimeException("The given value function is not supported:" + fn.getClass().getName());
    }

    private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : dgeam, GPUContext=" + gCtx);
        }
        Pointer alphaPtr = LibMatrixCUDA.dataTypePointerTo(alpha);
        Pointer betaPtr = LibMatrixCUDA.dataTypePointerTo(beta);
        int transa = isLeftTransposed ? 1 : 0;
        int transb = isRightTransposed ? 1 : 0;
        long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
        long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
        MatrixObject out = ec.getMatrixObject(outputName);
        boolean isSparse1 = LibMatrixCUDA.isInSparseFormat(gCtx, in1);
        boolean isSparse2 = LibMatrixCUDA.isInSparseFormat(gCtx, in2);
        long t0 = 0L;
        long t1 = 0L;
        if (isSparse1 || isSparse2) {
            int m = (int)in1.getNumRows();
            int n = (int)in1.getNumColumns();
            if (!LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    t0 = System.nanoTime();
                }
                in1.getGPUObject(gCtx).denseToSparse();
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t0);
                }
            }
            CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
            if (!LibMatrixCUDA.isInSparseFormat(gCtx, in2)) {
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    t0 = System.nanoTime();
                }
                in2.getGPUObject(gCtx).denseToSparse();
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "d2s", System.nanoTime() - t0);
                }
            }
            CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
            ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
            if (in1 == in2 && isLeftTransposed && isLeftTransposed == isRightTransposed) {
                int nnz = (int)A.nnz;
                CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
                out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
                cudaSupportFunctions.cusparsecsr2csc(LibMatrixCUDA.getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, 1, 0);
            } else {
                if (isLeftTransposed || isRightTransposed) {
                    throw new DMLRuntimeException("Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
                }
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    t1 = System.nanoTime();
                }
                CSRPointer C = CSRPointer.allocateForDgeam(gCtx, LibMatrixCUDA.getCusparseHandle(gCtx), A, B, m, n);
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "Msao", System.nanoTime() - t1);
                }
                out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    t0 = System.nanoTime();
                }
                cudaSupportFunctions.cusparsecsrgeam(LibMatrixCUDA.getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, LibMatrixCUDA.toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr, B.descr, LibMatrixCUDA.toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
                if (DMLScript.FINEGRAINED_STATISTICS) {
                    GPUStatistics.maintainCPMiscTimes(instName, "sdgeaml", System.nanoTime() - t0);
                }
            }
        } else {
            int lda = LibMatrixCUDA.toInt(in1.getNumColumns());
            int ldb = LibMatrixCUDA.toInt(in2.getNumColumns());
            int m = LibMatrixCUDA.toInt(in1.getNumColumns());
            int n = LibMatrixCUDA.toInt(in2.getNumRows());
            if (isLeftTransposed && isRightTransposed) {
                m = LibMatrixCUDA.toInt(in1.getNumRows());
                n = LibMatrixCUDA.toInt(in2.getNumColumns());
            } else if (isLeftTransposed) {
                m = LibMatrixCUDA.toInt(in1.getNumRows());
            } else if (isRightTransposed) {
                n = LibMatrixCUDA.toInt(in2.getNumColumns());
            }
            int ldc = m;
            Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
            Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
            LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
            Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t0 = System.nanoTime();
            }
            cudaSupportFunctions.cublasgeam(LibMatrixCUDA.getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "ddgeaml", System.nanoTime() - t0);
            }
        }
    }

    public static void transpose(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        LibMatrixCUDA.dgeam(ec, gCtx, instName, in, in, outputName, true, true, 1.0, 0.0);
    }

    public static int toInt(long num) throws DMLRuntimeException {
        if (num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
            throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
        }
        return (int)num;
    }

    public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, IndexRange ixrange, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sliceOperations, GPUContext=" + gCtx);
        }
        int rl = (int)ixrange.rowStart;
        int ru = (int)ixrange.rowEnd;
        int cl = (int)ixrange.colStart;
        int cu = (int)ixrange.colEnd;
        if (rl < 0 || (long)rl >= in1.getNumRows() || ru < rl || (long)ru >= in1.getNumRows() || cl < 0 || (long)cu >= in1.getNumColumns() || cu < cl || (long)cu >= in1.getNumColumns()) {
            throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + "," + (cl + 1) + ":" + (cu + 1) + "] must be within matrix dimensions [" + in1.getNumRows() + "," + in1.getNumColumns() + "]");
        }
        int len1 = LibMatrixCUDA.toInt(in1.getNumColumns());
        if (LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
            CSRPointer inPointer = LibMatrixCUDA.getSparsePointer(gCtx, in1, instName);
            Pointer outPointer = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            LibMatrixCUDA.sliceSparseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1);
        } else {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
            Pointer inPointer = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
            Pointer outPointer = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            LibMatrixCUDA.sliceDenseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1);
        }
    }

    protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer, int rl, int ru, int cl, int cu, int inClen) throws DMLRuntimeException {
        long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0L;
        long retClen = cu - cl + 1;
        if ((long)inClen == retClen) {
            JCuda.cudaMemcpy((Pointer)outPointer, (Pointer)inPointer.withByteOffset((long)(rl * inClen * sizeOfDataType)), (long)((ru - rl + 1) * inClen * sizeOfDataType), (int)3);
        } else {
            long retRlen = ru - rl + 1;
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(LibMatrixCUDA.toInt(retRlen * retClen)), inPointer, outPointer, rl, ru, cl, cu, inClen, retRlen, retClen);
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "drix", System.nanoTime() - t0);
        }
    }

    protected static void sliceSparseDense(GPUContext gCtx, String instName, CSRPointer inPointer, Pointer outPointer, int rl, int ru, int cl, int cu, int inClen) throws DMLRuntimeException {
        int size = LibMatrixCUDA.getNnz(inPointer, rl, ru);
        if (size == 0) {
            return;
        }
        int retRlen = ru - rl + 1;
        long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0L;
        int retClen = cu - cl + 1;
        String kernel = null;
        String timer = null;
        if (inClen > 10 && retClen > 2 * retRlen) {
            timer = "sdrixn";
            kernel = "slice_sparse_dense_nnz";
        } else {
            size = retRlen;
            timer = "sdrixr";
            kernel = "slice_sparse_dense_row";
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu, retClen);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0);
        }
    }

    private static int getNnz(CSRPointer inPointer, int rl, int ru) {
        int[] rlPtr = new int[]{-1};
        int[] ruPtr = new int[]{-1};
        JCuda.cudaMemcpy((Pointer)Pointer.to((int[])rlPtr), (Pointer)inPointer.rowPtr.withByteOffset((long)(rl * 4)), (long)4L, (int)2);
        JCuda.cudaMemcpy((Pointer)Pointer.to((int[])ruPtr), (Pointer)inPointer.rowPtr.withByteOffset((long)((ru + 1) * 4)), (long)4L, (int)2);
        return ruPtr[0] - rlPtr[0];
    }

    public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : cbind, GPUContext=" + gCtx);
        }
        long t1 = 0L;
        long rowsA = LibMatrixCUDA.toInt(in1.getNumRows());
        long colsA = LibMatrixCUDA.toInt(in1.getNumColumns());
        long rowsB = LibMatrixCUDA.toInt(in2.getNumRows());
        long colsB = LibMatrixCUDA.toInt(in2.getNumColumns());
        if (rowsA != rowsB) {
            throw new DMLRuntimeException("GPU : Invalid internal state - the rows must match up for a cbind operation");
        }
        MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA, colsA + colsB);
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
        int maxRows = LibMatrixCUDA.toInt(Math.max(rowsA, rowsB));
        int maxCols = LibMatrixCUDA.toInt(Math.max(colsA, colsB));
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "cbindk", System.nanoTime() - t1);
        }
    }

    public static void rbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : rbind, GPUContext=" + gCtx);
        }
        long t1 = 0L;
        int rowsA = LibMatrixCUDA.toInt(in1.getNumRows());
        int colsA = LibMatrixCUDA.toInt(in1.getNumColumns());
        int rowsB = LibMatrixCUDA.toInt(in2.getNumRows());
        int colsB = LibMatrixCUDA.toInt(in2.getNumColumns());
        if (colsA != colsB) {
            throw new DMLRuntimeException("GPU : Invalid internal state - the columns must match up for a rbind operation");
        }
        MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA + rowsB, colsA);
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
        int maxRows = Math.max(rowsA, rowsB);
        int maxCols = Math.max(colsA, colsB);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t1 = System.nanoTime();
        }
        LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rbindk", System.nanoTime() - t1);
        }
    }

    public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : exp, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_exp", 1.0, outputName, instName, "expk");
    }

    public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sqrt, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sqrt", 0.0, outputName, instName, "sqrtk");
    }

    public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : round, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_round", 0.0, outputName, instName, "roundk");
    }

    public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : abs, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_abs", 0.0, outputName, instName, "absk");
    }

    public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : log, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, "logk");
    }

    public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : floor, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_floor", 0.0, outputName, instName, "floork");
    }

    public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : ceil, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_ceil", 0.0, outputName, instName, "ceilk");
    }

    public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sin, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sin", 0.0, outputName, instName, "sink");
    }

    public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : cos, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_cos", 1.0, outputName, instName, "cosk");
    }

    public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : tan, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_tan", 0.0, outputName, instName, "tank");
    }

    public static void sinh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sinh, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sinh", 0.0, outputName, instName, "sinhk");
    }

    public static void cosh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : cosh, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_cosh", 1.0, outputName, instName, "coshk");
    }

    public static void tanh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : tanh, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_tanh", 0.0, outputName, instName, "tanhk");
    }

    public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : asin, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_asin", 0.0, outputName, instName, "asink");
    }

    public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : acos, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_acos", 1.5707963267948966, outputName, instName, "acosk");
    }

    public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : atan, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_atan", 0.0, outputName, instName, "atank");
    }

    public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sign, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sign", 0.0, outputName, instName, "signk");
    }

    public static void sigmoid(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sigmoid, GPUContext=" + gCtx);
        }
        LibMatrixCUDA.unaryOp(ec, gCtx, in1, "matrix_sigmoid", 0.5, outputName, instName, "sigmk");
    }

    private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        GPUObject in = in1.getGPUObject(gCtx);
        boolean isSparseAndEmpty = in.isSparseAndEmpty();
        long t1 = 0L;
        if (isSparseAndEmpty) {
            MatrixObject out = ec.getMatrixObject(outputName);
            ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns());
            out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
        } else {
            MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns());
            Pointer output = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
            Pointer input = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
            int size = LibMatrixCUDA.toInt(in1.getNumColumns() * in1.getNumRows());
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t1 = System.nanoTime();
            }
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1);
            }
        }
    }

    public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, double constant) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
        Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
        MatrixObject out = ec.getMatrixObject(outputName);
        LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns());
        Pointer C = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
        long t1 = 0L;
        long t2 = 0L;
        if (in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("GPU : cublasDaxpy, GPUContext=" + gCtx);
            }
            long n = in1.getNumRows() * in2.getNumColumns();
            Pointer alphaPtr = LibMatrixCUDA.dataTypePointerTo(constant);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t1 = System.nanoTime();
            }
            JCuda.cudaMemcpy((Pointer)C, (Pointer)A, (long)(n * (long)sizeOfDataType), (int)3);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "D2D", System.nanoTime() - t1);
            }
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t2 = System.nanoTime();
            }
            cudaSupportFunctions.cublasaxpy(LibMatrixCUDA.getCublasHandle(gCtx), LibMatrixCUDA.toInt(n), alphaPtr, B, 1, C, 1);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "daxpy", System.nanoTime() - t2);
            }
        } else {
            if (LOG.isTraceEnabled()) {
                LOG.trace("GPU : daxpy_matrix_vector, GPUContext=" + gCtx);
            }
            if (DMLScript.FINEGRAINED_STATISTICS) {
                t1 = System.nanoTime();
            }
            int rlenA = LibMatrixCUDA.toInt(in1.getNumRows());
            int clenA = LibMatrixCUDA.toInt(in1.getNumColumns());
            int rlenB = LibMatrixCUDA.toInt(in2.getNumRows());
            int clenB = LibMatrixCUDA.toInt(in2.getNumColumns());
            LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), A, B, constant, C, rlenA, clenA, rlenB, clenB);
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(instName, "daxpymv", System.nanoTime() - t1);
            }
        }
    }

    public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
        if (ec.getGPUContext(0) != gCtx) {
            throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : solve, GPUContext=" + gCtx);
        }
        long t0 = -1L;
        GPUObject Aobj = in1.getGPUObject(gCtx);
        if (LibMatrixCUDA.isInSparseFormat(gCtx, in1)) {
            Aobj.sparseToDense(instName);
        }
        GPUObject bobj = in2.getGPUObject(gCtx);
        if (LibMatrixCUDA.isInSparseFormat(gCtx, in2)) {
            bobj.sparseToDense(instName);
        }
        int m = (int)in1.getNumRows();
        int n = (int)in1.getNumColumns();
        if ((int)in2.getNumRows() != m) {
            throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
        }
        if ((int)in2.getNumColumns() != 1) {
            throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        GPUObject ATobj = (GPUObject)Aobj.clone();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "clone", System.nanoTime() - t0);
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        ATobj.denseRowMajorToColumnMajor();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "r2c", System.nanoTime() - t0);
        }
        Pointer A = ATobj.getJcudaDenseMatrixPtr();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        GPUObject bTobj = (GPUObject)bobj.clone();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "clone", System.nanoTime() - t0);
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        bTobj.denseRowMajorToColumnMajor();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "r2c", System.nanoTime() - t0);
        }
        Pointer b = bTobj.getJcudaDenseMatrixPtr();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        int[] lwork = new int[]{0};
        cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "qr_buffer", System.nanoTime() - t0);
        }
        Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType);
        Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
        Pointer devInfo = gCtx.allocate(4L);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "qr", System.nanoTime() - t0);
        }
        int[] qrError = new int[]{-1};
        JCuda.cudaMemcpy((Pointer)Pointer.to((int[])qrError), (Pointer)devInfo, (long)4L, (int)2);
        if (qrError[0] != 0) {
            throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), 0, 1, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "ormqr", System.nanoTime() - t0);
        }
        JCuda.cudaMemcpy((Pointer)Pointer.to((int[])qrError), (Pointer)devInfo, (long)4L, (int)2);
        if (qrError[0] != 0) {
            throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(), 0, 1, 0, 0, n, 1, LibMatrixCUDA.dataTypePointerTo(1.0), A, m, b, m);
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "trsm", System.nanoTime() - t0);
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        bTobj.denseColumnMajorToRowMajor();
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "c2r", System.nanoTime() - t0);
        }
        MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1L);
        JCuda.cudaMemcpy((Pointer)out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), (Pointer)bTobj.getJcudaDenseMatrixPtr(), (long)(n * 1 * sizeOfDataType), (int)3);
        gCtx.cudaFreeHelper(instName, work);
        gCtx.cudaFreeHelper(instName, tau);
        ATobj.clearData();
        bTobj.clearData();
    }

    protected static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) throws DMLRuntimeException {
        Pair<MatrixObject, Boolean> mb;
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        if ((mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols)).getValue().booleanValue() && DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "ad", System.nanoTime() - t0);
        }
        return mb.getKey();
    }

    private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) throws DMLRuntimeException {
        Pair<MatrixObject, Boolean> mb;
        long t0 = 0L;
        if (DMLScript.FINEGRAINED_STATISTICS) {
            t0 = System.nanoTime();
        }
        if ((mb = ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz)).getValue().booleanValue() && DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "as", System.nanoTime() - t0);
        }
        return mb.getKey();
    }

    static enum VectorShape {
        COLUMN(1),
        ROW(2),
        NONE(0);

        private final int code;

        private VectorShape(int code) {
            this.code = code;
        }

        int code() {
            return this.code;
        }
    }
}

