/*
MIT License

Copyright (c) 2019 Gleethos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   ____                    _____ _      _____             _
  / __ \                  / ____| |    |  __ \           (_)
 | |  | |_ __   ___ _ __ | |    | |    | |  | | _____   ___  ___ ___
 | |  | | '_ \ / _ \ '_ \| |    | |    | |  | |/ _ \ \ / / |/ __/ _ \
 | |__| | |_) |  __/ | | | |____| |____| |__| |  __/\ V /| | (_|  __/
  \____/| .__/ \___|_| |_|\_____|______|_____/ \___| \_/ |_|\___\___|
        | |
        |_|

------------------------------------------------------------------------------------------------------------------------

   'Any fool can write code that a computer can understand.
    Good programmers write code that humans can understand.'
    – Martin Fowler

    Use the following as search keys :)

    §(1) : FIELD VARIABLES
    §(2) : CONSTRUCTION
    §(3) : OPENCL PROPERTIES
    §(4) : NESTED CLASSES

*/

package neureka.devices.opencl;

import neureka.Data;
import neureka.Neureka;
import neureka.Tensor;
import neureka.backend.api.*;
import neureka.backend.main.implementations.CLImplementation;
import neureka.backend.ocl.CLBackend;
import neureka.common.composition.Component;
import neureka.common.utility.DataConverter;
import neureka.common.utility.LogUtil;
import neureka.devices.*;
import neureka.devices.opencl.utility.CLFunctionCompiler;
import neureka.dtype.DataType;
import neureka.dtype.NumericType;
import neureka.dtype.custom.F32;
import neureka.framing.Relation;
import neureka.math.Function;
import neureka.ndim.config.NDConfiguration;
import org.jocl.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.WeakHashMap;
import java.util.function.Supplier;

import static org.jocl.CL.*;

/**
 * This class models OpenCL-supporting accelerator hardware, like GPUs or FPGAs,
 * used for storing tensors and executing operations on them.
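 * <p>
 *     A minimal usage sketch, assuming a device instance has already been obtained
 *     (the exact device lookup, as well as the tensor builder call below, are
 *     illustrative assumptions about the surrounding Neureka API):
 * <pre>{@code
 * OpenCLDevice device = ...; // e.g. taken from one of the OpenCLPlatforms of the CLBackend
 * Tensor<Float> t = Tensor.of( Float.class ).withShape( 2, 3 ).all( 0f );
 * device.store( t );                   // the tensor's data now lives in device memory
 * System.out.println( device.name() ); // basic device properties can be queried directly
 * device.free( t );                    // ...until the data is freed again (or restored to RAM)
 * }</pre>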
 */
public class OpenCLDevice extends AbstractDevice<Number>
{
    private static final Logger _LOG = LoggerFactory.getLogger(OpenCLDevice.class);

    static OpenCLDevice of( OpenCLPlatform platform, cl_device_id did ) {
        if (!platform.has(did)) platform.put(did, new OpenCLDevice(platform, did));
        return platform.get(did);
    }

    public enum Type {
        CPU, GPU, ACCELERATOR, DEFAULT, CUSTOM, ALL, UNKNOWN
    }

    enum cl_dtype { F32, F64, I64, I32, I16, I8, U32, U16, U8 }

    /*==================================================================================================================
    |
    |       §(1) : FIELD VARIABLES
    |   ---------------------------
    */

    private final KernelCache _kernelCache = new KernelCache();

    private final cl_device_id _deviceId;

    /**
     * The OpenCLPlatform :
     * This field references the {@link OpenCLPlatform} instance hosting this device.
     * A platform would for example be vendor specific, like Intel, AMD, Nvidia...
     */
    private final OpenCLPlatform _platform;

    /**
     * The OpenCL command queue
     */
    private final cl_command_queue _queue;


    private final Map<NDConfiguration, cl_config> _configs = new WeakHashMap<>();

    /*==================================================================================================================
    |
    |       §(2) : CONSTRUCTION
    |   ---------------------------
    */

    /**
     * @param platform The platform containing this device.
     * @param deviceId The underlying OpenCL id of this device.
     */
    private OpenCLDevice( OpenCLPlatform platform, cl_device_id deviceId ) {
        super();
        _deviceId = deviceId;
        _platform = platform;
        _queue = clCreateCommandQueueWithProperties(// Create a command-queue for the selected device
                        platform.getContext(), deviceId,
                        null,
                        null
                    );
        _cleaning(this, () -> clReleaseCommandQueue(_queue));
    }

    @Override
    public final String toString() {
        return "OpenCLDevice[id=0x" + Long.toHexString(_deviceId.getNativePointer()) + ",platform=0x" + Long.toHexString(_platform.getId()) + "]";
    }

    public final cl_device_id getId() { return _deviceId; }

    public final OpenCLPlatform getPlatform() { return _platform; }

    /**
     * @param name The name of the kernel whose presence should be checked.
     * @return True if the kernel is present in the cache, false otherwise.
     */
    public boolean hasAdHocKernel( String name ) { return _kernelCache.has(name); }

    /**
     * @param name The name of the kernel which should be retrieved.
     * @return The kernel with the given name if it is present in the cache, throws an exception otherwise.
     */
    public KernelCaller getAdHocKernel( String name ) {
        cl_ad_hoc adHoc = _kernelCache.get(name);
        if (adHoc != null) return new KernelCaller(adHoc.kernel, _queue);
        else throw new IllegalArgumentException("No ad hoc kernel with name '" + name + "' found!");
    }

    /**
     * @param name The name of the kernel which should be retrieved.
     * @return An {@link Optional} containing the kernel with the given name if it is present in the cache, an empty optional otherwise.
     */
    public Optional<KernelCaller> findAdHocKernel( String name ) {
        cl_ad_hoc adHoc = _kernelCache.get(name);
        if (adHoc != null) return Optional.of(new KernelCaller(adHoc.kernel, _queue));
        else return Optional.empty();
    }

    /**
     * @param name The name of the kernel which should be retrieved.
     * @param source The source code of the kernel which should be compiled if it is not present in the cache.
     * @return The kernel caller for the kernel of the requested name, either from cache,
     *          or compiled from the given source code if it was not present in the cache.
     */
    public KernelCaller findOrCompileAdHocKernel( String name, Supplier<String> source ) {
        cl_ad_hoc adHoc = _kernelCache.get(name);
        if ( adHoc != null ) return new KernelCaller(adHoc.kernel, _queue);
        else return compileAndGetAdHocKernel(name, source.get());
    }

    /**
     * This method compiles and returns the {@link KernelCaller} for a so-called "ad hoc" kernel.
     * Ad hoc is a Latin phrase meaning literally 'to this'.
     * In English, it generally signifies a solution designed for a specific problem or task,
     * non-generalizable, and not intended to be adapted to other purposes.
     * This leads to the purpose of ad hoc kernel compilation, namely to be able to compile
     * unique kernels with a specific purpose created on the fly during runtime by operations.
     * This might be useful for high performance operations on tensors with specific dimensions
     * and/or other properties which might be taken into account...
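     * <p>
     *     A sketch of how this might be used (the kernel name and OpenCL source below, as well as
     *     how arguments would then be passed to the resulting {@link KernelCaller}, are merely
     *     illustrative assumptions):
     * <pre>{@code
     * String name = "scale_by_two"; // hypothetical ad hoc kernel name
     * String source =
     *         "__kernel void " + name + "( __global float* data ) {\n" +
     *         "    unsigned int i = get_global_id( 0 );\n" +
     *         "    data[ i ] = data[ i ] * 2.0f;\n" +
     *         "}";
     * KernelCaller caller = device.compileAndGetAdHocKernel( name, source );
     * // Kernel arguments are then passed and the kernel enqueued through the returned caller.
     * }</pre>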
     *
     * @param name   The name of the kernel which ought to be compiled.
     * @param source The source of the kernel which ought to be compiled.
     * @return The {@link KernelCaller} for the compiled kernel.
     */
    public synchronized KernelCaller compileAndGetAdHocKernel( String name, String source ) {
        return compileAdHocKernel( name, source )
                .findAdHocKernel( name )
                .orElseThrow(() -> new RuntimeException("Failed to compile kernel: " + name));
    }

    /**
     * This method compiles a so-called "ad hoc" kernel.
     * Ad hoc is a Latin phrase meaning literally 'to this'.
     * In English, it generally signifies a solution designed for a specific problem or task,
     * non-generalizable, and not intended to be adapted to other purposes.
     * This leads to the purpose of ad hoc kernel compilation, namely to be able to compile
     * unique kernels with a specific purpose created on the fly during runtime by operations.
     * This might be useful for high performance operations on tensors with specific dimensions
     * and/or other properties which might be taken into account...
     *
     * @param name   The name of the kernel which ought to be compiled.
     * @param source The source of the kernel which ought to be compiled.
     * @return This very instance in order to enable the factory pattern.
     */
    public synchronized OpenCLDevice compileAdHocKernel( String name, String source ) {
        if (this.hasAdHocKernel(name)) {
            cl_ad_hoc adHoc = _kernelCache.get(name);
            String message =
                "Cannot compile kernel source for name '" + name + "' because the name is already taken.\n" +
                "Use another name or find out why this kernel already exists.\n" +
                (
                        adHoc.source.equals(source)
                                ? "Besides the name, the source code of the existing kernel is also identical.\n" : ""
                );
            _LOG.error(message);
            throw new IllegalArgumentException(message);
        }

        // Create the program for the kernel
        cl_program cpProgram = clCreateProgramWithSource(
                getPlatform().getContext(),
                1,
                new String[]{source},
                null,
                null
        );

        // Build the program
        int err = clBuildProgram(
                        cpProgram,
                        1,
                        new cl_device_id[]{_deviceId},
                        "-cl-mad-enable",
                        null,
                        null
                );

        if ( err != CL_SUCCESS )
            _LOG.error("Error when trying to compile 'ad hoc kernel' named '"+name+"'! Error code: "+err);

        //TODO: check compilation errors!
        cl_kernel kernel;
        try {
            // Create the kernel
            kernel = clCreateKernel(cpProgram, name, null);
        } catch (Exception e) {
            if (e.getMessage().equals("CL_INVALID_KERNEL_NAME") && !source.contains("__kernel void " + name)) {
                String message = "Method 'clCreateKernel' failed! The name of the '__kernel' method declared inside \n" +
                                 "the source String does not match the provided name needed for kernel creation.";
                _LOG.error(message, e);
                throw new IllegalArgumentException(message);
            }
            _LOG.error("Method call 'clCreateKernel(.., name=\"" + name + "\", ..)' failed!", e);
            throw e;
        }
        cl_ad_hoc adHoc = new cl_ad_hoc(source, kernel, cpProgram);

        // Storing the ad hoc object in the kernel cache for fast access by operations :
        _kernelCache.put( name, adHoc );

        _cleaning(adHoc, () -> {
            clReleaseKernel(kernel);
            clReleaseProgram(cpProgram);
        });
        return this;
    }
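
    /**
     * This method turns a {@link Function} into an {@link Operation} optimized for this device
     * by delegating to a {@link CLFunctionCompiler}, which builds a dedicated kernel based implementation for it.
     * A sketch of how this might be used (the function expression and operation name are illustrative assumptions):
     * <pre>{@code
     * Function fun = Function.of( "i0 * i1 + i2" );
     * Operation optimized = device.optimizedOperationOf( fun, "my_fused_op" );
     * }</pre>
     *
     * @param function The function which ought to be turned into an optimized {@link Operation}.
     * @param name The name of the resulting operation.
     * @return An {@link Operation} optimized for this device.
     */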

    @Override
    public Operation optimizedOperationOf( Function function, String name ) {
        return new CLFunctionCompiler( this, function, name ).optimize();
    }

    /**
     * This method tells the device to restore all tensors stored on it and to release all resources.
     */
    @Override
    public void dispose() {
        _numberOfTensors = 0;
        clFinish( _queue );
        clReleaseCommandQueue( _queue );
    }

    /**
     * This method assumes that the passed tensor is stored on this device instance.
     * If the tensor is stored on the device then the method loads the outsourced
     * data of the tensor back into primitive JVM arrays and restores the tensor
     * freshly in RAM.
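     * <p>
     *     A sketch (assuming {@code t} is a {@code Tensor<Number>} which was previously stored on this device):
     * <pre>{@code
     * if ( device.has( t ) )
     *     device.restore( t ); // the items of t are copied back into JVM memory and t is freed on the device
     * }</pre>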
     *
     * @param tensor The tensor whose data ought to be restored (loaded to RAM).
     * @return This device, which enables method chaining.
     */
    @Override
    public Device<Number> restore( Tensor<Number> tensor ) {
        if ( !this.has( tensor ) ) {
            String message = "The passed tensor cannot be restored from this OpenCL device " +
                                "because the tensor is not stored on the device.\n";
            _LOG.error(message);
            throw new IllegalArgumentException(message);
        }

        Object value  = _read(JVMData.of(tensor.itemType(), tensor.isVirtual() ? 1 : tensor.size()), tensor, 0).getArray();

        Class<?> arrayType = Objects.requireNonNull(tensor.getDataType().getTypeClassInstance(NumericType.class)).holderArrayType();

        value = DataConverter.get().convert( value, arrayType );

        this.free( tensor );
        tensor.find( Tensor.class ).ifPresent( this::restore );
        tensor.getMut().setItems( value );
        return this;
    }


    /**
     * Implementations of this method ought to store the value
     * of the given tensor and the "parent" tensor in whatever
     * format suits the underlying implementation and/or final type.
     * {@link Device} implementations are also tensor storages
     * which may also have to store tensors which are slices of bigger tensors.   <br><br>
     *
     * @param tensor The tensor whose data ought to be stored.
     */
    private <T extends Number> void _store(Tensor<T> tensor, Tensor<T> parent ) {
        if (!parent.isOutsourced()) throw new IllegalStateException("Data parent is not outsourced!");
        _add(
            tensor.getMut().upcast(Number.class),
            parent.getMut().getData(),
            () -> tensor.set((Component) this)
        );
    }

    private <T extends Number> void _add(
        Tensor<Number> tensor,
        Data<T> parentData,
        Runnable migration // Causes the device to be a component of the tensor!
    ) {
        if ( this.has( tensor ) ) {
            _LOG.debug("Trying to add a tensor to a device which already reports hosting it.");
            return;
        }

        boolean convertToFloat = Neureka.get()
                                    .backend()
                                    .find(CLBackend.class)
                                    .map( it -> it.getSettings().isAutoConvertToFloat() )
                                    .orElse(false);

        Data<Number> data;
        if ( parentData == null ) {
            if ( tensor.getMut().getData().owner() == this ) {
                migration.run();
                return;
            }
            JVMData jvmData = JVMData.of( tensor.getMut().getData().getOrNull(), convertToFloat );
            cl_tsr<Number, Number> newClt = _storeNew( jvmData );
            if ( tensor.rqsGradient() && tensor.hasGradient() )
                this.store(tensor.gradient().orElseThrow(()->new IllegalStateException("Gradient missing!")));

            cl_mem[] memos = new cl_mem[]{newClt.value.data};
            clEnqueueMigrateMemObjects(
                    _queue, memos.length, memos,
                    CL_MIGRATE_MEM_OBJECT_HOST,
                    0,
                    null,
                    null
                );

            data = _dataArrayOf(newClt, (DataType<Number>) _dataTypeOf(newClt));
        }
        else
            data = (Data<Number>) parentData;

        tensor.getMut().setData( data );
        migration.run();

        // When tensors get stored on this device,
        // they can be implicitly converted to a float tensor:
        if ( convertToFloat )
            tensor.getMut().toType(F32.class);
    }

    private cl_tsr<Number, Number> _storeNew( JVMData jvmData ) {
        return _storeNew( jvmData, false );
    }

    private cl_tsr<Number, Number> _storeNew( JVMData jvmData, boolean allocateTargetSize ) {
        cl_tsr.cl_value newVal = new cl_tsr.cl_value((int) (allocateTargetSize ? jvmData.getTargetLength() : jvmData.getLength()));
        cl_tsr<Number, Number> newClt = new cl_tsr<>(newVal, jvmData.getType());
        _store( jvmData, newClt, allocateTargetSize );
        return newClt;
    }

    public cl_config clConfigOf(Tensor<?> t ) {
        return clConfigOf( t.getNDConf() );
    }

    public cl_config clConfigOf(NDConfiguration ndc ) {
        cl_config config = _configs.get(ndc);
        if ( config == null ) {
            config = _writeNewNDConfig( ndc );
            _configs.put(ndc, config);
        }
        return config;
    }

    private cl_config _writeNewNDConfig(NDConfiguration ndc ) {

        cl_config clf = new cl_config();

        //Config format: <[ shape | strides | indicesMap | indices | scale ]>
        int[] config = ndc.asInlineArray();

        //shape/strides/map/offset/spread
        clf.data = clCreateBuffer(
                    _platform.getContext(),
                    CL_MEM_READ_WRITE,
                    (long) config.length * Sizeof.cl_int,
                    null, null
                );

        clEnqueueWriteBuffer(
                _queue, clf.data, CL_TRUE, 0,
                (long) config.length * Sizeof.cl_int,
                Pointer.to(config),
                0,
                null, null
            );
        final cl_mem clConfMem = clf.data;
        _cleaning( clf, () -> clReleaseMemObject(clConfMem) );
        return clf;
    }

    private void _store(
       JVMData jvmData,
       cl_tsr<?, ?> newClTensor,
       boolean allocateTarget
    ) {
        long bufferLength = allocateTarget ? jvmData.getTargetLength() : jvmData.getLength();

        cl_mem mem = clCreateBuffer(
                        _platform.getContext(),
                        CL_MEM_READ_WRITE,
                        (long) jvmData.getItemSize() * bufferLength,
                        null,
                        null
                    );

        newClTensor.value.data = mem;

        // Virtual means that there is only a single value in the JVM array.
        // So we don't have to write the whole array to the device!
        // Instead, we can just fill the device memory with the single value.
        boolean isASingleValue = jvmData.isVirtual();

        if ( isASingleValue )
            clEnqueueFillBuffer(
                    _queue, mem, jvmData.getPointer(), // pattern
                    jvmData.getItemSize(), 0,
                    (long) jvmData.getItemSize() * bufferLength,
                    0, null, null
                );
        else
            clEnqueueWriteBuffer(
                    _queue, mem,
                    CL_TRUE, 0,
                    (long) jvmData.getItemSize() * bufferLength,
                    jvmData.getPointer(), 0, null, null
                );
    }

    @Override
    public final <T extends Number> Device<Number> free( Tensor<T> tensor ) {
        cl_tsr<?, ?> clt = tensor.getMut().getData().as( cl_tsr.class);
        if ( clt == null ) return this;
        tensor.getMut().setData(null);
        tensor.find(Device.class).ifPresent(
            device -> {
                tensor.remove( Device.class );
                tensor.find(Tensor.class).ifPresent(
                    gradient ->
                        ( (Tensor<Number>) gradient ).find(Device.class).ifPresent(
                            gradDevice -> {
                                try {
                                    if ( this.has( gradient ) ) gradDevice.restore( gradient );
                                }
                                catch ( Exception exception ) {
                                    _LOG.error(
                                        "Gradient could not be restored from device component when trying to migrate it back to RAM.",
                                        exception
                                    );
                                    throw exception;
                                }
                                gradient.remove( Device.class );
                            })
                );
            }
        );
        return this;
    }

    @Override
    protected final <T extends Number> T _readItem( Tensor<T> tensor, int index ) {
        return (T) _read(JVMData.of(tensor.itemType(), 1), tensor.getMut().upcast(Number.class), index).getElementAt(0);
    }

    @Override
    protected final <T extends Number, A> A _readArray( Tensor<T> tensor, Class<A> arrayType, int start, int size ) {
        return (A) _read(JVMData.of(tensor.itemType(), size), tensor.getMut().upcast(Number.class), start).getArray();
    }

    @Override
    protected final <T extends Number> void _writeItem( Tensor<T> tensor, T item, int start, int size ) {
        _overwrite( tensor, start, JVMData.of(item, size, 0) );
    }

    @Override
    protected final <T extends Number> void _writeArray(
        Tensor<T> tensor,
        Object array,
        int offset,
        int start,
        int size
    ) {
        _overwrite( tensor, start, JVMData.of(array, size, offset) );
    }

    @Override
    public <T extends Number> Data<T> allocate( DataType<T> dataType, NDConfiguration ndc ) {
        JVMData jvmData = JVMData.of( dataType.getItemTypeClass(), ndc.size() );
        cl_tsr<Number, Number> clt = _storeNew(jvmData );
        return (Data<T>) _dataArrayOf(clt, (DataType<Number>) _dataTypeOf(clt));
    }

    @Override
    public <T extends Number> Data<T> allocateFromOne( DataType<T> dataType, NDConfiguration ndc, T initialValue ) {
        JVMData jvmData = JVMData.of( initialValue, ndc.size(), false, true );
        cl_tsr<Number, Number> clt = _storeNew(jvmData );
        return (Data<T>) _dataArrayOf(clt, (DataType<Number>) _dataTypeOf(clt));
    }

    @Override
    public <T extends Number> Data<T> allocateFromAll( DataType<T> dataType, NDConfiguration ndc, Object data ) {
        JVMData jvmData = JVMData.of( data );
        cl_tsr<Number, Number> clt = _storeNew(jvmData );
        return (Data<T>) _dataArrayOf(clt, (DataType<Number>) _dataTypeOf(clt));
    }

    @Override
    protected Data<Number> _actualize( Tensor<?> tensor ) {
        NDConfiguration ndc = tensor.getNDConf();
        Object initialValue = tensor.item();
        cl_tsr<?, ?> clt = tensor.getMut().getData().as( cl_tsr.class);
        if ( clt == null ) throw new IllegalStateException("The tensor has no device component!");
        JVMData jvmData = JVMData.of( initialValue, ndc.size(), false, true );
        clt = _storeNew( jvmData, true );
        return _dataArrayOf(clt, (DataType<Number>) _dataTypeOf(clt));
    }

    @Override
    protected Data<Number> _virtualize( Tensor<?> tensor ) {
        NDConfiguration ndc = tensor.getNDConf();
        Object initialValue = tensor.item();
        cl_tsr<?, ?> clt = tensor.getMut().getData().as( cl_tsr.class);
        if ( clt == null ) throw new IllegalStateException("The tensor has no device component!");
        JVMData jvmData = JVMData.of( initialValue, ndc.size(), false, true );
        clt = _storeNew( jvmData, false );
        return _dataArrayOf(clt, (DataType<Number>) _dataTypeOf(clt));
    }

    @Override
    protected final DataType<?> _dataTypeOf( Object rawData ) {
        LogUtil.nullArgCheck( rawData, "rawData", Object.class );
        if ( rawData instanceof cl_tsr ) {
            cl_dtype type = ((cl_tsr) rawData).dtype;
            switch ( type ) {
                case F32: return DataType.of( Float.class );
                case F64: return DataType.of( Double.class );
                case I32: case U32:
                    return DataType.of( Integer.class );
                case I64: return DataType.of( Long.class );
                case I16: case U16:
                    return DataType.of( Short.class );
                case I8: case U8:
                    return DataType.of( Byte.class );
                default: throw new IllegalStateException("Unknown OpenCL data type!");
            }
        }
        throw new IllegalStateException("Unknown data type "+rawData.getClass()+"!");
    }

    private void _overwrite(
        Tensor<?> tensor, long offset, JVMData jvmData
    ) {
        if ( jvmData.getLength() == 0 ) return;
        cl_tsr<?, ?> clt = tensor.getMut().getData().as( cl_tsr.class);

        if ( clt.value.event != null ) clWaitForEvents(1, new cl_event[]{clt.value.event});
        clt.value.event = new cl_event();
        long start = offset * jvmData.getItemSize();
        long size  = jvmData.getItemSize() * jvmData.getLength();
        clEnqueueWriteBuffer(
                _queue, clt.value.data, CL_TRUE,
                start, size,
                jvmData.getPointer(), 0, null,
                clt.value.event
            );
    }

    @Override
    protected final <T extends Number> void _swap( Tensor<T> former, Tensor<T> replacement) {
        cl_tsr<Number, T> clTensor = former.mut().getData().as( cl_tsr.class);
        former.getMut().setData(null);
        replacement.getMut().setData( _dataArrayOf(clTensor, (DataType<T>) _dataTypeOf(clTensor)) );
    }

    @Override
    public boolean update( OwnerChangeRequest<Tensor<Number>> changeRequest ) {
        super.update(changeRequest);
        if ( changeRequest.type() == IsBeing.ADDED ) {
            Tensor<Number> newOwner = changeRequest.getNewOwner();
            _updateInternal(newOwner, changeRequest::executeChange);
        } else
            changeRequest.executeChange(); // This can be an 'add', 'remove' or 'transfer' of this component!
        return true;
    }

    @Override
    protected <T extends Number> int _sizeOccupiedBy( Tensor<T> tensor ) { return tensor.getMut().getData().as( cl_tsr.class).value.size; }

    @Override
    protected <T extends Number> Object _readAll( Tensor<T> tensor, boolean clone ) {
        cl_tsr<?, ?> clt = tensor.getMut().getData().as( cl_tsr.class);
        return _readArray( tensor, tensor.getDataType().dataArrayType(), 0, clt.value.size );
    }

    private void _updateInternal( Tensor<Number> newOwner, Runnable migration) {
        Tensor<Number> root = _findRoot( newOwner );
        if (root != null) _store(newOwner, root);
        else _add( newOwner, null, migration );
    }

    private Tensor<Number> _findRoot( Tensor<Number> newOwner ) {
        Tensor<Number> root = null;
        Relation<Number> relation = newOwner.get(Relation.class);
        if ( relation != null )
            root = ((Relation<Number>) newOwner.get(Relation.class)).findRootTensor().orElse(null);

        return root;
    }

    private JVMData _read( JVMData jvmData, Tensor<Number> tensor, int offset ) {
        cl_tsr<?, ?> clt = tensor.getMut().getData().as( cl_tsr.class);
        clEnqueueReadBuffer(
                _queue,
                clt.value.data,
                CL_TRUE,
                (long) offset * jvmData.getItemSize(), // offset in bytes (one double == eight bytes)
                (long) jvmData.getItemSize() * jvmData.getLength(),
                jvmData.getPointer(),
                0,
                null,
                null
        );
        return jvmData;
    }

    /**
     * @param call The {@link ExecutionCall} which will be queried for a {@link CLImplementation} holding the kernel.
     * @return The kernel call which uses the builder pattern to receive kernel arguments.
     */
    public KernelCaller getKernel( ExecutionCall<OpenCLDevice> call ) {
        String chosen;
        Algorithm algorithm = call.getAlgorithm();
        DeviceAlgorithm<?> deviceAlgorithm = ( algorithm instanceof DeviceAlgorithm ? ((DeviceAlgorithm<?>) algorithm) : null );
        // We create the kernel name from the chosen algorithm:
        ImplementationFor<OpenCLDevice> impl = ( deviceAlgorithm == null ? null : deviceAlgorithm.getImplementationFor(OpenCLDevice.class) );
        if ( impl instanceof CLImplementation && _platform.hasKernel(((CLImplementation) impl).getKernelFor(call).getName()) ) {
            chosen = ((CLImplementation) impl).getKernelFor( call ).getName();
        }
        else
            chosen = call.getAlgorithm().getName() + "_" + call.getOperation().getIdentifier();

        cl_kernel kernel = _platform.getKernel( chosen );
        if ( kernel == null )
            throw new IllegalStateException(
                    "No kernel found for signature '" + chosen + "' for operation '" +  call.getOperation().getIdentifier() + "'."
                );

        return new KernelCaller(kernel, _queue);
    }

    /**
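     * This method returns a {@link KernelCaller} for one of the statically compiled kernels
     * hosted by the {@link OpenCLPlatform} of this device.
     * For example, {@code device.getKernel( "some_precompiled_kernel" )} would return a caller
     * for a kernel of that name (the name here is merely an illustrative assumption).
     *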
     * @param name The name of the kernel for which a {@link KernelCaller} should be returned.
     * @return A {@link KernelCaller} for calling the requested kernel.
     */
    public KernelCaller getKernel( String name ) {
        cl_kernel kernel = _platform.getKernel( name );
        if ( kernel == null )
            throw new IllegalStateException("No kernel found with name '" + name + "'.");
        return new KernelCaller(kernel, _queue);
    }

    @Override
    protected boolean _approveExecutionOf(Tensor<?>[] tensors, int d, Operation type ) { return true; }


    /*==================================================================================================================
    |
    |       §(3) : OPENCL PROPERTIES
    |   ---------------------------
    */

    public String name() { return Query.getString( _deviceId, CL_DEVICE_NAME ); }

    public String vendor() { return Query.getString(_deviceId, CL_DEVICE_VENDOR); }

    public String version() { return Query.getString(_deviceId, CL_DRIVER_VERSION); }

    public Type type() {
        long deviceType = Query.getLong(_deviceId, CL_DEVICE_TYPE);
        if ( (deviceType & CL_DEVICE_TYPE_CPU         ) != 0 ) return Type.CPU;
        if ( (deviceType & CL_DEVICE_TYPE_GPU         ) != 0 ) return Type.GPU;
        if ( (deviceType & CL_DEVICE_TYPE_ACCELERATOR ) != 0 ) return Type.ACCELERATOR;
        if ( (deviceType & CL_DEVICE_TYPE_DEFAULT     ) != 0 ) return Type.DEFAULT;
        if ( (deviceType & CL_DEVICE_TYPE_CUSTOM      ) != 0 ) return Type.CUSTOM;
        if ( (deviceType & CL_DEVICE_TYPE_ALL         ) != 0 ) return Type.ALL;
        return Type.UNKNOWN;
    }

    public int maxComputeUnits() { return Query.getInt(_deviceId, CL_DEVICE_MAX_COMPUTE_UNITS); }

    public long maxWorkItemDimensions() { return Query.getLong(_deviceId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); }

    public long[] maxWorkItemSizes() { return Query.getSizes(_deviceId, CL_DEVICE_MAX_WORK_ITEM_SIZES, 3); }

    public long maxWorkGroupSize() { return Query.getSize(_deviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE); }

    public long maxClockFrequency() { return Query.getLong(_deviceId, CL_DEVICE_MAX_CLOCK_FREQUENCY); }

    public int maxAddressBits() { return Query.getInt(_deviceId, CL_DEVICE_ADDRESS_BITS); }

    public long maxMemAllocSize() { return Query.getLong(_deviceId, CL_DEVICE_MAX_MEM_ALLOC_SIZE); }

    public long globalMemSize() { return Query.getLong(_deviceId, CL_DEVICE_GLOBAL_MEM_SIZE); }

    public int errorCorrectionSupport() { return Query.getInt(_deviceId, CL_DEVICE_ERROR_CORRECTION_SUPPORT); }

    public int localMemType() { return Query.getInt(_deviceId, CL_DEVICE_LOCAL_MEM_TYPE); }

    public long localMemSize() { return Query.getLong(_deviceId, CL_DEVICE_LOCAL_MEM_SIZE); }

    public long maxConstantBufferSize() { return Query.getLong(_deviceId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE); }

    public long maxConstantBufferSizeKB() { return Query.getLong(_deviceId, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) / 1024; }

    public int imageSupport() { return Query.getInt(_deviceId, CL_DEVICE_IMAGE_SUPPORT); }

    public int maxReadImageArgs() { return Query.getInt(_deviceId, CL_DEVICE_MAX_READ_IMAGE_ARGS); }

    public int maxWriteImageArgs() { return Query.getInt(_deviceId, CL_DEVICE_MAX_WRITE_IMAGE_ARGS); }

    public long singleFPConfig() { return Query.getLong(_deviceId, CL_DEVICE_SINGLE_FP_CONFIG); }

    public long image2DMaxWidth() { return Query.getSize(_deviceId, CL_DEVICE_IMAGE2D_MAX_WIDTH); }

    public long image2DMaxHeight() { return Query.getSize(_deviceId, CL_DEVICE_IMAGE2D_MAX_HEIGHT); }

    public long image3DMaxWidth() { return Query.getSize(_deviceId, CL_DEVICE_IMAGE3D_MAX_WIDTH); }

    public long image3DMaxHeight() { return Query.getSize(_deviceId, CL_DEVICE_IMAGE3D_MAX_HEIGHT); }

    public long image3DMaxDepth() { return Query.getSize(_deviceId, CL_DEVICE_IMAGE3D_MAX_DEPTH); }

    public int prefVecWidthChar() { return Query.getInt(_deviceId, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR); }

    public int prefVecWidthShort() { return Query.getInt(_deviceId, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT); }

    public int prefVecWidthInt() { return Query.getInt(_deviceId, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT); }

    public int prefVecWidthLong() { return Query.getInt(_deviceId, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG); }

    public int prefVecWidthFloat() { return Query.getInt(_deviceId, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT); }

    public int prefVecWidthDouble() { return Query.getInt(_deviceId, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE); }
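
    /**
     * A utility class with static helpers for reading OpenCL device and platform
     * information through JOCL's {@code clGetDeviceInfo} / {@code clGetPlatformInfo} calls.
     * For example, {@code Query.getString( device.getId(), CL_DEVICE_NAME )} yields
     * the human readable name of the queried device.
     */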

    public static class Query {
        /**
         * Returns the value of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @return The value
         */
        public static int getInt(cl_device_id device, int paramName) {
            return getInts(device, paramName, 1)[0];
        }

        /**
         * Returns the values of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @param numValues The number of values
         * @return The values
         */
        public static int[] getInts(cl_device_id device, int paramName, int numValues) {
            int[] values = new int[numValues];
            clGetDeviceInfo(device, paramName, (long) Sizeof.cl_int * numValues, Pointer.to(values), null);
            return values;
        }

        /**
         * Returns the value of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @return The value
         */
        public static long getLong(cl_device_id device, int paramName) {
            return getLongs(device, paramName, 1)[0];
        }

        /**
         * Returns the values of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @param numValues The number of values
         * @return The values
         */
        public static long[] getLongs(cl_device_id device, int paramName, int numValues) {
            long[] values = new long[numValues];
            clGetDeviceInfo(device, paramName, (long) Sizeof.cl_long * numValues, Pointer.to(values), null);
            return values;
        }

        /**
         * Returns the value of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @return The value
         */
        public static String getString(cl_device_id device, int paramName) {
            // Obtain the length of the string that will be queried
            long[] size = new long[1];
            clGetDeviceInfo(device, paramName, 0, null, size);

            // Create a buffer of the appropriate size and fill it with the info
            byte[] buffer = new byte[(int) size[0]];
            clGetDeviceInfo(device, paramName, buffer.length, Pointer.to(buffer), null);

            // Create a string from the buffer (excluding the trailing \0 byte)
            return new String(buffer, 0, buffer.length - 1);
        }

        /**
         * Returns the value of the platform info parameter with the given name
         *
         * @param platform  The platform
         * @param paramName The parameter name
         * @return The value
         */
        public static String getString(cl_platform_id platform, int paramName) {
            // Obtain the length of the string that will be queried
            long[] size = new long[1];
            clGetPlatformInfo(platform, paramName, 0, null, size);

            // Create a buffer of the appropriate size and fill it with the info
            byte[] buffer = new byte[(int) size[0]];
            clGetPlatformInfo(platform, paramName, buffer.length, Pointer.to(buffer), null);

            // Create a string from the buffer (excluding the trailing \0 byte)
            return new String(buffer, 0, buffer.length - 1);
        }

        /**
         * Returns the value of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @return The value
         */
        public static long getSize(cl_device_id device, int paramName) {
            return getSizes(device, paramName, 1)[0];
        }

        /**
         * Returns the values of the device info parameter with the given name
         *
         * @param device    The device
         * @param paramName The parameter name
         * @param numValues The number of values
         * @return The values
         */
        public static long[] getSizes(cl_device_id device, int paramName, int numValues) {
            // The size of the returned data has to depend on
            // the size of a size_t, which is handled here
            ByteBuffer buffer = ByteBuffer.allocate(numValues * Sizeof.size_t).order(ByteOrder.nativeOrder());
            clGetDeviceInfo(
                    device,
                    paramName,
                    (long) Sizeof.size_t * numValues,
                    Pointer.to(buffer),
                    null
            );
            long[] values = new long[numValues];
            return getLongs(numValues, buffer, values);
        }

        public static long[] getLongs(int numValues, ByteBuffer buffer, long[] values) {
            if (Sizeof.size_t == 4)
                for (int i = 0; i < numValues; i++)
                    values[i] = buffer.getInt(i * Sizeof.size_t);
            else
                for ( int i = 0; i < numValues; i++ )
                    values[i] = buffer.getLong(i * Sizeof.size_t);

            return values;
        }

    }


    private <T extends Number> Data<T> _dataArrayOf( Object data, DataType<T> dataType ) {
        return (Data<T>) new CLData(this, data, (DataType<Number>) dataType);
    }

    private static class CLData extends AbstractDeviceData<Number> {

        public CLData( AbstractBaseDevice<Number> owner, Object dataRef, DataType<Number> dataType ) {
            super(owner, dataRef, dataType, ()->{
                // In this lambda we free the memory, because the data is no longer needed!
                cl_tsr<?,?> clTsr = (cl_tsr<?,?>) dataRef;
                if ( clTsr.value.event != null ) clWaitForEvents(1, new cl_event[]{clTsr.value.event});
                clReleaseMemObject(clTsr.value.data); // Removing data from the device!
            });
            assert !(dataRef instanceof Data);
        }

    }

    /*==================================================================================================================
    |
    |       §(4) : NESTED CLASSES
    |   ---------------------------
    */

    /**
     * This class is an OpenCL-Device specific tensor component used to store
     * the data type of the stored elements (cl_dtype)
     * and
     * a reference to a wrapper containing a pointer to the tensor's data (cl_value).
     * The latter lends its identity for garbage collection!
     */
    static class cl_tsr<V, T extends V> {

        cl_tsr(cl_tsr.cl_value value, cl_dtype  dtype) {
            this.value = value;
            this.dtype = dtype;
        }

        /**
         * This class is responsible for representing the
         * data of a tensor stored on the device.
         * Instances of this class lend their identity to utilize garbage collection
         * of the data that they reference via their "cl_mem" field.
         * Meaning this inner memory object "cl_mem" will
         * be freed via a call hook stored inside a Cleaner instance...
         */
        static class cl_value
        {
            cl_value( int size ) { this.size = size; }

            public final int size;
            public cl_mem    data;
            public cl_event  event;
        }

        public final cl_dtype  dtype;
        public final cl_value  value;

        @Override
        public boolean equals(Object obj) {
            if ( !(obj instanceof cl_tsr) ) return false;
            return ((cl_tsr) obj).value == this.value;
        }

        @Override public int hashCode() {
            return value.hashCode();
        }
    }

    /**
     * This class manages a reference to a so called "ad hoc" program & kernel.
     * Ad hoc is a Latin phrase meaning literally 'to this'.
     * In English, it generally signifies a solution designed for a specific problem or task,
     * non-generalizable, and not intended to be adapted to other purposes.
     * This leads to the purpose of this class, namely to hold the context to a unique kernel with
     * a uniquely associated purpose which has been created by an operation possibly for specific
     * tensor dimensions or possibly other properties...
     */
    static final class cl_ad_hoc
    {
        public final String source;
        public final cl_kernel kernel;
        public final cl_program program;

        public cl_ad_hoc(
                String source, cl_kernel kernel, cl_program program
        ) {
            this.source = source;
            this.kernel = kernel;
            this.program = program;
        }
    }

    /**
     * This is the class responsible for representing NDConfiguration data.
     * Instances of this class lend their identity to utilize garbage collection
     * of the data that they reference via their "cl_mem" field.
     * Meaning this inner memory object "cl_mem" will
     * be freed via a call hook stored inside a Cleaner instance...
     */
    static final class cl_config {
        public cl_mem data;
    }
}