CLDot.java
package neureka.backend.main.implementations.linear;

import neureka.Shape;
import neureka.Tensor;
import neureka.backend.api.ExecutionCall;
import neureka.backend.api.ImplementationFor;
import neureka.backend.main.operations.linear.internal.opencl.CLSum;
import neureka.devices.opencl.KernelCaller;
import neureka.devices.opencl.OpenCLDevice;

import java.util.function.Supplier;
/**
 *  Performs a dot product on two vectors using OpenCL.
 */
public class CLDot implements ImplementationFor<OpenCLDevice>
{
    @Override
    public Tensor<?> run( ExecutionCall<OpenCLDevice> call ) {
        // First we unpack the input tensors (index 0 is the output tensor which will receive the result):
        Tensor<Float> c = call.input(Float.class, 0);
        Tensor<Float> a = call.input(Float.class, 1);
        Tensor<Float> b = call.input(Float.class, 2);
        OpenCLDevice device = call.getDevice();

        if ( a.rank() != 1 || b.rank() != 1 )
            throw new IllegalArgumentException("Input tensors must be vectors.");

        int size = a.shape(0);
        if ( b.shape(0) != size )
            throw new IllegalArgumentException("Input vectors must have the same length.");

        // Then we multiply the two vectors element-wise:
        String kernelName = "multiply_arrays_for_dot_product";
        Supplier<String> code = () ->
                "__kernel void " + kernelName + "(__global const float* a, \n" +
                "                                 __global const float* b, \n" +
                "                                 __global float* c,\n" +
                "                                 const int n) {\n" +
                "    int i = get_global_id(0);\n" +
                "    if (i < n) {\n" +
                "        c[i] = a[i] * b[i];\n" +
                "    }\n" +
                "}";
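
        // We need a temporary tensor on the device to hold the element-wise products: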
        Tensor<Float> temp = Tensor.of(Float.class, Shape.of(size), 0).to(device).mut().setIsVirtual(false);

        // Kernels are cached, so if it is already compiled, it will be retrieved from the cache:
        KernelCaller caller = device.findOrCompileAdHocKernel(kernelName, code);
        // We call OpenCL to do the work:
        caller.pass(a).pass(b).pass(temp).pass(size).call(new long[]{size}, null);
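
        // Next we sum up the elements of the product vector (the reduction also runs on the device):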
        Tensor<Float> out = CLSum.run(temp, device);
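        // Finally we write the resulting scalar into the output tensor: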
        c.mut().at(0).set(out.item());
        return c;
    }
}
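
For context, here is a minimal, hypothetical sketch of how a dot product might reach this implementation through the frontend API. The device lookup via Device.get("gpu") and the Tensor#dot call used below are assumptions about the surrounding neureka API, not something this file confirms; only the Tensor.of, Shape.of, to(...) and item() calls are taken from the code above.

import neureka.Shape;
import neureka.Tensor;
import neureka.devices.Device;

public class CLDotUsageSketch {
    public static void main(String[] args) {
        Device<?> gpu = Device.get("gpu"); // assumed lookup of an available OpenCL device
        Tensor<Float> a = Tensor.of(Float.class, Shape.of(3), 2).to(gpu); // [2.0, 2.0, 2.0]
        Tensor<Float> b = Tensor.of(Float.class, Shape.of(3), 3).to(gpu); // [3.0, 3.0, 3.0]
        Tensor<Float> c = a.dot(b); // assumed frontend call, which should dispatch to CLDot
        System.out.println(c.item()); // expected: 18.0  (2*3 + 2*3 + 2*3)
    }
}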