gpu.cpp 0.1.0
 
gpu.h
1#ifndef GPU_H
2#define GPU_H
3
4#include <array>
5#include <cassert>
6#include <cstring>
7#include <future>
8#include <initializer_list>
9#include <memory>
10#include <set>
11#include <string>
12#include <tuple>
13#include <type_traits>
14#include <unordered_map>
15#include <utility> // std::pair
16#include <vector>
17
18#include "webgpu/webgpu.h"
19
20#include "numeric_types/half.h"
21#include "utils/logging.h"
22
23#ifdef __EMSCRIPTEN__
24#include "emscripten/emscripten.h"
25#endif
26
27namespace gpu {
28
32struct Array {
33 WGPUBuffer buffer;
34 WGPUBufferUsageFlags usage;
35 size_t size; // in bytes
36};
37
49struct Shape {
50 static constexpr size_t kMaxRank = 8; // Maximum rank of a tensor, avoids
51 // dynamic allocation for shape data
52 std::array<size_t, kMaxRank> data = {0};
53 size_t rank = 0;
54 inline Shape() = default;
55 inline Shape(std::initializer_list<size_t> dims) {
56 assert(dims.size() <= kMaxRank);
57 std::copy(dims.begin(), dims.end(), data.begin());
58 rank = dims.size();
59 }
60 inline size_t &operator[](size_t index) {
61 assert(index < rank);
62 return data[index];
63 }
64 inline const size_t &operator[](size_t index) const {
65 assert(index < rank);
66 return data[index];
67 }
68};
69
80inline size_t size(const Shape &shape) {
81 size_t numels = 1;
82 for (size_t i = 0; i < shape.rank; i++) {
83 numels *= shape.data[i];
84 }
85 return numels;
86}
87
96struct Tensor {
97 Array data;
98 Shape shape;
99};
100
110struct TensorView {
111 Tensor data; // non-owning view
112 size_t offset = 0;
113 size_t span = 0;
114};
115
124template <std::size_t N> struct Bindings {
125 std::array<Tensor, N> data;
126 std::array<size_t, N> viewOffsets;
127 std::array<size_t, N> viewSpans;
128 Bindings(const std::initializer_list<Tensor> &init) {
129 std::copy(begin(init), end(init), begin(data));
130 std::fill(begin(viewOffsets), end(viewOffsets), 0);
131 for (size_t i = 0; i < N; ++i) {
132 viewSpans[i] = data[i].data.size;
133 }
134 }
135
136 Bindings(const std::initializer_list<TensorView> &init) {
137 size_t i = 0;
138 for (const auto &tv : init) {
139 data[i] = tv.data;
140 viewOffsets[i] = tv.offset;
141 viewSpans[i] = tv.span;
142 ++i;
143 }
144 }
145
146 Bindings(const std::initializer_list<Array> &init) {
147 std::copy(begin(init), end(init), begin(data));
148 std::fill(begin(viewOffsets), end(viewOffsets), 0);
149 for (size_t i = 0; i < N; ++i) {
150 viewSpans[i] = data[i].size;
151 }
152 }
153
154 Tensor &operator[](std::size_t index) { return data[index]; }
155 const Tensor &operator[](std::size_t index) const { return data[index]; }
156};
157
161template <std::size_t N> Bindings(std::array<Tensor, N>) -> Bindings<N>;
162template <typename... Args> Bindings(Args...) -> Bindings<sizeof...(Args)>;
163
164struct Context; // Forward declaration so that TensorPool can have a pointer to
165 // Context
166
176struct TensorPool {
177 inline TensorPool(Context *ctx) : ctx(ctx), data() {};
178 Context *ctx;
179 std::unordered_map<WGPUBuffer, Tensor> data;
180 ~TensorPool();
181};
182
183enum NumType {
184 kf16, // (experimental)
185 kf32
186};
187
191inline size_t sizeBytes(const NumType &type) {
192 switch (type) {
193 case kf16:
194 return sizeof(uint16_t);
195 case kf32:
196 return sizeof(float);
197 default:
198 LOG(kDefLog, kError, "Invalid NumType in size calculation.");
199 return 0;
200 }
201}
202
206inline std::string toString(NumType type) {
207 switch (type) {
208 case kf16:
209 return "f16";
210 case kf32:
211 return "f32";
212 default:
213 LOG(kDefLog, kError, "Invalid NumType in string conversion.");
214 return "unknown";
215 }
216}
217
222inline std::string toString(const Shape &shape) {
223 std::string str;
224 for (size_t i = 0; i < shape.rank; i++) {
225 str += std::to_string(shape.data[i]);
226 if (i < shape.rank - 1) {
227 str += ", ";
228 }
229 }
230 return str;
231}
232
238inline std::string toString(size_t value) { return std::to_string(value); }
239
256inline void replaceAll(std::string &str, const std::string &from,
257 const std::string &to) {
258 size_t start_pos = 0;
259 while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
260 str.replace(start_pos, from.length(), to);
261 start_pos += to.length();
262 }
263}
264
272struct KernelCode {
289 inline KernelCode(const std::string &pData = "", size_t workgroupSize = 256,
290 NumType precision = kf32)
291 : data(pData), workgroupSize({workgroupSize, 1, 1}),
292 precision(precision) {
293 if (precision == kf16) {
294 data = "enable f16;\n" + data;
295 }
296 replaceAll(data, "{{workgroupSize}}", toString({workgroupSize, 1, 1}));
297 replaceAll(data, "{{precision}}", toString(precision));
298 LOG(kDefLog, kTrace, "Shader code:\n%s", data.c_str());
299 }
300
317 inline KernelCode(const std::string &pData,
318 const Shape &workgroupSize = {256, 1, 1},
319 NumType precision = kf32)
320 : data(pData), workgroupSize(workgroupSize), precision(precision) {
321 replaceAll(data, "{{workgroupSize}}", toString(workgroupSize));
322 replaceAll(data, "{{precision}}", toString(precision));
323 LOG(kDefLog, kInfo, "Shader code:\n%s", data.c_str());
324 }
325 std::string data;
326 Shape workgroupSize;
327 NumType precision = kf32;
328 std::string label = "kernel";
329 std::string entryPoint = "main";
330};
331
345inline void
346replaceAll(std::string &str,
347 const std::vector<std::pair<std::string, std::string>> &reps) {
348 for (const auto &rep : reps) {
349 replaceAll(str, rep.first, rep.second);
350 }
351}
352
357struct CallbackData {
358 WGPUBuffer buffer; // managed by owning Kernel
359 size_t bufferSize;
360 void *output; // non-owning, only for target memory in toCPU, not used for
361 // kernel invocations
362 std::promise<void> *promise;
363 std::future<void> *future;
364};
365
370struct CopyData {
371 WGPUCommandBuffer commandBuffer;
372 WGPUBuffer readbackBuffer;
373 std::promise<void> promise;
374 std::future<void> future;
375};
376
382struct Kernel {
383 std::unique_ptr<WGPUBuffer[]> buffers; // non-owning
384 std::unique_ptr<size_t[]> bufferSizes;
385 size_t numBindings;
386 Shape nWorkgroups;
387 WGPUBindGroup bindGroup; // persists between submission
388 WGPUComputePipeline computePipeline; // persists between submission
389 WGPUCommandBuffer commandBuffer; // destroyed upon submission
390};
391
398inline bool operator<(const Kernel &lhs, const Kernel &rhs) {
399 return lhs.commandBuffer < rhs.commandBuffer;
400}
401
407struct KernelPool {
408 inline KernelPool(Context *ctx) : ctx(ctx), data() {}
409 Context *ctx;
410 std::set<Kernel *> data;
411 inline ~KernelPool() {
412 // Note : Some kernel resources such as commandBuffer are harvested by
413 // queue submission, explicitly destroying readback and callback buffers
414 // produces runtime errors.
415 data.clear();
416 }
417};
418
419inline void processEvents(const WGPUInstance& instance) {
420#ifdef __EMSCRIPTEN__
421 emscripten_sleep(0);
422#else
423 wgpuInstanceProcessEvents(instance);
424#endif
425}
426
434struct Context {
435 WGPUInstance instance;
436 WGPUAdapter adapter;
437 WGPUDevice device;
438 WGPUQueue queue;
439 TensorPool pool = TensorPool(this);
440 KernelPool kernelPool = KernelPool(this);
441 ~Context() {
442 LOG(kDefLog, kTrace, "Destroying context");
443 if (queue) {
444 wgpuQueueRelease(queue);
445 } else {
446 LOG(kDefLog, kWarn, "Queue is null");
447 }
448 if (device) {
449 wgpuDeviceRelease(device);
451 } else {
452 LOG(kDefLog, kWarn, "Device is null");
453 }
454 if (adapter) {
455 wgpuAdapterRelease(adapter);
457 } else {
458 LOG(kDefLog, kWarn, "Adapter is null");
459 }
460 if (instance) {
461 wgpuInstanceRelease(instance);
462 } else {
463 LOG(kDefLog, kWarn, "Instance is null");
464 }
465 LOG(kDefLog, kInfo, "Context destroyed");
466 }
467};
468
490inline Tensor
491createTensor(TensorPool &pool, WGPUDevice &device, const Shape &shape,
492 NumType dtype,
493 WGPUBufferUsageFlags usage = WGPUBufferUsage_Storage |
494 WGPUBufferUsage_CopyDst |
495 WGPUBufferUsage_CopySrc) {
496 LOG(kDefLog, kTrace, "Creating tensor");
497 size_t numElements = size(shape);
498 size_t size = sizeBytes(dtype) * numElements;
499 WGPUBufferDescriptor bufferDesc = {
500 .usage = usage,
501 .size = size,
502 };
503 WGPUBuffer buffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
504 pool.data[buffer] = Tensor{
505 .data = Array{.buffer = buffer, .usage = usage, .size = size},
506 .shape = shape,
507 };
508 return pool.data[buffer];
509}
510
530inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype) {
531 return createTensor(ctx.pool, ctx.device, shape, dtype);
532}
533
552inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
553 float *data) {
554 assert(dtype == kf32);
555 Tensor tensor =
556 createTensor(ctx.pool, ctx.device, shape, dtype,
557 WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
558 WGPUBufferUsage_CopySrc);
559 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
560 tensor.data.size);
561 return tensor;
562}
563
582inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
583 half *data) {
584 assert(dtype == kf16);
585 Tensor tensor =
586 createTensor(ctx.pool, ctx.device, shape, dtype,
587 WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
588 WGPUBufferUsage_CopySrc);
589 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
590 tensor.data.size);
591 return tensor;
592}
593
608inline void FreeTensor(TensorPool &pool, Tensor tensor) {
609 if (tensor.data.buffer) {
610 wgpuBufferRelease(tensor.data.buffer);
611 } else {
612 LOG(kDefLog, kWarn, "Tried to free tensor with null buffer");
613 }
614 if (pool.data.find(tensor.data.buffer) != pool.data.end()) {
615 pool.data.erase(tensor.data.buffer);
616 } else {
617 LOG(kDefLog, kWarn, "Tried to free tensor that was not in pool");
618 }
619}
620
624inline TensorPool::~TensorPool() {
625 // Need to get keys in a separate iteration, otherwise iterator is getting
626 // invalidated during erase.
627 std::vector<WGPUBuffer> keys;
628 for (auto &pair : data) {
629 keys.push_back(pair.first);
630 }
631 for (auto &key : keys) {
632 FreeTensor(*this, data[key]);
633 LOG(kDefLog, kTrace, "Freed tensor");
634 }
635}
636
646inline void check(bool condition, const char *message,
647 const char *file = "unknown", int line = -1) {
648 if (!condition) {
649 LOG(kDefLog, kError, "Error in file %s line %d:\n%s", file, line, message);
650 exit(1);
651 } else {
652 LOG(kDefLog, kTrace, "Success in file %s line %d:\n%s", file, line,
653 message);
654 }
655}
656
678inline Context createContext(const WGPUInstanceDescriptor &desc = {},
679 const WGPURequestAdapterOptions &adapterOpts = {},
680 const WGPUDeviceDescriptor &devDescriptor = {}) {
681 Context context;
682 {
683#ifndef __EMSCRIPTEN__
684 context.instance = wgpuCreateInstance(&desc);
685#else
686 // Emscripten does not support the instance descriptor
687 // and throws an assertion error if it is not nullptr.
688 context.instance = wgpuCreateInstance(nullptr);
689#endif
690 check(context.instance, "Initialize WebGPU", __FILE__, __LINE__);
691 }
692 LOG(kDefLog, kInfo, "Requesting adapter");
693 {
694 struct AdapterData {
695 WGPUAdapter adapter = nullptr;
696 bool requestEnded = false;
697 };
698 AdapterData adapterData;
699 auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
700 WGPUAdapter adapter, char const *message,
701 void *pUserData) {
702 AdapterData &adapterData = *reinterpret_cast<AdapterData *>(pUserData);
703 check(status == WGPURequestAdapterStatus_Success,
704 "Request WebGPU adapter", __FILE__, __LINE__);
705 adapterData.adapter = adapter;
706 adapterData.requestEnded = true;
707 };
708 wgpuInstanceRequestAdapter(context.instance, &adapterOpts,
709 onAdapterRequestEnded, (void *)&adapterData);
710 while (!adapterData.requestEnded) {
711 processEvents(context.instance);
712 }
713 assert(adapterData.requestEnded);
714 context.adapter = adapterData.adapter;
715 }
716 LOG(kDefLog, kInfo, "Requesting device");
717 {
718 struct DeviceData {
719 WGPUDevice device = nullptr;
720 bool requestEnded = false;
721 };
722 DeviceData devData;
723 auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status,
724 WGPUDevice device, char const *message,
725 void *pUserData) {
726 DeviceData &devData = *reinterpret_cast<DeviceData *>(pUserData);
727 check(status == WGPURequestDeviceStatus_Success,
728 "Could not get WebGPU device.", __FILE__, __LINE__);
729 LOG(kDefLog, kTrace, "Device Request succeeded %x",
730 static_cast<void *>(device));
731 devData.device = device;
732 devData.requestEnded = true;
733 };
734#ifdef WEBGPU_BACKEND_DAWN
735 devDescriptor.deviceLostCallbackInfo = {
736 .callback =
737 [](WGPUDevice const *device, WGPUDeviceLostReason reason,
738 char const *message, void *userdata) {
739 if (reason != WGPUDeviceLostReason_Destroyed) {
740 LOG(kDefLog, kError, "Device lost (code %d):\n%s", reason,
741 message);
742 } else {
743 LOG(kDefLog, kInfo, "Device destroyed: %s", message);
744 }
745 },
746 };
747#endif
748 LOG(kDefLog, kInfo, "Requesting device");
749 wgpuAdapterRequestDevice(context.adapter, &devDescriptor,
750 onDeviceRequestEnded, (void *)&devData);
751 LOG(kDefLog, kInfo, "Waiting for device request to end");
752 while (!devData.requestEnded) {
753 processEvents(context.instance);
754 }
755 LOG(kDefLog, kInfo, "Device request ended");
756 assert(devData.requestEnded);
757 context.device = devData.device;
758 wgpuDeviceSetUncapturedErrorCallback(
759 context.device,
760 [](WGPUErrorType type, char const *message, void *devData) {
761 LOG(kDefLog, kError, "Device uncaptured error: %s", message);
762 throw std::runtime_error("Device uncaptured exception.");
763 },
764 nullptr);
765 }
766 context.queue = wgpuDeviceGetQueue(context.device);
767 return context;
768}
769
770inline void wait(Context &ctx, std::future<void> &future) {
771 while (future.wait_for(std::chrono::seconds(0)) !=
772 std::future_status::ready) {
773 processEvents(ctx.instance);
774 }
775}
776
789inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
790 CopyData &op) {
791 wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
792 CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise,
793 &op.future};
794 wgpuQueueOnSubmittedWorkDone(
795 ctx.queue,
796 [](WGPUQueueWorkDoneStatus status, void *callbackData) {
797 check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
798 __FILE__, __LINE__);
799 const auto *data = static_cast<CallbackData *>(callbackData);
800 wgpuBufferMapAsync(
801 data->buffer, WGPUMapMode_Read, 0, data->bufferSize,
802 [](WGPUBufferMapAsyncStatus status, void *captureData) {
803 const auto *data = static_cast<CallbackData *>(captureData);
804 check(status == WGPUBufferMapAsyncStatus_Success,
805 "Map readbackBuffer", __FILE__, __LINE__);
806 const void *mappedData = wgpuBufferGetConstMappedRange(
807 data->buffer, /*offset=*/0, data->bufferSize);
808 check(mappedData, "Get mapped range", __FILE__, __LINE__);
809 memcpy(data->output, mappedData, data->bufferSize);
810 wgpuBufferUnmap(data->buffer);
811 data->promise->set_value();
812 },
813 callbackData);
814 },
815 &callbackData);
816 wait(ctx, op.future);
817}
818
834inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
835 CopyData op;
836 op.future = op.promise.get_future();
837 {
838 WGPUBufferDescriptor readbackBufferDescriptor = {
839 .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
840 .size = bufferSize,
841 };
842 op.readbackBuffer =
843 wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
844 }
845 {
846 WGPUCommandEncoder commandEncoder;
847 WGPUComputePassEncoder computePassEncoder;
848 commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
849 wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
850 op.readbackBuffer, 0, bufferSize);
851 op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
852 check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
853 }
854 toCPU(ctx, tensor, data, bufferSize, op);
855}
856
868template <size_t N>
869void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
870 toCPU(ctx, tensor, data.data(), sizeof(data));
871}
872
888inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer,
889 size_t size) {
890 wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size);
891}
892
904inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
905 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
906 tensor.data.size);
907}
908
909inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
910 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
911 tensor.data.size);
912}
913
914template <typename Params>
915inline void toGPU(Context &ctx, Params &params, Kernel &op) {
916 // TODO(avh): Maintain params metadata in Kernel and check for consistency.
917 // If a kernel does not have parameters this will quietly overwrite
918 // the last buffer in the bind group with the parameters buffer.
919 if (op.numBindings > 0) {
920 wgpuQueueWriteBuffer(ctx.queue, op.buffers[op.numBindings - 1], 0,
921 static_cast<void *>(&params), sizeof(params));
922 }
923}
924
937inline void resetCommandBuffer(WGPUDevice &device, Kernel &op) {
938 {
939 WGPUCommandEncoder commandEncoder =
940 wgpuDeviceCreateCommandEncoder(device, nullptr);
941 WGPUComputePassEncoder computePassEncoder =
942 wgpuCommandEncoderBeginComputePass(commandEncoder, nullptr);
943 wgpuComputePassEncoderSetPipeline(computePassEncoder, op.computePipeline);
944 wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, op.bindGroup, 0,
945 nullptr);
946 wgpuComputePassEncoderDispatchWorkgroups(
947 computePassEncoder, op.nWorkgroups[0], op.nWorkgroups[1],
948 op.nWorkgroups[2]);
949 wgpuComputePassEncoderEnd(computePassEncoder);
950 op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
951 }
952}
953
958struct NoParam {};
959template <typename T> constexpr bool IsNoParam = std::is_same_v<T, NoParam>;
960
964inline size_t cdiv(size_t n, size_t d) { return (n + d - 1) / d; }
965
970inline Shape cdiv(Shape total, Shape group) {
971 assert(total.rank == group.rank);
972 Shape result;
973 result.rank = total.rank;
974 for (size_t dim = 0; dim < total.rank; ++dim) {
975 result[dim] = cdiv(total[dim], group[dim]);
976 }
977 return result;
978}
979
1007inline Kernel createKernel(Context &ctx, const KernelCode &code,
1008 const Tensor *dataBindings, size_t numTensors,
1009 const size_t *viewOffsets, const Shape &nWorkgroups,
1010 const void *params = nullptr,
1011 size_t paramsSize = 0) {
1012 assert(nWorkgroups.rank == 3);
1013 WGPUDevice device = ctx.device;
1014 WGPUQueue queue = ctx.queue;
1015 Kernel op;
1016 // paramIndex is the index into bgLayoutEntries for the parameters buffer. If
1017 // there are no parameters for the kernel, paramsSize == 0 and paramIndex is
1018 // effectively undefined (== -1)
1019 size_t paramIndex = -1;
1020 // Note: paramIndex is undefined unless paramsSize > 0
1021 size_t numBindings = numTensors;
1022 if (paramsSize > 0) {
1023 numBindings++; // parameters buffer
1024 paramIndex = numBindings - 1; // index of the parameters buffer within
1025 // op.buffers, op.bufferSizes and
1026 // bgLayoutEntries
1027 }
1028 op.buffers = std::make_unique<WGPUBuffer[]>(numBindings);
1029 op.bufferSizes = std::make_unique<size_t[]>(numBindings);
1030 op.numBindings = numBindings;
1031 std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
1032 // Create layout entries for input buffers
1033 for (size_t i = 0; i < numTensors; ++i) {
1034 bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{
1035 .binding = static_cast<uint32_t>(i),
1036 .visibility = WGPUShaderStage_Compute,
1037 .buffer =
1038 WGPUBufferBindingLayout{
1039 .type = WGPUBufferBindingType_Storage,
1040 .minBindingSize = dataBindings[i].data.size,
1041 },
1042 };
1043 }
1044 if (paramsSize > 0) {
1045 LOG(kDefLog, kInfo, "Create layout entry for the params buffer");
1046 // Create layout entry for the params buffer
1047 bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{
1048 .binding = static_cast<uint32_t>(paramIndex),
1049 .visibility = WGPUShaderStage_Compute,
1050 .buffer =
1051 WGPUBufferBindingLayout{
1052 .type = WGPUBufferBindingType_Uniform,
1053 .minBindingSize = paramsSize,
1054 },
1055 };
1056 }
1057 WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
1058 .entryCount = static_cast<uint32_t>(bgLayoutEntries.size()),
1059 .entries = bgLayoutEntries.data(),
1060 };
1061 WGPUBindGroupLayout bgLayout =
1062 wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
1063 for (size_t i = 0; i < numTensors; ++i) {
1064 op.buffers[i] = dataBindings[i].data.buffer;
1065 op.bufferSizes[i] = dataBindings[i].data.size;
1066 }
1067 // Create a buffer for the Params struct
1068 if (paramsSize > 0) {
1069 WGPUBufferDescriptor paramsBufferDesc = {
1070 .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
1071 .size = paramsSize,
1072 .mappedAtCreation = false,
1073 };
1074 op.buffers[paramIndex] = wgpuDeviceCreateBuffer(device, &paramsBufferDesc);
1075 op.bufferSizes[paramIndex] = paramsSize;
1076 wgpuQueueWriteBuffer(queue, op.buffers[paramIndex], 0, params, paramsSize);
1077 LOG(kDefLog, kTrace, "Params buffer written");
1078 } else {
1079 LOG(kDefLog, kTrace, "No params buffer needed");
1080 }
1081 std::vector<WGPUBindGroupEntry> bindGroupEntries(numBindings);
1082 for (size_t i = 0; i < numTensors; ++i) {
1083 bindGroupEntries[i] = WGPUBindGroupEntry{
1084 .binding = static_cast<uint32_t>(i),
1085 .buffer = op.buffers[i],
1086 .offset = viewOffsets[i],
1087 .size = op.bufferSizes[i],
1088 };
1089 }
1090 if (paramsSize > 0) {
1091 LOG(kDefLog, kInfo, "Create bind group entry for the params buffer");
1092 LOG(kDefLog, kInfo, "paramIndex: %d", paramIndex);
1093 bindGroupEntries[paramIndex] = WGPUBindGroupEntry{
1094 .binding = static_cast<uint32_t>(paramIndex),
1095 .buffer = op.buffers[paramIndex],
1096 .offset = 0,
1097 .size = paramsSize,
1098 };
1099 }
1100 LOG(kDefLog, kTrace, "BG Entries Size: %d", numBindings);
1101 WGPUBindGroupDescriptor bindGroupDesc = {
1102 .layout = bgLayout,
1103 .entryCount = static_cast<uint32_t>(numBindings),
1104 .entries = bindGroupEntries.data(),
1105 };
1106 op.bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
1107 {
1108 WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
1109 .bindGroupLayoutCount = 1,
1110 .bindGroupLayouts = &bgLayout,
1111 };
1112 WGPUPipelineLayout pipelineLayout =
1113 wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
1114 WGPUShaderModuleWGSLDescriptor wgslDesc = {
1115 .code = code.data.c_str(),
1116 };
1117 wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
1118 WGPUShaderModuleDescriptor shaderModuleDesc = {};
1119 shaderModuleDesc.nextInChain = &wgslDesc.chain;
1120 shaderModuleDesc.label = code.label.c_str();
1121 WGPUComputePipelineDescriptor computePipelineDesc = {};
1122 computePipelineDesc.layout = pipelineLayout;
1123 computePipelineDesc.compute.module =
1124 wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
1125 computePipelineDesc.compute.entryPoint = code.entryPoint.c_str();
1126 computePipelineDesc.label = code.label.c_str();
1127 op.computePipeline =
1128 wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
1129 }
1130 /*
1131 op.nWorkgroups = {cdiv(nThreads[0], code.workgroupSize[0]),
1132 cdiv(nThreads[1], code.workgroupSize[1]),
1133 cdiv(nThreads[2], code.workgroupSize[2])};
1134 */
1135 op.nWorkgroups = {nWorkgroups[0], nWorkgroups[1], nWorkgroups[2]};
1136 resetCommandBuffer(device, op);
1137 ctx.kernelPool.data.insert(&op);
1138 return op;
1139}
1140
1162template <typename ParamsType = NoParam, size_t numInputs>
1163Kernel createKernel(Context &ctx, const KernelCode &code,
1164 const Bindings<numInputs> &dataBindings,
1165 const Shape &nWorkgroups,
1166 const ParamsType &params = ParamsType{}) {
1167 if constexpr (!IsNoParam<ParamsType>) {
1168 // LOG(kDefLog, kTrace, "Using params of size %d bytes",
1169 // sizeof(ParamsType));
1170 return createKernel(ctx, code, dataBindings.data.data(), numInputs,
1171 dataBindings.viewOffsets.data(), nWorkgroups,
1172 reinterpret_cast<const void *>(&params),
1173 sizeof(ParamsType));
1174 } else {
1175 // LOG(kDefLog, kTrace , "No params");
1176 return createKernel(ctx, code, dataBindings.data.data(), numInputs,
1177 dataBindings.viewOffsets.data(), nWorkgroups, nullptr,
1178 0);
1179 }
1180}
1181
1200inline void dispatchKernel(Context &ctx, Kernel &kernel,
1201 std::promise<void> &promise) {
1202 // Submit the command buffer
1203 wgpuQueueSubmit(ctx.queue, 1, &kernel.commandBuffer);
1204 wgpuQueueOnSubmittedWorkDone(
1205 ctx.queue,
1206 [](WGPUQueueWorkDoneStatus status, void *data) {
1207 check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
1208 __FILE__, __LINE__);
1209 auto *promise = static_cast<std::promise<void> *>(data);
1210 promise->set_value();
1211 },
1212 &promise);
1213}
1214
1215} // namespace gpu
1216
1217#endif // GPU_H
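
As a quick orientation before the member documentation below, here is a minimal end-to-end sketch of how these pieces compose. The WGSL source, sizes, and variable names are illustrative only (not taken from the library's shipped examples), and the header is assumed to be on the include path.

#include <array>
#include <cstdio>
#include "gpu.h"

using namespace gpu;

// Illustrative WGSL: adds 1.0 to each element; {{precision}} and
// {{workgroupSize}} are the placeholders that KernelCode substitutes.
static const char *kAddOne = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&inp)) {
    out[gid.x] = inp[gid.x] + 1.0;
  }
}
)";

int main() {
  constexpr size_t N = 1024;
  Context ctx = createContext();
  std::array<float, N> inputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i);
  }
  Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
  Tensor output = createTensor(ctx, Shape{N}, kf32);
  Kernel op = createKernel(ctx, KernelCode{kAddOne, 256, kf32},
                           Bindings{input, output},
                           {cdiv(N, 256), 1, 1}); // nWorkgroups
  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  std::array<float, N> outputArr;
  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  printf("out[0] = %.1f, out[%zu] = %.1f\n", outputArr[0], N - 1,
         outputArr[N - 1]);
  return 0;
}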
gpu
Definition gpu.h:27
static Logger kDefLog
Default logger for logging messages to stdout at the info level. Output stream and logging level for ...
Definition logging.h:64
void FreeTensor(TensorPool &pool, Tensor tensor)
Frees a tensor resource and updates the tensor pool.
Definition gpu.h:608
void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, CopyData &op)
Copies data from a GPU buffer to CPU memory.
Definition gpu.h:789
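For example (a sketch, assuming a Context ctx and a kf32 Tensor out holding 256 floats created elsewhere):

std::array<float, 256> host;
toCPU(ctx, out, host.data(), sizeof(host)); // explicit byte count
toCPU(ctx, out, host);                      // std::array overload (gpu.h:869)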
void check(bool condition, const char *message, const char *file="unknown", int line=-1)
Checks a condition and logs an error message if the condition is false. In debug mode,...
Definition gpu.h:646
Tensor createTensor(TensorPool &pool, WGPUDevice &device, const Shape &shape, NumType dtype, WGPUBufferUsageFlags usage=WGPUBufferUsage_Storage|WGPUBufferUsage_CopyDst|WGPUBufferUsage_CopySrc)
Tensor factory function to create a tensor (a Tensor type is simply an Array with an N-dimensional Sh...
Definition gpu.h:491
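A sketch of the common overloads, assuming a Context ctx from createContext(); the shape and values are illustrative:

std::array<float, 12> init = {1.0f, 2.0f, 3.0f}; // remaining elements zero
Tensor a = createTensor(ctx, Shape{3, 4}, kf32);              // uninitialized
Tensor b = createTensor(ctx, Shape{3, 4}, kf32, init.data()); // copies init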
void LOG(Logger &logger, int level, const char *message,...)
Log a message to the logger. If NDEBUG is defined in a source or as a compiler flag,...
Definition logging.h:34
void processEvents(const WGPUInstance &instance)
Definition gpu.h:419
Kernel createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, size_t numTensors, const size_t *viewOffsets, const Shape &nWorkgroups, const void *params=nullptr, size_t paramsSize=0)
A factory function to create a kernel on the GPU. The kernel is created with the given WGSL code,...
Definition gpu.h:1007
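In application code the Bindings-based overload (gpu.h:1162) is usually more convenient than this raw-pointer form. A sketch, where kShader, a, b, and ctx are assumed to exist and are not part of this header:

struct Params {
  uint32_t n; // element count, read from the uniform params buffer in WGSL
};
size_t n = size(a.shape);
Kernel op = createKernel(ctx, KernelCode{kShader, 256, kf32},
                         Bindings{a, b},
                         {cdiv(n, 256), 1, 1},              // nWorkgroups
                         Params{static_cast<uint32_t>(n)}); // optional params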
std::string toString(NumType type)
Converts NumType to string.
Definition gpu.h:206
NumType
Definition gpu.h:183
@ kf16
Definition gpu.h:184
@ kf32
Definition gpu.h:185
Bindings(std::array< Tensor, N >) -> Bindings< N >
Deduction guide for Bindings.
@ kError
Definition logging.h:9
@ kWarn
Definition logging.h:9
@ kTrace
Definition logging.h:9
@ kInfo
Definition logging.h:9
void replaceAll(std::string &str, const std::string &from, const std::string &to)
simple in-place string replacement helper function for substituting placeholders in a WGSL string tem...
Definition gpu.h:256
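A sketch of direct use; KernelCode's constructors call this internally for the {{workgroupSize}} and {{precision}} placeholders:

std::string wgsl =
    "@compute @workgroup_size({{workgroupSize}}) fn main() {}";
replaceAll(wgsl, "{{workgroupSize}}", "256, 1, 1");
// wgsl == "@compute @workgroup_size(256, 1, 1) fn main() {}"
// The overload at gpu.h:346 applies several substitutions in one call:
replaceAll(wgsl, {{"{{precision}}", "f32"}, {"{{workgroupSize}}", "256, 1, 1"}});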
constexpr bool IsNoParam
Definition gpu.h:959
bool operator<(const Kernel &lhs, const Kernel &rhs)
Operator implementation to make the Kernel type hashable.
Definition gpu.h:398
void resetCommandBuffer(WGPUDevice &device, Kernel &op)
Resets the command buffer in preparation for a kernel dispatch. Since command buffers are consumed up...
Definition gpu.h:937
Context createContext(const WGPUInstanceDescriptor &desc={}, const WGPURequestAdapterOptions &adapterOpts={}, const WGPUDeviceDescriptor &devDescriptor={})
Factory function to create a GPU context, which aggregates WebGPU API handles to interact with the GP...
Definition gpu.h:678
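A sketch: the zero-argument form is the common case; the option structs follow webgpu.h, so, for instance, a high-performance adapter can be requested (illustrative; behavior depends on the backend):

Context ctx = createContext(); // defaults for instance, adapter, and device

WGPURequestAdapterOptions adapterOpts = {
    .powerPreference = WGPUPowerPreference_HighPerformance,
};
Context ctxHP = createContext({}, adapterOpts);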
void toGPU(Context &ctx, const void *data, WGPUBuffer buffer, size_t size)
Copies data from CPU memory to a GPU buffer. The toGPU overloads are effectively a convenience wrappe...
Definition gpu.h:888
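A sketch, assuming ctx and a kf32 Tensor input of 1024 elements; typically used to refresh inputs between dispatches:

std::vector<float> host(1024, 1.0f);
toGPU(ctx, host.data(), input);                // Tensor overload (gpu.h:904)
toGPU(ctx, host.data(), input.data.buffer,     // raw WGPUBuffer overload
      host.size() * sizeof(float));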
size_t size(const Shape &shape)
Returns the number of elements in a tensor with the given shape, which is equal to the product of the...
Definition gpu.h:80
void dispatchKernel(Context &ctx, Kernel &kernel, std::promise< void > &promise)
Asynchronously submits a kernel to the GPU queue for execution. It also sets up a callback to notify ...
Definition gpu.h:1200
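A sketch of the dispatch-wait loop, assuming ctx and a Kernel op from createKernel; a fresh promise/future pair is needed per dispatch, and the command buffer must be reset before re-dispatching:

for (int step = 0; step < 10; ++step) {
  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  resetCommandBuffer(ctx.device, op); // command buffers are consumed on submit
}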
void wait(Context &ctx, std::future< void > &future)
Definition gpu.h:770
size_t cdiv(size_t n, size_t d)
Ceiling division.
Definition gpu.h:964
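For example, covering 1,000,000 threads with 256-thread workgroups:

size_t n = 1000000;
size_t nWorkgroups = cdiv(n, 256);                   // 3907
Shape grid = cdiv(Shape{n, 1, 1}, Shape{256, 1, 1}); // {3907, 1, 1}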
size_t sizeBytes(const NumType &type)
Returns the number of bytes of a number type.
Definition gpu.h:191
Represents a buffer of values on the GPU.
Definition gpu.h:32
size_t size
Definition gpu.h:35
WGPUBuffer buffer
Definition gpu.h:33
WGPUBufferUsageFlags usage
Definition gpu.h:34
Represents an ordered collection of WGPUBuffers (wrapped as tensors, non-overlapping views,...
Definition gpu.h:124
const Tensor & operator[](std::size_t index) const
Definition gpu.h:155
Tensor & operator[](std::size_t index)
Definition gpu.h:154
Bindings(const std::initializer_list< Tensor > &init)
Definition gpu.h:128
Bindings(const std::initializer_list< Array > &init)
Definition gpu.h:146
std::array< size_t, N > viewSpans
Definition gpu.h:127
std::array< size_t, N > viewOffsets
Definition gpu.h:126
std::array< Tensor, N > data
Definition gpu.h:125
Bindings(const std::initializer_list< TensorView > &init)
Definition gpu.h:136
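A sketch, assuming kf32 tensors a and b created elsewhere; the deduction guides let the bindings be written inline without spelling out N:

auto whole = Bindings{a, b};                  // Bindings<2>, full buffers
auto views = Bindings{TensorView{a, 0, 256},  // first 256 bytes of a
                      TensorView{b, 0, b.data.size}};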
Used for on-done callback data for asynchronous operations such as kernel launching.
Definition gpu.h:357
std::future< void > * future
Definition gpu.h:363
std::promise< void > * promise
Definition gpu.h:362
void * output
Definition gpu.h:360
WGPUBuffer buffer
Definition gpu.h:358
size_t bufferSize
Definition gpu.h:359
Represents a GPU context, aggregates WebGPU API handles to interact with the GPU including the instan...
Definition gpu.h:434
TensorPool pool
Definition gpu.h:439
WGPUQueue queue
Definition gpu.h:438
WGPUInstance instance
Definition gpu.h:435
KernelPool kernelPool
Definition gpu.h:440
WGPUAdapter adapter
Definition gpu.h:436
~Context()
Definition gpu.h:441
WGPUDevice device
Definition gpu.h:437
Staging buffer and callback data for copying data between the GPU and CPU.
Definition gpu.h:370
WGPUCommandBuffer commandBuffer
Definition gpu.h:371
WGPUBuffer readbackBuffer
Definition gpu.h:372
std::promise< void > promise
Definition gpu.h:373
std::future< void > future
Definition gpu.h:374
KernelCode is the representation of WGSL GPU code with template substitutions applied....
Definition gpu.h:272
Shape workgroupSize
Definition gpu.h:326
KernelCode(const std::string &pData="", size_t workgroupSize=256, NumType precision=kf32)
Constructor to create a code object from a template string and optional workgroup size and precision.
Definition gpu.h:289
std::string entryPoint
Definition gpu.h:329
std::string data
Definition gpu.h:325
KernelCode(const std::string &pData, const Shape &workgroupSize={256, 1, 1}, NumType precision=kf32)
Overload of the constructor to create a code object from a template string and workgroup size....
Definition gpu.h:317
std::string label
Definition gpu.h:328
NumType precision
Definition gpu.h:327
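A sketch of a WGSL template using both placeholders; the scalar-workgroup-size constructor expands 256 to (256, 1, 1). The shader itself is illustrative:

static const char *kScale = R"(
@group(0) @binding(0) var<storage, read_write> x: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&x)) {
    x[gid.x] = x[gid.x] * 2.0;
  }
}
)";
KernelCode code{kScale, 256, kf32}; // yields @workgroup_size(256, 1, 1), f32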
Represents handles + metadata for a reusable kernel on the GPU. The struct members can be divided int...
Definition gpu.h:382
std::unique_ptr< size_t[]> bufferSizes
Definition gpu.h:384
size_t numBindings
Definition gpu.h:385
WGPUCommandBuffer commandBuffer
Definition gpu.h:389
WGPUComputePipeline computePipeline
Definition gpu.h:388
WGPUBindGroup bindGroup
Definition gpu.h:387
Shape nWorkgroups
Definition gpu.h:386
std::unique_ptr< WGPUBuffer[]> buffers
Definition gpu.h:383
A pool of kernels to manage GPU resources. For simple use cases this is instantiated as a member in t...
Definition gpu.h:407
Context * ctx
Definition gpu.h:409
std::set< Kernel * > data
Definition gpu.h:410
KernelPool(Context *ctx)
Definition gpu.h:408
NoParam is a no-op type used to indicate that a kernel does not have any parameters.
Definition gpu.h:958
Represents the shape of a tensor.
Definition gpu.h:49
Shape()=default
size_t rank
Definition gpu.h:53
const size_t & operator[](size_t index) const
Definition gpu.h:64
std::array< size_t, kMaxRank > data
Definition gpu.h:52
Shape(std::initializer_list< size_t > dims)
Definition gpu.h:55
size_t & operator[](size_t index)
Definition gpu.h:60
static constexpr size_t kMaxRank
Definition gpu.h:50
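For example:

Shape s{2, 3, 4};   // rank 3
size_t n = size(s); // 24 elements
size_t rows = s[0]; // 2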
Represents a tensor on the GPU, which is a buffer of values with a shape.
Definition gpu.h:96
Shape shape
Definition gpu.h:98
Array data
Definition gpu.h:97
Represents a pool of tensors to manage GPU resources. The pool is responsible for managing the lifeti...
Definition gpu.h:176
Context * ctx
Definition gpu.h:178
TensorPool(Context *ctx)
Definition gpu.h:177
std::unordered_map< WGPUBuffer, Tensor > data
Definition gpu.h:179
~TensorPool()
Destructor for TensorPool which frees all tensors in the pool.
Definition gpu.h:624
Represents a non-owning view into a tensor specifying an offset and a subspan. This is useful for spe...
Definition gpu.h:110
size_t span
Definition gpu.h:113
size_t offset
Definition gpu.h:112
Tensor data
Definition gpu.h:111
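A sketch, assuming a kf32 Tensor a of 1024 elements; offset and span are in bytes:

TensorView upper{a, /*offset=*/512 * sizeof(float), /*span=*/512 * sizeof(float)};
auto bindings = Bindings{upper}; // Bindings<1> over the second half of a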