#include <initializer_list>
#include <unordered_map>
#include "webgpu/webgpu.h"
#include "numeric_types/half.h"
#include "emscripten/emscripten.h"
  std::array<size_t, kMaxRank> data = {0};

  inline Shape(std::initializer_list<size_t> dims) {
    std::copy(dims.begin(), dims.end(), data.begin());
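For reference, a Shape is built from an initializer list of dimensions, up to kMaxRank of them. A minimal sketch (it assumes gpu.h is included and the gpu namespace is in scope, as do the other sketches in this listing):

  Shape shape{2, 3, 4};      // rank-3 shape
  size_t rows = shape[0];    // 2, via operator[]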
  for (size_t i = 0; i < shape.rank; i++) {
    numels *= shape.data[i];
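size() multiplies the dimensions together, so for example:

  size_t numels = size(Shape{2, 3, 4});   // 24 elements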
  Bindings(const std::initializer_list<Tensor> &init) {
    std::copy(begin(init), end(init), begin(data));
    for (size_t i = 0; i < N; ++i) {

  Bindings(const std::initializer_list<TensorView> &init) {
    for (const auto &tv : init) {

  Bindings(const std::initializer_list<Array> &init) {
    std::copy(begin(init), end(init), begin(data));
    for (size_t i = 0; i < N; ++i) {
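Bindings collects, in order, the buffers a kernel binds; the deduction guide in the summary below infers N from the initializer list. A sketch, assuming tensors t0 and t1 created earlier with createTensor:

  Bindings bindings{t0, t1};    // deduced as Bindings<2>
  Tensor &first = bindings[0];  // operator[] returns the bound Tensor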
  std::unordered_map<WGPUBuffer, Tensor> data;
    return sizeof(uint16_t);
    return sizeof(float);
  for (size_t i = 0; i < shape.rank; i++) {
    str += std::to_string(shape.data[i]);
    if (i < shape.rank - 1) {
inline std::string toString(size_t value) { return std::to_string(value); }
inline void replaceAll(std::string &str, const std::string &from,
                       const std::string &to) {
  size_t start_pos = 0;
  while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
    str.replace(start_pos, from.length(), to);
    start_pos += to.length();
    const std::vector<std::pair<std::string, std::string>> &reps) {
  for (const auto &rep : reps) {
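replaceAll is the substitution helper used on WGSL template strings; a sketch with an illustrative placeholder name:

  std::string wgsl = "@compute @workgroup_size({{workgroupSize}}) ...";
  replaceAll(wgsl, "{{workgroupSize}}", toString(size_t{256}));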
  wgpuInstanceProcessEvents(instance);
    wgpuQueueRelease(queue);
    wgpuDeviceRelease(device);
    WGPUBufferUsageFlags usage = WGPUBufferUsage_Storage |
                                 WGPUBufferUsage_CopyDst |
                                 WGPUBufferUsage_CopySrc) {
  size_t numElements = size(shape);
  WGPUBufferDescriptor bufferDesc = {
  WGPUBuffer buffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
  return pool.data[buffer];
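A usage sketch of the tensor factory. The pool/device form follows the signature in the summary below; treating the pool and device as Context members (ctx.pool, ctx.device) is an assumption of this sketch:

  Tensor t = createTensor(ctx.pool, ctx.device, Shape{1024}, kf32);
  // t.data.buffer is the underlying WGPUBuffer holding size({1024}) f32 values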
  assert(dtype == kf32);
      WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
          WGPUBufferUsage_CopySrc);

  assert(dtype == kf16);
      WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
          WGPUBufferUsage_CopySrc);
    std::vector<WGPUBuffer> keys;
    for (auto &pair : data) {
      keys.push_back(pair.first);
    for (auto &key : keys) {
inline void check(bool condition, const char *message,
                  const char *file = "unknown", int line = -1) {
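A usage sketch of check, following the pattern used throughout this file:

  check(buffer != nullptr, "Create buffer", __FILE__, __LINE__);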
    const WGPURequestAdapterOptions &adapterOpts = {},
    const WGPUDeviceDescriptor &devDescriptor = {}) {
#ifndef __EMSCRIPTEN__
  context.instance = wgpuCreateInstance(&desc);
  context.instance = wgpuCreateInstance(nullptr);
  check(context.instance, "Initialize WebGPU", __FILE__, __LINE__);

  WGPUAdapter adapter = nullptr;
  bool requestEnded = false;
  AdapterData adapterData;
  auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
                                  WGPUAdapter adapter, char const *message,
    AdapterData &adapterData = *reinterpret_cast<AdapterData *>(pUserData);
    check(status == WGPURequestAdapterStatus_Success,
          "Request WebGPU adapter", __FILE__, __LINE__);
    adapterData.adapter = adapter;
    adapterData.requestEnded = true;
  wgpuInstanceRequestAdapter(context.instance, &adapterOpts,
                             onAdapterRequestEnded, (void *)&adapterData);
  while (!adapterData.requestEnded) {
  assert(adapterData.requestEnded);
  context.adapter = adapterData.adapter;

  WGPUDevice device = nullptr;
  bool requestEnded = false;
  auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status,
                                 WGPUDevice device, char const *message,
    DeviceData &devData = *reinterpret_cast<DeviceData *>(pUserData);
    check(status == WGPURequestDeviceStatus_Success,
          "Could not get WebGPU device.", __FILE__, __LINE__);
        static_cast<void *>(device));
    devData.device = device;
    devData.requestEnded = true;
#ifdef WEBGPU_BACKEND_DAWN
  devDescriptor.deviceLostCallbackInfo = {
      [](WGPUDevice const *device, WGPUDeviceLostReason reason,
         char const *message, void *userdata) {
        if (reason != WGPUDeviceLostReason_Destroyed) {
  wgpuAdapterRequestDevice(context.adapter, &devDescriptor,
                           onDeviceRequestEnded, (void *)&devData);
  while (!devData.requestEnded) {
  assert(devData.requestEnded);
  context.device = devData.device;
  wgpuDeviceSetUncapturedErrorCallback(
      [](WGPUErrorType type, char const *message, void *devData) {
        LOG(kDefLog, kError, "Device uncaptured error: %s", message);
        throw std::runtime_error("Device uncaptured exception.");
  context.queue = wgpuDeviceGetQueue(context.device);
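Putting the pieces together, createContext() is typically called once at startup with default descriptors (a sketch; see its signature in the summary below):

  Context ctx = createContext();
  // ctx.instance, ctx.adapter, ctx.device and ctx.queue are now ready to use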
  while (future.wait_for(std::chrono::seconds(0)) !=
         std::future_status::ready) {
  wgpuQueueOnSubmittedWorkDone(
      [](WGPUQueueWorkDoneStatus status, void *callbackData) {
        check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
        const auto *data = static_cast<CallbackData *>(callbackData);
            data->buffer, WGPUMapMode_Read, 0, data->bufferSize,
            [](WGPUBufferMapAsyncStatus status, void *captureData) {
              const auto *data = static_cast<CallbackData *>(captureData);
              check(status == WGPUBufferMapAsyncStatus_Success,
                    "Map readbackBuffer", __FILE__, __LINE__);
              const void *mappedData = wgpuBufferGetConstMappedRange(
                  data->buffer, 0, data->bufferSize);
              check(mappedData, "Get mapped range", __FILE__, __LINE__);
              memcpy(data->output, mappedData, data->bufferSize);
              wgpuBufferUnmap(data->buffer);
              data->promise->set_value();
  wait(ctx, op.future);
  WGPUBufferDescriptor readbackBufferDescriptor = {
      .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
      wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
  WGPUCommandEncoder commandEncoder;
  WGPUComputePassEncoder computePassEncoder;
  commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
  op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
  toCPU(ctx, tensor, data, bufferSize, op);
  toCPU(ctx, tensor, data.data(), sizeof(data));
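The pointer-and-size overload used above blocks until the readback completes; for example, with the tensor t from the earlier sketch:

  std::array<float, 1024> host;
  toCPU(ctx, t, host.data(), sizeof(host));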
  wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size);
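toGPU is a thin wrapper over wgpuQueueWriteBuffer; a sketch copying host data into the buffer of the tensor t from the earlier sketch:

  std::array<float, 1024> host = {};
  toGPU(ctx, host.data(), t.data.buffer, sizeof(host));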
template <typename Params>
                       static_cast<void *>(&params), sizeof(params));
  WGPUCommandEncoder commandEncoder =
      wgpuDeviceCreateCommandEncoder(device, nullptr);
  WGPUComputePassEncoder computePassEncoder =
      wgpuCommandEncoderBeginComputePass(commandEncoder, nullptr);
  wgpuComputePassEncoderSetPipeline(computePassEncoder, op.computePipeline);
  wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, op.bindGroup, 0,
  wgpuComputePassEncoderDispatchWorkgroups(
  wgpuComputePassEncoderEnd(computePassEncoder);
  op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
template <typename T>
constexpr bool IsNoParam = std::is_same_v<T, NoParam>;
inline size_t cdiv(size_t n, size_t d) { return (n + d - 1) / d; }
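Ceiling division sizes dispatch grids so a partial workgroup still covers the remainder; for example:

  size_t nGroups = cdiv(1000, 256);   // 4 workgroups of 256 cover 1000 elements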
  for (size_t dim = 0; dim < total.rank; ++dim) {
    result[dim] = cdiv(total[dim], group[dim]);
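The Shape overload applies cdiv per dimension, turning a total thread extent and a workgroup size into a dispatch grid (a sketch):

  Shape total = {1000, 1, 1};
  Shape wgSize = {256, 1, 1};
  Shape nWorkgroups = cdiv(total, wgSize);   // {4, 1, 1}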
                    const Tensor *dataBindings, size_t numTensors,
                    const size_t *viewOffsets, const Shape &nWorkgroups,
                    const void *params = nullptr,
                    size_t paramsSize = 0) {
  assert(nWorkgroups.rank == 3);
  WGPUDevice device = ctx.device;
  WGPUQueue queue = ctx.queue;
  size_t paramIndex = -1;
  size_t numBindings = numTensors;
  if (paramsSize > 0) {
    paramIndex = numBindings - 1;
  op.buffers = std::make_unique<WGPUBuffer[]>(numBindings);
  op.bufferSizes = std::make_unique<size_t[]>(numBindings);
  std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
  for (size_t i = 0; i < numTensors; ++i) {
    bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{
        .binding = static_cast<uint32_t>(i),
        .visibility = WGPUShaderStage_Compute,
            WGPUBufferBindingLayout{
                .type = WGPUBufferBindingType_Storage,
                .minBindingSize = dataBindings[i].data.size,
  if (paramsSize > 0) {
    LOG(kDefLog, kInfo, "Create layout entry for the params buffer");
    bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{
        .binding = static_cast<uint32_t>(paramIndex),
        .visibility = WGPUShaderStage_Compute,
            WGPUBufferBindingLayout{
                .type = WGPUBufferBindingType_Uniform,
                .minBindingSize = paramsSize,
  WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
      .entryCount = static_cast<uint32_t>(bgLayoutEntries.size()),
      .entries = bgLayoutEntries.data(),
  WGPUBindGroupLayout bgLayout =
      wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
  for (size_t i = 0; i < numTensors; ++i) {
  if (paramsSize > 0) {
    WGPUBufferDescriptor paramsBufferDesc = {
        .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
        .mappedAtCreation = false,
    op.buffers[paramIndex] = wgpuDeviceCreateBuffer(device, &paramsBufferDesc);
    wgpuQueueWriteBuffer(queue, op.buffers[paramIndex], 0, params, paramsSize);
    LOG(kDefLog, kTrace, "Params buffer written");
    LOG(kDefLog, kTrace, "No params buffer needed");
  std::vector<WGPUBindGroupEntry> bindGroupEntries(numBindings);
  for (size_t i = 0; i < numTensors; ++i) {
    bindGroupEntries[i] = WGPUBindGroupEntry{
        .binding = static_cast<uint32_t>(i),
        .offset = viewOffsets[i],
  if (paramsSize > 0) {
    LOG(kDefLog, kInfo, "Create bind group entry for the params buffer");
    LOG(kDefLog, kInfo, "paramIndex: %d", paramIndex);
    bindGroupEntries[paramIndex] = WGPUBindGroupEntry{
        .binding = static_cast<uint32_t>(paramIndex),
        .buffer = op.buffers[paramIndex],
  LOG(kDefLog, kTrace, "BG Entries Size: %d", numBindings);
  WGPUBindGroupDescriptor bindGroupDesc = {
      .entryCount = static_cast<uint32_t>(numBindings),
      .entries = bindGroupEntries.data(),
  op.bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
  WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
      .bindGroupLayoutCount = 1,
      .bindGroupLayouts = &bgLayout,
  WGPUPipelineLayout pipelineLayout =
      wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
  WGPUShaderModuleWGSLDescriptor wgslDesc = {
      .code = code.data.c_str(),
  wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
  WGPUShaderModuleDescriptor shaderModuleDesc = {};
  shaderModuleDesc.nextInChain = &wgslDesc.chain;
  shaderModuleDesc.label = code.label.c_str();
  WGPUComputePipelineDescriptor computePipelineDesc = {};
  computePipelineDesc.layout = pipelineLayout;
  computePipelineDesc.compute.module =
      wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
  computePipelineDesc.compute.entryPoint = code.entryPoint.c_str();
  computePipelineDesc.label = code.label.c_str();
  wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
  op.nWorkgroups = {nWorkgroups[0], nWorkgroups[1], nWorkgroups[2]};
template <typename ParamsType = NoParam, size_t numInputs>
                    const Shape &nWorkgroups,
                    const ParamsType &params = ParamsType{}) {
  if constexpr (!IsNoParam<ParamsType>) {
    return createKernel(ctx, code, dataBindings.data.data(), numInputs,
                        reinterpret_cast<const void *>(&params),
                        sizeof(ParamsType));
                        dataBindings.viewOffsets.data(), nWorkgroups, nullptr,
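For illustration, a kernel is created from a KernelCode and a Bindings collection. In this sketch kGeluWGSL stands in for a WGSL source string, and input/output are tensors created as sketched earlier:

  constexpr size_t N = 1000;
  KernelCode code = {kGeluWGSL, /*workgroupSize=*/256, kf32};
  Kernel op = createKernel(ctx, code, Bindings{input, output},
                           /*nWorkgroups=*/cdiv({N, 1, 1}, {256, 1, 1}));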
                           std::promise<void> &promise) {
  wgpuQueueOnSubmittedWorkDone(
      [](WGPUQueueWorkDoneStatus status, void *data) {
        check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
              __FILE__, __LINE__);
        auto *promise = static_cast<std::promise<void> *>(data);
        promise->set_value();
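Dispatch is asynchronous: the callback above fulfills the promise, and wait() blocks on the matching future while processing events. Continuing the kernel sketch:

  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  std::array<float, N> result;
  toCPU(ctx, output, result.data(), sizeof(result));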
static Logger kDefLog
Default logger for logging messages to stdout at the info level. Output stream and logging level for ...
void FreeTensor(TensorPool &pool, Tensor tensor)
Frees a tensor resource and updates the tensor pool.
void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, CopyData &op)
Copies data from a GPU buffer to CPU memory.
void check(bool condition, const char *message, const char *file="unknown", int line=-1)
Checks a condition and logs an error message if the condition is false. In debug mode,...
Tensor createTensor(TensorPool &pool, WGPUDevice &device, const Shape &shape, NumType dtype, WGPUBufferUsageFlags usage=WGPUBufferUsage_Storage|WGPUBufferUsage_CopyDst|WGPUBufferUsage_CopySrc)
Tensor factory function to create a tensor (a Tensor type is simply an Array with an N-dimensional Sh...
void LOG(Logger &logger, int level, const char *message,...)
Log a message to the logger. If NDEBUG is defined in a source or as a compiler flag,...
void processEvents(const WGPUInstance &instance)
Kernel createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, size_t numTensors, const size_t *viewOffsets, const Shape &nWorkgroups, const void *params=nullptr, size_t paramsSize=0)
A factory function to create a kernel on the GPU. The kernel is created with the given WGSL code,...
std::string toString(NumType type)
Converts NumType to string.
Bindings(std::array< Tensor, N >) -> Bindings< N >
Deduction guide for Bindings.
void replaceAll(std::string &str, const std::string &from, const std::string &to)
simple in-place string replacement helper function for substituting placeholders in a WGSL string tem...
bool operator<(const Kernel &lhs, const Kernel &rhs)
Operator implementation to make the Kernel type hashable.
void resetCommandBuffer(WGPUDevice &device, Kernel &op)
Resets the command buffer in preparation for a kernel dispatch. Since command buffers are consumed up...
Context createContext(const WGPUInstanceDescriptor &desc={}, const WGPURequestAdapterOptions &adapterOpts={}, const WGPUDeviceDescriptor &devDescriptor={})
Factory function to create a GPU context, which aggregates WebGPU API handles to interact with the GP...
void toGPU(Context &ctx, const void *data, WGPUBuffer buffer, size_t size)
Copies data from CPU memory to a GPU buffer. The toGPU overloads are effectively a convenience wrappe...
size_t size(const Shape &shape)
Returns the number of elements in a tensor with the given shape, which is equal to the product of the...
void dispatchKernel(Context &ctx, Kernel &kernel, std::promise< void > &promise)
Asynchronously submits a kernel to the GPU queue for execution. It also sets up a callback to notify ...
void wait(Context &ctx, std::future< void > &future)
size_t cdiv(size_t n, size_t d)
Ceiling division.
size_t sizeBytes(const NumType &type)
Returns the number of bytes of a number type.
Represents a buffer of values on the GPU.
WGPUBufferUsageFlags usage
Represents an ordered collection of WGPUBuffers (wrapped as tensors, non-overlapping views,...
const Tensor & operator[](std::size_t index) const
Tensor & operator[](std::size_t index)
Bindings(const std::initializer_list< Tensor > &init)
Bindings(const std::initializer_list< Array > &init)
std::array< size_t, N > viewSpans
std::array< size_t, N > viewOffsets
std::array< Tensor, N > data
Bindings(const std::initializer_list< TensorView > &init)
Used for on-done callback data for asynchronous operations such as kernel launching.
std::future< void > * future
std::promise< void > * promise
Represents a GPU context, aggregates WebGPU API handles to interact with the GPU including the instan...
Staging buffer and callback data for copying data between the GPU and CPU.
WGPUCommandBuffer commandBuffer
WGPUBuffer readbackBuffer
std::promise< void > promise
std::future< void > future
KernelCode is the representation of WGSL GPU code with template substitutions applied....
KernelCode(const std::string &pData="", size_t workgroupSize=256, NumType precision=kf32)
Constructor to create a code object from a template string and optional workgroup size and precision.
KernelCode(const std::string &pData, const Shape &workgroupSize={256, 1, 1}, NumType precision=kf32)
Overload of the constructor to create a code object from a template string and workgroup size....
Represents handles + metadata for a reusable kernel on the GPU. The struct members can be divided int...
std::unique_ptr< size_t[]> bufferSizes
WGPUCommandBuffer commandBuffer
WGPUComputePipeline computePipeline
std::unique_ptr< WGPUBuffer[]> buffers
A pool of kernels to manage GPU resources. For simple use cases this is instantiated as a member in t...
std::set< Kernel * > data
NoParam is a no-op type used to indicate that a kernel does not have any parameters.
Represents the shape of a tensor.
const size_t & operator[](size_t index) const
std::array< size_t, kMaxRank > data
Shape(std::initializer_list< size_t > dims)
size_t & operator[](size_t index)
static constexpr size_t kMaxRank
Represents a tensor on the GPU, which is a buffer of values with a shape.
Represents a pool of tensors to manage GPU resources. The pool is responsible for managing the lifeti...
std::unordered_map< WGPUBuffer, Tensor > data
~TensorPool()
Destructor for TensorPool which frees all tensors in the pool.
Represents a non-owning view into a tensor specifying an offset and a subspan. This is useful for spe...