gpu.cpp 0.1.0
 
gpu.h
1#ifndef GPU_H
2#define GPU_H
3
4#include <array>
5#include <cassert>
6#include <cstring>
7#include <future>
8#include <initializer_list>
9#include <memory>
10#include <set>
11#include <string>
12#include <tuple>
13#include <type_traits>
14#include <unordered_map>
15#include <utility> // std::pair
16#include <vector>
17
18#include "webgpu/webgpu.h"
19
20#include "numeric_types/half.h"
21#include "utils/logging.h"
22
23#ifdef __EMSCRIPTEN__
24#include "emscripten/emscripten.h"
25#endif
26
27namespace gpu {
28
32struct Array {
33 WGPUBuffer buffer;
34 WGPUBufferUsageFlags usage;
35 size_t size; // in bytes
36};
37
49struct Shape {
50 static constexpr size_t kMaxRank = 8; // Maximum rank of a tensor, avoids
51 // dynamic allocation for shape data
52 std::array<size_t, kMaxRank> data = {0};
53 size_t rank = 0;
54 inline Shape() = default;
55 inline Shape(std::initializer_list<size_t> dims) {
56 assert(dims.size() <= kMaxRank);
57 std::copy(dims.begin(), dims.end(), data.begin());
58 rank = dims.size();
59 }
60 inline size_t &operator[](size_t index) {
61 assert(index < rank);
62 return data[index];
63 }
64 inline const size_t &operator[](size_t index) const {
65 assert(index < rank);
66 return data[index];
67 }
68};
69
80inline size_t size(const Shape &shape) {
81 size_t numels = 1;
82 for (size_t i = 0; i < shape.rank; i++) {
83 numels *= shape.data[i];
84 }
85 return numels;
86}
87
96struct Tensor {
97 Array data;
98 Shape shape;
99};
100
110struct TensorView {
111 Tensor data; // non-owning view
112 size_t offset = 0;
113 size_t span = 0;
114};
115
124template <std::size_t N> struct Bindings {
125 std::array<Tensor, N> data;
126 std::array<size_t, N> viewOffsets;
127 std::array<size_t, N> viewSpans;
128 Bindings(const std::initializer_list<Tensor> &init) {
129 std::copy(begin(init), end(init), begin(data));
130 std::fill(begin(viewOffsets), end(viewOffsets), 0);
131 for (size_t i = 0; i < N; ++i) {
132 viewSpans[i] = data[i].data.size;
133 }
134 }
135
136 Bindings(const std::initializer_list<TensorView> &init) {
137 size_t i = 0;
138 for (const auto &tv : init) {
139 data[i] = tv.data;
140 viewOffsets[i] = tv.offset;
141 viewSpans[i] = tv.span;
142 ++i;
143 }
144 }
145
146 Bindings(const std::initializer_list<Array> &init) {
147 std::copy(begin(init), end(init), begin(data));
148 std::fill(begin(viewOffsets), end(viewOffsets), 0);
149 for (size_t i = 0; i < N; ++i) {
150 viewSpans[i] = data[i].size;
151 }
152 }
153
154 Tensor &operator[](std::size_t index) { return data[index]; }
155 const Tensor &operator[](std::size_t index) const { return data[index]; }
156};
157
161template <std::size_t N> Bindings(std::array<Tensor, N>) -> Bindings<N>;
162template <typename... Args> Bindings(Args...) -> Bindings<sizeof...(Args)>;
163
164struct Context; // Forward declaration so that TensorPool can have a pointer to
165 // Context
166
176struct TensorPool {
177 inline TensorPool(Context *ctx) : ctx(ctx), data() {};
178 Context *ctx;
179 std::unordered_map<WGPUBuffer, Tensor> data;
180 ~TensorPool();
181};
182
183enum NumType {
184 kf16, // (experimental)
185 kf32
186};
187
191inline size_t sizeBytes(const NumType &type) {
192 switch (type) {
193 case kf16:
194 return sizeof(uint16_t);
195 case kf32:
196 return sizeof(float);
197 default:
198 LOG(kDefLog, kError, "Invalid NumType in size calculation.");
199 return 0;
200 }
201}
202
206inline std::string toString(NumType type) {
207 switch (type) {
208 case kf16:
209 return "f16";
210 case kf32:
211 return "f32";
212 default:
213 LOG(kDefLog, kError, "Invalid NumType in string conversion.");
214 return "unknown";
215 }
216}
217
222inline std::string toString(const Shape &shape) {
223 std::string str;
224 for (size_t i = 0; i < shape.rank; i++) {
225 str += std::to_string(shape.data[i]);
226 if (i < shape.rank - 1) {
227 str += ", ";
228 }
229 }
230 return str;
231}
232
238inline std::string toString(size_t value) { return std::to_string(value); }
239
256inline void replaceAll(std::string &str, const std::string &from,
257 const std::string &to) {
258 size_t start_pos = 0;
259 while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
260 str.replace(start_pos, from.length(), to);
261 start_pos += to.length();
262 }
263}
264
272struct KernelCode {
289 inline KernelCode(const std::string &pData = "", size_t workgroupSize = 256,
290 NumType precision = kf32)
291 : data(pData), workgroupSize({workgroupSize, 1, 1}),
292 precision(precision) {
293 if (precision == kf16) {
294 data = "enable f16;\n" + data;
295 }
296 replaceAll(data, "{{workgroupSize}}", toString({workgroupSize, 1, 1}));
297 replaceAll(data, "{{precision}}", toString(precision));
298 LOG(kDefLog, kTrace, "Shader code:\n%s", data.c_str());
299 }
300
317 inline KernelCode(const std::string &pData,
318 const Shape &workgroupSize = {256, 1, 1},
319 NumType precision = kf32)
320 : data(pData), workgroupSize(workgroupSize), precision(precision) {
321 replaceAll(data, "{{workgroupSize}}", toString(workgroupSize));
322 replaceAll(data, "{{precision}}", toString(precision));
323 LOG(kDefLog, kInfo, "Shader code:\n%s", data.c_str());
324 }
325 std::string data;
326 Shape workgroupSize;
327 NumType precision = kf32;
328 std::string label = "kernel";
329 std::string entryPoint = "main";
330};
331
345inline void
346replaceAll(std::string &str,
347 const std::vector<std::pair<std::string, std::string>> &reps) {
348 for (const auto &rep : reps) {
349 replaceAll(str, rep.first, rep.second);
350 }
351}
352
357struct CallbackData {
358 WGPUBuffer buffer; // managed by owning Kernel
359 size_t bufferSize;
360 void *output; // non-owning, only for target memory in toCPU, not used for
361 // kernel invocations
362 std::promise<void> *promise;
363 std::future<void> *future;
364};
365
370struct CopyData {
371 WGPUCommandBuffer commandBuffer;
372 WGPUBuffer readbackBuffer;
373 std::promise<void> promise;
374 std::future<void> future;
375};
376
382struct Kernel {
383 std::unique_ptr<WGPUBuffer[]> buffers; // non-owning
384 std::unique_ptr<size_t[]> bufferSizes;
385 size_t numBindings;
386 Shape nWorkgroups;
387 WGPUBindGroup bindGroup; // persists between submission
388 WGPUComputePipeline computePipeline; // persists between submission
389 WGPUCommandBuffer commandBuffer; // destroyed upon submission
390};
391
398inline bool operator<(const Kernel &lhs, const Kernel &rhs) {
399 return lhs.commandBuffer < rhs.commandBuffer;
400}
401
407struct KernelPool {
408 inline KernelPool(Context *ctx) : ctx(ctx), data() {}
409 Context *ctx;
410 std::set<Kernel *> data;
411 inline ~KernelPool() {
412 // Note : Some kernel resources such as commandBuffer are harvested by
413 // queue submission, explicitly destroying readback and callback buffers
414 // produces runtime errors.
415 data.clear();
416 }
417};
418
419inline void processEvents(const WGPUInstance& instance) {
420#ifdef __EMSCRIPTEN__
421 emscripten_sleep(0);
422#else
423 wgpuInstanceProcessEvents(instance);
424#endif
425}
426
434struct Context {
435 WGPUInstance instance;
436 WGPUAdapter adapter;
437 WGPUDevice device;
438 WGPUQueue queue;
439 TensorPool pool = TensorPool(this);
440 KernelPool kernelPool = KernelPool(this);
441 ~Context() {
442 LOG(kDefLog, kTrace, "Destroying context");
443 if (queue) {
444 wgpuQueueRelease(queue);
445 } else {
446 LOG(kDefLog, kWarn, "Queue is null");
447 }
448 if (device) {
449 wgpuDeviceRelease(device);
451 } else {
452 LOG(kDefLog, kWarn, "Device is null");
453 }
454 if (adapter) {
455 wgpuAdapterRelease(adapter);
457 } else {
458 LOG(kDefLog, kWarn, "Adapter is null");
459 }
460 if (instance) {
461 wgpuInstanceRelease(instance);
462 } else {
463 LOG(kDefLog, kWarn, "Instance is null");
464 }
465 LOG(kDefLog, kInfo, "Context destroyed");
466 }
467};
468
490inline Tensor
491createTensor(TensorPool &pool, WGPUDevice &device, const Shape &shape,
492 NumType dtype,
493 WGPUBufferUsageFlags usage = WGPUBufferUsage_Storage |
494 WGPUBufferUsage_CopyDst |
495 WGPUBufferUsage_CopySrc) {
496 LOG(kDefLog, kTrace, "Creating tensor");
497 size_t numElements = size(shape);
498 size_t size = sizeBytes(dtype) * numElements;
499 WGPUBufferDescriptor bufferDesc = {
500 .usage = usage,
501 .size = size,
502 };
503 WGPUBuffer buffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
504 pool.data[buffer] = Tensor{
505 .data = Array{.buffer = buffer, .usage = usage, .size = size},
506 .shape = shape,
507 };
508 return pool.data[buffer];
509}
510
530inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype) {
531 return createTensor(ctx.pool, ctx.device, shape, dtype);
532}
533
552inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
553 float *data) {
554 assert(dtype == kf32);
555 Tensor tensor =
556 createTensor(ctx.pool, ctx.device, shape, dtype,
557 WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
558 WGPUBufferUsage_CopySrc);
559 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
560 tensor.data.size);
561 return tensor;
562}
563
582inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
583 half *data) {
584 assert(dtype == kf16);
585 Tensor tensor =
586 createTensor(ctx.pool, ctx.device, shape, dtype,
587 WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
588 WGPUBufferUsage_CopySrc);
589 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
590 tensor.data.size);
591 return tensor;
592}
593
608inline void FreeTensor(TensorPool &pool, Tensor tensor) {
609 if (tensor.data.buffer) {
610 wgpuBufferRelease(tensor.data.buffer);
611 } else {
612 LOG(kDefLog, kWarn, "Tried to free tensor with null buffer");
613 }
614 if (pool.data.find(tensor.data.buffer) != pool.data.end()) {
615 pool.data.erase(tensor.data.buffer);
616 } else {
617 LOG(kDefLog, kWarn, "Tried to free tensor that was not in pool");
618 }
619}
620
624inline TensorPool::~TensorPool() {
625 // Need to get keys in a separate iteration, otherwise iterator is getting
626 // invalidated during erase.
627 std::vector<WGPUBuffer> keys;
628 for (auto &pair : data) {
629 keys.push_back(pair.first);
630 }
631 for (auto &key : keys) {
632 FreeTensor(*this, data[key]);
633 LOG(kDefLog, kTrace, "Freed tensor");
634 }
635}
636
646inline void check(bool condition, const char *message,
647 const char *file = "unknown", int line = -1) {
648 if (!condition) {
649 LOG(kDefLog, kError, "Error in file %s line %d:\n%s", file, line, message);
650 exit(1);
651 } else {
652 LOG(kDefLog, kTrace, "Success in file %s line %d:\n%s", file, line,
653 message);
654 }
655}
656
678inline Context createContext(const WGPUInstanceDescriptor &desc = {},
679 const WGPURequestAdapterOptions &adapterOpts = {},
680 const WGPUDeviceDescriptor &devDescriptor = {}) {
681 Context context;
682 {
683#ifndef __EMSCRIPTEN__
684 context.instance = wgpuCreateInstance(&desc);
685#else
686 // Emscripten does not support the instance descriptor
687 // and throws an assertion error if it is not nullptr.
688 context.instance = wgpuCreateInstance(nullptr);
689#endif
690 check(context.instance, "Initialize WebGPU", __FILE__, __LINE__);
691 }
692 LOG(kDefLog, kInfo, "Requesting adapter");
693 {
694 struct AdapterData {
695 WGPUAdapter adapter = nullptr;
696 bool requestEnded = false;
697 };
698 AdapterData adapterData;
699 auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
700 WGPUAdapter adapter, char const *message,
701 void *pUserData) {
702 AdapterData &adapterData = *reinterpret_cast<AdapterData *>(pUserData);
703 check(status == WGPURequestAdapterStatus_Success,
704 "Request WebGPU adapter", __FILE__, __LINE__);
705 adapterData.adapter = adapter;
706 adapterData.requestEnded = true;
707 };
708 wgpuInstanceRequestAdapter(context.instance, &adapterOpts,
709 onAdapterRequestEnded, (void *)&adapterData);
710 while (!adapterData.requestEnded) {
711 processEvents(context.instance);
712 }
713 assert(adapterData.requestEnded);
714 context.adapter = adapterData.adapter;
715 }
716 LOG(kDefLog, kInfo, "Requesting device");
717 {
718 struct DeviceData {
719 WGPUDevice device = nullptr;
720 bool requestEnded = false;
721 };
722 DeviceData devData;
723 auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status,
724 WGPUDevice device, char const *message,
725 void *pUserData) {
726 DeviceData &devData = *reinterpret_cast<DeviceData *>(pUserData);
727 check(status == WGPURequestDeviceStatus_Success,
728 "Could not get WebGPU device.", __FILE__, __LINE__);
729 LOG(kDefLog, kTrace, "Device Request succeeded %x",
730 static_cast<void *>(device));
731 devData.device = device;
732 devData.requestEnded = true;
733 };
734#ifdef WEBGPU_BACKEND_DAWN
735 devDescriptor.deviceLostCallbackInfo = {
736 .callback =
737 [](WGPUDevice const *device, WGPUDeviceLostReason reason,
738 char const *message, void *userdata) {
739 if (reason != WGPUDeviceLostReason_Destroyed) {
740 LOG(kDefLog, kError, "Device lost (code %d):\n%s", reason,
741 message);
742 } else {
743 LOG(kDefLog, kInfo, "Device destroyed: %s", message);
744 }
745 },
746 };
747#endif
748 LOG(kDefLog, kInfo, "Requesting device");
749 wgpuAdapterRequestDevice(context.adapter, &devDescriptor,
750 onDeviceRequestEnded, (void *)&devData);
751 LOG(kDefLog, kInfo, "Waiting for device request to end");
752 while (!devData.requestEnded) {
753 processEvents(context.instance);
754 }
755 LOG(kDefLog, kInfo, "Device request ended");
756 assert(devData.requestEnded);
757 context.device = devData.device;
758 wgpuDeviceSetUncapturedErrorCallback(
759 context.device,
760 [](WGPUErrorType type, char const *message, void *devData) {
761 LOG(kDefLog, kError, "Device uncaptured error: %s", message);
762 throw std::runtime_error("Device uncaptured exception.");
763 },
764 nullptr);
765 }
766 context.queue = wgpuDeviceGetQueue(context.device);
767 return context;
768}
769
770inline void wait(Context &ctx, std::future<void> &future) {
771 while (future.wait_for(std::chrono::seconds(0)) !=
772 std::future_status::ready) {
773 processEvents(ctx.instance);
774 }
775}
776
789inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
790 CopyData &op) {
791 wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
792 CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise,
793 &op.future};
794 wgpuQueueOnSubmittedWorkDone(
795 ctx.queue,
796 [](WGPUQueueWorkDoneStatus status, void *callbackData) {
797 check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
798 __FILE__, __LINE__);
799 const auto *data = static_cast<CallbackData *>(callbackData);
800 wgpuBufferMapAsync(
801 data->buffer, WGPUMapMode_Read, 0, data->bufferSize,
802 [](WGPUBufferMapAsyncStatus status, void *captureData) {
803 const auto *data = static_cast<CallbackData *>(captureData);
804 check(status == WGPUBufferMapAsyncStatus_Success,
805 "Map readbackBuffer", __FILE__, __LINE__);
806 const void *mappedData = wgpuBufferGetConstMappedRange(
807 data->buffer, /*offset=*/0, data->bufferSize);
808 check(mappedData, "Get mapped range", __FILE__, __LINE__);
809 memcpy(data->output, mappedData, data->bufferSize);
810 wgpuBufferUnmap(data->buffer);
811 data->promise->set_value();
812 },
813 callbackData);
814 },
815 &callbackData);
816 wait(ctx, op.future);
817}
818
834inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
835 CopyData op;
836 op.future = op.promise.get_future();
837 {
838 WGPUBufferDescriptor readbackBufferDescriptor = {
839 .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
840 .size = bufferSize,
841 };
842 op.readbackBuffer =
843 wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
844 }
845 {
846 WGPUCommandEncoder commandEncoder;
847 WGPUComputePassEncoder computePassEncoder;
848 commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
849 wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
850 op.readbackBuffer, 0, bufferSize);
851 op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
852 check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
853 }
854 toCPU(ctx, tensor, data, bufferSize, op);
855}
856
868template <size_t N>
869void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
870 toCPU(ctx, tensor, data.data(), sizeof(data));
871}
872
888inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer,
889 size_t size) {
890 wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size);
891}
892
904inline void toGPU(Context &ctx, const float *data, Tensor &tensor) {
905 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
906 tensor.data.size);
907}
908
909inline void toGPU(Context &ctx, const half *data, Tensor &tensor) {
910 wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
911 tensor.data.size);
912}
913
914template <typename Params>
915inline void toGPU(Context &ctx, Params &params, Kernel &op) {
916 // TODO(avh): Maintain params metadata in Kernel and check for consistency.
917 // If a kernel does not have parameters this will quietly overwrite
918 // the last buffer in the bind group with the parameters buffer.
919 if (op.numBindings > 0) {
920 wgpuQueueWriteBuffer(ctx.queue, op.buffers[op.numBindings - 1], 0,
921 static_cast<void *>(&params), sizeof(params));
922 }
923}
924
937inline void resetCommandBuffer(WGPUDevice &device, Kernel &op) {
938 {
939 WGPUCommandEncoder commandEncoder =
940 wgpuDeviceCreateCommandEncoder(device, nullptr);
941 WGPUComputePassEncoder computePassEncoder =
942 wgpuCommandEncoderBeginComputePass(commandEncoder, nullptr);
943 wgpuComputePassEncoderSetPipeline(computePassEncoder, op.computePipeline);
944 wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, op.bindGroup, 0,
945 nullptr);
946 wgpuComputePassEncoderDispatchWorkgroups(
947 computePassEncoder, op.nWorkgroups[0], op.nWorkgroups[1],
948 op.nWorkgroups[2]);
949 wgpuComputePassEncoderEnd(computePassEncoder);
950 op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
951 }
952}
953
958struct NoParam {};
959template <typename T> constexpr bool IsNoParam = std::is_same_v<T, NoParam>;
960
964inline size_t cdiv(size_t n, size_t d) { return (n + d - 1) / d; }
965
970inline Shape cdiv(Shape total, Shape group) {
971 assert(total.rank == group.rank);
972 Shape result;
973 result.rank = total.rank;
974 for (size_t dim = 0; dim < total.rank; ++dim) {
975 result[dim] = cdiv(total[dim], group[dim]);
976 }
977 return result;
978}
979
1007inline Kernel createKernel(Context &ctx, const KernelCode &code,
1008 const Tensor *dataBindings, size_t numTensors,
1009 const size_t *viewOffsets, const Shape &nWorkgroups,
1010 const void *params = nullptr,
1011 size_t paramsSize = 0) {
1012 assert(nWorkgroups.rank == 3);
1013 WGPUDevice device = ctx.device;
1014 WGPUQueue queue = ctx.queue;
1015 Kernel op;
1016 // paramIndex is the index into bgLayoutEntries for the parameters buffer. If
1017 // there are no parameters for the kernel, paramsSize == 0 and paramIndex is
1018 // effectively undefined (== -1)
1019 size_t paramIndex = -1;
1020 // Note: paramIndex is undefined unless paramsSize > 0
1021 size_t numBindings = numTensors;
1022 if (paramsSize > 0) {
1023 numBindings++; // parameters buffer
1024 paramIndex = numBindings - 1; // index of the parameters buffer within
1025 // op.buffers, op.bufferSizes and
1026 // bgLayoutEntries
1027 }
1028 op.buffers = std::make_unique<WGPUBuffer[]>(numBindings);
1029 op.bufferSizes = std::make_unique<size_t[]>(numBindings);
1030 op.numBindings = numBindings;
1031 std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
1032 // Create layout entries for input buffers
1033 for (size_t i = 0; i < numTensors; ++i) {
1034 bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{
1035 .binding = static_cast<uint32_t>(i),
1036 .visibility = WGPUShaderStage_Compute,
1037 .buffer =
1038 WGPUBufferBindingLayout{
1039 .type = WGPUBufferBindingType_Storage,
1040 .minBindingSize = dataBindings[i].data.size,
1041 },
1042 };
1043 }
1044 if (paramsSize > 0) {
1045 LOG(kDefLog, kInfo, "Create layout entry for the params buffer");
1046 // Create layout entry for the params buffer
1047 bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{
1048 .binding = static_cast<uint32_t>(paramIndex),
1049 .visibility = WGPUShaderStage_Compute,
1050 .buffer =
1051 WGPUBufferBindingLayout{
1052 .type = WGPUBufferBindingType_Uniform,
1053 .minBindingSize = paramsSize,
1054 },
1055 };
1056 }
1057 WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
1058 .entryCount = static_cast<uint32_t>(bgLayoutEntries.size()),
1059 .entries = bgLayoutEntries.data(),
1060 };
1061 WGPUBindGroupLayout bgLayout =
1062 wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
1063 for (size_t i = 0; i < numTensors; ++i) {
1064 op.buffers[i] = dataBindings[i].data.buffer;
1065 op.bufferSizes[i] = dataBindings[i].data.size;
1066 }
1067 // Create a buffer for the Params struct
1068 if (paramsSize > 0) {
1069 WGPUBufferDescriptor paramsBufferDesc = {
1070 .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
1071 .size = paramsSize,
1072 .mappedAtCreation = false,
1073 };
1074 op.buffers[paramIndex] = wgpuDeviceCreateBuffer(device, &paramsBufferDesc);
1075 op.bufferSizes[paramIndex] = paramsSize;
1076 wgpuQueueWriteBuffer(queue, op.buffers[paramIndex], 0, params, paramsSize);
1077 LOG(kDefLog, kTrace, "Params buffer written");
1078 } else {
1079 LOG(kDefLog, kTrace, "No params buffer needed");
1080 }
1081 std::vector<WGPUBindGroupEntry> bindGroupEntries(numBindings);
1082 for (size_t i = 0; i < numTensors; ++i) {
1083 bindGroupEntries[i] = WGPUBindGroupEntry{
1084 .binding = static_cast<uint32_t>(i),
1085 .buffer = op.buffers[i],
1086 .offset = viewOffsets[i],
1087 .size = op.bufferSizes[i],
1088 };
1089 }
1090 if (paramsSize > 0) {
1091 LOG(kDefLog, kInfo, "Create bind group entry for the params buffer");
1092 LOG(kDefLog, kInfo, "paramIndex: %d", paramIndex);
1093 bindGroupEntries[paramIndex] = WGPUBindGroupEntry{
1094 .binding = static_cast<uint32_t>(paramIndex),
1095 .buffer = op.buffers[paramIndex],
1096 .offset = 0,
1097 .size = paramsSize,
1098 };
1099 }
1100 LOG(kDefLog, kTrace, "BG Entries Size: %d", numBindings);
1101 WGPUBindGroupDescriptor bindGroupDesc = {
1102 .layout = bgLayout,
1103 .entryCount = static_cast<uint32_t>(numBindings),
1104 .entries = bindGroupEntries.data(),
1105 };
1106 op.bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
1107 {
1108 WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
1109 .bindGroupLayoutCount = 1,
1110 .bindGroupLayouts = &bgLayout,
1111 };
1112 WGPUPipelineLayout pipelineLayout =
1113 wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
1114 WGPUShaderModuleWGSLDescriptor wgslDesc = {
1115 .code = code.data.c_str(),
1116 };
1117 wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
1118 WGPUShaderModuleDescriptor shaderModuleDesc = {};
1119 shaderModuleDesc.nextInChain = &wgslDesc.chain;
1120 shaderModuleDesc.label = code.label.c_str();
1121 WGPUComputePipelineDescriptor computePipelineDesc = {};
1122 computePipelineDesc.layout = pipelineLayout;
1123 computePipelineDesc.compute.module =
1124 wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
1125 computePipelineDesc.compute.entryPoint = code.entryPoint.c_str();
1126 computePipelineDesc.label = code.label.c_str();
1127 op.computePipeline =
1128 wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
1129 }
1130 /*
1131 op.nWorkgroups = {cdiv(nThreads[0], code.workgroupSize[0]),
1132 cdiv(nThreads[1], code.workgroupSize[1]),
1133 cdiv(nThreads[2], code.workgroupSize[2])};
1134 */
1135 op.nWorkgroups = {nWorkgroups[0], nWorkgroups[1], nWorkgroups[2]};
1136 resetCommandBuffer(device, op);
1137 ctx.kernelPool.data.insert(&op);
1138 return op;
1139}
1140
1162template <typename ParamsType = NoParam, size_t numInputs>
1163Kernel createKernel(Context &ctx, const KernelCode &code,
1164 const Bindings<numInputs> &dataBindings,
1165 const Shape &nWorkgroups,
1166 const ParamsType &params = ParamsType{}) {
1167 if constexpr (!IsNoParam<ParamsType>) {
1168 // LOG(kDefLog, kTrace, "Using params of size %d bytes",
1169 // sizeof(ParamsType));
1170 return createKernel(ctx, code, dataBindings.data.data(), numInputs,
1171 dataBindings.viewOffsets.data(), nWorkgroups,
1172 reinterpret_cast<const void *>(&params),
1173 sizeof(ParamsType));
1174 } else {
1175 // LOG(kDefLog, kTrace , "No params");
1176 return createKernel(ctx, code, dataBindings.data.data(), numInputs,
1177 dataBindings.viewOffsets.data(), nWorkgroups, nullptr,
1178 0);
1179 }
1180}
1181
1200inline void dispatchKernel(Context &ctx, Kernel &kernel,
1201 std::promise<void> &promise) {
1202 // Submit the command buffer
1203 wgpuQueueSubmit(ctx.queue, 1, &kernel.commandBuffer);
1204 wgpuQueueOnSubmittedWorkDone(
1205 ctx.queue,
1206 [](WGPUQueueWorkDoneStatus status, void *data) {
1207 check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
1208 __FILE__, __LINE__);
1209 auto *promise = static_cast<std::promise<void> *>(data);
1210 promise->set_value();
1211 },
1212 &promise);
1213}
1214
1215} // namespace gpu
1216
1217#endif // GPU_H
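
As a quick orientation before the member documentation below, here is a minimal end-to-end sketch of how these pieces compose. The WGSL source, sizes, and variable names are illustrative only (not taken from the library's shipped examples), and the header is assumed to be on the include path.

#include <array>
#include <cstdio>
#include "gpu.h"

using namespace gpu;

// Illustrative WGSL: adds 1.0 to each element; {{precision}} and
// {{workgroupSize}} are the placeholders that KernelCode substitutes.
static const char *kAddOne = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&inp)) {
    out[gid.x] = inp[gid.x] + 1.0;
  }
}
)";

int main() {
  constexpr size_t N = 1024;
  Context ctx = createContext();
  std::array<float, N> inputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i);
  }
  Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
  Tensor output = createTensor(ctx, Shape{N}, kf32);
  Kernel op = createKernel(ctx, KernelCode{kAddOne, 256, kf32},
                           Bindings{input, output},
                           {cdiv(N, 256), 1, 1}); // nWorkgroups
  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  std::array<float, N> outputArr;
  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  printf("out[0] = %.1f, out[%zu] = %.1f\n", outputArr[0], N - 1,
         outputArr[N - 1]);
  return 0;
}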
gpu
Definition gpu.h:27
static Logger kDefLog
Default logger for logging messages to stdout at the info level. Output stream and logging level for ...
Definition logging.h:64
void FreeTensor(TensorPool &pool, Tensor tensor)
Frees a tensor resource and updates the tensor pool.
Definition gpu.h:608
void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize, CopyData &op)
Copies data from a GPU buffer to CPU memory.
Definition gpu.h:789
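For example (a sketch, assuming a Context ctx and a kf32 Tensor out holding 256 floats created elsewhere):

std::array<float, 256> host;
toCPU(ctx, out, host.data(), sizeof(host)); // explicit byte count
toCPU(ctx, out, host);                      // std::array overload (gpu.h:869)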
void check(bool condition, const char *message, const char *file="unknown", int line=-1)
Checks a condition and logs an error message if the condition is false. In debug mode,...
Definition gpu.h:646
Tensor createTensor(TensorPool &pool, WGPUDevice &device, const Shape &shape, NumType dtype, WGPUBufferUsageFlags usage=WGPUBufferUsage_Storage|WGPUBufferUsage_CopyDst|WGPUBufferUsage_CopySrc)
Tensor factory function to create a tensor (a Tensor type is simply an Array with an N-dimensional Sh...
Definition gpu.h:491
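A sketch of the common overloads, assuming a Context ctx from createContext(); the shape and values are illustrative:

std::array<float, 12> init = {1.0f, 2.0f, 3.0f}; // remaining elements zero
Tensor a = createTensor(ctx, Shape{3, 4}, kf32);              // uninitialized
Tensor b = createTensor(ctx, Shape{3, 4}, kf32, init.data()); // copies init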
void LOG(Logger &logger, int level, const char *message,...)
Log a message to the logger. If NDEBUG is defined in a source or as a compiler flag,...
Definition logging.h:34
void processEvents(const WGPUInstance &instance)
Definition gpu.h:419
Kernel createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, size_t numTensors, const size_t *viewOffsets, const Shape &nWorkgroups, const void *params=nullptr, size_t paramsSize=0)
A factory function to create a kernel on the GPU. The kernel is created with the given WGSL code,...
Definition gpu.h:1007
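In application code the Bindings-based overload (gpu.h:1162) is usually more convenient than this raw-pointer form. A sketch, where kShader, a, b, and ctx are assumed to exist and are not part of this header:

struct Params {
  uint32_t n; // element count, read from the uniform params buffer in WGSL
};
size_t n = size(a.shape);
Kernel op = createKernel(ctx, KernelCode{kShader, 256, kf32},
                         Bindings{a, b},
                         {cdiv(n, 256), 1, 1},              // nWorkgroups
                         Params{static_cast<uint32_t>(n)}); // optional params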
std::string toString(NumType type)
Converts NumType to string.
Definition gpu.h:206
NumType
Definition gpu.h:183
@ kf16
Definition gpu.h:184
@ kf32
Definition gpu.h:185
Bindings(std::array< Tensor, N >) -> Bindings< N >
Deduction guide for Bindings.
@ kError
Definition logging.h:9
@ kWarn
Definition logging.h:9
@ kTrace
Definition logging.h:9
@ kInfo
Definition logging.h:9
void replaceAll(std::string &str, const std::string &from, const std::string &to)
simple in-place string replacement helper function for substituting placeholders in a WGSL string tem...
Definition gpu.h:256
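A sketch of direct use; KernelCode's constructors call this internally for the {{workgroupSize}} and {{precision}} placeholders:

std::string wgsl =
    "@compute @workgroup_size({{workgroupSize}}) fn main() {}";
replaceAll(wgsl, "{{workgroupSize}}", "256, 1, 1");
// wgsl == "@compute @workgroup_size(256, 1, 1) fn main() {}"
// The overload at gpu.h:346 applies several substitutions in one call:
replaceAll(wgsl, {{"{{precision}}", "f32"}, {"{{workgroupSize}}", "256, 1, 1"}});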
constexpr bool IsNoParam
Definition gpu.h:959
bool operator<(const Kernel &lhs, const Kernel &rhs)
Operator implementation to make the Kernel type hashable.
Definition gpu.h:398
void resetCommandBuffer(WGPUDevice &device, Kernel &op)
Resets the command buffer in preparation for a kernel dispatch. Since command buffers are consumed up...
Definition gpu.h:937
Context createContext(const WGPUInstanceDescriptor &desc={}, const WGPURequestAdapterOptions &adapterOpts={}, const WGPUDeviceDescriptor &devDescriptor={})
Factory function to create a GPU context, which aggregates WebGPU API handles to interact with the GP...
Definition gpu.h:678
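A sketch: the zero-argument form is the common case; the option structs follow webgpu.h, so, for instance, a high-performance adapter can be requested (illustrative; behavior depends on the backend):

Context ctx = createContext(); // defaults for instance, adapter, and device

WGPURequestAdapterOptions adapterOpts = {
    .powerPreference = WGPUPowerPreference_HighPerformance,
};
Context ctxHP = createContext({}, adapterOpts);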
void toGPU(Context &ctx, const void *data, WGPUBuffer buffer, size_t size)
Copies data from CPU memory to a GPU buffer. The toGPU overloads are effectively a convenience wrappe...
Definition gpu.h:888
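A sketch, assuming ctx and a kf32 Tensor input of 1024 elements; typically used to refresh inputs between dispatches:

std::vector<float> host(1024, 1.0f);
toGPU(ctx, host.data(), input);                // Tensor overload (gpu.h:904)
toGPU(ctx, host.data(), input.data.buffer,     // raw WGPUBuffer overload
      host.size() * sizeof(float));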
size_t size(const Shape &shape)
Returns the number of elements in a tensor with the given shape, which is equal to the product of the...
Definition gpu.h:80
void dispatchKernel(Context &ctx, Kernel &kernel, std::promise< void > &promise)
Asynchronously submits a kernel to the GPU queue for execution. It also sets up a callback to notify ...
Definition gpu.h:1200
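A sketch of the dispatch-wait loop, assuming ctx and a Kernel op from createKernel; a fresh promise/future pair is needed per dispatch, and the command buffer must be reset before re-dispatching:

for (int step = 0; step < 10; ++step) {
  std::promise<void> promise;
  std::future<void> future = promise.get_future();
  dispatchKernel(ctx, op, promise);
  wait(ctx, future);
  resetCommandBuffer(ctx.device, op); // command buffers are consumed on submit
}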
void wait(Context &ctx, std::future< void > &future)
Definition gpu.h:770
size_t cdiv(size_t n, size_t d)
Ceiling division.
Definition gpu.h:964
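For example, covering 1,000,000 threads with 256-thread workgroups:

size_t n = 1000000;
size_t nWorkgroups = cdiv(n, 256);                   // 3907
Shape grid = cdiv(Shape{n, 1, 1}, Shape{256, 1, 1}); // {3907, 1, 1}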
size_t sizeBytes(const NumType &type)
Returns the number of bytes of a number type.
Definition gpu.h:191
Represents a buffer of values on the GPU.
Definition gpu.h:32
size_t size
Definition gpu.h:35
WGPUBuffer buffer
Definition gpu.h:33
WGPUBufferUsageFlags usage
Definition gpu.h:34
Represents an ordered collection of WGPUBuffers (wrapped as tensors, non-overlapping views,...
Definition gpu.h:124
const Tensor & operator[](std::size_t index) const
Definition gpu.h:155
Tensor & operator[](std::size_t index)
Definition gpu.h:154
Bindings(const std::initializer_list< Tensor > &init)
Definition gpu.h:128
Bindings(const std::initializer_list< Array > &init)
Definition gpu.h:146
std::array< size_t, N > viewSpans
Definition gpu.h:127
std::array< size_t, N > viewOffsets
Definition gpu.h:126
std::array< Tensor, N > data
Definition gpu.h:125
Bindings(const std::initializer_list< TensorView > &init)
Definition gpu.h:136
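A sketch, assuming kf32 tensors a and b created elsewhere; the deduction guides let the bindings be written inline without spelling out N:

auto whole = Bindings{a, b};                  // Bindings<2>, full buffers
auto views = Bindings{TensorView{a, 0, 256},  // first 256 bytes of a
                      TensorView{b, 0, b.data.size}};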
Used for on-done callback data for asynchronous operations such as kernel launching.
Definition gpu.h:357
std::future< void > * future
Definition gpu.h:363
std::promise< void > * promise
Definition gpu.h:362
void * output
Definition gpu.h:360
WGPUBuffer buffer
Definition gpu.h:358
size_t bufferSize
Definition gpu.h:359
Represents a GPU context, aggregates WebGPU API handles to interact with the GPU including the instan...
Definition gpu.h:434
TensorPool pool
Definition gpu.h:439
WGPUQueue queue
Definition gpu.h:438
WGPUInstance instance
Definition gpu.h:435
KernelPool kernelPool
Definition gpu.h:440
WGPUAdapter adapter
Definition gpu.h:436
~Context()
Definition gpu.h:441
WGPUDevice device
Definition gpu.h:437
Staging buffer and callback data for copying data between the GPU and CPU.
Definition gpu.h:370
WGPUCommandBuffer commandBuffer
Definition gpu.h:371
WGPUBuffer readbackBuffer
Definition gpu.h:372
std::promise< void > promise
Definition gpu.h:373
std::future< void > future
Definition gpu.h:374
KernelCode is the representation of WGSL GPU code with template substitutions applied....
Definition gpu.h:272
Shape workgroupSize
Definition gpu.h:326
KernelCode(const std::string &pData="", size_t workgroupSize=256, NumType precision=kf32)
Constructor to create a code object from a template string and optional workgroup size and precision.
Definition gpu.h:289
std::string entryPoint
Definition gpu.h:329
std::string data
Definition gpu.h:325
KernelCode(const std::string &pData, const Shape &workgroupSize={256, 1, 1}, NumType precision=kf32)
Overload of the constructor to create a code object from a template string and workgroup size....
Definition gpu.h:317
std::string label
Definition gpu.h:328
NumType precision
Definition gpu.h:327
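A sketch of a WGSL template using both placeholders; the scalar-workgroup-size constructor expands 256 to (256, 1, 1). The shader itself is illustrative:

static const char *kScale = R"(
@group(0) @binding(0) var<storage, read_write> x: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&x)) {
    x[gid.x] = x[gid.x] * 2.0;
  }
}
)";
KernelCode code{kScale, 256, kf32}; // yields @workgroup_size(256, 1, 1), f32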
Represents handles + metadata for a reusable kernel on the GPU. The struct members can be divided int...
Definition gpu.h:382
std::unique_ptr< size_t[]> bufferSizes
Definition gpu.h:384
size_t numBindings
Definition gpu.h:385
WGPUCommandBuffer commandBuffer
Definition gpu.h:389
WGPUComputePipeline computePipeline
Definition gpu.h:388
WGPUBindGroup bindGroup
Definition gpu.h:387
Shape nWorkgroups
Definition gpu.h:386
std::unique_ptr< WGPUBuffer[]> buffers
Definition gpu.h:383
A pool of kernels to manage GPU resources. For simple use cases this is instantiated as a member in t...
Definition gpu.h:407
Context * ctx
Definition gpu.h:409
std::set< Kernel * > data
Definition gpu.h:410
KernelPool(Context *ctx)
Definition gpu.h:408
NoParam is a no-op type used to indicate that a kernel does not have any parameters.
Definition gpu.h:958
Represents the shape of a tensor.
Definition gpu.h:49
Shape()=default
size_t rank
Definition gpu.h:53
const size_t & operator[](size_t index) const
Definition gpu.h:64
std::array< size_t, kMaxRank > data
Definition gpu.h:52
Shape(std::initializer_list< size_t > dims)
Definition gpu.h:55
size_t & operator[](size_t index)
Definition gpu.h:60
static constexpr size_t kMaxRank
Definition gpu.h:50
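For example:

Shape s{2, 3, 4};   // rank 3
size_t n = size(s); // 24 elements
size_t rows = s[0]; // 2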
Represents a tensor on the GPU, which is a buffer of values with a shape.
Definition gpu.h:96
Shape shape
Definition gpu.h:98
Array data
Definition gpu.h:97
Represents a pool of tensors to manage GPU resources. The pool is responsible for managing the lifeti...
Definition gpu.h:176
Context * ctx
Definition gpu.h:178
TensorPool(Context *ctx)
Definition gpu.h:177
std::unordered_map< WGPUBuffer, Tensor > data
Definition gpu.h:179
~TensorPool()
Destructor for TensorPool which frees all tensors in the pool.
Definition gpu.h:624
Represents a non-owning view into a tensor specifying an offset and a subspan. This is useful for spe...
Definition gpu.h:110
size_t span
Definition gpu.h:113
size_t offset
Definition gpu.h:112
Tensor data
Definition gpu.h:111
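A sketch, assuming a kf32 Tensor a of 1024 elements; offset and span are in bytes:

TensorView upper{a, /*offset=*/512 * sizeof(float), /*span=*/512 * sizeof(float)};
auto bindings = Bindings{upper}; // Bindings<1> over the second half of a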