gpu.cpp 0.1.0
 
gpu.h File Reference
#include <array>
#include <cassert>
#include <cstring>
#include <future>
#include <initializer_list>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
#include "webgpu/webgpu.h"
#include "numeric_types/half.h"
#include "utils/logging.h"

Go to the source code of this file.

Classes

struct  gpu::Array
 Represents a buffer of values on the GPU. More...
 
struct  gpu::Shape
 Represents the shape of a tensor. More...
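
A minimal sketch of constructing a Shape and querying its element count with gpu::size() (see Functions below), assuming Shape can be brace-initialized from a list of dimensions:

    #include "gpu.h"
    using namespace gpu;

    Shape shape{256, 256};     // a 2-dimensional shape
    size_t n = size(shape);    // 256 * 256 = 65536 elements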
 
struct  gpu::Tensor
 Represents a tensor on the GPU, which is a buffer of values with a shape. More...
 
struct  gpu::TensorView
 Represents a non-owning view into a tensor specifying an offset and a subspan. This is useful for specifying a slice of a tensor on the GPU without copying the data. More...
 
struct  gpu::Bindings< N >
 Represents an ordered collection of WGPUBuffers (wrapped as tensors, non-overlapping views, or arrays) for the purpose of binding them to a kernel operation to make them accessible to the GPU kernel. More...
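
A sketch of grouping buffers for a kernel, where input, weights, and output are hypothetical Tensor instances created with createTensor() (see Functions below); the deduction guides let the binding count be inferred:

    Bindings bindings{input, weights, output};   // deduces Bindings<3>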
 
struct  gpu::TensorPool
 Represents a pool of tensors to manage GPU resources. The pool is responsible for managing the lifetime of the tensors and freeing them when the pool is destroyed. More...
 
struct  gpu::KernelCode
 KernelCode is the representation of WGSL GPU code with template substitutions applied. It is a type around the code string with additional metadata for workgroup size and precision since they are specified in the WGSL code. Additionally, label and entryPoint are used by createKernel() to specify the label and entry point of the kernel. More...
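
A hedged sketch of a KernelCode, assuming it can be brace-initialized from the WGSL source, a workgroup size, and a precision, and that {{workgroupSize}} and {{precision}} are the template substitutions referred to above:

    static const char *kCopy = R"(
    @group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
    @group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
    @compute @workgroup_size({{workgroupSize}})
    fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
      if (gid.x < arrayLength(&inp)) {
        out[gid.x] = inp[gid.x];
      }
    }
    )";

    KernelCode code{kCopy, /* workgroup size */ 256, kf32};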
 
struct  gpu::CallbackData
 Used for on-done callback data for asynchronous operations such as kernel launching. More...
 
struct  gpu::CopyData
 Staging buffer and callback data for copying data between the GPU and CPU. More...
 
struct  gpu::Kernel
 Represents handles + metadata for a reusable kernel on the GPU. The struct members can be divided into "consumed upon dispatch" (commandBuffer) and reusable ahead-of-time setup (all other members). More...
 
struct  gpu::KernelPool
 A pool of kernels to manage GPU resources. For simple use cases this is instantiated as a member in the Context struct although it's possible to have multiple resource pools of kernels in more complex scenarios. More...
 
struct  gpu::Context
 Represents a GPU context, aggregates WebGPU API handles to interact with the GPU including the instance, adapter, device, and queue. More...
 
struct  gpu::NoParam
 NoParam is a no-op type used to indicate that a kernel does not have any parameters. More...
 

Namespaces

namespace  gpu
 

Enumerations

enum  gpu::NumType { gpu::kf16 , gpu::kf32 }
 

Functions

size_t gpu::size (const Shape &shape)
 Returns the number of elements in a tensor with the given shape, which is equal to the product of the dimensions.
 
template<std::size_t N>
 gpu::Bindings (std::array< Tensor, N >) -> Bindings< N >
 Deduction guide for Bindings.
 
template<typename... Args>
 gpu::Bindings (Args...) -> Bindings< sizeof...(Args)>
 
size_t gpu::sizeBytes (const NumType &type)
 Returns the number of bytes of a number type.
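
For example, a half-precision value occupies two bytes and a single-precision value four:

    assert(sizeBytes(kf16) == 2);
    assert(sizeBytes(kf32) == 4);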
 
std::string gpu::toString (NumType type)
 Converts NumType to string.
 
std::string gpu::toString (const Shape &shape)
 Converts Shape to string. The string formatting is meant to be slotted into WGSL code (hence no additional parentheses or brackets).
 
std::string gpu::toString (size_t value)
 Converts size_t to string. Wraps std::to_string for consistency, instead of having to remember to switch between std::to_string and toString depending on the type.
 
void gpu::replaceAll (std::string &str, const std::string &from, const std::string &to)
 Simple in-place string replacement helper function for substituting placeholders in a WGSL string template.
 
void gpu::replaceAll (std::string &str, const std::vector< std::pair< std::string, std::string > > &reps)
 Overload of the string replacement helper function to replace multiple substrings in a string with multiple replacements.
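
A sketch of substituting placeholders in a WGSL template; the placeholder names are illustrative (createKernel() applies its own substitutions to KernelCode):

    std::string wgsl =
        "@group(0) @binding(0) var<storage> x: array<{{precision}}>;\n"
        "@compute @workgroup_size({{workgroupSize}}) fn main() {}";
    replaceAll(wgsl, "{{precision}}", "f32");           // single substitution
    replaceAll(wgsl, {{"{{workgroupSize}}", "256"}});   // batched overload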
 
bool gpu::operator< (const Kernel &lhs, const Kernel &rhs)
 Less-than comparison operator so that Kernel instances can be ordered and stored in sorted containers such as std::set.
 
void gpu::processEvents (const WGPUInstance &instance)
 
Tensor gpu::createTensor (TensorPool &pool, WGPUDevice &device, const Shape &shape, NumType dtype, WGPUBufferUsageFlags usage=WGPUBufferUsage_Storage|WGPUBufferUsage_CopyDst|WGPUBufferUsage_CopySrc)
 Tensor factory function to create a tensor (a Tensor type is simply an Array with an N-dimensional Shape specification) on the GPU. The tensor is created with the given shape, data type, and usage flags, added to the TensorPool, and returned.
 
Tensor gpu::createTensor (Context &ctx, const Shape &shape, NumType dtype)
 Overload of the tensor factory function to instantiate a tensor on the GPU with a given shape and data type.
 
Tensor gpu::createTensor (Context &ctx, const Shape &shape, NumType dtype, float *data)
 Overload of the tensor factory function to instantiate a tensor on the GPU with a given shape and data type. This overload also takes initial float* data to populate the tensor with.
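
A sketch of creating an f32 tensor initialized from host data, where ctx is a Context obtained from createContext() (see below):

    std::array<float, 12> host;
    host.fill(1.0f);
    Tensor x = createTensor(ctx, Shape{3, 4}, kf32, host.data());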
 
Tensor gpu::createTensor (Context &ctx, const Shape &shape, NumType dtype, half *data)
 Overload of the tensor factory function to instantiate a tensor on the GPU with a given shape and data type. This overload also takes initial half* data to populate the tensor with.
 
void gpu::FreeTensor (TensorPool &pool, Tensor tensor)
 Frees a tensor resource and updates the tensor pool.
 
void gpu::check (bool condition, const char *message, const char *file="unkown", int line=-1)
 Checks a condition and logs an error message if the condition is false. In debug mode, it will also exit the program with an error code.
 
Context gpu::createContext (const WGPUInstanceDescriptor &desc={}, const WGPURequestAdapterOptions &adapterOpts={}, const WGPUDeviceDescriptor &devDescriptor={})
 Factory function to create a GPU context, which aggregates WebGPU API handles to interact with the GPU including the instance, adapter, device, and queue.
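
Typical usage relies on the default descriptors; pass explicit descriptors only when a specific adapter or device configuration is needed:

    Context ctx = createContext();   // default instance, adapter, and device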
 
void gpu::wait (Context &ctx, std::future< void > &future)
 
void gpu::toCPU (Context &ctx, Tensor &tensor, void *data, size_t bufferSize, CopyData &op)
 Copies data from a GPU buffer to CPU memory.
 
void gpu::toCPU (Context &ctx, Tensor &tensor, void *data, size_t bufferSize)
 Overload of the toCPU function that copies data from a GPU buffer to CPU memory, setting up the staging buffer and promise/future for the operation for you.
 
template<size_t N>
void gpu::toCPU (Context &ctx, Tensor &tensor, std::array< float, N > &data)
 Overload of the toCPU function to copy data from a GPU buffer to CPU memory for an array of floats instead of a pointer to a float buffer.
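
A sketch of reading a tensor back to the host, assuming output is a 12-element f32 Tensor; the std::array overload infers the copy size from N:

    std::array<float, 12> hostOut;
    toCPU(ctx, output, hostOut);                          // array overload
    toCPU(ctx, output, hostOut.data(), sizeof(hostOut));  // pointer + byte count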
 
void gpu::toGPU (Context &ctx, const void *data, WGPUBuffer buffer, size_t size)
 Copies data from CPU memory to a GPU buffer. The toGPU overloads are effectively a convenience wrapper around the WebGPU API call wgpuQueueWriteBuffer.
 
void gpu::toGPU (Context &ctx, const float *data, Tensor &tensor)
 Overload of the toGPU function that copies data from CPU memory to the GPU, taking a Tensor instance instead of a WGPUBuffer instance.
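
A sketch of uploading fresh host data into an existing tensor, e.g. between dispatches:

    std::array<float, 12> hostIn;
    hostIn.fill(2.0f);
    toGPU(ctx, hostIn.data(), x);   // x is a Tensor created with createTensor()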
 
void gpu::toGPU (Context &ctx, const half *data, Tensor &tensor)
 
template<typename Params >
void gpu::toGPU (Context &ctx, Params &params, Kernel &op)
 
void gpu::resetCommandBuffer (WGPUDevice &device, Kernel &op)
 Resets the command buffer in preparation for a kernel dispatch. Since command buffers are consumed upon submission, this function is used both in the initial kernel creation and every time the kernel is to be reused for a dispatch.
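
A sketch of the reuse pattern, assuming the Context member holding the WGPUDevice is named device; the command buffer is consumed by each dispatch, so it is reset before the kernel is dispatched again:

    resetCommandBuffer(ctx.device, op);
    std::promise<void> p;
    std::future<void> f = p.get_future();
    dispatchKernel(ctx, op, p);
    wait(ctx, f);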
 
size_t gpu::cdiv (size_t n, size_t d)
 Ceiling division.
 
Shape gpu::cdiv (Shape total, Shape group)
 cdiv applied per dimension of a Shape. Mostly useful for dividing a total thread count by the workgroup size in each dimension to obtain the number of workgroups.
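
For example, cdiv(10, 4) is 3, and the Shape overload applies the same rounding-up division per dimension:

    Shape nWorkgroups = cdiv(Shape{1024, 1024, 1}, Shape{16, 16, 1});   // {64, 64, 1}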
 
Kernel gpu::createKernel (Context &ctx, const KernelCode &code, const Tensor *dataBindings, size_t numTensors, const size_t *viewOffsets, const Shape &nWorkgroups, const void *params=nullptr, size_t paramsSize=0)
 A factory function to create a kernel on the GPU. The kernel is created with the given WGSL code, input tensors, output tensor, and optional parameters.
 
template<typename ParamsType = NoParam, size_t numInputs>
Kernel gpu::createKernel (Context &ctx, const KernelCode &code, const Bindings< numInputs > &dataBindings, const Shape &nWorkgroups, const ParamsType &params=ParamsType{})
 Overload which wraps the createKernel factory function to create a kernel on the GPU. This overload takes a static collection of input tensors instead of a pointer, and a statically determined ParamsType instead of casting params to a void pointer.
 
void gpu::dispatchKernel (Context &ctx, Kernel &kernel, std::promise< void > &promise)
 Asynchronously submits a kernel to the GPU queue for execution. It also sets up a callback to notify when the kernel has finished executing by setting the value of the promise in the kernel instance argument.
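
A hedged end-to-end sketch tying the pieces together, reusing the kCopy KernelCode and the brace-initialization assumptions noted under gpu::KernelCode above:

    Context ctx = createContext();
    constexpr size_t N = 1024;
    std::array<float, N> in, out;
    in.fill(3.0f);

    Tensor input  = createTensor(ctx, Shape{N}, kf32, in.data());
    Tensor output = createTensor(ctx, Shape{N}, kf32);

    Kernel op = createKernel(ctx, {kCopy, 256, kf32},
                             Bindings{input, output},
                             {cdiv(N, 256), 1, 1});

    std::promise<void> promise;
    std::future<void> future = promise.get_future();
    dispatchKernel(ctx, op, promise);
    wait(ctx, future);
    toCPU(ctx, output, out);   // out now holds the copied input values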
 

Variables

template<typename T >
constexpr bool gpu::IsNoParam = std::is_same_v<T, NoParam>
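
A sketch of how the trait is typically used to skip the parameter upload at compile time when a kernel takes no parameters (the helper name is illustrative):

    template <typename ParamsType>
    void maybeUploadParams(Context &ctx, ParamsType &params, Kernel &op) {
      if constexpr (!IsNoParam<ParamsType>) {
        toGPU(ctx, params, op);   // only instantiated for real parameter structs
      }
    }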