// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
|
|
|
|
import { Env, Tensor, TRACE, TRACE_FUNC_BEGIN, TRACE_FUNC_END } from 'onnxruntime-common';
|
|
|
|
import { DataType, tensorDataTypeEnumToString } from '../wasm-common';
|
|
|
|
import { configureLogger, LOG_DEBUG } from './log';
|
|
import { createView, TensorView } from './tensor-view';
|
|
import { createGpuDataManager, downloadGpuData, GpuDataManager } from './webgpu/gpu-data-manager';
|
|
import { RunFunction, WEBGPU_OP_RESOLVE_RULES } from './webgpu/op-resolve-rules';
|
|
import { ProgramManager } from './webgpu/program-manager';
|
|
import {
|
|
AdapterInfo,
|
|
ComputeContext,
|
|
GpuArchitecture,
|
|
GpuData,
|
|
GpuVendor,
|
|
ProgramInfo,
|
|
ProgramInputTensorInfoDependency,
|
|
SessionState,
|
|
TimestampQuery,
|
|
} from './webgpu/types';
|
|
|
|
/**
 * One recorded compute dispatch. Instances are stored per session in `capturedCommandList` and are fed back
 * into a compute pass by `replay()`.
 */
interface CommandInfo {
  /** ID of the kernel associated with this command. */
  readonly kernelId: number;
  /** Pipeline handed to `setPipeline()` when the command is replayed. */
  readonly computePipeline: GPUComputePipeline;
  /** Bind group bound at index 0 when the command is replayed. */
  readonly bindGroup: GPUBindGroup;
  /** Workgroup counts spread into `dispatchWorkgroups()` when the command is replayed. */
  readonly dispatchGroup: [number, number, number];
}
|
|
|
|
/**
 * Everything the backend keeps about a created kernel, keyed by kernel ID in `WebGpuBackend.kernels`.
 */
interface KernelInfo {
  /** Operator type used to look up the implementation in WEBGPU_OP_RESOLVE_RULES. */
  readonly kernelType: string;
  /** Name of the kernel; used in log, profiling and error messages. */
  readonly kernelName: string;
  /** Function invoked by `computeKernel()` to run the kernel. */
  readonly kernelEntry: RunFunction;
  /**
   * `[parser, value]`. While `parser` is defined, `value` is the raw attribute; `computeKernel()` replaces
   * `value` with `parser(value)` on first use and then clears `parser`.
   */
  readonly attributes: [((attribute: unknown) => unknown) | undefined, unknown];
}
|
|
|
|
/**
 * Describes one program run whose timestamp query result has not been reported yet. The fields are what
 * `flush()` passes to the profiling callback (or prints) once the query data has been read back.
 */
interface PendingKernelInfo {
  /** ID of the kernel that was being computed when the program was run. */
  readonly kernelId: number;
  /** Name of the program that was run. */
  readonly programName: string;
  /** Input tensor views of the program run. */
  readonly inputTensorViews: readonly TensorView[];
  /** Output tensor views of the program run. */
  readonly outputTensorViews: readonly TensorView[];
}
|
|
|
|
/**
 * Build the input part of a program cache key.
 *
 * Each input contributes one segment, chosen by its dependency kind:
 * - `'none'`: empty string
 * - `'type'`: `<dataType>`
 * - `'rank'`: `<dataType>;<rank>`
 * - `'dims'`: `<dataType>;<d0>,<d1>,...`
 *
 * Segments are joined with `|`.
 *
 * @param inputTensors the input tensor views of the program.
 * @param inputDependencies one dependency kind per input tensor.
 * @returns the joined key segments.
 * @throws Error if the two arrays differ in length or a dependency kind is not supported.
 */
const getProgramInputTensorInfoDependencyKey = (
  inputTensors: readonly TensorView[],
  inputDependencies: readonly ProgramInputTensorInfoDependency[],
): string => {
  if (inputDependencies.length !== inputTensors.length) {
    throw new Error(
      `inputDependencies length ${inputDependencies.length} is not equal to inputTensors length ${
        inputTensors.length
      }.`,
    );
  }

  return inputTensors
    .map((tensor, index) => {
      const dataType = tensor.dataType;
      const dependency = inputDependencies[index];
      if (dependency === 'none') {
        return '';
      }
      if (dependency === 'type') {
        return `${dataType}`;
      }
      if (dependency === 'rank') {
        return `${dataType};${tensor.dims.length}`;
      }
      if (dependency === 'dims') {
        return `${dataType};${tensor.dims.join(',')}`;
      }
      throw new Error(`unsupported input dependency: ${dependency}`);
    })
    .join('|');
};
|
|
|
|
/**
 * Get a unique key representing the program, built from the program info and the input shapes and types.
 *
 * Key format:
 *   <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:is1DimensionDispatch:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
 *
 * The `[...]` part is only present when the program supplies a non-empty cache hint. When the program does
 * not declare input dependencies, every input is treated as `'dims'`.
 *
 * @param programInfo the program to build the key for.
 * @param inputTensors the input tensor views of the program.
 * @param is1DimensionDispatch whether the normalized dispatch group is 1-dimensional.
 * @returns a key that is shorter than the shader source but identifies the program: if the key is the same,
 *     the shader source should be the same, so the program can be reused.
 */
const getProgramInfoUniqueKey = (
  programInfo: ProgramInfo,
  inputTensors: readonly TensorView[],
  is1DimensionDispatch: boolean,
): string => {
  const programName = programInfo.name;
  const cacheHint = programInfo.shaderCache?.hint;
  const hintPart = cacheHint ? `[${cacheHint}]` : '';
  const dependencies =
    programInfo.shaderCache?.inputDependencies ??
    new Array<ProgramInputTensorInfoDependency>(inputTensors.length).fill('dims');
  const inputsPart = getProgramInputTensorInfoDependencyKey(inputTensors, dependencies);
  return `${programName}${hintPart}:${is1DimensionDispatch}:${inputsPart}`;
};
|
|
|
|
/**
 * Snapshot of the `architecture` and `vendor` strings of a GPU adapter, with equality helpers.
 */
class AdapterInfoImpl implements AdapterInfo {
  readonly architecture?: string;
  readonly vendor?: string;

  /**
   * @param adapterInfo the adapter info to copy from. When it is falsy, both fields are left unset.
   */
  constructor(adapterInfo: GPUAdapterInfo) {
    if (!adapterInfo) {
      return;
    }
    this.architecture = adapterInfo.architecture;
    this.vendor = adapterInfo.vendor;
  }

  /** @returns true if the stored architecture string equals `architecture`. */
  isArchitecture(architecture: GpuArchitecture): boolean {
    return architecture === this.architecture;
  }

  /** @returns true if the stored vendor string equals `vendor`. */
  isVendor(vendor: GpuVendor): boolean {
    return vendor === this.vendor;
  }
}
|
|
|
|
/**
 * This class is designed to store state and to be used as a singleton for JSEP. It is passed to jsepInit() as
 * the first parameter so that it is stored for future use.
 */
|
|
export class WebGpuBackend {
|
|
adapterInfo: AdapterInfoImpl;
|
|
device: GPUDevice;
|
|
/**
|
|
* an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
|
|
*/
|
|
gpuDataManager: GpuDataManager;
|
|
/**
|
|
* an instance of ProgramManager to build and run WebGPU compute shader program, and manage a ProgramKey -> Program
|
|
* artifacts mapping
|
|
*/
|
|
programManager: ProgramManager;
|
|
|
|
/**
|
|
* representing the session ID of which is currently being run.
|
|
* `null` means no session is being run.
|
|
* only valid when session.run is executed.
|
|
*/
|
|
currentSessionId: number | null = null;
|
|
|
|
/**
|
|
* representing the kernel ID of which is currently being computed (CPU code perspective).
|
|
* `null` means no kernel is being computed.
|
|
* only one kernel can be computed at a moment.
|
|
*/
|
|
currentKernelId: number | null = null;
|
|
/**
|
|
* a list of temporary GPU data for the current kernel. should release when the kernel done computation.
|
|
*/
|
|
private temporaryData: GpuData[];
|
|
/**
|
|
* a KernelID -> a GPU data list, which stores persistent GPU data owned by the specific kernel.
|
|
*/
|
|
private kernelPersistentData: Map<number, GpuData[]>;
|
|
/**
|
|
* a KernelID -> a custom data, which stores custom data owned by the specific kernel.
|
|
*/
|
|
private kernelCustomData: Map<number, { [key: string]: unknown }>;
|
|
/**
 * Get the custom data object of the current kernel, creating and storing an empty one on first access.
 *
 * @throws Error if no kernel is currently being computed.
 */
get currentKernelCustomData(): { [key: string]: unknown } {
  const kernelId = this.currentKernelId;
  if (kernelId === null) {
    throw new Error('currentKernelCustomData(): currentKernelId is null. (should not happen)');
  }

  const existing = this.kernelCustomData.get(kernelId);
  if (existing) {
    return existing;
  }

  const created: { [key: string]: unknown } = {};
  this.kernelCustomData.set(kernelId, created);
  return created;
}
|
|
|
|
// KernelID -> kernelInfo mapping
|
|
kernels: Map<number, KernelInfo>;
|
|
private commandEncoder: GPUCommandEncoder | null = null;
|
|
private computePassEncoder: GPUComputePassEncoder | null = null;
|
|
maxDispatchNumber = 16;
|
|
pendingDispatchNumber = 0;
|
|
|
|
// info of kernels pending submission for a single batch
|
|
private pendingKernels: PendingKernelInfo[] = [];
|
|
// queryReadBuffer -> pendingKernels mapping for all the batches
|
|
private pendingQueries: Map<GPUBuffer, PendingKernelInfo[]> = new Map();
|
|
private queryResolveBuffer?: GPUBuffer;
|
|
private querySet?: GPUQuerySet;
|
|
private queryTimeBase?: bigint;
|
|
queryType: TimestampQuery;
|
|
|
|
env: Env;
|
|
sessionStatus: SessionState = 'default';
|
|
/**
|
|
* a SessionID -> CommandInfo[] mapping. It's used to record all GPU commands for corresponding session.
|
|
*/
|
|
capturedCommandList: Map<number, CommandInfo[]> = new Map();
|
|
|
|
/**
|
|
* a SessionID -> PendingKernelInfo[] mapping for profiling.
|
|
*/
|
|
private capturedPendingKernels: Map<number, PendingKernelInfo[]> = new Map();
|
|
|
|
/**
|
|
* a SessionID -> a Map of (InputOutputIndex -> [ID, GPUBuffer]) mapping.
|
|
*/
|
|
sessionExternalDataMapping: Map<number, Map<number, [number, GPUBuffer]>> = new Map();
|
|
|
|
/**
 * Request a GPU device from `adapter` and set up the backend state that depends on it.
 *
 * The device is requested with the adapter's own values for the limits listed below, plus whichever of the
 * optional features (timestamp query, `shader-f16`, `subgroups`) the adapter reports. The device and the
 * adapter are then published as read-only, non-configurable `device` / `adapter` properties on `env.webgpu`.
 *
 * @param env the environment object; stored on the backend and used to configure the logger.
 * @param adapter the GPU adapter to create the device from.
 */
async initialize(env: Env, adapter: GPUAdapter): Promise<void> {
  this.env = env;

  const { limits } = adapter;
  const requiredFeatures: GPUFeatureName[] = [];
  const deviceDescriptor: GPUDeviceDescriptor = {
    requiredLimits: {
      maxComputeWorkgroupStorageSize: limits.maxComputeWorkgroupStorageSize,
      maxComputeWorkgroupsPerDimension: limits.maxComputeWorkgroupsPerDimension,
      maxStorageBufferBindingSize: limits.maxStorageBufferBindingSize,
      maxBufferSize: limits.maxBufferSize,
      maxComputeInvocationsPerWorkgroup: limits.maxComputeInvocationsPerWorkgroup,
      maxComputeWorkgroupSizeX: limits.maxComputeWorkgroupSizeX,
      maxComputeWorkgroupSizeY: limits.maxComputeWorkgroupSizeY,
      maxComputeWorkgroupSizeZ: limits.maxComputeWorkgroupSizeZ,
    },
    // same array instance as `requiredFeatures`; features pushed below end up in the descriptor.
    requiredFeatures,
  };

  // Add `feature` to the required list if the adapter supports it; report whether it was added.
  const requireFeatureIfAvailable = (feature: GPUFeatureName): boolean => {
    if (!adapter.features.has(feature)) {
      return false;
    }
    requiredFeatures.push(feature);
    return true;
  };

  // Prefer chromium-experimental-timestamp-query-inside-passes and fall back to timestamp-query.
  const hasInsidePassesQuery = requireFeatureIfAvailable(
    'chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName,
  );
  if (!hasInsidePassesQuery) {
    requireFeatureIfAvailable('timestamp-query');
  }
  requireFeatureIfAvailable('shader-f16');
  // Try subgroups
  requireFeatureIfAvailable('subgroups' as GPUFeatureName);

  this.device = await adapter.requestDevice(deviceDescriptor);
  const rawAdapterInfo = adapter.info || (await adapter.requestAdapterInfo());
  this.adapterInfo = new AdapterInfoImpl(rawAdapterInfo);
  this.gpuDataManager = createGpuDataManager(this);
  this.programManager = new ProgramManager(this);
  this.kernels = new Map();
  this.kernelPersistentData = new Map();
  this.kernelCustomData = new Map();

  // set up flags for logger
  configureLogger(env.logLevel!, !!env.debug);

  // TODO: set up flags

  // Only validation errors are reported here; other uncaptured error kinds are ignored by this handler.
  this.device.onuncapturederror = (ev) => {
    if (ev.error instanceof GPUValidationError) {
      // eslint-disable-next-line no-console
      console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`);
    }
  };

  const readOnlyProperties: Array<[string, unknown]> = [
    ['device', this.device],
    ['adapter', adapter],
  ];
  for (const [name, value] of readOnlyProperties) {
    Object.defineProperty(this.env.webgpu, name, {
      value,
      writable: false,
      enumerable: true,
      configurable: false,
    });
  }

  // init queryType, which is necessary for InferenceSession.create
  this.setQueryType();
}
|
|
|
|
/**
 * Release the GPU resources owned by the backend.
 *
 * Destroys the timestamp query set and the buffer its results are resolved into (both are created together
 * in `setQueryType()`), then disposes the GPU data manager. The references are cleared so that a later
 * `setQueryType()` creates fresh objects instead of reusing destroyed ones.
 */
dispose(): void {
  this.querySet?.destroy();
  this.querySet = undefined;
  this.queryResolveBuffer?.destroy();
  this.queryResolveBuffer = undefined;
  this.gpuDataManager.dispose();
}
|
|
|
|
/**
 * Get the current command encoder, creating one on the device if none is active.
 */
getCommandEncoder(): GPUCommandEncoder {
  const active = this.commandEncoder;
  if (active) {
    return active;
  }
  const created = this.device.createCommandEncoder();
  this.commandEncoder = created;
  return created;
}
|
|
|
|
/**
 * Get the current compute pass encoder, beginning a new pass on the current command encoder if none is
 * active.
 *
 * When the query type is 'at-passes', the new pass writes its begin/end timestamps into the query set at
 * indices `pendingDispatchNumber * 2` and `pendingDispatchNumber * 2 + 1`.
 */
getComputePassEncoder(): GPUComputePassEncoder {
  if (this.computePassEncoder) {
    return this.computePassEncoder;
  }

  const commandEncoder = this.getCommandEncoder();
  const descriptor: GPUComputePassDescriptor = {};
  if (this.queryType === 'at-passes') {
    const firstQueryIndex = this.pendingDispatchNumber * 2;
    descriptor.timestampWrites = {
      querySet: this.querySet!,
      beginningOfPassWriteIndex: firstQueryIndex,
      endOfPassWriteIndex: firstQueryIndex + 1,
    };
  }

  const passEncoder = commandEncoder.beginComputePass(descriptor);
  this.computePassEncoder = passEncoder;
  return passEncoder;
}
|
|
|
|
/**
 * End the active compute pass, if any, and forget its encoder.
 */
endComputePass(): void {
  const passEncoder = this.computePassEncoder;
  if (!passEncoder) {
    return;
  }
  passEncoder.end();
  this.computePassEncoder = null;
}
|
|
|
|
/**
 * Submit all recorded GPU commands to the queue. Does nothing if no command encoder is active.
 *
 * When timestamp queries are enabled, the query results of this batch are resolved and copied into a
 * per-flush read-back buffer. Once that buffer has been mapped, one profiling record per pending kernel is
 * passed to `env.webgpu.profiling.ondata` (or printed to the console if no callback is set). The read-back
 * buffer is always unmapped, destroyed and removed from `pendingQueries`, even if reporting throws.
 */
flush(): void {
  const commandEncoder = this.commandEncoder;
  if (!commandEncoder) {
    return;
  }

  TRACE_FUNC_BEGIN();

  this.endComputePass();
  let queryReadBuffer: GPUBuffer | undefined;
  if (this.queryType !== 'none') {
    // each dispatch owns two 8-byte timestamps (begin, end)
    const queryCount = this.pendingDispatchNumber * 2;
    const queryByteLength = queryCount * 8;

    commandEncoder.resolveQuerySet(this.querySet!, 0, queryCount, this.queryResolveBuffer!, 0);

    queryReadBuffer = this.device.createBuffer(
      // eslint-disable-next-line no-bitwise
      { size: queryByteLength, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST },
    );

    // hand the kernels of this batch over to the read-back buffer and start a new batch
    this.pendingQueries.set(queryReadBuffer, this.pendingKernels);
    this.pendingKernels = [];
    commandEncoder.copyBufferToBuffer(this.queryResolveBuffer!, 0, queryReadBuffer, 0, queryByteLength);
  }

  this.device.queue.submit([commandEncoder.finish()]);
  this.gpuDataManager.refreshPendingBuffers();
  this.commandEncoder = null;
  this.pendingDispatchNumber = 0;

  if (queryReadBuffer) {
    const readBuffer = queryReadBuffer;
    void readBuffer.mapAsync(GPUMapMode.READ).then(() => {
      try {
        const mappedData = new BigUint64Array(readBuffer.getMappedRange());
        const pendingKernels = this.pendingQueries.get(readBuffer)!;
        for (let i = 0; i < mappedData.length / 2; i++) {
          const pendingKernelInfo = pendingKernels[i];
          const kernelId = pendingKernelInfo.kernelId;
          const kernelInfo = this.kernels.get(kernelId)!;
          const kernelType = kernelInfo.kernelType;
          const kernelName = kernelInfo.kernelName;
          const programName = pendingKernelInfo.programName;
          const inputTensorViews = pendingKernelInfo.inputTensorViews;
          const outputTensorViews = pendingKernelInfo.outputTensorViews;
          const startTimeU64 = mappedData[i * 2];
          const endTimeU64 = mappedData[i * 2 + 1];

          // the first timestamp ever seen becomes the time base for all reported times
          if (typeof this.queryTimeBase === 'undefined') {
            this.queryTimeBase = startTimeU64;
          }

          const startTime = Number(startTimeU64 - this.queryTimeBase);
          const endTime = Number(endTimeU64 - this.queryTimeBase);

          if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
            throw new RangeError('incorrect timestamp range');
          }

          if (this.env.webgpu.profiling?.ondata) {
            this.env.webgpu.profiling.ondata({
              version: 1,
              inputsMetadata: inputTensorViews.map((value) => ({
                dims: value.dims,
                dataType: tensorDataTypeEnumToString(value.dataType),
              })),
              outputsMetadata: outputTensorViews.map((value) => ({
                dims: value.dims,
                dataType: tensorDataTypeEnumToString(value.dataType),
              })),
              kernelId,
              kernelType,
              kernelName,
              programName,
              startTime,
              endTime,
            });
          } else {
            // if no callback is provided, print the profiling message to console
            let inputShapes = '';
            inputTensorViews.forEach((value, index) => {
              inputShapes += `input[${index}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
            });
            let outputShapes = '';
            outputTensorViews.forEach((value, index) => {
              outputShapes += `output[${index}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
            });
            // eslint-disable-next-line no-console
            console.log(
              `[profiling] kernel "${kernelId}|${kernelType}|${kernelName}|${programName}" ${inputShapes}${
                outputShapes
              }execution time: ${endTime - startTime} ns`,
            );
          }
          TRACE('GPU', `${programName}::${startTimeU64}::${endTimeU64}`);
        }
      } finally {
        // The read-back buffer is single-use: release it explicitly instead of waiting for GC, and make
        // sure this also happens when reporting throws.
        readBuffer.unmap();
        readBuffer.destroy();
        this.pendingQueries.delete(readBuffer);
      }
    });
  }
  TRACE_FUNC_END();
}
|
|
|
|
/**
 * Run a WebGPU program.
 *
 * @param program a ProgramInfo instance.
 * @param inputTensorViews a TensorView array. Each element represents a value that already exists on the GPU.
 * @param outputIndices an indices array. Each element is either -1 (temporary data), -2 (persistent data),
 *     -3 (placeholder; no output is created for it) or an index to the kernel's output. An empty array means
 *     output `i` of the program maps to kernel output `i`.
 * @param createKernelOutput a callback function that creates a value for the kernel's output with the given
 *     index.
 * @param createIntermediateOutput a callback function that creates an intermediate value, either temporary or
 *     persistent (owned by the current kernel).
 * @param outputCount exclusive upper bound for output indices; an index `>= outputCount` is rejected.
 * @returns a TensorView array representing the result.
 * @throws Error if an input or output has no GPU data, an output index is invalid, zero-sized and
 *     non-zero-sized tensors are mixed, a uniform has an unsupported type, or the uniforms do not match the
 *     built program.
 */
run(
  program: ProgramInfo,
  inputTensorViews: readonly TensorView[],
  outputIndices: readonly number[],
  createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView,
  createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView,
  outputCount: number,
): TensorView[] {
  TRACE_FUNC_BEGIN(program.name);
  // create info for inputs
  const inputDatas: GpuData[] = [];
  for (let i = 0; i < inputTensorViews.length; ++i) {
    const data = inputTensorViews[i].data;
    // if tensor view data is 0, it means the input is a zero-sized tensor, and there is no GPU data for it.
    if (data === 0) {
      continue;
    }
    const gpuData = this.gpuDataManager.get(data);
    if (!gpuData) {
      throw new Error(`no GPU data for input: ${data}`);
    }
    inputDatas.push(gpuData);
  }

  const { outputs, dispatchGroup, programUniforms } = program.getRunData(inputTensorViews);

  // check output indices
  const validatedOutputIndices = outputIndices.length === 0 ? outputs.map((_, i) => i) : outputIndices;
  if (validatedOutputIndices.length !== outputs.length) {
    throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${outputs.length}.`);
  }

  // create info for outputs
  const outputTensorViews: TensorView[] = [];
  const outputDatas: GpuData[] = [];
  for (let i = 0; i < outputs.length; ++i) {
    // value -1 and -2 are used for creating temporary and persistent outputs.
    // value -3 is used for placeholder output. So -3, -2, -1 and 0, 1, 2, ... are valid
    // output indices. see type definition of ComputeContextInputsOutputsMapping for more details.
    if (
      !Number.isInteger(validatedOutputIndices[i]) ||
      validatedOutputIndices[i] < -3 ||
      validatedOutputIndices[i] >= outputCount
    ) {
      throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`);
    }
    if (validatedOutputIndices[i] === -3) {
      continue;
    }
    const isTemporary = validatedOutputIndices[i] === -1;
    const isPersistent = validatedOutputIndices[i] === -2;
    const tensorView =
      isTemporary || isPersistent
        ? createIntermediateOutput(outputs[i].dataType, outputs[i].dims)
        : createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims);
    outputTensorViews.push(tensorView);
    // if tensor view data is 0, it means the output is a zero-sized tensor, and there is no GPU data for it.
    if (tensorView.data === 0) {
      continue;
    }
    const gpuData = this.gpuDataManager.get(tensorView.data);
    if (!gpuData) {
      throw new Error(`no GPU data for output: ${tensorView.data}`);
    }
    if (isTemporary) {
      // released by computeKernel() once the current kernel is done
      this.temporaryData.push(gpuData);
    }
    if (isPersistent) {
      // released by releaseKernel() together with the owning kernel
      let persistentData = this.kernelPersistentData.get(this.currentKernelId!);
      if (!persistentData) {
        persistentData = [];
        this.kernelPersistentData.set(this.currentKernelId!, persistentData);
      }
      persistentData.push(gpuData);
    }
    outputDatas.push(gpuData);
  }

  // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are
  // zero-sized tensors.
  if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) {
    // if all outputs are zero-sized tensors, there is no need to run the program.
    if (outputDatas.length === 0) {
      TRACE_FUNC_END(program.name);
      return outputTensorViews;
    }
    // if some outputs are zero-sized tensors, report an error.
    //
    // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors.
    // If we see such use case, we need to make a change here to support it.
    throw new Error(
      `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`,
    );
  }

  // load uniforms
  // TODO: add cache for uniform (is it necessary?)
  //
  let uniformBufferBinding: GPUBindingResource | undefined;
  if (programUniforms) {
    let currentOffset = 0;
    // offsets[i] is the byte offset of programUniforms[i] inside the uniform buffer
    const offsets: number[] = [];

    programUniforms.forEach((v) => {
      const data = typeof v.data === 'number' ? [v.data] : v.data;
      if (data.length === 0) {
        // An empty uniform occupies no bytes, but it still needs an entry so that `offsets` stays
        // index-aligned with `programUniforms` for the write pass below. Offset 0 is always a valid,
        // aligned offset for a zero-length view.
        offsets.push(0);
        return;
      }
      // https://www.w3.org/TR/WGSL/#alignof
      const sizeOfElement = v.type === DataType.float16 ? 2 : 4;
      let sizeOfVecOrMat: number;
      let baseAlignment: number;
      if (v.type === DataType.float16) {
        baseAlignment = data.length > 4 ? 16 : data.length > 2 ? 8 : data.length * sizeOfElement;
        sizeOfVecOrMat = data.length > 4 ? 16 : sizeOfElement * data.length;
      } else {
        baseAlignment = data.length <= 2 ? data.length * sizeOfElement : 16;
        sizeOfVecOrMat = 16;
      }
      currentOffset = Math.ceil(currentOffset / baseAlignment) * baseAlignment;
      offsets.push(currentOffset);
      // For non-float16 type, when data.length > 4, the uniform variable is of type array<vec4<i32|u32|f32>,N>, where
      // N = Math.ceil(data.length / 4) and SizeOf(vec4<i32|u32|f32>) = 16. The total byte length is N *
      // SizeOf(vec4<i32|u32|f32>). For float16 type, when data.length > 4, the uniform variable is of type
      // array<mat2x4<f16>,N>, where N = Math.ceil(data.length / 8) and SizeOf(mat2x4<f16>) = 16. The total byte
      // length is N * SizeOf(mat2x4<f16>).
      const elementPerVecOrMat = v.type === DataType.float16 ? 8 : 4;
      currentOffset +=
        data.length > 4 ? Math.ceil(data.length / elementPerVecOrMat) * sizeOfVecOrMat : data.length * sizeOfElement;
    });

    // Meet alignment of struct here: https://www.w3.org/TR/WGSL/#alignment-and-size. For simplicity, set
    // maxAlignmentOfField to 16 since the underlying buffer has been rounded up to 16.
    const maxAlignmentOfField = 16;
    currentOffset = Math.ceil(currentOffset / maxAlignmentOfField) * maxAlignmentOfField;
    const arrayBuffer = new ArrayBuffer(currentOffset);
    programUniforms.forEach((v, i) => {
      const offset = offsets[i];
      const data = typeof v.data === 'number' ? [v.data] : v.data;
      if (v.type === DataType.int32) {
        new Int32Array(arrayBuffer, offset, data.length).set(data);
      } else if (v.type === DataType.uint32) {
        new Uint32Array(arrayBuffer, offset, data.length).set(data);
      } else if (v.type === DataType.float16) {
        new Uint16Array(arrayBuffer, offset, data.length).set(data);
      } else if (v.type === DataType.float) {
        new Float32Array(arrayBuffer, offset, data.length).set(data);
      } else {
        throw new Error(`Unsupported uniform type: ${tensorDataTypeEnumToString(v.type)}`);
      }
    });

    const uniformBufferData =
      // eslint-disable-next-line no-bitwise
      this.gpuDataManager.create(currentOffset, GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM);
    this.device.queue.writeBuffer(uniformBufferData.buffer, 0, arrayBuffer, 0, currentOffset);
    // NOTE(review): the buffer is released right away but still bound below; this presumably relies on the
    // GPU data manager deferring the actual release until the commands are flushed — confirm.
    this.gpuDataManager.release(uniformBufferData.id);
    uniformBufferBinding = { offset: 0, size: currentOffset, buffer: uniformBufferData.buffer };
  }

  const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(dispatchGroup);
  const is1DimensionDispatch = normalizedDispatchGroup[1] === 1 && normalizedDispatchGroup[2] === 1;
  // get program info
  const key = getProgramInfoUniqueKey(program, inputTensorViews, is1DimensionDispatch);
  let artifact = this.programManager.getArtifact(key);
  if (!artifact) {
    artifact = this.programManager.build(program, normalizedDispatchGroup);
    this.programManager.setArtifact(key, artifact);
    LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`);
  }

  // validate uniform variables
  if (programUniforms && artifact.uniformVariablesInfo) {
    if (programUniforms.length !== artifact.uniformVariablesInfo.length) {
      throw new Error(
        `Uniform variables count mismatch: expect ${artifact.uniformVariablesInfo.length}, got ${
          programUniforms.length
        } in program "${artifact.programInfo.name}".`,
      );
    }
    for (let i = 0; i < programUniforms.length; i++) {
      const uniform = programUniforms[i];
      const actualType = uniform.type;
      const actualLength = typeof uniform.data === 'number' ? 1 : uniform.data.length;
      const [type, length] = artifact.uniformVariablesInfo[i];
      if (actualType !== type || actualLength !== length) {
        throw new Error(
          `Uniform variable ${i} mismatch: expect type ${type} with size ${length}, got type ${
            actualType
          } with size ${actualLength} in program "${artifact.programInfo.name}".`,
        );
      }
    }
  }

  LOG_DEBUG(
    'info',
    () =>
      `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
        normalizedDispatchGroup[1]
      }x${normalizedDispatchGroup[2]}`,
  );

  if (this.queryType !== 'none' || this.sessionStatus === 'capturing') {
    const pendingKernelInfo: PendingKernelInfo = {
      kernelId: this.currentKernelId!,
      programName: artifact.programInfo.name,
      inputTensorViews,
      outputTensorViews,
    };
    this.pendingKernels.push(pendingKernelInfo);

    if (this.sessionStatus === 'capturing') {
      const sessionPendingKernels = this.capturedPendingKernels.get(this.currentSessionId!);
      sessionPendingKernels!.push(pendingKernelInfo);
    }
  }

  this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup, uniformBufferBinding);

  TRACE_FUNC_END(program.name);
  return outputTensorViews;
}
|
|
|
|
/**
 * Upload `data` to the GPU data identified by `gpuDataId`. Delegates to the GPU data manager.
 */
upload(gpuDataId: number, data: Uint8Array): void {
  const manager = this.gpuDataManager;
  manager.upload(gpuDataId, data);
}
|
|
|
|
/**
 * Copy GPU data from ID `src` to ID `dst`. Delegates to the GPU data manager.
 */
memcpy(src: number, dst: number): void {
  const manager = this.gpuDataManager;
  manager.memcpy(src, dst);
}
|
|
|
|
/**
 * Download the GPU data identified by `gpuDataId` into a CPU buffer.
 *
 * @param gpuDataId ID of the GPU data to read.
 * @param getTargetBuffer getter for the destination buffer. A getter is used instead of the buffer itself
 *     because the underlying buffer may change after this async function is called; calling the getter
 *     ensures the buffer is up-to-date.
 */
async download(gpuDataId: number, getTargetBuffer: () => Uint8Array): Promise<void> {
  const manager = this.gpuDataManager;
  await manager.download(gpuDataId, getTargetBuffer);
}
|
|
|
|
/**
 * Create GPU data of the given size through the GPU data manager.
 *
 * @returns the ID of the created GPU data.
 */
alloc(size: number): number {
  const { id } = this.gpuDataManager.create(size);
  return id;
}
|
|
|
|
/**
 * Release the GPU data identified by `ptr` through the GPU data manager.
 *
 * @returns the value returned by the GPU data manager's `release()`.
 */
free(ptr: number): number {
  const released = this.gpuDataManager.release(ptr);
  return released;
}
|
|
|
|
/**
 * Register a kernel under `kernelId`.
 *
 * The implementation is looked up by `kernelType` in WEBGPU_OP_RESOLVE_RULES. The raw `attribute` is stored
 * together with the operator's attribute parser (if any); parsing happens later in `computeKernel()`.
 *
 * @throws Error if there is no implementation for `kernelType`.
 */
createKernel(kernelType: string, kernelId: number, attribute: unknown, kernelName: string): void {
  const resolved = WEBGPU_OP_RESOLVE_RULES.get(kernelType);
  if (!resolved) {
    throw new Error(`kernel not implemented: ${kernelType}`);
  }

  const info: KernelInfo = {
    kernelType,
    kernelName,
    kernelEntry: resolved[0],
    attributes: [resolved[1], attribute],
  };
  this.kernels.set(kernelId, info);
}
|
|
|
|
/**
 * Forget a kernel: release its persistent GPU data and drop its custom data and kernel info.
 */
releaseKernel(kernelId: number): void {
  this.kernelPersistentData.get(kernelId)?.forEach((gpuData) => {
    this.gpuDataManager.release(gpuData.id);
  });
  // deleting a missing key is a no-op, so no presence check is needed
  this.kernelPersistentData.delete(kernelId);

  this.kernelCustomData.delete(kernelId);
  this.kernels.delete(kernelId);
}
|
|
|
|
/**
 * Run the kernel registered under `kernelId`.
 *
 * The kernel's attributes are parsed on first use. While the kernel entry runs, `currentKernelId` is set and
 * temporary GPU data created by `run()` is collected; both are reset when the kernel is done. When
 * `env.debug` is set, the run is wrapped in a GPU validation error scope whose result is appended to
 * `errors`.
 *
 * @param kernelId ID of a kernel previously created with `createKernel()`.
 * @param context the compute context passed through to the kernel entry.
 * @param errors receives promises that resolve to an error message, or to null when there was no error.
 * @returns 0 (ORT_OK) if the kernel entry returned normally, 1 (ORT_FAIL) if it threw.
 * @throws Error if the kernel was not created, or if another kernel is currently being computed. An error
 *     thrown by the attribute parser is rethrown after `currentKernelId` has been reset.
 */
computeKernel(kernelId: number, context: ComputeContext, errors: Array<Promise<string | null>>): number {
  const kernel = this.kernels.get(kernelId);
  if (!kernel) {
    throw new Error(`kernel not created: ${kernelId}`);
  }
  const { kernelType, kernelName, kernelEntry, attributes } = kernel;
  if (this.currentKernelId !== null) {
    throw new Error(`kernel "[${kernelType}] ${kernelName}" is not allowed to be called recursively`);
  }
  this.currentKernelId = kernelId;

  // parse attributes if necessary
  if (attributes[0]) {
    try {
      attributes[1] = attributes[0](attributes[1]);
    } catch (e) {
      // Without this reset a failing parser would leave `currentKernelId` set, and every later call would
      // be rejected as a recursive call.
      this.currentKernelId = null;
      throw e;
    }
    attributes[0] = undefined;
  }

  LOG_DEBUG('info', () => `[WebGPU] Start to run kernel "[${kernelType}] ${kernelName}"...`);

  const useErrorScope = this.env.debug;

  this.temporaryData = [];
  try {
    if (useErrorScope) {
      this.device.pushErrorScope('validation');
    }

    kernelEntry(context, attributes[1]);
    return 0; // ORT_OK
  } catch (e) {
    errors.push(Promise.resolve(`[WebGPU] Kernel "[${kernelType}] ${kernelName}" failed. ${e}`));
    return 1; // ORT_FAIL
  } finally {
    if (useErrorScope) {
      errors.push(
        this.device
          .popErrorScope()
          .then((err) =>
            err ? `GPU validation error for kernel "[${kernelType}] ${kernelName}": ${err.message}` : null,
          ),
      );
    }

    // temporary data only lives for the duration of one kernel
    for (const data of this.temporaryData) {
      this.gpuDataManager.release(data.id);
    }
    this.temporaryData = [];
    this.currentKernelId = null;
  }
}
|
|
|
|
// #region external buffer
|
|
/**
 * Register a GPU buffer as external data for input/output `index` of session `sessionId`.
 *
 * The buffer may be user created, or managed by the GPU data manager. The GPU data manager will not manage
 * these buffers; they are registered as external buffers. The per-session map stores the data ID and the
 * buffer for each input/output. Once a specific input/output is registered, the data ID will not change.
 *
 * @returns the GPU data ID returned by the GPU data manager for the buffer.
 */
registerBuffer(sessionId: number, index: number, buffer: GPUBuffer, size: number): number {
  let mapping = this.sessionExternalDataMapping.get(sessionId);
  if (!mapping) {
    mapping = new Map();
    this.sessionExternalDataMapping.set(sessionId, mapping);
  }

  // the previous [id, buffer] entry for this index (if any) is handed to the data manager
  const previous = mapping.get(index);
  const dataId = this.gpuDataManager.registerExternalBuffer(buffer, size, previous);
  mapping.set(index, [dataId, buffer]);
  return dataId;
}
|
|
/**
 * Unregister every external buffer registered for `sessionId` and drop the session's mapping.
 */
unregisterBuffers(sessionId: number): void {
  const mapping = this.sessionExternalDataMapping.get(sessionId);
  if (!mapping) {
    return;
  }
  mapping.forEach(([dataId]) => this.gpuDataManager.unregisterExternalBuffer(dataId));
  this.sessionExternalDataMapping.delete(sessionId);
}
|
|
/**
 * Get the GPU buffer behind a GPU data ID.
 *
 * @throws Error if the GPU data manager has no data for `gpuDataId`.
 */
getBuffer(gpuDataId: number): GPUBuffer {
  const found = this.gpuDataManager.get(gpuDataId);
  if (found) {
    return found.buffer;
  }
  throw new Error(`no GPU data for buffer: ${gpuDataId}`);
}
|
|
/**
 * Create a function that downloads `size` bytes from `gpuBuffer` and returns them as a view of type `type`.
 * Nothing is downloaded until the returned function is called.
 */
createDownloader(
  gpuBuffer: GPUBuffer,
  size: number,
  type: Tensor.GpuBufferDataTypes,
): () => Promise<Tensor.DataType> {
  const downloader = async (): Promise<Tensor.DataType> => {
    const downloaded = await downloadGpuData(this, gpuBuffer, size);
    return createView(downloaded.buffer, type);
  };
  return downloader;
}
|
|
// #endregion
|
|
/**
 * Write a timestamp into the query set at `index` from within the current compute pass.
 * Only does something when the query type is 'inside-passes'.
 */
writeTimestamp(index: number): void {
  if (this.queryType === 'inside-passes') {
    // `writeTimestamp` is not part of the typed encoder interface, hence the cast.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    (this.computePassEncoder as any).writeTimestamp(this.querySet, index);
  }
}
|
|
/**
 * Decide which kind of timestamp query to use, and lazily create the query set and resolve buffer.
 *
 * Queries are enabled when profiling mode is 'default' or tracing is on (`env.trace`, falling back to
 * `env.wasm.trace` when `env.trace` is undefined). 'inside-passes' is preferred over 'at-passes'; if the
 * device has neither feature the query type stays 'none'.
 */
setQueryType(): void {
  this.queryType = 'none';

  const timestampsWanted =
    this.env.webgpu.profiling?.mode === 'default' ||
    (typeof this.env.trace === 'undefined' ? this.env.wasm.trace : this.env.trace);
  if (!timestampsWanted) {
    return;
  }

  const features = this.device.features;
  if (features.has('chromium-experimental-timestamp-query-inside-passes')) {
    this.queryType = 'inside-passes';
  } else if (features.has('timestamp-query')) {
    this.queryType = 'at-passes';
  }

  if (this.queryType === 'none' || typeof this.querySet !== 'undefined') {
    return;
  }

  // two timestamps (begin, end) of 8 bytes each per dispatch
  const queryCount = this.maxDispatchNumber * 2;
  this.querySet = this.device.createQuerySet({
    type: 'timestamp',
    count: queryCount,
  });
  this.queryResolveBuffer = this.device.createBuffer(
    // eslint-disable-next-line no-bitwise
    { size: queryCount * 8, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE },
  );
}
|
|
|
|
/**
 * Start capturing for the current session: make sure the session has a command list and a pending-kernel
 * list, flush what is already recorded, then switch the session status to 'capturing'.
 */
captureBegin(): void {
  LOG_DEBUG('info', 'captureBegin');
  const sessionId = this.currentSessionId!;
  if (!this.capturedCommandList.get(sessionId)) {
    this.capturedCommandList.set(sessionId, []);
  }
  if (!this.capturedPendingKernels.get(sessionId)) {
    this.capturedPendingKernels.set(sessionId, []);
  }
  // flush the remaining commands before we change the status.
  this.flush();
  this.sessionStatus = 'capturing';
}
|
|
/**
 * Stop capturing: flush what is recorded, then switch the session status back to 'default'.
 */
captureEnd(): void {
  LOG_DEBUG('info', 'captureEnd');
  // flush the remaining commands before we change the status.
  this.flush();
  this.sessionStatus = 'default';
}
|
|
/**
 * Re-issue every command captured for the current session.
 *
 * Each command is encoded into a compute pass with its pipeline, bind group and dispatch size. The compute
 * pass is ended after each command when the query type is 'at-passes', and the batch is flushed whenever
 * `maxDispatchNumber` dispatches are pending. The session status is 'replaying' for the duration and
 * 'default' afterwards.
 */
replay(): void {
  LOG_DEBUG('info', 'replay');
  this.sessionStatus = 'replaying';
  const sessionId = this.currentSessionId!;
  const commands = this.capturedCommandList.get(sessionId);
  const capturedKernels = this.capturedPendingKernels.get(sessionId);
  const commandCount = commands!.length;
  this.pendingKernels = [];
  for (let index = 0; index < commandCount; index++) {
    const passEncoder = this.getComputePassEncoder();
    const { computePipeline, bindGroup, dispatchGroup } = commands![index];
    const timestampIndex = this.pendingDispatchNumber * 2;
    this.writeTimestamp(timestampIndex);
    passEncoder.setPipeline(computePipeline);
    passEncoder.setBindGroup(0, bindGroup);
    passEncoder.dispatchWorkgroups(...dispatchGroup);
    this.writeTimestamp(timestampIndex + 1);
    this.pendingDispatchNumber++;
    if (this.queryType !== 'none') {
      this.pendingKernels.push(capturedKernels![index]);
    }
    const batchIsFull = this.pendingDispatchNumber >= this.maxDispatchNumber;
    if (batchIsFull || this.queryType === 'at-passes') {
      this.endComputePass();
    }
    if (batchIsFull) {
      this.flush();
    }
  }
  // flush the remaining commands before we change the status.
  this.flush();
  this.sessionStatus = 'default';
}
|
|
|
|
/**
 * Notify the GPU data manager that a session is being created.
 */
onCreateSession(): void {
  const manager = this.gpuDataManager;
  manager.onCreateSession();
}
|
|
|
|
/**
 * Clean up everything kept for a session: its external buffers, its captured commands and pending kernels,
 * and finally the session's state in the GPU data manager.
 */
onReleaseSession(sessionId: number): void {
  this.unregisterBuffers(sessionId);
  // Map.delete() is a no-op for a missing key, so no presence check is needed.
  this.capturedCommandList.delete(sessionId);
  this.capturedPendingKernels.delete(sessionId);
  this.gpuDataManager.onReleaseSession(sessionId);
}
|
|
|
|
/**
 * Mark `sessionId` as the session being run and re-evaluate the timestamp query type for this run.
 */
onRunStart(sessionId: number): void {
  this.currentSessionId = sessionId;
  // re-evaluated on every run, since setQueryType() reads the current profiling/trace flags
  this.setQueryType();
}
|
|
}
|