diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl index 821f7f79b0e..63ca8c45b23 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl @@ -26,7 +26,6 @@ layout(std430) buffer; #include "indexing.glslh" #include "common.glslh" -#include "conv2d_common.glslh" ${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)} ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} @@ -38,7 +37,6 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False // Metadata for input/output tensors (memory layout agnostic) ${layout_declare_ubo(B, "BufferMetadata", "outp")} ${layout_declare_ubo(B, "BufferMetadata", "inp")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} layout(push_constant) uniform restrict Block { float input_scale; @@ -56,6 +54,17 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")} ${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} ${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} +// Conv2D parameter specialization constants +${layout_declare_spec_const(C, "int", "kernel_size_x", "1")} +${layout_declare_spec_const(C, "int", "kernel_size_y", "1")} +${layout_declare_spec_const(C, "int", "stride_x", "1")} +${layout_declare_spec_const(C, "int", "stride_y", "1")} +${layout_declare_spec_const(C, "int", "padding_x", "0")} +${layout_declare_spec_const(C, "int", "padding_y", "0")} +${layout_declare_spec_const(C, "int", "dilation_x", "1")} +${layout_declare_spec_const(C, "int", "dilation_y", "1")} +${layout_declare_spec_const(C, "int", "groups", "1")} + // Load weight block for a given (ic4, kx, ky, oc4) position. 
// Weight texture layout (from pack_q8_conv2d_weights.glsl): // block_x = oc4 * K_w + kx @@ -101,8 +110,8 @@ void main() { const int IC = int(inp.sizes[0][2]); // Compute channels per group - const int OC_per_group = OC / conv2d_params.groups; - const int IC_per_group = IC / conv2d_params.groups; + const int OC_per_group = OC / groups; + const int IC_per_group = IC / groups; const int IC4_per_group = div_up_4(IC_per_group); // Determine which group this output channel block belongs to @@ -113,14 +122,14 @@ void main() { const int inp_w_stride = int(inp.strides[0][0]); const int inp_h_stride = int(inp.strides[0][1]); const int inp_c_stride = int(inp.strides[0][2]); - const int w_texel_step = conv2d_params.dilation.x * inp_w_stride; - const int h_texel_step = conv2d_params.dilation.y * inp_h_stride; - const int subtile_w_step = conv2d_params.stride.x * inp_w_stride; + const int w_texel_step = dilation_x * inp_w_stride; + const int h_texel_step = dilation_y * inp_h_stride; + const int subtile_w_step = stride_x * inp_w_stride; // Compute base input position (for subtile_w=0, ic4=0) TensorIndex4D inp_tidx; - inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x; - inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y; + inp_tidx.data[0] = outp_tidx.data[0] * stride_x - padding_x; + inp_tidx.data[1] = outp_tidx.data[1] * stride_y - padding_y; inp_tidx.data[2] = ic_group_start; inp_tidx.data[3] = 0; @@ -142,7 +151,7 @@ void main() { } // Perform convolution using packed int8 dot products - for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + for (int ky = 0; ky < kernel_size_y; ky++) { const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H); // Process input channels in blocks of 4 @@ -153,10 +162,10 @@ void main() { // Reset width coordinate at start of each ic4 iteration inp_tidx.data[0] = base_inp_w; - for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + for (int kx 
= 0; kx < kernel_size_x; kx++) { // Load weight block: 4 output channels × 4 input channels // weight_block[oc] contains packed weights for ic4*4 to ic4*4+3 -> oc - const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, conv2d_params.kernel_size.x); + const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, kernel_size_x); // Process 4 adjacent width positions [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { @@ -187,16 +196,16 @@ void main() { } // Advance to next output position's input coordinate - inp_tidx.data[0] += conv2d_params.stride.x; + inp_tidx.data[0] += stride_x; } // Adjust for net dilation step - inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x; + inp_tidx.data[0] += dilation_x - 4 * stride_x; } } // Advance height by dilation for next kernel row - inp_tidx.data[1] += conv2d_params.dilation.y; + inp_tidx.data[1] += dilation_y; if (get_outer_packed_dim_block_size(inp_layout) == 1) { // Advance base index by height step for next kernel row diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl index 7f4d03887df..70f76c4f375 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl @@ -20,7 +20,6 @@ layout(std430) buffer; #include "indexing.glslh" #include "common.glslh" -#include "conv2d_common.glslh" ${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)} ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} @@ -32,7 +31,6 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False // Metadata for input/output tensors (memory layout agnostic) ${layout_declare_ubo(B, "BufferMetadata", "outp")} ${layout_declare_ubo(B, "BufferMetadata", "inp")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} 
layout(push_constant) uniform restrict Block { float input_scale; @@ -50,6 +48,16 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")} ${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} ${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} +// Conv2D parameter specialization constants +${layout_declare_spec_const(C, "int", "kernel_size_x", "1")} +${layout_declare_spec_const(C, "int", "kernel_size_y", "1")} +${layout_declare_spec_const(C, "int", "stride_x", "1")} +${layout_declare_spec_const(C, "int", "stride_y", "1")} +${layout_declare_spec_const(C, "int", "padding_x", "0")} +${layout_declare_spec_const(C, "int", "padding_y", "0")} +${layout_declare_spec_const(C, "int", "dilation_x", "1")} +${layout_declare_spec_const(C, "int", "dilation_y", "1")} + #include "block_indexing.glslh" // Load a 4xint8 block of weights. @@ -89,22 +97,22 @@ void main() { } // Compute weight addressing constants - const int KW4 = int(div_up_4(conv2d_params.kernel_size.x)); + const int KW4 = int(div_up_4(kernel_size_x)); // Get strides for width and height dimensions (in texel space) const int w_stride = int(inp.strides[0][0]); const int h_stride = int(inp.strides[0][1]); // Pre-compute step sizes for efficient indexing - const int w_texel_step = conv2d_params.dilation.x * w_stride; - const int h_texel_step = conv2d_params.dilation.y * h_stride; + const int w_texel_step = dilation_x * w_stride; + const int h_texel_step = dilation_y * h_stride; // Step between adjacent output width positions in input texel space - const int subtile_w_step = conv2d_params.stride.x * w_stride; + const int subtile_w_step = stride_x * w_stride; // Compute base input position for subtile_w=0 TensorIndex4D inp_tidx; - inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x; - inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y; + inp_tidx.data[0] = outp_tidx.data[0] * stride_x - padding_x; 
+ inp_tidx.data[1] = outp_tidx.data[1] * stride_y - padding_y; inp_tidx.data[2] = outp_tidx.data[2]; inp_tidx.data[3] = 0; // batch = 0 since N == 1 @@ -128,13 +136,13 @@ void main() { const int inp_H = int(inp.sizes[0][1]); // Perform depthwise convolution - for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + for (int ky = 0; ky < kernel_size_y; ky++) { const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H); // Reset width coordinate at start of each kernel row inp_tidx.data[0] = base_inp_w; - for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + for (int kx = 0; kx < kernel_size_x; kx++) { // Load weight once, reuse for all 4 width positions const int packed_weight = load_weight(kx, ky, c4, KW4, C4); const ivec4 weight_4c = unpack_int8x4(packed_weight); @@ -148,7 +156,7 @@ void main() { if (get_outer_packed_dim_block_size(inp_layout) == 1) { inp_texel_idx = base_inp_texel_idx + kx * w_texel_step + subtile_w * subtile_w_step; } else { - // const int w_offset = kx * conv2d_params.dilation.x + subtile_w * conv2d_params.stride.x; + // const int w_offset = kx * dilation_x + subtile_w * stride_x; // inp_texel_idx = base_inp_texel_idx + div_4(w_offset) * w_stride + mod_4(w_offset); // inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); const int w4 = div_4(inp_tidx.data[0]); @@ -162,15 +170,15 @@ void main() { acc[subtile_w] += weight_4c * input_4c; // Advance to next output position's input coordinate - inp_tidx.data[0] += conv2d_params.stride.x; + inp_tidx.data[0] += stride_x; } // We advanced by 4*stride.x during subtile loop; adjust for net dilation step - inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x; + inp_tidx.data[0] += dilation_x - 4 * stride_x; } // Advance height by dilation for next kernel row - inp_tidx.data[1] += conv2d_params.dilation.y; + inp_tidx.data[1] += dilation_y; if (get_outer_packed_dim_block_size(inp_layout) == 1) { // Advance base index by height step for next 
kernel row diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_im2col.glsl index ed4e124ac45..9d073ab3f79 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q8ta_im2col.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_im2col.glsl @@ -15,7 +15,6 @@ layout(std430) buffer; #include "indexing.glslh" -#include "conv2d_common.glslh" ${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} @@ -23,7 +22,6 @@ ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scala // Metadata for im2col output and input tensors (layout-agnostic) ${layout_declare_ubo(B, "BufferMetadata", "im2col_outp")} ${layout_declare_ubo(B, "BufferMetadata", "inp")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} ${layout_declare_spec_const(C, "int", "apply_bias", "1")} @@ -31,6 +29,17 @@ ${layout_declare_spec_const(C, "int", "apply_bias", "1")} ${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} ${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +// Conv2D parameter specialization constants +${layout_declare_spec_const(C, "int", "kernel_size_x", "1")} +${layout_declare_spec_const(C, "int", "stride_x", "1")} +${layout_declare_spec_const(C, "int", "stride_y", "1")} +${layout_declare_spec_const(C, "int", "padding_x", "0")} +${layout_declare_spec_const(C, "int", "padding_y", "0")} +${layout_declare_spec_const(C, "int", "dilation_x", "1")} +${layout_declare_spec_const(C, "int", "dilation_y", "1")} +${layout_declare_spec_const(C, "int", "in_channels_per_group", "1")} +${layout_declare_spec_const(C, "int", "K_per_group", "1")} + layout(push_constant) uniform restrict Block { int zp; }; @@ -64,23 +73,23 @@ void main() { const int im2col_h = h_idx; const int im2col_k = mul_4(c4_idx); - const int group_idx = im2col_k / 
conv2d_params.K_per_group; - const int k_in_group = im2col_k % conv2d_params.K_per_group; + const int group_idx = im2col_k / K_per_group; + const int k_in_group = im2col_k % K_per_group; - const int c_in_group = k_in_group % conv2d_params.in_channels_per_group; - const int krow = k_in_group / conv2d_params.in_channels_per_group; - const int kernel_x = krow % conv2d_params.kernel_size.x; - const int kernel_y = krow / conv2d_params.kernel_size.x; + const int c_in_group = k_in_group % in_channels_per_group; + const int krow = k_in_group / in_channels_per_group; + const int kernel_x = krow % kernel_size_x; + const int kernel_y = krow / kernel_size_x; // Base input position const int input_x_base = - (im2col_w * conv2d_params.stride.x) - conv2d_params.padding.x + - (kernel_x * conv2d_params.dilation.x); + (im2col_w * stride_x) - padding_x + + (kernel_x * dilation_x); const int input_y = - (im2col_h * conv2d_params.stride.y) - conv2d_params.padding.y + - (kernel_y * conv2d_params.dilation.y); + (im2col_h * stride_y) - padding_y + + (kernel_y * dilation_y); const int input_z = - group_idx * conv2d_params.in_channels_per_group + c_in_group; + group_idx * in_channels_per_group + c_in_group; // Input tensor extents const int input_W = input_sizes.x; @@ -98,7 +107,7 @@ void main() { // Each loaded int contains 4 packed int8 channel values. 
ivec4 im2col_block; for (int i = 0; i < 4; i++) { - const int x = input_x_base + i * conv2d_params.stride.x; + const int x = input_x_base + i * stride_x; if (!y_z_in_bounds || x < 0 || x >= input_W) { im2col_block[i] = zp_packed; } else { diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp index f6e89bef03d..e219d23e849 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp @@ -296,16 +296,26 @@ void add_q8ta_conv2d_node( // Pass metadata for both output and input tensors vkapi::ParamsBindList param_buffers = { graph.buffer_meta_ubo(packed_int8_output), - graph.buffer_meta_ubo(packed_int8_input), - graph.create_params_buffer(conv_params)}; + graph.buffer_meta_ubo(packed_int8_input)}; - // Build spec constants: apply_bias, apply_relu + layout constants + // Build spec constants: apply_bias, activation_type, layout constants, + // conv2d params vkapi::SpecVarList spec_constants = { apply_bias, activation_type, // Layout specialization constants graph.hashed_layout_of(packed_int8_input), graph.hashed_layout_of(packed_int8_output), + // Conv2D parameter specialization constants + static_cast<int32_t>(conv_params.kernel_size[0]), + static_cast<int32_t>(conv_params.kernel_size[1]), + static_cast<int32_t>(conv_params.stride[0]), + static_cast<int32_t>(conv_params.stride[1]), + static_cast<int32_t>(conv_params.padding[0]), + static_cast<int32_t>(conv_params.padding[1]), + static_cast<int32_t>(conv_params.dilation[0]), + static_cast<int32_t>(conv_params.dilation[1]), + static_cast<int32_t>(conv_params.groups), }; graph.execute_nodes().emplace_back(new DynamicDispatchNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp index e690ff435a8..7369de57d1d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp @@ -332,16 +332,25 @@ void add_q8ta_conv2d_dw_node( // 
Pass metadata for both output and input tensors vkapi::ParamsBindList param_buffers = { graph.buffer_meta_ubo(packed_int8_output), - graph.buffer_meta_ubo(packed_int8_input), - graph.create_params_buffer(conv_params)}; + graph.buffer_meta_ubo(packed_int8_input)}; - // Build spec constants: apply_bias, activation_type + layout constants + // Build spec constants: apply_bias, activation_type, layout constants, + // conv2d params vkapi::SpecVarList spec_constants = { apply_bias, activation_type, // Layout specialization constants graph.hashed_layout_of(packed_int8_input), graph.hashed_layout_of(packed_int8_output), + // Conv2D parameter specialization constants + static_cast<int32_t>(conv_params.kernel_size[0]), + static_cast<int32_t>(conv_params.kernel_size[1]), + static_cast<int32_t>(conv_params.stride[0]), + static_cast<int32_t>(conv_params.stride[1]), + static_cast<int32_t>(conv_params.padding[0]), + static_cast<int32_t>(conv_params.padding[1]), + static_cast<int32_t>(conv_params.dilation[0]), + static_cast<int32_t>(conv_params.dilation[1]), }; graph.execute_nodes().emplace_back(new DynamicDispatchNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp index b43fe9eacc6..42622f5c618 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp @@ -135,18 +135,27 @@ void add_q8ta_im2col_node( vkapi::ParamsBindList param_buffers = { graph.buffer_meta_ubo(packed_int8_im2col), - graph.buffer_meta_ubo(packed_int8_input), - graph.create_params_buffer(conv_params)}; + graph.buffer_meta_ubo(packed_int8_input)}; std::vector<PushConstantDataInfo> push_constants = { PushConstantDataInfo(&zp, sizeof(zp)), }; - // Build spec constants: apply_bias + layout constants (for generic shader) + // Build spec constants: apply_bias, layout constants, conv2d params vkapi::SpecVarList spec_constants = { 1u, graph.hashed_layout_of(packed_int8_im2col), graph.hashed_layout_of(packed_int8_input), + // Conv2D parameter 
specialization constants + static_cast<int32_t>(conv_params.kernel_size[0]), + static_cast<int32_t>(conv_params.stride[0]), + static_cast<int32_t>(conv_params.stride[1]), + static_cast<int32_t>(conv_params.padding[0]), + static_cast<int32_t>(conv_params.padding[1]), + static_cast<int32_t>(conv_params.dilation[0]), + static_cast<int32_t>(conv_params.dilation[1]), + static_cast<int32_t>(conv_params.in_channels_per_group), + static_cast<int32_t>(conv_params.K_per_group), }; // // Add layout specialization constants (only for generic shader)