Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 24 additions & 15 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ layout(std430) buffer;

#include "indexing.glslh"
#include "common.glslh"
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)}
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)}
Expand All @@ -38,7 +37,6 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False
// Metadata for input/output tensors (memory layout agnostic)
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
Expand All @@ -56,6 +54,17 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

// Conv2D parameter specialization constants
${layout_declare_spec_const(C, "int", "kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "stride_x", "1")}
${layout_declare_spec_const(C, "int", "stride_y", "1")}
${layout_declare_spec_const(C, "int", "padding_x", "0")}
${layout_declare_spec_const(C, "int", "padding_y", "0")}
${layout_declare_spec_const(C, "int", "dilation_x", "1")}
${layout_declare_spec_const(C, "int", "dilation_y", "1")}
${layout_declare_spec_const(C, "int", "groups", "1")}

// Load weight block for a given (ic4, kx, ky, oc4) position.
// Weight texture layout (from pack_q8_conv2d_weights.glsl):
// block_x = oc4 * K_w + kx
Expand Down Expand Up @@ -101,8 +110,8 @@ void main() {
const int IC = int(inp.sizes[0][2]);

// Compute channels per group
const int OC_per_group = OC / conv2d_params.groups;
const int IC_per_group = IC / conv2d_params.groups;
const int OC_per_group = OC / groups;
const int IC_per_group = IC / groups;
const int IC4_per_group = div_up_4(IC_per_group);

// Determine which group this output channel block belongs to
Expand All @@ -113,14 +122,14 @@ void main() {
const int inp_w_stride = int(inp.strides[0][0]);
const int inp_h_stride = int(inp.strides[0][1]);
const int inp_c_stride = int(inp.strides[0][2]);
const int w_texel_step = conv2d_params.dilation.x * inp_w_stride;
const int h_texel_step = conv2d_params.dilation.y * inp_h_stride;
const int subtile_w_step = conv2d_params.stride.x * inp_w_stride;
const int w_texel_step = dilation_x * inp_w_stride;
const int h_texel_step = dilation_y * inp_h_stride;
const int subtile_w_step = stride_x * inp_w_stride;

// Compute base input position (for subtile_w=0, ic4=0)
TensorIndex4D inp_tidx;
inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x;
inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y;
inp_tidx.data[0] = outp_tidx.data[0] * stride_x - padding_x;
inp_tidx.data[1] = outp_tidx.data[1] * stride_y - padding_y;
inp_tidx.data[2] = ic_group_start;
inp_tidx.data[3] = 0;

Expand All @@ -142,7 +151,7 @@ void main() {
}

// Perform convolution using packed int8 dot products
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
for (int ky = 0; ky < kernel_size_y; ky++) {
const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H);

// Process input channels in blocks of 4
Expand All @@ -153,10 +162,10 @@ void main() {
// Reset width coordinate at start of each ic4 iteration
inp_tidx.data[0] = base_inp_w;

for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) {
for (int kx = 0; kx < kernel_size_x; kx++) {
// Load weight block: 4 output channels × 4 input channels
// weight_block[oc] contains packed weights for ic4*4 to ic4*4+3 -> oc
const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, conv2d_params.kernel_size.x);
const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, kernel_size_x);

// Process 4 adjacent width positions
[[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) {
Expand Down Expand Up @@ -187,16 +196,16 @@ void main() {
}

// Advance to next output position's input coordinate
inp_tidx.data[0] += conv2d_params.stride.x;
inp_tidx.data[0] += stride_x;
}

// Adjust for net dilation step
inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x;
inp_tidx.data[0] += dilation_x - 4 * stride_x;
}
}

// Advance height by dilation for next kernel row
inp_tidx.data[1] += conv2d_params.dilation.y;
inp_tidx.data[1] += dilation_y;

if (get_outer_packed_dim_block_size(inp_layout) == 1) {
// Advance base index by height step for next kernel row
Expand Down
36 changes: 22 additions & 14 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ layout(std430) buffer;

#include "indexing.glslh"
#include "common.glslh"
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)}
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)}
Expand All @@ -32,7 +31,6 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False
// Metadata for input/output tensors (memory layout agnostic)
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
Expand All @@ -50,6 +48,16 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

// Conv2D parameter specialization constants
${layout_declare_spec_const(C, "int", "kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "stride_x", "1")}
${layout_declare_spec_const(C, "int", "stride_y", "1")}
${layout_declare_spec_const(C, "int", "padding_x", "0")}
${layout_declare_spec_const(C, "int", "padding_y", "0")}
${layout_declare_spec_const(C, "int", "dilation_x", "1")}
${layout_declare_spec_const(C, "int", "dilation_y", "1")}

#include "block_indexing.glslh"

// Load a 4xint8 block of weights.
Expand Down Expand Up @@ -89,22 +97,22 @@ void main() {
}

// Compute weight addressing constants
const int KW4 = int(div_up_4(conv2d_params.kernel_size.x));
const int KW4 = int(div_up_4(kernel_size_x));

// Get strides for width and height dimensions (in texel space)
const int w_stride = int(inp.strides[0][0]);
const int h_stride = int(inp.strides[0][1]);

// Pre-compute step sizes for efficient indexing
const int w_texel_step = conv2d_params.dilation.x * w_stride;
const int h_texel_step = conv2d_params.dilation.y * h_stride;
const int w_texel_step = dilation_x * w_stride;
const int h_texel_step = dilation_y * h_stride;
// Step between adjacent output width positions in input texel space
const int subtile_w_step = conv2d_params.stride.x * w_stride;
const int subtile_w_step = stride_x * w_stride;

// Compute base input position for subtile_w=0
TensorIndex4D inp_tidx;
inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x;
inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y;
inp_tidx.data[0] = outp_tidx.data[0] * stride_x - padding_x;
inp_tidx.data[1] = outp_tidx.data[1] * stride_y - padding_y;
inp_tidx.data[2] = outp_tidx.data[2];
inp_tidx.data[3] = 0; // batch = 0 since N == 1

Expand All @@ -128,13 +136,13 @@ void main() {
const int inp_H = int(inp.sizes[0][1]);

// Perform depthwise convolution
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
for (int ky = 0; ky < kernel_size_y; ky++) {
const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H);

// Reset width coordinate at start of each kernel row
inp_tidx.data[0] = base_inp_w;

for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) {
for (int kx = 0; kx < kernel_size_x; kx++) {
// Load weight once, reuse for all 4 width positions
const int packed_weight = load_weight(kx, ky, c4, KW4, C4);
const ivec4 weight_4c = unpack_int8x4(packed_weight);
Expand All @@ -148,7 +156,7 @@ void main() {
if (get_outer_packed_dim_block_size(inp_layout) == 1) {
inp_texel_idx = base_inp_texel_idx + kx * w_texel_step + subtile_w * subtile_w_step;
} else {
// const int w_offset = kx * conv2d_params.dilation.x + subtile_w * conv2d_params.stride.x;
// const int w_offset = kx * dilation_x + subtile_w * stride_x;
// inp_texel_idx = base_inp_texel_idx + div_4(w_offset) * w_stride + mod_4(w_offset);
// inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout);
const int w4 = div_4(inp_tidx.data[0]);
Expand All @@ -162,15 +170,15 @@ void main() {
acc[subtile_w] += weight_4c * input_4c;

// Advance to next output position's input coordinate
inp_tidx.data[0] += conv2d_params.stride.x;
inp_tidx.data[0] += stride_x;
}

// We advanced by 4 * stride_x during the subtile loop; adjust for the net dilation step
inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x;
inp_tidx.data[0] += dilation_x - 4 * stride_x;
}

// Advance height by dilation for next kernel row
inp_tidx.data[1] += conv2d_params.dilation.y;
inp_tidx.data[1] += dilation_y;

if (get_outer_packed_dim_block_size(inp_layout) == 1) {
// Advance base index by height step for next kernel row
Expand Down
37 changes: 23 additions & 14 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_im2col.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,31 @@
layout(std430) buffer;

#include "indexing.glslh"
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)}

// Metadata for im2col output and input tensors (layout-agnostic)
${layout_declare_ubo(B, "BufferMetadata", "im2col_outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

${layout_declare_spec_const(C, "int", "apply_bias", "1")}

// Layout specialization constants
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}

// Conv2D parameter specialization constants
${layout_declare_spec_const(C, "int", "kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "stride_x", "1")}
${layout_declare_spec_const(C, "int", "stride_y", "1")}
${layout_declare_spec_const(C, "int", "padding_x", "0")}
${layout_declare_spec_const(C, "int", "padding_y", "0")}
${layout_declare_spec_const(C, "int", "dilation_x", "1")}
${layout_declare_spec_const(C, "int", "dilation_y", "1")}
${layout_declare_spec_const(C, "int", "in_channels_per_group", "1")}
${layout_declare_spec_const(C, "int", "K_per_group", "1")}

layout(push_constant) uniform restrict Block {
int zp;
};
Expand Down Expand Up @@ -64,23 +73,23 @@ void main() {
const int im2col_h = h_idx;
const int im2col_k = mul_4(c4_idx);

const int group_idx = im2col_k / conv2d_params.K_per_group;
const int k_in_group = im2col_k % conv2d_params.K_per_group;
const int group_idx = im2col_k / K_per_group;
const int k_in_group = im2col_k % K_per_group;

const int c_in_group = k_in_group % conv2d_params.in_channels_per_group;
const int krow = k_in_group / conv2d_params.in_channels_per_group;
const int kernel_x = krow % conv2d_params.kernel_size.x;
const int kernel_y = krow / conv2d_params.kernel_size.x;
const int c_in_group = k_in_group % in_channels_per_group;
const int krow = k_in_group / in_channels_per_group;
const int kernel_x = krow % kernel_size_x;
const int kernel_y = krow / kernel_size_x;

// Base input position
const int input_x_base =
(im2col_w * conv2d_params.stride.x) - conv2d_params.padding.x +
(kernel_x * conv2d_params.dilation.x);
(im2col_w * stride_x) - padding_x +
(kernel_x * dilation_x);
const int input_y =
(im2col_h * conv2d_params.stride.y) - conv2d_params.padding.y +
(kernel_y * conv2d_params.dilation.y);
(im2col_h * stride_y) - padding_y +
(kernel_y * dilation_y);
const int input_z =
group_idx * conv2d_params.in_channels_per_group + c_in_group;
group_idx * in_channels_per_group + c_in_group;

// Input tensor extents
const int input_W = input_sizes.x;
Expand All @@ -98,7 +107,7 @@ void main() {
// Each loaded int contains 4 packed int8 channel values.
ivec4 im2col_block;
for (int i = 0; i < 4; i++) {
const int x = input_x_base + i * conv2d_params.stride.x;
const int x = input_x_base + i * stride_x;
if (!y_z_in_bounds || x < 0 || x >= input_W) {
im2col_block[i] = zp_packed;
} else {
Expand Down
16 changes: 13 additions & 3 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,16 +296,26 @@ void add_q8ta_conv2d_node(
// Pass metadata for both output and input tensors
vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(packed_int8_output),
graph.buffer_meta_ubo(packed_int8_input),
graph.create_params_buffer(conv_params)};
graph.buffer_meta_ubo(packed_int8_input)};

// Build spec constants: apply_bias, apply_relu + layout constants
// Build spec constants: apply_bias, activation_type, layout constants,
// conv2d params
vkapi::SpecVarList spec_constants = {
apply_bias,
activation_type,
// Layout specialization constants
graph.hashed_layout_of(packed_int8_input),
graph.hashed_layout_of(packed_int8_output),
// Conv2D parameter specialization constants
static_cast<uint32_t>(conv_params.kernel_size[0]),
static_cast<uint32_t>(conv_params.kernel_size[1]),
static_cast<uint32_t>(conv_params.stride[0]),
static_cast<uint32_t>(conv_params.stride[1]),
static_cast<uint32_t>(conv_params.padding[0]),
static_cast<uint32_t>(conv_params.padding[1]),
static_cast<uint32_t>(conv_params.dilation[0]),
static_cast<uint32_t>(conv_params.dilation[1]),
static_cast<uint32_t>(conv_params.groups),
};

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
Expand Down
15 changes: 12 additions & 3 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,16 +332,25 @@ void add_q8ta_conv2d_dw_node(
// Pass metadata for both output and input tensors
vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(packed_int8_output),
graph.buffer_meta_ubo(packed_int8_input),
graph.create_params_buffer(conv_params)};
graph.buffer_meta_ubo(packed_int8_input)};

// Build spec constants: apply_bias, activation_type + layout constants
// Build spec constants: apply_bias, activation_type, layout constants,
// conv2d params
vkapi::SpecVarList spec_constants = {
apply_bias,
activation_type,
// Layout specialization constants
graph.hashed_layout_of(packed_int8_input),
graph.hashed_layout_of(packed_int8_output),
// Conv2D parameter specialization constants
static_cast<uint32_t>(conv_params.kernel_size[0]),
static_cast<uint32_t>(conv_params.kernel_size[1]),
static_cast<uint32_t>(conv_params.stride[0]),
static_cast<uint32_t>(conv_params.stride[1]),
static_cast<uint32_t>(conv_params.padding[0]),
static_cast<uint32_t>(conv_params.padding[1]),
static_cast<uint32_t>(conv_params.dilation[0]),
static_cast<uint32_t>(conv_params.dilation[1]),
};

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
Expand Down
15 changes: 12 additions & 3 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,27 @@ void add_q8ta_im2col_node(

vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(packed_int8_im2col),
graph.buffer_meta_ubo(packed_int8_input),
graph.create_params_buffer(conv_params)};
graph.buffer_meta_ubo(packed_int8_input)};

std::vector<PushConstantDataInfo> push_constants = {
PushConstantDataInfo(&zp, sizeof(zp)),
};

// Build spec constants: apply_bias + layout constants (for generic shader)
// Build spec constants: apply_bias, layout constants, conv2d params
vkapi::SpecVarList spec_constants = {
1u,
graph.hashed_layout_of(packed_int8_im2col),
graph.hashed_layout_of(packed_int8_input),
// Conv2D parameter specialization constants
static_cast<uint32_t>(conv_params.kernel_size[0]),
static_cast<uint32_t>(conv_params.stride[0]),
static_cast<uint32_t>(conv_params.stride[1]),
static_cast<uint32_t>(conv_params.padding[0]),
static_cast<uint32_t>(conv_params.padding[1]),
static_cast<uint32_t>(conv_params.dilation[0]),
static_cast<uint32_t>(conv_params.dilation[1]),
static_cast<uint32_t>(conv_params.in_channels_per_group),
static_cast<uint32_t>(conv_params.K_per_group),
};

// // Add layout specialization constants (only for generic shader)
Expand Down
Loading