Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/caching.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Caching methods accelerate diffusion inference by reusing intermediate computati
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |

### UCache (UNET Models)

Expand Down Expand Up @@ -118,6 +119,28 @@ Mask values: `1` = compute, `0` = can cache.
--scm-policy dynamic
```

### Spectrum (UNET Models)

Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).

```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
```

#### Parameters

| Parameter | Description | Default |
|-----------|-------------|---------|
| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
| `m` | Chebyshev polynomial degree | 3 |
| `lam` | Ridge regression regularization | 1.0 |
| `window` | Initial window size (compute every N steps) | 2 |
| `flex` | Window growth per computed step after warmup | 0.50 |
| `warmup` | Steps to always compute before caching starts | 4 |
| `stop` | Stop caching at this fraction of total steps | 0.9 |


### Performance Tips

- Start with default thresholds and adjust based on output quality
Expand Down
8 changes: 5 additions & 3 deletions examples/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,12 @@ Generation Options:
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
'spectrum' (UNET Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
Expand Down
31 changes: 28 additions & 3 deletions examples/common/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1422,8 +1422,8 @@ struct SDGenerationParams {
}
cache_mode = argv_to_utf8(index, argv);
if (cache_mode != "easycache" && cache_mode != "ucache" &&
cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") {
fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str());
cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") {
fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str());
return -1;
}
return 1;
Expand Down Expand Up @@ -1779,7 +1779,23 @@ struct SDGenerationParams {
} else if (key == "Bn" || key == "bn") {
cache_params.Bn_compute_blocks = std::stoi(val);
} else if (key == "warmup") {
cache_params.max_warmup_steps = std::stoi(val);
if (cache_mode == "spectrum") {
cache_params.spectrum_warmup_steps = std::stoi(val);
} else {
cache_params.max_warmup_steps = std::stoi(val);
}
} else if (key == "w") {
cache_params.spectrum_w = std::stof(val);
} else if (key == "m") {
cache_params.spectrum_m = std::stoi(val);
} else if (key == "lam") {
cache_params.spectrum_lam = std::stof(val);
} else if (key == "window") {
cache_params.spectrum_window_size = std::stoi(val);
} else if (key == "flex") {
cache_params.spectrum_flex_window = std::stof(val);
} else if (key == "stop") {
cache_params.spectrum_stop_percent = std::stof(val);
} else {
LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
return false;
Expand Down Expand Up @@ -1827,6 +1843,15 @@ struct SDGenerationParams {
cache_params.Bn_compute_blocks = 0;
cache_params.residual_diff_threshold = 0.08f;
cache_params.max_warmup_steps = 8;
} else if (cache_mode == "spectrum") {
cache_params.mode = SD_CACHE_SPECTRUM;
cache_params.spectrum_w = 0.40f;
cache_params.spectrum_m = 3;
cache_params.spectrum_lam = 1.0f;
cache_params.spectrum_window_size = 2;
cache_params.spectrum_flex_window = 0.50f;
cache_params.spectrum_warmup_steps = 4;
cache_params.spectrum_stop_percent = 0.9f;
}

if (!cache_option.empty()) {
Expand Down
8 changes: 8 additions & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ enum sd_cache_mode_t {
SD_CACHE_DBCACHE,
SD_CACHE_TAYLORSEER,
SD_CACHE_CACHE_DIT,
SD_CACHE_SPECTRUM,
};

typedef struct {
Expand All @@ -271,6 +272,13 @@ typedef struct {
int taylorseer_skip_interval;
const char* scm_mask;
bool scm_policy_dynamic;
float spectrum_w;
int spectrum_m;
float spectrum_lam;
int spectrum_window_size;
float spectrum_flex_window;
int spectrum_warmup_steps;
float spectrum_stop_percent;
} sd_cache_params_t;

typedef struct {
Expand Down
195 changes: 195 additions & 0 deletions src/spectrum.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#ifndef __SPECTRUM_HPP__
#define __SPECTRUM_HPP__

#include <cmath>
#include <cstring>
#include <vector>

#include "ggml_extend.hpp"

// Tunable parameters for Spectrum caching (Chebyshev + Taylor output
// forecasting for UNET models). This is a plain aggregate: field order is
// part of the interface for positional initialization — do not reorder.
struct SpectrumConfig {
    float w = 0.40f;           // Chebyshev vs Taylor blend weight (0 = pure Taylor, 1 = pure Chebyshev)
    int m = 3;                 // Chebyshev polynomial degree
    float lam = 1.0f;          // ridge regression regularization (lambda)
    int window_size = 2;       // initial window size: compute every N steps
    float flex_window = 0.50f; // window growth per computed step after warmup
    int warmup_steps = 4;      // steps always computed before any caching starts
    float stop_percent = 0.9f; // stop caching at this fraction of total steps
};

// Runtime state for Spectrum caching. Keeps a short history of previously
// computed denoised UNet outputs and, on eligible steps, predicts the next
// output instead of running the full forward pass. The prediction blends a
// ridge-regularized Chebyshev polynomial fit over the history with a
// first-order Taylor extrapolation (blend weight config.w).
struct SpectrumState {
    SpectrumConfig config;
    int cnt = 0;                 // sampler step index (advanced by update() and predict())
    int num_cached = 0;          // predictions made since the last real forward pass
    float curr_ws = 2.0f;        // current window size; grows by flex_window after warmup
    int K = 6;                   // history capacity: max(m + 1, 6) entries
    int stop_step = 0;           // step index at which caching is disabled
    int total_steps_skipped = 0; // statistic: total predicted (skipped) steps this run

    // History: H_buf[i] is a flattened denoised output sampled at normalized
    // time T_buf[i]; oldest entry first, at most K entries.
    std::vector<std::vector<float>> H_buf;
    std::vector<float> T_buf;

    // Reset all state for a new sampling run of `total_steps` steps.
    void init(const SpectrumConfig& cfg, size_t total_steps) {
        config = cfg;
        cnt = 0;
        num_cached = 0;
        curr_ws = (float)cfg.window_size;
        // Need at least m + 1 samples to fit a degree-m polynomial.
        K = std::max(cfg.m + 1, 6);
        stop_step = (int)(cfg.stop_percent * (float)total_steps);
        total_steps_skipped = 0;
        H_buf.clear();
        T_buf.clear();
    }

    // Map a step index to a Chebyshev abscissa in [-1, 1].
    // NOTE(review): the 50.0f hard-codes a 50-step schedule; runs with more
    // steps evaluate the fit outside [-1, 1], where Chebyshev extrapolation
    // degrades — TODO confirm against the reference implementation.
    float taus(int step_cnt) const {
        return (step_cnt / 50.0f) * 2.0f - 1.0f;
    }

    // Decide whether the upcoming step may be predicted (true) or must be
    // computed with a real forward pass (false). Never predicts during
    // warmup, at/after stop_step, or with fewer than two history entries;
    // otherwise every floor(curr_ws)-th step is computed and the steps in
    // between are predicted.
    bool should_predict() {
        if (cnt < config.warmup_steps)
            return false;
        if (stop_step > 0 && cnt >= stop_step)
            return false;
        if ((int)H_buf.size() < 2)
            return false;

        int ws = std::max(1, (int)std::floor(curr_ws));
        return (num_cached + 1) % ws != 0;
    }

    // Record a freshly computed denoised output and advance the step counter.
    // NOTE(review): assumes `denoised` holds contiguous host-accessible F32
    // data — TODO confirm for non-CPU backends.
    void update(const struct ggml_tensor* denoised) {
        int64_t ne = ggml_nelements(denoised);
        const float* data = (const float*)denoised->data;

        H_buf.emplace_back(data, data + ne);
        T_buf.push_back(taus(cnt));

        // Drop oldest entries so at most K history samples are kept.
        while ((int)H_buf.size() > K) {
            H_buf.erase(H_buf.begin());
            T_buf.erase(T_buf.begin());
        }

        // After warmup, widen the window so later (smoother) steps are
        // predicted more aggressively.
        if (cnt >= config.warmup_steps)
            curr_ws += config.flex_window;

        num_cached = 0;
        cnt++;
    }

    // Predict the denoised output in place (overwrites `denoised`'s data).
    // Precondition: H_buf holds at least two entries — guaranteed when called
    // only after should_predict() returned true. Advances the step counter.
    void predict(struct ggml_tensor* denoised) {
        int64_t F = (int64_t)H_buf[0].size();
        int K_curr = (int)H_buf.size();
        int M1 = config.m + 1; // number of Chebyshev basis functions
        float tau_at = taus(cnt);

        // Design matrix X: K_curr x M1 (Chebyshev basis via the
        // T_{j}(t) = 2 t T_{j-1}(t) - T_{j-2}(t) recurrence)
        std::vector<float> X(K_curr * M1);
        for (int i = 0; i < K_curr; i++) {
            X[i * M1] = 1.0f;
            if (M1 > 1)
                X[i * M1 + 1] = T_buf[i];
            for (int j = 2; j < M1; j++)
                X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2];
        }

        // x_star: Chebyshev basis at current tau
        std::vector<float> x_star(M1);
        x_star[0] = 1.0f;
        if (M1 > 1)
            x_star[1] = tau_at;
        for (int j = 2; j < M1; j++)
            x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2];

        // XtX = X^T X + lambda I  (ridge-regularized normal equations)
        std::vector<float> XtX(M1 * M1, 0.0f);
        for (int i = 0; i < M1; i++) {
            for (int j = 0; j < M1; j++) {
                float sum = 0.0f;
                for (int k = 0; k < K_curr; k++)
                    sum += X[k * M1 + i] * X[k * M1 + j];
                XtX[i * M1 + j] = sum + (i == j ? config.lam : 0.0f);
            }
        }

        // Cholesky decomposition. With lam > 0 XtX is positive definite, so
        // failure here is a numerical edge case; on failure, jitter the
        // diagonal and retry.
        std::vector<float> L(M1 * M1, 0.0f);
        if (!cholesky_decompose(XtX.data(), L.data(), M1)) {
            float trace = 0.0f;
            for (int i = 0; i < M1; i++)
                trace += XtX[i * M1 + i];
            for (int i = 0; i < M1; i++)
                XtX[i * M1 + i] += 1e-4f * trace / M1;
            // NOTE(review): the retry's return value is ignored; a remaining
            // failure would propagate NaNs through cholesky_solve — TODO
            // consider a Taylor-only fallback for that (near-unreachable) case.
            cholesky_decompose(XtX.data(), L.data(), M1);
        }

        // Solve XtX v = x_star
        std::vector<float> v(M1);
        cholesky_solve(L.data(), x_star.data(), v.data(), M1);

        // Prediction weights per history entry: weights = X v, so the fitted
        // value at tau_at is a weighted combination of the history samples.
        std::vector<float> weights(K_curr, 0.0f);
        for (int k = 0; k < K_curr; k++)
            for (int j = 0; j < M1; j++)
                weights[k] += X[k * M1 + j] * v[j];

        // Blend Chebyshev and Taylor predictions
        float* out = (float*)denoised->data;
        float w_cheb = config.w;
        float w_taylor = 1.0f - w_cheb;
        const float* h_last = H_buf.back().data();
        const float* h_prev = H_buf[H_buf.size() - 2].data();

        for (int64_t f = 0; f < F; f++) {
            float pred_cheb = 0.0f;
            for (int k = 0; k < K_curr; k++)
                pred_cheb += weights[k] * H_buf[k][f];

            // First-order extrapolation half a step past the latest output.
            float pred_taylor = h_last[f] + 0.5f * (h_last[f] - h_prev[f]);

            out[f] = w_taylor * pred_taylor + w_cheb * pred_cheb;
        }

        num_cached++;
        total_steps_skipped++;
        cnt++;
    }

private:
    // Cholesky factorization A = L L^T for a symmetric positive-definite
    // row-major n x n matrix A. Writes the lower-triangular factor into L.
    // Returns false (leaving L partially written) if a non-positive pivot is
    // encountered, i.e. A is not numerically positive definite.
    static bool cholesky_decompose(const float* A, float* L, int n) {
        std::memset(L, 0, n * n * sizeof(float));
        for (int i = 0; i < n; i++) {
            for (int j = 0; j <= i; j++) {
                float sum = 0.0f;
                for (int k = 0; k < j; k++)
                    sum += L[i * n + k] * L[j * n + k];
                if (i == j) {
                    float diag = A[i * n + i] - sum;
                    if (diag <= 0.0f)
                        return false;
                    L[i * n + j] = std::sqrt(diag);
                } else {
                    L[i * n + j] = (A[i * n + j] - sum) / L[j * n + j];
                }
            }
        }
        return true;
    }

    // Solve (L L^T) x = b given the Cholesky factor L: forward substitution
    // for L y = b, then back substitution for L^T x = y.
    static void cholesky_solve(const float* L, const float* b, float* x, int n) {
        std::vector<float> y(n);
        for (int i = 0; i < n; i++) {
            float sum = 0.0f;
            for (int j = 0; j < i; j++)
                sum += L[i * n + j] * y[j];
            y[i] = (b[i] - sum) / L[i * n + i];
        }
        for (int i = n - 1; i >= 0; i--) {
            float sum = 0.0f;
            for (int j = i + 1; j < n; j++)
                sum += L[j * n + i] * x[j];
            x[i] = (y[i] - sum) / L[i * n + i];
        }
    }
};

#endif // __SPECTRUM_HPP__
Loading
Loading