Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions docs/guide/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,50 @@ Ensure you're not using the `Administrator` user for `npm install` nor to run th
To do that, go to `Settings > Update & Security > For developers` and enable `Developer mode`.

After that, delete the `.cache` folder under your user directory and try building the app again.

## Customizing `postinstall` Behavior {#postinstall-behavior}
When installing `node-llama-cpp`, its `postinstall` script checks whether the prebuilt binaries
are compatible with the current machine (which they almost always are, at least the CPU-only ones, which are the last-resort fallback),
and if not, it attempts [building the native bindings from source](./building-from-source.md).

When attempting to [build from source](./building-from-source.md), if the machine lacks the required build tools,
the build will fail and indicative error messages will direct you to the specific commands you need to run
or packages you need to install in order for the build process to succeed.

If you want to customize the `postinstall` behavior, you can do so using any of the following methods:
* Passing the `--node-llama-cpp-postinstall=<behavior>` flag to the `npm install` command.
* Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `<behavior>` before running `npm install`.
* Setting `config.nodeLlamaCppPostinstall` to `<behavior>` in your project's `package.json`.
<br/>
This will only work when your module is installed globally using `npm install -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module.

Where `<behavior>` can be one of the following options:
* **`auto` (default)**: the default behavior explained above.
* **`ignoreFailedBuild`**: same as the default behavior,
but a failed build will not throw an error and will be ignored, which means the installation will succeed.
Using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source again by default.
* **`skip`**: skip the entire `postinstall` script.
If the prebuilt binaries are incompatible with the current machine,
using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source by default.

::: code-group
```shell [<code>npm install</code> flag]
npm install --node-llama-cpp-postinstall=ignoreFailedBuild
```

```shell [env var (bash)]
NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install
```

```shell [env var (using <code>cross-env</code>)]
npx --yes cross-env NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install
```

```json [<code>package.json</code>]
{
"config": {
"nodeLlamaCppPostinstall": "ignoreFailedBuild"
}
}
```
:::
20 changes: 17 additions & 3 deletions llama/addon/AddonContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,20 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
context_params.no_perf = !(options.Get("performanceTracking").As<Napi::Boolean>().Value());
}

if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
context_params.type_k = static_cast<ggml_type>(keyType);
}
}

if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
context_params.type_v = static_cast<ggml_type>(valueType);
}
}

if (options.Has("swaFullCache")) {
context_params.swa_full = options.Get("swaFullCache").As<Napi::Boolean>().Value();
}
Expand Down Expand Up @@ -1063,7 +1077,7 @@ void AddonContext::init(Napi::Object exports) {
}

// Constructs an empty sequence checkpoint wrapper.
// Intentionally performs no initialization beyond the ObjectWrap base: the checkpoint's
// fields (sequenceId, minPos, maxPos, data) are presumably populated later by the async
// init worker — TODO(review): confirm no JS-side arguments are expected here.
AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonContextSequenceCheckpoint>(info) {

}
AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() {
dispose();
Expand Down Expand Up @@ -1099,7 +1113,7 @@ class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker {
checkpoint->minPos = llama_memory_seq_pos_min(llama_get_memory(context->ctx), checkpoint->sequenceId);
checkpoint->maxPos = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId);
const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

checkpoint->data.resize(checkpointSize, 0);
llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
} catch (const std::exception& e) {
Expand Down Expand Up @@ -1164,4 +1178,4 @@ void AddonContextSequenceCheckpoint::init(Napi::Object exports) {
}
)
);
}
}
2 changes: 2 additions & 0 deletions src/bindings/AddonTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ export type BindingModule = {
ranking?: boolean,
threads?: number,
performanceTracking?: boolean,
kvCacheKeyType?: number,
kvCacheValueType?: number,
swaFullCache?: boolean
}): AddonContext
},
Expand Down
39 changes: 35 additions & 4 deletions src/cli/commands/ChatCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescripti
import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";

type ChatCommand = {
modelPath?: string,
Expand All @@ -46,6 +47,8 @@ type ChatCommand = {
contextSize?: number,
batchSize?: number,
flashAttention?: boolean,
kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
swaFullCache?: boolean,
noTrimWhitespace: boolean,
grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[1],
Expand Down Expand Up @@ -172,6 +175,26 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
default: false,
description: "Enable flash attention"
})
.option("kvCacheKeyType", {
alias: "kvckt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("kvCacheValueType", {
alias: "kvcvt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("swaFullCache", {
alias: "noSwa",
type: "boolean",
Expand Down Expand Up @@ -379,7 +402,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
},
async handler({
modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt,
promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache,
noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
Expand All @@ -390,8 +413,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
try {
await RunChat({
modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
temperature, minP, topK, topP, seed, xtc,
batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile,
threads, temperature, minP, topK, topP, seed, xtc,
gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
Expand All @@ -408,7 +431,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {

async function RunChat({
modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja,
contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
Expand Down Expand Up @@ -444,12 +467,16 @@ async function RunChat({
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
flashAttention,
swaFullCache,
kvCacheKeyType,
kvCacheValueType,
useMmap
});
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
flashAttention,
swaFullCache,
kvCacheKeyType,
kvCacheValueType,
useMmap,
consoleTitle: "Draft model file"
})
Expand Down Expand Up @@ -495,6 +522,8 @@ async function RunChat({
? {fitContext: {contextSize}}
: undefined,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down Expand Up @@ -530,6 +559,8 @@ async function RunChat({
return await llama.loadModel({
modelPath: resolvedDraftModelPath,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down
41 changes: 37 additions & 4 deletions src/cli/commands/CompleteCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js";
import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";

type CompleteCommand = {
modelPath?: string,
Expand All @@ -34,6 +35,8 @@ type CompleteCommand = {
contextSize?: number,
batchSize?: number,
flashAttention?: boolean,
kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
swaFullCache?: boolean,
threads?: number,
temperature: number,
Expand Down Expand Up @@ -129,6 +132,26 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
default: false,
description: "Enable flash attention"
})
.option("kvCacheKeyType", {
alias: "kvckt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("kvCacheValueType", {
alias: "kvcvt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("swaFullCache", {
alias: "noSwa",
type: "boolean",
Expand Down Expand Up @@ -299,15 +322,16 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
},
async handler({
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize,
flashAttention, swaFullCache, threads, temperature, minP, topK,
flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK,
topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
debug, numa, meter, timing, noMmap, useDirectIo, printTimings
}) {
try {
await RunCompletion({
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
kvCacheKeyType, kvCacheValueType, swaFullCache,
threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty,
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength,
dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens,
Expand All @@ -323,7 +347,8 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {


async function RunCompletion({
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
kvCacheKeyType, kvCacheValueType, swaFullCache,
threads, temperature, minP, topK, topP, seed, xtc, gpuLayers,
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
Expand Down Expand Up @@ -356,13 +381,17 @@ async function RunCompletion({
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
flashAttention,
swaFullCache,
useMmap
useMmap,
kvCacheKeyType,
kvCacheValueType
});
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
flashAttention,
swaFullCache,
useMmap,
kvCacheKeyType,
kvCacheValueType,
consoleTitle: "Draft model file"
})
: undefined;
Expand Down Expand Up @@ -400,6 +429,8 @@ async function RunCompletion({
? {fitContext: {contextSize}}
: undefined,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down Expand Up @@ -435,6 +466,8 @@ async function RunCompletion({
return await llama.loadModel({
modelPath: resolvedDraftModelPath,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down
Loading
Loading