Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions docs/guide/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,50 @@ Ensure you're not using the `Administrator` user for `npm install` nor to run th
To do that, go to `Settings > Update & Security > For developers` and enable `Developer mode`.

After that, delete the `.cache` folder under your user directory and try building the app again.

## Customizing `postinstall` Behavior {#postinstall-behavior}
When installing `node-llama-cpp`, its `postinstall` script checks whether the prebuilt binaries
are compatible with the current machine (which they almost always are, at least the CPU-only ones, which are the last-resort fallback),
and if not, it attempts [building the native bindings from source](./building-from-source.md).

When attempting to [build from source](./building-from-source.md), if the machine lacks the required build tools,
the build will fail and indicative error messages will direct you to the specific commands you need to run
or packages you need to install in order for the build process to succeed.

If you want to customize the `postinstall` behavior, you can do so using any of the following methods:
* Passing the `--node-llama-cpp-postinstall=<behavior>` flag to the `npm install` command.
* Setting the `NODE_LLAMA_CPP_POSTINSTALL` environment variable to `<behavior>` before running `npm install`.
* Setting `config.nodeLlamaCppPostinstall` to `<behavior>` in your project's `package.json`.
<br/>
This will only work when your module is installed globally using `npm install -g` or for a non-library project when you run `npm install` in the project root; it will not work when your module is installed as a dependency of another module.

Where `<behavior>` can be one of the following options:
* **`auto` (default)**: the default behavior explained above.
* **`ignoreFailedBuild`**: same as the default behavior,
but a failed build will not throw an error and will be ignored, which means the installation will succeed.
Using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source again by default.
* **`skip`**: skip the entire `postinstall` script.
If the prebuilt binaries are incompatible with the current machine,
using [`getLlama`](../api/functions/getLlama.md) for the first time will attempt building from source by default.

::: code-group
```shell [<code>npm install</code> flag]
npm install --node-llama-cpp-postinstall=ignoreFailedBuild
```

```shell [env var (bash)]
NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install
```

```shell [env var (using <code>cross-env</code>)]
npx --yes cross-env NODE_LLAMA_CPP_POSTINSTALL=ignoreFailedBuild npm install
```

```json [<code>package.json</code>]
{
"config": {
"nodeLlamaCppPostinstall": "ignoreFailedBuild"
}
}
```
:::
20 changes: 17 additions & 3 deletions llama/addon/AddonContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,20 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
context_params.no_perf = !(options.Get("performanceTracking").As<Napi::Boolean>().Value());
}

if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
context_params.type_k = static_cast<ggml_type>(keyType);
}
}

if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
context_params.type_v = static_cast<ggml_type>(valueType);
}
}

if (options.Has("swaFullCache")) {
context_params.swa_full = options.Get("swaFullCache").As<Napi::Boolean>().Value();
}
Expand Down Expand Up @@ -1063,7 +1077,7 @@ void AddonContext::init(Napi::Object exports) {
}

// Constructs an empty sequence checkpoint wrapper.
// Intentionally performs no initialization beyond the ObjectWrap base: the checkpoint's
// fields (sequenceId, minPos, maxPos, data) are presumably populated later by the async
// init worker — TODO(review): confirm no JS-side arguments are expected here.
AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonContextSequenceCheckpoint>(info) {

}
AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() {
dispose();
Expand Down Expand Up @@ -1099,7 +1113,7 @@ class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker {
checkpoint->minPos = llama_memory_seq_pos_min(llama_get_memory(context->ctx), checkpoint->sequenceId);
checkpoint->maxPos = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId);
const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

checkpoint->data.resize(checkpointSize, 0);
llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
} catch (const std::exception& e) {
Expand Down Expand Up @@ -1164,4 +1178,4 @@ void AddonContextSequenceCheckpoint::init(Napi::Object exports) {
}
)
);
}
}
2 changes: 2 additions & 0 deletions src/bindings/AddonTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ export type BindingModule = {
ranking?: boolean,
threads?: number,
performanceTracking?: boolean,
kvCacheKeyType?: number,
kvCacheValueType?: number,
swaFullCache?: boolean
}): AddonContext
},
Expand Down
39 changes: 35 additions & 4 deletions src/cli/commands/ChatCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescripti
import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";

type ChatCommand = {
modelPath?: string,
Expand All @@ -46,6 +47,8 @@ type ChatCommand = {
contextSize?: number,
batchSize?: number,
flashAttention?: boolean,
kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
swaFullCache?: boolean,
noTrimWhitespace: boolean,
grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[1],
Expand Down Expand Up @@ -172,6 +175,26 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
default: false,
description: "Enable flash attention"
})
.option("kvCacheKeyType", {
alias: "kvckt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("kvCacheValueType", {
alias: "kvcvt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("swaFullCache", {
alias: "noSwa",
type: "boolean",
Expand Down Expand Up @@ -379,7 +402,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
},
async handler({
modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt,
promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache,
noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
Expand All @@ -390,8 +413,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
try {
await RunChat({
modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
temperature, minP, topK, topP, seed, xtc,
batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile,
threads, temperature, minP, topK, topP, seed, xtc,
gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
Expand All @@ -408,7 +431,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {

async function RunChat({
modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja,
contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
Expand Down Expand Up @@ -444,12 +467,16 @@ async function RunChat({
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
flashAttention,
swaFullCache,
kvCacheKeyType,
kvCacheValueType,
useMmap
});
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
flashAttention,
swaFullCache,
kvCacheKeyType,
kvCacheValueType,
useMmap,
consoleTitle: "Draft model file"
})
Expand Down Expand Up @@ -495,6 +522,8 @@ async function RunChat({
? {fitContext: {contextSize}}
: undefined,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down Expand Up @@ -530,6 +559,8 @@ async function RunChat({
return await llama.loadModel({
modelPath: resolvedDraftModelPath,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down
41 changes: 37 additions & 4 deletions src/cli/commands/CompleteCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js";
import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";

type CompleteCommand = {
modelPath?: string,
Expand All @@ -34,6 +35,8 @@ type CompleteCommand = {
contextSize?: number,
batchSize?: number,
flashAttention?: boolean,
kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
swaFullCache?: boolean,
threads?: number,
temperature: number,
Expand Down Expand Up @@ -129,6 +132,26 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
default: false,
description: "Enable flash attention"
})
.option("kvCacheKeyType", {
alias: "kvckt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("kvCacheValueType", {
alias: "kvcvt",
type: "string",
choices: [
"currentQuant",
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
] as const,
default: "F16" as const,
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
})
.option("swaFullCache", {
alias: "noSwa",
type: "boolean",
Expand Down Expand Up @@ -299,15 +322,16 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
},
async handler({
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize,
flashAttention, swaFullCache, threads, temperature, minP, topK,
flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK,
topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
debug, numa, meter, timing, noMmap, useDirectIo, printTimings
}) {
try {
await RunCompletion({
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
kvCacheKeyType, kvCacheValueType, swaFullCache,
threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty,
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength,
dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens,
Expand All @@ -323,7 +347,8 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {


async function RunCompletion({
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
kvCacheKeyType, kvCacheValueType, swaFullCache,
threads, temperature, minP, topK, topP, seed, xtc, gpuLayers,
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
Expand Down Expand Up @@ -356,13 +381,17 @@ async function RunCompletion({
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
flashAttention,
swaFullCache,
useMmap
useMmap,
kvCacheKeyType,
kvCacheValueType
});
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
flashAttention,
swaFullCache,
useMmap,
kvCacheKeyType,
kvCacheValueType,
consoleTitle: "Draft model file"
})
: undefined;
Expand Down Expand Up @@ -400,6 +429,8 @@ async function RunCompletion({
? {fitContext: {contextSize}}
: undefined,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down Expand Up @@ -435,6 +466,8 @@ async function RunCompletion({
return await llama.loadModel({
modelPath: resolvedDraftModelPath,
defaultContextFlashAttention: flashAttention,
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
defaultContextSwaFullCache: swaFullCache,
useMmap,
useDirectIo,
Expand Down
Loading
Loading