feat: Improve agents config

This commit is contained in:
Petr Mironychev
2026-06-29 23:12:44 +02:00
parent 080947c0dc
commit 2a3fd4f5be
26 changed files with 274 additions and 15 deletions

View File

@@ -18,6 +18,9 @@
<file>openai_quick_refactor.toml</file>
<file>google_base_chat.toml</file>
<file>google_chat.toml</file>
<file>google_completion.toml</file>
<file>google_compression.toml</file>
<file>google_quick_refactor.toml</file>
<file>mistral_base_chat.toml</file>
<file>mistral_chat.toml</file>
<file>mistral_chat_reasoning.toml</file>
@@ -40,6 +43,16 @@
<file>ollama_compression_16gb.toml</file>
<file>ollama_compression_32gb.toml</file>
<file>ollama_chat_gemma4.toml</file>
<file>llamacpp_base_chat.toml</file>
<file>llamacpp_chat.toml</file>
<file>llamacpp_completion_fim.toml</file>
<file>llamacpp_compression.toml</file>
<file>llamacpp_quick_refactor.toml</file>
<file>lmstudio_base_responses.toml</file>
<file>lmstudio_chat.toml</file>
<file>lmstudio_completion.toml</file>
<file>lmstudio_compression.toml</file>
<file>lmstudio_quick_refactor.toml</file>
</qresource>
<qresource prefix="/roles">
<file alias="qt-cpp-developer.md">roles/qt-cpp-developer.md</file>

View File

@@ -16,7 +16,7 @@ system_prompt = """
[body]
max_tokens = 512
temperature = 0.2
temperature = 0
stop_sequences = ["</code_context>"]
messages = """
[

View File

@@ -9,4 +9,4 @@ tags = ["completion", "codestral", "mistral", "cloud", "fim"]
[body]
max_tokens = 256
temperature = 0.2
temperature = 0

View File

@@ -0,0 +1,31 @@
schema_version = 1
extends = "Google Base Chat"
name = "Google Completion"
description = "Google Gemini 3.1 Flash-Lite — code completion using the <code_context> chat format over generateContent. Thinking disabled (thinkingBudget=0) and temperature=0 for fast, deterministic insertions; stops at </code_context>."
model = "gemini-3.1-flash-lite"
tags = ["completion", "gemini", "google", "cloud"]
system_prompt = """
{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
{%- else %}{{ read_file(":/roles/code-completion.md") }}
{%- endif %}
{{ read_file(":/tasks/code-completion.md") }}"""
[body]
contents = """
[
{
"role": "user",
"parts": [ { "text": {{ tojson("Here is the code context with insertion points:\\n<code_context>\\n" + ctx.prefix + "<cursor>" + ctx.suffix + "\\n</code_context>") }} } ]
}
]
"""
[body.generationConfig]
maxOutputTokens = 1024
temperature = 0
stopSequences = ["</code_context>"]
thinkingConfig = { thinkingBudget = 0 }

View File

@@ -0,0 +1,16 @@
schema_version = 1
extends = "Google Base Chat"
name = "Google Compression"
description = "Google Gemini 3.1 Flash-Lite — fast, low-cost conversation summarization. Carries the summary system prompt; no tools, thinking disabled (thinkingBudget=0)."
model = "gemini-3.1-flash-lite"
enable_tools = false
tags = ["compression", "gemini", "google", "cloud"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
[body.generationConfig]
maxOutputTokens = 16000
temperature = 0.3
thinkingConfig = { thinkingBudget = 0 }

View File

@@ -0,0 +1,17 @@
schema_version = 1
extends = "Google Base Chat"
name = "Google Quick Refactor"
description = "Google Gemini 3.5 Flash — agentic inline refactor with tools and thinking (gathers context before editing). Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context (file, code, cursor/selection)."
model = "gemini-3.5-flash"
enable_tools = true
enable_thinking = true
tags = ["refactor", "gemini", "google", "cloud"]
system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
[body.generationConfig]
maxOutputTokens = 16000
temperature = 1
thinkingConfig = { includeThoughts = true, thinkingBudget = 8192 }

View File

@@ -0,0 +1,9 @@
schema_version = 1
extends = "OpenAI Base Chat"
name = "llama.cpp Base Chat"
description = "llama.cpp server Chat Completions request body (OpenAI-compatible /v1/chat/completions). Abstract — extend it and set model."
abstract = true
provider_instance = "llama.cpp"
endpoint = "/v1/chat/completions"

View File

@@ -0,0 +1,15 @@
schema_version = 1
extends = "llama.cpp Base Chat"
name = "llama.cpp Chat"
description = "Local llama.cpp (llama-server) — coding chat via the OpenAI-compatible Chat Completions API. llama-server serves whichever GGUF you loaded, so 'model' is only a label. Tool calling needs a tool-capable model and llama-server started with --jinja."
model = "qwen2.5-coder-7b-instruct"
enable_tools = true
tags = ["chat", "llama.cpp", "local"]
system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
[body]
max_tokens = 8192
temperature = 0.7

View File

@@ -0,0 +1,15 @@
schema_version = 1
name = "llama.cpp Completion — FIM"
description = "Local llama.cpp native fill-in-the-middle via the /infill endpoint (input_prefix + input_suffix). Fast and clean, but the loaded GGUF MUST be a FIM-trained model (qwen2.5-coder, codellama-code, deepseek-coder, starcoder2, codegemma). A plain chat model produces garbage here — there is no native completion path for those. llama-server serves whichever model is loaded, so 'model' is only a label."
provider_instance = "llama.cpp"
endpoint = "/infill"
model = "qwen2.5-coder-7b"
tags = ["completion", "llama.cpp", "local", "fim"]
[body]
input_prefix = """{{ tojson(ctx.prefix) }}"""
input_suffix = """{% if existsIn(ctx, "suffix") %}{{ tojson(ctx.suffix) }}{% endif %}"""
n_predict = 256
temperature = 0.2

View File

@@ -0,0 +1,15 @@
schema_version = 1
extends = "llama.cpp Base Chat"
name = "llama.cpp Compression"
description = "Local llama.cpp — conversation summarization via the OpenAI-compatible Chat Completions API. Carries the summary system prompt; no tools. llama-server serves whichever GGUF is loaded, so 'model' is only a label."
model = "qwen2.5-coder-7b-instruct"
enable_tools = false
tags = ["compression", "llama.cpp", "local"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
[body]
max_tokens = 16000
temperature = 0.3

View File

@@ -0,0 +1,15 @@
schema_version = 1
extends = "llama.cpp Base Chat"
name = "llama.cpp Quick Refactor"
description = "Local llama.cpp deterministic inline refactor via the OpenAI-compatible Chat Completions API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; enabling them needs a tool-capable model + llama-server --jinja."
model = "qwen2.5-coder-7b-instruct"
enable_tools = false
tags = ["refactor", "llama.cpp", "local"]
system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
[body]
max_tokens = 8192
temperature = 0.2

View File

@@ -0,0 +1,9 @@
schema_version = 1
extends = "OpenAI Base Responses"
name = "LM Studio Base Responses"
description = "LM Studio Responses API request body (OpenAI-compatible /v1/responses). Abstract — extend it and set model."
abstract = true
provider_instance = "LM Studio (Responses API)"
endpoint = "/v1/responses"

View File

@@ -0,0 +1,15 @@
schema_version = 1
extends = "LM Studio Base Responses"
name = "LM Studio Chat"
description = "Local LM Studio — coding chat via the OpenAI-compatible Responses API (/v1/responses) on Gemma 4 12B (tools + vision capable). Set 'model' to the identifier of the model loaded in LM Studio."
model = "google/gemma-4-12b"
enable_tools = true
tags = ["chat", "lmstudio", "responses", "local"]
system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
[body]
max_output_tokens = 8192
temperature = 0.7

View File

@@ -0,0 +1,24 @@
schema_version = 1
extends = "LM Studio Base Responses"
name = "LM Studio Completion"
description = "Local LM Studio — code completion via the Responses API: the cursor sits in a <code_context> message, so the model continues the code. Use a NON-thinking instruct/code model and set `model` to whatever you have loaded (qwen2.5-coder-7b-instruct is a good pick). Avoid reasoning models: Gemma 4 (incl. -qat) and similar emit reasoning tokens before any code, the detailed completion task makes them deliberate for hundreds-to-thousands of tokens (worst on the no-op cases), and reasoning cannot be disabled via the Responses API — so they exhaust max_output_tokens and return an empty completion no matter how high it is set."
model = "qwen2.5-coder-7b-instruct"
tags = ["completion", "lmstudio", "responses", "local"]
system_prompt = """
{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
{%- else %}{{ read_file(":/roles/code-completion.md") }}
{%- endif %}
{{ read_file(":/tasks/code-completion.md") }}"""
[body]
max_output_tokens = 256
temperature = 0
input = """
[
{ "role": "user", "content": {{ tojson("Here is the code context with insertion points:\\n<code_context>\\n" + ctx.prefix + "<cursor>" + ctx.suffix + "\\n</code_context>") }} }
]
"""

View File

@@ -0,0 +1,15 @@
schema_version = 1
extends = "LM Studio Base Responses"
name = "LM Studio Compression"
description = "Local LM Studio — conversation summarization via the Responses API. Carries the summary system prompt; no tools. Gemma 4 12B by default; set 'model' to the loaded model's identifier."
model = "google/gemma-4-12b"
enable_tools = false
tags = ["compression", "lmstudio", "responses", "local"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
[body]
max_output_tokens = 16000
temperature = 0.3

View File

@@ -0,0 +1,15 @@
schema_version = 1
extends = "LM Studio Base Responses"
name = "LM Studio Quick Refactor"
description = "Local LM Studio deterministic inline refactor via the Responses API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; Gemma 4 12B is tool-capable if you enable them."
model = "google/gemma-4-12b"
enable_tools = false
tags = ["refactor", "lmstudio", "responses", "local"]
system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
[body]
max_output_tokens = 8192
temperature = 0.2

View File

@@ -10,4 +10,4 @@ tags = ["completion", "mistral", "codestral", "cloud", "fim"]
[body]
max_tokens = 256
temperature = 0.2
temperature = 0

View File

@@ -2,12 +2,12 @@ schema_version = 1
extends = "Ollama Base FIM"
name = "Ollama Completion — FIM"
description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with models that ship a FIM template, and those are few: the base / '-code' variants, NOT instruct/chat models. Verified to work: qwen2.5-coder (incl. -base), codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show <model> --modelfile` must mention 'Suffix'."
description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with a true BASE / '-code' model. Pick a base tag explicitly: the bare 'qwen2.5-coder:7b' tag is the INSTRUCT model (it ships an im_start chat template), and on FIM it rambles whole programs and prose — use 'qwen2.5-coder:7b-base' instead. Verified base/-code FIM models: qwen2.5-coder:7b-base, codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat/instruct model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show <model> --modelfile` must mention 'Suffix' and must NOT have an im_start/chat template."
model = "qwen2.5-coder:7b"
model = "qwen2.5-coder:7b-base-q5_K_M"
tags = ["completion", "ollama", "local", "fim", "8gb"]
[body.options]
num_predict = 256
temperature = 0.2
temperature = 0
keep_alive = "5m"

View File

@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "16gb"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
[body]
think = false
[body.options]
num_predict = 2048
temperature = 0.3

View File

@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "32gb"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
[body]
think = false
[body.options]
num_predict = 2048
temperature = 0.3

View File

@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "8gb"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
[body]
think = false
[body.options]
num_predict = 2048
temperature = 0.3

View File

@@ -1,20 +1,25 @@
Core Requirements:
1. Continue code exactly from the cursor position, ensuring it properly connects with any existing code after the cursor
2. Never repeat existing code before or after the cursor
2. Never repeat existing code before or after the cursor — the text after <cursor> already exists, so do not reproduce any of it
Specific Guidelines:
- For function calls: Complete parameters with appropriate types and names
- For class members: Respect access modifiers and class conventions
- Respect existing indentation and formatting
- Consider scope and visibility of referenced symbols
- Respect existing indentation and formatting; do not re-emit indentation that already precedes the cursor
- Consider scope and visibility of referenced symbols (do not use a symbol that is only declared after the cursor)
- Ensure seamless integration with code both before and after the cursor
When nothing should be inserted, return an empty code block. This applies only when:
- Any insertion would duplicate code that already appears after the cursor, or
- The cursor sits in the middle of an existing identifier or type name, or between a complete type and its variable name
Otherwise, always provide a completion (for example, fill an empty initializer list or argument list). In the no-insertion cases above, output an empty code block and nothing else — never describe the code, report errors, ask questions, or suggest alternatives.
Context Format:
<code_context>
...code before the cursor...<cursor>...code after the cursor...
</code_context>
Response Format:
- No explanations or comments
- Only include new characters needed to create valid code
- Should be codeblock with language
- Your entire response must be exactly one code block tagged with the language, and nothing else
- Never write any sentence, note, explanation, or comment before or after the code block — not even to state that the code is already complete
- Inside the block, include only the new characters needed at the cursor to form valid code; leave the block empty only in the no-insertion cases listed above

View File

@@ -48,7 +48,26 @@ QFuture<QList<QString>> GenericProvider::getInstalledModels(const QString &url)
{
m_client->setUrl(url);
m_client->setApiKey(apiKey());
return m_client->listModels();
return m_client->listModels(modelsEndpoint(url));
}
// Chooses the endpoint path passed to listModels() for the given base URL.
//
// For the OpenAI-style providers listed below the result is "/models" when
// the URL (ignoring any trailing slashes) already ends with "/v1", and
// "/v1/models" otherwise. Every other provider gets an empty string.
QString GenericProvider::modelsEndpoint(const QString &url) const
{
    switch (m_id) {
    case ProviderID::OpenAI:
    case ProviderID::OpenAIResponses:
    case ProviderID::OpenAICompatible:
    case ProviderID::LMStudio:
    case ProviderID::OpenRouter: {
        // Work on a copy so trailing slashes do not hide a "/v1" suffix.
        QString trimmed = url;
        while (trimmed.endsWith('/'))
            trimmed.chop(1);

        // Avoid producing ".../v1/v1/models" when the base URL already
        // carries the version segment.
        if (trimmed.endsWith("/v1"))
            return QStringLiteral("/models");
        return QStringLiteral("/v1/models");
    }
    default:
        return {};
    }
}
RequestID GenericProvider::sendRequest(

View File

@@ -39,6 +39,8 @@ public:
const QUrl &url, const QJsonObject &payload, const QString &endpoint) override;
private:
// Returns the endpoint path used when listing models for `url`:
// "/models" if the URL (trailing slashes ignored) already ends with "/v1",
// "/v1/models" otherwise, and an empty string for providers that are not
// handled as OpenAI-style in the definition.
QString modelsEndpoint(const QString &url) const;
QString m_name;
ProviderID m_id;
::LLMQore::BaseClient *m_client;

View File

@@ -58,7 +58,7 @@ protected:
return {};
}
LLMQore::RequestID ask(const QString &, LLMQore::RequestMode) override { return {}; }
QFuture<QList<QString>> listModels() override { return {}; }
// Stub override: ignores the optional endpoint argument and returns a
// default-constructed future.
QFuture<QList<QString>> listModels(const QString & = {}) override { return {}; }
LLMQore::ToolSchemaFormat toolSchemaFormat() const override
{
return LLMQore::ToolSchemaFormat::Claude;