feat: Improve agents config

2026-06-30 18:19:11 -04:00 · 2026-06-29 23:12:44 +02:00
parent 080947c0dc
commit 2a3fd4f5be
26 changed files with 274 additions and 15 deletions
--- a/sources/agents/agents.qrc
+++ b/sources/agents/agents.qrc
@@ -18,6 +18,9 @@
        <file>openai_quick_refactor.toml</file>
        <file>google_base_chat.toml</file>
        <file>google_chat.toml</file>
        <file>google_completion.toml</file>
        <file>google_compression.toml</file>
        <file>google_quick_refactor.toml</file>
        <file>mistral_base_chat.toml</file>
        <file>mistral_chat.toml</file>
        <file>mistral_chat_reasoning.toml</file>
@@ -40,6 +43,16 @@
        <file>ollama_compression_16gb.toml</file>
        <file>ollama_compression_32gb.toml</file>
        <file>ollama_chat_gemma4.toml</file>
        <file>llamacpp_base_chat.toml</file>
        <file>llamacpp_chat.toml</file>
        <file>llamacpp_completion_fim.toml</file>
        <file>llamacpp_compression.toml</file>
        <file>llamacpp_quick_refactor.toml</file>
        <file>lmstudio_base_responses.toml</file>
        <file>lmstudio_chat.toml</file>
        <file>lmstudio_completion.toml</file>
        <file>lmstudio_compression.toml</file>
        <file>lmstudio_quick_refactor.toml</file>
    </qresource>
    <qresource prefix="/roles">
        <file alias="qt-cpp-developer.md">roles/qt-cpp-developer.md</file>
--- a/sources/agents/claude_completion.toml
+++ b/sources/agents/claude_completion.toml
@@ -16,7 +16,7 @@ system_prompt = """
 [body]
 max_tokens = 512
-temperature = 0.2
+temperature = 0
 stop_sequences = ["</code_context>"]
 messages = """
 [
--- a/sources/agents/codestral_completion_fim.toml
+++ b/sources/agents/codestral_completion_fim.toml
@@ -9,4 +9,4 @@ tags  = ["completion", "codestral", "mistral", "cloud", "fim"]
 [body]
 max_tokens  = 256
-temperature = 0.2
+temperature = 0
--- a/sources/agents/google_completion.toml
+++ b/sources/agents/google_completion.toml
@@ -0,0 +1,31 @@
 schema_version = 1
 extends     = "Google Base Chat"
 name        = "Google Completion"
 description = "Google Gemini 3.1 Flash-Lite — code completion using the <code_context> chat format over generateContent. Thinking disabled (thinkingBudget=0) and temperature=0 for fast, deterministic insertions; stops at </code_context>."
 model = "gemini-3.1-flash-lite"
 tags  = ["completion", "gemini", "google", "cloud"]
 system_prompt = """
 {%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
 {%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
 {%- else %}{{ read_file(":/roles/code-completion.md") }}
 {%- endif %}
 {{ read_file(":/tasks/code-completion.md") }}"""
 [body]
 contents = """
 [
  {
    "role": "user",
    "parts": [ { "text": {{ tojson("Here is the code context with insertion points:\\n<code_context>\\n" + ctx.prefix + "<cursor>" + ctx.suffix + "\\n</code_context>") }} } ]
  }
 ]
 """
 [body.generationConfig]
 maxOutputTokens = 1024
 temperature     = 0
 stopSequences   = ["</code_context>"]
 thinkingConfig  = { thinkingBudget = 0 }
--- a/sources/agents/google_compression.toml
+++ b/sources/agents/google_compression.toml
@@ -0,0 +1,16 @@
 schema_version = 1
 extends     = "Google Base Chat"
 name        = "Google Compression"
 description = "Google Gemini 3.1 Flash-Lite — fast, low-cost conversation summarization. Carries the summary system prompt; no tools, thinking disabled (thinkingBudget=0)."
 model        = "gemini-3.1-flash-lite"
 enable_tools = false
 tags         = ["compression", "gemini", "google", "cloud"]
 system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
 [body.generationConfig]
 maxOutputTokens = 16000
 temperature     = 0.3
 thinkingConfig  = { thinkingBudget = 0 }
--- a/sources/agents/google_quick_refactor.toml
+++ b/sources/agents/google_quick_refactor.toml
@@ -0,0 +1,17 @@
 schema_version = 1
 extends     = "Google Base Chat"
 name        = "Google Quick Refactor"
 description = "Google Gemini 3.5 Flash — agentic inline refactor with tools and thinking (gathers context before editing). Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context (file, code, cursor/selection)."
 model           = "gemini-3.5-flash"
 enable_tools    = true
 enable_thinking = true
 tags            = ["refactor", "gemini", "google", "cloud"]
 system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
 [body.generationConfig]
 maxOutputTokens = 16000
 temperature     = 1
 thinkingConfig  = { includeThoughts = true, thinkingBudget = 8192 }
--- a/sources/agents/llamacpp_base_chat.toml
+++ b/sources/agents/llamacpp_base_chat.toml
@@ -0,0 +1,9 @@
 schema_version = 1
 extends     = "OpenAI Base Chat"
 name        = "llama.cpp Base Chat"
 description = "llama.cpp server Chat Completions request body (OpenAI-compatible /v1/chat/completions). Abstract — extend it and set model."
 abstract    = true
 provider_instance = "llama.cpp"
 endpoint          = "/v1/chat/completions"
--- a/sources/agents/llamacpp_chat.toml
+++ b/sources/agents/llamacpp_chat.toml
@@ -0,0 +1,15 @@
 schema_version = 1
 extends     = "llama.cpp Base Chat"
 name        = "llama.cpp Chat"
 description = "Local llama.cpp (llama-server) — coding chat via the OpenAI-compatible Chat Completions API. llama-server serves whichever GGUF you loaded, so 'model' is only a label. Tool calling needs a tool-capable model and llama-server started with --jinja."
 model        = "qwen2.5-coder-7b-instruct"
 enable_tools = true
 tags         = ["chat", "llama.cpp", "local"]
 system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
 [body]
 max_tokens  = 8192
 temperature = 0.7
--- a/sources/agents/llamacpp_completion_fim.toml
+++ b/sources/agents/llamacpp_completion_fim.toml
@@ -0,0 +1,15 @@
 # llama.cpp fill-in-the-middle completion agent: sends input_prefix / input_suffix
 # to the endpoint configured below (/infill) on the "llama.cpp" provider instance.
 schema_version = 1
 name        = "llama.cpp Completion — FIM"
 description = "Local llama.cpp native fill-in-the-middle via the /infill endpoint (input_prefix + input_suffix). Fast and clean, but the loaded GGUF MUST be a FIM-trained model (qwen2.5-coder, codellama-code, deepseek-coder, starcoder2, codegemma). A plain chat model produces garbage here — there is no native completion path for those. llama-server serves whichever model is loaded, so 'model' is only a label."
 provider_instance = "llama.cpp"
 endpoint          = "/infill"
 model             = "qwen2.5-coder-7b"
 tags              = ["completion", "llama.cpp", "local", "fim"]
 [body]
 input_prefix = """{{ tojson(ctx.prefix) }}"""
 # The suffix template renders a value only when ctx actually carries a "suffix" key.
 input_suffix = """{% if existsIn(ctx, "suffix") %}{{ tojson(ctx.suffix) }}{% endif %}"""
 n_predict    = 256
 # Temperature 0, matching the other completion agents (Claude, Codestral,
 # Google, LM Studio, Ollama FIM), which all use 0 for completion.
 temperature  = 0
--- a/sources/agents/llamacpp_compression.toml
+++ b/sources/agents/llamacpp_compression.toml
@@ -0,0 +1,15 @@
 schema_version = 1
 extends     = "llama.cpp Base Chat"
 name        = "llama.cpp Compression"
 description = "Local llama.cpp — conversation summarization via the OpenAI-compatible Chat Completions API. Carries the summary system prompt; no tools. llama-server serves whichever GGUF is loaded, so 'model' is only a label."
 model        = "qwen2.5-coder-7b-instruct"
 enable_tools = false
 tags         = ["compression", "llama.cpp", "local"]
 system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
 [body]
 max_tokens  = 16000
 temperature = 0.3
--- a/sources/agents/llamacpp_quick_refactor.toml
+++ b/sources/agents/llamacpp_quick_refactor.toml
@@ -0,0 +1,15 @@
 schema_version = 1
 extends     = "llama.cpp Base Chat"
 name        = "llama.cpp Quick Refactor"
 description = "Local llama.cpp deterministic inline refactor via the OpenAI-compatible Chat Completions API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; enabling them needs a tool-capable model + llama-server --jinja."
 model        = "qwen2.5-coder-7b-instruct"
 enable_tools = false
 tags         = ["refactor", "llama.cpp", "local"]
 system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
 [body]
 max_tokens  = 8192
 temperature = 0.2
--- a/sources/agents/lmstudio_base_responses.toml
+++ b/sources/agents/lmstudio_base_responses.toml
@@ -0,0 +1,9 @@
 schema_version = 1
 extends     = "OpenAI Base Responses"
 name        = "LM Studio Base Responses"
 description = "LM Studio Responses API request body (OpenAI-compatible /v1/responses). Abstract — extend it and set model."
 abstract    = true
 provider_instance = "LM Studio (Responses API)"
 endpoint          = "/v1/responses"
--- a/sources/agents/lmstudio_chat.toml
+++ b/sources/agents/lmstudio_chat.toml
@@ -0,0 +1,15 @@
 schema_version = 1
 extends     = "LM Studio Base Responses"
 name        = "LM Studio Chat"
 description = "Local LM Studio — coding chat via the OpenAI-compatible Responses API (/v1/responses) on Gemma 4 12B (tools + vision capable). Set 'model' to the identifier of the model loaded in LM Studio."
 model        = "google/gemma-4-12b"
 enable_tools = true
 tags         = ["chat", "lmstudio", "responses", "local"]
 system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
 [body]
 max_output_tokens = 8192
 temperature       = 0.7
--- a/sources/agents/lmstudio_completion.toml
+++ b/sources/agents/lmstudio_completion.toml
@@ -0,0 +1,24 @@
 schema_version = 1
 extends     = "LM Studio Base Responses"
 name        = "LM Studio Completion"
 description = "Local LM Studio — code completion via the Responses API: the cursor sits in a <code_context> message, so the model continues the code. Use a NON-thinking instruct/code model and set `model` to whatever you have loaded (qwen2.5-coder-7b-instruct is a good pick). Avoid reasoning models: Gemma 4 (incl. -qat) and similar emit reasoning tokens before any code, the detailed completion task makes them deliberate for hundreds-to-thousands of tokens (worst on the no-op cases), and reasoning cannot be disabled via the Responses API — so they exhaust max_output_tokens and return an empty completion no matter how high it is set."
 model = "qwen2.5-coder-7b-instruct"
 tags  = ["completion", "lmstudio", "responses", "local"]
 system_prompt = """
 {%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
 {%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
 {%- else %}{{ read_file(":/roles/code-completion.md") }}
 {%- endif %}
 {{ read_file(":/tasks/code-completion.md") }}"""
 [body]
 max_output_tokens = 256
 temperature       = 0
 input = """
 [
  { "role": "user", "content": {{ tojson("Here is the code context with insertion points:\\n<code_context>\\n" + ctx.prefix + "<cursor>" + ctx.suffix + "\\n</code_context>") }} }
 ]
 """
--- a/sources/agents/lmstudio_compression.toml
+++ b/sources/agents/lmstudio_compression.toml
@@ -0,0 +1,15 @@
 schema_version = 1
 extends     = "LM Studio Base Responses"
 name        = "LM Studio Compression"
 description = "Local LM Studio — conversation summarization via the Responses API. Carries the summary system prompt; no tools. Gemma 4 12B by default; set 'model' to the loaded model's identifier."
 model        = "google/gemma-4-12b"
 enable_tools = false
 tags         = ["compression", "lmstudio", "responses", "local"]
 system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
 [body]
 max_output_tokens = 16000
 temperature       = 0.3
--- a/sources/agents/lmstudio_quick_refactor.toml
+++ b/sources/agents/lmstudio_quick_refactor.toml
@@ -0,0 +1,15 @@
 schema_version = 1
 extends     = "LM Studio Base Responses"
 name        = "LM Studio Quick Refactor"
 description = "Local LM Studio deterministic inline refactor via the Responses API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; Gemma 4 12B is tool-capable if you enable them."
 model        = "google/gemma-4-12b"
 enable_tools = false
 tags         = ["refactor", "lmstudio", "responses", "local"]
 system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
 [body]
 max_output_tokens = 8192
 temperature       = 0.2
--- a/sources/agents/mistral_completion_codestral_fim.toml
+++ b/sources/agents/mistral_completion_codestral_fim.toml
@@ -10,4 +10,4 @@ tags              = ["completion", "mistral", "codestral", "cloud", "fim"]
 [body]
 max_tokens  = 256
-temperature = 0.2
+temperature = 0
--- a/sources/agents/ollama_completion_fim.toml
+++ b/sources/agents/ollama_completion_fim.toml
@@ -2,12 +2,12 @@ schema_version = 1
 extends     = "Ollama Base FIM"
 name        = "Ollama Completion — FIM"
-description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with models that ship a FIM template, and those are few: the base / '-code' variants, NOT instruct/chat models. Verified to work: qwen2.5-coder (incl. -base), codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show <model> --modelfile` must mention 'Suffix'."
+description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with a true BASE / '-code' model. Pick a base tag explicitly: the bare 'qwen2.5-coder:7b' tag is the INSTRUCT model (it ships an im_start chat template), and on FIM it rambles whole programs and prose — use 'qwen2.5-coder:7b-base' instead. Verified base/-code FIM models: qwen2.5-coder:7b-base, codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat/instruct model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show <model> --modelfile` must mention 'Suffix' and must NOT have an im_start/chat template."
-model = "qwen2.5-coder:7b"
+model = "qwen2.5-coder:7b-base-q5_K_M"
 tags  = ["completion", "ollama", "local", "fim", "8gb"]
 [body.options]
 num_predict = 256
-temperature = 0.2
+temperature = 0
 keep_alive  = "5m"
--- a/sources/agents/ollama_compression_16gb.toml
+++ b/sources/agents/ollama_compression_16gb.toml
@@ -10,6 +10,9 @@ tags         = ["compression", "ollama", "local", "16gb"]
 system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
 [body]
 think = false
 [body.options]
 num_predict = 2048
 temperature = 0.3
--- a/sources/agents/ollama_compression_32gb.toml
+++ b/sources/agents/ollama_compression_32gb.toml
@@ -10,6 +10,9 @@ tags         = ["compression", "ollama", "local", "32gb"]
 system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
 [body]
 think = false
 [body.options]
 num_predict = 2048
 temperature = 0.3
--- a/sources/agents/ollama_compression_8gb.toml
+++ b/sources/agents/ollama_compression_8gb.toml
@@ -10,6 +10,9 @@ tags         = ["compression", "ollama", "local", "8gb"]
 system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
 [body]
 think = false
 [body.options]
 num_predict = 2048
 temperature = 0.3
--- a/sources/agents/tasks/code-completion.md
+++ b/sources/agents/tasks/code-completion.md
@@ -1,20 +1,25 @@
 Core Requirements:
 1. Continue code exactly from the cursor position, ensuring it properly connects with any existing code after the cursor
-2. Never repeat existing code before or after the cursor
+2. Never repeat existing code before or after the cursor — the text after <cursor> already exists, so do not reproduce any of it
 Specific Guidelines:
 - For function calls: Complete parameters with appropriate types and names
 - For class members: Respect access modifiers and class conventions
- Respect existing indentation and formatting
+- Respect existing indentation and formatting; do not re-emit indentation that already precedes the cursor
- Consider scope and visibility of referenced symbols
+- Consider scope and visibility of referenced symbols (do not use a symbol that is only declared after the cursor)
 - Ensure seamless integration with code both before and after the cursor
 When nothing should be inserted, return an empty code block. This applies only when:
 - Any insertion would duplicate code that already appears after the cursor, or
 - The cursor sits in the middle of an existing identifier or type name, or between a complete type and its variable name
 Otherwise, always provide a completion (for example, fill an empty initializer list or argument list). In the no-insertion cases above, output an empty code block and nothing else — never describe the code, report errors, ask questions, or suggest alternatives.
 Context Format:
 <code_context>
 ...code before the cursor...<cursor>...code after the cursor...
 </code_context>
 Response Format:
- No explanations or comments
+- Your entire response must be exactly one code block tagged with the language, and nothing else
- Only include new characters needed to create valid code
+- Never write any sentence, note, explanation, or comment before or after the code block — not even to state that the code is already complete
- Should be codeblock with language
+- Inside the block, include only the new characters needed at the cursor to form valid code; leave the block empty only in the no-insertion cases listed above
--- a/sources/external/llmqore
+++ b/sources/external/llmqore
--- a/sources/providers/GenericProvider.cpp
+++ b/sources/providers/GenericProvider.cpp
@@ -48,7 +48,26 @@ QFuture<QList<QString>> GenericProvider::getInstalledModels(const QString &url)
 {
    m_client->setUrl(url);
    m_client->setApiKey(apiKey());
-    return m_client->listModels();
+    return m_client->listModels(modelsEndpoint(url));
 }
 // Chooses the model-listing path that getInstalledModels() hands to listModels().
 // Only the OpenAI-style providers named below get an explicit path; for any
 // other provider id a default-constructed QString is returned.
 // Trailing slashes on `url` are ignored: a base already ending in "/v1" yields
 // "/models", anything else yields "/v1/models".
 QString GenericProvider::modelsEndpoint(const QString &url) const
 {
    const bool openAiStyle = m_id == ProviderID::OpenAI
                             || m_id == ProviderID::OpenAIResponses
                             || m_id == ProviderID::OpenAICompatible
                             || m_id == ProviderID::LMStudio
                             || m_id == ProviderID::OpenRouter;
    if (!openAiStyle)
        return {};
    // Work on a copy so the caller's URL is untouched while slashes are stripped.
    QString trimmed = url;
    while (trimmed.endsWith('/'))
        trimmed.chop(1);
    if (trimmed.endsWith("/v1"))
        return QStringLiteral("/models");
    return QStringLiteral("/v1/models");
 }
 RequestID GenericProvider::sendRequest(
--- a/sources/providers/GenericProvider.hpp
+++ b/sources/providers/GenericProvider.hpp
@@ -39,6 +39,8 @@ public:
        const QUrl &url, const QJsonObject &payload, const QString &endpoint) override;
 private:
    // Path passed to m_client->listModels() by getInstalledModels(): "/models" or
    // "/v1/models" for the OpenAI-style provider ids (depending on whether `url`
    // already ends in "/v1"), and an empty QString for every other provider.
    QString modelsEndpoint(const QString &url) const;
    QString m_name;
    ProviderID m_id;
    ::LLMQore::BaseClient *m_client;
--- a/test/ResponseRouterTest.cpp
+++ b/test/ResponseRouterTest.cpp
@@ -58,7 +58,7 @@ protected:
        return {};
    }
    LLMQore::RequestID ask(const QString &, LLMQore::RequestMode) override { return {}; }
-    QFuture<QList<QString>> listModels() override { return {}; }
+    QFuture<QList<QString>> listModels(const QString & = {}) override { return {}; }
    LLMQore::ToolSchemaFormat toolSchemaFormat() const override
    {
        return LLMQore::ToolSchemaFormat::Claude;