diff --git a/sources/agents/agents.qrc b/sources/agents/agents.qrc
index ab8f57a..9359bc0 100644
--- a/sources/agents/agents.qrc
+++ b/sources/agents/agents.qrc
@@ -18,6 +18,9 @@
openai_quick_refactor.toml
google_base_chat.toml
google_chat.toml
+ google_completion.toml
+ google_compression.toml
+ google_quick_refactor.toml
mistral_base_chat.toml
mistral_chat.toml
mistral_chat_reasoning.toml
@@ -40,6 +43,16 @@
ollama_compression_16gb.toml
ollama_compression_32gb.toml
ollama_chat_gemma4.toml
+ llamacpp_base_chat.toml
+ llamacpp_chat.toml
+ llamacpp_completion_fim.toml
+ llamacpp_compression.toml
+ llamacpp_quick_refactor.toml
+ lmstudio_base_responses.toml
+ lmstudio_chat.toml
+ lmstudio_completion.toml
+ lmstudio_compression.toml
+ lmstudio_quick_refactor.toml
roles/qt-cpp-developer.md
diff --git a/sources/agents/claude_completion.toml b/sources/agents/claude_completion.toml
index 5bc219b..a98357a 100644
--- a/sources/agents/claude_completion.toml
+++ b/sources/agents/claude_completion.toml
@@ -16,7 +16,7 @@ system_prompt = """
[body]
max_tokens = 512
-temperature = 0.2
+temperature = 0
stop_sequences = [""]
messages = """
[
diff --git a/sources/agents/codestral_completion_fim.toml b/sources/agents/codestral_completion_fim.toml
index 8d5eb10..fbee554 100644
--- a/sources/agents/codestral_completion_fim.toml
+++ b/sources/agents/codestral_completion_fim.toml
@@ -9,4 +9,4 @@ tags = ["completion", "codestral", "mistral", "cloud", "fim"]
[body]
max_tokens = 256
-temperature = 0.2
+temperature = 0
diff --git a/sources/agents/google_completion.toml b/sources/agents/google_completion.toml
new file mode 100644
index 0000000..0cee39b
--- /dev/null
+++ b/sources/agents/google_completion.toml
@@ -0,0 +1,31 @@
+schema_version = 1
+
+extends = "Google Base Chat"
+name = "Google Completion"
+description = "Google Gemini 3.1 Flash-Lite — code completion using the chat format over generateContent. Thinking disabled (thinkingBudget=0) and temperature=0 for fast, deterministic insertions; stops at ."
+
+model = "gemini-3.1-flash-lite"
+tags = ["completion", "gemini", "google", "cloud"]
+
+system_prompt = """
+{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
+{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
+{%- else %}{{ read_file(":/roles/code-completion.md") }}
+{%- endif %}
+{{ read_file(":/tasks/code-completion.md") }}"""
+
+[body]
+contents = """
+[
+ {
+ "role": "user",
+ "parts": [ { "text": {{ tojson("Here is the code context with insertion points:\\n\\n" + ctx.prefix + "" + ctx.suffix + "\\n") }} } ]
+ }
+]
+"""
+
+[body.generationConfig]
+maxOutputTokens = 1024
+temperature = 0
+stopSequences = [""]
+thinkingConfig = { thinkingBudget = 0 }
diff --git a/sources/agents/google_compression.toml b/sources/agents/google_compression.toml
new file mode 100644
index 0000000..acbdac5
--- /dev/null
+++ b/sources/agents/google_compression.toml
@@ -0,0 +1,16 @@
+schema_version = 1
+
+extends = "Google Base Chat"
+name = "Google Compression"
+description = "Google Gemini 3.1 Flash-Lite — fast, low-cost conversation summarization. Carries the summary system prompt; no tools, thinking disabled (thinkingBudget=0)."
+
+model = "gemini-3.1-flash-lite"
+enable_tools = false
+tags = ["compression", "gemini", "google", "cloud"]
+
+system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
+
+[body.generationConfig]
+maxOutputTokens = 16000
+temperature = 0.3
+thinkingConfig = { thinkingBudget = 0 }
diff --git a/sources/agents/google_quick_refactor.toml b/sources/agents/google_quick_refactor.toml
new file mode 100644
index 0000000..8cfcc94
--- /dev/null
+++ b/sources/agents/google_quick_refactor.toml
@@ -0,0 +1,17 @@
+schema_version = 1
+
+extends = "Google Base Chat"
+name = "Google Quick Refactor"
+description = "Google Gemini 3.5 Flash — agentic inline refactor with tools and thinking (gathers context before editing). Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context (file, code, cursor/selection)."
+
+model = "gemini-3.5-flash"
+enable_tools = true
+enable_thinking = true
+tags = ["refactor", "gemini", "google", "cloud"]
+
+system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
+
+[body.generationConfig]
+maxOutputTokens = 16000
+temperature = 1
+thinkingConfig = { includeThoughts = true, thinkingBudget = 8192 }
diff --git a/sources/agents/llamacpp_base_chat.toml b/sources/agents/llamacpp_base_chat.toml
new file mode 100644
index 0000000..56e714f
--- /dev/null
+++ b/sources/agents/llamacpp_base_chat.toml
@@ -0,0 +1,9 @@
+schema_version = 1
+
+extends = "OpenAI Base Chat"
+name = "llama.cpp Base Chat"
+description = "llama.cpp server Chat Completions request body (OpenAI-compatible /v1/chat/completions). Abstract — extend it and set model."
+abstract = true
+
+provider_instance = "llama.cpp"
+endpoint = "/v1/chat/completions"
diff --git a/sources/agents/llamacpp_chat.toml b/sources/agents/llamacpp_chat.toml
new file mode 100644
index 0000000..eb76897
--- /dev/null
+++ b/sources/agents/llamacpp_chat.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+extends = "llama.cpp Base Chat"
+name = "llama.cpp Chat"
+description = "Local llama.cpp (llama-server) — coding chat via the OpenAI-compatible Chat Completions API. llama-server serves whichever GGUF you loaded, so 'model' is only a label. Tool calling needs a tool-capable model and llama-server started with --jinja."
+
+model = "qwen2.5-coder-7b-instruct"
+enable_tools = true
+tags = ["chat", "llama.cpp", "local"]
+
+system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
+
+[body]
+max_tokens = 8192
+temperature = 0.7
diff --git a/sources/agents/llamacpp_completion_fim.toml b/sources/agents/llamacpp_completion_fim.toml
new file mode 100644
index 0000000..d63120b
--- /dev/null
+++ b/sources/agents/llamacpp_completion_fim.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+name = "llama.cpp Completion — FIM"
+description = "Local llama.cpp native fill-in-the-middle via the /infill endpoint (input_prefix + input_suffix). Fast and clean, but the loaded GGUF MUST be a FIM-trained model (qwen2.5-coder, codellama-code, deepseek-coder, starcoder2, codegemma). A plain chat model produces garbage here — there is no native completion path for those. llama-server serves whichever model is loaded, so 'model' is only a label."
+
+provider_instance = "llama.cpp"
+endpoint = "/infill"
+model = "qwen2.5-coder-7b"
+tags = ["completion", "llama.cpp", "local", "fim"]
+
+[body]
+input_prefix = """{{ tojson(ctx.prefix) }}"""
+input_suffix = """{% if existsIn(ctx, "suffix") %}{{ tojson(ctx.suffix) }}{% endif %}"""
+n_predict = 256
+temperature = 0.2
diff --git a/sources/agents/llamacpp_compression.toml b/sources/agents/llamacpp_compression.toml
new file mode 100644
index 0000000..379c4d1
--- /dev/null
+++ b/sources/agents/llamacpp_compression.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+extends = "llama.cpp Base Chat"
+name = "llama.cpp Compression"
+description = "Local llama.cpp — conversation summarization via the OpenAI-compatible Chat Completions API. Carries the summary system prompt; no tools. llama-server serves whichever GGUF is loaded, so 'model' is only a label."
+
+model = "qwen2.5-coder-7b-instruct"
+enable_tools = false
+tags = ["compression", "llama.cpp", "local"]
+
+system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
+
+[body]
+max_tokens = 16000
+temperature = 0.3
diff --git a/sources/agents/llamacpp_quick_refactor.toml b/sources/agents/llamacpp_quick_refactor.toml
new file mode 100644
index 0000000..7658365
--- /dev/null
+++ b/sources/agents/llamacpp_quick_refactor.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+extends = "llama.cpp Base Chat"
+name = "llama.cpp Quick Refactor"
+description = "Local llama.cpp deterministic inline refactor via the OpenAI-compatible Chat Completions API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; enabling them needs a tool-capable model + llama-server --jinja."
+
+model = "qwen2.5-coder-7b-instruct"
+enable_tools = false
+tags = ["refactor", "llama.cpp", "local"]
+
+system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
+
+[body]
+max_tokens = 8192
+temperature = 0.2
diff --git a/sources/agents/lmstudio_base_responses.toml b/sources/agents/lmstudio_base_responses.toml
new file mode 100644
index 0000000..3c1c282
--- /dev/null
+++ b/sources/agents/lmstudio_base_responses.toml
@@ -0,0 +1,9 @@
+schema_version = 1
+
+extends = "OpenAI Base Responses"
+name = "LM Studio Base Responses"
+description = "LM Studio Responses API request body (OpenAI-compatible /v1/responses). Abstract — extend it and set model."
+abstract = true
+
+provider_instance = "LM Studio (Responses API)"
+endpoint = "/v1/responses"
diff --git a/sources/agents/lmstudio_chat.toml b/sources/agents/lmstudio_chat.toml
new file mode 100644
index 0000000..b2cb246
--- /dev/null
+++ b/sources/agents/lmstudio_chat.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+extends = "LM Studio Base Responses"
+name = "LM Studio Chat"
+description = "Local LM Studio — coding chat via the OpenAI-compatible Responses API (/v1/responses) on Gemma 4 12B (tools + vision capable). Set 'model' to the identifier of the model loaded in LM Studio."
+
+model = "google/gemma-4-12b"
+enable_tools = true
+tags = ["chat", "lmstudio", "responses", "local"]
+
+system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
+
+[body]
+max_output_tokens = 8192
+temperature = 0.7
diff --git a/sources/agents/lmstudio_completion.toml b/sources/agents/lmstudio_completion.toml
new file mode 100644
index 0000000..e4c12b8
--- /dev/null
+++ b/sources/agents/lmstudio_completion.toml
@@ -0,0 +1,24 @@
+schema_version = 1
+
+extends = "LM Studio Base Responses"
+name = "LM Studio Completion"
+description = "Local LM Studio — code completion via the Responses API: the cursor sits in a message, so the model continues the code. Use a NON-thinking instruct/code model and set `model` to whatever you have loaded (qwen2.5-coder-7b-instruct is a good pick). Avoid reasoning models: Gemma 4 (incl. -qat) and similar emit reasoning tokens before any code, the detailed completion task makes them deliberate for hundreds-to-thousands of tokens (worst on the no-op cases), and reasoning cannot be disabled via the Responses API — so they exhaust max_output_tokens and return an empty completion no matter how high it is set."
+
+model = "qwen2.5-coder-7b-instruct"
+tags = ["completion", "lmstudio", "responses", "local"]
+
+system_prompt = """
+{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
+{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
+{%- else %}{{ read_file(":/roles/code-completion.md") }}
+{%- endif %}
+{{ read_file(":/tasks/code-completion.md") }}"""
+
+[body]
+max_output_tokens = 256
+temperature = 0
+input = """
+[
+ { "role": "user", "content": {{ tojson("Here is the code context with insertion points:\\n\\n" + ctx.prefix + "" + ctx.suffix + "\\n") }} }
+]
+"""
diff --git a/sources/agents/lmstudio_compression.toml b/sources/agents/lmstudio_compression.toml
new file mode 100644
index 0000000..b9a954d
--- /dev/null
+++ b/sources/agents/lmstudio_compression.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+extends = "LM Studio Base Responses"
+name = "LM Studio Compression"
+description = "Local LM Studio — conversation summarization via the Responses API. Carries the summary system prompt; no tools. Gemma 4 12B by default; set 'model' to the loaded model's identifier."
+
+model = "google/gemma-4-12b"
+enable_tools = false
+tags = ["compression", "lmstudio", "responses", "local"]
+
+system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
+
+[body]
+max_output_tokens = 16000
+temperature = 0.3
diff --git a/sources/agents/lmstudio_quick_refactor.toml b/sources/agents/lmstudio_quick_refactor.toml
new file mode 100644
index 0000000..e134e7a
--- /dev/null
+++ b/sources/agents/lmstudio_quick_refactor.toml
@@ -0,0 +1,15 @@
+schema_version = 1
+
+extends = "LM Studio Base Responses"
+name = "LM Studio Quick Refactor"
+description = "Local LM Studio deterministic inline refactor via the Responses API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; Gemma 4 12B is tool-capable if you enable them."
+
+model = "google/gemma-4-12b"
+enable_tools = false
+tags = ["refactor", "lmstudio", "responses", "local"]
+
+system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
+
+[body]
+max_output_tokens = 8192
+temperature = 0.2
diff --git a/sources/agents/mistral_completion_codestral_fim.toml b/sources/agents/mistral_completion_codestral_fim.toml
index 3844dd2..2c6d2c1 100644
--- a/sources/agents/mistral_completion_codestral_fim.toml
+++ b/sources/agents/mistral_completion_codestral_fim.toml
@@ -10,4 +10,4 @@ tags = ["completion", "mistral", "codestral", "cloud", "fim"]
[body]
max_tokens = 256
-temperature = 0.2
+temperature = 0
diff --git a/sources/agents/ollama_completion_fim.toml b/sources/agents/ollama_completion_fim.toml
index 647548f..eaa6521 100644
--- a/sources/agents/ollama_completion_fim.toml
+++ b/sources/agents/ollama_completion_fim.toml
@@ -2,12 +2,12 @@ schema_version = 1
extends = "Ollama Base FIM"
name = "Ollama Completion — FIM"
-description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with models that ship a FIM template, and those are few: the base / '-code' variants, NOT instruct/chat models. Verified to work: qwen2.5-coder (incl. -base), codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show --modelfile` must mention 'Suffix'."
+description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with a true BASE / '-code' model. Pick a base tag explicitly: the bare 'qwen2.5-coder:7b' tag is the INSTRUCT model (it ships an im_start chat template), and on FIM it rambles whole programs and prose — use 'qwen2.5-coder:7b-base' instead. Verified base/-code FIM models: qwen2.5-coder:7b-base, codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat/instruct model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show --modelfile` must mention 'Suffix' and must NOT have an im_start/chat template."
-model = "qwen2.5-coder:7b"
+model = "qwen2.5-coder:7b-base-q5_K_M"
tags = ["completion", "ollama", "local", "fim", "8gb"]
[body.options]
num_predict = 256
-temperature = 0.2
+temperature = 0
keep_alive = "5m"
diff --git a/sources/agents/ollama_compression_16gb.toml b/sources/agents/ollama_compression_16gb.toml
index b3e3cab..e0e63d0 100644
--- a/sources/agents/ollama_compression_16gb.toml
+++ b/sources/agents/ollama_compression_16gb.toml
@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "16gb"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
+[body]
+think = false
+
[body.options]
num_predict = 2048
temperature = 0.3
diff --git a/sources/agents/ollama_compression_32gb.toml b/sources/agents/ollama_compression_32gb.toml
index 1794a26..f1362d5 100644
--- a/sources/agents/ollama_compression_32gb.toml
+++ b/sources/agents/ollama_compression_32gb.toml
@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "32gb"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
+[body]
+think = false
+
[body.options]
num_predict = 2048
temperature = 0.3
diff --git a/sources/agents/ollama_compression_8gb.toml b/sources/agents/ollama_compression_8gb.toml
index 4bbb2d4..0f4e341 100644
--- a/sources/agents/ollama_compression_8gb.toml
+++ b/sources/agents/ollama_compression_8gb.toml
@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "8gb"]
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
+[body]
+think = false
+
[body.options]
num_predict = 2048
temperature = 0.3
diff --git a/sources/agents/tasks/code-completion.md b/sources/agents/tasks/code-completion.md
index 88d0285..060c138 100644
--- a/sources/agents/tasks/code-completion.md
+++ b/sources/agents/tasks/code-completion.md
@@ -1,20 +1,25 @@
Core Requirements:
1. Continue code exactly from the cursor position, ensuring it properly connects with any existing code after the cursor
-2. Never repeat existing code before or after the cursor
+2. Never repeat existing code before or after the cursor — the text after already exists, so do not reproduce any of it
Specific Guidelines:
- For function calls: Complete parameters with appropriate types and names
- For class members: Respect access modifiers and class conventions
-- Respect existing indentation and formatting
-- Consider scope and visibility of referenced symbols
+- Respect existing indentation and formatting; do not re-emit indentation that already precedes the cursor
+- Consider scope and visibility of referenced symbols (do not use a symbol that is only declared after the cursor)
- Ensure seamless integration with code both before and after the cursor
+When nothing should be inserted, return an empty code block. This applies only when:
+- Any insertion would duplicate code that already appears after the cursor, or
+- The cursor sits in the middle of an existing identifier or type name, or between a complete type and its variable name
+Otherwise, always provide a completion (for example, fill an empty initializer list or argument list). In the no-insertion cases above, output an empty code block and nothing else — never describe the code, report errors, ask questions, or suggest alternatives.
+
Context Format:
...code before the cursor......code after the cursor...
Response Format:
-- No explanations or comments
-- Only include new characters needed to create valid code
-- Should be codeblock with language
+- Your entire response must be exactly one code block tagged with the language, and nothing else
+- Never write any sentence, note, explanation, or comment before or after the code block — not even to state that the code is already complete
+- Inside the block, include only the new characters needed at the cursor to form valid code; leave the block empty only in the no-insertion cases listed above
diff --git a/sources/external/llmqore b/sources/external/llmqore
index ea44041..4450ece 160000
--- a/sources/external/llmqore
+++ b/sources/external/llmqore
@@ -1 +1 @@
-Subproject commit ea44041b24f5220e6529812ecdb0a901080810f8
+Subproject commit 4450eceda98105e8e471b9e4fa73bfe4cb42b448
diff --git a/sources/providers/GenericProvider.cpp b/sources/providers/GenericProvider.cpp
index bc12b1d..8b09a5e 100644
--- a/sources/providers/GenericProvider.cpp
+++ b/sources/providers/GenericProvider.cpp
@@ -48,7 +48,26 @@ QFuture> GenericProvider::getInstalledModels(const QString &url)
{
m_client->setUrl(url);
m_client->setApiKey(apiKey());
- return m_client->listModels();
+ return m_client->listModels(modelsEndpoint(url));
+}
+
+QString GenericProvider::modelsEndpoint(const QString &url) const // Chooses the endpoint string that getInstalledModels() passes to m_client->listModels(), based on m_id and the base url.
+{
+ switch (m_id) { // only the provider IDs listed here get a computed endpoint
+ case ProviderID::OpenAI:
+ case ProviderID::OpenAIResponses:
+ case ProviderID::OpenAICompatible:
+ case ProviderID::LMStudio:
+ case ProviderID::OpenRouter:
+ break; // leave the switch and derive the endpoint from url below
+ default:
+ return {}; // every other provider gets an empty string; presumably the client then uses its own default models path (TODO confirm in LLMQore)
+ }
+
+ QString base = url; // local copy so trailing slashes can be trimmed without touching the argument
+ while (base.endsWith('/')) // drop every trailing '/', so ".../v1/" is treated the same as ".../v1"
+ base.chop(1);
+ return base.endsWith("/v1") ? QStringLiteral("/models") : QStringLiteral("/v1/models"); // "/models" when the base already ends in "/v1", otherwise "/v1/models"; presumably avoids a doubled "/v1" once the client joins url and endpoint (verify)
}
RequestID GenericProvider::sendRequest(
diff --git a/sources/providers/GenericProvider.hpp b/sources/providers/GenericProvider.hpp
index a9f11f2..c1c8c1d 100644
--- a/sources/providers/GenericProvider.hpp
+++ b/sources/providers/GenericProvider.hpp
@@ -39,6 +39,8 @@ public:
const QUrl &url, const QJsonObject &payload, const QString &endpoint) override;
private:
+ QString modelsEndpoint(const QString &url) const; // Endpoint string handed to the client's listModels() by getInstalledModels(); empty unless m_id is one of the provider IDs handled in its switch.
+
QString m_name;
ProviderID m_id;
::LLMQore::BaseClient *m_client;
diff --git a/test/ResponseRouterTest.cpp b/test/ResponseRouterTest.cpp
index 133aaab..1db5eb5 100644
--- a/test/ResponseRouterTest.cpp
+++ b/test/ResponseRouterTest.cpp
@@ -58,7 +58,7 @@ protected:
return {};
}
LLMQore::RequestID ask(const QString &, LLMQore::RequestMode) override { return {}; }
- QFuture> listModels() override { return {}; }
+ QFuture> listModels(const QString & = {}) override { return {}; } // Stub: ignores the (defaulted, unnamed) endpoint argument and returns a default-constructed future.
LLMQore::ToolSchemaFormat toolSchemaFormat() const override
{
return LLMQore::ToolSchemaFormat::Claude;