mirror of
https://github.com/Palm1r/QodeAssist.git
synced 2026-06-30 18:19:11 -04:00
feat: Improve agents config
This commit is contained in:
@@ -18,6 +18,9 @@
|
||||
<file>openai_quick_refactor.toml</file>
|
||||
<file>google_base_chat.toml</file>
|
||||
<file>google_chat.toml</file>
|
||||
<file>google_completion.toml</file>
|
||||
<file>google_compression.toml</file>
|
||||
<file>google_quick_refactor.toml</file>
|
||||
<file>mistral_base_chat.toml</file>
|
||||
<file>mistral_chat.toml</file>
|
||||
<file>mistral_chat_reasoning.toml</file>
|
||||
@@ -40,6 +43,16 @@
|
||||
<file>ollama_compression_16gb.toml</file>
|
||||
<file>ollama_compression_32gb.toml</file>
|
||||
<file>ollama_chat_gemma4.toml</file>
|
||||
<file>llamacpp_base_chat.toml</file>
|
||||
<file>llamacpp_chat.toml</file>
|
||||
<file>llamacpp_completion_fim.toml</file>
|
||||
<file>llamacpp_compression.toml</file>
|
||||
<file>llamacpp_quick_refactor.toml</file>
|
||||
<file>lmstudio_base_responses.toml</file>
|
||||
<file>lmstudio_chat.toml</file>
|
||||
<file>lmstudio_completion.toml</file>
|
||||
<file>lmstudio_compression.toml</file>
|
||||
<file>lmstudio_quick_refactor.toml</file>
|
||||
</qresource>
|
||||
<qresource prefix="/roles">
|
||||
<file alias="qt-cpp-developer.md">roles/qt-cpp-developer.md</file>
|
||||
|
||||
@@ -16,7 +16,7 @@ system_prompt = """
|
||||
|
||||
[body]
|
||||
max_tokens = 512
|
||||
temperature = 0.2
|
||||
temperature = 0
|
||||
stop_sequences = ["</code_context>"]
|
||||
messages = """
|
||||
[
|
||||
|
||||
@@ -9,4 +9,4 @@ tags = ["completion", "codestral", "mistral", "cloud", "fim"]
|
||||
|
||||
[body]
|
||||
max_tokens = 256
|
||||
temperature = 0.2
|
||||
temperature = 0
|
||||
|
||||
31
sources/agents/google_completion.toml
Normal file
31
sources/agents/google_completion.toml
Normal file
@@ -0,0 +1,31 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "Google Base Chat"
|
||||
name = "Google Completion"
|
||||
description = "Google Gemini 3.1 Flash-Lite — code completion using the <code_context> chat format over generateContent. Thinking disabled (thinkingBudget=0) and temperature=0 for fast, deterministic insertions; stops at </code_context>."
|
||||
|
||||
model = "gemini-3.1-flash-lite"
|
||||
tags = ["completion", "gemini", "google", "cloud"]
|
||||
|
||||
system_prompt = """
|
||||
{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
|
||||
{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
|
||||
{%- else %}{{ read_file(":/roles/code-completion.md") }}
|
||||
{%- endif %}
|
||||
{{ read_file(":/tasks/code-completion.md") }}"""
|
||||
|
||||
[body]
|
||||
contents = """
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"parts": [ { "text": {{ tojson("Here is the code context with insertion points:\\n<code_context>\\n" + ctx.prefix + "<cursor>" + ctx.suffix + "\\n</code_context>") }} } ]
|
||||
}
|
||||
]
|
||||
"""
|
||||
|
||||
[body.generationConfig]
|
||||
maxOutputTokens = 1024
|
||||
temperature = 0
|
||||
stopSequences = ["</code_context>"]
|
||||
thinkingConfig = { thinkingBudget = 0 }
|
||||
16
sources/agents/google_compression.toml
Normal file
16
sources/agents/google_compression.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "Google Base Chat"
|
||||
name = "Google Compression"
|
||||
description = "Google Gemini 3.1 Flash-Lite — fast, low-cost conversation summarization. Carries the summary system prompt; no tools, thinking disabled (thinkingBudget=0)."
|
||||
|
||||
model = "gemini-3.1-flash-lite"
|
||||
enable_tools = false
|
||||
tags = ["compression", "gemini", "google", "cloud"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
|
||||
|
||||
[body.generationConfig]
|
||||
maxOutputTokens = 16000
|
||||
temperature = 0.3
|
||||
thinkingConfig = { thinkingBudget = 0 }
|
||||
17
sources/agents/google_quick_refactor.toml
Normal file
17
sources/agents/google_quick_refactor.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "Google Base Chat"
|
||||
name = "Google Quick Refactor"
|
||||
description = "Google Gemini 3.5 Flash — agentic inline refactor with tools and thinking (gathers context before editing). Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context (file, code, cursor/selection)."
|
||||
|
||||
model = "gemini-3.5-flash"
|
||||
enable_tools = true
|
||||
enable_thinking = true
|
||||
tags = ["refactor", "gemini", "google", "cloud"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
|
||||
|
||||
[body.generationConfig]
|
||||
maxOutputTokens = 16000
|
||||
temperature = 1
|
||||
thinkingConfig = { includeThoughts = true, thinkingBudget = 8192 }
|
||||
9
sources/agents/llamacpp_base_chat.toml
Normal file
9
sources/agents/llamacpp_base_chat.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "OpenAI Base Chat"
|
||||
name = "llama.cpp Base Chat"
|
||||
description = "llama.cpp server Chat Completions request body (OpenAI-compatible /v1/chat/completions). Abstract — extend it and set model."
|
||||
abstract = true
|
||||
|
||||
provider_instance = "llama.cpp"
|
||||
endpoint = "/v1/chat/completions"
|
||||
15
sources/agents/llamacpp_chat.toml
Normal file
15
sources/agents/llamacpp_chat.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "llama.cpp Base Chat"
|
||||
name = "llama.cpp Chat"
|
||||
description = "Local llama.cpp (llama-server) — coding chat via the OpenAI-compatible Chat Completions API. llama-server serves whichever GGUF you loaded, so 'model' is only a label. Tool calling needs a tool-capable model and llama-server started with --jinja."
|
||||
|
||||
model = "qwen2.5-coder-7b-instruct"
|
||||
enable_tools = true
|
||||
tags = ["chat", "llama.cpp", "local"]
|
||||
|
||||
system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
|
||||
|
||||
[body]
|
||||
max_tokens = 8192
|
||||
temperature = 0.7
|
||||
15
sources/agents/llamacpp_completion_fim.toml
Normal file
15
sources/agents/llamacpp_completion_fim.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
name = "llama.cpp Completion — FIM"
|
||||
description = "Local llama.cpp native fill-in-the-middle via the /infill endpoint (input_prefix + input_suffix). Fast and clean, but the loaded GGUF MUST be a FIM-trained model (qwen2.5-coder, codellama-code, deepseek-coder, starcoder2, codegemma). A plain chat model produces garbage here — there is no native completion path for those. llama-server serves whichever model is loaded, so 'model' is only a label."
|
||||
|
||||
provider_instance = "llama.cpp"
|
||||
endpoint = "/infill"
|
||||
model = "qwen2.5-coder-7b"
|
||||
tags = ["completion", "llama.cpp", "local", "fim"]
|
||||
|
||||
[body]
|
||||
input_prefix = """{{ tojson(ctx.prefix) }}"""
|
||||
input_suffix = """{% if existsIn(ctx, "suffix") %}{{ tojson(ctx.suffix) }}{% endif %}"""
|
||||
n_predict = 256
|
||||
temperature = 0.2
|
||||
15
sources/agents/llamacpp_compression.toml
Normal file
15
sources/agents/llamacpp_compression.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "llama.cpp Base Chat"
|
||||
name = "llama.cpp Compression"
|
||||
description = "Local llama.cpp — conversation summarization via the OpenAI-compatible Chat Completions API. Carries the summary system prompt; no tools. llama-server serves whichever GGUF is loaded, so 'model' is only a label."
|
||||
|
||||
model = "qwen2.5-coder-7b-instruct"
|
||||
enable_tools = false
|
||||
tags = ["compression", "llama.cpp", "local"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
|
||||
|
||||
[body]
|
||||
max_tokens = 16000
|
||||
temperature = 0.3
|
||||
15
sources/agents/llamacpp_quick_refactor.toml
Normal file
15
sources/agents/llamacpp_quick_refactor.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "llama.cpp Base Chat"
|
||||
name = "llama.cpp Quick Refactor"
|
||||
description = "Local llama.cpp deterministic inline refactor via the OpenAI-compatible Chat Completions API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; enabling them needs a tool-capable model + llama-server --jinja."
|
||||
|
||||
model = "qwen2.5-coder-7b-instruct"
|
||||
enable_tools = false
|
||||
tags = ["refactor", "llama.cpp", "local"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
|
||||
|
||||
[body]
|
||||
max_tokens = 8192
|
||||
temperature = 0.2
|
||||
9
sources/agents/lmstudio_base_responses.toml
Normal file
9
sources/agents/lmstudio_base_responses.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "OpenAI Base Responses"
|
||||
name = "LM Studio Base Responses"
|
||||
description = "LM Studio Responses API request body (OpenAI-compatible /v1/responses). Abstract — extend it and set model."
|
||||
abstract = true
|
||||
|
||||
provider_instance = "LM Studio (Responses API)"
|
||||
endpoint = "/v1/responses"
|
||||
15
sources/agents/lmstudio_chat.toml
Normal file
15
sources/agents/lmstudio_chat.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "LM Studio Base Responses"
|
||||
name = "LM Studio Chat"
|
||||
description = "Local LM Studio — coding chat via the OpenAI-compatible Responses API (/v1/responses) on Gemma 4 12B (tools + vision capable). Set 'model' to the identifier of the model loaded in LM Studio."
|
||||
|
||||
model = "google/gemma-4-12b"
|
||||
enable_tools = true
|
||||
tags = ["chat", "lmstudio", "responses", "local"]
|
||||
|
||||
system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}"""
|
||||
|
||||
[body]
|
||||
max_output_tokens = 8192
|
||||
temperature = 0.7
|
||||
24
sources/agents/lmstudio_completion.toml
Normal file
24
sources/agents/lmstudio_completion.toml
Normal file
@@ -0,0 +1,24 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "LM Studio Base Responses"
|
||||
name = "LM Studio Completion"
|
||||
description = "Local LM Studio — code completion via the Responses API: the cursor sits in a <code_context> message, so the model continues the code. Use a NON-thinking instruct/code model and set `model` to whatever you have loaded (qwen2.5-coder-7b-instruct is a good pick). Avoid reasoning models: Gemma 4 (incl. -qat) and similar emit reasoning tokens before any code, the detailed completion task makes them deliberate for hundreds-to-thousands of tokens (worst on the no-op cases), and reasoning cannot be disabled via the Responses API — so they exhaust max_output_tokens and return an empty completion no matter how high it is set."
|
||||
|
||||
model = "qwen2.5-coder-7b-instruct"
|
||||
tags = ["completion", "lmstudio", "responses", "local"]
|
||||
|
||||
system_prompt = """
|
||||
{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }}
|
||||
{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }}
|
||||
{%- else %}{{ read_file(":/roles/code-completion.md") }}
|
||||
{%- endif %}
|
||||
{{ read_file(":/tasks/code-completion.md") }}"""
|
||||
|
||||
[body]
|
||||
max_output_tokens = 256
|
||||
temperature = 0
|
||||
input = """
|
||||
[
|
||||
{ "role": "user", "content": {{ tojson("Here is the code context with insertion points:\\n<code_context>\\n" + ctx.prefix + "<cursor>" + ctx.suffix + "\\n</code_context>") }} }
|
||||
]
|
||||
"""
|
||||
15
sources/agents/lmstudio_compression.toml
Normal file
15
sources/agents/lmstudio_compression.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "LM Studio Base Responses"
|
||||
name = "LM Studio Compression"
|
||||
description = "Local LM Studio — conversation summarization via the Responses API. Carries the summary system prompt; no tools. Gemma 4 12B by default; set 'model' to the loaded model's identifier."
|
||||
|
||||
model = "google/gemma-4-12b"
|
||||
enable_tools = false
|
||||
tags = ["compression", "lmstudio", "responses", "local"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
|
||||
|
||||
[body]
|
||||
max_output_tokens = 16000
|
||||
temperature = 0.3
|
||||
15
sources/agents/lmstudio_quick_refactor.toml
Normal file
15
sources/agents/lmstudio_quick_refactor.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
schema_version = 1
|
||||
|
||||
extends = "LM Studio Base Responses"
|
||||
name = "LM Studio Quick Refactor"
|
||||
description = "Local LM Studio deterministic inline refactor via the Responses API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; Gemma 4 12B is tool-capable if you enable them."
|
||||
|
||||
model = "google/gemma-4-12b"
|
||||
enable_tools = false
|
||||
tags = ["refactor", "lmstudio", "responses", "local"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}"""
|
||||
|
||||
[body]
|
||||
max_output_tokens = 8192
|
||||
temperature = 0.2
|
||||
@@ -10,4 +10,4 @@ tags = ["completion", "mistral", "codestral", "cloud", "fim"]
|
||||
|
||||
[body]
|
||||
max_tokens = 256
|
||||
temperature = 0.2
|
||||
temperature = 0
|
||||
|
||||
@@ -2,12 +2,12 @@ schema_version = 1
|
||||
|
||||
extends = "Ollama Base FIM"
|
||||
name = "Ollama Completion — FIM"
|
||||
description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with models that ship a FIM template, and those are few: the base / '-code' variants, NOT instruct/chat models. Verified to work: qwen2.5-coder (incl. -base), codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show <model> --modelfile` must mention 'Suffix'."
|
||||
description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with a true BASE / '-code' model. Pick a base tag explicitly: the bare 'qwen2.5-coder:7b' tag is the INSTRUCT model (it ships an im_start chat template), and on FIM it rambles whole programs and prose — use 'qwen2.5-coder:7b-base' instead. Verified base/-code FIM models: qwen2.5-coder:7b-base, codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat/instruct model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show <model> --modelfile` must mention 'Suffix' and must NOT have an im_start/chat template."
|
||||
|
||||
model = "qwen2.5-coder:7b"
|
||||
model = "qwen2.5-coder:7b-base-q5_K_M"
|
||||
tags = ["completion", "ollama", "local", "fim", "8gb"]
|
||||
|
||||
[body.options]
|
||||
num_predict = 256
|
||||
temperature = 0.2
|
||||
temperature = 0
|
||||
keep_alive = "5m"
|
||||
|
||||
@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "16gb"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
|
||||
|
||||
[body]
|
||||
think = false
|
||||
|
||||
[body.options]
|
||||
num_predict = 2048
|
||||
temperature = 0.3
|
||||
|
||||
@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "32gb"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
|
||||
|
||||
[body]
|
||||
think = false
|
||||
|
||||
[body.options]
|
||||
num_predict = 2048
|
||||
temperature = 0.3
|
||||
|
||||
@@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "8gb"]
|
||||
|
||||
system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}"""
|
||||
|
||||
[body]
|
||||
think = false
|
||||
|
||||
[body.options]
|
||||
num_predict = 2048
|
||||
temperature = 0.3
|
||||
|
||||
@@ -1,20 +1,25 @@
|
||||
Core Requirements:
|
||||
1. Continue code exactly from the cursor position, ensuring it properly connects with any existing code after the cursor
|
||||
2. Never repeat existing code before or after the cursor
|
||||
2. Never repeat existing code before or after the cursor — the text after <cursor> already exists, so do not reproduce any of it
|
||||
|
||||
Specific Guidelines:
|
||||
- For function calls: Complete parameters with appropriate types and names
|
||||
- For class members: Respect access modifiers and class conventions
|
||||
- Respect existing indentation and formatting
|
||||
- Consider scope and visibility of referenced symbols
|
||||
- Respect existing indentation and formatting; do not re-emit indentation that already precedes the cursor
|
||||
- Consider scope and visibility of referenced symbols (do not use a symbol that is only declared after the cursor)
|
||||
- Ensure seamless integration with code both before and after the cursor
|
||||
|
||||
When nothing should be inserted, return an empty code block. This applies only when:
|
||||
- Any insertion would duplicate code that already appears after the cursor, or
|
||||
- The cursor sits in the middle of an existing identifier or type name, or between a complete type and its variable name
|
||||
Otherwise, always provide a completion (for example, fill an empty initializer list or argument list). In the no-insertion cases above, output an empty code block and nothing else — never describe the code, report errors, ask questions, or suggest alternatives.
|
||||
|
||||
Context Format:
|
||||
<code_context>
|
||||
...code before the cursor...<cursor>...code after the cursor...
|
||||
</code_context>
|
||||
|
||||
Response Format:
|
||||
- No explanations or comments
|
||||
- Only include new characters needed to create valid code
|
||||
- Should be codeblock with language
|
||||
- Your entire response must be exactly one code block tagged with the language, and nothing else
|
||||
- Never write any sentence, note, explanation, or comment before or after the code block — not even to state that the code is already complete
|
||||
- Inside the block, include only the new characters needed at the cursor to form valid code; leave the block empty only in the no-insertion cases listed above
|
||||
|
||||
2
sources/external/llmqore
vendored
2
sources/external/llmqore
vendored
Submodule sources/external/llmqore updated: ea44041b24...4450eceda9
@@ -48,7 +48,26 @@ QFuture<QList<QString>> GenericProvider::getInstalledModels(const QString &url)
|
||||
{
|
||||
m_client->setUrl(url);
|
||||
m_client->setApiKey(apiKey());
|
||||
return m_client->listModels();
|
||||
return m_client->listModels(modelsEndpoint(url));
|
||||
}
|
||||
|
||||
// Chooses the endpoint path that getInstalledModels() hands to
// m_client->listModels() for the given base URL.
//
// Returns an empty QString for every provider id other than OpenAI,
// OpenAIResponses, OpenAICompatible, LMStudio and OpenRouter.
// NOTE(review): presumably an empty endpoint makes the client fall back to
// its own default models path — confirm against BaseClient::listModels().
//
// For the listed providers, returns "/models" when the URL (ignoring any
// trailing slashes) already ends with "/v1", and "/v1/models" otherwise.
QString GenericProvider::modelsEndpoint(const QString &url) const
|
||||
{
|
||||
// Only the provider ids listed below get a computed endpoint; all others
// return early with an empty string.
switch (m_id) {
|
||||
case ProviderID::OpenAI:
|
||||
case ProviderID::OpenAIResponses:
|
||||
case ProviderID::OpenAICompatible:
|
||||
case ProviderID::LMStudio:
|
||||
case ProviderID::OpenRouter:
|
||||
break;
|
||||
default:
|
||||
return {};
|
||||
}
|
||||
|
||||
// Work on a copy with trailing slashes removed so "…/v1" and "…/v1/" are
// treated the same; the caller's url is not modified.
QString base = url;
|
||||
while (base.endsWith('/'))
|
||||
base.chop(1);
|
||||
// Avoid repeating the "/v1" segment when the base URL already carries it.
return base.endsWith("/v1") ? QStringLiteral("/models") : QStringLiteral("/v1/models");
|
||||
}
|
||||
|
||||
RequestID GenericProvider::sendRequest(
|
||||
|
||||
@@ -39,6 +39,8 @@ public:
|
||||
const QUrl &url, const QJsonObject &payload, const QString &endpoint) override;
|
||||
|
||||
private:
|
||||
QString modelsEndpoint(const QString &url) const;
|
||||
|
||||
QString m_name;
|
||||
ProviderID m_id;
|
||||
::LLMQore::BaseClient *m_client;
|
||||
|
||||
@@ -58,7 +58,7 @@ protected:
|
||||
return {};
|
||||
}
|
||||
LLMQore::RequestID ask(const QString &, LLMQore::RequestMode) override { return {}; }
|
||||
QFuture<QList<QString>> listModels() override { return {}; }
|
||||
QFuture<QList<QString>> listModels(const QString & = {}) override { return {}; }
|
||||
LLMQore::ToolSchemaFormat toolSchemaFormat() const override
|
||||
{
|
||||
return LLMQore::ToolSchemaFormat::Claude;
|
||||
|
||||
Reference in New Issue
Block a user