diff --git a/sources/agents/agents.qrc b/sources/agents/agents.qrc index ab8f57a..9359bc0 100644 --- a/sources/agents/agents.qrc +++ b/sources/agents/agents.qrc @@ -18,6 +18,9 @@ openai_quick_refactor.toml google_base_chat.toml google_chat.toml + google_completion.toml + google_compression.toml + google_quick_refactor.toml mistral_base_chat.toml mistral_chat.toml mistral_chat_reasoning.toml @@ -40,6 +43,16 @@ ollama_compression_16gb.toml ollama_compression_32gb.toml ollama_chat_gemma4.toml + llamacpp_base_chat.toml + llamacpp_chat.toml + llamacpp_completion_fim.toml + llamacpp_compression.toml + llamacpp_quick_refactor.toml + lmstudio_base_responses.toml + lmstudio_chat.toml + lmstudio_completion.toml + lmstudio_compression.toml + lmstudio_quick_refactor.toml roles/qt-cpp-developer.md diff --git a/sources/agents/claude_completion.toml b/sources/agents/claude_completion.toml index 5bc219b..a98357a 100644 --- a/sources/agents/claude_completion.toml +++ b/sources/agents/claude_completion.toml @@ -16,7 +16,7 @@ system_prompt = """ [body] max_tokens = 512 -temperature = 0.2 +temperature = 0 stop_sequences = [""] messages = """ [ diff --git a/sources/agents/codestral_completion_fim.toml b/sources/agents/codestral_completion_fim.toml index 8d5eb10..fbee554 100644 --- a/sources/agents/codestral_completion_fim.toml +++ b/sources/agents/codestral_completion_fim.toml @@ -9,4 +9,4 @@ tags = ["completion", "codestral", "mistral", "cloud", "fim"] [body] max_tokens = 256 -temperature = 0.2 +temperature = 0 diff --git a/sources/agents/google_completion.toml b/sources/agents/google_completion.toml new file mode 100644 index 0000000..0cee39b --- /dev/null +++ b/sources/agents/google_completion.toml @@ -0,0 +1,31 @@ +schema_version = 1 + +extends = "Google Base Chat" +name = "Google Completion" +description = "Google Gemini 3.1 Flash-Lite — code completion using the chat format over generateContent. Thinking disabled (thinkingBudget=0) and temperature=0 for fast, deterministic insertions; stops at ." + +model = "gemini-3.1-flash-lite" +tags = ["completion", "gemini", "google", "cloud"] + +system_prompt = """ +{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }} +{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }} +{%- else %}{{ read_file(":/roles/code-completion.md") }} +{%- endif %} +{{ read_file(":/tasks/code-completion.md") }}""" + +[body] +contents = """ +[ + { + "role": "user", + "parts": [ { "text": {{ tojson("Here is the code context with insertion points:\\n\\n" + ctx.prefix + "" + ctx.suffix + "\\n") }} } ] + } +] +""" + +[body.generationConfig] +maxOutputTokens = 1024 +temperature = 0 +stopSequences = [""] +thinkingConfig = { thinkingBudget = 0 } diff --git a/sources/agents/google_compression.toml b/sources/agents/google_compression.toml new file mode 100644 index 0000000..acbdac5 --- /dev/null +++ b/sources/agents/google_compression.toml @@ -0,0 +1,16 @@ +schema_version = 1 + +extends = "Google Base Chat" +name = "Google Compression" +description = "Google Gemini 3.1 Flash-Lite — fast, low-cost conversation summarization. Carries the summary system prompt; no tools, thinking disabled (thinkingBudget=0)." + +model = "gemini-3.1-flash-lite" +enable_tools = false +tags = ["compression", "gemini", "google", "cloud"] + +system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}""" + +[body.generationConfig] +maxOutputTokens = 16000 +temperature = 0.3 +thinkingConfig = { thinkingBudget = 0 } diff --git a/sources/agents/google_quick_refactor.toml b/sources/agents/google_quick_refactor.toml new file mode 100644 index 0000000..8cfcc94 --- /dev/null +++ b/sources/agents/google_quick_refactor.toml @@ -0,0 +1,17 @@ +schema_version = 1 + +extends = "Google Base Chat" +name = "Google Quick Refactor" +description = "Google Gemini 3.5 Flash — agentic inline refactor with tools and thinking (gathers context before editing). Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context (file, code, cursor/selection)." + +model = "gemini-3.5-flash" +enable_tools = true +enable_thinking = true +tags = ["refactor", "gemini", "google", "cloud"] + +system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}""" + +[body.generationConfig] +maxOutputTokens = 16000 +temperature = 1 +thinkingConfig = { includeThoughts = true, thinkingBudget = 8192 } diff --git a/sources/agents/llamacpp_base_chat.toml b/sources/agents/llamacpp_base_chat.toml new file mode 100644 index 0000000..56e714f --- /dev/null +++ b/sources/agents/llamacpp_base_chat.toml @@ -0,0 +1,9 @@ +schema_version = 1 + +extends = "OpenAI Base Chat" +name = "llama.cpp Base Chat" +description = "llama.cpp server Chat Completions request body (OpenAI-compatible /v1/chat/completions). Abstract — extend it and set model." +abstract = true + +provider_instance = "llama.cpp" +endpoint = "/v1/chat/completions" diff --git a/sources/agents/llamacpp_chat.toml b/sources/agents/llamacpp_chat.toml new file mode 100644 index 0000000..eb76897 --- /dev/null +++ b/sources/agents/llamacpp_chat.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +extends = "llama.cpp Base Chat" +name = "llama.cpp Chat" +description = "Local llama.cpp (llama-server) — coding chat via the OpenAI-compatible Chat Completions API. llama-server serves whichever GGUF you loaded, so 'model' is only a label. Tool calling needs a tool-capable model and llama-server started with --jinja." + +model = "qwen2.5-coder-7b-instruct" +enable_tools = true +tags = ["chat", "llama.cpp", "local"] + +system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}""" + +[body] +max_tokens = 8192 +temperature = 0.7 diff --git a/sources/agents/llamacpp_completion_fim.toml b/sources/agents/llamacpp_completion_fim.toml new file mode 100644 index 0000000..d63120b --- /dev/null +++ b/sources/agents/llamacpp_completion_fim.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +name = "llama.cpp Completion — FIM" +description = "Local llama.cpp native fill-in-the-middle via the /infill endpoint (input_prefix + input_suffix). Fast and clean, but the loaded GGUF MUST be a FIM-trained model (qwen2.5-coder, codellama-code, deepseek-coder, starcoder2, codegemma). A plain chat model produces garbage here — there is no native completion path for those. llama-server serves whichever model is loaded, so 'model' is only a label." + +provider_instance = "llama.cpp" +endpoint = "/infill" +model = "qwen2.5-coder-7b" +tags = ["completion", "llama.cpp", "local", "fim"] + +[body] +input_prefix = """{{ tojson(ctx.prefix) }}""" +input_suffix = """{% if existsIn(ctx, "suffix") %}{{ tojson(ctx.suffix) }}{% endif %}""" +n_predict = 256 +temperature = 0.2 diff --git a/sources/agents/llamacpp_compression.toml b/sources/agents/llamacpp_compression.toml new file mode 100644 index 0000000..379c4d1 --- /dev/null +++ b/sources/agents/llamacpp_compression.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +extends = "llama.cpp Base Chat" +name = "llama.cpp Compression" +description = "Local llama.cpp — conversation summarization via the OpenAI-compatible Chat Completions API. Carries the summary system prompt; no tools. llama-server serves whichever GGUF is loaded, so 'model' is only a label." + +model = "qwen2.5-coder-7b-instruct" +enable_tools = false +tags = ["compression", "llama.cpp", "local"] + +system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}""" + +[body] +max_tokens = 16000 +temperature = 0.3 diff --git a/sources/agents/llamacpp_quick_refactor.toml b/sources/agents/llamacpp_quick_refactor.toml new file mode 100644 index 0000000..7658365 --- /dev/null +++ b/sources/agents/llamacpp_quick_refactor.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +extends = "llama.cpp Base Chat" +name = "llama.cpp Quick Refactor" +description = "Local llama.cpp deterministic inline refactor via the OpenAI-compatible Chat Completions API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; enabling them needs a tool-capable model + llama-server --jinja." + +model = "qwen2.5-coder-7b-instruct" +enable_tools = false +tags = ["refactor", "llama.cpp", "local"] + +system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}""" + +[body] +max_tokens = 8192 +temperature = 0.2 diff --git a/sources/agents/lmstudio_base_responses.toml b/sources/agents/lmstudio_base_responses.toml new file mode 100644 index 0000000..3c1c282 --- /dev/null +++ b/sources/agents/lmstudio_base_responses.toml @@ -0,0 +1,9 @@ +schema_version = 1 + +extends = "OpenAI Base Responses" +name = "LM Studio Base Responses" +description = "LM Studio Responses API request body (OpenAI-compatible /v1/responses). Abstract — extend it and set model." +abstract = true + +provider_instance = "LM Studio (Responses API)" +endpoint = "/v1/responses" diff --git a/sources/agents/lmstudio_chat.toml b/sources/agents/lmstudio_chat.toml new file mode 100644 index 0000000..b2cb246 --- /dev/null +++ b/sources/agents/lmstudio_chat.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +extends = "LM Studio Base Responses" +name = "LM Studio Chat" +description = "Local LM Studio — coding chat via the OpenAI-compatible Responses API (/v1/responses) on Gemma 4 12B (tools + vision capable). Set 'model' to the identifier of the model loaded in LM Studio." + +model = "google/gemma-4-12b" +enable_tools = true +tags = ["chat", "lmstudio", "responses", "local"] + +system_prompt = """{{ read_file(":/roles/qt-cpp-developer.md") }}""" + +[body] +max_output_tokens = 8192 +temperature = 0.7 diff --git a/sources/agents/lmstudio_completion.toml b/sources/agents/lmstudio_completion.toml new file mode 100644 index 0000000..e4c12b8 --- /dev/null +++ b/sources/agents/lmstudio_completion.toml @@ -0,0 +1,24 @@ +schema_version = 1 + +extends = "LM Studio Base Responses" +name = "LM Studio Completion" +description = "Local LM Studio — code completion via the Responses API: the cursor sits in a message, so the model continues the code. Use a NON-thinking instruct/code model and set `model` to whatever you have loaded (qwen2.5-coder-7b-instruct is a good pick). Avoid reasoning models: Gemma 4 (incl. -qat) and similar emit reasoning tokens before any code, the detailed completion task makes them deliberate for hundreds-to-thousands of tokens (worst on the no-op cases), and reasoning cannot be disabled via the Responses API — so they exhaust max_output_tokens and return an empty completion no matter how high it is set." + +model = "qwen2.5-coder-7b-instruct" +tags = ["completion", "lmstudio", "responses", "local"] + +system_prompt = """ +{%- if language == "qml" %}{{ read_file(":/roles/code-completion-qml.md") }} +{%- else if language == "c-like" %}{{ read_file(":/roles/code-completion-c-like.md") }} +{%- else %}{{ read_file(":/roles/code-completion.md") }} +{%- endif %} +{{ read_file(":/tasks/code-completion.md") }}""" + +[body] +max_output_tokens = 256 +temperature = 0 +input = """ +[ + { "role": "user", "content": {{ tojson("Here is the code context with insertion points:\\n\\n" + ctx.prefix + "" + ctx.suffix + "\\n") }} } +] +""" diff --git a/sources/agents/lmstudio_compression.toml b/sources/agents/lmstudio_compression.toml new file mode 100644 index 0000000..b9a954d --- /dev/null +++ b/sources/agents/lmstudio_compression.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +extends = "LM Studio Base Responses" +name = "LM Studio Compression" +description = "Local LM Studio — conversation summarization via the Responses API. Carries the summary system prompt; no tools. Gemma 4 12B by default; set 'model' to the loaded model's identifier." + +model = "google/gemma-4-12b" +enable_tools = false +tags = ["compression", "lmstudio", "responses", "local"] + +system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}""" + +[body] +max_output_tokens = 16000 +temperature = 0.3 diff --git a/sources/agents/lmstudio_quick_refactor.toml b/sources/agents/lmstudio_quick_refactor.toml new file mode 100644 index 0000000..e134e7a --- /dev/null +++ b/sources/agents/lmstudio_quick_refactor.toml @@ -0,0 +1,15 @@ +schema_version = 1 + +extends = "LM Studio Base Responses" +name = "LM Studio Quick Refactor" +description = "Local LM Studio deterministic inline refactor via the Responses API. Static output rules from :/tasks/quick-refactor.md; QuickRefactorHandler injects the live editor context. Tools off by default for a fast single shot; Gemma 4 12B is tool-capable if you enable them." + +model = "google/gemma-4-12b" +enable_tools = false +tags = ["refactor", "lmstudio", "responses", "local"] + +system_prompt = """{{ read_file(":/tasks/quick-refactor.md") }}""" + +[body] +max_output_tokens = 8192 +temperature = 0.2 diff --git a/sources/agents/mistral_completion_codestral_fim.toml b/sources/agents/mistral_completion_codestral_fim.toml index 3844dd2..2c6d2c1 100644 --- a/sources/agents/mistral_completion_codestral_fim.toml +++ b/sources/agents/mistral_completion_codestral_fim.toml @@ -10,4 +10,4 @@ tags = ["completion", "mistral", "codestral", "cloud", "fim"] [body] max_tokens = 256 -temperature = 0.2 +temperature = 0 diff --git a/sources/agents/ollama_completion_fim.toml b/sources/agents/ollama_completion_fim.toml index 647548f..eaa6521 100644 --- a/sources/agents/ollama_completion_fim.toml +++ b/sources/agents/ollama_completion_fim.toml @@ -2,12 +2,12 @@ schema_version = 1 extends = "Ollama Base FIM" name = "Ollama Completion — FIM" -description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with models that ship a FIM template, and those are few: the base / '-code' variants, NOT instruct/chat models. Verified to work: qwen2.5-coder (incl. -base), codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show --modelfile` must mention 'Suffix'." +description = "Native fill-in-the-middle completion — uses the model's OWN FIM template (prompt+suffix on /api/generate). Fast and clean (no markdown or prose), but works ONLY with a true BASE / '-code' model. Pick a base tag explicitly: the bare 'qwen2.5-coder:7b' tag is the INSTRUCT model (it ships an im_start chat template), and on FIM it rambles whole programs and prose — use 'qwen2.5-coder:7b-base' instead. Verified base/-code FIM models: qwen2.5-coder:7b-base, codellama:7b-code, deepseek-coder-v2 lite-base. A plain chat/instruct model outputs garbage here — use 'Ollama Completion — Chat-style' instead. Check a model: `ollama show --modelfile` must mention 'Suffix' and must NOT have an im_start/chat template." -model = "qwen2.5-coder:7b" +model = "qwen2.5-coder:7b-base-q5_K_M" tags = ["completion", "ollama", "local", "fim", "8gb"] [body.options] num_predict = 256 -temperature = 0.2 +temperature = 0 keep_alive = "5m" diff --git a/sources/agents/ollama_compression_16gb.toml b/sources/agents/ollama_compression_16gb.toml index b3e3cab..e0e63d0 100644 --- a/sources/agents/ollama_compression_16gb.toml +++ b/sources/agents/ollama_compression_16gb.toml @@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "16gb"] system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}""" +[body] +think = false + [body.options] num_predict = 2048 temperature = 0.3 diff --git a/sources/agents/ollama_compression_32gb.toml b/sources/agents/ollama_compression_32gb.toml index 1794a26..f1362d5 100644 --- a/sources/agents/ollama_compression_32gb.toml +++ b/sources/agents/ollama_compression_32gb.toml @@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "32gb"] system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}""" +[body] +think = false + [body.options] num_predict = 2048 temperature = 0.3 diff --git a/sources/agents/ollama_compression_8gb.toml b/sources/agents/ollama_compression_8gb.toml index 4bbb2d4..0f4e341 100644 --- a/sources/agents/ollama_compression_8gb.toml +++ b/sources/agents/ollama_compression_8gb.toml @@ -10,6 +10,9 @@ tags = ["compression", "ollama", "local", "8gb"] system_prompt = """{{ read_file(":/tasks/chat-compressor.md") }}""" +[body] +think = false + [body.options] num_predict = 2048 temperature = 0.3 diff --git a/sources/agents/tasks/code-completion.md b/sources/agents/tasks/code-completion.md index 88d0285..060c138 100644 --- a/sources/agents/tasks/code-completion.md +++ b/sources/agents/tasks/code-completion.md @@ -1,20 +1,25 @@ Core Requirements: 1. Continue code exactly from the cursor position, ensuring it properly connects with any existing code after the cursor -2. Never repeat existing code before or after the cursor +2. Never repeat existing code before or after the cursor — the text after already exists, so do not reproduce any of it Specific Guidelines: - For function calls: Complete parameters with appropriate types and names - For class members: Respect access modifiers and class conventions -- Respect existing indentation and formatting -- Consider scope and visibility of referenced symbols +- Respect existing indentation and formatting; do not re-emit indentation that already precedes the cursor +- Consider scope and visibility of referenced symbols (do not use a symbol that is only declared after the cursor) - Ensure seamless integration with code both before and after the cursor +When nothing should be inserted, return an empty code block. This applies only when: +- Any insertion would duplicate code that already appears after the cursor, or +- The cursor sits in the middle of an existing identifier or type name, or between a complete type and its variable name +Otherwise, always provide a completion (for example, fill an empty initializer list or argument list). In the no-insertion cases above, output an empty code block and nothing else — never describe the code, report errors, ask questions, or suggest alternatives. + Context Format: ...code before the cursor......code after the cursor... Response Format: -- No explanations or comments -- Only include new characters needed to create valid code -- Should be codeblock with language +- Your entire response must be exactly one code block tagged with the language, and nothing else +- Never write any sentence, note, explanation, or comment before or after the code block — not even to state that the code is already complete +- Inside the block, include only the new characters needed at the cursor to form valid code; leave the block empty only in the no-insertion cases listed above diff --git a/sources/external/llmqore b/sources/external/llmqore index ea44041..4450ece 160000 --- a/sources/external/llmqore +++ b/sources/external/llmqore @@ -1 +1 @@ -Subproject commit ea44041b24f5220e6529812ecdb0a901080810f8 +Subproject commit 4450eceda98105e8e471b9e4fa73bfe4cb42b448 diff --git a/sources/providers/GenericProvider.cpp b/sources/providers/GenericProvider.cpp index bc12b1d..8b09a5e 100644 --- a/sources/providers/GenericProvider.cpp +++ b/sources/providers/GenericProvider.cpp @@ -48,7 +48,26 @@ QFuture> GenericProvider::getInstalledModels(const QString &url) { m_client->setUrl(url); m_client->setApiKey(apiKey()); - return m_client->listModels(); + return m_client->listModels(modelsEndpoint(url)); +} + +QString GenericProvider::modelsEndpoint(const QString &url) const +{ + switch (m_id) { + case ProviderID::OpenAI: + case ProviderID::OpenAIResponses: + case ProviderID::OpenAICompatible: + case ProviderID::LMStudio: + case ProviderID::OpenRouter: + break; + default: + return {}; + } + + QString base = url; + while (base.endsWith('/')) + base.chop(1); + return base.endsWith("/v1") ? QStringLiteral("/models") : QStringLiteral("/v1/models"); } RequestID GenericProvider::sendRequest( diff --git a/sources/providers/GenericProvider.hpp b/sources/providers/GenericProvider.hpp index a9f11f2..c1c8c1d 100644 --- a/sources/providers/GenericProvider.hpp +++ b/sources/providers/GenericProvider.hpp @@ -39,6 +39,8 @@ public: const QUrl &url, const QJsonObject &payload, const QString &endpoint) override; private: + QString modelsEndpoint(const QString &url) const; + QString m_name; ProviderID m_id; ::LLMQore::BaseClient *m_client; diff --git a/test/ResponseRouterTest.cpp b/test/ResponseRouterTest.cpp index 133aaab..1db5eb5 100644 --- a/test/ResponseRouterTest.cpp +++ b/test/ResponseRouterTest.cpp @@ -58,7 +58,7 @@ protected: return {}; } LLMQore::RequestID ask(const QString &, LLMQore::RequestMode) override { return {}; } - QFuture> listModels() override { return {}; } + QFuture> listModels(const QString & = {}) override { return {}; } LLMQore::ToolSchemaFormat toolSchemaFormat() const override { return LLMQore::ToolSchemaFormat::Claude;