From 5dfcf74128dc867db544c7d69808db55472a7bf0 Mon Sep 17 00:00:00 2001 From: Petr Mironychev <9195189+Palm1r@users.noreply.github.com> Date: Sun, 9 Feb 2025 10:10:06 +0100 Subject: [PATCH] feat: add chunking --- ChatView/ChatRootView.cpp | 13 ++ ChatView/ChatRootView.hpp | 1 + ChatView/qml/RootItem.qml | 1 + ChatView/qml/parts/BottomBar.qml | 7 ++ context/CMakeLists.txt | 1 + context/ContextManager.cpp | 43 +++++++ context/ContextManager.hpp | 6 + context/FileChunker.cpp | 198 +++++++++++++++++++++++++++++++ context/FileChunker.hpp | 68 +++++++++++ 9 files changed, 338 insertions(+) create mode 100644 context/FileChunker.cpp create mode 100644 context/FileChunker.hpp diff --git a/ChatView/ChatRootView.cpp b/ChatView/ChatRootView.cpp index 256c133..2ada63b 100644 --- a/ChatView/ChatRootView.cpp +++ b/ChatView/ChatRootView.cpp @@ -39,6 +39,7 @@ #include "Logger.hpp" #include "ProjectSettings.hpp" #include "context/ContextManager.hpp" +#include "context/FileChunker.hpp" #include "context/RAGManager.hpp" #include "context/TokenUtils.hpp" @@ -484,6 +485,18 @@ void ChatRootView::testRAG(const QString &message) }); } +void ChatRootView::testChunking() +{ + auto project = ProjectExplorer::ProjectTree::currentProject(); + if (!project) { + qDebug() << "No active project found"; + return; + } + + Context::FileChunker::ChunkingConfig config; + Context::ContextManager::instance().testProjectChunks(project, config); +} + void ChatRootView::updateInputTokensCount() { int inputTokens = m_messageTokensCount; diff --git a/ChatView/ChatRootView.hpp b/ChatView/ChatRootView.hpp index 440e7ad..ec4cb93 100644 --- a/ChatView/ChatRootView.hpp +++ b/ChatView/ChatRootView.hpp @@ -66,6 +66,7 @@ public: Q_INVOKABLE void setIsSyncOpenFiles(bool state); Q_INVOKABLE void openChatHistoryFolder(); Q_INVOKABLE void testRAG(const QString &message); + Q_INVOKABLE void testChunking(); Q_INVOKABLE void updateInputTokensCount(); int inputTokensCount() const; diff --git a/ChatView/qml/RootItem.qml b/ChatView/qml/RootItem.qml index fa9869d..44a4e80 100644 --- a/ChatView/qml/RootItem.qml +++ b/ChatView/qml/RootItem.qml @@ -199,6 +199,7 @@ ChatRootView { attachFiles.onClicked: root.showAttachFilesDialog() linkFiles.onClicked: root.showLinkFilesDialog() testRag.onClicked: root.testRAG(messageInput.text) + testChunks.onClicked: root.testChunking() } } diff --git a/ChatView/qml/parts/BottomBar.qml b/ChatView/qml/parts/BottomBar.qml index 3873391..dd436f2 100644 --- a/ChatView/qml/parts/BottomBar.qml +++ b/ChatView/qml/parts/BottomBar.qml @@ -31,6 +31,7 @@ Rectangle { property alias attachFiles: attachFilesId property alias linkFiles: linkFilesId property alias testRag: testRagId + property alias testChunks: testChunksId color: palette.window.hslLightness > 0.5 ? Qt.darker(palette.window, 1.1) : @@ -98,6 +99,12 @@ Rectangle { text: qsTr("Test RAG") } + QoAButton { + id: testChunksId + + text: qsTr("Test Chunks") + } + Item { Layout.fillWidth: true } diff --git a/context/CMakeLists.txt b/context/CMakeLists.txt index 0d6fa73..b603f9a 100644 --- a/context/CMakeLists.txt +++ b/context/CMakeLists.txt @@ -12,6 +12,7 @@ add_library(Context STATIC RAGSimilaritySearch.hpp RAGSimilaritySearch.cpp RAGPreprocessor.hpp RAGPreprocessor.cpp EnhancedRAGSimilaritySearch.hpp EnhancedRAGSimilaritySearch.cpp + FileChunker.hpp FileChunker.cpp ) target_link_libraries(Context diff --git a/context/ContextManager.cpp b/context/ContextManager.cpp index e2280e8..ab0318b 100644 --- a/context/ContextManager.cpp +++ b/context/ContextManager.cpp @@ -26,6 +26,8 @@ #include #include +#include "FileChunker.hpp" + namespace QodeAssist::Context { ContextManager &ContextManager::instance() @@ -130,4 +132,45 @@ bool ContextManager::shouldProcessFile(const QString &filePath) const return supportedExtensions.contains(fileInfo.suffix().toLower()); } +void ContextManager::testProjectChunks( + ProjectExplorer::Project *project, const FileChunker::ChunkingConfig &config) +{ + if (!project) { + qDebug() << "No project provided"; + return; + } + + qDebug() << "\nStarting test chunking for project:" << project->displayName(); + + // Get source files + QStringList sourceFiles = getProjectSourceFiles(project); + qDebug() << "Found" << sourceFiles.size() << "source files"; + + // Create chunker + auto chunker = new FileChunker(config, this); + + // Connect progress and error signals + connect(chunker, &FileChunker::progressUpdated, this, [](int processed, int total) { + qDebug() << "Progress:" << processed << "/" << total << "files"; + }); + + connect(chunker, &FileChunker::error, this, [](const QString &error) { + qDebug() << "Error:" << error; + }); + + // Start chunking and handle results + auto future = chunker->chunkFiles(sourceFiles); + + // Используем QFutureWatcher для обработки результатов + auto watcher = new QFutureWatcher>(this); + + connect(watcher, &QFutureWatcher>::finished, this, [watcher, chunker]() { + // Очистка + watcher->deleteLater(); + chunker->deleteLater(); + }); + + watcher->setFuture(future); +} + } // namespace QodeAssist::Context diff --git a/context/ContextManager.hpp b/context/ContextManager.hpp index 984b912..402d6cb 100644 --- a/context/ContextManager.hpp +++ b/context/ContextManager.hpp @@ -20,9 +20,12 @@ #pragma once #include "ContentFile.hpp" + #include #include +#include "FileChunker.hpp" + namespace ProjectExplorer { class Project; } @@ -40,6 +43,9 @@ public: QList getContentFiles(const QStringList &filePaths) const; QStringList getProjectSourceFiles(ProjectExplorer::Project *project) const; + void testProjectChunks( + ProjectExplorer::Project *project, const FileChunker::ChunkingConfig &config); + private: explicit ContextManager(QObject *parent = nullptr); ~ContextManager() = default; diff --git a/context/FileChunker.cpp b/context/FileChunker.cpp new file mode 100644 index 0000000..2cb849e --- /dev/null +++ b/context/FileChunker.cpp @@ -0,0 +1,198 @@ +// FileChunker.cpp +#include "FileChunker.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace QodeAssist::Context { + +FileChunker::FileChunker(QObject *parent) + : QObject(parent) +{} + +FileChunker::FileChunker(const ChunkingConfig &config, QObject *parent) + : QObject(parent) + , m_config(config) +{} + +QFuture> FileChunker::chunkFiles(const QStringList &filePaths) +{ + qDebug() << "\nStarting chunking process for" << filePaths.size() << "files"; + qDebug() << "Configuration:" + << "\n Max lines per chunk:" << m_config.maxLinesPerChunk + << "\n Overlap lines:" << m_config.overlapLines + << "\n Skip empty lines:" << m_config.skipEmptyLines + << "\n Preserve functions:" << m_config.preserveFunctions + << "\n Preserve classes:" << m_config.preserveClasses + << "\n Batch size:" << m_config.batchSize; + + auto promise = std::make_shared>>(); + promise->start(); + + if (filePaths.isEmpty()) { + qDebug() << "No files to process"; + promise->addResult({}); + promise->finish(); + return promise->future(); + } + + processNextBatch(promise, filePaths, 0); + return promise->future(); +} + +void FileChunker::processNextBatch( + std::shared_ptr>> promise, const QStringList &files, int startIndex) +{ + if (startIndex >= files.size()) { + emit chunkingComplete(); + promise->finish(); + return; + } + + int endIndex = qMin(startIndex + m_config.batchSize, files.size()); + QList batchChunks; + + for (int i = startIndex; i < endIndex; ++i) { + try { + auto chunks = processFile(files[i]); + batchChunks.append(chunks); + } catch (const std::exception &e) { + emit error(QString("Error processing file %1: %2").arg(files[i], e.what())); + } + emit progressUpdated(i + 1, files.size()); + } + + promise->addResult(batchChunks); + + // Планируем обработку следующего батча + QTimer::singleShot(0, this, [this, promise, files, endIndex]() { + processNextBatch(promise, files, endIndex); + }); +} + +QList FileChunker::processFile(const QString &filePath) +{ + qDebug() << "\nProcessing file:" << filePath; + + auto document = new TextEditor::TextDocument; + auto filePathObj = Utils::FilePath::fromString(filePath); + auto result = document->open(&m_error, filePathObj, filePathObj); + if (result != Core::IDocument::OpenResult::Success) { + qDebug() << "Failed to open document:" << filePath << "-" << m_error; + emit error(QString("Failed to open document: %1 - %2").arg(filePath, m_error)); + delete document; + return {}; + } + + qDebug() << "Document opened successfully. Line count:" << document->document()->blockCount(); + + auto chunks = createChunksForDocument(document); + qDebug() << "Created" << chunks.size() << "chunks for file"; + + delete document; + return chunks; +} + +QList FileChunker::createChunksForDocument(TextEditor::TextDocument *document) +{ + QList chunks; + QString filePath = document->filePath().toString(); + qDebug() << "\nCreating chunks for document:" << filePath << "\nConfiguration:" + << "\n Max lines per chunk:" << m_config.maxLinesPerChunk + << "\n Min lines per chunk:" << m_config.minLinesPerChunk + << "\n Overlap lines:" << m_config.overlapLines; + // Если файл меньше минимального размера чанка, создаем один чанк + if (document->document()->blockCount() <= m_config.minLinesPerChunk) { + FileChunk chunk; + chunk.filePath = filePath; + chunk.startLine = 0; + chunk.endLine = document->document()->blockCount() - 1; + chunk.createdAt = QDateTime::currentDateTime(); + chunk.updatedAt = chunk.createdAt; + + QString content; + QTextBlock block = document->document()->firstBlock(); + while (block.isValid()) { + content += block.text() + "\n"; + block = block.next(); + } + chunk.content = content; + + qDebug() << "File is smaller than minimum chunk size. Creating single chunk:" + << "\n Lines:" << chunk.lineCount() << "\n Content size:" << chunk.content.size() + << "bytes"; + + chunks.append(chunk); + return chunks; + } + + // Для больших файлов создаем чанки фиксированного размера с перекрытием + int currentStartLine = 0; + int lineCount = 0; + QString content; + QTextBlock block = document->document()->firstBlock(); + + while (block.isValid()) { + content += block.text() + "\n"; + lineCount++; + + // Если достигли размера чанка или это последний блок + if (lineCount >= m_config.maxLinesPerChunk || !block.next().isValid()) { + FileChunk chunk; + chunk.filePath = filePath; + chunk.startLine = currentStartLine; + chunk.endLine = currentStartLine + lineCount - 1; + chunk.content = content; + chunk.createdAt = QDateTime::currentDateTime(); + chunk.updatedAt = chunk.createdAt; + + qDebug() << "Creating chunk:" + << "\n Start line:" << chunk.startLine << "\n End line:" << chunk.endLine + << "\n Lines:" << chunk.lineCount() + << "\n Content size:" << chunk.content.size() << "bytes"; + + chunks.append(chunk); + + // Начинаем новый чанк с учетом перекрытия + if (block.next().isValid()) { + // Отступаем назад на размер перекрытия + int overlapLines = qMin(m_config.overlapLines, lineCount); + currentStartLine = chunk.endLine - overlapLines + 1; + + // Сбрасываем контент, но добавляем перекрывающиеся строки + content.clear(); + QTextBlock overlapBlock = document->document()->findBlockByLineNumber( + currentStartLine); + while (overlapBlock.isValid() && overlapBlock.blockNumber() <= chunk.endLine) { + content += overlapBlock.text() + "\n"; + overlapBlock = overlapBlock.next(); + } + lineCount = overlapLines; + } + } + + block = block.next(); + } + + qDebug() << "Finished creating chunks for file:" << filePath + << "\nTotal chunks:" << chunks.size(); + + return chunks; +} + +void FileChunker::setConfig(const ChunkingConfig &config) +{ + m_config = config; +} + +FileChunker::ChunkingConfig FileChunker::config() const +{ + return m_config; +} + +} // namespace QodeAssist::Context diff --git a/context/FileChunker.hpp b/context/FileChunker.hpp new file mode 100644 index 0000000..6bfe9ad --- /dev/null +++ b/context/FileChunker.hpp @@ -0,0 +1,68 @@ +// FileChunker.hpp +#pragma once + +#include +#include +#include +#include + +namespace QodeAssist::Context { + +struct FileChunk +{ + QString filePath; // Path to the source file + int startLine; // Starting line of the chunk + int endLine; // Ending line of the chunk + QDateTime createdAt; // When the chunk was created + QDateTime updatedAt; // When the chunk was last updated + QString content; // Content of the chunk + + // Helper methods + int lineCount() const { return endLine - startLine + 1; } + bool isValid() const { return !filePath.isEmpty() && startLine >= 0 && endLine >= startLine; } +}; + +class FileChunker : public QObject +{ + Q_OBJECT + +public: + struct ChunkingConfig + { + int maxLinesPerChunk = 80; // Размер чанка (было 200) + int minLinesPerChunk = 40; // Минимальный размер для начала чанкинга + int overlapLines = 20; // Перекрытие между чанками + bool skipEmptyLines = true; // Пропускать пустые строки + bool preserveFunctions = true; // Сохранять функции целиком + bool preserveClasses = true; // Сохранять классы целиком + int batchSize = 10; // Количество файлов для параллельной обработки + }; + + explicit FileChunker(QObject *parent = nullptr); + explicit FileChunker(const ChunkingConfig &config, QObject *parent = nullptr); + + // Main chunking method + QFuture> chunkFiles(const QStringList &filePaths); + + // Configuration + void setConfig(const ChunkingConfig &config); + ChunkingConfig config() const; + +signals: + void progressUpdated(int processedFiles, int totalFiles); + void chunkingComplete(); + void error(const QString &errorMessage); + +private: + QList processFile(const QString &filePath); + QList createChunksForDocument(TextEditor::TextDocument *document); + void processNextBatch( + std::shared_ptr>> promise, + const QStringList &files, + int startIndex); + + ChunkingConfig m_config; + QString m_error; +}; + +} // namespace QodeAssist::Context