Replace lexertl with a custom lexer implementation

QueryLexer does not emit "atWord" tokens because I couldn't find what they are used for.
Luis Ángel San Martín
2021-01-12 18:56:59 +01:00
parent 5037f3ac92
commit a777aa3fe8
41 changed files with 187 additions and 12768 deletions
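
For context, a minimal sketch of how the new lexer introduced below is driven; this is illustration only, not part of the commit, and the sample query string is invented:

#include "query_lexer.h"
#include <iostream>

int main()
{
    QueryLexer lexer("read:false \"v for vendetta\"");
    // Pull tokens until the lexer reports end of input.
    for (Token t = lexer.next(); t.type() != Token::Type::eof; t = lexer.next())
        std::cout << static_cast<int>(t.type()) << " '" << t.lexeme() << "'\n";
}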

query_lexer.cpp (new file)

@@ -0,0 +1,94 @@
#include "query_lexer.h"
QueryLexer::QueryLexer(const std::string &input)
: input(input)
{
}
Token QueryLexer::next()
{
switch (peek()) {
case '\0':
return Token(Token::Type::eof);
case '(':
case ')':
return single(Token::Type::opcode);
case ' ':
case '\t':
case '\r':
case '\n':
return space();
case '"':
return quotedWord();
default:
return word();
}
}
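// Note: std::string guarantees a null character at index == size() (since C++11),
// so peek() returns '\0' once the input is exhausted and next() maps that to eof.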
char QueryLexer::peek()
{
return input[index];
}
char QueryLexer::get()
{
return input[index++];
}
Token QueryLexer::single(Token::Type type)
{
return Token(type, input.substr(index++, 1));
}
Token QueryLexer::space()
{
auto start = index;
get();
while (isSpace(peek()))
get();
return Token(Token::Type::space, input.substr(start, index - start));
}
Token QueryLexer::word()
{
auto start = index;
get();
auto current = peek();
while (current != '\0' && !isSpace(current) && current != '"' && current != '(' && current != ')') {
get();
current = peek();
}
return Token(Token::Type::word, input.substr(start, index - start));
}
Token QueryLexer::quotedWord()
{
auto start = index;
get();
auto current = peek();
while (current != '\0' && current != '"') {
get();
current = peek();
}
if (current == '"') {
get();
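// Keep the surrounding quotes in the lexeme; QueryParser strips them when it reads the token text.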
return Token(Token::Type::quotedWord, input.substr(start, index - start));
}
// This should be a lexical error, but the grammar doesn't support errors, so treat an unterminated quote as end of input
return Token(Token::Type::eof);
}
bool QueryLexer::isSpace(char c)
{
switch (c) {
case ' ':
case '\t':
case '\r':
case '\n':
return true;
default:
return false;
}
}

query_lexer.h (new file)

@@ -0,0 +1,59 @@
#ifndef QUERY_LEXER_H
#define QUERY_LEXER_H
#include <string>
class Token
{
public:
enum class Type {
eof,
opcode,
atWord,
word,
quotedWord,
space
};
Token(Type type, std::string lexeme = "")
: _type(type), _lexeme(std::move(lexeme))
{
}
Type type() const
{
return _type;
}
std::string lexeme() const
{
return _lexeme;
}
private:
Type _type {};
std::string _lexeme {};
};
class QueryLexer
{
public:
QueryLexer(const std::string &input);
Token next();
private:
std::string input;
int index = 0;
char peek();
char get();
Token single(Token::Type type);
Token space();
Token word();
Token quotedWord();
bool isSpace(char c);
};
#endif // QUERY_LEXER_H

query_parser.cpp

@@ -68,21 +68,14 @@ int QueryParser::TreeNode::bindValues(QSqlQuery &selectQuery, int bindPosition)
}
QueryParser::QueryParser()
: lexScanner(0)
{
lexScanner.push("[()]", static_cast<std::underlying_type<TokenType>::type>(TokenType::opcode));
lexScanner.push("@[^:]+:[^\\\")\\s]+", static_cast<std::underlying_type<TokenType>::type>(TokenType::atWord));
lexScanner.push("[^\\\"()\\s]+", static_cast<std::underlying_type<TokenType>::type>(TokenType::word));
lexScanner.push("\\\".*?\\\"", static_cast<std::underlying_type<TokenType>::type>(TokenType::quotedWord));
lexScanner.push("\\s+", static_cast<std::underlying_type<TokenType>::type>(TokenType::space));
lexertl::generator::build(lexScanner, sm);
}
QueryParser::TreeNode QueryParser::parse(const std::string &expr)
{
tokenize(expr);
lexer = QueryLexer(expr);
advance();
auto prog = orExpression();
if (!isEof()) {
@@ -104,7 +97,10 @@ std::string QueryParser::token(bool advance)
if (isEof()) {
return "";
}
auto res = (tokenType() == TokenType::quotedWord) ? iter->substr(1, 1) : iter->str();
auto lexeme = currentToken.lexeme();
auto res = (tokenType() == Token::Type::quotedWord) ? currentToken.lexeme().substr(1, lexeme.size() - 2) : lexeme; // TODO: process quotedWord differently?
if (advance) {
this->advance();
}
@@ -116,30 +112,32 @@ std::string QueryParser::lcaseToken(bool advance)
if (isEof()) {
return "";
}
auto res = (tokenType() == TokenType::quotedWord) ? iter->substr(1, 1) : iter->str();
auto lexeme = currentToken.lexeme();
auto res = (tokenType() == Token::Type::quotedWord) ? currentToken.lexeme().substr(1, lexeme.size() - 2) : lexeme;
if (advance) {
this->advance();
}
return toLower(res);
}
QueryParser::TokenType QueryParser::tokenType()
Token::Type QueryParser::tokenType()
{
if (isEof()) {
return TokenType::eof;
}
return TokenType(iter->id);
return currentToken.type();
}
bool QueryParser::isEof() const
{
return iter == end;
return currentToken.type() == Token::Type::eof;
}
void QueryParser::advance()
{
++iter;
if (tokenType() == TokenType::space)
currentToken = lexer.next();
if (tokenType() == Token::Type::space)
advance();
}
@@ -154,11 +152,6 @@ QueryParser::FieldType QueryParser::fieldType(const std::string &str)
return FieldType::unknown;
}
void QueryParser::tokenize(const std::string &expr)
{
iter = lexertl::siterator(expr.begin(), expr.end(), sm);
}
std::string QueryParser::join(const QStringList &strings, const std::string &delim)
{
return std::accumulate(strings.begin(), strings.end(), std::string(),
@@ -191,7 +184,7 @@ QueryParser::TreeNode QueryParser::andExpression()
return { "and", { lhs, andExpression() } };
}
if ((isIn(tokenType(), { TokenType::atWord, TokenType::word, TokenType::quotedWord }) || token() == "(") && lcaseToken() != "or") {
if ((isIn(tokenType(), { Token::Type::atWord, Token::Type::word, Token::Type::quotedWord }) || token() == "(") && lcaseToken() != "or") {
return { "and", { lhs, andExpression() } };
}
@@ -209,15 +202,15 @@ QueryParser::TreeNode QueryParser::notExpression()
QueryParser::TreeNode QueryParser::locationExpression()
{
if (tokenType() == TokenType::opcode && token() == "(") {
if (tokenType() == Token::Type::opcode && token() == "(") {
advance();
auto res = orExpression();
if (tokenType() != TokenType::opcode || token(true) != ")") {
if (tokenType() != Token::Type::opcode || token(true) != ")") {
throw std::invalid_argument("missing ')'");
}
return res;
}
if (!isIn(tokenType(), { TokenType::atWord, TokenType::word, TokenType::quotedWord })) {
if (!isIn(tokenType(), { Token::Type::atWord, Token::Type::word, Token::Type::quotedWord })) {
throw std::invalid_argument("Invalid syntax. Expected a lookup name or a word");
}
return baseToken();
@@ -225,7 +218,7 @@ QueryParser::TreeNode QueryParser::locationExpression()
QueryParser::TreeNode QueryParser::baseToken()
{
if (tokenType() == TokenType::quotedWord) {
if (tokenType() == Token::Type::quotedWord) {
return { "token", { { "all", {} }, { token(true), {} } } };
}
@@ -234,7 +227,7 @@ QueryParser::TreeNode QueryParser::baseToken()
if (words.size() > 1 && fieldType(words[0].toStdString()) != FieldType::unknown) {
auto loc(toLower(words[0].toStdString()));
words.erase(words.begin());
if (words.size() == 1 && tokenType() == TokenType::quotedWord) {
if (words.size() == 1 && tokenType() == Token::Type::quotedWord) {
return { "token", { { loc, {} }, { token(true), {} } } };
}
return { "token", { { loc, {} }, { join(words, ":"), {} } } };

query_parser.h

@@ -1,19 +1,19 @@
#ifndef QUERY_PARSER_H
#define QUERY_PARSER_H
#include "lexertl/generator.hpp"
#include "lexertl/iterator.hpp"
#include "query_lexer.h"
#include <map>
#include <QSqlQuery>
#include <string>
#include <vector>
#include <list>
/**
* This class is used to generate an SQL query string from a search expression,
* with a syntax very similar to that used by the Google search engine.
*
* The code herin is based upon the SearchQueryParser python class written by
* The code herein is based upon the SearchQueryParser python class written by
* Kovid Goyal as part of the Calibre eBook manager (https://calibre-ebook.com)
*
* Grammar:
@@ -41,13 +41,6 @@
class QueryParser
{
public:
enum class TokenType { eof,
opcode,
atWord,
word,
quotedWord,
space };
struct TreeNode {
std::string t;
std::vector<TreeNode> children;
@@ -64,10 +57,13 @@ private:
std::string token(bool advance = false);
std::string lcaseToken(bool advance = false);
TokenType tokenType();
Token::Type tokenType();
bool isEof() const;
void advance();
QueryLexer lexer = QueryLexer("");
Token currentToken = Token(Token::Type::eof);
template<typename T>
static bool isIn(const T &e, const std::list<T> &v)
{
@@ -83,7 +79,6 @@ private:
filename };
static FieldType fieldType(const std::string &str);
void tokenize(const std::string &expr);
static std::string join(const QStringList &strings, const std::string &delim);
static QStringList split(const std::string &string, char delim);
@@ -93,11 +88,6 @@ private:
TreeNode locationExpression();
TreeNode baseToken();
lexertl::rules lexScanner;
lexertl::state_machine sm;
lexertl::siterator iter;
const lexertl::siterator end;
static const std::map<FieldType, std::vector<std::string>> fieldNames;
};
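
Finally, a rough sketch of how the parser might be exercised, using only members visible in this diff (parse(), TreeNode::t, TreeNode::children) and assuming parse() is public; the query string and the dump() helper are made up for illustration:

#include "query_parser.h"
#include <iostream>
#include <string>

// Hypothetical helper: print the parse tree returned by QueryParser::parse().
static void dump(const QueryParser::TreeNode &node, int depth = 0)
{
    std::cout << std::string(depth * 2, ' ') << node.t << '\n';
    for (const auto &child : node.children)
        dump(child, depth + 1);
}

int main()
{
    QueryParser parser;
    auto tree = parser.parse("(alan moore) or \"v for vendetta\"");
    dump(tree);
}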