mirror of
https://github.com/YACReader/yacreader
synced 2025-07-18 21:14:33 -04:00
Replace lexertl with a custom lexer implementation
QueryLexer does not parse "atWord" because I couldn't find what it is used for.
This commit is contained in:
94
YACReaderLibrary/db/query_lexer.cpp
Normal file
94
YACReaderLibrary/db/query_lexer.cpp
Normal file
@ -0,0 +1,94 @@
|
||||
#include "query_lexer.h"
|
||||
|
||||
// Lexer for the search-query language: splits `input` into Token values.
// Construction only stores the expression; scanning happens lazily in next().
QueryLexer::QueryLexer(const std::string &input)
    : input(input)
{
}
|
||||
|
||||
// Returns the next token from the input, classified by its first character.
// Once the input is exhausted, every further call returns an eof token
// without advancing.
Token QueryLexer::next()
{
    const char c = peek(); // '\0' when the whole input has been consumed

    if (c == '\0')
        return Token(Token::Type::eof);

    if (c == '(' || c == ')')
        return single(Token::Type::opcode);

    // Reuse isSpace() instead of listing the whitespace set
    // (' ', '\t', '\r', '\n') a second time, so the two stay in sync.
    if (isSpace(c))
        return space();

    if (c == '"')
        return quotedWord();

    return word();
}
|
||||
|
||||
// Returns the character at the current position without consuming it.
// When index == input.size(), std::string::operator[] returns '\0'
// (guaranteed since C++11), which next() interprets as end of input.
char QueryLexer::peek()
{
    return input[index];
}
|
||||
|
||||
char QueryLexer::get()
|
||||
{
|
||||
return input[index++];
|
||||
}
|
||||
|
||||
// Consumes exactly one character and wraps it in a token of `type`.
// Used for the one-character opcodes '(' and ')'.
Token QueryLexer::single(Token::Type type)
{
    const auto start = index;
    ++index;
    return Token(type, input.substr(start, 1));
}
|
||||
|
||||
// Consumes a maximal run of whitespace (the caller guarantees the current
// character is whitespace) and returns it as one space token.
Token QueryLexer::space()
{
    const auto start = index;
    do {
        get();
    } while (isSpace(peek()));
    return Token(Token::Type::space, input.substr(start, index - start));
}
|
||||
|
||||
// Consumes a bare word: the current character plus every following character
// up to (not including) end of input, whitespace, a double quote, or a
// parenthesis.
Token QueryLexer::word()
{
    const auto start = index;
    get(); // the first character is always part of the word
    for (char c = peek(); c != '\0' && !isSpace(c) && c != '"' && c != '(' && c != ')'; c = peek())
        get();
    return Token(Token::Type::word, input.substr(start, index - start));
}
|
||||
|
||||
// Consumes a double-quoted word; the quotes are part of the lexeme.
Token QueryLexer::quotedWord()
{
    const auto start = index;
    get(); // opening quote

    while (peek() != '\0' && peek() != '"')
        get();

    if (peek() == '"') {
        get(); // closing quote
        return Token(Token::Type::quotedWord, input.substr(start, index - start));
    }

    // This should be a lexical error, but the grammar doesn't support it,
    // so an unterminated quote is reported as end of input instead.
    return Token(Token::Type::eof);
}
|
||||
|
||||
// Reports whether `c` is one of the whitespace characters the lexer
// recognizes: space, tab, carriage return, or newline.
bool QueryLexer::isSpace(char c)
{
    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
|
59
YACReaderLibrary/db/query_lexer.h
Normal file
59
YACReaderLibrary/db/query_lexer.h
Normal file
@ -0,0 +1,59 @@
|
||||
#ifndef QUERY_LEXER_H
|
||||
#define QUERY_LEXER_H
|
||||
|
||||
#include <string>
|
||||
|
||||
// A single lexeme produced by QueryLexer, tagged with its syntactic kind.
class Token
{
public:
    // Kinds of token the query language distinguishes.
    enum class Type {
        eof, // end of input (also produced for an unterminated quoted word)
        opcode, // a single '(' or ')'
        atWord, // NOTE(review): not emitted by QueryLexer (per commit message); presumably kept so the parser grammar still compiles — confirm
        word, // a bare word
        quotedWord, // a double-quoted word, quotes included in the lexeme
        space // a maximal run of whitespace
    };

    // `lexeme` is the exact source text of the token; eof tokens leave it empty.
    Token(Type type, std::string lexeme = "")
        : _type(type), _lexeme(std::move(lexeme))
    {
    }

    // The syntactic kind of this token.
    Type type() const
    {
        return _type;
    }

    // The source text of this token (returned by copy).
    std::string lexeme() const
    {
        return _lexeme;
    }

private:
    Type _type {};
    std::string _lexeme {};
};
|
||||
|
||||
class QueryLexer
|
||||
{
|
||||
public:
|
||||
QueryLexer(const std::string &input);
|
||||
Token next();
|
||||
|
||||
private:
|
||||
std::string input;
|
||||
int index = 0;
|
||||
|
||||
char peek();
|
||||
char get();
|
||||
|
||||
Token single(Token::Type type);
|
||||
Token space();
|
||||
Token word();
|
||||
Token quotedWord();
|
||||
|
||||
bool isSpace(char c);
|
||||
};
|
||||
|
||||
#endif // QUERY_LEXER_H
|
@ -68,21 +68,14 @@ int QueryParser::TreeNode::bindValues(QSqlQuery &selectQuery, int bindPosition)
|
||||
}
|
||||
|
||||
QueryParser::QueryParser()
|
||||
: lexScanner(0)
|
||||
{
|
||||
|
||||
lexScanner.push("[()]", static_cast<std::underlying_type<TokenType>::type>(TokenType::opcode));
|
||||
lexScanner.push("@[^:]+:[^\\\")\\s]+", static_cast<std::underlying_type<TokenType>::type>(TokenType::atWord));
|
||||
lexScanner.push("[^\\\"()\\s]+", static_cast<std::underlying_type<TokenType>::type>(TokenType::word));
|
||||
lexScanner.push("\\\".*?\\\"", static_cast<std::underlying_type<TokenType>::type>(TokenType::quotedWord));
|
||||
lexScanner.push("\\s+", static_cast<std::underlying_type<TokenType>::type>(TokenType::space));
|
||||
|
||||
lexertl::generator::build(lexScanner, sm);
|
||||
}
|
||||
|
||||
QueryParser::TreeNode QueryParser::parse(const std::string &expr)
|
||||
{
|
||||
tokenize(expr);
|
||||
lexer = QueryLexer(expr);
|
||||
advance();
|
||||
|
||||
auto prog = orExpression();
|
||||
|
||||
if (!isEof()) {
|
||||
@ -104,7 +97,10 @@ std::string QueryParser::token(bool advance)
|
||||
if (isEof()) {
|
||||
return "";
|
||||
}
|
||||
auto res = (tokenType() == TokenType::quotedWord) ? iter->substr(1, 1) : iter->str();
|
||||
|
||||
auto lexeme = currentToken.lexeme();
|
||||
|
||||
auto res = (tokenType() == Token::Type::quotedWord) ? currentToken.lexeme().substr(1, lexeme.size() - 2) : lexeme; //TODO process quotedWordDiferently?
|
||||
if (advance) {
|
||||
this->advance();
|
||||
}
|
||||
@ -116,30 +112,32 @@ std::string QueryParser::lcaseToken(bool advance)
|
||||
if (isEof()) {
|
||||
return "";
|
||||
}
|
||||
auto res = (tokenType() == TokenType::quotedWord) ? iter->substr(1, 1) : iter->str();
|
||||
|
||||
auto lexeme = currentToken.lexeme();
|
||||
|
||||
auto res = (tokenType() == Token::Type::quotedWord) ? currentToken.lexeme().substr(1, lexeme.size() - 2) : lexeme;
|
||||
|
||||
if (advance) {
|
||||
this->advance();
|
||||
}
|
||||
return toLower(res);
|
||||
}
|
||||
|
||||
QueryParser::TokenType QueryParser::tokenType()
|
||||
Token::Type QueryParser::tokenType()
|
||||
{
|
||||
if (isEof()) {
|
||||
return TokenType::eof;
|
||||
}
|
||||
return TokenType(iter->id);
|
||||
return currentToken.type();
|
||||
}
|
||||
|
||||
bool QueryParser::isEof() const
|
||||
{
|
||||
return iter == end;
|
||||
return currentToken.type() == Token::Type::eof;
|
||||
}
|
||||
|
||||
void QueryParser::advance()
|
||||
{
|
||||
++iter;
|
||||
if (tokenType() == TokenType::space)
|
||||
currentToken = lexer.next();
|
||||
|
||||
if (tokenType() == Token::Type::space)
|
||||
advance();
|
||||
}
|
||||
|
||||
@ -154,11 +152,6 @@ QueryParser::FieldType QueryParser::fieldType(const std::string &str)
|
||||
return FieldType::unknown;
|
||||
}
|
||||
|
||||
void QueryParser::tokenize(const std::string &expr)
|
||||
{
|
||||
iter = lexertl::siterator(expr.begin(), expr.end(), sm);
|
||||
}
|
||||
|
||||
std::string QueryParser::join(const QStringList &strings, const std::string &delim)
|
||||
{
|
||||
return std::accumulate(strings.begin(), strings.end(), std::string(),
|
||||
@ -191,7 +184,7 @@ QueryParser::TreeNode QueryParser::andExpression()
|
||||
return { "and", { lhs, andExpression() } };
|
||||
}
|
||||
|
||||
if ((isIn(tokenType(), { TokenType::atWord, TokenType::word, TokenType::quotedWord }) || token() == "(") && lcaseToken() != "or") {
|
||||
if ((isIn(tokenType(), { Token::Type::atWord, Token::Type::word, Token::Type::quotedWord }) || token() == "(") && lcaseToken() != "or") {
|
||||
return { "and", { lhs, andExpression() } };
|
||||
}
|
||||
|
||||
@ -209,15 +202,15 @@ QueryParser::TreeNode QueryParser::notExpression()
|
||||
|
||||
QueryParser::TreeNode QueryParser::locationExpression()
|
||||
{
|
||||
if (tokenType() == TokenType::opcode && token() == "(") {
|
||||
if (tokenType() == Token::Type::opcode && token() == "(") {
|
||||
advance();
|
||||
auto res = orExpression();
|
||||
if (tokenType() != TokenType::opcode || token(true) != ")") {
|
||||
if (tokenType() != Token::Type::opcode || token(true) != ")") {
|
||||
throw std::invalid_argument("missing )'");
|
||||
}
|
||||
return res;
|
||||
}
|
||||
if (!isIn(tokenType(), { TokenType::atWord, TokenType::word, TokenType::quotedWord })) {
|
||||
if (!isIn(tokenType(), { Token::Type::atWord, Token::Type::word, Token::Type::quotedWord })) {
|
||||
throw std::invalid_argument("Invalid syntax. Expected a lookup name or a word");
|
||||
}
|
||||
return baseToken();
|
||||
@ -225,7 +218,7 @@ QueryParser::TreeNode QueryParser::locationExpression()
|
||||
|
||||
QueryParser::TreeNode QueryParser::baseToken()
|
||||
{
|
||||
if (tokenType() == TokenType::quotedWord) {
|
||||
if (tokenType() == Token::Type::quotedWord) {
|
||||
return { "token", { { "all", {} }, { token(true), {} } } };
|
||||
}
|
||||
|
||||
@ -234,7 +227,7 @@ QueryParser::TreeNode QueryParser::baseToken()
|
||||
if (words.size() > 1 && fieldType(words[0].toStdString()) != FieldType::unknown) {
|
||||
auto loc(toLower(words[0].toStdString()));
|
||||
words.erase(words.begin());
|
||||
if (words.size() == 1 && tokenType() == TokenType::quotedWord) {
|
||||
if (words.size() == 1 && tokenType() == Token::Type::quotedWord) {
|
||||
return { "token", { { loc, {} }, { token(true), {} } } };
|
||||
}
|
||||
return { "token", { { loc, {} }, { join(words, ":"), {} } } };
|
||||
|
@ -1,19 +1,19 @@
|
||||
#ifndef QUERY_PARSER_H
|
||||
#define QUERY_PARSER_H
|
||||
|
||||
#include "lexertl/generator.hpp"
|
||||
#include "lexertl/iterator.hpp"
|
||||
#include "query_lexer.h"
|
||||
|
||||
#include <map>
|
||||
#include <QSqlQuery>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
|
||||
/**
|
||||
* This class is used to generate an SQL query string from a search expression,
|
||||
* with a syntax very similar to that used by the Google search engine.
|
||||
*
|
||||
* The code herin is based upon the SearchQueryParser python class written by
|
||||
* The code herein is based upon the SearchQueryParser python class written by
|
||||
* Kovid Goyal as part of the Calibre eBook manager (https://calibre-ebook.com)
|
||||
*
|
||||
* Grammar:
|
||||
@ -41,13 +41,6 @@
|
||||
class QueryParser
|
||||
{
|
||||
public:
|
||||
enum class TokenType { eof,
|
||||
opcode,
|
||||
atWord,
|
||||
word,
|
||||
quotedWord,
|
||||
space };
|
||||
|
||||
struct TreeNode {
|
||||
std::string t;
|
||||
std::vector<TreeNode> children;
|
||||
@ -64,10 +57,13 @@ private:
|
||||
|
||||
std::string token(bool advance = false);
|
||||
std::string lcaseToken(bool advance = false);
|
||||
TokenType tokenType();
|
||||
Token::Type tokenType();
|
||||
bool isEof() const;
|
||||
void advance();
|
||||
|
||||
QueryLexer lexer = QueryLexer("");
|
||||
Token currentToken = Token(Token::Type::eof);
|
||||
|
||||
template<typename T>
|
||||
static bool isIn(const T &e, const std::list<T> &v)
|
||||
{
|
||||
@ -83,7 +79,6 @@ private:
|
||||
filename };
|
||||
static FieldType fieldType(const std::string &str);
|
||||
|
||||
void tokenize(const std::string &expr);
|
||||
static std::string join(const QStringList &strings, const std::string &delim);
|
||||
static QStringList split(const std::string &string, char delim);
|
||||
|
||||
@ -93,11 +88,6 @@ private:
|
||||
TreeNode locationExpression();
|
||||
TreeNode baseToken();
|
||||
|
||||
lexertl::rules lexScanner;
|
||||
lexertl::state_machine sm;
|
||||
lexertl::siterator iter;
|
||||
const lexertl::siterator end;
|
||||
|
||||
static const std::map<FieldType, std::vector<std::string>> fieldNames;
|
||||
};
|
||||
|
||||
|
Reference in New Issue
Block a user