Replace lexertl with a custom lexer implementation

QueryLexer does not emit "atWord" tokens because I couldn't find what they are used for.
Luis Ángel San Martín
2021-01-12 18:56:59 +01:00
parent 5037f3ac92
commit a777aa3fe8
41 changed files with 187 additions and 12768 deletions
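
For context, a minimal sketch of how the new lexer introduced below is driven; this is illustration only, not part of the commit, and the sample query string is invented:

#include "query_lexer.h"
#include <iostream>

int main()
{
    QueryLexer lexer("read:false \"v for vendetta\"");
    // Pull tokens until the lexer reports end of input.
    for (Token t = lexer.next(); t.type() != Token::Type::eof; t = lexer.next())
        std::cout << static_cast<int>(t.type()) << " '" << t.lexeme() << "'\n";
}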

query_lexer.cpp (new file)

@@ -0,0 +1,94 @@
#include "query_lexer.h"
QueryLexer::QueryLexer(const std::string &input)
: input(input)
{
}
Token QueryLexer::next()
{
switch (peek()) {
case '\0':
return Token(Token::Type::eof);
case '(':
case ')':
return single(Token::Type::opcode);
case ' ':
case '\t':
case '\r':
case '\n':
return space();
case '"':
return quotedWord();
default:
return word();
}
}
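// Note: std::string guarantees a null character at index == size() (since C++11),
// so peek() returns '\0' once the input is exhausted and next() maps that to eof.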
char QueryLexer::peek()
{
return input[index];
}
char QueryLexer::get()
{
return input[index++];
}
Token QueryLexer::single(Token::Type type)
{
return Token(type, input.substr(index++, 1));
}
Token QueryLexer::space()
{
auto start = index;
get();
while (isSpace(peek()))
get();
return Token(Token::Type::space, input.substr(start, index - start));
}
Token QueryLexer::word()
{
auto start = index;
get();
auto current = peek();
while (current != '\0' && !isSpace(current) && current != '"' && current != '(' && current != ')') {
get();
current = peek();
}
return Token(Token::Type::word, input.substr(start, index - start));
}
Token QueryLexer::quotedWord()
{
auto start = index;
get();
auto current = peek();
while (current != '\0' && current != '"') {
get();
current = peek();
}
if (current == '"') {
get();
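// Keep the surrounding quotes in the lexeme; QueryParser strips them when it reads the token text.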
return Token(Token::Type::quotedWord, input.substr(start, index - start));
}
// This should be a lexical error, but the grammar doesn't support errors, so treat an unterminated quote as end of input
return Token(Token::Type::eof);
}
bool QueryLexer::isSpace(char c)
{
switch (c) {
case ' ':
case '\t':
case '\r':
case '\n':
return true;
default:
return false;
}
}

query_lexer.h (new file)

@@ -0,0 +1,59 @@
#ifndef QUERY_LEXER_H
#define QUERY_LEXER_H
#include <string>
class Token
{
public:
enum class Type {
eof,
opcode,
atWord,
word,
quotedWord,
space
};
Token(Type type, std::string lexeme = "")
: _type(type), _lexeme(std::move(lexeme))
{
}
Type type() const
{
return _type;
}
std::string lexeme() const
{
return _lexeme;
}
private:
Type _type {};
std::string _lexeme {};
};
class QueryLexer
{
public:
QueryLexer(const std::string &input);
Token next();
private:
std::string input;
int index = 0;
char peek();
char get();
Token single(Token::Type type);
Token space();
Token word();
Token quotedWord();
bool isSpace(char c);
};
#endif // QUERY_LEXER_H

query_parser.cpp

@@ -68,21 +68,14 @@ int QueryParser::TreeNode::bindValues(QSqlQuery &selectQuery, int bindPosition)
}
QueryParser::QueryParser()
: lexScanner(0)
{
lexScanner.push("[()]", static_cast<std::underlying_type<TokenType>::type>(TokenType::opcode));
lexScanner.push("@[^:]+:[^\\\")\\s]+", static_cast<std::underlying_type<TokenType>::type>(TokenType::atWord));
lexScanner.push("[^\\\"()\\s]+", static_cast<std::underlying_type<TokenType>::type>(TokenType::word));
lexScanner.push("\\\".*?\\\"", static_cast<std::underlying_type<TokenType>::type>(TokenType::quotedWord));
lexScanner.push("\\s+", static_cast<std::underlying_type<TokenType>::type>(TokenType::space));
lexertl::generator::build(lexScanner, sm);
}
QueryParser::TreeNode QueryParser::parse(const std::string &expr)
{
tokenize(expr);
lexer = QueryLexer(expr);
advance();
auto prog = orExpression();
if (!isEof()) {
@@ -104,7 +97,10 @@ std::string QueryParser::token(bool advance)
if (isEof()) {
return "";
}
auto res = (tokenType() == TokenType::quotedWord) ? iter->substr(1, 1) : iter->str();
auto lexeme = currentToken.lexeme();
auto res = (tokenType() == Token::Type::quotedWord) ? currentToken.lexeme().substr(1, lexeme.size() - 2) : lexeme; // TODO: process quotedWord differently?
if (advance) {
this->advance();
}
@@ -116,30 +112,32 @@ std::string QueryParser::lcaseToken(bool advance)
if (isEof()) {
return "";
}
auto res = (tokenType() == TokenType::quotedWord) ? iter->substr(1, 1) : iter->str();
auto lexeme = currentToken.lexeme();
auto res = (tokenType() == Token::Type::quotedWord) ? currentToken.lexeme().substr(1, lexeme.size() - 2) : lexeme;
if (advance) {
this->advance();
}
return toLower(res);
}
QueryParser::TokenType QueryParser::tokenType()
Token::Type QueryParser::tokenType()
{
if (isEof()) {
return TokenType::eof;
}
return TokenType(iter->id);
return currentToken.type();
}
bool QueryParser::isEof() const
{
return iter == end;
return currentToken.type() == Token::Type::eof;
}
void QueryParser::advance()
{
++iter;
if (tokenType() == TokenType::space)
currentToken = lexer.next();
if (tokenType() == Token::Type::space)
advance();
}
@@ -154,11 +152,6 @@ QueryParser::FieldType QueryParser::fieldType(const std::string &str)
return FieldType::unknown;
}
void QueryParser::tokenize(const std::string &expr)
{
iter = lexertl::siterator(expr.begin(), expr.end(), sm);
}
std::string QueryParser::join(const QStringList &strings, const std::string &delim)
{
return std::accumulate(strings.begin(), strings.end(), std::string(),
@@ -191,7 +184,7 @@ QueryParser::TreeNode QueryParser::andExpression()
return { "and", { lhs, andExpression() } };
}
if ((isIn(tokenType(), { TokenType::atWord, TokenType::word, TokenType::quotedWord }) || token() == "(") && lcaseToken() != "or") {
if ((isIn(tokenType(), { Token::Type::atWord, Token::Type::word, Token::Type::quotedWord }) || token() == "(") && lcaseToken() != "or") {
return { "and", { lhs, andExpression() } };
}
@@ -209,15 +202,15 @@ QueryParser::TreeNode QueryParser::notExpression()
QueryParser::TreeNode QueryParser::locationExpression()
{
if (tokenType() == TokenType::opcode && token() == "(") {
if (tokenType() == Token::Type::opcode && token() == "(") {
advance();
auto res = orExpression();
if (tokenType() != TokenType::opcode || token(true) != ")") {
if (tokenType() != Token::Type::opcode || token(true) != ")") {
throw std::invalid_argument("missing ')'");
}
return res;
}
if (!isIn(tokenType(), { TokenType::atWord, TokenType::word, TokenType::quotedWord })) {
if (!isIn(tokenType(), { Token::Type::atWord, Token::Type::word, Token::Type::quotedWord })) {
throw std::invalid_argument("Invalid syntax. Expected a lookup name or a word");
}
return baseToken();
@@ -225,7 +218,7 @@ QueryParser::TreeNode QueryParser::locationExpression()
QueryParser::TreeNode QueryParser::baseToken()
{
if (tokenType() == TokenType::quotedWord) {
if (tokenType() == Token::Type::quotedWord) {
return { "token", { { "all", {} }, { token(true), {} } } };
}
@@ -234,7 +227,7 @@ QueryParser::TreeNode QueryParser::baseToken()
if (words.size() > 1 && fieldType(words[0].toStdString()) != FieldType::unknown) {
auto loc(toLower(words[0].toStdString()));
words.erase(words.begin());
if (words.size() == 1 && tokenType() == TokenType::quotedWord) {
if (words.size() == 1 && tokenType() == Token::Type::quotedWord) {
return { "token", { { loc, {} }, { token(true), {} } } };
}
return { "token", { { loc, {} }, { join(words, ":"), {} } } };

query_parser.h

@@ -1,19 +1,19 @@
#ifndef QUERY_PARSER_H
#define QUERY_PARSER_H
#include "lexertl/generator.hpp"
#include "lexertl/iterator.hpp"
#include "query_lexer.h"
#include <map>
#include <QSqlQuery>
#include <string>
#include <vector>
#include <list>
/**
* This class is used to generate an SQL query string from a search expression,
* with a syntax very similar to that used by the Google search engine.
*
* The code herin is based upon the SearchQueryParser python class written by
* The code herein is based upon the SearchQueryParser python class written by
* Kovid Goyal as part of the Calibre eBook manager (https://calibre-ebook.com)
*
* Grammar:
@@ -41,13 +41,6 @@
class QueryParser
{
public:
enum class TokenType { eof,
opcode,
atWord,
word,
quotedWord,
space };
struct TreeNode {
std::string t;
std::vector<TreeNode> children;
@@ -64,10 +57,13 @@ private:
std::string token(bool advance = false);
std::string lcaseToken(bool advance = false);
TokenType tokenType();
Token::Type tokenType();
bool isEof() const;
void advance();
QueryLexer lexer = QueryLexer("");
Token currentToken = Token(Token::Type::eof);
template<typename T>
static bool isIn(const T &e, const std::list<T> &v)
{
@@ -83,7 +79,6 @@ private:
filename };
static FieldType fieldType(const std::string &str);
void tokenize(const std::string &expr);
static std::string join(const QStringList &strings, const std::string &delim);
static QStringList split(const std::string &string, char delim);
@@ -93,11 +88,6 @@ private:
TreeNode locationExpression();
TreeNode baseToken();
lexertl::rules lexScanner;
lexertl::state_machine sm;
lexertl::siterator iter;
const lexertl::siterator end;
static const std::map<FieldType, std::vector<std::string>> fieldNames;
};
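
Finally, a rough sketch of how the parser might be exercised, using only members visible in this diff (parse(), TreeNode::t, TreeNode::children) and assuming parse() is public; the query string and the dump() helper are made up for illustration:

#include "query_parser.h"
#include <iostream>
#include <string>

// Hypothetical helper: print the parse tree returned by QueryParser::parse().
static void dump(const QueryParser::TreeNode &node, int depth = 0)
{
    std::cout << std::string(depth * 2, ' ') << node.t << '\n';
    for (const auto &child : node.children)
        dump(child, depth + 1);
}

int main()
{
    QueryParser parser;
    auto tree = parser.parse("(alan moore) or \"v for vendetta\"");
    dump(tree);
}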