yacreader/YACReaderLibrary/lexertl/rules.hpp

1019 lines
27 KiB
C++

// rules.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RULES_HPP
#define LEXERTL_RULES_HPP
#include "enums.hpp"
#include <cstdint>
#include <locale>
#include <map>
#include "narrow.hpp"
#include "observer_ptr.hpp"
#include "parser/tokeniser/re_tokeniser.hpp"
#include "runtime_error.hpp"
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
namespace lexertl
{
template<typename r_ch_type, typename ch_type,
typename id_ty = uint16_t>
class basic_rules
{
public:
// Public type aliases.
// The three template parameters are re-exported as rules_char_type (the
// character type that regexes, macro names and state names are written in),
// char_type (the character type handed to string_token / token) and id_type
// (the integer type used for rule ids and state ids).
using bool_vector = std::vector<bool>;
using bool_vector_vector = std::vector<bool_vector>;
using char_type = ch_type;
using rules_char_type = r_ch_type;
using id_type = id_ty;
using id_vector = std::vector<id_type>;
using id_vector_vector = std::vector<id_vector>;
using re_state = detail::basic_re_tokeniser_state<rules_char_type, id_type>;
using string = std::basic_string<rules_char_type>;
using string_token = basic_string_token<char_type>;
using string_vector = std::vector<string>;
using string_set = std::set<string>;
using string_pair = std::pair<string, string>;
// State name -> state id.
using string_id_type_map = std::map<string, id_type>;
using string_id_type_pair = std::pair<string, id_type>;
using token = detail::basic_re_token<rules_char_type, char_type>;
// One tokenised regex.
using token_vector = std::vector<token>;
// All tokenised regexes belonging to one state.
using token_vector_vector = std::vector<token_vector>;
// The above, for every state.
using token_vector_vector_vector = std::vector<token_vector_vector>;
// Macro name -> tokenised macro body.
using macro_map = std::map<string, token_vector>;
using macro_pair = std::pair<string, token_vector>;
using tokeniser =
detail::basic_re_tokeniser<rules_char_type, char_type, id_type>;
// If you get a compile error here you have
// failed to define an unsigned id type.
// (skip() and npos() rely on unsigned wrap-around of id_type.)
static_assert(std::is_unsigned<id_type>::value, "Your id type is signed");
// Construct a rule set that contains only the INITIAL start state.
// flags_ is stored and later handed to the regex tokeniser; the default
// differs per platform (dot_not_cr_lf on Windows, dot_not_newline elsewhere).
#ifdef _WIN32
basic_rules(const std::size_t flags_ = dot_not_cr_lf) :
#else
basic_rules(const std::size_t flags_ = dot_not_newline) :
#endif
    _flags(flags_)
{
    // Every other member is a container or std::locale and is
    // default-constructed; only the INITIAL state has to be registered.
    push_state(initial());
}
// Reset the object to its freshly-constructed condition: all rules, macros
// and states are discarded, the flags and locale return to their defaults
// and the INITIAL state is registered again.
void clear()
{
#ifdef _WIN32
    const std::size_t default_flags_ = dot_not_cr_lf;
#else
    const std::size_t default_flags_ = dot_not_newline;
#endif

    // Settings.
    _flags = default_flags_;
    _locale = std::locale();

    // Names.
    _statemap.clear();
    _macro_map.clear();
    _lexer_state_names.clear();

    // Per-state rule tables.
    _regexes.clear();
    _features.clear();
    _ids.clear();
    _user_ids.clear();
    _next_dfas.clear();
    _pushes.clear();
    _pops.clear();

    push_state(initial());
}
// Remove every rule from the state whose id is dfa_ while keeping the state
// itself registered. An id that does not name an existing state is ignored.
void clear(const id_type dfa_)
{
    if (dfa_ >= _regexes.size())
    {
        return;
    }

    _features[dfa_] = 0;
    _regexes[dfa_].clear();
    _ids[dfa_].clear();
    _user_ids[dfa_].clear();
    _next_dfas[dfa_].clear();
    _pushes[dfa_].clear();
    _pops[dfa_].clear();
}
// Replace the regex flags. The value is read each time a regex or macro is
// tokenised, so it only affects rules and macros added afterwards.
void flags(const std::size_t flags_)
{
_flags = flags_;
}
// Current regex flags.
std::size_t flags() const
{
return _flags;
}
// Reserved rule id: every bit set except the lowest, i.e. one less than
// npos(). Rules pushed with this id set skip_bit in their state's features.
static id_type skip()
{
return static_cast<id_type>(~1);
}
// Reserved rule id 0 (end of input). It cannot be used for user rules; see
// check_for_invalid_id().
id_type eoi() const
{
return 0;
}
// Reserved id with every bit set (the largest id_type value). Used as the
// "no value" marker for ids, user ids and state lookups.
static id_type npos()
{
return static_cast<id_type>(~0);
}
// Install locale_ as the locale handed to the regex tokeniser and return
// the locale that was installed before the call.
std::locale imbue(const std::locale &locale_)
{
    std::locale previous_(_locale);

    _locale = locale_;
    return previous_;
}
// Locale currently handed to the regex tokeniser.
const std::locale &locale() const
{
    return _locale;
}
// Translate a state id into its name.
// Id 0 is always INITIAL; every other id indexes the list of user-defined
// state names (offset by one because INITIAL is not stored there).
// Returns a null pointer when index_ does not name a state.
const rules_char_type *state(const id_type index_) const
{
    if (index_ == 0)
    {
        return initial();
    }

    const id_type slot_ = index_ - 1;

    return slot_ < _lexer_state_names.size() ?
        _lexer_state_names[slot_].c_str() : nullptr;
}
// Translate a state name into its id, or npos() when no state with that
// name has been registered.
id_type state(const rules_char_type *name_) const
{
    const auto found_ = _statemap.find(name_);

    return found_ == _statemap.end() ? npos() : found_->second;
}
// Register a lexer start state and return its id.
//
// name_ must be a valid identifier (see validate()); otherwise a
// runtime_error is thrown. Registering a name that already exists is not an
// error: the id it was given originally is returned and nothing changes.
//
// Throws runtime_error when id_type cannot represent another state. The
// check is made before any container is modified, so a failed call leaves
// the rule set exactly as it was (previously the state was inserted first
// and stayed behind with an id equal to npos()).
id_type push_state(const rules_char_type *name_)
{
    validate(name_);

    const auto existing_ = _statemap.find(name_);

    if (existing_ != _statemap.end())
    {
        return existing_->second;
    }

    // The new state would receive id _statemap.size(); npos() is reserved,
    // so that id must stay strictly below it.
    if (_statemap.size() >= npos())
    {
        // Overflow
        throw runtime_error("The data type you have chosen cannot hold "
            "this many lexer start states.");
    }

    _statemap.insert(string_id_type_pair(name_,
        static_cast<id_type>(_statemap.size())));

    // One (empty) slot per state in every per-state table.
    _regexes.push_back(token_vector_vector());
    _features.push_back(0);
    _ids.push_back(id_vector());
    _user_ids.push_back(id_vector());
    _next_dfas.push_back(id_vector());
    _pushes.push_back(id_vector());
    _pops.push_back(bool_vector());

    if (string(name_) != initial())
    {
        _lexer_state_names.push_back(name_);
    }

    // Initial is not stored, so no need to - 1.
    return static_cast<id_type>(_lexer_state_names.size());
}
void insert_macro(const rules_char_type *name_,
const rules_char_type *regex_)
{
insert_macro(name_, string(regex_));
}
void insert_macro(const rules_char_type *name_,
const rules_char_type *regex_start_,
const rules_char_type *regex_end_)
{
insert_macro(name_, string(regex_start_, regex_end_));
}
void insert_macro(const rules_char_type *name_, const string &regex_)
{
validate(name_);
typename macro_map::const_iterator iter_ = _macro_map.find(name_);
if (iter_ == _macro_map.end())
{
auto pair_ = _macro_map.insert(macro_pair(name_, token_vector()));
tokenise(regex_, pair_.first->second, npos(), name_);
}
else
{
std::ostringstream ss_;
ss_ << "Attempt to redefine MACRO '";
narrow(name_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
}
// Add rule to INITIAL
void push(const rules_char_type *regex_, const id_type id_,
const id_type user_id_ = npos())
{
push(string(regex_), id_, user_id_);
}
void push(const rules_char_type *regex_start_,
const rules_char_type *regex_end_,
const id_type id_, const id_type user_id_ = npos())
{
push(string(regex_start_, regex_end_), id_, user_id_);
}
void push(const string &regex_, const id_type id_,
const id_type user_id_ = npos())
{
check_for_invalid_id(id_);
_regexes.front().push_back(token_vector());
tokenise(regex_, _regexes.front().back(), id_, 0);
if (regex_[0] == '^')
{
_features.front() |= bol_bit;
}
if (regex_.size() > 0 && regex_[regex_.size() - 1] == '$')
{
_features.front() |= eol_bit;
}
if (id_ == skip())
{
_features.front() |= skip_bit;
}
else if (id_ == eoi())
{
_features.front() |= again_bit;
}
_ids.front().push_back(id_);
_user_ids.front().push_back(user_id_);
_next_dfas.front().push_back(0);
_pushes.front().push_back(npos());
_pops.front().push_back(false);
}
// State-aware push() overloads. In all of them:
//  - curr_dfa_ names the state(s) the rule is added to: a single name, a
//    comma separated list of names, or "*" for every state registered so far.
//  - new_dfa_ names the state entered after a match: a state name, "." to
//    stay in the current state, ">NAME" to enter NAME and record a push, or
//    a string starting with "<" to record a pop.
// Add rule with no id
// (the rule is stored with id eoi() and the reserved-id check is skipped).
void push(const rules_char_type *curr_dfa_,
const rules_char_type *regex_, const rules_char_type *new_dfa_)
{
push(curr_dfa_, string(regex_), new_dfa_);
}
// As above, regex given as the range [regex_start_, regex_end_).
void push(const rules_char_type *curr_dfa_,
const rules_char_type *regex_start_, const rules_char_type *regex_end_,
const rules_char_type *new_dfa_)
{
push(curr_dfa_, string(regex_start_, regex_end_), new_dfa_);
}
// As above, regex given as a string object.
void push(const rules_char_type *curr_dfa_, const string &regex_,
const rules_char_type *new_dfa_)
{
push(curr_dfa_, regex_, eoi(), new_dfa_, false);
}
// Add rule with id
// (id_ must not be eoi() or npos(); user_id_ is stored unchanged).
void push(const rules_char_type *curr_dfa_,
const rules_char_type *regex_, const id_type id_,
const rules_char_type *new_dfa_, const id_type user_id_ = npos())
{
push(curr_dfa_, string(regex_), id_, new_dfa_, user_id_);
}
// As above, regex given as the range [regex_start_, regex_end_).
void push(const rules_char_type *curr_dfa_,
const rules_char_type *regex_start_,
const rules_char_type *regex_end_, const id_type id_,
const rules_char_type *new_dfa_, const id_type user_id_ = npos())
{
push(curr_dfa_, string(regex_start_, regex_end_),
id_, new_dfa_, user_id_);
}
// As above, regex given as a string object.
void push(const rules_char_type *curr_dfa_, const string &regex_,
const id_type id_, const rules_char_type *new_dfa_,
const id_type user_id_ = npos())
{
push(curr_dfa_, regex_, id_, new_dfa_, true, user_id_);
}
void reverse()
{
for (auto &state_ : _regexes)
{
for (auto &regex_ : state_)
{
reverse(regex_);
}
}
for (auto &pair_ : _macro_map)
{
reverse(pair_.second);
}
}
// Read-only views of the internal tables. Apart from statemap(), each table
// is indexed by state id; the nested tables are then indexed by the rule's
// position within that state.
// State name -> state id.
const string_id_type_map &statemap() const
{
return _statemap;
}
// Tokenised regexes: [state][rule] -> token sequence.
const token_vector_vector_vector &regexes() const
{
return _regexes;
}
// Feature bits (bol/eol/skip/again/recursive) accumulated per state.
const id_vector &features() const
{
return _features;
}
// Rule ids: [state][rule].
const id_vector_vector &ids() const
{
return _ids;
}
// User ids: [state][rule].
const id_vector_vector &user_ids() const
{
return _user_ids;
}
// State entered after a match: [state][rule].
const id_vector_vector &next_dfas() const
{
return _next_dfas;
}
// State id recorded for a push, or npos() when the rule does not push:
// [state][rule].
const id_vector_vector &pushes() const
{
return _pushes;
}
// Whether the rule records a pop: [state][rule].
const bool_vector_vector &pops() const
{
return _pops;
}
bool empty() const
{
bool empty_ = true;
for (const auto &regex_ : _regexes)
{
if (!regex_.empty())
{
empty_ = false;
break;
}
}
return empty_;
}
// Name of the start state that always exists: "INITIAL". Spelled as a
// character array so that it works for any rules_char_type.
static const rules_char_type *initial()
{
static const rules_char_type initial_ [] =
{ 'I', 'N', 'I', 'T', 'I', 'A', 'L', 0 };
return initial_;
}
// ".": as a rule's target state this means "stay in the current state".
static const rules_char_type *dot()
{
static const rules_char_type dot_ [] = { '.', 0 };
return dot_;
}
// "*": as a rule's current state this means "every registered state".
static const rules_char_type *all_states()
{
static const rules_char_type star_ [] = { '*', 0 };
return star_;
}
private:
// State name -> state id (INITIAL is registered first and so has id 0).
string_id_type_map _statemap;
// Macro name -> tokenised macro body.
macro_map _macro_map;
// The tables below hold one entry per state (indexed by state id). The
// nested ones hold one entry per rule of that state, in insertion order.
// Tokenised regexes.
token_vector_vector_vector _regexes;
// Feature bits accumulated from the state's rules.
id_vector _features;
// Rule ids.
id_vector_vector _ids;
// User ids.
id_vector_vector _user_ids;
// State entered after the rule matches.
id_vector_vector _next_dfas;
// State id recorded for a push, npos() when the rule does not push.
id_vector_vector _pushes;
// Whether the rule records a pop.
bool_vector_vector _pops;
// Regex flags handed to the tokeniser.
std::size_t _flags;
// Locale handed to the tokeniser.
std::locale _locale;
// Names of all states except INITIAL, in registration order.
string_vector _lexer_state_names;
// Convert regex_ into a token sequence appended to tokens_.
//
// Macro references are expanded from _macro_map and the {-} / {+} charset
// operators are applied to the preceding charset as they are encountered.
//
// id_   : rule id, used by the tokeniser state and in error messages.
// name_ : name of the macro being defined, or null when tokenising a rule;
//         selects whether errors say "macro <name>" or "rule id <id>".
//
// Throws runtime_error on a syntax error, an unknown macro, a misuse of
// {-} / {+}, a charset emptied by {-}, or an empty regex.
void tokenise(const string &regex_, token_vector &tokens_,
const id_type id_, const rules_char_type *name_)
{
re_state state_(regex_.c_str(), regex_.c_str() + regex_.size(), id_,
_flags, _locale, name_);
// Name of the macro expanded by the previous token, if any.
string macro_;
// Pending charset operator ('-' or '+'), 0 when none is pending.
rules_char_type diff_ = 0;
// Seed the sequence with a default-constructed token (presumably the BEGIN
// marker - TODO confirm against basic_re_token).
tokens_.push_back(token());
do
{
// lhs_ is the last token stored so far, rhs_ receives the next one.
observer_ptr<token> lhs_ = &tokens_.back();
token rhs_;
tokeniser::next(*lhs_, state_, rhs_);
// precedence() yielding ' ' is treated as "these two tokens may not be
// adjacent". DIFF is exempt because it is never stored as a token.
if (rhs_._type != detail::DIFF &&
lhs_->precedence(rhs_._type) == ' ')
{
std::ostringstream ss_;
ss_ << "A syntax error occurred: '" <<
lhs_->precedence_string() <<
"' against '" << rhs_.precedence_string() <<
"' preceding index " << state_.index() <<
" in ";
if (name_ != 0)
{
ss_ << "macro ";
narrow(name_, ss_);
}
else
{
ss_ << "rule id " << state_._id;
}
ss_ << '.';
throw runtime_error(ss_.str());
}
if (rhs_._type == detail::MACRO)
{
typename macro_map::const_iterator iter_ =
_macro_map.find(rhs_._extra);
macro_ = rhs_._extra;
if (iter_ == _macro_map.end())
{
const rules_char_type *rhs_name_ = rhs_._extra.c_str();
std::ostringstream ss_;
ss_ << "Unknown MACRO name '";
narrow(rhs_name_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
else
{
// A stored macro is bracketed by its BEGIN and END tokens, so more than
// three tokens means more than one token of actual content.
const bool multiple_ = iter_->second.size() > 3;
if (diff_)
{
if (multiple_)
{
std::ostringstream ss_;
ss_ << "Single CHARSET must follow {-} or {+} at "
"index " << state_.index() - 1 << " in ";
if (name_ != 0)
{
ss_ << "macro ";
narrow(name_, ss_);
}
else
{
ss_ << "rule id " << state_._id;
}
ss_ << '.';
throw runtime_error(ss_.str());
}
else
{
// The macro's single content token becomes the right-hand operand of the
// pending {-} / {+}; it is applied further down.
rhs_ = iter_->second[1];
}
}
// Any macro with more than one charset (or quantifiers)
// requires bracketing.
if (multiple_)
{
token open_;
open_._type = detail::OPENPAREN;
open_._str.insert('(');
tokens_.push_back(open_);
}
// Don't need to store token if it is diff.
if (!diff_)
{
// Don't insert BEGIN or END tokens
tokens_.insert(tokens_.end(), iter_->second.begin() + 1,
iter_->second.end() - 1);
lhs_ = &tokens_.back();
}
if (multiple_)
{
token close_;
close_._type = detail::CLOSEPAREN;
close_._str.insert(')');
tokens_.push_back(close_);
}
}
}
else if (rhs_._type == detail::DIFF)
{
// The left-hand operand of {-} / {+} must be a single charset. If it came
// from a macro, that macro must consist of exactly one content token.
if (!macro_.empty())
{
typename macro_map::const_iterator iter_ =
_macro_map.find(macro_);
if (iter_->second.size() > 3)
{
std::ostringstream ss_;
ss_ << "Single CHARSET must precede {-} or {+} at "
"index " << state_.index() - 1 << " in ";
if (name_ != 0)
{
ss_ << "macro ";
narrow(name_, ss_);
}
else
{
ss_ << "rule id " << state_._id;
}
ss_ << '.';
throw runtime_error(ss_.str());
}
}
// Remember the operator and fetch the right-hand operand on the next pass.
diff_ = rhs_._extra[0];
macro_.clear();
continue;
}
else if (!diff_)
{
// Ordinary token with no operator pending: store it as is.
tokens_.push_back(rhs_);
lhs_ = &tokens_.back();
macro_.clear();
}
// diff_ may have been set by previous conditional.
if (diff_)
{
if (rhs_._type != detail::CHARSET)
{
std::ostringstream ss_;
ss_ << "CHARSET must follow {-} or {+} at index " <<
state_.index() - 1 << " in ";
if (name_ != 0)
{
ss_ << "macro ";
narrow(name_, ss_);
}
else
{
ss_ << "rule id " << state_._id;
}
ss_ << '.';
throw runtime_error(ss_.str());
}
// Apply the operator to the charset already stored (lhs_); rhs_ itself is
// not stored.
switch (diff_)
{
case '-':
lhs_->_str.remove(rhs_._str);
if (lhs_->_str.empty())
{
std::ostringstream ss_;
ss_ << "Empty charset created by {-} at index " <<
state_.index() - 1 << " in ";
if (name_ != 0)
{
ss_ << "macro ";
narrow(name_, ss_);
}
else
{
ss_ << "rule id " << state_._id;
}
ss_ << '.';
throw runtime_error(ss_.str());
}
break;
case '+':
lhs_->_str.insert(rhs_._str);
break;
}
diff_ = 0;
}
} while (tokens_.back()._type != detail::END);
// Only the seed token and END were stored: the regex had no content.
if (tokens_.size() == 2)
{
std::ostringstream ss_;
ss_ << "Empty regex in ";
if (name_ != 0)
{
ss_ << "macro ";
narrow(name_, ss_);
}
else
{
ss_ << "rule id " << state_._id;
}
ss_ << " is not allowed.";
throw runtime_error(ss_.str());
}
}
// Replace vector_ with the token sequence of the reversed regex.
//
// The tokens are walked back to front and written front to back into a new
// vector. Direction-sensitive tokens are exchanged for their mirror image
// (BEGIN/END, BOL/EOL, OPENPAREN/CLOSEPAREN). Quantifiers follow their
// operand in the original order, so they are met first when walking
// backwards and must be emitted after that operand again.
void reverse(token_vector &vector_)
{
token_vector new_vector_(vector_.size(), token());
auto iter_ = vector_.rbegin();
auto end_ = vector_.rend();
auto dest_ = new_vector_.begin();
// One entry per currently open bracketed group: the position of the
// quantifier that applies to the group, or end_ when it has none.
std::stack<typename token_vector::reverse_iterator> stack_;
for (; iter_ != end_; ++iter_, ++dest_)
{
switch (iter_->_type)
{
case detail::BEGIN:
iter_->swap(*dest_);
dest_->_type = detail::END;
break;
case detail::BOL:
iter_->swap(*dest_);
dest_->_type = detail::EOL;
break;
case detail::EOL:
iter_->swap(*dest_);
dest_->_type = detail::BOL;
break;
case detail::OPENPAREN:
// End of the group in the reversed output: emit the closing bracket and,
// if a quantifier was deferred for this group, emit it right after.
iter_->swap(*dest_);
dest_->_type = detail::CLOSEPAREN;
if (stack_.top() != end_)
{
++dest_;
dest_->swap(*stack_.top());
}
stack_.pop();
break;
case detail::CLOSEPAREN:
// Start of an unquantified group in the reversed output.
iter_->swap(*dest_);
dest_->_type = detail::OPENPAREN;
stack_.push(end_);
break;
case detail::OPT:
case detail::AOPT:
case detail::ZEROORMORE:
case detail::AZEROORMORE:
case detail::ONEORMORE:
case detail::AONEORMORE:
case detail::REPEATN:
case detail::AREPEATN:
{
// temp_ is the token the quantifier applies to (next when walking
// backwards).
auto temp_ = iter_ + 1;
if (temp_->_type == detail::CLOSEPAREN)
{
// Quantified group: defer the quantifier until the group's OPENPAREN is
// reached and consume the CLOSEPAREN now as the reversed group's opener.
stack_.push(iter_);
++iter_;
iter_->swap(*dest_);
dest_->_type = detail::OPENPAREN;
}
else
{
// Quantified single token: emit the operand first, then the quantifier.
dest_->swap(*temp_);
++dest_;
dest_->swap(*iter_);
++iter_;
}
break;
}
case detail::END:
iter_->swap(*dest_);
dest_->_type = detail::BEGIN;
break;
default:
// detail::OR
// detail::CHARSET
iter_->swap(*dest_);
break;
}
}
new_vector_.swap(vector_);
}
void push(const rules_char_type *curr_dfa_, const string &regex_,
const id_type id_, const rules_char_type *new_dfa_,
const bool check_, const id_type user_id_ = npos())
{
const bool star_ = *curr_dfa_ == '*' && *(curr_dfa_ + 1) == 0;
const bool dot_ = *new_dfa_ == '.' && *(new_dfa_ + 1) == 0;
const bool push_ = *new_dfa_ == '>';
const rules_char_type *push_dfa_ = nullptr;
const bool pop_ = *new_dfa_ == '<';
if (push_ || pop_)
{
++new_dfa_;
}
if (check_)
{
check_for_invalid_id(id_);
}
if (!dot_ && !pop_)
{
const rules_char_type *temp_ = new_dfa_;
while (*temp_ && *temp_ != ':')
{
++temp_;
}
if (*temp_) push_dfa_ = temp_ + 1;
validate(new_dfa_, *temp_ ? temp_ : 0);
if (push_dfa_)
{
validate(push_dfa_);
}
}
// npos means pop here
id_type new_dfa_id_ = npos();
id_type push_dfa_id_ = npos();
typename string_id_type_map::const_iterator iter_;
auto end_ = _statemap.cend();
id_vector next_dfas_;
if (!dot_ && !pop_)
{
if (push_dfa_)
{
iter_ = _statemap.find(string(new_dfa_, push_dfa_ - 1));
}
else
{
iter_ = _statemap.find(new_dfa_);
}
if (iter_ == end_)
{
std::ostringstream ss_;
ss_ << "Unknown state name '";
narrow(new_dfa_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
new_dfa_id_ = iter_->second;
if (push_dfa_)
{
iter_ = _statemap.find(push_dfa_);
if (iter_ == end_)
{
std::ostringstream ss_;
ss_ << "Unknown state name '";
narrow(push_dfa_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
push_dfa_id_ = iter_->second;
}
}
if (star_)
{
const std::size_t size_ = _statemap.size();
for (id_type i_ = 0; i_ < size_; ++i_)
{
next_dfas_.push_back(i_);
}
}
else
{
const rules_char_type *start_ = curr_dfa_;
string next_dfa_;
while (*curr_dfa_)
{
while (*curr_dfa_ && *curr_dfa_ != ',')
{
++curr_dfa_;
}
next_dfa_.assign(start_, curr_dfa_);
if (*curr_dfa_)
{
++curr_dfa_;
start_ = curr_dfa_;
}
validate(next_dfa_.c_str());
iter_ = _statemap.find(next_dfa_.c_str());
if (iter_ == end_)
{
std::ostringstream ss_;
ss_ << "Unknown state name '";
curr_dfa_ = next_dfa_.c_str();
narrow(curr_dfa_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
next_dfas_.push_back(iter_->second);
}
}
for (std::size_t i_ = 0, size_ = next_dfas_.size();
i_ < size_; ++i_)
{
const id_type curr_ = next_dfas_[i_];
_regexes[curr_].push_back(token_vector());
tokenise(regex_, _regexes[curr_].back(), id_, 0);
if (regex_[0] == '^')
{
_features[curr_] |= bol_bit;
}
if (regex_[regex_.size() - 1] == '$')
{
_features[curr_] |= eol_bit;
}
if (id_ == skip())
{
_features[curr_] |= skip_bit;
}
else if (id_ == eoi())
{
_features[curr_] |= again_bit;
}
if (push_ || pop_)
{
_features[curr_] |= recursive_bit;
}
_ids[curr_].push_back(id_);
_user_ids[curr_].push_back(user_id_);
_next_dfas[curr_].push_back(dot_ ? curr_ : new_dfa_id_);
_pushes[curr_].push_back(push_ ? (push_dfa_ ?
push_dfa_id_ : curr_) : npos());
_pops[curr_].push_back(pop_);
}
}
void validate(const rules_char_type *name_,
const rules_char_type *end_ = nullptr) const
{
const rules_char_type *start_ = name_;
if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') &&
!(*name_ >= 'a' && *name_ <= 'z'))
{
std::ostringstream ss_;
ss_ << "Invalid name '";
narrow(name_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
else if (*name_)
{
++name_;
}
while (*name_ && name_ != end_)
{
if (*name_ != '_' && *name_ != '-' &&
!(*name_ >= 'A' && *name_ <= 'Z') &&
!(*name_ >= 'a' && *name_ <= 'z') &&
!(*name_ >= '0' && *name_ <= '9'))
{
std::ostringstream ss_;
ss_ << "Invalid name '";
name_ = start_;
narrow(name_, ss_);
ss_ << "'.";
throw runtime_error(ss_.str());
}
++name_;
}
}
// Reject rule ids that are reserved: eoi() (0) and npos() (all bits set).
// skip() is deliberately allowed. Throws runtime_error for a reserved id.
void check_for_invalid_id(const id_type id_) const
{
    if (id_ == eoi())
    {
        // Message typo fixed ("resuse" -> "reuse").
        throw runtime_error("Cannot reuse the id for eoi.");
    }

    if (id_ == npos())
    {
        throw runtime_error("The id npos is reserved for the "
            "UNKNOWN token.");
    }
}
};
// Ready-made instantiations for the common character types. The rule text
// and the lexed input use the same character type; the id type is left at
// its default.
using rules = basic_rules<char, char>;
using wrules = basic_rules<wchar_t, wchar_t>;
using u32rules = basic_rules<char32_t, char32_t>;
}
#endif