mirror of
https://github.com/YACReader/yacreader
synced 2025-05-28 03:10:27 -04:00
1019 lines
27 KiB
C++
1019 lines
27 KiB
C++
// rules.hpp
|
|
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
|
//
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
#ifndef LEXERTL_RULES_HPP
|
|
#define LEXERTL_RULES_HPP
|
|
|
|
#include "enums.hpp"
|
|
#include <locale>
|
|
#include <map>
|
|
#include "narrow.hpp"
|
|
#include "observer_ptr.hpp"
|
|
#include "parser/tokeniser/re_tokeniser.hpp"
|
|
#include "runtime_error.hpp"
|
|
#include <set>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
namespace lexertl
|
|
{
|
|
template<typename r_ch_type, typename ch_type,
|
|
typename id_ty = uint16_t>
|
|
class basic_rules
|
|
{
|
|
public:
|
|
using bool_vector = std::vector<bool>;
|
|
using bool_vector_vector = std::vector<bool_vector>;
|
|
using char_type = ch_type;
|
|
using rules_char_type = r_ch_type;
|
|
using id_type = id_ty;
|
|
using id_vector = std::vector<id_type>;
|
|
using id_vector_vector = std::vector<id_vector>;
|
|
using re_state = detail::basic_re_tokeniser_state<rules_char_type, id_type>;
|
|
using string = std::basic_string<rules_char_type>;
|
|
using string_token = basic_string_token<char_type>;
|
|
using string_vector = std::vector<string>;
|
|
using string_set = std::set<string>;
|
|
using string_pair = std::pair<string, string>;
|
|
using string_id_type_map = std::map<string, id_type>;
|
|
using string_id_type_pair = std::pair<string, id_type>;
|
|
using token = detail::basic_re_token<rules_char_type, char_type>;
|
|
using token_vector = std::vector<token>;
|
|
using token_vector_vector = std::vector<token_vector>;
|
|
using token_vector_vector_vector = std::vector<token_vector_vector>;
|
|
using macro_map = std::map<string, token_vector>;
|
|
using macro_pair = std::pair<string, token_vector>;
|
|
using tokeniser =
|
|
detail::basic_re_tokeniser<rules_char_type, char_type, id_type>;
|
|
|
|
// If you get a compile error here you have
|
|
// failed to define an unsigned id type.
|
|
static_assert(std::is_unsigned<id_type>::value, "Your id type is signed");
|
|
|
|
#ifdef _WIN32
|
|
basic_rules(const std::size_t flags_ = dot_not_cr_lf) :
|
|
#else
|
|
basic_rules(const std::size_t flags_ = dot_not_newline) :
|
|
#endif
|
|
_statemap(),
|
|
_macro_map(),
|
|
_regexes(),
|
|
_features(),
|
|
_ids(),
|
|
_user_ids(),
|
|
_next_dfas(),
|
|
_pushes(),
|
|
_pops(),
|
|
_flags(flags_),
|
|
_locale(),
|
|
_lexer_state_names()
|
|
{
|
|
push_state(initial());
|
|
}
|
|
|
|
void clear()
|
|
{
|
|
_statemap.clear();
|
|
_macro_map.clear();
|
|
_regexes.clear();
|
|
_features.clear();
|
|
_ids.clear();
|
|
_user_ids.clear();
|
|
_next_dfas.clear();
|
|
_pushes.clear();
|
|
_pops.clear();
|
|
#ifdef _WIN32
|
|
_flags = dot_not_cr_lf;
|
|
#else
|
|
_flags = dot_not_newline;
|
|
#endif
|
|
_locale = std::locale();
|
|
_lexer_state_names.clear();
|
|
push_state(initial());
|
|
}
|
|
|
|
void clear(const id_type dfa_)
|
|
{
|
|
if (_regexes.size() > dfa_)
|
|
{
|
|
_regexes[dfa_].clear();
|
|
_features[dfa_] = 0;
|
|
_ids[dfa_].clear();
|
|
_user_ids[dfa_].clear();
|
|
_next_dfas[dfa_].clear();
|
|
_pushes[dfa_].clear();
|
|
_pops[dfa_].clear();
|
|
}
|
|
}
|
|
|
|
void flags(const std::size_t flags_)
|
|
{
|
|
_flags = flags_;
|
|
}
|
|
|
|
std::size_t flags() const
|
|
{
|
|
return _flags;
|
|
}
|
|
|
|
// Reserved rule id: all bits set except the lowest one. Rules pushed with
// this id set skip_bit on their state.
static id_type skip() { return static_cast<id_type>(~1); }
|
|
|
|
// Reserved id 0. check_for_invalid_id() rejects it as a user rule id.
id_type eoi() const { return 0; }
|
|
|
|
// Reserved id with all bits set; used throughout this class as the
// "no id" / "not found" value.
static id_type npos() { return static_cast<id_type>(~0); }
|
|
|
|
std::locale imbue(const std::locale &locale_)
|
|
{
|
|
std::locale loc_ = _locale;
|
|
|
|
_locale = locale_;
|
|
return loc_;
|
|
}
|
|
|
|
const std::locale &locale() const
|
|
{
|
|
return _locale;
|
|
}
|
|
|
|
// Maps a state id back to its name.
// Returns initial() for id 0, the stored name for any other registered
// state, and a null pointer when index_ does not refer to a state.
const rules_char_type *state(const id_type index_) const
{
    if (index_ == 0)
    {
        return initial();
    }

    // INITIAL is not kept in _lexer_state_names, hence the offset of one.
    const id_type slot_ = index_ - 1;

    return slot_ < _lexer_state_names.size() ?
        _lexer_state_names[slot_].c_str() : nullptr;
}
|
|
|
|
// Looks up the id of the state called name_.
// Returns npos() when no such state has been registered.
id_type state(const rules_char_type *name_) const
{
    const typename string_id_type_map::const_iterator pos_ =
        _statemap.find(name_);

    return pos_ != _statemap.end() ? pos_->second : npos();
}
|
|
|
|
// Registers a lexer start state called name_ and returns its id.
//
// If the state already exists its existing id is returned and nothing is
// changed. Throws runtime_error when name_ is not a valid name (see
// validate()) or when id_type cannot represent another state.
//
// The capacity check is made before anything is modified, so a failed call
// no longer leaves behind a registered state whose id equals npos().
id_type push_state(const rules_char_type *name_)
{
    validate(name_);

    const string key_(name_);
    const typename string_id_type_map::const_iterator known_ =
        _statemap.find(key_);

    if (known_ != _statemap.end())
    {
        return known_->second;
    }

    // Every state owns one slot in each per-state table, so the number of
    // states after insertion must not exceed npos().
    if (_statemap.size() >= npos())
    {
        // Overflow
        throw runtime_error("The data type you have chosen cannot hold "
            "this many lexer start states.");
    }

    _statemap.insert(string_id_type_pair(key_,
        static_cast<id_type>(_statemap.size())));
    _regexes.push_back(token_vector_vector());
    _features.push_back(0);
    _ids.push_back(id_vector());
    _user_ids.push_back(id_vector());
    _next_dfas.push_back(id_vector());
    _pushes.push_back(id_vector());
    _pops.push_back(bool_vector());

    if (key_ != initial())
    {
        _lexer_state_names.push_back(key_);
    }

    // Initial is not stored, so no need to - 1.
    return static_cast<id_type>(_lexer_state_names.size());
}
|
|
|
|
void insert_macro(const rules_char_type *name_,
|
|
const rules_char_type *regex_)
|
|
{
|
|
insert_macro(name_, string(regex_));
|
|
}
|
|
|
|
void insert_macro(const rules_char_type *name_,
|
|
const rules_char_type *regex_start_,
|
|
const rules_char_type *regex_end_)
|
|
{
|
|
insert_macro(name_, string(regex_start_, regex_end_));
|
|
}
|
|
|
|
void insert_macro(const rules_char_type *name_, const string ®ex_)
|
|
{
|
|
validate(name_);
|
|
|
|
typename macro_map::const_iterator iter_ = _macro_map.find(name_);
|
|
|
|
if (iter_ == _macro_map.end())
|
|
{
|
|
auto pair_ = _macro_map.insert(macro_pair(name_, token_vector()));
|
|
|
|
tokenise(regex_, pair_.first->second, npos(), name_);
|
|
}
|
|
else
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Attempt to redefine MACRO '";
|
|
narrow(name_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
}
|
|
|
|
// Add rule to INITIAL
|
|
void push(const rules_char_type *regex_, const id_type id_,
|
|
const id_type user_id_ = npos())
|
|
{
|
|
push(string(regex_), id_, user_id_);
|
|
}
|
|
|
|
void push(const rules_char_type *regex_start_,
|
|
const rules_char_type *regex_end_,
|
|
const id_type id_, const id_type user_id_ = npos())
|
|
{
|
|
push(string(regex_start_, regex_end_), id_, user_id_);
|
|
}
|
|
|
|
void push(const string ®ex_, const id_type id_,
|
|
const id_type user_id_ = npos())
|
|
{
|
|
check_for_invalid_id(id_);
|
|
_regexes.front().push_back(token_vector());
|
|
tokenise(regex_, _regexes.front().back(), id_, 0);
|
|
|
|
if (regex_[0] == '^')
|
|
{
|
|
_features.front() |= bol_bit;
|
|
}
|
|
|
|
if (regex_.size() > 0 && regex_[regex_.size() - 1] == '$')
|
|
{
|
|
_features.front() |= eol_bit;
|
|
}
|
|
|
|
if (id_ == skip())
|
|
{
|
|
_features.front() |= skip_bit;
|
|
}
|
|
else if (id_ == eoi())
|
|
{
|
|
_features.front() |= again_bit;
|
|
}
|
|
|
|
_ids.front().push_back(id_);
|
|
_user_ids.front().push_back(user_id_);
|
|
_next_dfas.front().push_back(0);
|
|
_pushes.front().push_back(npos());
|
|
_pops.front().push_back(false);
|
|
}
|
|
|
|
// Add rule with no id
|
|
void push(const rules_char_type *curr_dfa_,
|
|
const rules_char_type *regex_, const rules_char_type *new_dfa_)
|
|
{
|
|
push(curr_dfa_, string(regex_), new_dfa_);
|
|
}
|
|
|
|
void push(const rules_char_type *curr_dfa_,
|
|
const rules_char_type *regex_start_, const rules_char_type *regex_end_,
|
|
const rules_char_type *new_dfa_)
|
|
{
|
|
push(curr_dfa_, string(regex_start_, regex_end_), new_dfa_);
|
|
}
|
|
|
|
void push(const rules_char_type *curr_dfa_, const string ®ex_,
|
|
const rules_char_type *new_dfa_)
|
|
{
|
|
push(curr_dfa_, regex_, eoi(), new_dfa_, false);
|
|
}
|
|
|
|
// Add rule with id
|
|
void push(const rules_char_type *curr_dfa_,
|
|
const rules_char_type *regex_, const id_type id_,
|
|
const rules_char_type *new_dfa_, const id_type user_id_ = npos())
|
|
{
|
|
push(curr_dfa_, string(regex_), id_, new_dfa_, user_id_);
|
|
}
|
|
|
|
void push(const rules_char_type *curr_dfa_,
|
|
const rules_char_type *regex_start_,
|
|
const rules_char_type *regex_end_, const id_type id_,
|
|
const rules_char_type *new_dfa_, const id_type user_id_ = npos())
|
|
{
|
|
push(curr_dfa_, string(regex_start_, regex_end_),
|
|
id_, new_dfa_, user_id_);
|
|
}
|
|
|
|
void push(const rules_char_type *curr_dfa_, const string ®ex_,
|
|
const id_type id_, const rules_char_type *new_dfa_,
|
|
const id_type user_id_ = npos())
|
|
{
|
|
push(curr_dfa_, regex_, id_, new_dfa_, true, user_id_);
|
|
}
|
|
|
|
void reverse()
|
|
{
|
|
for (auto &state_ : _regexes)
|
|
{
|
|
for (auto ®ex_ : state_)
|
|
{
|
|
reverse(regex_);
|
|
}
|
|
}
|
|
|
|
for (auto &pair_ : _macro_map)
|
|
{
|
|
reverse(pair_.second);
|
|
}
|
|
}
|
|
|
|
// Read-only access to the state-name -> state-id map.
const string_id_type_map &statemap() const { return _statemap; }
|
|
|
|
// Read-only access to the tokenised regexes, indexed by state id and then
// by rule position within that state.
const token_vector_vector_vector &regexes() const
{
    return _regexes;
}
|
|
|
|
// Read-only access to the per-state feature bit masks.
const id_vector &features() const { return _features; }
|
|
|
|
// Read-only access to the rule ids, indexed by state id then rule.
const id_vector_vector &ids() const { return _ids; }
|
|
|
|
// Read-only access to the user ids, indexed by state id then rule.
const id_vector_vector &user_ids() const { return _user_ids; }
|
|
|
|
// Read-only access to each rule's follow-on state id, indexed by state id
// then rule.
const id_vector_vector &next_dfas() const { return _next_dfas; }
|
|
|
|
// Read-only access to each rule's pushed state id (npos() when the rule
// pushes nothing), indexed by state id then rule.
const id_vector_vector &pushes() const { return _pushes; }
|
|
|
|
// Read-only access to each rule's pop flag, indexed by state id then rule.
const bool_vector_vector &pops() const { return _pops; }
|
|
|
|
bool empty() const
|
|
{
|
|
bool empty_ = true;
|
|
|
|
for (const auto ®ex_ : _regexes)
|
|
{
|
|
if (!regex_.empty())
|
|
{
|
|
empty_ = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return empty_;
|
|
}
|
|
|
|
// Name of the state that always exists with id 0: "INITIAL", spelled out
// character by character so it works for any rules_char_type.
static const rules_char_type *initial()
{
    static const rules_char_type name_[] =
        { 'I', 'N', 'I', 'T', 'I', 'A', 'L', 0 };

    return name_;
}
|
|
|
|
// The string ".", which the state-aware push() overloads interpret as
// "stay in the current state" when passed as the new state.
static const rules_char_type *dot()
{
    static const rules_char_type name_[] = { '.', 0 };

    return name_;
}
|
|
|
|
// The string "*", which the state-aware push() overloads interpret as
// "every registered state" when passed as the current state.
static const rules_char_type *all_states()
{
    static const rules_char_type name_[] = { '*', 0 };

    return name_;
}
|
|
|
|
private:
|
|
string_id_type_map _statemap;
|
|
macro_map _macro_map;
|
|
token_vector_vector_vector _regexes;
|
|
id_vector _features;
|
|
id_vector_vector _ids;
|
|
id_vector_vector _user_ids;
|
|
id_vector_vector _next_dfas;
|
|
id_vector_vector _pushes;
|
|
bool_vector_vector _pops;
|
|
std::size_t _flags;
|
|
std::locale _locale;
|
|
string_vector _lexer_state_names;
|
|
|
|
void tokenise(const string ®ex_, token_vector &tokens_,
|
|
const id_type id_, const rules_char_type *name_)
|
|
{
|
|
re_state state_(regex_.c_str(), regex_.c_str() + regex_.size(), id_,
|
|
_flags, _locale, name_);
|
|
string macro_;
|
|
rules_char_type diff_ = 0;
|
|
|
|
tokens_.push_back(token());
|
|
|
|
do
|
|
{
|
|
observer_ptr<token> lhs_ = &tokens_.back();
|
|
token rhs_;
|
|
|
|
tokeniser::next(*lhs_, state_, rhs_);
|
|
|
|
if (rhs_._type != detail::DIFF &&
|
|
lhs_->precedence(rhs_._type) == ' ')
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "A syntax error occurred: '" <<
|
|
lhs_->precedence_string() <<
|
|
"' against '" << rhs_.precedence_string() <<
|
|
"' preceding index " << state_.index() <<
|
|
" in ";
|
|
|
|
if (name_ != 0)
|
|
{
|
|
ss_ << "macro ";
|
|
narrow(name_, ss_);
|
|
}
|
|
else
|
|
{
|
|
ss_ << "rule id " << state_._id;
|
|
}
|
|
|
|
ss_ << '.';
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
if (rhs_._type == detail::MACRO)
|
|
{
|
|
typename macro_map::const_iterator iter_ =
|
|
_macro_map.find(rhs_._extra);
|
|
|
|
macro_ = rhs_._extra;
|
|
|
|
if (iter_ == _macro_map.end())
|
|
{
|
|
const rules_char_type *rhs_name_ = rhs_._extra.c_str();
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Unknown MACRO name '";
|
|
narrow(rhs_name_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
else
|
|
{
|
|
const bool multiple_ = iter_->second.size() > 3;
|
|
|
|
if (diff_)
|
|
{
|
|
if (multiple_)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Single CHARSET must follow {-} or {+} at "
|
|
"index " << state_.index() - 1 << " in ";
|
|
|
|
if (name_ != 0)
|
|
{
|
|
ss_ << "macro ";
|
|
narrow(name_, ss_);
|
|
}
|
|
else
|
|
{
|
|
ss_ << "rule id " << state_._id;
|
|
}
|
|
|
|
ss_ << '.';
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
else
|
|
{
|
|
rhs_ = iter_->second[1];
|
|
}
|
|
}
|
|
|
|
// Any macro with more than one charset (or quantifiers)
|
|
// requires bracketing.
|
|
if (multiple_)
|
|
{
|
|
token open_;
|
|
|
|
open_._type = detail::OPENPAREN;
|
|
open_._str.insert('(');
|
|
tokens_.push_back(open_);
|
|
}
|
|
|
|
// Don't need to store token if it is diff.
|
|
if (!diff_)
|
|
{
|
|
// Don't insert BEGIN or END tokens
|
|
tokens_.insert(tokens_.end(), iter_->second.begin() + 1,
|
|
iter_->second.end() - 1);
|
|
lhs_ = &tokens_.back();
|
|
}
|
|
|
|
if (multiple_)
|
|
{
|
|
token close_;
|
|
|
|
close_._type = detail::CLOSEPAREN;
|
|
close_._str.insert(')');
|
|
tokens_.push_back(close_);
|
|
}
|
|
}
|
|
}
|
|
else if (rhs_._type == detail::DIFF)
|
|
{
|
|
if (!macro_.empty())
|
|
{
|
|
typename macro_map::const_iterator iter_ =
|
|
_macro_map.find(macro_);
|
|
|
|
if (iter_->second.size() > 3)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Single CHARSET must precede {-} or {+} at "
|
|
"index " << state_.index() - 1 << " in ";
|
|
|
|
if (name_ != 0)
|
|
{
|
|
ss_ << "macro ";
|
|
narrow(name_, ss_);
|
|
}
|
|
else
|
|
{
|
|
ss_ << "rule id " << state_._id;
|
|
}
|
|
|
|
ss_ << '.';
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
}
|
|
|
|
diff_ = rhs_._extra[0];
|
|
macro_.clear();
|
|
continue;
|
|
}
|
|
else if (!diff_)
|
|
{
|
|
tokens_.push_back(rhs_);
|
|
lhs_ = &tokens_.back();
|
|
macro_.clear();
|
|
}
|
|
|
|
// diff_ may have been set by previous conditional.
|
|
if (diff_)
|
|
{
|
|
if (rhs_._type != detail::CHARSET)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "CHARSET must follow {-} or {+} at index " <<
|
|
state_.index() - 1 << " in ";
|
|
|
|
if (name_ != 0)
|
|
{
|
|
ss_ << "macro ";
|
|
narrow(name_, ss_);
|
|
}
|
|
else
|
|
{
|
|
ss_ << "rule id " << state_._id;
|
|
}
|
|
|
|
ss_ << '.';
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
switch (diff_)
|
|
{
|
|
case '-':
|
|
lhs_->_str.remove(rhs_._str);
|
|
|
|
if (lhs_->_str.empty())
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Empty charset created by {-} at index " <<
|
|
state_.index() - 1 << " in ";
|
|
|
|
if (name_ != 0)
|
|
{
|
|
ss_ << "macro ";
|
|
narrow(name_, ss_);
|
|
}
|
|
else
|
|
{
|
|
ss_ << "rule id " << state_._id;
|
|
}
|
|
|
|
ss_ << '.';
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
break;
|
|
case '+':
|
|
lhs_->_str.insert(rhs_._str);
|
|
break;
|
|
}
|
|
|
|
diff_ = 0;
|
|
}
|
|
} while (tokens_.back()._type != detail::END);
|
|
|
|
if (tokens_.size() == 2)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Empty regex in ";
|
|
|
|
if (name_ != 0)
|
|
{
|
|
ss_ << "macro ";
|
|
narrow(name_, ss_);
|
|
}
|
|
else
|
|
{
|
|
ss_ << "rule id " << state_._id;
|
|
}
|
|
|
|
ss_ << " is not allowed.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
}
|
|
|
|
// Replaces vector_ with its reversed token sequence.
//
// The tokens are walked back to front and moved (by swap) into a new
// vector. Paired token types trade places: BEGIN/END, BOL/EOL and
// OPENPAREN/CLOSEPAREN. A quantifier is met before its operand when
// walking backwards, so it is re-emitted after the operand: directly after
// a single token, or after the closing parenthesis of a group (the
// quantifier's position is parked on a stack until the group ends).
void reverse(token_vector &vector_)
{
    token_vector reversed_(vector_.size(), token());
    auto src_ = vector_.rbegin();
    auto src_end_ = vector_.rend();
    auto out_ = reversed_.begin();
    // One entry per open group: the parked quantifier, or src_end_ when
    // the group has no quantifier.
    std::stack<typename token_vector::reverse_iterator> pending_;

    while (src_ != src_end_)
    {
        switch (src_->_type)
        {
        case detail::BEGIN:
            src_->swap(*out_);
            out_->_type = detail::END;
            break;
        case detail::END:
            src_->swap(*out_);
            out_->_type = detail::BEGIN;
            break;
        case detail::BOL:
            src_->swap(*out_);
            out_->_type = detail::EOL;
            break;
        case detail::EOL:
            src_->swap(*out_);
            out_->_type = detail::BOL;
            break;
        case detail::CLOSEPAREN:
            // Group without a quantifier: park the sentinel.
            src_->swap(*out_);
            out_->_type = detail::OPENPAREN;
            pending_.push(src_end_);
            break;
        case detail::OPENPAREN:
            src_->swap(*out_);
            out_->_type = detail::CLOSEPAREN;

            if (pending_.top() != src_end_)
            {
                // Emit the quantifier parked for this group.
                ++out_;
                out_->swap(*pending_.top());
            }

            pending_.pop();
            break;
        case detail::OPT:
        case detail::AOPT:
        case detail::ZEROORMORE:
        case detail::AZEROORMORE:
        case detail::ONEORMORE:
        case detail::AONEORMORE:
        case detail::REPEATN:
        case detail::AREPEATN:
        {
            auto operand_ = src_ + 1;

            if (operand_->_type == detail::CLOSEPAREN)
            {
                // Quantified group: park the quantifier and consume the
                // CLOSEPAREN right away.
                pending_.push(src_);
                ++src_;
                src_->swap(*out_);
                out_->_type = detail::OPENPAREN;
            }
            else
            {
                // Quantified single token: operand first, then quantifier.
                out_->swap(*operand_);
                ++out_;
                out_->swap(*src_);
                ++src_;
            }

            break;
        }
        default:
            // detail::OR
            // detail::CHARSET
            src_->swap(*out_);
            break;
        }

        ++src_;
        ++out_;
    }

    reversed_.swap(vector_);
}
|
|
|
|
void push(const rules_char_type *curr_dfa_, const string ®ex_,
|
|
const id_type id_, const rules_char_type *new_dfa_,
|
|
const bool check_, const id_type user_id_ = npos())
|
|
{
|
|
const bool star_ = *curr_dfa_ == '*' && *(curr_dfa_ + 1) == 0;
|
|
const bool dot_ = *new_dfa_ == '.' && *(new_dfa_ + 1) == 0;
|
|
const bool push_ = *new_dfa_ == '>';
|
|
const rules_char_type *push_dfa_ = nullptr;
|
|
const bool pop_ = *new_dfa_ == '<';
|
|
|
|
if (push_ || pop_)
|
|
{
|
|
++new_dfa_;
|
|
}
|
|
|
|
if (check_)
|
|
{
|
|
check_for_invalid_id(id_);
|
|
}
|
|
|
|
if (!dot_ && !pop_)
|
|
{
|
|
const rules_char_type *temp_ = new_dfa_;
|
|
|
|
while (*temp_ && *temp_ != ':')
|
|
{
|
|
++temp_;
|
|
}
|
|
|
|
if (*temp_) push_dfa_ = temp_ + 1;
|
|
|
|
validate(new_dfa_, *temp_ ? temp_ : 0);
|
|
|
|
if (push_dfa_)
|
|
{
|
|
validate(push_dfa_);
|
|
}
|
|
}
|
|
|
|
// npos means pop here
|
|
id_type new_dfa_id_ = npos();
|
|
id_type push_dfa_id_ = npos();
|
|
typename string_id_type_map::const_iterator iter_;
|
|
auto end_ = _statemap.cend();
|
|
id_vector next_dfas_;
|
|
|
|
if (!dot_ && !pop_)
|
|
{
|
|
if (push_dfa_)
|
|
{
|
|
iter_ = _statemap.find(string(new_dfa_, push_dfa_ - 1));
|
|
}
|
|
else
|
|
{
|
|
iter_ = _statemap.find(new_dfa_);
|
|
}
|
|
|
|
if (iter_ == end_)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Unknown state name '";
|
|
narrow(new_dfa_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
new_dfa_id_ = iter_->second;
|
|
|
|
if (push_dfa_)
|
|
{
|
|
iter_ = _statemap.find(push_dfa_);
|
|
|
|
if (iter_ == end_)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Unknown state name '";
|
|
narrow(push_dfa_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
push_dfa_id_ = iter_->second;
|
|
}
|
|
}
|
|
|
|
if (star_)
|
|
{
|
|
const std::size_t size_ = _statemap.size();
|
|
|
|
for (id_type i_ = 0; i_ < size_; ++i_)
|
|
{
|
|
next_dfas_.push_back(i_);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
const rules_char_type *start_ = curr_dfa_;
|
|
string next_dfa_;
|
|
|
|
while (*curr_dfa_)
|
|
{
|
|
while (*curr_dfa_ && *curr_dfa_ != ',')
|
|
{
|
|
++curr_dfa_;
|
|
}
|
|
|
|
next_dfa_.assign(start_, curr_dfa_);
|
|
|
|
if (*curr_dfa_)
|
|
{
|
|
++curr_dfa_;
|
|
start_ = curr_dfa_;
|
|
}
|
|
|
|
validate(next_dfa_.c_str());
|
|
iter_ = _statemap.find(next_dfa_.c_str());
|
|
|
|
if (iter_ == end_)
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Unknown state name '";
|
|
curr_dfa_ = next_dfa_.c_str();
|
|
narrow(curr_dfa_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
next_dfas_.push_back(iter_->second);
|
|
}
|
|
}
|
|
|
|
for (std::size_t i_ = 0, size_ = next_dfas_.size();
|
|
i_ < size_; ++i_)
|
|
{
|
|
const id_type curr_ = next_dfas_[i_];
|
|
|
|
_regexes[curr_].push_back(token_vector());
|
|
tokenise(regex_, _regexes[curr_].back(), id_, 0);
|
|
|
|
if (regex_[0] == '^')
|
|
{
|
|
_features[curr_] |= bol_bit;
|
|
}
|
|
|
|
if (regex_[regex_.size() - 1] == '$')
|
|
{
|
|
_features[curr_] |= eol_bit;
|
|
}
|
|
|
|
if (id_ == skip())
|
|
{
|
|
_features[curr_] |= skip_bit;
|
|
}
|
|
else if (id_ == eoi())
|
|
{
|
|
_features[curr_] |= again_bit;
|
|
}
|
|
|
|
if (push_ || pop_)
|
|
{
|
|
_features[curr_] |= recursive_bit;
|
|
}
|
|
|
|
_ids[curr_].push_back(id_);
|
|
_user_ids[curr_].push_back(user_id_);
|
|
_next_dfas[curr_].push_back(dot_ ? curr_ : new_dfa_id_);
|
|
_pushes[curr_].push_back(push_ ? (push_dfa_ ?
|
|
push_dfa_id_ : curr_) : npos());
|
|
_pops[curr_].push_back(pop_);
|
|
}
|
|
}
|
|
|
|
void validate(const rules_char_type *name_,
|
|
const rules_char_type *end_ = nullptr) const
|
|
{
|
|
const rules_char_type *start_ = name_;
|
|
|
|
if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') &&
|
|
!(*name_ >= 'a' && *name_ <= 'z'))
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Invalid name '";
|
|
narrow(name_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
else if (*name_)
|
|
{
|
|
++name_;
|
|
}
|
|
|
|
while (*name_ && name_ != end_)
|
|
{
|
|
if (*name_ != '_' && *name_ != '-' &&
|
|
!(*name_ >= 'A' && *name_ <= 'Z') &&
|
|
!(*name_ >= 'a' && *name_ <= 'z') &&
|
|
!(*name_ >= '0' && *name_ <= '9'))
|
|
{
|
|
std::ostringstream ss_;
|
|
|
|
ss_ << "Invalid name '";
|
|
name_ = start_;
|
|
narrow(name_, ss_);
|
|
ss_ << "'.";
|
|
throw runtime_error(ss_.str());
|
|
}
|
|
|
|
++name_;
|
|
}
|
|
}
|
|
|
|
void check_for_invalid_id(const id_type id_) const
|
|
{
|
|
if (id_ == eoi())
|
|
{
|
|
throw runtime_error("Cannot resuse the id for eoi.");
|
|
}
|
|
|
|
if (id_ == npos())
|
|
{
|
|
throw runtime_error("The id npos is reserved for the "
|
|
"UNKNOWN token.");
|
|
}
|
|
}
|
|
};
|
|
|
|
// Ready-made instantiations for char, wchar_t and char32_t, each using the
// same type for rule text and token characters and the default id type.
using rules = basic_rules<char, char>;
using wrules = basic_rules<wchar_t, wchar_t>;
using u32rules = basic_rules<char32_t, char32_t>;
|
|
}
|
|
|
|
#endif
|