Add commit 43aab01 of BenHanson/lexertl14 from github

Authored by Iain Benson on 2018-11-16 22:34:49 +00:00, committed by Luis Ángel San Martín
parent c4f792bd40
commit d3de52ca82
37 changed files with 12723 additions and 1 deletion


@@ -147,7 +147,42 @@ HEADERS += comic_flow.h \
yacreader_comics_selection_helper.h \
yacreader_comic_info_helper.h \
db/reading_list.h \
current_comic_view_helper.h
current_comic_view_helper.h \
lexertl/parser/tokeniser/re_token.hpp \
lexertl/parser/tokeniser/re_tokeniser.hpp \
lexertl/parser/tokeniser/re_tokeniser_helper.hpp \
lexertl/parser/tokeniser/re_tokeniser_state.hpp \
lexertl/parser/tree/end_node.hpp \
lexertl/parser/tree/iteration_node.hpp \
lexertl/parser/tree/leaf_node.hpp \
lexertl/parser/tree/node.hpp \
lexertl/parser/tree/selection_node.hpp \
lexertl/parser/tree/sequence_node.hpp \
lexertl/parser/parser.hpp \
lexertl/partition/charset.hpp \
lexertl/partition/equivset.hpp \
lexertl/char_traits.hpp \
lexertl/debug.hpp \
lexertl/dot.hpp \
lexertl/enums.hpp \
lexertl/generate_cpp.hpp \
lexertl/generator.hpp \
lexertl/internals.hpp \
lexertl/iterator.hpp \
lexertl/lookup.hpp \
lexertl/match_results.hpp \
lexertl/memory_file.hpp \
lexertl/narrow.hpp \
lexertl/observer_ptr.hpp \
lexertl/rules.hpp \
lexertl/runtime_error.hpp \
lexertl/serialise.hpp \
lexertl/sm_to_csm.hpp \
lexertl/sm_traits.hpp \
lexertl/state_machine.hpp \
lexertl/stream_shared_iterator.hpp \
lexertl/string_token.hpp \
lexertl/utf_iterators.hpp
!CONFIG(no_opengl) {
HEADERS += ../common/gl/yacreader_flow_gl.h
@@ -324,3 +359,6 @@ translation.files = ../release/languages/yacreaderlibrary_*
manpage.path = $$DATADIR/man/man1
manpage.files = ../YACReaderLibrary.1
}
DISTFILES += \
lexertl/licence_1_0.txt


@@ -0,0 +1,45 @@
// char_traits.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_CHAR_TRAITS_HPP
#define LEXERTL_CHAR_TRAITS_HPP
#include <cstddef>
namespace lexertl
{
template<typename ch_type>
struct basic_char_traits
{
using char_type = ch_type;
using index_type = ch_type;
static index_type max_val()
{
const std::uint32_t max_ = 0x10ffff;
return sizeof(char_type) > 2 ?
max_ : (max_ & 0xffff);
}
};
template<>
struct basic_char_traits<char>
{
using char_type = char;
using index_type = unsigned char;
static index_type max_val()
{
// Prevent annoying warning (VC++)
index_type zero_ = 0;
return ~zero_;
}
};
}
#endif
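
A small illustration of what these traits evaluate to (not taken from the commit; for wchar_t the result depends on the platform's sizeof(wchar_t)):

// char_traits_demo.cpp - illustrative sketch only.
#include <cstdint>
#include <iostream>
#include "lexertl/char_traits.hpp"

int main()
{
    // index_type for char is unsigned char, so max_val() is 255.
    std::cout << static_cast<unsigned>(
        lexertl::basic_char_traits<char>::max_val()) << '\n';
    // For char32_t (sizeof > 2) the full Unicode range 0x10ffff is used.
    std::cout << std::hex << static_cast<std::uint32_t>(
        lexertl::basic_char_traits<char32_t>::max_val()) << '\n';
}
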


@@ -0,0 +1,311 @@
// debug.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_DEBUG_HPP
#define LEXERTL_DEBUG_HPP
#include <map>
#include <ostream>
#include "rules.hpp"
#include "sm_to_csm.hpp"
#include "state_machine.hpp"
#include "string_token.hpp"
#include <vector>
namespace lexertl
{
template<typename sm, typename char_type, typename id_type = uint16_t,
bool is_dfa = true>
class basic_debug
{
public:
using char_state_machine =
basic_char_state_machine<char_type, id_type, is_dfa>;
using ostream = std::basic_ostream<char_type>;
using rules = basic_rules<char_type, char_type, id_type>;
using string = std::basic_string<char_type>;
static void dump(const sm &sm_, rules &rules_, ostream &stream_)
{
char_state_machine csm_;
sm_to_csm(sm_, csm_);
dump(csm_, rules_, stream_);
}
static void dump(const sm &sm_, ostream &stream_)
{
char_state_machine csm_;
sm_to_csm(sm_, csm_);
dump(csm_, stream_);
}
static void dump(const char_state_machine &csm_, rules &rules_,
ostream &stream_)
{
for (std::size_t dfa_ = 0, dfas_ = csm_.size(); dfa_ < dfas_; ++dfa_)
{
lexer_state(stream_);
stream_ << rules_.state(dfa_) << std::endl << std::endl;
dump_ex(csm_._sm_vector[dfa_], stream_);
}
}
static void dump(const char_state_machine &csm_, ostream &stream_)
{
for (std::size_t dfa_ = 0, dfas_ = csm_.size(); dfa_ < dfas_; ++dfa_)
{
lexer_state(stream_);
stream_ << dfa_ << std::endl << std::endl;
dump_ex(csm_._sm_vector[dfa_], stream_);
}
}
protected:
using dfa_state = typename char_state_machine::state;
using string_token = typename dfa_state::string_token;
using stringstream = std::basic_stringstream<char_type>;
static void dump_ex(const typename char_state_machine::dfa &dfa_,
ostream &stream_)
{
const std::size_t states_ = dfa_._states.size();
const id_type bol_index_ = dfa_._bol_index;
for (std::size_t i_ = 0; i_ < states_; ++i_)
{
const dfa_state &state_ = dfa_._states[i_];
state(stream_);
stream_ << i_ << std::endl;
if (state_._end_state)
{
end_state(stream_);
if (state_._push_pop_dfa == dfa_state::push_dfa)
{
push(stream_);
stream_ << state_._push_dfa;
}
else if (state_._push_pop_dfa == dfa_state::pop_dfa)
{
pop(stream_);
}
id(stream_);
stream_ << static_cast<std::size_t>(state_._id);
user_id(stream_);
stream_ << static_cast<std::size_t>(state_._user_id);
dfa(stream_);
stream_ << static_cast<std::size_t>(state_._next_dfa);
stream_ << std::endl;
}
if (i_ == 0 && bol_index_ != char_state_machine::npos())
{
bol(stream_);
stream_ << static_cast<std::size_t>(bol_index_) << std::endl;
}
if (state_._eol_index != char_state_machine::npos())
{
eol(stream_);
stream_ << static_cast<std::size_t>(state_._eol_index) <<
std::endl;
}
for (const auto &tran_ : state_._transitions)
{
string_token token_ = tran_.second;
open_bracket(stream_);
if (!tran_.second.any() && tran_.second.negatable())
{
token_.negate();
negated(stream_);
}
string chars_;
for (const auto &range_ : token_._ranges)
{
if (range_.first == '-' || range_.first == '^' ||
range_.first == ']')
{
stream_ << '\\';
}
chars_ = string_token::escape_char
(range_.first);
if (range_.first != range_.second)
{
if (range_.first + 1 < range_.second)
{
chars_ += '-';
}
if (range_.second == '-' || range_.second == '^' ||
range_.second == ']')
{
stream_ << '\\';
}
chars_ += string_token::escape_char(range_.second);
}
stream_ << chars_;
}
close_bracket(stream_);
stream_ << static_cast<std::size_t>(tran_.first) <<
std::endl;
}
stream_ << std::endl;
}
}
static void lexer_state(std::ostream &stream_)
{
stream_ << "Lexer state: ";
}
static void lexer_state(std::wostream &stream_)
{
stream_ << L"Lexer state: ";
}
static void state(std::ostream &stream_)
{
stream_ << "State: ";
}
static void state(std::wostream &stream_)
{
stream_ << L"State: ";
}
static void bol(std::ostream &stream_)
{
stream_ << " BOL -> ";
}
static void bol(std::wostream &stream_)
{
stream_ << L" BOL -> ";
}
static void eol(std::ostream &stream_)
{
stream_ << " EOL -> ";
}
static void eol(std::wostream &stream_)
{
stream_ << L" EOL -> ";
}
static void end_state(std::ostream &stream_)
{
stream_ << " END STATE";
}
static void end_state(std::wostream &stream_)
{
stream_ << L" END STATE";
}
static void id(std::ostream &stream_)
{
stream_ << ", Id = ";
}
static void id(std::wostream &stream_)
{
stream_ << L", Id = ";
}
static void push(std::ostream &stream_)
{
stream_ << ", PUSH ";
}
static void push(std::wostream &stream_)
{
stream_ << L", PUSH ";
}
static void pop(std::ostream &stream_)
{
stream_ << ", POP";
}
static void pop(std::wostream &stream_)
{
stream_ << L", POP";
}
static void user_id(std::ostream &stream_)
{
stream_ << ", User Id = ";
}
static void user_id(std::wostream &stream_)
{
stream_ << L", User Id = ";
}
static void open_bracket(std::ostream &stream_)
{
stream_ << " [";
}
static void open_bracket(std::wostream &stream_)
{
stream_ << L" [";
}
static void negated(std::ostream &stream_)
{
stream_ << "^";
}
static void negated(std::wostream &stream_)
{
stream_ << L"^";
}
static void close_bracket(std::ostream &stream_)
{
stream_ << "] -> ";
}
static void close_bracket(std::wostream &stream_)
{
stream_ << L"] -> ";
}
static void dfa(std::ostream &stream_)
{
stream_ << ", dfa = ";
}
static void dfa(std::wostream &stream_)
{
stream_ << L", dfa = ";
}
};
using debug = basic_debug<state_machine, char>;
using wdebug = basic_debug<wstate_machine, wchar_t>;
}
#endif
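
A minimal usage sketch for basic_debug (not taken from the commit; it assumes the usual rules::push(regex, id) interface from rules.hpp, which is not reproduced in this excerpt): build a state machine with the generator, then dump its DFA states to a stream. The two-argument overload dump(sm_, std::cout) does the same without a rules object.

// debug_demo.cpp - illustrative sketch only.
#include <iostream>
#include "lexertl/debug.hpp"
#include "lexertl/generator.hpp"

int main()
{
    lexertl::rules rules_;          // rules::push() assumed from rules.hpp
    lexertl::state_machine sm_;

    rules_.push("[0-9]+", 1);       // token id 1: integers
    rules_.push("[a-z]+", 2);       // token id 2: words
    lexertl::generator::build(rules_, sm_);

    // Writes every DFA state, its transitions and end-state ids.
    lexertl::debug::dump(sm_, rules_, std::cout);
}
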


@@ -0,0 +1,293 @@
// dot.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
// Copyright (c) 2013 Autodesk, Inc. All rights reserved.
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_DOT_HPP
#define LEXERTL_DOT_HPP
#include <ostream>
#include "rules.hpp"
#include "state_machine.hpp"
#include "sm_to_csm.hpp"
namespace lexertl
{
//! The class template basic_dot contains utility functions used to
//! dump a description of a finite state machine formatted in the
//! DOT language (http://www.graphviz.org/doc/info/lang.html). The
//! resulting directed graph can be previewed by opening the ".dot" file
//! in the GraphViz application (http://www.graphviz.org).
template<typename sm, typename char_type, typename id_type = uint16_t,
bool is_dfa = true>
class basic_dot
{
public:
using char_state_machine =
basic_char_state_machine<char_type, id_type, is_dfa>;
using rules = basic_rules<char_type, char_type, id_type>;
using ostream = std::basic_ostream<char_type>;
using string = std::basic_string<char_type>;
//! Dumps a description of the finite state machine expressed in
//! the DOT language to the given output stream.
static void dump(const sm &sm_, rules &rules_, ostream &stream_)
{
char_state_machine csm_;
sm_to_csm(sm_, csm_);
dump(csm_, rules_, stream_);
}
//! Dumps a description of the finite state machine expressed in
//! the DOT language to the given output stream.
static void dump(const char_state_machine &csm_, rules &rules_,
ostream &stream_)
{
header(stream_);
for (std::size_t dfa_ = 0, dfas_ = csm_.size(); dfa_ < dfas_; ++dfa_)
{
dump_ex(dfa_, csm_._sm_vector[dfa_], rules_, stream_);
}
trailer(stream_);
}
protected:
using dfa_state = typename char_state_machine::state;
using string_token = typename dfa_state::string_token;
using stringstream = std::basic_stringstream<char_type>;
// Naming of nodes used in the DOT diagram. The naming is of the
// form: L<dfa_id>_S<state_id>.
static string node_name(id_type dfa_id_, id_type state_id_)
{
stringstream namestream_;
namestream_ << "L" << dfa_id_ << "_S" << state_id_;
return namestream_.str();
}
// Escape control characters twice. This is necessary when
// expressing character sets attached to DOT nodes as
// labels.
static string double_escape_char(const id_type ch_)
{
stringstream out_;
switch (ch_)
{
case '\0':
out_ << '\\';
out_ << '\\';
out_ << '0';
break;
case '\a':
out_ << '\\';
out_ << '\\';
out_ << 'a';
break;
case '\b':
out_ << '\\';
out_ << '\\';
out_ << 'b';
break;
case '\f':
out_ << '\\';
out_ << '\\';
out_ << 'f';
break;
case '\n':
out_ << '\\';
out_ << '\\';
out_ << 'n';
break;
case '\r':
out_ << '\\';
out_ << '\\';
out_ << 'r';
break;
case '\t':
out_ << '\\';
out_ << '\\';
out_ << 't';
break;
case '\v':
out_ << '\\';
out_ << '\\';
out_ << 'v';
break;
case '\\':
out_ << '\\';
out_ << '\\';
break;
case '"':
out_ << '\\';
out_ << '\\';
out_ << '"';
break;
case '\'':
out_ << '\\';
out_ << '\\';
out_ << '\'';
break;
default:
{
if (ch_ < 32 || ch_ > 126)
{
out_ << '\\';
out_ << 'x';
out_ << std::hex <<
static_cast<std::size_t>(ch_);
}
else
{
out_ << char_type(ch_);
}
break;
}
}
return out_.str();
}
// Internal function actually performing the work of dumping the
// state machine in DOT.
static void dump_ex(id_type dfa_id_,
const typename char_state_machine::dfa &dfa_,
rules &rules_,
ostream &stream_)
{
const std::size_t states_ = dfa_._states.size();
typename dfa_state::id_type_string_token_map::const_iterator iter_;
typename dfa_state::id_type_string_token_map::const_iterator end_;
stream_ << std::endl;
for (std::size_t i_ = 0; i_ < states_; ++i_)
{
const dfa_state &state_ = dfa_._states[i_];
const string name = node_name(dfa_id_, i_);
if (i_ == 0)
{
stream_ << " " << name << " [shape = doublecircle, xlabel=\""
<< rules_.state(dfa_id_) << "\"];" << std::endl;
}
else if (state_._end_state)
{
stream_ << " " << name <<
" [shape = doublecircle, xlabel=\"id =" <<
static_cast<std::size_t>(state_._id) << "\"];" <<
std::endl;
}
else {
stream_ << " " << name << " [shape = circle];" << std::endl;
}
}
stream_ << std::endl;
for (std::size_t i_ = 0; i_ < states_; ++i_)
{
const dfa_state &state_ = dfa_._states[i_];
iter_ = state_._transitions.begin();
end_ = state_._transitions.end();
const string src_name = node_name(dfa_id_, i_);
for (; iter_ != end_; ++iter_)
{
const string dst_name = node_name(dfa_id_, iter_->first);
stream_ << " " << src_name << " -> " << dst_name <<
" [label = \"";
string_token token_ = iter_->second;
open_bracket(stream_);
if (!iter_->second.any() && iter_->second.negatable())
{
token_.negate();
negated(stream_);
}
string chars_;
auto ranges_iter_ = token_._ranges.cbegin();
auto ranges_end_ = token_._ranges.cend();
for (; ranges_iter_ != ranges_end_; ++ranges_iter_)
{
if (ranges_iter_->first == '^' ||
ranges_iter_->first == ']')
{
stream_ << "\\\\";
}
chars_ = double_escape_char(ranges_iter_->first);
if (ranges_iter_->first != ranges_iter_->second)
{
if (ranges_iter_->first + 1 < ranges_iter_->second)
{
chars_ += '-';
}
if (ranges_iter_->second == '^' ||
ranges_iter_->second == ']')
{
stream_ << "\\\\";
}
chars_ += double_escape_char(ranges_iter_->second);
}
stream_ << chars_;
}
close_bracket(stream_);
stream_ << "\"];" << std::endl;
}
if (state_._end_state) {
const string dst_name = node_name(state_._next_dfa, 0);
stream_ << " " << src_name << " -> " << dst_name
<< " [style = \"dashed\"];" << std::endl;
}
}
}
static void header(ostream &stream_)
{
stream_ << "digraph DFAs {" << std::endl;
stream_ << " rankdir = LR;" << std::endl;
}
static void trailer(ostream &stream_)
{
stream_ << "}" << std::endl;
}
static void open_bracket(ostream &stream_)
{
stream_ << "[";
}
static void negated(ostream &stream_)
{
stream_ << "^";
}
static void close_bracket(ostream &stream_)
{
stream_ << "]";
}
};
using dot = basic_dot<basic_state_machine<char>, char>;
using wdot = basic_dot<basic_state_machine<wchar_t>, wchar_t>;
}
#endif
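
Following the doc comment above, a sketch of producing a .dot file (not taken from the commit; the same rules.hpp assumptions as in the debug sketch apply). The result can be rendered with GraphViz, e.g. dot -Tpng lexer.dot -o lexer.png.

// dot_demo.cpp - illustrative sketch only.
#include <fstream>
#include "lexertl/dot.hpp"
#include "lexertl/generator.hpp"

int main()
{
    lexertl::rules rules_;          // rules::push() assumed from rules.hpp
    lexertl::state_machine sm_;

    rules_.push("[0-9]+", 1);
    rules_.push("[a-z]+", 2);
    lexertl::generator::build(rules_, sm_);

    std::ofstream out_("lexer.dot");
    // One node per DFA state; end states become double circles
    // (see dump_ex() above) and dashed edges mark next-dfa jumps.
    lexertl::dot::dump(sm_, rules_, out_);
}
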


@@ -0,0 +1,25 @@
// enums.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_ENUMS_HPP
#define LEXERTL_ENUMS_HPP
namespace lexertl
{
enum regex_flags {icase = 1, dot_not_newline = 2, dot_not_cr_lf = 4,
skip_ws = 8, match_zero_len = 16};
// 0 = end state, 1 = id, 2 = user id, 3 = push dfa,
// 4 = next dfa, 5 = eol, 6 = dead state, 7 = first transition
enum {end_state_index, id_index, user_id_index, push_dfa_index,
next_dfa_index, eol_index, dead_state_index, transitions_index};
// Rule flags:
enum feature_flags {bol_bit = 1, eol_bit = 2, skip_bit = 4, again_bit = 8,
multi_state_bit = 16, recursive_bit = 32, advance_bit = 64};
// End state flags:
enum {end_state_bit = 1, pop_dfa_bit = 2};
}
#endif

File diff suppressed because it is too large.


@@ -0,0 +1,738 @@
// generator.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_GENERATOR_HPP
#define LEXERTL_GENERATOR_HPP
#include <algorithm>
#include "partition/charset.hpp"
#include "char_traits.hpp"
#include "partition/equivset.hpp"
#include <list>
#include <memory>
#include "parser/parser.hpp"
#include "rules.hpp"
#include "state_machine.hpp"
#include <type_traits>
namespace lexertl
{
template<typename rules, typename sm, typename char_traits = basic_char_traits
<typename sm::traits::input_char_type> >
class basic_generator
{
public:
using id_type = typename rules::id_type;
using rules_char_type = typename rules::rules_char_type;
using sm_traits = typename sm::traits;
using parser = detail::basic_parser<rules_char_type, sm_traits>;
using charset_map = typename parser::charset_map;
using node = typename parser::node;
using node_ptr_vector = typename parser::node_ptr_vector;
static void build(const rules &rules_, sm &sm_)
{
const std::size_t size_ = rules_.statemap().size();
// Strong exception guarantee
// http://www.boost.org/community/exception_safety.html
internals internals_;
sm temp_sm_;
node_ptr_vector node_ptr_vector_;
internals_._eoi = rules_.eoi();
internals_.add_states(size_);
for (id_type index_ = 0; index_ < size_; ++index_)
{
if (rules_.regexes()[index_].empty())
{
std::ostringstream ss_;
ss_ << "Lexer states with no rules are not allowed "
"(lexer state " << index_ << ".)";
throw runtime_error(ss_.str());
}
else
{
// Note that the following variables are per DFA.
// Map of regex charset tokens (strings) to index
charset_map charset_map_;
// Used to fix up $ and \n clashes.
id_type nl_id_ = sm_traits::npos();
// Regex syntax tree
observer_ptr<node> root_ = build_tree(rules_, index_,
node_ptr_vector_, charset_map_, nl_id_);
build_dfa(charset_map_, root_, internals_, temp_sm_, index_,
nl_id_);
if (internals_._dfa[index_].size() /
internals_._dfa_alphabet[index_] >= sm_traits::npos())
{
// Overflow
throw runtime_error("The data type you have chosen "
"cannot hold this many DFA rows.");
}
}
}
// If you get a compile error here, the id_type of the rules and
// the state machine do not match.
create(internals_, temp_sm_, rules_.features(), lookup());
sm_.swap(temp_sm_);
}
static observer_ptr<node> build_tree(const rules &rules_,
const std::size_t dfa_, node_ptr_vector &node_ptr_vector_,
charset_map &charset_map_, id_type &nl_id_)
{
parser parser_(rules_.locale(), node_ptr_vector_, charset_map_,
rules_.eoi());
const auto &regexes_ = rules_.regexes();
auto regex_iter_ = regexes_[dfa_].cbegin();
auto regex_iter_end_ = regexes_[dfa_].cend();
const auto &ids_ = rules_.ids();
const auto &user_ids_ = rules_.user_ids();
auto id_iter_ = ids_[dfa_].cbegin();
auto user_id_iter_ = user_ids_[dfa_].cbegin();
const auto &next_dfas_ = rules_.next_dfas();
const auto &pushes_ = rules_.pushes();
const auto &pops_ = rules_.pops();
auto next_dfa_iter_ = next_dfas_[dfa_].cbegin();
auto push_dfa_iter_ = pushes_[dfa_].cbegin();
auto pop_dfa_iter_ = pops_[dfa_].cbegin();
const bool seen_bol_ = (rules_.features()[dfa_] & bol_bit) != 0;
observer_ptr<node> root_ = nullptr;
root_ = parser_.parse(*regex_iter_, *id_iter_, *user_id_iter_,
*next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_,
rules_.flags(), nl_id_, seen_bol_);
++regex_iter_;
++id_iter_;
++user_id_iter_;
++next_dfa_iter_;
++push_dfa_iter_;
++pop_dfa_iter_;
// Build syntax trees
while (regex_iter_ != regex_iter_end_)
{
observer_ptr<node> rhs_ = parser_.parse(*regex_iter_, *id_iter_,
*user_id_iter_, *next_dfa_iter_, *push_dfa_iter_,
*pop_dfa_iter_, rules_.flags(), nl_id_,
(rules_.features()[dfa_] & bol_bit) != 0);
node_ptr_vector_.emplace_back
(std::make_unique<selection_node>(root_, rhs_));
root_ = node_ptr_vector_.back().get();
++regex_iter_;
++id_iter_;
++user_id_iter_;
++next_dfa_iter_;
++push_dfa_iter_;
++pop_dfa_iter_;
}
return root_;
}
protected:
using compressed = std::integral_constant<bool, sm_traits::compressed>;
using equivset = detail::basic_equivset<id_type>;
using equivset_list = std::list<std::unique_ptr<equivset>>;
using equivset_ptr = std::unique_ptr<equivset>;
using sm_char_type = typename sm_traits::char_type;
using charset = detail::basic_charset<sm_char_type, id_type>;
using charset_ptr = std::unique_ptr<charset>;
using charset_list = std::list<std::unique_ptr<charset>>;
using internals = detail::basic_internals<id_type>;
using id_type_set = typename std::set<id_type>;
using id_type_vector = typename internals::id_type_vector;
using index_set = typename charset::index_set;
using index_set_vector = std::vector<index_set>;
using is_dfa = std::integral_constant<bool, sm_traits::is_dfa>;
using lookup = std::integral_constant<bool, sm_traits::lookup>;
using node_set = std::set<observer_ptr<const node>>;
using node_set_vector = std::vector<std::unique_ptr<node_set>>;
using node_vector = typename node::node_vector;
using node_vector_vector = std::vector<std::unique_ptr<node_vector>>;
using selection_node = typename parser::selection_node;
using size_t_vector = typename std::vector<std::size_t>;
using string_token = typename parser::string_token;
static void build_dfa(const charset_map &charset_map_,
const observer_ptr<node> root_, internals &internals_, sm &sm_,
const id_type dfa_index_, id_type &nl_id_)
{
// partitioned charset list
charset_list charset_list_;
// vector mapping token indexes to partitioned token index sets
index_set_vector set_mapping_;
auto &dfa_ = internals_._dfa[dfa_index_];
std::size_t dfa_alphabet_ = 0;
const node_vector &followpos_ = root_->firstpos();
node_set_vector seen_sets_;
node_vector_vector seen_vectors_;
size_t_vector hash_vector_;
id_type zero_id_ = sm_traits::npos();
id_type_set eol_set_;
set_mapping_.resize(charset_map_.size());
partition_charsets(charset_map_, charset_list_, is_dfa());
build_set_mapping(charset_list_, internals_, dfa_index_,
set_mapping_);
if (nl_id_ != sm_traits::npos())
{
nl_id_ = *set_mapping_[nl_id_].begin();
zero_id_ = sm_traits::compressed ?
*set_mapping_[charset_map_.find(string_token(0, 0))->
second].begin() : sm_traits::npos();
}
dfa_alphabet_ = charset_list_.size() + transitions_index +
(nl_id_ == sm_traits::npos() ? 0 : 1);
if (dfa_alphabet_ > sm_traits::npos())
{
// Overflow
throw runtime_error("The data type you have chosen cannot hold "
"the dfa alphabet.");
}
internals_._dfa_alphabet[dfa_index_] =
static_cast<id_type>(dfa_alphabet_);
// 'jam' state
dfa_.resize(dfa_alphabet_, 0);
closure(followpos_, seen_sets_, seen_vectors_, hash_vector_,
static_cast<id_type>(dfa_alphabet_), dfa_);
// Loop over states
for (id_type index_ = 0; index_ < static_cast<id_type>
(seen_vectors_.size()); ++index_)
{
equivset_list equiv_list_;
// Intersect charsets
build_equiv_list(*seen_vectors_[index_].get(), set_mapping_,
equiv_list_, is_dfa());
for (auto &equivset_ : equiv_list_)
{
const id_type transition_ = closure
(equivset_->_followpos, seen_sets_, seen_vectors_,
hash_vector_, static_cast<id_type>(dfa_alphabet_), dfa_);
if (transition_ != sm_traits::npos())
{
observer_ptr<id_type> ptr_ = &dfa_.front() +
((index_ + 1) * dfa_alphabet_);
// Prune abstemious transitions from end states.
if (*ptr_ && !equivset_->_greedy) continue;
set_transitions(transition_, equivset_.get(), dfa_, ptr_,
index_, eol_set_);
}
}
}
fix_clashes(eol_set_, nl_id_, zero_id_, dfa_, dfa_alphabet_,
compressed());
append_dfa(charset_list_, internals_, sm_, dfa_index_, lookup());
}
static void set_transitions(const id_type transition_, equivset *equivset_,
typename internals::id_type_vector &dfa_, id_type *ptr_,
const id_type index_, id_type_set &eol_set_)
{
for (typename equivset::index_vector::const_iterator
equiv_iter_ = equivset_->_index_vector.begin(),
equiv_end_ = equivset_->_index_vector.end();
equiv_iter_ != equiv_end_; ++equiv_iter_)
{
const id_type i_ = *equiv_iter_;
if (i_ == parser::bol_token())
{
dfa_.front() = transition_;
}
else if (i_ == parser::eol_token())
{
ptr_[eol_index] = transition_;
eol_set_.insert(index_ + 1);
}
else
{
ptr_[i_ + transitions_index] = transition_;
}
}
}
// Uncompressed
static void fix_clashes(const id_type_set &eol_set_,
const id_type nl_id_, const id_type /*zero_id_*/,
typename internals::id_type_vector &dfa_,
const std::size_t dfa_alphabet_, const std::false_type &)
{
for (const auto &eol_ : eol_set_)
{
observer_ptr<id_type> ptr_ = &dfa_.front() + eol_ * dfa_alphabet_;
const id_type eol_state_ = ptr_[eol_index];
const id_type nl_state_ = ptr_[nl_id_ + transitions_index];
if (nl_state_)
{
ptr_[transitions_index + nl_id_] = 0;
ptr_ = &dfa_.front() + eol_state_ * dfa_alphabet_;
if (ptr_[transitions_index + nl_id_] == 0)
{
ptr_[transitions_index + nl_id_] = nl_state_;
}
}
}
}
// Compressed
static void fix_clashes(const id_type_set &eol_set_,
const id_type nl_id_, const id_type zero_id_,
typename internals::id_type_vector &dfa_,
const std::size_t dfa_alphabet_, const std::true_type &)
{
std::size_t i_ = 0;
for (const auto &eol_ : eol_set_)
{
observer_ptr<id_type> ptr_ = &dfa_.front() + eol_ * dfa_alphabet_;
const id_type eol_state_ = ptr_[eol_index];
id_type nl_state_ = 0;
for (; i_ < (sm_traits::char_24_bit ? 2 : 1); ++i_)
{
ptr_ = &dfa_.front() + ptr_[transitions_index + zero_id_] *
dfa_alphabet_;
}
nl_state_ = ptr_[transitions_index + nl_id_];
if (nl_state_)
{
ptr_ = &dfa_.front() + eol_state_ * dfa_alphabet_;
if (ptr_[transitions_index + zero_id_] != 0) continue;
ptr_[transitions_index + zero_id_] =
static_cast<id_type>(dfa_.size() / dfa_alphabet_);
dfa_.resize(dfa_.size() + dfa_alphabet_, 0);
for (i_ = 0; i_ < (sm_traits::char_24_bit ? 1 : 0); ++i_)
{
ptr_ = &dfa_.front() + dfa_.size() - dfa_alphabet_;
ptr_[transitions_index + zero_id_] =
static_cast<id_type>(dfa_.size() / dfa_alphabet_);
dfa_.resize(dfa_.size() + dfa_alphabet_, 0);
}
ptr_ = &dfa_.front() + dfa_.size() - dfa_alphabet_;
ptr_[transitions_index + nl_id_] = nl_state_;
}
}
}
// char_state_machine version
static void append_dfa(const charset_list &charset_list_,
const internals &internals_, sm &sm_, const id_type dfa_index_,
const std::false_type &)
{
std::size_t size_ = charset_list_.size();
typename sm::string_token_vector token_vector_;
token_vector_.reserve(size_);
for (const auto &charset_ : charset_list_)
{
token_vector_.push_back(charset_->_token);
}
sm_.append(token_vector_, internals_, dfa_index_);
}
// state_machine version
static void append_dfa(const charset_list &, const internals &, sm &,
const id_type, const std::true_type &)
{
// Nothing to do - will use create() instead
}
// char_state_machine version
static void create(internals &, sm &, const id_type_vector &,
const std::false_type &)
{
// Nothing to do - will use append_dfa() instead
}
// state_machine version
static void create(internals &internals_, sm &sm_,
const id_type_vector &features_, const std::true_type &)
{
for (std::size_t i_ = 0, size_ = internals_._dfa.size();
i_ < size_; ++i_)
{
internals_._features |= features_[i_];
}
if (internals_._dfa.size() > 1)
{
internals_._features |= multi_state_bit;
}
sm_.data().swap(internals_);
}
// NFA version
static void partition_charsets(const charset_map &map_,
charset_list &lhs_, const std::false_type &)
{
fill_rhs_list(map_, lhs_);
}
// DFA version
static void partition_charsets(const charset_map &map_,
charset_list &lhs_, const std::true_type &)
{
charset_list rhs_;
fill_rhs_list(map_, rhs_);
if (!rhs_.empty())
{
typename charset_list::iterator iter_;
typename charset_list::iterator end_;
charset_ptr overlap_ = std::make_unique<charset>();
lhs_.emplace_back(std::move(rhs_.front()));
rhs_.pop_front();
while (!rhs_.empty())
{
charset_ptr r_(rhs_.front().release());
rhs_.pop_front();
iter_ = lhs_.begin();
end_ = lhs_.end();
while (!r_->empty() && iter_ != end_)
{
auto l_iter_ = iter_;
(*l_iter_)->intersect(*r_.get(), *overlap_.get());
if (overlap_->empty())
{
++iter_;
}
else if ((*l_iter_)->empty())
{
l_iter_->reset(overlap_.release());
overlap_ = std::make_unique<charset>();
++iter_;
}
else if (r_->empty())
{
r_.reset(overlap_.release());
overlap_ = std::make_unique<charset>();
break;
}
else
{
iter_ = lhs_.insert(++iter_, charset_ptr());
iter_->reset(overlap_.release());
overlap_ = std::make_unique<charset>();
++iter_;
end_ = lhs_.end();
}
}
if (!r_->empty())
{
lhs_.emplace_back(std::move(r_));
}
}
}
}
static void fill_rhs_list(const charset_map &map_, charset_list &list_)
{
for (const auto &pair_ : map_)
{
list_.emplace_back(std::make_unique<charset>
(pair_.first, pair_.second));
}
}
static void build_set_mapping(const charset_list &charset_list_,
internals &internals_, const id_type dfa_index_,
index_set_vector &set_mapping_)
{
auto iter_ = charset_list_.cbegin();
auto end_ = charset_list_.cend();
for (id_type index_ = 0; iter_ != end_; ++iter_, ++index_)
{
observer_ptr<const charset> cs_ = iter_->get();
fill_lookup(cs_->_token, &internals_._lookup[dfa_index_],
index_, lookup());
for (const id_type i_ : cs_->_index_set)
{
set_mapping_[i_].insert(index_);
}
}
}
// char_state_machine version
static void fill_lookup(const string_token &, observer_ptr<id_type_vector> ,
const id_type, const std::false_type &)
{
// Do nothing (lookup not used)
}
// state_machine version
static void fill_lookup(const string_token &charset_,
observer_ptr<id_type_vector> lookup_, const id_type index_,
const std::true_type &)
{
observer_ptr<id_type> ptr_ = &lookup_->front();
for (const auto &range_ : charset_._ranges)
{
for (typename char_traits::index_type char_ = range_.first;
char_ < range_.second; ++char_)
{
// Note char_ must be unsigned
ptr_[char_] = index_ + transitions_index;
}
// Note range_.second must be unsigned
ptr_[range_.second] = index_ + transitions_index;
}
}
static id_type closure(const node_vector &followpos_,
node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
size_t_vector &hash_vector_, const id_type size_, id_type_vector &dfa_)
{
bool end_state_ = false;
id_type id_ = 0;
id_type user_id_ = sm_traits::npos();
id_type next_dfa_ = 0;
id_type push_dfa_ = sm_traits::npos();
bool pop_dfa_ = false;
std::size_t hash_ = 0;
if (followpos_.empty()) return sm_traits::npos();
id_type index_ = 0;
std::unique_ptr<node_set> set_ptr_ = std::make_unique<node_set>();
std::unique_ptr<node_vector> vector_ptr_ =
std::make_unique<node_vector>();
for (observer_ptr<node> node_ : followpos_)
{
closure_ex(node_, end_state_, id_, user_id_, next_dfa_,
push_dfa_, pop_dfa_, *set_ptr_.get(),
*vector_ptr_.get(), hash_);
}
bool found_ = false;
auto hash_iter_ = hash_vector_.cbegin();
auto hash_end_ = hash_vector_.cend();
auto set_iter_ = seen_sets_.cbegin();
for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
{
found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_;
++index_;
if (found_) break;
}
if (!found_)
{
seen_sets_.emplace_back(std::move(set_ptr_));
seen_vectors_.emplace_back(std::move(vector_ptr_));
hash_vector_.push_back(hash_);
// State 0 is the jam state...
index_ = static_cast<id_type>(seen_sets_.size());
const std::size_t old_size_ = dfa_.size();
dfa_.resize(old_size_ + size_, 0);
if (end_state_)
{
dfa_[old_size_] |= end_state_bit;
if (pop_dfa_)
{
dfa_[old_size_] |= pop_dfa_bit;
}
dfa_[old_size_ + id_index] = id_;
dfa_[old_size_ + user_id_index] = user_id_;
dfa_[old_size_ + push_dfa_index] = push_dfa_;
dfa_[old_size_ + next_dfa_index] = next_dfa_;
}
}
return index_;
}
static void closure_ex(observer_ptr<node> node_, bool &end_state_,
id_type &id_, id_type &user_id_, id_type &next_dfa_,
id_type &push_dfa_, bool &pop_dfa_, node_set &set_ptr_,
node_vector &vector_ptr_, std::size_t &hash_)
{
const bool temp_end_state_ = node_->end_state();
if (temp_end_state_)
{
if (!end_state_)
{
end_state_ = true;
id_ = node_->id();
user_id_ = node_->user_id();
next_dfa_ = node_->next_dfa();
push_dfa_ = node_->push_dfa();
pop_dfa_ = node_->pop_dfa();
}
}
if (set_ptr_.insert(node_).second)
{
vector_ptr_.push_back(node_);
hash_ += reinterpret_cast<std::size_t>(node_);
}
}
// NFA version
static void build_equiv_list(const node_vector &vector_,
const index_set_vector &set_mapping_, equivset_list &lhs_,
const std::false_type &)
{
fill_rhs_list(vector_, set_mapping_, lhs_);
}
// DFA version
static void build_equiv_list(const node_vector &vector_,
const index_set_vector &set_mapping_, equivset_list &lhs_,
const std::true_type &)
{
equivset_list rhs_;
fill_rhs_list(vector_, set_mapping_, rhs_);
if (!rhs_.empty())
{
typename equivset_list::iterator iter_;
typename equivset_list::iterator end_;
equivset_ptr overlap_ = std::make_unique<equivset>();
lhs_.emplace_back(std::move(rhs_.front()));
rhs_.pop_front();
while (!rhs_.empty())
{
equivset_ptr r_(rhs_.front().release());
rhs_.pop_front();
iter_ = lhs_.begin();
end_ = lhs_.end();
while (!r_->empty() && iter_ != end_)
{
auto l_iter_ = iter_;
(*l_iter_)->intersect(*r_.get(), *overlap_.get());
if (overlap_->empty())
{
++iter_;
}
else if ((*l_iter_)->empty())
{
l_iter_->reset(overlap_.release());
overlap_ = std::make_unique<equivset>();
++iter_;
}
else if (r_->empty())
{
r_.reset(overlap_.release());
overlap_ = std::make_unique<equivset>();
break;
}
else
{
iter_ = lhs_.insert(++iter_, equivset_ptr());
iter_->reset(overlap_.release());
overlap_ = std::make_unique<equivset>();
++iter_;
end_ = lhs_.end();
}
}
if (!r_->empty())
{
lhs_.emplace_back(std::move(r_));
}
}
}
}
static void fill_rhs_list(const node_vector &vector_,
const index_set_vector &set_mapping_, equivset_list &list_)
{
for (observer_ptr<const node> node_ : vector_)
{
if (!node_->end_state())
{
const id_type token_ = node_->token();
if (token_ != node::null_token())
{
if (token_ == parser::bol_token() ||
token_ == parser::eol_token())
{
std::set<id_type> index_set_;
index_set_.insert(token_);
list_.emplace_back
(std::make_unique<equivset>(index_set_,
token_, node_->greedy(), node_->followpos()));
}
else
{
list_.emplace_back(std::make_unique<equivset>
(set_mapping_[token_], token_, node_->greedy(),
node_->followpos()));
}
}
}
}
}
};
using generator = basic_generator<rules, state_machine>;
using wgenerator = basic_generator<wrules, wstate_machine>;
using u32generator = basic_generator<u32rules, u32state_machine>;
using char_generator = basic_generator<rules, char_state_machine>;
using wchar_generator = basic_generator<wrules, wchar_state_machine>;
using u32char_generator = basic_generator<u32rules, u32char_state_machine>;
}
#endif
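
A sketch of the typical build-and-tokenise flow (not taken from the commit; rules::push() and rules::skip() are assumed from rules.hpp, one of the files not reproduced here): generator::build() parses every regex into a syntax tree, partitions the charsets, constructs the DFA and swaps it into the target state machine, after which lookup() can be driven in a loop.

// generator_demo.cpp - illustrative sketch only.
#include <iostream>
#include <string>
#include "lexertl/generator.hpp"
#include "lexertl/lookup.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push("[0-9]+", 1);
    rules_.push("[a-z]+", 2);
    rules_.push("\\s+", rules_.skip());   // assumed skip() helper
    lexertl::generator::build(rules_, sm_);

    const std::string input_("answer 42");
    lexertl::smatch results_(input_.begin(), input_.end());

    lexertl::lookup(sm_, results_);
    while (results_.id != 0)   // 0 is the default end-of-input id
    {
        std::cout << results_.id << ": '" << results_.str() << "'\n";
        lexertl::lookup(sm_, results_);
    }
}
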


@@ -0,0 +1,75 @@
// internals.hpp
// Copyright (c) 2009-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_INTERNALS_HPP
#define LEXERTL_INTERNALS_HPP
#include "enums.hpp"
#include <memory>
#include <vector>
namespace lexertl
{
namespace detail
{
template<typename id_type>
struct basic_internals
{
using id_type_vector = std::vector<id_type>;
using id_type_vector_vector = std::vector<id_type_vector>;
id_type _eoi;
id_type_vector_vector _lookup;
id_type_vector _dfa_alphabet;
id_type _features;
id_type_vector_vector _dfa;
basic_internals() :
_eoi(0),
_lookup(),
_dfa_alphabet(),
_features(0),
_dfa()
{
}
void clear()
{
_eoi = 0;
_lookup.clear();
_dfa_alphabet.clear();
_features = 0;
_dfa.clear();
}
bool empty() const
{
return _dfa.empty();
}
void add_states(const std::size_t num_)
{
for (std::size_t index_ = 0; index_ < num_; ++index_)
{
// lookup *always* has a size of 256 now.
_lookup.push_back(id_type_vector(256, dead_state_index));
_dfa_alphabet.push_back(0);
_dfa.push_back(id_type_vector());
}
}
void swap(basic_internals &internals_)
{
std::swap(_eoi, internals_._eoi);
_lookup.swap(internals_._lookup);
_dfa_alphabet.swap(internals_._dfa_alphabet);
std::swap(_features, internals_._features);
_dfa.swap(internals_._dfa);
}
};
}
}
#endif


@@ -0,0 +1,135 @@
// iterator.hpp
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_ITERATOR_HPP
#define LEXERTL_ITERATOR_HPP
#include <iterator>
#include "lookup.hpp"
#include "state_machine.hpp"
namespace lexertl
{
template<typename iter, typename sm_type, typename results>
class iterator
{
public:
using value_type = results;
using difference_type = ptrdiff_t;
using pointer = const value_type *;
using reference = const value_type &;
using iterator_category = std::forward_iterator_tag;
iterator() :
_results(iter(), iter()),
_sm(nullptr)
{
}
iterator(const iter &start_, const iter &end_, const sm_type &sm) :
_results(start_, end_),
_sm(&sm)
{
lookup();
}
// Only need this because of warnings with gcc with -Weffc++
iterator(const iterator &rhs_)
{
_results = rhs_._results;
_sm = rhs_._sm;
}
// Only need this because of warnings with gcc with -Weffc++
iterator &operator =(const iterator &rhs_)
{
if (&rhs_ != this)
{
_results = rhs_._results;
_sm = rhs_._sm;
}
return *this;
}
iterator &operator ++()
{
lookup();
return *this;
}
iterator operator ++(int)
{
iterator iter_ = *this;
lookup();
return iter_;
}
const value_type &operator *() const
{
return _results;
}
const value_type *operator ->() const
{
return &_results;
}
bool operator ==(const iterator &rhs_) const
{
return _sm == rhs_._sm && (_sm == nullptr ? true :
_results == rhs_._results);
}
bool operator !=(const iterator &rhs_) const
{
return !(*this == rhs_);
}
const sm_type &sm() const
{
return *_sm;
}
private:
value_type _results;
const sm_type *_sm;
void lookup()
{
lexertl::lookup(*_sm, _results);
if (_results.first == _results.eoi)
{
_sm = nullptr;
}
}
};
using siterator =
iterator<std::string::const_iterator, state_machine, smatch>;
using citerator = iterator<const char *, state_machine, cmatch>;
using wsiterator =
iterator<std::wstring::const_iterator, wstate_machine, wsmatch>;
using wciterator = iterator<const wchar_t *, wstate_machine, wcmatch>;
using u32siterator = iterator<std::u32string::const_iterator,
u32state_machine, u32smatch>;
using u32citerator = iterator<const char32_t *, u32state_machine, u32cmatch>;
using sriterator =
iterator<std::string::const_iterator, state_machine, srmatch>;
using criterator = iterator<const char *, state_machine, crmatch>;
using wsriterator =
iterator<std::wstring::const_iterator, wstate_machine, wsrmatch>;
using wcriterator =
iterator<const wchar_t *, wstate_machine, wcrmatch>;
using u32sriterator = iterator<std::u32string::const_iterator,
u32state_machine, u32srmatch>;
using u32criterator = iterator<const char32_t *, u32state_machine, u32crmatch>;
}
#endif
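
An equivalent iterator-based sketch (not taken from the commit; same rules.hpp assumptions): each increment runs one lookup(), and an iterator compares equal to a default-constructed one once end of input is reached.

// iterator_demo.cpp - illustrative sketch only.
#include <iostream>
#include <string>
#include "lexertl/generator.hpp"
#include "lexertl/iterator.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push("[0-9]+", 1);
    rules_.push("[a-z]+", 2);
    rules_.push("\\s+", rules_.skip());   // assumed skip() helper
    lexertl::generator::build(rules_, sm_);

    const std::string input_("a 1 bc 22");
    lexertl::siterator iter_(input_.begin(), input_.end(), sm_);
    lexertl::siterator end_;

    for (; iter_ != end_; ++iter_)
    {
        std::cout << "id " << iter_->id << ": '" << iter_->str() << "'\n";
    }
}
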


@@ -0,0 +1,24 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.


@@ -0,0 +1,491 @@
// lookup.hpp
// Copyright (c) 2009-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_LOOKUP_HPP
#define LEXERTL_LOOKUP_HPP
#include <assert.h>
#include "match_results.hpp"
#include <type_traits>
namespace lexertl
{
namespace detail
{
template<bool>
struct bol_state
{
bol_state(const bool)
{
}
};
template<>
struct bol_state<true>
{
bool _bol;
bool _end_bol;
bol_state(const bool bol_) :
_bol(bol_),
_end_bol(bol_)
{
}
};
template<typename id_type, bool>
struct eol_state
{
};
template<typename id_type>
struct eol_state<id_type, true>
{
id_type _EOL_state;
eol_state() :
_EOL_state(0)
{
}
};
template<typename id_type, bool>
struct multi_state_state
{
multi_state_state(const id_type)
{
}
};
template<typename id_type>
struct multi_state_state<id_type, true>
{
id_type _start_state;
multi_state_state(const id_type state_) :
_start_state(state_)
{
}
};
template<typename id_type, bool>
struct recursive_state
{
recursive_state(const id_type *)
{
}
};
template<typename id_type>
struct recursive_state<id_type, true>
{
bool _pop;
id_type _push_dfa;
recursive_state(const id_type *ptr_) :
_pop((*ptr_ & pop_dfa_bit) != 0),
_push_dfa(*(ptr_ + push_dfa_index))
{
}
};
template<typename internals, typename id_type, typename index_type,
std::size_t flags>
struct lookup_state
{
const id_type *_lookup;
id_type _dfa_alphabet;
const id_type *_dfa;
const id_type *_ptr;
bool _end_state;
id_type _id;
id_type _uid;
bol_state<(flags & bol_bit) != 0> _bol_state;
eol_state<id_type, (flags & eol_bit) != 0> _eol_state;
multi_state_state<id_type, (flags & multi_state_bit) != 0>
_multi_state_state;
recursive_state<id_type, (flags & recursive_bit) != 0> _recursive_state;
lookup_state(const internals &internals_, const bool bol_,
const id_type state_) :
_lookup(&internals_._lookup[state_][0]),
_dfa_alphabet(internals_._dfa_alphabet[state_]),
_dfa(&internals_._dfa[state_][0]),
_ptr(_dfa + _dfa_alphabet),
_end_state(*_ptr != 0),
_id(*(_ptr + id_index)),
_uid(*(_ptr + user_id_index)),
_bol_state(bol_),
_eol_state(),
_multi_state_state(state_),
_recursive_state(_ptr)
{
}
void reset_recursive(const std::false_type &)
{
// Do nothing
}
void reset_recursive(const std::true_type &)
{
_recursive_state._pop = (*_ptr & pop_dfa_bit) != 0;
_recursive_state._push_dfa = *(_ptr + push_dfa_index);
}
void bol_start_state(const std::false_type &)
{
// Do nothing
}
void bol_start_state(const std::true_type &)
{
if (_bol_state._bol)
{
const id_type state_ = *_dfa;
if (state_)
{
_ptr = &_dfa[state_ * _dfa_alphabet];
}
}
}
template<typename char_type>
bool is_eol(const char_type, const std::false_type &)
{
return false;
}
template<typename char_type>
bool is_eol(const char_type curr_, const std::true_type &)
{
bool ret_ = false;
_eol_state._EOL_state = _ptr[eol_index];
ret_ = _eol_state._EOL_state && (curr_ == '\r' || curr_ == '\n');
if (ret_)
{
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
}
return ret_;
}
template<typename char_type>
id_type next_char(const char_type prev_char_, const std::false_type &)
{
const id_type state_= _ptr[_lookup
[static_cast<index_type>(prev_char_)]];
if (state_ != 0)
{
_ptr = &_dfa[state_ * _dfa_alphabet];
}
return state_;
}
template<typename char_type>
id_type next_char(const char_type prev_char_, const std::true_type &)
{
const std::size_t bytes_ = sizeof(char_type) < 3 ?
sizeof(char_type) : 3;
const std::size_t shift_[] = {0, 8, 16};
id_type state_= 0;
for (std::size_t i_ = 0; i_ < bytes_; ++i_)
{
state_ = _ptr[_lookup[static_cast<unsigned char>((prev_char_ >>
shift_[bytes_ - 1 - i_]) & 0xff)]];
if (state_ == 0)
{
break;
}
_ptr = &_dfa[state_ * _dfa_alphabet];
}
return state_;
}
template<typename char_type>
void bol(const char_type, const std::false_type &)
{
// Do nothing
}
template<typename char_type>
void bol(const char_type prev_char_, const std::true_type &)
{
_bol_state._bol = prev_char_ == '\n';
}
void eol(const id_type, const std::false_type &)
{
// Do nothing
}
void eol(const id_type err_val_, const std::true_type &)
{
_eol_state._EOL_state = err_val_;
}
void reset_start_state(const std::false_type &)
{
// Do nothing
}
void reset_start_state(const std::true_type &)
{
_multi_state_state._start_state = *(_ptr + next_dfa_index);
}
void reset_end_bol(const std::false_type &)
{
// Do nothing
}
void reset_end_bol(const std::true_type &)
{
_bol_state._end_bol = _bol_state._bol;
}
template<typename iter_type>
void end_state(iter_type &end_token_, iter_type &curr_)
{
if (*_ptr)
{
_end_state = true;
reset_end_bol
(std::integral_constant<bool, (flags & bol_bit) != 0>());
_id = *(_ptr + id_index);
_uid = *(_ptr + user_id_index);
reset_recursive
(std::integral_constant<bool, (flags & recursive_bit) != 0>());
reset_start_state(std::integral_constant<bool,
(flags & multi_state_bit) != 0>());
end_token_ = curr_;
}
}
template<typename iter_type, typename char_type>
void check_eol(iter_type &, iter_type &, const id_type,
const char_type, const std::false_type &)
{
// Do nothing
}
template<typename iter_type, typename char_type>
void check_eol(iter_type &end_token_, iter_type &curr_,
const id_type npos, const char_type eoi_, const std::true_type &)
{
if (_eol_state._EOL_state != npos && curr_ == eoi_)
{
_eol_state._EOL_state = _ptr[eol_index];
if (_eol_state._EOL_state)
{
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
end_state(end_token_, curr_);
}
}
}
template<typename results>
void pop(results &, const std::false_type &)
{
// Nothing to do
}
template<typename results>
void pop(results &results_, const std::true_type &)
{
if (_recursive_state._pop)
{
_multi_state_state._start_state = results_.stack.top().first;
results_.stack.pop();
}
else if (_recursive_state._push_dfa != results::npos())
{
results_.stack.push(typename results::id_type_pair
(_recursive_state._push_dfa, _id));
}
}
template<typename results>
bool is_id_eoi(const id_type eoi_, const results &, const std::false_type &)
{
return _id == eoi_;
}
template<typename results>
bool is_id_eoi(const id_type eoi_, const results &results_,
const std::true_type &)
{
return _id == eoi_ || (_recursive_state._pop &&
!results_.stack.empty() && results_.stack.top().second == eoi_);
}
void start_state(id_type &, const std::false_type &)
{
// Do nothing
}
void start_state(id_type &start_state_, const std::true_type &)
{
start_state_ = _multi_state_state._start_state;
}
void bol(bool &, const std::false_type &)
{
// Do nothing
}
void bol(bool &end_bol_, const std::true_type &)
{
end_bol_ = _bol_state._end_bol;
}
};
template<typename results>
void inc_end(results &, const std::false_type &)
{
// Do nothing
}
template<typename results>
void inc_end(results &results_, const std::true_type &)
{
++results_.second;
}
template<typename sm_type, std::size_t flags, typename results,
bool compressed, bool recursive>
void next(const sm_type &sm_, results &results_,
const std::integral_constant<bool, compressed> &compressed_,
const std::integral_constant<bool, recursive> &recursive_,
const std::forward_iterator_tag &)
{
using id_type = typename sm_type::id_type;
const auto &internals_ = sm_.data();
auto end_token_ = results_.second;
skip:
auto curr_ = results_.second;
results_.first = curr_;
again:
if (curr_ == results_.eoi)
{
results_.id = internals_._eoi;
results_.user_id = results::npos();
return;
}
lookup_state<typename sm_type::internals, id_type,
typename results::index_type, flags> lu_state_
(internals_, results_.bol, results_.state);
lu_state_.bol_start_state
(std::integral_constant<bool, (flags & bol_bit) != 0>());
while (curr_ != results_.eoi)
{
if (!lu_state_.is_eol(*curr_,
std::integral_constant<bool, (flags & eol_bit) != 0>()))
{
const auto prev_char_ = *curr_;
const id_type state_ = lu_state_.next_char(prev_char_,
compressed_);
++curr_;
lu_state_.bol(prev_char_,
std::integral_constant<bool, (flags & bol_bit) != 0>());
if (state_ == 0)
{
lu_state_.is_eol(results::npos(),
std::integral_constant<bool, (flags & eol_bit) != 0>());
break;
}
}
lu_state_.end_state(end_token_, curr_);
}
lu_state_.check_eol(end_token_, curr_, results::npos(), results_.eoi,
std::integral_constant<bool, (flags & eol_bit) != 0>());
if (lu_state_._end_state)
{
// Return longest match
lu_state_.pop(results_, recursive_);
lu_state_.start_state(results_.state,
std::integral_constant<bool, (flags & multi_state_bit) != 0>());
lu_state_.bol(results_.bol,
std::integral_constant<bool, (flags & bol_bit) != 0>());
results_.second = end_token_;
if (lu_state_._id == sm_.skip()) goto skip;
if (lu_state_.is_id_eoi(internals_._eoi, results_, recursive_))
{
curr_ = end_token_;
goto again;
}
}
else
{
results_.second = end_token_;
results_.bol = *results_.second == '\n';
results_.first = results_.second;
// No match causes char to be skipped
inc_end(results_,
std::integral_constant<bool, (flags & advance_bit) != 0>());
lu_state_._id = results::npos();
lu_state_._uid = results::npos();
}
results_.id = lu_state_._id;
results_.user_id = lu_state_._uid;
}
}
template<typename iter_type, typename sm_type, std::size_t flags>
void lookup(const sm_type &sm_, match_results<iter_type,
typename sm_type::id_type, flags> &results_)
{
using value_type = typename std::iterator_traits<iter_type>::value_type;
using cat = typename std::iterator_traits<iter_type>::iterator_category;
// If this asserts, you have either not defined all the correct
// flags, or you should be using recursive_match_results instead
// of match_results.
assert((sm_.data()._features & flags) == sm_.data()._features);
detail::next<sm_type, flags>(sm_, results_,
std::integral_constant<bool, (sizeof(value_type) > 1)>(),
std::false_type(), cat());
}
template<typename iter_type, typename sm_type, std::size_t flags>
void lookup(const sm_type &sm_, recursive_match_results<iter_type,
typename sm_type::id_type, flags> &results_)
{
using value_type = typename std::iterator_traits<iter_type>::value_type;
using cat = typename std::iterator_traits<iter_type>::iterator_category;
// If this asserts, you have not defined all the correct flags
assert((sm_.data()._features & flags) == sm_.data()._features);
detail::next<sm_type, flags | recursive_bit>(sm_, results_,
std::integral_constant<bool, (sizeof(value_type) > 1)>(),
std::true_type(), cat());
}
}
#endif
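
A sketch of driving lookup() directly with a cmatch over a C string (not taken from the commit; same rules.hpp assumptions). When no rule matches, id becomes match_results::npos() and, because advance_bit is set in the default flags, the offending character is skipped rather than looping forever.

// lookup_demo.cpp - illustrative sketch only.
#include <iostream>
#include "lexertl/generator.hpp"
#include "lexertl/lookup.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push("[a-z]+", 1);
    lexertl::generator::build(rules_, sm_);

    const char input_[] = "abc!def";
    lexertl::cmatch results_(input_, input_ + sizeof(input_) - 1);

    for (lexertl::lookup(sm_, results_); results_.id != 0;
        lexertl::lookup(sm_, results_))
    {
        if (results_.id == lexertl::cmatch::npos())
            std::cout << "skipped unrecognised char\n";
        else
            std::cout << "token: '" << results_.str() << "'\n";
    }
}
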


@@ -0,0 +1,171 @@
// match_results.hpp
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_MATCH_RESULTS_HPP
#define LEXERTL_MATCH_RESULTS_HPP
#include "char_traits.hpp"
#include "enums.hpp"
#include <iterator>
#include <stack>
#include <string>
namespace lexertl
{
template<typename iter, typename id_type = uint16_t,
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
multi_state_bit | advance_bit>
struct match_results
{
using iter_type = iter;
using char_type = typename std::iterator_traits<iter_type>::value_type;
using index_type = typename basic_char_traits<char_type>::index_type;
using string = std::basic_string<char_type>;
id_type id;
id_type user_id;
iter_type first;
iter_type second;
iter_type eoi;
bool bol;
id_type state;
match_results() :
id(0),
user_id(npos()),
first(iter_type()),
second(iter_type()),
eoi(iter_type()),
bol(true),
state(0)
{
}
match_results(const iter_type &start_, const iter_type &end_) :
id(0),
user_id(npos()),
first(start_),
second(start_),
eoi(end_),
bol(true),
state(0)
{
}
virtual ~match_results()
{
}
string str() const
{
return string(first, second);
}
string substr(const std::size_t soffset_, const std::size_t eoffset_) const
{
return string(first + soffset_, second - eoffset_);
}
virtual void clear()
{
id = 0;
user_id = npos();
first = eoi;
second = eoi;
bol = true;
state = 0;
}
virtual void reset(const iter_type &start_, const iter_type &end_)
{
id = 0;
user_id = npos();
first = start_;
second = start_;
eoi = end_;
bol = true;
state = 0;
}
static id_type npos()
{
return static_cast<id_type>(~0);
}
static id_type skip()
{
return static_cast<id_type>(~1);
}
bool operator ==(const match_results &rhs_) const
{
return id == rhs_.id &&
user_id == rhs_.user_id &&
first == rhs_.first &&
second == rhs_.second &&
eoi == rhs_.eoi &&
bol == rhs_.bol &&
state == rhs_.state;
}
};
template<typename iter, typename id_type = uint16_t,
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
multi_state_bit | recursive_bit | advance_bit>
struct recursive_match_results : public match_results<iter, id_type, flags>
{
using id_type_pair = std::pair<id_type, id_type>;
std::stack<id_type_pair> stack;
recursive_match_results() :
match_results<iter, id_type, flags>(),
stack()
{
}
recursive_match_results(const iter &start_, const iter &end_) :
match_results<iter, id_type, flags>(start_, end_),
stack()
{
}
virtual ~recursive_match_results() override
{
}
virtual void clear() override
{
match_results<iter, id_type, flags>::clear();
while (!stack.empty()) stack.pop();
}
virtual void reset(const iter &start_, const iter &end_) override
{
match_results<iter, id_type, flags>::reset(start_, end_);
while (!stack.empty()) stack.pop();
}
};
using smatch = match_results<std::string::const_iterator>;
using cmatch = match_results<const char *>;
using wsmatch = match_results<std::wstring::const_iterator>;
using wcmatch = match_results<const wchar_t *>;
using u32smatch = match_results<std::u32string::const_iterator>;
using u32cmatch = match_results<const char32_t *>;
using srmatch =
recursive_match_results<std::string::const_iterator>;
using crmatch = recursive_match_results<const char *>;
using wsrmatch =
recursive_match_results<std::wstring::const_iterator>;
using wcrmatch = recursive_match_results<const wchar_t *>;
using u32srmatch =
recursive_match_results<std::u32string::const_iterator>;
using u32crmatch = recursive_match_results<const char32_t *>;
}
#endif


@@ -0,0 +1,138 @@
// memory_file.hpp
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
// Inspired by http://en.wikibooks.org/wiki/Optimizing_C%2B%2B/
// General_optimization_techniques/Input/Output#Memory-mapped_file
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_MEMORY_FILE_HPP
#define LEXERTL_MEMORY_FILE_HPP
#include <cstddef>
#ifdef _WIN32
#include <windows.h>
#else
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#endif
// Only files small enough to fit into memory are supported.
namespace lexertl
{
template<typename char_type>
class basic_memory_file
{
public:
basic_memory_file()
{
}
basic_memory_file(const char *pathname_)
{
open(pathname_);
}
~basic_memory_file()
{
close();
}
void open(const char *pathname_)
{
if (_data) close();
#ifdef _WIN32
_fh = ::CreateFileA(pathname_, GENERIC_READ, FILE_SHARE_READ, 0,
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
_fmh = 0;
if (_fh != INVALID_HANDLE_VALUE)
{
_fmh = ::CreateFileMapping(_fh, 0, PAGE_READONLY, 0, 0, 0);
if (_fmh != 0)
{
_data = static_cast<char_type *>(::MapViewOfFile
(_fmh, FILE_MAP_READ, 0, 0, 0));
if (_data) _size = ::GetFileSize(_fh, 0) / sizeof(char_type);
}
}
#else
_fh = ::open(pathname_, O_RDONLY);
if (_fh > -1)
{
struct stat sbuf_;
if (::fstat(_fh, &sbuf_) > -1)
{
_data = static_cast<const char_type *>
(::mmap(0, sbuf_.st_size, PROT_READ, MAP_SHARED, _fh, 0));
if (_data == MAP_FAILED)
{
_data = nullptr;
}
else
{
_size = sbuf_.st_size / sizeof(char_type);
}
}
}
#endif
}
const char_type *data() const
{
return _data;
}
std::size_t size() const
{
return _size;
}
void close()
{
#ifdef _WIN32
::UnmapViewOfFile(_data);
::CloseHandle(_fmh);
::CloseHandle(_fh);
#else
::munmap(const_cast<char_type *>(_data), _size);
::close(_fh);
#endif
_data = nullptr;
_size = 0;
_fh = 0;
#ifdef _WIN32
_fmh = 0;
#endif
}
private:
const char_type *_data = nullptr;
std::size_t _size = 0;
#ifdef _WIN32
HANDLE _fh = 0;
HANDLE _fmh = 0;
#else
int _fh = 0;
#endif
// No copy construction.
basic_memory_file(const basic_memory_file &) = delete;
// No assignment.
basic_memory_file &operator =(const basic_memory_file &) = delete;
};
using memory_file = basic_memory_file<char>;
using wmemory_file = basic_memory_file<wchar_t>;
}
#endif
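
A sketch of the intended use (not taken from the commit): map a whole file read-only and tokenise it in place via citerator; as the note above says, the file must fit in memory. The path "input.txt" is purely hypothetical.

// memory_file_demo.cpp - illustrative sketch only.
#include <iostream>
#include "lexertl/generator.hpp"
#include "lexertl/iterator.hpp"
#include "lexertl/memory_file.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push("[a-zA-Z_]+", 1);
    rules_.push("\\s+", rules_.skip());   // assumed skip() helper
    lexertl::generator::build(rules_, sm_);

    lexertl::memory_file file_("input.txt");   // hypothetical path

    if (file_.data())   // nullptr when the mapping failed
    {
        lexertl::citerator iter_(file_.data(),
            file_.data() + file_.size(), sm_);
        lexertl::citerator end_;

        for (; iter_ != end_; ++iter_)
            std::cout << iter_->str() << '\n';
    }
}
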


@@ -0,0 +1,25 @@
// narrow.hpp
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_NARROW_HPP
#define LEXERTL_NARROW_HPP
#include <sstream>
namespace lexertl
{
template<typename char_type>
void narrow(const char_type *str_, std::ostringstream &ss_)
{
while (*str_)
{
// Safe to simply cast to char
// when the string only contains ASCII.
ss_ << static_cast<char>(*str_++);
}
}
}
#endif


@@ -0,0 +1,16 @@
// observer_ptr.hpp
// Copyright (c) 2017-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_OBSERVER_PTR_HPP
#define LEXERTL_OBSERVER_PTR_HPP
namespace lexertl
{
template<typename T>
using observer_ptr = T *;
}
#endif


@@ -0,0 +1,926 @@
// parser.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_PARSER_HPP
#define LEXERTL_PARSER_HPP
#include <assert.h>
#include <algorithm>
#include "tree/end_node.hpp"
#include "tree/iteration_node.hpp"
#include "tree/leaf_node.hpp"
#include <map>
#include "tokeniser/re_tokeniser.hpp"
#include "../runtime_error.hpp"
#include "tree/selection_node.hpp"
#include "tree/sequence_node.hpp"
#include <type_traits>
#include <vector>
namespace lexertl
{
namespace detail
{
/*
General principles of regex parsing:
- Every regex is a sequence of sub-regexes.
- Regexes consist of operands and operators.
- All operators decompose to sequence, selection ('|') and iteration ('*')
- Regex tokens are stored on a stack.
- When a complete sequence of regex tokens is on the stack it is processed.
Grammar:
<REGEX> -> <OREXP>
<OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
<SEQUENCE> -> <SUB>
<SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
<EXPRESSION> -> <REPEAT>
<REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
<DUPLICATE> -> '?' | '??' | '*' | '*?' | '+' | '+?' | '{n[,[m]]}' |
'{n[,[m]]}?'
*/
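// Informal example: for the regex "ab|c*", <OREXP> selects between the
// <SEQUENCE> "ab" (two charset <EXPRESSION>s combined through <SUB>) and
// the <SEQUENCE> "c*" (a charset <REPEAT> followed by the '*' <DUPLICATE>).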
template<typename rules_char_type, typename sm_traits>
class basic_parser
{
public:
enum {char_24_bit = sm_traits::char_24_bit};
using char_type = typename sm_traits::char_type;
using id_type = typename sm_traits::id_type;
using end_node = basic_end_node<id_type>;
using input_char_type = typename sm_traits::input_char_type;
using input_string_token = basic_string_token<input_char_type>;
using iteration_node = basic_iteration_node<id_type>;
using leaf_node = basic_leaf_node<id_type>;
using tokeniser =
basic_re_tokeniser<rules_char_type, input_char_type, id_type>;
using node = basic_node<id_type>;
using node_ptr_vector = typename node::node_ptr_vector;
using string = std::basic_string<rules_char_type>;
using string_token = basic_string_token<char_type>;
using selection_node = basic_selection_node<id_type>;
using sequence_node = basic_sequence_node<id_type>;
using charset_map = std::map<string_token, id_type>;
using charset_pair = std::pair<string_token, id_type>;
using compressed = std::integral_constant<bool, sm_traits::compressed>;
using token = basic_re_token<rules_char_type, input_char_type>;
static_assert(std::is_move_assignable<token>::value &&
std::is_move_constructible<token>::value,
"token is not movable.");
using token_vector = std::vector<token>;
basic_parser(const std::locale &locale_,
node_ptr_vector &node_ptr_vector_,
charset_map &charset_map_, const id_type eoi_) :
_locale(locale_),
_node_ptr_vector(node_ptr_vector_),
_charset_map(charset_map_),
_eoi(eoi_),
_token_stack(),
_tree_node_stack()
{
}
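// Operator-precedence driver: compares the token on top of the stack with
// the incoming token and either shifts ('<'/'=') or reduces ('>') until the
// stack is empty, then appends an end_node and returns the root of the
// syntax tree.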
observer_ptr<node> parse(const token_vector &regex_, const id_type id_,
const id_type user_id_, const id_type next_dfa_,
const id_type push_dfa_, const bool pop_dfa_,
const std::size_t flags_, id_type &nl_id_, const bool seen_bol_)
{
auto iter_ = regex_.cbegin();
auto end_ = regex_.cend();
observer_ptr<node> root_ = nullptr;
observer_ptr<token> lhs_token_ = nullptr;
// There cannot be fewer than 2 tokens
auto rhs_token_ = std::make_unique<token>(*iter_++);
char action_ = 0;
_token_stack.emplace(std::move(rhs_token_));
rhs_token_ = std::make_unique<token>(*iter_);
if (iter_ + 1 != end_) ++iter_;
do
{
lhs_token_ = _token_stack.top().get();
action_ = lhs_token_->precedence(rhs_token_->_type);
switch (action_)
{
case '<':
case '=':
_token_stack.emplace(std::move(rhs_token_));
rhs_token_ = std::make_unique<token>(*iter_);
if (iter_ + 1 != end_) ++iter_;
break;
case '>':
reduce(nl_id_);
break;
default:
{
std::ostringstream ss_;
ss_ << "A syntax error occurred: '" <<
lhs_token_->precedence_string() <<
"' against '" << rhs_token_->precedence_string() <<
" in rule id " << id_ << '.';
throw runtime_error(ss_.str());
break;
}
}
} while (!_token_stack.empty());
if (_tree_node_stack.empty())
{
std::ostringstream ss_;
ss_ << "Empty rules are not allowed in rule id " <<
id_ << '.';
throw runtime_error(ss_.str());
}
assert(_tree_node_stack.size() == 1);
observer_ptr<node> lhs_node_ = _tree_node_stack.top();
_tree_node_stack.pop();
_node_ptr_vector.emplace_back(std::make_unique<end_node>
(id_, user_id_, next_dfa_, push_dfa_, pop_dfa_));
observer_ptr<node> rhs_node_ = _node_ptr_vector.back().get();
_node_ptr_vector.emplace_back(std::make_unique<sequence_node>
(lhs_node_, rhs_node_));
root_ = _node_ptr_vector.back().get();
if (seen_bol_)
{
fixup_bol(root_);
}
if ((flags_ & match_zero_len) == 0)
{
const auto &firstpos_ = root_->firstpos();
for (observer_ptr<const node> node_ : firstpos_)
{
if (node_->end_state())
{
std::ostringstream ss_;
ss_ << "Rules that match zero characters are not allowed "
"as this can cause an infinite loop in user code. The "
"match_zero_len flag overrides this check. Rule id " <<
id_ << '.';
throw runtime_error(ss_.str());
}
}
}
return root_;
}
static id_type bol_token()
{
return static_cast<id_type>(~1);
}
static id_type eol_token()
{
return static_cast<id_type>(~2);
}
private:
using input_range = typename input_string_token::range;
using range = typename string_token::range;
using string_token_vector = std::vector<std::unique_ptr<string_token>>;
using token_stack = std::stack<std::unique_ptr<token>>;
using tree_node_stack = typename node::node_stack;
const std::locale &_locale;
node_ptr_vector &_node_ptr_vector;
charset_map &_charset_map;
id_type _eoi;
token_stack _token_stack;
tree_node_stack _tree_node_stack;
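// Pops a complete handle (tokens related by '=' precedence) off the token
// stack and replaces it with the non-terminal it reduces to, building
// syntax tree nodes as a side effect.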
void reduce(id_type &nl_id_)
{
observer_ptr<token> lhs_ = nullptr;
observer_ptr<token> rhs_ = nullptr;
token_stack handle_;
char action_ = 0;
do
{
handle_.emplace();
rhs_ = _token_stack.top().release();
handle_.top().reset(rhs_);
_token_stack.pop();
if (!_token_stack.empty())
{
lhs_ = _token_stack.top().get();
action_ = lhs_->precedence(rhs_->_type);
}
} while (!_token_stack.empty() && action_ == '=');
assert(_token_stack.empty() || action_ == '<');
switch (rhs_->_type)
{
case BEGIN:
// finished processing so exit
break;
case REGEX:
// finished parsing, nothing to do
break;
case OREXP:
orexp(handle_);
break;
case SEQUENCE:
_token_stack.emplace(std::make_unique<token>(OREXP));
break;
case SUB:
sub(handle_);
break;
case EXPRESSION:
_token_stack.emplace(std::make_unique<token>(SUB));
break;
case REPEAT:
repeat(handle_);
break;
case BOL:
bol(handle_);
break;
case EOL:
eol(handle_, nl_id_);
break;
case CHARSET:
charset(handle_, compressed());
break;
case OPENPAREN:
openparen(handle_);
break;
case OPT:
case AOPT:
optional(rhs_->_type == OPT);
_token_stack.emplace(std::make_unique<token>(DUP));
break;
case ZEROORMORE:
case AZEROORMORE:
zero_or_more(rhs_->_type == ZEROORMORE);
_token_stack.emplace(std::make_unique<token>(DUP));
break;
case ONEORMORE:
case AONEORMORE:
one_or_more(rhs_->_type == ONEORMORE);
_token_stack.emplace(std::make_unique<token>(DUP));
break;
case REPEATN:
case AREPEATN:
repeatn(rhs_->_type == REPEATN, handle_.top().get());
_token_stack.emplace(std::make_unique<token>(DUP));
break;
default:
throw runtime_error
("Internal error in regex_parser::reduce.");
break;
}
}
void orexp(token_stack &handle_)
{
assert(handle_.top()->_type == OREXP &&
(handle_.size() == 1 || handle_.size() == 3));
if (handle_.size() == 1)
{
_token_stack.emplace(std::make_unique<token>(REGEX));
}
else
{
handle_.pop();
assert(handle_.top()->_type == OR);
handle_.pop();
assert(handle_.top()->_type == SEQUENCE);
perform_or();
_token_stack.emplace(std::make_unique<token>(OREXP));
}
}
void perform_or()
{
// perform or
observer_ptr<node> rhs_ = _tree_node_stack.top();
_tree_node_stack.pop();
observer_ptr<node> lhs_ = _tree_node_stack.top();
_node_ptr_vector.emplace_back
(std::make_unique<selection_node>(lhs_, rhs_));
_tree_node_stack.top() = _node_ptr_vector.back().get();
}
void sub(token_stack &handle_)
{
assert((handle_.top()->_type == SUB &&
handle_.size() == 1) || handle_.size() == 2);
if (handle_.size() == 1)
{
_token_stack.emplace(std::make_unique<token>(SEQUENCE));
}
else
{
handle_.pop();
assert(handle_.top()->_type == EXPRESSION);
// perform join
sequence();
_token_stack.emplace(std::make_unique<token>(SUB));
}
}
void repeat(token_stack &handle_)
{
assert(handle_.top()->_type == REPEAT &&
handle_.size() >= 1 && handle_.size() <= 3);
if (handle_.size() == 1)
{
_token_stack.emplace(std::make_unique<token>(EXPRESSION));
}
else
{
handle_.pop();
assert(handle_.top()->_type == DUP);
_token_stack.emplace(std::make_unique<token>(REPEAT));
}
}
#ifndef NDEBUG
void bol(token_stack &handle_)
#else
void bol(token_stack &)
#endif
{
assert(handle_.top()->_type == BOL &&
handle_.size() == 1);
// store charset
_node_ptr_vector.emplace_back
(std::make_unique<leaf_node>(bol_token(), true));
_tree_node_stack.push(_node_ptr_vector.back().get());
_token_stack.emplace(std::make_unique<token>(REPEAT));
}
#ifndef NDEBUG
void eol(token_stack &handle_, id_type &nl_id_)
#else
void eol(token_stack &, id_type &nl_id_)
#endif
{
const string_token nl_('\n');
const id_type temp_nl_id_ = lookup(nl_);
assert(handle_.top()->_type == EOL &&
handle_.size() == 1);
if (temp_nl_id_ != ~static_cast<id_type>(0))
{
nl_id_ = temp_nl_id_;
}
// store charset
_node_ptr_vector.emplace_back
(std::make_unique<leaf_node>(eol_token(), true));
_tree_node_stack.push(_node_ptr_vector.back().get());
_token_stack.emplace(std::make_unique<token>(REPEAT));
}
// Uncompressed
void charset(token_stack &handle_, const std::false_type &)
{
assert(handle_.top()->_type == CHARSET &&
handle_.size() == 1);
const id_type id_ = lookup(handle_.top()->_str);
// store charset
_node_ptr_vector.emplace_back(std::make_unique<leaf_node>(id_, true));
_tree_node_stack.push(_node_ptr_vector.back().get());
_token_stack.emplace(std::make_unique<token>(REPEAT));
}
// Compressed
void charset(token_stack &handle_, const std::true_type &)
{
assert(handle_.top()->_type == CHARSET &&
handle_.size() == 1);
std::unique_ptr<token> token_(handle_.top().release());
handle_.pop();
create_sequence(token_);
}
// Slice wchar_t into sequence of char.
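// Each wide range is sliced into per-byte ranges so that a char-based state
// machine can match it, e.g. (in the 16 bit case) the range [0x0102-0x0304]
// becomes 0x01 [0x02-0xff] | 0x02 [0x00-0xff] | 0x03 [0x00-0x04].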
void create_sequence(std::unique_ptr<token> &token_)
{
string_token_vector data_[char_24_bit ? 3 : 2];
for (const input_range &range_ : token_->_str._ranges)
{
slice_range(range_, data_,
std::integral_constant<bool, char_24_bit>());
}
push_ranges(data_, std::integral_constant<bool, char_24_bit>());
_token_stack.emplace(std::make_unique<token>(OPENPAREN));
_token_stack.emplace(std::make_unique<token>(REGEX));
_token_stack.emplace(std::make_unique<token>(CLOSEPAREN));
}
// 16 bit unicode
void slice_range(const input_range &range_, string_token_vector data_[2],
const std::false_type &)
{
const unsigned char first_msb_ = static_cast<unsigned char>
((range_.first >> 8) & 0xff);
const unsigned char first_lsb_ = static_cast<unsigned char>
(range_.first & 0xff);
const unsigned char second_msb_ = static_cast<unsigned char>
((range_.second >> 8) & 0xff);
const unsigned char second_lsb_ = static_cast<unsigned char>
(range_.second & 0xff);
if (first_msb_ == second_msb_)
{
insert_range(first_msb_, first_msb_, first_lsb_,
second_lsb_, data_);
}
else
{
insert_range(first_msb_, first_msb_, first_lsb_, 0xff, data_);
if (second_msb_ > first_msb_ + 1)
{
insert_range(first_msb_ + 1, second_msb_ - 1, 0, 0xff, data_);
}
insert_range(second_msb_, second_msb_, 0, second_lsb_, data_);
}
}
// 24 bit unicode
void slice_range(const input_range &range_, string_token_vector data_[3],
const std::true_type &)
{
const unsigned char first_msb_ = static_cast<unsigned char>
((range_.first >> 16) & 0xff);
const unsigned char first_mid_ = static_cast<unsigned char>
((range_.first >> 8) & 0xff);
const unsigned char first_lsb_ = static_cast<unsigned char>
(range_.first & 0xff);
const unsigned char second_msb_ = static_cast<unsigned char>
((range_.second >> 16) & 0xff);
const unsigned char second_mid_ = static_cast<unsigned char>
((range_.second >> 8) & 0xff);
const unsigned char second_lsb_ = static_cast<unsigned char>
(range_.second & 0xff);
if (first_msb_ == second_msb_)
{
string_token_vector data2_[2];
// Re-use 16 bit slice function
slice_range(range_, data2_, std::false_type());
for (std::size_t i_ = 0, size_ = data2_[0].size();
i_ < size_; ++i_)
{
insert_range(string_token(first_msb_, first_msb_),
*data2_[0][i_], *data2_[1][i_], data_);
}
}
else
{
insert_range(first_msb_, first_msb_,
first_mid_, first_mid_,
first_lsb_, 0xff, data_);
if (first_mid_ != 0xff)
{
insert_range(first_msb_, first_msb_,
first_mid_ + 1, 0xff,
0, 0xff, data_);
}
if (second_msb_ > first_msb_ + 1)
{
insert_range(first_msb_ + 1, second_msb_ - 1,
0, 0xff,
0, 0xff, data_);
}
if (second_mid_ != 0)
{
insert_range(second_msb_, second_msb_,
0, second_mid_ - 1,
0, 0xff, data_);
insert_range(second_msb_, second_msb_,
second_mid_, second_mid_,
0, second_lsb_, data_);
}
else
{
insert_range(second_msb_, second_msb_,
0, second_mid_,
0, second_lsb_, data_);
}
}
}
// 16 bit unicode
void insert_range(const unsigned char first_, const unsigned char second_,
const unsigned char first2_, const unsigned char second2_,
string_token_vector data_[2])
{
const string_token token_(first_ > second_ ? second_ : first_,
first_ > second_ ? first_ : second_);
const string_token token2_(first2_ > second2_ ? second2_ : first2_,
first2_ > second2_ ? first2_ : second2_);
insert_range(token_, token2_, data_);
}
void insert_range(const string_token &token_, const string_token &token2_,
string_token_vector data_[2])
{
typename string_token_vector::const_iterator iter_ =
std::find_if(data_[0].begin(), data_[0].end(),
[&token_](const std::unique_ptr<string_token> &rhs_)
{
return token_ == *rhs_.get();
});
if (iter_ == data_[0].end())
{
data_[0].emplace_back(std::make_unique<string_token>(token_));
data_[1].emplace_back(std::make_unique<string_token>(token2_));
}
else
{
const std::size_t index_ = iter_ - data_[0].begin();
data_[1][index_]->insert(token2_);
}
}
// 24 bit unicode
void insert_range(const unsigned char first_, const unsigned char second_,
const unsigned char first2_, const unsigned char second2_,
const unsigned char first3_, const unsigned char second3_,
string_token_vector data_[3])
{
const string_token token_(first_ > second_ ? second_ : first_,
first_ > second_ ? first_ : second_);
const string_token token2_(first2_ > second2_ ? second2_ : first2_,
first2_ > second2_ ? first2_ : second2_);
const string_token token3_(first3_ > second3_ ? second3_ : first3_,
first3_ > second3_ ? first3_ : second3_);
insert_range(token_, token2_, token3_, data_);
}
void insert_range(const string_token &token_, const string_token &token2_,
const string_token &token3_, string_token_vector data_[3])
{
auto iter_ = data_[0].cbegin();
auto end_ = data_[0].cend();
bool finished_ = false;
do
{
iter_ = std::find_if(iter_, end_,
[&token_](const std::unique_ptr<string_token> &rhs_)
{
return token_ == *rhs_.get();
});
if (iter_ == end_)
{
data_[0].emplace_back(std::make_unique<string_token>(token_));
data_[1].emplace_back(std::make_unique<string_token>(token2_));
data_[2].emplace_back(std::make_unique<string_token>(token3_));
finished_ = true;
}
else
{
const std::size_t index_ = iter_ - data_[0].begin();
if (*data_[1][index_] == token2_)
{
data_[2][index_]->insert(token3_);
finished_ = true;
}
else
{
++iter_;
}
}
} while (!finished_);
}
// 16 bit unicode
void push_ranges(string_token_vector data_[2], const std::false_type &)
{
auto viter_ = data_[0].cbegin();
auto vend_ = data_[0].cend();
auto viter2_ = data_[1].cbegin();
push_range(viter_++->get());
push_range(viter2_++->get());
sequence();
while (viter_ != vend_)
{
push_range(viter_++->get());
push_range(viter2_++->get());
sequence();
perform_or();
}
}
// 24 bit unicode
void push_ranges(string_token_vector data_[3], const std::true_type &)
{
auto viter_ = data_[0].cbegin();
auto vend_ = data_[0].cend();
auto viter2_ = data_[1].cbegin();
auto viter3_ = data_[2].cbegin();
push_range(viter_++->get());
push_range(viter2_++->get());
sequence();
push_range(viter3_++->get());
sequence();
while (viter_ != vend_)
{
push_range(viter_++->get());
push_range(viter2_++->get());
sequence();
push_range(viter3_++->get());
sequence();
perform_or();
}
}
void push_range(observer_ptr<const string_token> token_)
{
const id_type id_ = lookup(*token_);
_node_ptr_vector.emplace_back(std::make_unique<leaf_node>(id_, true));
_tree_node_stack.push(_node_ptr_vector.back().get());
}
id_type lookup(const string_token &charset_)
{
// Converted to id_type below.
std::size_t id_ = sm_traits::npos();
if (static_cast<id_type>(id_) < id_)
{
throw runtime_error("id_type is not large enough "
"to hold all ids.");
}
typename charset_map::const_iterator iter_ =
_charset_map.find(charset_);
if (iter_ == _charset_map.end())
{
id_ = _charset_map.size();
_charset_map.insert(charset_pair(charset_,
static_cast<id_type>(id_)));
}
else
{
id_ = iter_->second;
}
return static_cast<id_type>(id_);
}
void openparen(token_stack &handle_)
{
assert(handle_.top()->_type == OPENPAREN &&
handle_.size() == 3);
handle_.pop();
assert(handle_.top()->_type == REGEX);
handle_.pop();
assert(handle_.top()->_type == CLOSEPAREN);
_token_stack.emplace(std::make_unique<token>(REPEAT));
}
void sequence()
{
observer_ptr<node> rhs_ = _tree_node_stack.top();
_tree_node_stack.pop();
observer_ptr<node> lhs_ = _tree_node_stack.top();
_node_ptr_vector.emplace_back
(std::make_unique<sequence_node>(lhs_, rhs_));
_tree_node_stack.top() = _node_ptr_vector.back().get();
}
void optional(const bool greedy_)
{
// perform ?
observer_ptr<node> lhs_ = _tree_node_stack.top();
// Don't know if lhs_ is a leaf_node, so get firstpos.
auto &firstpos_ = lhs_->firstpos();
for (observer_ptr<node> node_ : firstpos_)
{
// These are leaf_nodes!
node_->greedy(greedy_);
}
_node_ptr_vector.emplace_back(std::make_unique<leaf_node>
(node::null_token(), greedy_));
observer_ptr<node> rhs_ = _node_ptr_vector.back().get();
_node_ptr_vector.emplace_back
(std::make_unique<selection_node>(lhs_, rhs_));
_tree_node_stack.top() = _node_ptr_vector.back().get();
}
void zero_or_more(const bool greedy_)
{
// perform *
observer_ptr<node> ptr_ = _tree_node_stack.top();
_node_ptr_vector.emplace_back
(std::make_unique<iteration_node>(ptr_, greedy_));
_tree_node_stack.top() = _node_ptr_vector.back().get();
}
void one_or_more(const bool greedy_)
{
// perform +
observer_ptr<node> lhs_ = _tree_node_stack.top();
observer_ptr<node> copy_ = lhs_->copy(_node_ptr_vector);
_node_ptr_vector.emplace_back(std::make_unique<iteration_node>
(copy_, greedy_));
observer_ptr<node> rhs_ = _node_ptr_vector.back().get();
_node_ptr_vector.emplace_back
(std::make_unique<sequence_node>(lhs_, rhs_));
_tree_node_stack.top() = _node_ptr_vector.back().get();
}
// perform {n[,[m]]}
// Semantic checks have already been performed.
// {0,} = *
// {0,1} = ?
// {1,} = +
// therefore we do not check for these cases.
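// e.g. "a{2,4}" is expanded to the equivalent of "aaa?a?".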
void repeatn(const bool greedy_, observer_ptr<const token> token_)
{
const rules_char_type *str_ = token_->_extra.c_str();
std::size_t min_ = 0;
bool comma_ = false;
std::size_t max_ = 0;
while (*str_ >= '0' && *str_ <= '9')
{
min_ *= 10;
min_ += *str_ - '0';
++str_;
}
comma_ = *str_ == ',';
if (comma_) ++str_;
while (*str_ >= '0' && *str_ <= '9')
{
max_ *= 10;
max_ += *str_ - '0';
++str_;
}
if (!(min_ == 1 && !comma_))
{
const std::size_t top_ = min_ > 0 ? min_ : max_;
if (min_ == 0)
{
optional(greedy_);
}
observer_ptr<node> prev_ = _tree_node_stack.top()->
copy(_node_ptr_vector);
observer_ptr<node> curr_ = nullptr;
for (std::size_t i_ = 2; i_ < top_; ++i_)
{
curr_ = prev_->copy(_node_ptr_vector);
_tree_node_stack.push(prev_);
sequence();
prev_ = curr_;
}
if (comma_ && min_ > 0)
{
if (min_ > 1)
{
curr_ = prev_->copy(_node_ptr_vector);
_tree_node_stack.push(prev_);
sequence();
prev_ = curr_;
}
if (comma_ && max_)
{
_tree_node_stack.push(prev_);
optional(greedy_);
prev_ = _tree_node_stack.top();
_tree_node_stack.pop();
const std::size_t count_ = max_ - min_;
for (std::size_t i_ = 1; i_ < count_; ++i_)
{
curr_ = prev_->copy(_node_ptr_vector);
_tree_node_stack.push(prev_);
sequence();
prev_ = curr_;
}
}
else
{
_tree_node_stack.push(prev_);
zero_or_more(greedy_);
prev_ = _tree_node_stack.top();
_tree_node_stack.pop();
}
}
_tree_node_stack.push(prev_);
sequence();
}
}
void fixup_bol(observer_ptr<node> &root_)const
{
const auto &first_ = root_->firstpos();
bool found_ = false;
for (observer_ptr<const node> node_ : first_)
{
found_ = !node_->end_state() && node_->token() == bol_token();
if (found_) break;
}
if (!found_)
{
_node_ptr_vector.emplace_back
(std::make_unique<leaf_node>(bol_token(), true));
observer_ptr<node> lhs_ = _node_ptr_vector.back().get();
_node_ptr_vector.emplace_back
(std::make_unique<leaf_node>(node::null_token(), true));
observer_ptr<node> rhs_ = _node_ptr_vector.back().get();
_node_ptr_vector.emplace_back
(std::make_unique<selection_node>(lhs_, rhs_));
lhs_ = _node_ptr_vector.back().get();
_node_ptr_vector.emplace_back
(std::make_unique<sequence_node>(lhs_, root_));
root_ = _node_ptr_vector.back().get();
}
}
};
}
}
#endif

View File

@@ -0,0 +1,100 @@
// re_token.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKEN_HPP
#define LEXERTL_RE_TOKEN_HPP
#include "../../string_token.hpp"
namespace lexertl
{
namespace detail
{
// Note that tokens following END are never seen by parser.hpp.
enum token_type {BEGIN, REGEX, OREXP, SEQUENCE, SUB, EXPRESSION, REPEAT,
DUP, OR, CHARSET, BOL, EOL, MACRO, OPENPAREN, CLOSEPAREN, OPT, AOPT,
ZEROORMORE, AZEROORMORE, ONEORMORE, AONEORMORE, REPEATN, AREPEATN,
END, DIFF};
template<typename input_char_type, typename char_type>
struct basic_re_token
{
using string_token = basic_string_token<char_type>;
using string = std::basic_string<input_char_type>;
token_type _type;
string _extra;
string_token _str;
basic_re_token(const token_type type_ = BEGIN) :
_type(type_),
_extra(),
_str()
{
}
void clear()
{
_type = BEGIN;
_extra.clear();
_str.clear();
}
void swap(basic_re_token &rhs_)
{
std::swap(_type, rhs_._type);
_extra.swap(rhs_._extra);
_str.swap(rhs_._str);
}
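// Returns the precedence relation between this token and type_:
// '<' or '=' means shift type_, '>' means reduce the current handle and
// ' ' marks an invalid combination (reported as a syntax error).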
char precedence(const token_type type_) const
{
// Moved in here for Solaris compiler.
static const char precedence_table_[END + 1][END + 1] = {
// BEG, REG, ORE, SEQ, SUB, EXP, RPT, DUP, | , CHR, BOL, EOL, MCR, ( , ) , ? , ?? , * , *? , + , +?, {n}, {n}?, END
/*BEGIN*/{ ' ', '<', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/*REGEX*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/*OREXP*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* SEQ */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* SUB */{ ' ', ' ', ' ', ' ', ' ', '=', '<', ' ', '>', '<', '<', '<', '<', '<', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/*EXPRE*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* RPT */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', '>', '>', '>', '<', '<', '<', '<', '<', '<', '<', '<', '>' },
/*DUPLI*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* | */{ ' ', ' ', ' ', '=', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' },
/*CHARA*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
/* BOL */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
/* EOL */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
/*MACRO*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
/* ( */{ ' ', '=', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' },
/* ) */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
/* ? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* ?? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* * */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* *? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* + */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* +? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/*{n,m}*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/*{nm}?*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
/* END */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' }
};
return precedence_table_[_type][type_];
}
const char *precedence_string() const
{
// Moved in here for Solaris compiler.
static const char *precedence_strings_[END + 1] =
{"BEGIN", "REGEX", "OREXP", "SEQUENCE", "SUB", "EXPRESSION",
"REPEAT", "DUPLICATE", "|", "CHARSET", "^", "$", "MACRO", "(", ")",
"?", "??", "*", "*?", "+", "+?", "{n[,[m]]}", "{n[,[m]]}?", "END"};
return precedence_strings_[_type];
}
};
}
}
#endif

View File

@@ -0,0 +1,778 @@
// re_tokeniser.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_HPP
#define LEXERTL_RE_TOKENISER_HPP
#include <cstring>
#include "re_token.hpp"
#include "../../runtime_error.hpp"
#include <sstream>
#include "../../string_token.hpp"
#include "re_tokeniser_helper.hpp"
namespace lexertl
{
namespace detail
{
template<typename rules_char_type, typename char_type, typename id_type>
class basic_re_tokeniser
{
public:
using re_token = basic_re_token<rules_char_type, char_type>;
using tokeniser_helper =
basic_re_tokeniser_helper<rules_char_type, char_type, id_type>;
using char_state = typename tokeniser_helper::char_state;
using state = typename tokeniser_helper::state;
using string_token = basic_string_token<char_type>;
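// Reads the next regex token from state_ into token_, toggling string mode
// on '"', skipping (?#...) comments and, when the skip_ws flag is set,
// whitespace and C style comments.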
static void next(re_token &lhs_, state &state_, re_token &token_)
{
rules_char_type ch_ = 0;
bool eos_ = state_.next(ch_);
bool skipped_ = false;
token_.clear();
do
{
// string begin/end
while (!eos_ && ch_ == '"')
{
state_._in_string ^= 1;
eos_ = state_.next(ch_);
}
if (eos_) break;
// (?# ...)
skipped_ = comment(eos_, ch_, state_);
if (eos_) break;
// skip_ws set
skipped_ |= skip(eos_, ch_, state_);
} while (!eos_ && skipped_);
if (eos_)
{
if (state_._in_string)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing '\"')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
if (state_._paren_count)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing ')')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
token_._type = END;
}
else
{
if (ch_ == '\\')
{
// Even if we are in a string, respect escape sequences...
token_._type = CHARSET;
escape(state_, token_._str);
}
else if (state_._in_string)
{
// All other meta characters lose their special meaning
// inside a string.
token_._type = CHARSET;
add_char(ch_, state_, token_._str);
}
else
{
// Not an escape sequence and not inside a string, so
// check for meta characters.
switch (ch_)
{
case '(':
token_._type = OPENPAREN;
++state_._paren_count;
read_options(state_);
break;
case ')':
--state_._paren_count;
if (state_._paren_count < 0)
{
std::ostringstream ss_;
ss_ << "Number of open parenthesis < 0 "
"at index " << state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
token_._type = CLOSEPAREN;
if (!state_._flags_stack.empty())
{
state_._flags = state_._flags_stack.top();
state_._flags_stack.pop();
}
break;
case '?':
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AOPT;
state_.increment();
}
else
{
token_._type = OPT;
}
break;
case '*':
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AZEROORMORE;
state_.increment();
}
else
{
token_._type = ZEROORMORE;
}
break;
case '+':
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AONEORMORE;
state_.increment();
}
else
{
token_._type = ONEORMORE;
}
break;
case '{':
open_curly(lhs_, state_, token_);
break;
case '|':
token_._type = OR;
break;
case '^':
if (!state_._macro_name &&
state_._curr - 1 == state_._start)
{
token_._type = BOL;
}
else
{
token_._type = CHARSET;
token_._str.insert(range(ch_, ch_));
}
break;
case '$':
if (!state_._macro_name && state_._curr == state_._end)
{
token_._type = EOL;
}
else
{
token_._type = CHARSET;
token_._str.insert(range(ch_, ch_));
}
break;
case '.':
{
token_._type = CHARSET;
if (state_._flags & dot_not_newline)
{
token_._str.insert(range('\n', '\n'));
}
else if (state_._flags & dot_not_cr_lf)
{
token_._str.insert(range('\n', '\n'));
token_._str.insert(range('\r', '\r'));
}
token_._str.negate();
break;
}
case '[':
{
token_._type = CHARSET;
tokeniser_helper::charset(state_, token_._str);
break;
}
case '/':
{
std::ostringstream ss_;
ss_ << "Lookahead ('/') is not supported yet";
state_.error(ss_);
throw runtime_error(ss_.str());
break;
}
default:
token_._type = CHARSET;
add_char(ch_, state_, token_._str);
break;
}
}
}
}
private:
using range = typename string_token::range;
static bool comment(bool &eos_, rules_char_type &ch_, state &state_)
{
bool skipped_ = false;
if (!state_._in_string && ch_ == '(' && !state_.eos() &&
*state_._curr == '?' && state_._curr + 1 < state_._end &&
*(state_._curr + 1) == '#')
{
std::size_t paren_count_ = 1;
state_.increment();
state_.increment();
do
{
eos_ = state_.next(ch_);
if (ch_ == '(')
{
++paren_count_;
}
else if (ch_ == ')')
{
--paren_count_;
}
} while (!eos_ && !(ch_ == ')' && paren_count_ == 0));
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (unterminated comment)";
state_.error(ss_);
throw runtime_error(ss_.str());
}
else
{
eos_ = state_.next(ch_);
}
skipped_ = true;
}
return skipped_;
}
static bool skip(bool &eos_, rules_char_type &ch_, state &state_)
{
bool skipped_ = false;
if ((state_._flags & skip_ws) && !state_._in_string)
{
bool c_comment_ = false;
bool skip_ws_ = false;
do
{
c_comment_ = ch_ == '/' && !state_.eos() &&
*state_._curr == '*';
skip_ws_ = !c_comment_ && (ch_ == ' ' || ch_ == '\t' ||
ch_ == '\n' || ch_ == '\r' || ch_ == '\f' || ch_ == '\v');
if (c_comment_)
{
state_.increment();
eos_ = state_.next(ch_);
while (!eos_ && !(ch_ == '*' && !state_.eos() &&
*state_._curr == '/'))
{
eos_ = state_.next(ch_);
}
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (unterminated C style comment)";
state_.error(ss_);
throw runtime_error(ss_.str());
}
else
{
state_.increment();
eos_ = state_.next(ch_);
}
skipped_ = true;
}
else if (skip_ws_)
{
eos_ = state_.next(ch_);
skipped_ = true;
}
} while (!eos_ && (c_comment_ || skip_ws_));
}
return skipped_;
}
static void read_options(state &state_)
{
if (!state_.eos() && *state_._curr == '?')
{
rules_char_type ch_ = 0;
bool eos_ = false;
bool negate_ = false;
state_.increment();
eos_ = state_.next(ch_);
state_._flags_stack.push(state_._flags);
while (!eos_ && ch_ != ':')
{
switch (ch_)
{
case '-':
negate_ ^= 1;
break;
case 'i':
if (negate_)
{
state_._flags = state_._flags & ~icase;
}
else
{
state_._flags = state_._flags | icase;
}
negate_ = false;
break;
case 's':
if (negate_)
{
#ifdef _WIN32
state_._flags = state_._flags | dot_not_cr_lf;
#else
state_._flags = state_._flags | dot_not_newline;
#endif
}
else
{
#ifdef _WIN32
state_._flags = state_._flags & ~dot_not_cr_lf;
#else
state_._flags = state_._flags & ~dot_not_newline;
#endif
}
negate_ = false;
break;
case 'x':
if (negate_)
{
state_._flags = state_._flags & ~skip_ws;
}
else
{
state_._flags = state_._flags | skip_ws;
}
negate_ = false;
break;
default:
{
std::ostringstream ss_;
ss_ << "Unknown option at index " <<
state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
}
eos_ = state_.next(ch_);
}
// End of string handler will handle early termination
}
else if (!state_._flags_stack.empty())
{
state_._flags_stack.push(state_._flags);
}
}
static void escape(state &state_, string_token &token_)
{
char_type ch_ = 0;
std::size_t str_len_ = 0;
const char *str_ = tokeniser_helper::escape_sequence(state_,
ch_, str_len_);
if (str_)
{
char_state state2_(str_ + 1, str_ + str_len_, state_._id,
state_._flags, state_._locale, 0);
tokeniser_helper::charset(state2_, token_);
}
else
{
add_char(ch_, state_, token_);
}
}
static void add_char(const char_type ch_, const state &state_,
string_token &token_)
{
range range_(ch_, ch_);
token_.insert(range_);
if (state_._flags & icase)
{
string_token folded_;
tokeniser_helper::fold(range_, state_._locale,
folded_, typename tokeniser_helper::template
size<sizeof(char_type)>());
if (!folded_.empty())
{
token_.insert(folded_);
}
}
}
static void open_curly(re_token &lhs_, state &state_,
re_token &token_)
{
if (state_.eos())
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing '}')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
else if (*state_._curr == '-' || *state_._curr == '+')
{
rules_char_type ch_ = 0;
if (lhs_._type != CHARSET)
{
std::ostringstream ss_;
ss_ << "CHARSET must precede {" <<
state_._curr << "} at index " <<
state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
state_.next(ch_);
token_._type = DIFF;
token_._extra = ch_;
if (state_.next(ch_))
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing '}')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
}
else if (*state_._curr >= '0' && *state_._curr <= '9')
{
repeat_n(state_, token_);
}
else
{
macro(state_, token_);
}
}
// SYNTAX:
// {n[,[n]]}
// SEMANTIC RULES:
// {0} - INVALID (throw exception)
// {0,} = *
// {0,0} - INVALID (throw exception)
// {0,1} = ?
// {1,} = +
// {min,max} where min == max - {min}
// {min,max} where max < min - INVALID (throw exception)
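// The digits (and any comma) read here are stored in token_._extra so that
// parser.hpp can expand the repeat, unless it is optimised to ?, * or +
// below.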
static void repeat_n(state &state_, re_token &token_)
{
rules_char_type ch_ = 0;
bool eos_ = state_.next(ch_);
std::size_t min_ = 0;
std::size_t max_ = 0;
while (!eos_ && ch_ >= '0' && ch_ <= '9')
{
min_ *= 10;
min_ += ch_ - '0';
token_._extra += ch_;
eos_ = state_.next(ch_);
}
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing repeat terminator '}')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
bool min_max_ = false;
bool repeatn_ = true;
if (ch_ == ',')
{
token_._extra += ch_;
eos_ = state_.next(ch_);
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing repeat terminator '}')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
if (ch_ == '}')
{
// Small optimisation: Check for '*' equivalency.
if (min_ == 0)
{
token_._type = ZEROORMORE;
repeatn_ = false;
}
// Small optimisation: Check for '+' equivalency.
else if (min_ == 1)
{
token_._type = ONEORMORE;
repeatn_ = false;
}
}
else
{
if (ch_ < '0' || ch_ > '9')
{
std::ostringstream ss_;
ss_ << "Missing repeat terminator '}' at index " <<
state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
min_max_ = true;
do
{
max_ *= 10;
max_ += ch_ - '0';
token_._extra += ch_;
eos_ = state_.next(ch_);
} while (!eos_ && ch_ >= '0' && ch_ <= '9');
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing repeat terminator '}')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
// Small optimisation: Check for '?' equivalency.
if (min_ == 0 && max_ == 1)
{
token_._type = OPT;
repeatn_ = false;
}
// Small optimisation: if min == max, treat as {min}.
else if (min_ == max_)
{
token_._extra.erase(token_._extra.find(','));
min_max_ = false;
max_ = 0;
}
}
}
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing repeat terminator '}' at index " <<
state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
if (repeatn_)
{
// SEMANTIC VALIDATION follows:
// NOTE: {0,} has already become *
// therefore we don't check for a comma.
if (min_ == 0 && max_ == 0)
{
std::ostringstream ss_;
ss_ << "Cannot have exactly zero repeats preceding index " <<
state_.index();
state_.error(ss_);
throw runtime_error(ss_.str());
}
if (min_max_ && max_ < min_)
{
std::ostringstream ss_;
ss_ << "Max less than min preceding index " <<
state_.index();
state_.error(ss_);
throw runtime_error(ss_.str());
}
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AREPEATN;
state_.increment();
}
else
{
token_._type = REPEATN;
}
}
else if (token_._type == ZEROORMORE)
{
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AZEROORMORE;
state_.increment();
}
}
else if (token_._type == ONEORMORE)
{
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AONEORMORE;
state_.increment();
}
}
else if (token_._type == OPT)
{
if (!state_.eos() && *state_._curr == '?')
{
token_._type = AOPT;
state_.increment();
}
}
}
static void macro(state &state_, re_token &token_)
{
rules_char_type ch_ = 0;
bool eos_ = false;
state_.next(ch_);
if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
!(ch_ >= 'a' && ch_ <= 'z'))
{
std::ostringstream ss_;
ss_ << "Invalid MACRO name at index " << state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
do
{
token_._extra += ch_;
eos_ = state_.next(ch_);
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
state_.unexpected_end(ss_);
ss_ << " (missing MACRO name terminator '}')";
state_.error(ss_);
throw runtime_error(ss_.str());
}
} while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
(ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing MACRO name terminator '}' at index " <<
state_.index() - 1;
state_.error(ss_);
throw runtime_error(ss_.str());
}
token_._type = MACRO;
}
};
}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,136 @@
// re_tokeniser_state.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_STATE_HPP
#define LEXERTL_RE_TOKENISER_STATE_HPP
#include "../../char_traits.hpp"
#include "../../enums.hpp"
#include <locale>
#include "../../narrow.hpp"
#include <stack>
namespace lexertl
{
namespace detail
{
template<typename ch_type, typename id_type>
struct basic_re_tokeniser_state
{
using char_type = ch_type;
using index_type = typename basic_char_traits<char_type>::index_type;
const char_type * const _start;
const char_type * const _end;
const char_type *_curr;
id_type _id;
std::size_t _flags;
std::stack<std::size_t> _flags_stack;
std::locale _locale;
const char_type *_macro_name;
long _paren_count;
bool _in_string;
id_type _nl_id;
basic_re_tokeniser_state(const char_type *start_,
const char_type * const end_, id_type id_, const std::size_t flags_,
const std::locale locale_, const char_type *macro_name_) :
_start(start_),
_end(end_),
_curr(start_),
_id(id_),
_flags(flags_),
_flags_stack(),
_locale(locale_),
_macro_name(macro_name_),
_paren_count(0),
_in_string(false),
_nl_id(static_cast<id_type>(~0))
{
}
basic_re_tokeniser_state(const basic_re_tokeniser_state &rhs_)
{
assign(rhs_);
}
// prevent VC++ 7.1 warning:
const basic_re_tokeniser_state &operator =
(const basic_re_tokeniser_state &rhs_)
{
return assign(rhs_);
}
basic_re_tokeniser_state &assign(const basic_re_tokeniser_state &rhs_)
{
_start = rhs_._start;
_end = rhs_._end;
_curr = rhs_._curr;
_id = rhs_._id;
_flags = rhs_._flags;
_flags_stack = rhs_._flags_stack;
_locale = rhs_._locale;
_macro_name = rhs_._macro_name;
_paren_count = rhs_._paren_count;
_in_string = rhs_._in_string;
_nl_id = rhs_._nl_id;
return *this;
}
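// Returns true when the end of the input has been reached (ch_ is set to 0),
// otherwise stores the current character in ch_ and advances.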
inline bool next(char_type &ch_)
{
if (_curr >= _end)
{
ch_ = 0;
return true;
}
else
{
ch_ = *_curr;
increment();
return false;
}
}
inline void increment()
{
++_curr;
}
inline std::size_t index()
{
return _curr - _start;
}
inline bool eos()
{
return _curr >= _end;
}
inline void unexpected_end(std::ostringstream &ss_)
{
ss_ << "Unexpected end of regex";
}
inline void error(std::ostringstream &ss_)
{
ss_ << " in ";
if (_macro_name)
{
ss_ << "MACRO '";
narrow(_macro_name, ss_);
ss_ << "'.";
}
else
{
ss_ << "rule id " << _id << '.';
}
}
};
}
}
#endif

View File

@@ -0,0 +1,111 @@
// end_node.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_END_NODE_HPP
#define LEXERTL_END_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_end_node : public basic_node<id_type>
{
public:
using node = basic_node<id_type>;
using bool_stack = typename node::bool_stack;
using const_node_stack = typename node::const_node_stack;
using node_ptr_vector = typename node::node_ptr_vector;
using node_stack = typename node::node_stack;
using node_type = typename node::node_type;
using node_vector = typename node::node_vector;
basic_end_node(const id_type id_, const id_type user_id_,
const id_type next_dfa_, const id_type push_dfa_,
const bool pop_dfa_) :
node(false),
_id(id_),
_user_id(user_id_),
_next_dfa(next_dfa_),
_push_dfa(push_dfa_),
_pop_dfa(pop_dfa_),
_followpos()
{
node::_firstpos.push_back(this);
node::_lastpos.push_back(this);
}
virtual ~basic_end_node() override
{
}
virtual node_type what_type() const override
{
return node::END;
}
virtual bool traverse(const_node_stack &/*node_stack_*/,
bool_stack &/*perform_op_stack_*/) const override
{
return false;
}
virtual const node_vector &followpos() const override
{
// _followpos is always empty!
return _followpos;
}
virtual bool end_state() const override
{
return true;
}
virtual id_type id() const override
{
return _id;
}
virtual id_type user_id() const override
{
return _user_id;
}
virtual id_type next_dfa() const override
{
return _next_dfa;
}
virtual id_type push_dfa() const override
{
return _push_dfa;
}
virtual bool pop_dfa() const override
{
return _pop_dfa;
}
private:
id_type _id;
id_type _user_id;
id_type _next_dfa;
id_type _push_dfa;
bool _pop_dfa;
node_vector _followpos;
virtual void copy_node(node_ptr_vector &/*node_ptr_vector_*/,
node_stack &/*new_node_stack_*/, bool_stack &/*perform_op_stack_*/,
bool &/*down_*/) const override
{
// Nothing to do, as end_nodes are not copied.
}
};
}
}
#endif

View File

@@ -0,0 +1,96 @@
// iteration_node.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_ITERATION_NODE_HPP
#define LEXERTL_ITERATION_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_iteration_node : public basic_node<id_type>
{
public:
using node = basic_node<id_type>;
using bool_stack = typename node::bool_stack;
using const_node_stack = typename node::const_node_stack;
using node_ptr_vector = typename node::node_ptr_vector;
using node_stack = typename node::node_stack;
using node_type = typename node::node_type;
using node_vector = typename node::node_vector;
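// Closure node: every position in lastpos gains firstpos in its followpos
// set (the standard followpos rule for iteration in the regex to DFA
// construction), and the greedy flag is propagated to the firstpos leaves.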
basic_iteration_node(observer_ptr<node> next_, const bool greedy_) :
node(true),
_next(next_),
_greedy(greedy_)
{
_next->append_firstpos(node::_firstpos);
_next->append_lastpos(node::_lastpos);
for (observer_ptr<node> node_ : node::_lastpos)
{
node_->append_followpos(node::_firstpos);
}
for (observer_ptr<node> node_ : node::_firstpos)
{
node_->greedy(greedy_);
}
}
virtual ~basic_iteration_node() override
{
}
virtual node_type what_type() const override
{
return node::ITERATION;
}
virtual bool traverse(const_node_stack &node_stack_,
bool_stack &perform_op_stack_) const override
{
perform_op_stack_.push(true);
node_stack_.push(_next);
return true;
}
private:
observer_ptr<node> _next;
bool _greedy;
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
bool &down_) const override
{
if (perform_op_stack_.top())
{
observer_ptr<node> ptr_ = new_node_stack_.top();
node_ptr_vector_.emplace_back
(std::make_unique<basic_iteration_node>(ptr_, _greedy));
new_node_stack_.top() = node_ptr_vector_.back().get();
}
else
{
down_ = true;
}
perform_op_stack_.pop();
}
// No copy construction.
basic_iteration_node(const basic_iteration_node &) = delete;
// No assignment.
const basic_iteration_node &operator =
(const basic_iteration_node &) = delete;
};
}
}
#endif

View File

@@ -0,0 +1,110 @@
// leaf_node.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_LEAF_NODE_HPP
#define LEXERTL_LEAF_NODE_HPP
#include "../../enums.hpp" // null_token
#include "node.hpp"
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_leaf_node : public basic_node<id_type>
{
public:
using node = basic_node<id_type>;
using bool_stack = typename node::bool_stack;
using const_node_stack = typename node::const_node_stack;
using node_ptr_vector = typename node::node_ptr_vector;
using node_stack = typename node::node_stack;
using node_type = typename node::node_type;
using node_vector = typename node::node_vector;
basic_leaf_node(const id_type token_, const bool greedy_) :
node(token_ == node::null_token()),
_token(token_),
_set_greedy(!greedy_),
_greedy(greedy_),
_followpos()
{
if (!node::_nullable)
{
node::_firstpos.push_back(this);
node::_lastpos.push_back(this);
}
}
virtual ~basic_leaf_node() override
{
}
virtual void append_followpos(const node_vector &followpos_) override
{
_followpos.insert(_followpos.end(),
followpos_.begin(), followpos_.end());
}
virtual node_type what_type() const override
{
return node::LEAF;
}
virtual bool traverse(const_node_stack &/*node_stack_*/,
bool_stack &/*perform_op_stack_*/) const override
{
return false;
}
virtual id_type token() const override
{
return _token;
}
virtual void greedy(const bool greedy_) override
{
if (!_set_greedy)
{
_greedy = greedy_;
_set_greedy = true;
}
}
virtual bool greedy() const override
{
return _greedy;
}
virtual const node_vector &followpos() const override
{
return _followpos;
}
virtual node_vector &followpos() override
{
return _followpos;
}
private:
id_type _token;
bool _set_greedy;
bool _greedy;
node_vector _followpos;
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &/*perform_op_stack_*/,
bool &/*down_*/) const override
{
node_ptr_vector_.emplace_back(std::make_unique<basic_leaf_node>
(_token, _greedy));
new_node_stack_.push(node_ptr_vector_.back().get());
}
};
}
}
#endif

View File

@@ -0,0 +1,242 @@
// node.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_NODE_HPP
#define LEXERTL_NODE_HPP
#include <assert.h>
#include <memory>
#include "../../observer_ptr.hpp"
#include "../../runtime_error.hpp"
#include <stack>
#include <vector>
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_node
{
public:
enum node_type {LEAF, SEQUENCE, SELECTION, ITERATION, END};
using bool_stack = std::stack<bool>;
using node_stack = std::stack<observer_ptr<basic_node>>;
using const_node_stack = std::stack<observer_ptr<const basic_node>>;
using node_vector = std::vector<observer_ptr<basic_node>>;
using node_ptr_vector = std::vector<std::unique_ptr<basic_node>>;
basic_node() :
_nullable(false),
_firstpos(),
_lastpos()
{
}
basic_node(const bool nullable_) :
_nullable(nullable_),
_firstpos(),
_lastpos()
{
}
virtual ~basic_node()
{
}
static id_type null_token()
{
return static_cast<id_type>(~0);
}
bool nullable() const
{
return _nullable;
}
void append_firstpos(node_vector &firstpos_) const
{
firstpos_.insert(firstpos_.end(),
_firstpos.begin(), _firstpos.end());
}
void append_lastpos(node_vector &lastpos_) const
{
lastpos_.insert(lastpos_.end(),
_lastpos.begin(), _lastpos.end());
}
virtual void append_followpos(const node_vector &/*followpos_*/)
{
throw runtime_error("Internal error node::append_followpos().");
}
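// Deep-copies the subtree rooted at this node into node_ptr_vector_ using
// explicit stacks rather than recursion and returns the new root.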
observer_ptr<basic_node> copy(node_ptr_vector &node_ptr_vector_) const
{
observer_ptr<basic_node> new_root_ = nullptr;
const_node_stack node_stack_;
bool_stack perform_op_stack_;
bool down_ = true;
node_stack new_node_stack_;
node_stack_.push(this);
while (!node_stack_.empty())
{
while (down_)
{
down_ = node_stack_.top()->traverse(node_stack_,
perform_op_stack_);
}
while (!down_ && !node_stack_.empty())
{
observer_ptr<const basic_node> top_ = node_stack_.top();
top_->copy_node(node_ptr_vector_, new_node_stack_,
perform_op_stack_, down_);
if (!down_) node_stack_.pop();
}
}
assert(new_node_stack_.size() == 1);
new_root_ = new_node_stack_.top();
new_node_stack_.pop();
return new_root_;
}
virtual node_type what_type() const = 0;
virtual bool traverse(const_node_stack &node_stack_,
bool_stack &perform_op_stack_) const = 0;
node_vector &firstpos()
{
return _firstpos;
}
const node_vector &firstpos() const
{
return _firstpos;
}
// _lastpos modified externally, so not const &
node_vector &lastpos()
{
return _lastpos;
}
virtual bool end_state() const
{
return false;
}
virtual id_type id() const
{
throw runtime_error("Internal error node::id().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return id_type();
#endif
}
virtual id_type user_id() const
{
throw runtime_error("Internal error node::user_id().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return id_type();
#endif
}
virtual id_type next_dfa() const
{
throw runtime_error("Internal error node::next_dfa().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return id_type();
#endif
}
virtual id_type push_dfa() const
{
throw runtime_error("Internal error node::push_dfa().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return id_type();
#endif
}
virtual bool pop_dfa() const
{
throw runtime_error("Internal error node::pop_dfa().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return false;
#endif
}
virtual id_type token() const
{
throw runtime_error("Internal error node::token().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return id_type();
#endif
}
virtual void greedy(const bool /*greedy_*/)
{
throw runtime_error("Internal error node::greedy(bool).");
}
virtual bool greedy() const
{
throw runtime_error("Internal error node::greedy().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return false;
#endif
}
virtual const node_vector &followpos() const
{
throw runtime_error("Internal error node::followpos().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return _firstpos;
#endif
}
virtual node_vector &followpos()
{
throw runtime_error("Internal error node::followpos().");
#ifdef __SUNPRO_CC
// Stop bogus Solaris compiler warning
return _firstpos;
#endif
}
protected:
const bool _nullable;
node_vector _firstpos;
node_vector _lastpos;
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
bool &down_) const = 0;
private:
// No copy construction.
basic_node(const basic_node &) = delete;
// No assignment.
const basic_node &operator =(const basic_node &) = delete;
};
}
}
#endif

View File

@@ -0,0 +1,104 @@
// selection_node.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SELECTION_NODE_HPP
#define LEXERTL_SELECTION_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_selection_node : public basic_node<id_type>
{
public:
using node = basic_node<id_type>;
using bool_stack = typename node::bool_stack;
using const_node_stack = typename node::const_node_stack;
using node_ptr_vector = typename node::node_ptr_vector;
using node_stack = typename node::node_stack;
using node_type = typename node::node_type;
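// Alternation node: nullable if either child is nullable; firstpos and
// lastpos are the unions of the children's sets.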
basic_selection_node(observer_ptr<node> left_, observer_ptr<node> right_) :
node(left_->nullable() || right_->nullable()),
_left(left_),
_right(right_)
{
_left->append_firstpos(node::_firstpos);
_right->append_firstpos(node::_firstpos);
_left->append_lastpos(node::_lastpos);
_right->append_lastpos(node::_lastpos);
}
virtual ~basic_selection_node() override
{
}
virtual node_type what_type() const override
{
return node::SELECTION;
}
virtual bool traverse(const_node_stack &node_stack_,
bool_stack &perform_op_stack_) const override
{
perform_op_stack_.push(true);
switch (_right->what_type())
{
case node::SEQUENCE:
case node::SELECTION:
case node::ITERATION:
perform_op_stack_.push(false);
break;
default:
break;
}
node_stack_.push(_right);
node_stack_.push(_left);
return true;
}
private:
observer_ptr<node> _left;
observer_ptr<node> _right;
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
bool &down_) const override
{
if (perform_op_stack_.top())
{
observer_ptr<node> rhs_ = new_node_stack_.top();
new_node_stack_.pop();
observer_ptr<node> lhs_ = new_node_stack_.top();
node_ptr_vector_.emplace_back
(std::make_unique<basic_selection_node>(lhs_, rhs_));
new_node_stack_.top() = node_ptr_vector_.back().get();
}
else
{
down_ = true;
}
perform_op_stack_.pop();
}
// No copy construction.
basic_selection_node(const basic_selection_node &) = delete;
// No assignment.
const basic_selection_node &operator =
(const basic_selection_node &) = delete;
};
}
}
#endif

View File

@@ -0,0 +1,121 @@
// sequence_node.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SEQUENCE_NODE_HPP
#define LEXERTL_SEQUENCE_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_sequence_node : public basic_node<id_type>
{
public:
using node = basic_node<id_type>;
using bool_stack = typename node::bool_stack;
using const_node_stack = typename node::const_node_stack;
using node_ptr_vector = typename node::node_ptr_vector;
using node_stack = typename node::node_stack;
using node_type = typename node::node_type;
using node_vector = typename node::node_vector;
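// Concatenation node: nullable only if both children are nullable; every
// position in the left child's lastpos gains the right child's firstpos in
// its followpos set.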
basic_sequence_node(observer_ptr<node> left_, observer_ptr<node> right_) :
node(left_->nullable() && right_->nullable()),
_left(left_),
_right(right_)
{
_left->append_firstpos(node::_firstpos);
if (_left->nullable())
{
_right->append_firstpos(node::_firstpos);
}
if (_right->nullable())
{
_left->append_lastpos(node::_lastpos);
}
_right->append_lastpos(node::_lastpos);
node_vector &lastpos_ = _left->lastpos();
const node_vector &firstpos_ = _right->firstpos();
for (observer_ptr<node> node_ : lastpos_)
{
node_->append_followpos(firstpos_);
}
}
virtual ~basic_sequence_node() override
{
}
virtual node_type what_type() const override
{
return node::SEQUENCE;
}
virtual bool traverse(const_node_stack &node_stack_,
bool_stack &perform_op_stack_) const override
{
perform_op_stack_.push(true);
switch (_right->what_type())
{
case node::SEQUENCE:
case node::SELECTION:
case node::ITERATION:
perform_op_stack_.push(false);
break;
default:
break;
}
node_stack_.push(_right);
node_stack_.push(_left);
return true;
}
private:
observer_ptr<node> _left;
observer_ptr<node> _right;
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
bool &down_) const override
{
if (perform_op_stack_.top())
{
observer_ptr<node> rhs_ = new_node_stack_.top();
new_node_stack_.pop();
observer_ptr<node> lhs_ = new_node_stack_.top();
node_ptr_vector_.emplace_back
(std::make_unique<basic_sequence_node>(lhs_, rhs_));
new_node_stack_.top() = node_ptr_vector_.back().get();
}
else
{
down_ = true;
}
perform_op_stack_.pop();
}
// No copy construction.
basic_sequence_node(const basic_sequence_node &) = delete;
// No assignment.
const basic_sequence_node &operator =(const basic_sequence_node &) = delete;
};
}
}
#endif
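
The selection and sequence nodes above implement the standard firstpos/lastpos/followpos construction used to build a DFA straight from the regex syntax tree: a selection is nullable if either child is and unions the children's position sets, while a sequence links every position in the left child's lastpos to every position in the right child's firstpos. A standalone illustration of that last rule (hypothetical position numbers, plain std::set in place of the library's node classes):

#include <iostream>
#include <set>

int main()
{
    // For the regex "ab", number 'a' as position 1 and 'b' as position 2.
    const std::set<int> lastpos_left_{1};   // lastpos of the left child 'a'
    const std::set<int> firstpos_right_{2}; // firstpos of the right child 'b'
    std::set<int> followpos_1_;             // followpos of position 1

    // Sequence rule: every position that can end the left operand is
    // followed by every position that can start the right operand.
    followpos_1_.insert(firstpos_right_.begin(), firstpos_right_.end());

    for (const int pos_ : followpos_1_)
    {
        std::cout << "followpos(1) contains " << pos_ << '\n';
    }

    return 0;
}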

View File

@ -0,0 +1,72 @@
// charset.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_CHARSET_HPP
#define LEXERTL_CHARSET_HPP
#include <algorithm>
#include <iterator>
#include <set>
#include "../string_token.hpp"
namespace lexertl
{
namespace detail
{
template<typename char_type, typename id_type>
struct basic_charset
{
using token = basic_string_token<char_type>;
using index_set = std::set<id_type>;
token _token;
index_set _index_set;
basic_charset() :
_token(),
_index_set()
{
}
basic_charset(const token &token_, const id_type index_) :
_token(token_),
_index_set()
{
_index_set.insert(index_);
}
bool empty() const
{
return _token.empty() && _index_set.empty();
}
void intersect(basic_charset &rhs_, basic_charset &overlap_)
{
_token.intersect(rhs_._token, overlap_._token);
if (!overlap_._token.empty())
{
std::merge(_index_set.begin(), _index_set.end(),
rhs_._index_set.begin(), rhs_._index_set.end(),
std::inserter(overlap_._index_set,
overlap_._index_set.end()));
if (_token.empty())
{
_index_set.clear();
}
if (rhs_._token.empty())
{
rhs_._index_set.clear();
}
}
}
};
}
}
#endif
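
A minimal usage sketch (an assumed illustration with hypothetical position indexes 0 and 1; the include path follows the lexertl/ layout used by this commit): intersect() splits two overlapping charsets into disjoint pieces and records which positions own the overlap.

#include <cstdint>
#include <iostream>
#include "lexertl/partition/charset.hpp"

int main()
{
    using charset = lexertl::detail::basic_charset<char, std::uint16_t>;

    // [a-m] belongs to position 0, [h-z] to position 1.
    charset lhs_(charset::token('a', 'm'), 0);
    charset rhs_(charset::token('h', 'z'), 1);
    charset overlap_;

    lhs_.intersect(rhs_, overlap_);
    // lhs_ now keeps [a-g] owned by {0}, rhs_ keeps [n-z] owned by {1},
    // and overlap_ holds [h-m] owned by {0, 1}.
    std::cout << overlap_._index_set.size() << " owners share the overlap\n";
    return 0;
}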

View File

@ -0,0 +1,135 @@
// equivset.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_EQUIVSET_HPP
#define LEXERTL_EQUIVSET_HPP
#include <algorithm>
#include "../parser/tree/node.hpp"
#include <set>
namespace lexertl
{
namespace detail
{
template<typename id_type>
struct basic_equivset
{
using index_set = std::set<id_type>;
using index_vector = std::vector<id_type>;
using node = basic_node<id_type>;
using node_vector = std::vector<observer_ptr<node>>;
index_vector _index_vector;
id_type _id;
bool _greedy;
node_vector _followpos;
basic_equivset() :
_index_vector(),
_id(0),
_greedy(true),
_followpos()
{
}
basic_equivset(const index_set &index_set_, const id_type id_,
const bool greedy_, const node_vector &followpos_) :
_index_vector(index_set_.begin(), index_set_.end()),
_id(id_),
_greedy(greedy_),
_followpos(followpos_)
{
}
bool empty() const
{
return _index_vector.empty() && _followpos.empty();
}
void intersect(basic_equivset &rhs_, basic_equivset &overlap_)
{
intersect_indexes(rhs_._index_vector, overlap_._index_vector);
if (!overlap_._index_vector.empty())
{
// Note that the LHS takes priority in order to
// respect rule ordering in the lex spec.
overlap_._id = _id;
overlap_._greedy = _greedy;
overlap_._followpos = _followpos;
auto overlap_begin_ = overlap_._followpos.cbegin();
auto overlap_end_ = overlap_._followpos.cend();
for (observer_ptr<node> node_ : rhs_._followpos)
{
if (std::find(overlap_begin_, overlap_end_, node_) ==
overlap_end_)
{
overlap_._followpos.push_back(node_);
overlap_begin_ = overlap_._followpos.begin();
overlap_end_ = overlap_._followpos.end();
}
}
if (_index_vector.empty())
{
_followpos.clear();
}
if (rhs_._index_vector.empty())
{
rhs_._followpos.clear();
}
}
}
private:
void intersect_indexes(index_vector &rhs_, index_vector &overlap_)
{
std::set_intersection(_index_vector.begin(), _index_vector.end(),
rhs_.begin(), rhs_.end(), std::back_inserter(overlap_));
if (!overlap_.empty())
{
remove(overlap_, _index_vector);
remove(overlap_, rhs_);
}
}
void remove(const index_vector &source_, index_vector &dest_)
{
auto inter_ = source_.begin();
auto inter_end_ = source_.end();
auto reader_ = std::find(dest_.begin(), dest_.end(), *inter_);
auto writer_ = reader_;
auto dest_end_ = dest_.end();
while (writer_ != dest_end_ && inter_ != inter_end_)
{
if (*reader_ == *inter_)
{
++inter_;
++reader_;
}
else
{
*writer_++ = *reader_++;
}
}
while (reader_ != dest_end_)
{
*writer_++ = *reader_++;
}
dest_.resize(dest_.size() - source_.size());
}
};
}
}
#endif
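
A matching sketch for the equivalence-set side (hypothetical indexes and ids; the followpos vectors are left empty for brevity): intersect() moves the shared indexes into the overlap and keeps the left-hand side's id, so earlier rules win, as the comment above notes.

#include <cstdint>
#include <iostream>
#include "lexertl/partition/equivset.hpp"

int main()
{
    using equivset = lexertl::detail::basic_equivset<std::uint16_t>;

    equivset lhs_(equivset::index_set{1, 2, 3}, 10, true,
        equivset::node_vector{});
    equivset rhs_(equivset::index_set{3, 4}, 20, true,
        equivset::node_vector{});
    equivset overlap_;

    lhs_.intersect(rhs_, overlap_);
    // lhs_._index_vector == {1, 2}, rhs_._index_vector == {4},
    // overlap_._index_vector == {3} and overlap_._id == 10.
    std::cout << "overlap id: " << overlap_._id << '\n';
    return 0;
}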

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,23 @@
// runtime_error.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RUNTIME_ERROR_HPP
#define LEXERTL_RUNTIME_ERROR_HPP
#include <stdexcept>
namespace lexertl
{
class runtime_error : public std::runtime_error
{
public:
runtime_error(const std::string &what_arg_) :
std::runtime_error(what_arg_)
{
}
};
}
#endif
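
A minimal sketch (assumed usage): the library reports problems such as malformed regexes by throwing lexertl::runtime_error, which derives from std::runtime_error and can therefore be caught either way.

#include <iostream>
#include "lexertl/runtime_error.hpp"

int main()
{
    try
    {
        // Hypothetical error message for illustration only.
        throw lexertl::runtime_error("unterminated character class");
    }
    catch (const std::runtime_error &e_)
    {
        std::cerr << e_.what() << '\n';
    }

    return 0;
}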

View File

@ -0,0 +1,28 @@
// serialise.hpp
// Copyright (c) 2007-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SERIALISE_HPP
#define LEXERTL_SERIALISE_HPP
#include "state_machine.hpp"
#include <boost/serialization/vector.hpp>
namespace lexertl
{
// IMPORTANT! This won't work if you don't enable RTTI!
template<typename CharT, typename id_type, class Archive>
void serialise(basic_state_machine<CharT, id_type> &sm_, Archive &ar_)
{
detail::basic_internals<id_type> &internals_ = sm_.data();
ar_ & internals_._eoi;
ar_ & internals_._lookup;
ar_ & internals_._dfa_alphabet;
ar_ & internals_._features;
ar_ & internals_._dfa;
}
}
#endif
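
A hedged sketch of round-tripping a state machine with Boost.Serialization (assumes Boost is available, RTTI is enabled as the comment above warns, and sm_ has already been built elsewhere); the same serialise() call both saves and loads because operator& dispatches on the archive type.

#include <fstream>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/text_oarchive.hpp>
#include "lexertl/serialise.hpp"
#include "lexertl/state_machine.hpp"

void save_sm(lexertl::state_machine &sm_, const char *path_)
{
    std::ofstream ofs_(path_);
    boost::archive::text_oarchive oa_(ofs_);

    lexertl::serialise(sm_, oa_);
}

void load_sm(lexertl::state_machine &sm_, const char *path_)
{
    std::ifstream ifs_(path_);
    boost::archive::text_iarchive ia_(ifs_);

    lexertl::serialise(sm_, ia_);
}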

View File

@ -0,0 +1,53 @@
// sm_to_csm.hpp
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SM_TO_CSM_HPP
#define LEXERTL_SM_TO_CSM_HPP
#include "enums.hpp"
#include "observer_ptr.hpp"
#include <cstddef>
namespace lexertl
{
template<typename sm, typename char_state_machine>
void sm_to_csm(const sm &sm_, char_state_machine &csm_)
{
using id_type = typename sm::traits::id_type;
using internals = typename sm::internals;
using string_token = typename char_state_machine::state::string_token;
using index_type = typename string_token::index_type;
using string_token_vector =
typename char_state_machine::string_token_vector;
const internals &internals_ = sm_.data();
const std::size_t dfas_ = internals_._dfa.size();
for (id_type i_ = 0; i_ < dfas_; ++i_)
{
if (internals_._dfa_alphabet[i_] == 0) continue;
const std::size_t alphabet_ = internals_._dfa_alphabet[i_] -
transitions_index;
string_token_vector token_vector_(alphabet_, string_token());
observer_ptr<const id_type> ptr_ = &internals_._lookup[i_].front();
for (std::size_t c_ = 0; c_ < 256; ++c_, ++ptr_)
{
if (*ptr_ >= transitions_index)
{
string_token &token_ = token_vector_
[*ptr_ - transitions_index];
token_.insert(typename string_token::range
(index_type(c_), index_type(c_)));
}
}
csm_.append(token_vector_, internals_, i_);
}
}
}
#endif
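
A small usage sketch (assumes the companion rules.hpp and generator.hpp headers, which are not shown in this hunk): build an ordinary lookup-table state machine, then expand it into its character-based form with sm_to_csm().

#include "lexertl/generator.hpp"
#include "lexertl/rules.hpp"
#include "lexertl/sm_to_csm.hpp"
#include "lexertl/state_machine.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;
    lexertl::char_state_machine csm_;

    rules_.push("[0-9]+", 1);                // hypothetical rule id
    lexertl::generator::build(rules_, sm_);
    lexertl::sm_to_csm(sm_, csm_);
    return 0;
}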

View File

@ -0,0 +1,44 @@
// sm_traits.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SM_TRAITS_HPP
#define LEXERTL_SM_TRAITS_HPP
namespace lexertl
{
template<typename ch_type, typename sm_type, bool comp, bool look,
bool dfa_nfa>
struct basic_sm_traits
{
enum {char_24_bit = sizeof(ch_type) > 2, compressed = comp, lookup = look,
is_dfa = dfa_nfa};
using input_char_type = ch_type;
using char_type = ch_type;
using id_type = sm_type;
static id_type npos()
{
return static_cast<id_type>(~0);
}
};
template<typename ch_type, typename sm_type, bool look, bool dfa_nfa>
struct basic_sm_traits<ch_type, sm_type, true, look, dfa_nfa>
{
enum {char_24_bit = sizeof(ch_type) > 2, compressed = true, lookup = look,
is_dfa = dfa_nfa};
using input_char_type = ch_type;
using char_type = unsigned char;
using id_type = sm_type;
static id_type npos()
{
return static_cast<id_type>(~0);
}
};
}
#endif
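
An illustrative check, not library code (the lexertl/ include prefix is assumed): the compressed specialisation above rewrites char_type to unsigned char so that wide input characters are lexed as byte sequences, while the primary template keeps the input character type.

#include <cstdint>
#include <type_traits>
#include "lexertl/sm_traits.hpp"

int main()
{
    using wide = lexertl::basic_sm_traits<char32_t, std::uint16_t,
        true, true, true>;
    using narrow = lexertl::basic_sm_traits<char, std::uint16_t,
        false, true, true>;

    // Compressed traits lex unsigned char blocks even for wide input.
    static_assert(std::is_same<wide::char_type, unsigned char>::value, "");
    static_assert(std::is_same<narrow::char_type, char>::value, "");
    return 0;
}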

View File

@ -0,0 +1,521 @@
// state_machine.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STATE_MACHINE_HPP
#define LEXERTL_STATE_MACHINE_HPP
// memcmp()
#include <cstring>
#include "internals.hpp"
#include <map>
#include "observer_ptr.hpp"
#include <set>
#include "sm_traits.hpp"
#include "string_token.hpp"
namespace lexertl
{
template<typename char_type, typename id_ty = uint16_t>
class basic_state_machine
{
public:
using id_type = id_ty;
using traits =
basic_sm_traits<char_type, id_type,
(sizeof(char_type) > 1), true, true>;
using internals = detail::basic_internals<id_type>;
// If you get a compile error here you have
// failed to define an unsigned id type.
static_assert(std::is_unsigned<id_type>::value, "Your id type is signed");
basic_state_machine() :
_internals()
{
}
void clear()
{
_internals.clear();
}
internals &data()
{
return _internals;
}
const internals &data() const
{
return _internals;
}
bool empty() const
{
return _internals.empty();
}
id_type eoi() const
{
return _internals._eoi;
}
void minimise()
{
const id_type dfas_ = static_cast<id_type>(_internals._dfa.size());
for (id_type i_ = 0; i_ < dfas_; ++i_)
{
const id_type dfa_alphabet_ = _internals._dfa_alphabet[i_];
id_type_vector &dfa_ = _internals._dfa[i_];
if (dfa_alphabet_ != 0)
{
std::size_t size_ = 0;
do
{
size_ = dfa_.size();
minimise_dfa(dfa_alphabet_, dfa_, size_);
} while (dfa_.size() != size_);
}
}
}
static id_type npos()
{
return static_cast<id_type>(~0);
}
static id_type skip()
{
return static_cast<id_type>(~1);
}
void swap(basic_state_machine &rhs_)
{
_internals.swap(rhs_._internals);
}
private:
using id_type_vector = typename internals::id_type_vector;
using index_set = std::set<id_type>;
internals _internals;
void minimise_dfa(const id_type dfa_alphabet_,
id_type_vector &dfa_, std::size_t size_)
{
observer_ptr<const id_type> first_ = &dfa_.front();
observer_ptr<const id_type> end_ = first_ + size_;
id_type index_ = 1;
id_type new_index_ = 1;
id_type_vector lookup_(size_ / dfa_alphabet_, npos());
observer_ptr<id_type> lookup_ptr_ = &lookup_.front();
index_set index_set_;
const id_type bol_index_ = dfa_.front();
*lookup_ptr_ = 0;
// Only one 'jam' state, so skip it.
first_ += dfa_alphabet_;
for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
{
observer_ptr<const id_type> second_ = first_ + dfa_alphabet_;
for (id_type curr_index_ = index_ + 1; second_ < end_;
++curr_index_, second_ += dfa_alphabet_)
{
if (index_set_.find(curr_index_) != index_set_.end())
{
continue;
}
// Some systems have memcmp in namespace std.
using namespace std;
if (memcmp(first_, second_, sizeof(id_type) *
dfa_alphabet_) == 0)
{
index_set_.insert(curr_index_);
lookup_ptr_[curr_index_] = new_index_;
}
}
if (lookup_ptr_[index_] == npos())
{
lookup_ptr_[index_] = new_index_;
++new_index_;
}
}
if (!index_set_.empty())
{
observer_ptr<const id_type> front_ = &dfa_.front();
id_type_vector new_dfa_(front_, front_ + dfa_alphabet_);
auto set_end_ = index_set_.cend();
observer_ptr<const id_type> ptr_ = front_ + dfa_alphabet_;
observer_ptr<id_type> new_ptr_ = nullptr;
new_dfa_.resize(size_ - index_set_.size() * dfa_alphabet_, 0);
new_ptr_ = &new_dfa_.front() + dfa_alphabet_;
size_ /= dfa_alphabet_;
if (bol_index_)
{
new_dfa_.front() = lookup_ptr_[bol_index_];
}
for (index_ = 1; index_ < size_; ++index_)
{
if (index_set_.find(index_) != set_end_)
{
ptr_ += dfa_alphabet_;
continue;
}
new_ptr_[end_state_index] = ptr_[end_state_index];
new_ptr_[id_index] = ptr_[id_index];
new_ptr_[user_id_index] = ptr_[user_id_index];
new_ptr_[push_dfa_index] = ptr_[push_dfa_index];
new_ptr_[next_dfa_index] = ptr_[next_dfa_index];
new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
new_ptr_ += transitions_index;
ptr_ += transitions_index;
for (id_type i_ = transitions_index; i_ < dfa_alphabet_; ++i_)
{
*new_ptr_++ = lookup_ptr_[*ptr_++];
}
}
dfa_.swap(new_dfa_);
}
}
};
using state_machine = basic_state_machine<char>;
using wstate_machine = basic_state_machine<wchar_t>;
using u32state_machine = basic_state_machine<char32_t>;
template<typename char_type, typename id_ty = uint16_t,
bool is_dfa = true>
struct basic_char_state_machine
{
using id_type = id_ty;
using traits = basic_sm_traits<char_type, id_type, false, false, is_dfa>;
using internals = detail::basic_internals<id_type>;
using id_type_vector = typename internals::id_type_vector;
struct state
{
using string_token = basic_string_token<char_type>;
using id_type_string_token_map = std::map<id_type, string_token>;
using id_type_string_token_pair = std::pair<id_type, string_token>;
enum push_pop_dfa {neither, push_dfa, pop_dfa};
bool _end_state;
push_pop_dfa _push_pop_dfa;
id_type _id;
id_type _user_id;
id_type _push_dfa;
id_type _next_dfa;
id_type _eol_index;
id_type_string_token_map _transitions;
state() :
_end_state(false),
_push_pop_dfa(neither),
_id(0),
_user_id(traits::npos()),
_push_dfa(traits::npos()),
_next_dfa(0),
_eol_index(traits::npos()),
_transitions()
{
}
bool operator ==(const state &rhs_) const
{
return _end_state == rhs_._end_state &&
_push_pop_dfa == rhs_._push_pop_dfa &&
_id == rhs_._id &&
_user_id == rhs_._user_id &&
_push_dfa == rhs_._push_dfa &&
_next_dfa == rhs_._next_dfa &&
_eol_index == rhs_._eol_index &&
_transitions == rhs_._transitions;
}
};
using string_token = typename state::string_token;
using state_vector = std::vector<state>;
using string_token_vector = std::vector<string_token>;
using id_type_string_token_pair =
typename state::id_type_string_token_pair;
struct dfa
{
id_type _bol_index;
state_vector _states;
dfa(const std::size_t size_) :
_bol_index(traits::npos()),
_states(state_vector(size_))
{
}
std::size_t size() const
{
return _states.size();
}
void swap(dfa &rhs_)
{
std::swap(_bol_index, rhs_._bol_index);
_states.swap(rhs_._states);
}
};
static_assert(std::is_move_assignable<dfa>::value &&
std::is_move_constructible<dfa>::value, "dfa is not movable.");
using dfa_vector = std::vector<dfa>;
static_assert(std::is_unsigned<id_type>::value, "Your id type is signed");
dfa_vector _sm_vector;
basic_char_state_machine() :
_sm_vector()
{
}
void append(const string_token_vector &token_vector_,
const internals &internals_, const id_type dfa_index_)
{
const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[dfa_index_];
const std::size_t alphabet_ = dfa_alphabet_ - transitions_index;
const id_type_vector &source_dfa_ = internals_._dfa[dfa_index_];
observer_ptr<const id_type> ptr_ = &source_dfa_.front();
const std::size_t size_ = (source_dfa_.size() - dfa_alphabet_) /
dfa_alphabet_;
typename state::id_type_string_token_map::iterator trans_iter_;
_sm_vector.push_back(dfa(size_));
dfa &dest_dfa_ = _sm_vector.back();
if (*ptr_)
{
dest_dfa_._bol_index = *ptr_ - 1;
}
ptr_ += dfa_alphabet_;
for (id_type i_ = 0; i_ < size_; ++i_)
{
state &state_ = dest_dfa_._states[i_];
state_._end_state = ptr_[end_state_index] != 0;
if (ptr_[push_dfa_index] != npos())
{
state_._push_pop_dfa = state::push_dfa;
}
else if (ptr_[end_state_index] & pop_dfa_bit)
{
state_._push_pop_dfa = state::pop_dfa;
}
state_._id = ptr_[id_index];
state_._user_id = ptr_[user_id_index];
state_._push_dfa = ptr_[push_dfa_index];
state_._next_dfa = ptr_[next_dfa_index];
if (ptr_[eol_index])
{
state_._eol_index = ptr_[eol_index] - 1;
}
ptr_ += transitions_index;
for (id_type col_index_ = 0; col_index_ < alphabet_;
++col_index_, ++ptr_)
{
const id_type next_ = *ptr_;
if (next_ > 0)
{
trans_iter_ = state_._transitions.find(next_ - 1);
if (trans_iter_ == state_._transitions.end())
{
trans_iter_ = state_._transitions.insert
(id_type_string_token_pair(static_cast<id_type>
(next_ - 1), token_vector_[col_index_])).first;
}
else
{
trans_iter_->second.insert(token_vector_[col_index_]);
}
}
}
}
}
void clear()
{
_sm_vector.clear();
}
bool empty() const
{
return _sm_vector.empty();
}
void minimise()
{
const id_type dfas_ = static_cast<id_type>(_sm_vector.size());
for (id_type i_ = 0; i_ < dfas_; ++i_)
{
observer_ptr<dfa> dfa_ = &_sm_vector[i_];
if (dfa_->size() > 0)
{
std::size_t size_ = 0;
do
{
size_ = dfa_->size();
minimise_dfa(*dfa_, size_);
} while (dfa_->size() != size_);
}
}
}
static id_type npos()
{
return traits::npos();
}
id_type size() const
{
return static_cast<id_type>(_sm_vector.size());
}
static id_type skip()
{
return ~static_cast<id_type>(1);
}
void swap(basic_char_state_machine &csm_)
{
_sm_vector.swap(csm_._sm_vector);
}
private:
using index_set = std::set<id_type>;
void minimise_dfa(dfa &dfa_, std::size_t size_)
{
observer_ptr<const state> first_ = &dfa_._states.front();
observer_ptr<const state> end_ = first_ + size_;
id_type index_ = 0;
id_type new_index_ = 0;
id_type_vector lookup_(size_, npos());
observer_ptr<id_type> lookup_ptr_ = &lookup_.front();
index_set index_set_;
for (; first_ != end_; ++first_, ++index_)
{
observer_ptr<const state> second_ = first_ + 1;
for (id_type curr_index_ = index_ + 1; second_ != end_;
++curr_index_, ++second_)
{
if (index_set_.find(curr_index_) != index_set_.end())
{
continue;
}
if (*first_ == *second_)
{
index_set_.insert(curr_index_);
lookup_ptr_[curr_index_] = new_index_;
}
}
if (lookup_ptr_[index_] == npos())
{
lookup_ptr_[index_] = new_index_;
++new_index_;
}
}
if (!index_set_.empty())
{
observer_ptr<const state> front_ = &dfa_._states.front();
dfa new_dfa_(new_index_);
auto set_end_ = index_set_.cend();
observer_ptr<const state> ptr_ = front_;
observer_ptr<state> new_ptr_ = &new_dfa_._states.front();
if (dfa_._bol_index != npos())
{
new_dfa_._bol_index = lookup_ptr_[dfa_._bol_index];
}
for (index_ = 0; index_ < size_; ++index_)
{
if (index_set_.find(index_) != set_end_)
{
++ptr_;
continue;
}
new_ptr_->_end_state = ptr_->_end_state;
new_ptr_->_id = ptr_->_id;
new_ptr_->_user_id = ptr_->_user_id;
new_ptr_->_next_dfa = ptr_->_next_dfa;
if (ptr_->_eol_index != npos())
{
new_ptr_->_eol_index = lookup_ptr_[ptr_->_eol_index];
}
auto iter_ = ptr_->_transitions.cbegin();
auto end_ = ptr_->_transitions.cend();
typename state::id_type_string_token_map::iterator find_;
for (; iter_ != end_; ++iter_)
{
find_ = new_ptr_->_transitions.find
(lookup_ptr_[iter_->first]);
if (find_ == new_ptr_->_transitions.end())
{
new_ptr_->_transitions.insert
(id_type_string_token_pair
(lookup_ptr_[iter_->first], iter_->second));
}
else
{
find_->second.insert(iter_->second);
}
}
++ptr_;
++new_ptr_;
}
dfa_.swap(new_dfa_);
}
}
};
using char_state_machine = basic_char_state_machine<char>;
using wchar_state_machine = basic_char_state_machine<wchar_t>;
using u32char_state_machine = basic_char_state_machine<char32_t>;
}
#endif
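
A hedged end-to-end sketch of typical usage (rules.hpp, generator.hpp, lookup.hpp and match_results.hpp are companion headers not shown in this hunk; the rule ids are arbitrary): build a DFA from a few rules, minimise() it, then tokenise a string.

#include <iostream>
#include <string>
#include "lexertl/generator.hpp"
#include "lexertl/lookup.hpp"
#include "lexertl/match_results.hpp"
#include "lexertl/rules.hpp"
#include "lexertl/state_machine.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push("[A-Za-z_][A-Za-z0-9_]*", 1); // identifiers
    rules_.push("[0-9]+", 2);                 // integers
    rules_.push("\\s+", rules_.skip());       // skip whitespace
    lexertl::generator::build(rules_, sm_);
    sm_.minimise();

    std::string input_("count 42");
    lexertl::smatch results_(input_.cbegin(), input_.cend());

    lexertl::lookup(sm_, results_);

    while (results_.id != 0)
    {
        std::cout << results_.id << ": " << results_.str() << '\n';
        lexertl::lookup(sm_, results_);
    }

    return 0;
}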

View File

@ -0,0 +1,352 @@
// stream_shared_iterator.hpp
// Copyright (c) 2010-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STREAM_SHARED_ITERATOR_HPP
#define LEXERTL_STREAM_SHARED_ITERATOR_HPP
#include <algorithm>
// memmove
#include <cstring>
#include <iostream>
#include <math.h>
#include "runtime_error.hpp"
#include <vector>
namespace lexertl
{
template<typename char_type>
class basic_stream_shared_iterator
{
public:
using istream = std::basic_istream<char_type>;
using iterator_category = std::forward_iterator_tag;
using difference_type = std::size_t;
using value_type = char_type;
using pointer = char_type *;
using reference = char_type &;
basic_stream_shared_iterator() :
_master(false),
_live(false),
_index(shared::npos()),
_shared(nullptr)
{
}
basic_stream_shared_iterator(istream &stream_,
const std::size_t buff_size_ = 1024,
const std::size_t increment_ = 1024) :
_master(true),
_live(false),
_index(shared::npos()),
// For exception safety don't call new yet
_shared(nullptr)
{
// Safe to call potentially throwing new now.
_shared = new shared(stream_, buff_size_, increment_);
++_shared->_ref_count;
_shared->_clients.push_back(this);
}
basic_stream_shared_iterator(const basic_stream_shared_iterator &rhs_) :
_master(false),
_live(false),
_index(rhs_._master ? rhs_._shared->lowest() : rhs_._index),
_shared(rhs_._shared)
{
if (_shared)
{
// New copy of an iterator.
// The assumption is that any copy must be live
// even if the rhs is not (otherwise we will never
// have a record of the start of the current range!)
++_shared->_ref_count;
_shared->_clients.push_back(this);
_live = true;
}
}
~basic_stream_shared_iterator()
{
if (_shared)
{
--_shared->_ref_count;
_shared->erase(this);
if (_shared->_ref_count == 0)
{
delete _shared;
_shared = nullptr;
}
}
}
basic_stream_shared_iterator &operator =
(const basic_stream_shared_iterator &rhs_)
{
if (this != &rhs_)
{
_master = false;
_index = rhs_._master ? rhs_._shared->lowest() : rhs_._index;
if (!_live && !rhs_._live)
{
if (rhs_._shared)
{
++rhs_._shared->_ref_count;
}
}
else if (!_live && rhs_._live)
{
rhs_._shared->_clients.push_back(this);
if (!_shared)
{
++rhs_._shared->_ref_count;
}
}
else if (_live && !rhs_._live)
{
_shared->erase(this);
if (!rhs_._shared)
{
--_shared->_ref_count;
}
}
_live = rhs_._live;
_shared = rhs_._shared;
}
return *this;
}
bool operator ==(const basic_stream_shared_iterator &rhs_) const
{
return _index == rhs_._index &&
(_shared == rhs_._shared ||
(_index == shared::npos() || rhs_._index == shared::npos()) &&
(!_shared || !rhs_._shared));
}
bool operator !=(const basic_stream_shared_iterator &rhs_) const
{
return !(*this == rhs_);
}
const char_type &operator *()
{
check_master();
return _shared->_buffer[_index];
}
basic_stream_shared_iterator &operator ++()
{
check_master();
++_index;
update_state();
return *this;
}
basic_stream_shared_iterator operator ++(int)
{
basic_stream_shared_iterator iter_ = *this;
check_master();
++_index;
update_state();
return iter_;
}
private:
class shared
{
public:
std::size_t _ref_count;
using char_vector = std::vector<char_type>;
using iter_list = std::vector<basic_stream_shared_iterator *>;
istream &_stream;
std::size_t _increment;
std::size_t _len;
char_vector _buffer;
iter_list _clients;
shared(istream &stream_, const std::size_t buff_size_,
const std::size_t increment_) :
_ref_count(0),
_increment(increment_),
_stream(stream_)
{
_buffer.resize(buff_size_);
_stream.read(&_buffer.front(), _buffer.size());
_len = static_cast<std::size_t>(_stream.gcount());
}
bool reload_buffer()
{
const std::size_t lowest_ = lowest();
std::size_t read_ = 0;
if (lowest_ == 0)
{
// Resize buffer
const std::size_t old_size_ = _buffer.size();
const std::size_t new_size_ = old_size_ + _increment;
_buffer.resize(new_size_);
_stream.read(&_buffer.front() + old_size_, _increment);
read_ = static_cast<std::size_t>(_stream.gcount());
if (read_)
{
read_ += old_size_;
_len = read_;
}
}
else
{
// Some systems have memmove in namespace std
using namespace std;
const size_t start_ = _buffer.size() - lowest_;
const size_t len_ = _buffer.size() - start_;
// The retained tail and the front of the buffer can overlap,
// so use memmove rather than memcpy.
memmove(&_buffer.front(), &_buffer[lowest_], start_ *
sizeof(char_type));
_stream.read(&_buffer.front() + start_, len_);
read_ = static_cast<size_t>(_stream.gcount());
subtract(lowest_);
if (read_)
{
read_ += start_;
_len = read_;
}
else
{
_len = highest();
}
}
return read_ != 0;
}
void erase(basic_stream_shared_iterator *ptr_)
{
auto iter_ = std::find(_clients.begin(), _clients.end(), ptr_);
if (iter_ != _clients.end())
_clients.erase(iter_);
}
std::size_t lowest() const
{
std::size_t lowest_ = npos();
auto iter_ = _clients.cbegin();
auto end_ = _clients.cend();
for (; iter_ != end_; ++iter_)
{
const basic_stream_shared_iterator *ptr_ = *iter_;
if (ptr_->_index < lowest_)
{
lowest_ = ptr_->_index;
}
}
if (lowest_ == npos())
{
lowest_ = 0;
}
return lowest_;
}
std::size_t highest() const
{
std::size_t highest_ = 0;
auto iter_ = _clients.cbegin();
auto end_ = _clients.cend();
for (; iter_ != end_; ++iter_)
{
const basic_stream_shared_iterator *ptr_ = *iter_;
if (ptr_->_index != npos() && ptr_->_index > highest_)
{
highest_ = ptr_->_index;
}
}
return highest_;
}
void subtract(const std::size_t lowest_)
{
auto iter_ = _clients.begin();
auto end_ = _clients.end();
for (; iter_ != end_; ++iter_)
{
basic_stream_shared_iterator *ptr_ = *iter_;
if (ptr_->_index != npos())
{
ptr_->_index -= lowest_;
}
}
}
static std::size_t npos()
{
return ~static_cast<std::size_t>(0);
}
private:
shared &operator =(const shared &rhs_);
};
bool _master;
bool _live;
std::size_t _index;
shared *_shared;
void check_master()
{
if (!_shared)
{
throw runtime_error("Cannot manipulate null (end) "
"stream_shared_iterators.");
}
if (_master)
{
_master = false;
_live = true;
_index = _shared->lowest();
}
}
void update_state()
{
if (_index >= _shared->_len)
{
if (!_shared->reload_buffer())
{
_shared->erase(this);
_index = shared::npos();
_live = false;
}
}
}
};
using stream_shared_iterator = basic_stream_shared_iterator<char>;
using wstream_shared_iterator = basic_stream_shared_iterator<wchar_t>;
}
#endif
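
A hedged sketch (the companion generator.hpp, lookup.hpp, match_results.hpp and rules.hpp headers are assumed, and "input.txt" is a hypothetical file): the iterator buffers the istream so a file can be tokenised without loading it into memory first, and every live copy of the iterator stays valid as the shared buffer grows or slides.

#include <fstream>
#include <iostream>
#include "lexertl/generator.hpp"
#include "lexertl/lookup.hpp"
#include "lexertl/match_results.hpp"
#include "lexertl/rules.hpp"
#include "lexertl/state_machine.hpp"
#include "lexertl/stream_shared_iterator.hpp"

int main()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push("[^ \t\r\n]+", 1);            // hypothetical "word" rule
    rules_.push("[ \t\r\n]+", rules_.skip()); // skip whitespace
    lexertl::generator::build(rules_, sm_);

    std::ifstream if_("input.txt");
    lexertl::stream_shared_iterator iter_(if_);
    lexertl::stream_shared_iterator end_;
    lexertl::match_results<lexertl::stream_shared_iterator>
        results_(iter_, end_);

    for (lexertl::lookup(sm_, results_); results_.id != 0;
        lexertl::lookup(sm_, results_))
    {
        std::cout << results_.str() << '\n';
    }

    return 0;
}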

View File

@ -0,0 +1,439 @@
// string_token.hpp
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STRING_TOKEN_HPP
#define LEXERTL_STRING_TOKEN_HPP
#include "char_traits.hpp"
#include <ios> // Needed by GCC 4.4
#include <iostream>
// std::basic_stringstream used by escape_char()
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace lexertl
{
template<typename ch_type>
struct basic_string_token
{
using char_type = ch_type;
using char_traits = basic_char_traits<char_type>;
using index_type = typename char_traits::index_type;
using range = std::pair<index_type, index_type>;
using range_vector = std::vector<range>;
using string = std::basic_string<char_type>;
using string_token = basic_string_token<char_type>;
range_vector _ranges;
basic_string_token() :
_ranges()
{
}
basic_string_token(char_type ch_) :
_ranges()
{
insert(range(ch_, ch_));
}
basic_string_token(char_type first_, char_type second_) :
_ranges()
{
insert(range(first_, second_));
}
void clear()
{
_ranges.clear();
}
bool empty() const
{
return _ranges.empty();
}
bool any() const
{
return _ranges.size() == 1 && _ranges.front().first == 0 &&
_ranges.front().second == char_traits::max_val();
}
bool operator <(const basic_string_token &rhs_) const
{
return _ranges < rhs_._ranges;
}
bool operator ==(const basic_string_token &rhs_) const
{
return _ranges == rhs_._ranges;
}
bool negatable() const
{
std::size_t size_ = 0;
auto iter_ = _ranges.cbegin();
auto end_ = _ranges.cend();
for (; iter_ != end_; ++iter_)
{
size_ += static_cast<std::size_t>(iter_->second) + 1 -
static_cast<std::size_t>(iter_->first);
}
return size_ > static_cast<std::size_t>(char_traits::max_val()) / 2;
}
void swap(basic_string_token &rhs_)
{
_ranges.swap(rhs_._ranges);
}
void insert(const basic_string_token &rhs_)
{
auto iter_ = rhs_._ranges.cbegin();
auto end_ = rhs_._ranges.cend();
for (; iter_ != end_; ++iter_)
{
insert(*iter_);
}
}
// Deliberately pass by value - may modify
typename range_vector::iterator insert(range rhs_)
{
bool insert_ = true;
auto iter_ = _ranges.begin();
auto end_ = _ranges.end();
auto erase_iter_ = end_;
while (iter_ != end_)
{
// follows current item
if (rhs_.first > iter_->second)
{
if (rhs_.first == iter_->second + 1)
{
// Auto normalise
rhs_.first = iter_->first;
}
else
{
// No intersection, consider next
++iter_;
continue;
}
}
// Precedes current item
else if (rhs_.second < iter_->first)
{
if (rhs_.second == iter_->first - 1)
{
// Auto normalise
rhs_.second = iter_->second;
}
else
{
// insert here
break;
}
}
else
{
// overlap (under)
if (rhs_.first < iter_->first)
{
if (rhs_.second < iter_->second)
{
rhs_.second = iter_->second;
}
}
// overlap (over)
else if (rhs_.second > iter_->second)
{
if (rhs_.first > iter_->first)
{
rhs_.first = iter_->first;
}
}
// subset
else
{
insert_ = false;
iter_ = _ranges.end();
break;
}
}
// Code minimisation: this always applies unless we have already
// exited the loop or executed "continue".
if (erase_iter_ == end_)
{
erase_iter_ = iter_;
}
++iter_;
}
if (erase_iter_ != end_)
{
if (insert_)
{
// Re-use obsolete location
*erase_iter_ = rhs_;
++erase_iter_;
}
iter_ = _ranges.erase(erase_iter_, iter_);
}
else if (insert_)
{
iter_ = _ranges.insert(iter_, rhs_);
}
return iter_;
}
void negate()
{
index_type next_ = 0;
const index_type max_ = char_traits::max_val();
string_token temp_;
auto iter_ = _ranges.cbegin();
auto end_ = _ranges.cend();
bool finished_ = false;
for (; iter_ != end_; ++iter_)
{
if (next_ < iter_->first)
{
temp_.insert(range(next_,
static_cast<index_type>(iter_->first - 1)));
}
if (iter_->second < max_)
{
next_ = iter_->second + 1;
}
else
{
finished_ = true;
break;
}
}
if (!finished_)
{
temp_.insert(range(next_, max_));
}
swap(temp_);
}
void intersect(basic_string_token &rhs_, basic_string_token &overlap_)
{
auto lhs_iter_ = _ranges.begin();
auto lhs_end_ = _ranges.end();
auto rhs_iter_ = rhs_._ranges.begin();
auto rhs_end_ = rhs_._ranges.end();
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
{
if (rhs_iter_->first > lhs_iter_->second)
{
++lhs_iter_;
}
else if (rhs_iter_->second < lhs_iter_->first)
{
++rhs_iter_;
}
else
{
range range_;
if (rhs_iter_->first > lhs_iter_->first)
{
range_.first = rhs_iter_->first;
}
else
{
range_.first = lhs_iter_->first;
}
if (rhs_iter_->second < lhs_iter_->second)
{
range_.second = rhs_iter_->second;
}
else
{
range_.second = lhs_iter_->second;
}
adjust(range_, *this, lhs_iter_, lhs_end_);
adjust(range_, rhs_, rhs_iter_, rhs_end_);
overlap_.insert(range_);
}
}
}
void remove(basic_string_token &rhs_)
{
auto lhs_iter_ = _ranges.begin();
auto lhs_end_ = _ranges.end();
auto rhs_iter_ = rhs_._ranges.begin();
auto rhs_end_ = rhs_._ranges.end();
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
{
if (rhs_iter_->first > lhs_iter_->second)
{
++lhs_iter_;
}
else if (rhs_iter_->second < lhs_iter_->first)
{
++rhs_iter_;
}
else
{
range range_;
if (rhs_iter_->first > lhs_iter_->first)
{
range_.first = rhs_iter_->first;
}
else
{
range_.first = lhs_iter_->first;
}
if (rhs_iter_->second < lhs_iter_->second)
{
range_.second = rhs_iter_->second;
}
else
{
range_.second = lhs_iter_->second;
}
adjust(range_, *this, lhs_iter_, lhs_end_);
}
}
}
static string escape_char(const typename char_traits::index_type ch_)
{
string out_;
switch (ch_)
{
case '\0':
out_ += '\\';
out_ += '0';
break;
case '\a':
out_ += '\\';
out_ += 'a';
break;
case '\b':
out_ += '\\';
out_ += 'b';
break;
case 27:
out_ += '\\';
out_ += 'x';
out_ += '1';
out_ += 'b';
break;
case '\f':
out_ += '\\';
out_ += 'f';
break;
case '\n':
out_ += '\\';
out_ += 'n';
break;
case '\r':
out_ += '\\';
out_ += 'r';
break;
case '\t':
out_ += '\\';
out_ += 't';
break;
case '\v':
out_ += '\\';
out_ += 'v';
break;
case '\\':
out_ += '\\';
out_ += '\\';
break;
case '"':
out_ += '\\';
out_ += '"';
break;
case '\'':
out_ += '\\';
out_ += '\'';
break;
default:
{
if (ch_ < 32 || ch_ > 126)
{
std::basic_stringstream<char_type> ss_;
out_ += '\\';
out_ += 'x';
ss_ << std::hex <<
static_cast<std::size_t>(ch_);
out_ += ss_.str();
}
else
{
out_ += ch_;
}
break;
}
}
return out_;
}
private:
void adjust(const range &range_, basic_string_token &token_,
typename range_vector::iterator &iter_,
typename range_vector::iterator &end_)
{
if (range_.first > iter_->first)
{
const index_type second_ = iter_->second;
iter_->second = range_.first - 1;
if (range_.second < second_)
{
range new_range_(static_cast<index_type>(range_.second + 1),
second_);
iter_ = token_.insert(new_range_);
end_ = token_._ranges.end();
}
}
else if (range_.second < iter_->second)
{
iter_->first = range_.second + 1;
}
else
{
iter_ = token_._ranges.erase(iter_);
end_ = token_._ranges.end();
}
}
};
}
#endif
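
A minimal sketch of the normalisation behaviour (assumed usage): inserting a range that touches an existing one merges the two, and negate() complements the token over the full range of the character traits.

#include <iostream>
#include "lexertl/string_token.hpp"

int main()
{
    using token = lexertl::basic_string_token<char>;
    token token_('a', 'f');

    token_.insert(token::range('g', 'z'));      // merges into ['a'-'z']
    std::cout << token_._ranges.size() << '\n'; // prints 1
    token_.negate();                            // everything except 'a'-'z'
    std::cout << token_._ranges.size() << '\n'; // prints 2
    return 0;
}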

View File

@ -0,0 +1,508 @@
// utf_iterators.hpp
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
// Inspired by http://utfcpp.sourceforge.net/
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_UTF_ITERATORS_HPP
#define LEXERTL_UTF_ITERATORS_HPP
#include <iterator>
namespace lexertl
{
template<typename char_iterator, typename char_type>
class basic_utf8_in_iterator :
public std::iterator<std::input_iterator_tag, char_type>
{
public:
using value_type = char_type;
using difference_type =
typename std::iterator_traits<char_iterator>::difference_type;
using iterator_category = std::forward_iterator_tag;
basic_utf8_in_iterator() :
_it(char_iterator()),
_end(char_iterator()),
_char(0)
{
}
explicit basic_utf8_in_iterator(const char_iterator &it_,
const char_iterator &end_) :
_it(it_),
_end(it_),
_char(0)
{
if (it_ != end_)
{
next();
}
}
char_type operator *() const
{
return _char;
}
bool operator ==(const basic_utf8_in_iterator &rhs_) const
{
return _it == rhs_._it;
}
bool operator !=(const basic_utf8_in_iterator &rhs_) const
{
return _it != rhs_._it;
}
basic_utf8_in_iterator &operator ++()
{
_it = _end;
next();
return *this;
}
basic_utf8_in_iterator operator ++(int)
{
basic_utf8_in_iterator temp_ = *this;
_it = _end;
next();
return temp_;
}
basic_utf8_in_iterator operator +(const std::size_t count_) const
{
basic_utf8_in_iterator temp_ = *this;
for (std::size_t i_ = 0; i_ < count_; ++i_)
{
++temp_;
}
return temp_;
}
basic_utf8_in_iterator operator -(const std::size_t count_) const
{
basic_utf8_in_iterator temp_ = *this;
for (std::size_t i_ = 0; i_ < count_; ++i_)
{
temp_._end = temp_._it;
--temp_._it;
while ((*temp_._it & 0xc0) == 0x80) --temp_._it;
}
temp_.next();
return temp_;
}
private:
char_iterator _it;
char_iterator _end;
char_type _char;
void next()
{
const char len_ = len(_it);
char_type ch_ = *_it & 0xff;
switch (len_)
{
case 1:
_end = _it;
++_end;
break;
case 2:
_end = _it;
++_end;
if ((*_end & 0xc0) != 0x80) break;
ch_ = (ch_ << 6 & 0x7ff) | (*_end & 0x3f);
++_end;
break;
case 3:
_end = _it;
++_end;
if ((*_end & 0xc0) != 0x80) break;
ch_ = (ch_ << 12 & 0xffff) | ((*_end & 0xff) << 6 & 0xfff);
++_end;
if ((*_end & 0xc0) != 0x80) break;
ch_ |= *_end & 0x3f;
++_end;
break;
case 4:
_end = _it;
++_end;
if ((*_end & 0xc0) != 0x80) break;
ch_ = (ch_ << 18 & 0x1fffff) | ((*_end & 0xff) << 12 & 0x3ffff);
++_end;
if ((*_end & 0xc0) != 0x80) break;
ch_ |= (*_end & 0xff) << 6 & 0xfff;
++_end;
if ((*_end & 0xc0) != 0x80) break;
ch_ |= *_end & 0x3f;
++_end;
break;
}
_char = ch_;
}
char len(const char_iterator &it_) const
{
const unsigned char ch_ = *it_;
return ch_ < 0x80 ? 1 :
ch_ >> 5 == 0x06 ? 2 :
ch_ >> 4 == 0x0e ? 3 :
ch_ >> 3 == 0x1e ? 4 :
1;
}
};
template<typename char_iterator>
class basic_utf8_out_iterator :
public std::iterator<std::input_iterator_tag, char>
{
public:
using value_type = char;
using difference_type =
typename std::iterator_traits<char_iterator>::difference_type;
using iterator_category = std::forward_iterator_tag;
basic_utf8_out_iterator() :
_count(0),
_index(0)
{
}
explicit basic_utf8_out_iterator(const char_iterator &it_,
const char_iterator &end_) :
_it(it_),
_count(0),
_index(0)
{
if (it_ != end_)
{
next();
}
}
char operator *() const
{
return _bytes[_index];
}
bool operator ==(const basic_utf8_out_iterator &rhs_) const
{
return _it == rhs_._it;
}
bool operator !=(const basic_utf8_out_iterator &rhs_) const
{
return _it != rhs_._it;
}
basic_utf8_out_iterator &operator ++()
{
++_index;
if (_index >= _count)
{
++_it;
next();
}
return *this;
}
basic_utf8_out_iterator operator ++(int)
{
basic_utf8_out_iterator temp_ = *this;
++_index;
if (_index >= _count)
{
++_it;
next();
}
return temp_;
}
private:
char_iterator _it;
char _bytes[4];
unsigned char _count;
unsigned char _index;
void next()
{
const std::size_t ch_ = *_it;
_count = len(ch_);
_index = 0;
switch (_count)
{
case 1:
_bytes[0] = static_cast<char>(ch_);
break;
case 2:
_bytes[0] = static_cast<char>((ch_ >> 6) | 0xc0);
_bytes[1] = (ch_ & 0x3f) | 0x80;
break;
case 3:
_bytes[0] = static_cast<char>((ch_ >> 12) | 0xe0);
_bytes[1] = ((ch_ >> 6) & 0x3f) | 0x80;
_bytes[2] = (ch_ & 0x3f) | 0x80;
break;
case 4:
_bytes[0] = static_cast<char>((ch_ >> 18) | 0xf0);
_bytes[1] = ((ch_ >> 12) & 0x3f) | 0x80;
_bytes[2] = ((ch_ >> 6) & 0x3f) | 0x80;
_bytes[3] = (ch_ & 0x3f) | 0x80;
break;
}
}
char len(const std::size_t ch_) const
{
return ch_ < 0x80 ? 1 :
ch_ < 0x800 ? 2 :
ch_ < 0x10000 ? 3 :
4;
}
};
template<typename char_iterator, typename char_type>
class basic_utf16_in_iterator :
public std::iterator<std::input_iterator_tag, char_type>
{
public:
using value_type = char_type;
using difference_type =
typename std::iterator_traits<char_iterator>::difference_type;
using iterator_category = std::forward_iterator_tag;
basic_utf16_in_iterator() :
_it(char_iterator()),
_end(char_iterator()),
_char(0)
{
}
explicit basic_utf16_in_iterator(const char_iterator &it_,
const char_iterator &end_) :
_it(it_),
_end(it_),
_char(0)
{
if (it_ != end_)
{
next();
}
}
char_type operator *() const
{
return _char;
}
bool operator ==(const basic_utf16_in_iterator &rhs_) const
{
return _it == rhs_._it;
}
bool operator !=(const basic_utf16_in_iterator &rhs_) const
{
return _it != rhs_._it;
}
basic_utf16_in_iterator &operator ++()
{
_it = _end;
next();
return *this;
}
basic_utf16_in_iterator operator ++(int)
{
basic_utf16_in_iterator temp_ = *this;
_it = _end;
next();
return temp_;
}
basic_utf16_in_iterator operator +(const std::size_t count_) const
{
basic_utf16_in_iterator temp_ = *this;
for (std::size_t i_ = 0; i_ < count_; ++i_)
{
++temp_;
}
return temp_;
}
basic_utf16_in_iterator operator -(const std::size_t count_) const
{
basic_utf16_in_iterator temp_ = *this;
for (std::size_t i_ = 0; i_ < count_; ++i_)
{
temp_._end = temp_._it;
--temp_._it;
if (*temp_._it >= 0xdc00 && *temp_._it <= 0xdfff) --temp_._it;
}
temp_.next();
return temp_;
}
private:
char_iterator _it;
char_iterator _end;
char_type _char;
void next()
{
char_type ch_ = *_it & 0xffff;
_end = _it;
if (ch_ >= 0xd800 && ch_ <= 0xdbff)
{
const char_type surrogate_ = *++_end & 0xffff;
ch_ = (((ch_ - 0xd800) << 10) | (surrogate_ - 0xdc00)) + 0x10000;
}
_char = ch_;
++_end;
}
};
template<typename char_iterator>
class basic_utf16_out_iterator :
public std::iterator<std::input_iterator_tag, wchar_t>
{
public:
using value_type = wchar_t;
using difference_type =
typename std::iterator_traits<char_iterator>::difference_type;
using iterator_category = std::forward_iterator_tag;
basic_utf16_out_iterator() :
_count(0),
_index(0)
{
}
explicit basic_utf16_out_iterator(const char_iterator &it_,
const char_iterator &end_) :
_it(it_),
_count(0),
_index(0)
{
if (it_ != end_)
{
next();
}
}
wchar_t operator *() const
{
return _chars[_index];
}
bool operator ==(const basic_utf16_out_iterator &rhs_) const
{
return _it == rhs_._it;
}
bool operator !=(const basic_utf16_out_iterator &rhs_) const
{
return _it != rhs_._it;
}
basic_utf16_out_iterator &operator ++()
{
++_index;
if (_index >= _count)
{
++_it;
next();
}
return *this;
}
basic_utf16_out_iterator operator ++(int)
{
basic_utf16_out_iterator temp_ = *this;
++_index;
if (_index >= _count)
{
++_it;
next();
}
return temp_;
}
private:
char_iterator _it;
wchar_t _chars[2];
unsigned char _count;
unsigned char _index;
void next()
{
const std::size_t ch_ = *_it;
_count = len(ch_);
_index = 0;
switch (_count)
{
case 1:
_chars[0] = static_cast<wchar_t>(ch_);
break;
case 2:
// Lead surrogate: 0xd800 + ((ch_ - 0x10000) >> 10)
_chars[0] = static_cast<wchar_t>((ch_ >> 10) + 0xd800u -
(0x10000 >> 10));
_chars[1] = static_cast<wchar_t>((ch_ & 0x3ff) + 0xdc00u);
break;
}
}
char len(const std::size_t ch_) const
{
return ch_ > 0xffff ? 2 : 1;
}
};
}
#endif
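
A minimal sketch (assumed usage): decode a UTF-8 byte sequence into char32_t code points on the fly. Pointer iterators over a null-terminated array are used so that the final increment, which peeks one position past the last code point, still reads a valid element (the terminating '\0').

#include <cstdint>
#include <iostream>
#include "lexertl/utf_iterators.hpp"

int main()
{
    // "A", U+00E9 (e-acute), U+20AC (euro sign) encoded as UTF-8.
    const char utf8_[] = "A\xc3\xa9\xe2\x82\xac";
    using iterator = lexertl::basic_utf8_in_iterator<const char *, char32_t>;
    iterator iter_(utf8_, utf8_ + sizeof(utf8_) - 1);
    iterator end_(utf8_ + sizeof(utf8_) - 1, utf8_ + sizeof(utf8_) - 1);

    for (; iter_ != end_; ++iter_)
    {
        // Prints 41, e9, 20ac.
        std::cout << std::hex << static_cast<std::uint32_t>(*iter_) << '\n';
    }

    return 0;
}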