mirror of
https://github.com/YACReader/yacreader
synced 2025-05-28 03:10:27 -04:00
Add commit 43aab01 of BenHanson/lexertl14 from github
This commit is contained in:
parent
c4f792bd40
commit
d3de52ca82
@ -147,7 +147,42 @@ HEADERS += comic_flow.h \
|
||||
yacreader_comics_selection_helper.h \
|
||||
yacreader_comic_info_helper.h \
|
||||
db/reading_list.h \
|
||||
current_comic_view_helper.h
|
||||
current_comic_view_helper.h \
|
||||
lexertl/parser/tokeniser/re_token.hpp \
|
||||
lexertl/parser/tokeniser/re_tokeniser.hpp \
|
||||
lexertl/parser/tokeniser/re_tokeniser_helper.hpp \
|
||||
lexertl/parser/tokeniser/re_tokeniser_state.hpp \
|
||||
lexertl/parser/tree/end_node.hpp \
|
||||
lexertl/parser/tree/iteration_node.hpp \
|
||||
lexertl/parser/tree/leaf_node.hpp \
|
||||
lexertl/parser/tree/node.hpp \
|
||||
lexertl/parser/tree/selection_node.hpp \
|
||||
lexertl/parser/tree/sequence_node.hpp \
|
||||
lexertl/parser/parser.hpp \
|
||||
lexertl/partition/charset.hpp \
|
||||
lexertl/partition/equivset.hpp \
|
||||
lexertl/char_traits.hpp \
|
||||
lexertl/debug.hpp \
|
||||
lexertl/dot.hpp \
|
||||
lexertl/enums.hpp \
|
||||
lexertl/generate_cpp.hpp \
|
||||
lexertl/generator.hpp \
|
||||
lexertl/internals.hpp \
|
||||
lexertl/iterator.hpp \
|
||||
lexertl/lookup.hpp \
|
||||
lexertl/match_results.hpp \
|
||||
lexertl/memory_file.hpp \
|
||||
lexertl/narrow.hpp \
|
||||
lexertl/observer_ptr.hpp \
|
||||
lexertl/rules.hpp \
|
||||
lexertl/runtime_error.hpp \
|
||||
lexertl/serialise.hpp \
|
||||
lexertl/sm_to_csm.hpp \
|
||||
lexertl/sm_traits.hpp \
|
||||
lexertl/state_machine.hpp \
|
||||
lexertl/stream_shared_iterator.hpp \
|
||||
lexertl/string_token.hpp \
|
||||
lexertl/utf_iterators.hpp
|
||||
|
||||
!CONFIG(no_opengl) {
|
||||
HEADERS += ../common/gl/yacreader_flow_gl.h
|
||||
@ -324,3 +359,6 @@ translation.files = ../release/languages/yacreaderlibrary_*
|
||||
manpage.path = $$DATADIR/man/man1
|
||||
manpage.files = ../YACReaderLibrary.1
|
||||
}
|
||||
|
||||
DISTFILES += \
|
||||
lexertl/licence_1_0.txt
|
||||
|
45
YACReaderLibrary/lexertl/char_traits.hpp
Normal file
45
YACReaderLibrary/lexertl/char_traits.hpp
Normal file
@ -0,0 +1,45 @@
|
||||
// char_traits.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_CHAR_TRAITS_HPP
|
||||
#define LEXERTL_CHAR_TRAITS_HPP
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace lexertl
{
// Describes a character type for the lexer: the character type itself, the
// type used when a character is used as an index, and the largest value
// max_val() reports for that type.
template<typename ch_type>
struct basic_char_traits
{
    using char_type = ch_type;
    using index_type = ch_type;

    // Returns 0x10ffff when char_type is wider than two bytes and 0xffff
    // otherwise.
    static index_type max_val()
    {
        // char32_t is a built-in type, so this header does not depend on
        // <cstdint> being pulled in by somebody else.
        const char32_t max_ = 0x10ffff;

        // Explicit cast: the result is deliberately narrowed to index_type.
        return static_cast<index_type>(sizeof(char_type) > 2 ?
            max_ : (max_ & 0xffff));
    }
};

// Specialisation for plain char: indexes are unsigned so that characters
// with the top bit set do not become negative indexes.
template<>
struct basic_char_traits<char>
{
    using char_type = char;
    using index_type = unsigned char;

    // Returns the largest value an unsigned char can hold.
    static index_type max_val()
    {
        // Prevent annoying warning (VC++)
        index_type zero_ = 0;

        // ~zero_ is computed as int; cast back to keep only the low byte.
        return static_cast<index_type>(~zero_);
    }
};
}
|
||||
|
||||
#endif
|
311
YACReaderLibrary/lexertl/debug.hpp
Normal file
311
YACReaderLibrary/lexertl/debug.hpp
Normal file
@ -0,0 +1,311 @@
|
||||
// debug.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_DEBUG_HPP
|
||||
#define LEXERTL_DEBUG_HPP
|
||||
|
||||
#include <map>
|
||||
#include <ostream>
|
||||
#include "rules.hpp"
|
||||
#include "sm_to_csm.hpp"
|
||||
#include "state_machine.hpp"
|
||||
#include "string_token.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename sm, typename char_type, typename id_type = uint16_t,
|
||||
bool is_dfa = true>
|
||||
class basic_debug
|
||||
{
|
||||
public:
|
||||
using char_state_machine =
|
||||
basic_char_state_machine<char_type, id_type, is_dfa>;
|
||||
using ostream = std::basic_ostream<char_type>;
|
||||
using rules = basic_rules<char_type, char_type, id_type>;
|
||||
using string = std::basic_string<char_type>;
|
||||
|
||||
static void dump(const sm &sm_, rules &rules_, ostream &stream_)
|
||||
{
|
||||
char_state_machine csm_;
|
||||
|
||||
sm_to_csm(sm_, csm_);
|
||||
dump(csm_, rules_, stream_);
|
||||
}
|
||||
|
||||
static void dump(const sm &sm_, ostream &stream_)
|
||||
{
|
||||
char_state_machine csm_;
|
||||
|
||||
sm_to_csm(sm_, csm_);
|
||||
dump(csm_, stream_);
|
||||
}
|
||||
|
||||
static void dump(const char_state_machine &csm_, rules &rules_,
|
||||
ostream &stream_)
|
||||
{
|
||||
for (std::size_t dfa_ = 0, dfas_ = csm_.size(); dfa_ < dfas_; ++dfa_)
|
||||
{
|
||||
lexer_state(stream_);
|
||||
stream_ << rules_.state(dfa_) << std::endl << std::endl;
|
||||
|
||||
dump_ex(csm_._sm_vector[dfa_], stream_);
|
||||
}
|
||||
}
|
||||
|
||||
static void dump(const char_state_machine &csm_, ostream &stream_)
|
||||
{
|
||||
for (std::size_t dfa_ = 0, dfas_ = csm_.size(); dfa_ < dfas_; ++dfa_)
|
||||
{
|
||||
lexer_state(stream_);
|
||||
stream_ << dfa_ << std::endl << std::endl;
|
||||
|
||||
dump_ex(csm_._sm_vector[dfa_], stream_);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
using dfa_state = typename char_state_machine::state;
|
||||
using string_token = typename dfa_state::string_token;
|
||||
using stringstream = std::basic_stringstream<char_type>;
|
||||
|
||||
static void dump_ex(const typename char_state_machine::dfa &dfa_,
|
||||
ostream &stream_)
|
||||
{
|
||||
const std::size_t states_ = dfa_._states.size();
|
||||
const id_type bol_index_ = dfa_._bol_index;
|
||||
|
||||
for (std::size_t i_ = 0; i_ < states_; ++i_)
|
||||
{
|
||||
const dfa_state &state_ = dfa_._states[i_];
|
||||
|
||||
state(stream_);
|
||||
stream_ << i_ << std::endl;
|
||||
|
||||
if (state_._end_state)
|
||||
{
|
||||
end_state(stream_);
|
||||
|
||||
if (state_._push_pop_dfa == dfa_state::push_dfa)
|
||||
{
|
||||
push(stream_);
|
||||
stream_ << state_._push_dfa;
|
||||
}
|
||||
else if (state_._push_pop_dfa == dfa_state::pop_dfa)
|
||||
{
|
||||
pop(stream_);
|
||||
}
|
||||
|
||||
id(stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._id);
|
||||
user_id(stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._user_id);
|
||||
dfa(stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._next_dfa);
|
||||
stream_ << std::endl;
|
||||
}
|
||||
|
||||
if (i_ == 0 && bol_index_ != char_state_machine::npos())
|
||||
{
|
||||
bol(stream_);
|
||||
stream_ << static_cast<std::size_t>(bol_index_) << std::endl;
|
||||
}
|
||||
|
||||
if (state_._eol_index != char_state_machine::npos())
|
||||
{
|
||||
eol(stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._eol_index) <<
|
||||
std::endl;
|
||||
}
|
||||
|
||||
for (const auto &tran_ : state_._transitions)
|
||||
{
|
||||
string_token token_ = tran_.second;
|
||||
|
||||
open_bracket(stream_);
|
||||
|
||||
if (!tran_.second.any() && tran_.second.negatable())
|
||||
{
|
||||
token_.negate();
|
||||
negated(stream_);
|
||||
}
|
||||
|
||||
string chars_;
|
||||
|
||||
for (const auto &range_ : token_._ranges)
|
||||
{
|
||||
if (range_.first == '-' || range_.first == '^' ||
|
||||
range_.first == ']')
|
||||
{
|
||||
stream_ << '\\';
|
||||
}
|
||||
|
||||
chars_ = string_token::escape_char
|
||||
(range_.first);
|
||||
|
||||
if (range_.first != range_.second)
|
||||
{
|
||||
if (range_.first + 1 < range_.second)
|
||||
{
|
||||
chars_ += '-';
|
||||
}
|
||||
|
||||
if (range_.second == '-' || range_.second == '^' ||
|
||||
range_.second == ']')
|
||||
{
|
||||
stream_ << '\\';
|
||||
}
|
||||
|
||||
chars_ += string_token::escape_char(range_.second);
|
||||
}
|
||||
|
||||
stream_ << chars_;
|
||||
}
|
||||
|
||||
close_bracket(stream_);
|
||||
stream_ << static_cast<std::size_t>(tran_.first) <<
|
||||
std::endl;
|
||||
}
|
||||
|
||||
stream_ << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
static void lexer_state(std::ostream &stream_)
|
||||
{
|
||||
stream_ << "Lexer state: ";
|
||||
}
|
||||
|
||||
static void lexer_state(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"Lexer state: ";
|
||||
}
|
||||
|
||||
static void state(std::ostream &stream_)
|
||||
{
|
||||
stream_ << "State: ";
|
||||
}
|
||||
|
||||
static void state(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"State: ";
|
||||
}
|
||||
|
||||
static void bol(std::ostream &stream_)
|
||||
{
|
||||
stream_ << " BOL -> ";
|
||||
}
|
||||
|
||||
static void bol(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" BOL -> ";
|
||||
}
|
||||
|
||||
static void eol(std::ostream &stream_)
|
||||
{
|
||||
stream_ << " EOL -> ";
|
||||
}
|
||||
|
||||
static void eol(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" EOL -> ";
|
||||
}
|
||||
|
||||
static void end_state(std::ostream &stream_)
|
||||
{
|
||||
stream_ << " END STATE";
|
||||
}
|
||||
|
||||
static void end_state(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" END STATE";
|
||||
}
|
||||
|
||||
static void id(std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", Id = ";
|
||||
}
|
||||
|
||||
static void id(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", Id = ";
|
||||
}
|
||||
|
||||
static void push(std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", PUSH ";
|
||||
}
|
||||
|
||||
static void push(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", PUSH ";
|
||||
}
|
||||
|
||||
static void pop(std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", POP";
|
||||
}
|
||||
|
||||
static void pop(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", POP";
|
||||
}
|
||||
|
||||
static void user_id(std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", User Id = ";
|
||||
}
|
||||
|
||||
static void user_id(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", User Id = ";
|
||||
}
|
||||
|
||||
static void open_bracket(std::ostream &stream_)
|
||||
{
|
||||
stream_ << " [";
|
||||
}
|
||||
|
||||
static void open_bracket(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" [";
|
||||
}
|
||||
|
||||
static void negated(std::ostream &stream_)
|
||||
{
|
||||
stream_ << "^";
|
||||
}
|
||||
|
||||
static void negated(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"^";
|
||||
}
|
||||
|
||||
static void close_bracket(std::ostream &stream_)
|
||||
{
|
||||
stream_ << "] -> ";
|
||||
}
|
||||
|
||||
static void close_bracket(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"] -> ";
|
||||
}
|
||||
|
||||
static void dfa(std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", dfa = ";
|
||||
}
|
||||
|
||||
static void dfa(std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", dfa = ";
|
||||
}
|
||||
};
|
||||
|
||||
using debug = basic_debug<state_machine, char>;
|
||||
using wdebug = basic_debug<wstate_machine, wchar_t>;
|
||||
}
|
||||
|
||||
#endif
|
293
YACReaderLibrary/lexertl/dot.hpp
Normal file
293
YACReaderLibrary/lexertl/dot.hpp
Normal file
@ -0,0 +1,293 @@
|
||||
// dot.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
// Copyright (c) 2013 Autodesk, Inc. All rights reserved.
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_DOT_HPP
|
||||
#define LEXERTL_DOT_HPP
|
||||
|
||||
#include <ostream>
|
||||
#include "rules.hpp"
|
||||
#include "state_machine.hpp"
|
||||
#include "sm_to_csm.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
//! The class template basic_dot contains utility functions used to
|
||||
//! dump a description of a finite state machine formatted in the
|
||||
//! DOT language (http://www.graphviz.org/doc/info/lang.html). The
|
||||
//! resulting directed graph can previewed by opening the ".dot" file
|
||||
//! into the GraphViz application (http://www.graphviz.org).
|
||||
template<typename sm, typename char_type, typename id_type = uint16_t,
|
||||
bool is_dfa = true>
|
||||
class basic_dot
|
||||
{
|
||||
public:
|
||||
using char_state_machine =
|
||||
basic_char_state_machine<char_type, id_type, is_dfa>;
|
||||
using rules = basic_rules<char_type, char_type, id_type>;
|
||||
using ostream = std::basic_ostream<char_type>;
|
||||
using string = std::basic_string<char_type>;
|
||||
|
||||
//! Dumps a description of the finite state machine expressed in
|
||||
//! the DOT language to the given output stream.
|
||||
static void dump(const sm &sm_, rules &rules_, ostream &stream_)
|
||||
{
|
||||
char_state_machine csm_;
|
||||
|
||||
sm_to_csm(sm_, csm_);
|
||||
dump(csm_, rules_, stream_);
|
||||
}
|
||||
|
||||
//! Dumps a description of the finite state machine expressed in
|
||||
//! the DOT language to the given output stream.
|
||||
static void dump(const char_state_machine &csm_, rules &rules_,
|
||||
ostream &stream_)
|
||||
{
|
||||
header(stream_);
|
||||
for (std::size_t dfa_ = 0, dfas_ = csm_.size(); dfa_ < dfas_; ++dfa_)
|
||||
{
|
||||
dump_ex(dfa_, csm_._sm_vector[dfa_], rules_, stream_);
|
||||
}
|
||||
trailer(stream_);
|
||||
}
|
||||
|
||||
protected:
|
||||
using dfa_state = typename char_state_machine::state;
|
||||
using string_token = typename dfa_state::string_token;
|
||||
using stringstream = std::basic_stringstream<char_type>;
|
||||
|
||||
// Naming of nodes used in the DOT diagram. The naming is of the
|
||||
// form: L<dfa_id>_S<state_id>.
|
||||
static string node_name(id_type dfa_id_, id_type state_id_)
|
||||
{
|
||||
stringstream namestream_;
|
||||
namestream_ << "L" << dfa_id_ << "_S" << state_id_;
|
||||
return namestream_.str();
|
||||
}
|
||||
|
||||
// Escape control characters twice. This is necessary when
|
||||
// expressing character sets attached as to DOT nodes as
|
||||
// labels.
|
||||
static string double_escape_char(const id_type ch_)
|
||||
{
|
||||
stringstream out_;
|
||||
|
||||
switch (ch_)
|
||||
{
|
||||
case '\0':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << '0';
|
||||
break;
|
||||
case '\a':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 'a';
|
||||
break;
|
||||
case '\b':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 'b';
|
||||
break;
|
||||
case '\f':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 'f';
|
||||
break;
|
||||
case '\n':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 'n';
|
||||
break;
|
||||
case '\r':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 'r';
|
||||
break;
|
||||
case '\t':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 't';
|
||||
break;
|
||||
case '\v':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << 'v';
|
||||
break;
|
||||
case '\\':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
break;
|
||||
case '"':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << '"';
|
||||
break;
|
||||
case '\'':
|
||||
out_ << '\\';
|
||||
out_ << '\\';
|
||||
out_ << '\'';
|
||||
break;
|
||||
default:
|
||||
{
|
||||
if (ch_ < 32 || ch_ > 126)
|
||||
{
|
||||
out_ << '\\';
|
||||
out_ << 'x';
|
||||
out_ << std::hex <<
|
||||
static_cast<std::size_t>(ch_);
|
||||
}
|
||||
else
|
||||
{
|
||||
out_ << char_type(ch_);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return out_.str();
|
||||
}
|
||||
|
||||
// Internal function actually performing the work of dumping the
|
||||
// state machine in DOT.
|
||||
static void dump_ex(id_type dfa_id_,
|
||||
const typename char_state_machine::dfa &dfa_,
|
||||
rules &rules_,
|
||||
ostream &stream_)
|
||||
{
|
||||
const std::size_t states_ = dfa_._states.size();
|
||||
typename dfa_state::id_type_string_token_map::const_iterator iter_;
|
||||
typename dfa_state::id_type_string_token_map::const_iterator end_;
|
||||
|
||||
stream_ << std::endl;
|
||||
|
||||
for (std::size_t i_ = 0; i_ < states_; ++i_)
|
||||
{
|
||||
const dfa_state &state_ = dfa_._states[i_];
|
||||
|
||||
const string name = node_name(dfa_id_, i_);
|
||||
if (i_ == 0)
|
||||
{
|
||||
stream_ << " " << name << " [shape = doublecircle, xlabel=\""
|
||||
<< rules_.state(dfa_id_) << "\"];" << std::endl;
|
||||
}
|
||||
else if (state_._end_state)
|
||||
{
|
||||
stream_ << " " << name <<
|
||||
" [shape = doublecircle, xlabel=\"id =" <<
|
||||
static_cast<std::size_t>(state_._id) << "\"];" <<
|
||||
std::endl;
|
||||
}
|
||||
else {
|
||||
stream_ << " " << name << " [shape = circle];" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
stream_ << std::endl;
|
||||
|
||||
for (std::size_t i_ = 0; i_ < states_; ++i_)
|
||||
{
|
||||
const dfa_state &state_ = dfa_._states[i_];
|
||||
|
||||
iter_ = state_._transitions.begin();
|
||||
end_ = state_._transitions.end();
|
||||
|
||||
const string src_name = node_name(dfa_id_, i_);
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
const string dst_name = node_name(dfa_id_, iter_->first);
|
||||
stream_ << " " << src_name << " -> " << dst_name <<
|
||||
" [label = \"";
|
||||
|
||||
string_token token_ = iter_->second;
|
||||
|
||||
open_bracket(stream_);
|
||||
|
||||
if (!iter_->second.any() && iter_->second.negatable())
|
||||
{
|
||||
token_.negate();
|
||||
negated(stream_);
|
||||
}
|
||||
|
||||
string chars_;
|
||||
auto ranges_iter_ = token_._ranges.cbegin();
|
||||
auto ranges_end_ = token_._ranges.cend();
|
||||
|
||||
for (; ranges_iter_ != ranges_end_; ++ranges_iter_)
|
||||
{
|
||||
if (ranges_iter_->first == '^' ||
|
||||
ranges_iter_->first == ']')
|
||||
{
|
||||
stream_ << "\\\\";
|
||||
}
|
||||
|
||||
chars_ = double_escape_char(ranges_iter_->first);
|
||||
|
||||
if (ranges_iter_->first != ranges_iter_->second)
|
||||
{
|
||||
if (ranges_iter_->first + 1 < ranges_iter_->second)
|
||||
{
|
||||
chars_ += '-';
|
||||
}
|
||||
|
||||
if (ranges_iter_->second == '^' ||
|
||||
ranges_iter_->second == ']')
|
||||
{
|
||||
stream_ << "\\\\";
|
||||
}
|
||||
|
||||
chars_ += double_escape_char(ranges_iter_->second);
|
||||
}
|
||||
|
||||
stream_ << chars_;
|
||||
}
|
||||
|
||||
close_bracket(stream_);
|
||||
stream_ << "\"];" << std::endl;
|
||||
}
|
||||
|
||||
if (state_._end_state) {
|
||||
const string dst_name = node_name(state_._next_dfa, 0);
|
||||
stream_ << " " << src_name << " -> " << dst_name
|
||||
<< " [style = \"dashed\"];" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void header(ostream &stream_)
|
||||
{
|
||||
stream_ << "digraph DFAs {" << std::endl;
|
||||
stream_ << " rankdir = LR;" << std::endl;
|
||||
}
|
||||
|
||||
static void trailer(ostream &stream_)
|
||||
{
|
||||
stream_ << "}" << std::endl;
|
||||
}
|
||||
|
||||
static void open_bracket(ostream &stream_)
|
||||
{
|
||||
stream_ << "[";
|
||||
}
|
||||
|
||||
static void negated(ostream &stream_)
|
||||
{
|
||||
stream_ << "^";
|
||||
}
|
||||
|
||||
static void close_bracket(ostream &stream_)
|
||||
{
|
||||
stream_ << "]";
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
using dot = basic_dot<basic_state_machine<char>, char>;
|
||||
using wdot = basic_dot<basic_state_machine<wchar_t>, wchar_t>;
|
||||
}
|
||||
|
||||
#endif
|
25
YACReaderLibrary/lexertl/enums.hpp
Normal file
25
YACReaderLibrary/lexertl/enums.hpp
Normal file
@ -0,0 +1,25 @@
|
||||
// enums.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_ENUMS_HPP
|
||||
#define LEXERTL_ENUMS_HPP
|
||||
|
||||
namespace lexertl
{
// Regex flags; every enumerator is a distinct bit.
enum regex_flags
{
    icase = 1,
    dot_not_newline = 2,
    dot_not_cr_lf = 4,
    skip_ws = 8,
    match_zero_len = 16
};

// Named offsets:
// 0 = end state, 1 = id, 2 = user id, 3 = push dfa, 4 = next dfa,
// 5 = eol, 6 = dead state, 7 = first transition
enum
{
    end_state_index = 0,
    id_index = 1,
    user_id_index = 2,
    push_dfa_index = 3,
    next_dfa_index = 4,
    eol_index = 5,
    dead_state_index = 6,
    transitions_index = 7
};

// Rule flags:
enum feature_flags
{
    bol_bit = 1,
    eol_bit = 2,
    skip_bit = 4,
    again_bit = 8,
    multi_state_bit = 16,
    recursive_bit = 32,
    advance_bit = 64
};

// End state flags:
enum
{
    end_state_bit = 1,
    pop_dfa_bit = 2
};
}
|
||||
|
||||
#endif
|
1123
YACReaderLibrary/lexertl/generate_cpp.hpp
Normal file
1123
YACReaderLibrary/lexertl/generate_cpp.hpp
Normal file
File diff suppressed because it is too large
Load Diff
738
YACReaderLibrary/lexertl/generator.hpp
Normal file
738
YACReaderLibrary/lexertl/generator.hpp
Normal file
@ -0,0 +1,738 @@
|
||||
// generator.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_GENERATOR_HPP
|
||||
#define LEXERTL_GENERATOR_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include "partition/charset.hpp"
|
||||
#include "char_traits.hpp"
|
||||
#include "partition/equivset.hpp"
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include "parser/parser.hpp"
|
||||
#include "rules.hpp"
|
||||
#include "state_machine.hpp"
|
||||
#include <type_traits>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename rules, typename sm, typename char_traits = basic_char_traits
|
||||
<typename sm::traits::input_char_type> >
|
||||
class basic_generator
|
||||
{
|
||||
public:
|
||||
using id_type = typename rules::id_type;
|
||||
using rules_char_type = typename rules::rules_char_type;
|
||||
using sm_traits = typename sm::traits;
|
||||
using parser = detail::basic_parser<rules_char_type, sm_traits>;
|
||||
using charset_map = typename parser::charset_map;
|
||||
using node = typename parser::node;
|
||||
using node_ptr_vector = typename parser::node_ptr_vector;
|
||||
|
||||
static void build(const rules &rules_, sm &sm_)
|
||||
{
|
||||
const std::size_t size_ = rules_.statemap().size();
|
||||
// Strong exception guarantee
|
||||
// http://www.boost.org/community/exception_safety.html
|
||||
internals internals_;
|
||||
sm temp_sm_;
|
||||
node_ptr_vector node_ptr_vector_;
|
||||
|
||||
internals_._eoi = rules_.eoi();
|
||||
internals_.add_states(size_);
|
||||
|
||||
for (id_type index_ = 0; index_ < size_; ++index_)
|
||||
{
|
||||
if (rules_.regexes()[index_].empty())
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
ss_ << "Lexer states with no rules are not allowed "
|
||||
"(lexer state " << index_ << ".)";
|
||||
throw runtime_error(ss_.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
// Note that the following variables are per DFA.
|
||||
// Map of regex charset tokens (strings) to index
|
||||
charset_map charset_map_;
|
||||
// Used to fix up $ and \n clashes.
|
||||
id_type nl_id_ = sm_traits::npos();
|
||||
// Regex syntax tree
|
||||
observer_ptr<node> root_ = build_tree(rules_, index_,
|
||||
node_ptr_vector_, charset_map_, nl_id_);
|
||||
|
||||
build_dfa(charset_map_, root_, internals_, temp_sm_, index_,
|
||||
nl_id_);
|
||||
|
||||
if (internals_._dfa[index_].size() /
|
||||
internals_._dfa_alphabet[index_] >= sm_traits::npos())
|
||||
{
|
||||
// Overflow
|
||||
throw runtime_error("The data type you have chosen "
|
||||
"cannot hold this many DFA rows.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If you get a compile error here the id_type from rules and
|
||||
// state machine do no match.
|
||||
create(internals_, temp_sm_, rules_.features(), lookup());
|
||||
sm_.swap(temp_sm_);
|
||||
}
|
||||
|
||||
// Parses every regex registered for lexer state dfa_ and returns the root
// of the combined syntax tree.
// The first regex becomes the initial root; every further regex is joined
// to the tree built so far through a new selection_node, which is stored in
// node_ptr_vector_.
// charset_map_ and nl_id_ are passed to the parser, which may update them.
// The regex list for dfa_ must not be empty: the first element is
// dereferenced unconditionally.
static observer_ptr<node> build_tree(const rules &rules_,
    const std::size_t dfa_, node_ptr_vector &node_ptr_vector_,
    charset_map &charset_map_, id_type &nl_id_)
{
    parser parser_(rules_.locale(), node_ptr_vector_, charset_map_,
        rules_.eoi());
    // Was "const auto (R)exes_": the "&reg" of "&regexes_" had been turned
    // into a registered-trademark sign, which does not compile.
    const auto &regexes_ = rules_.regexes();
    auto regex_iter_ = regexes_[dfa_].cbegin();
    auto regex_iter_end_ = regexes_[dfa_].cend();
    const auto &ids_ = rules_.ids();
    const auto &user_ids_ = rules_.user_ids();
    auto id_iter_ = ids_[dfa_].cbegin();
    auto user_id_iter_ = user_ids_[dfa_].cbegin();
    const auto &next_dfas_ = rules_.next_dfas();
    const auto &pushes_ = rules_.pushes();
    const auto &pops_ = rules_.pops();
    auto next_dfa_iter_ = next_dfas_[dfa_].cbegin();
    auto push_dfa_iter_ = pushes_[dfa_].cbegin();
    auto pop_dfa_iter_ = pops_[dfa_].cbegin();
    // Whether any rule in this lexer state has the bol_bit feature set.
    const bool seen_bol_ = (rules_.features()[dfa_] & bol_bit) != 0;
    observer_ptr<node> root_ = nullptr;

    root_ = parser_.parse(*regex_iter_, *id_iter_, *user_id_iter_,
        *next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_,
        rules_.flags(), nl_id_, seen_bol_);
    // The six per-rule sequences are walked in lock step.
    ++regex_iter_;
    ++id_iter_;
    ++user_id_iter_;
    ++next_dfa_iter_;
    ++push_dfa_iter_;
    ++pop_dfa_iter_;

    // Build syntax trees
    while (regex_iter_ != regex_iter_end_)
    {
        observer_ptr<node> rhs_ = parser_.parse(*regex_iter_, *id_iter_,
            *user_id_iter_, *next_dfa_iter_, *push_dfa_iter_,
            *pop_dfa_iter_, rules_.flags(), nl_id_, seen_bol_);

        // root_ = (root_ | rhs_); the vector owns the new node.
        node_ptr_vector_.emplace_back
            (std::make_unique<selection_node>(root_, rhs_));
        root_ = node_ptr_vector_.back().get();

        ++regex_iter_;
        ++id_iter_;
        ++user_id_iter_;
        ++next_dfa_iter_;
        ++push_dfa_iter_;
        ++pop_dfa_iter_;
    }

    return root_;
}
|
||||
|
||||
protected:
|
||||
using compressed = std::integral_constant<bool, sm_traits::compressed>;
|
||||
using equivset = detail::basic_equivset<id_type>;
|
||||
using equivset_list = std::list<std::unique_ptr<equivset>>;
|
||||
using equivset_ptr = std::unique_ptr<equivset>;
|
||||
using sm_char_type = typename sm_traits::char_type;
|
||||
using charset = detail::basic_charset<sm_char_type, id_type>;
|
||||
using charset_ptr = std::unique_ptr<charset>;
|
||||
using charset_list = std::list<std::unique_ptr<charset>>;
|
||||
using internals = detail::basic_internals<id_type>;
|
||||
using id_type_set = typename std::set<id_type>;
|
||||
using id_type_vector = typename internals::id_type_vector;
|
||||
using index_set = typename charset::index_set;
|
||||
using index_set_vector = std::vector<index_set>;
|
||||
using is_dfa = std::integral_constant<bool, sm_traits::is_dfa>;
|
||||
using lookup = std::integral_constant<bool, sm_traits::lookup>;
|
||||
using node_set = std::set<observer_ptr<const node>>;
|
||||
using node_set_vector = std::vector<std::unique_ptr<node_set>>;
|
||||
using node_vector = typename node::node_vector;
|
||||
using node_vector_vector = std::vector<std::unique_ptr<node_vector>>;
|
||||
using selection_node = typename parser::selection_node;
|
||||
using size_t_vector = typename std::vector<std::size_t>;
|
||||
using string_token = typename parser::string_token;
|
||||
|
||||
static void build_dfa(const charset_map &charset_map_,
|
||||
const observer_ptr<node> root_, internals &internals_, sm &sm_,
|
||||
const id_type dfa_index_, id_type &nl_id_)
|
||||
{
|
||||
// partitioned charset list
|
||||
charset_list charset_list_;
|
||||
// vector mapping token indexes to partitioned token index sets
|
||||
index_set_vector set_mapping_;
|
||||
auto &dfa_ = internals_._dfa[dfa_index_];
|
||||
std::size_t dfa_alphabet_ = 0;
|
||||
const node_vector &followpos_ = root_->firstpos();
|
||||
node_set_vector seen_sets_;
|
||||
node_vector_vector seen_vectors_;
|
||||
size_t_vector hash_vector_;
|
||||
id_type zero_id_ = sm_traits::npos();
|
||||
id_type_set eol_set_;
|
||||
|
||||
set_mapping_.resize(charset_map_.size());
|
||||
partition_charsets(charset_map_, charset_list_, is_dfa());
|
||||
build_set_mapping(charset_list_, internals_, dfa_index_,
|
||||
set_mapping_);
|
||||
|
||||
if (nl_id_ != sm_traits::npos())
|
||||
{
|
||||
nl_id_ = *set_mapping_[nl_id_].begin();
|
||||
zero_id_ = sm_traits::compressed ?
|
||||
*set_mapping_[charset_map_.find(string_token(0, 0))->
|
||||
second].begin() : sm_traits::npos();
|
||||
}
|
||||
|
||||
dfa_alphabet_ = charset_list_.size() + transitions_index +
|
||||
(nl_id_ == sm_traits::npos() ? 0 : 1);
|
||||
|
||||
if (dfa_alphabet_ > sm_traits::npos())
|
||||
{
|
||||
// Overflow
|
||||
throw runtime_error("The data type you have chosen cannot hold "
|
||||
"the dfa alphabet.");
|
||||
}
|
||||
|
||||
internals_._dfa_alphabet[dfa_index_] =
|
||||
static_cast<id_type>(dfa_alphabet_);
|
||||
// 'jam' state
|
||||
dfa_.resize(dfa_alphabet_, 0);
|
||||
closure(followpos_, seen_sets_, seen_vectors_, hash_vector_,
|
||||
static_cast<id_type>(dfa_alphabet_), dfa_);
|
||||
|
||||
// Loop over states
|
||||
for (id_type index_ = 0; index_ < static_cast<id_type>
|
||||
(seen_vectors_.size()); ++index_)
|
||||
{
|
||||
equivset_list equiv_list_;
|
||||
|
||||
// Intersect charsets
|
||||
build_equiv_list(*seen_vectors_[index_].get(), set_mapping_,
|
||||
equiv_list_, is_dfa());
|
||||
|
||||
for (auto &equivset_ : equiv_list_)
|
||||
{
|
||||
const id_type transition_ = closure
|
||||
(equivset_->_followpos, seen_sets_, seen_vectors_,
|
||||
hash_vector_, static_cast<id_type>(dfa_alphabet_), dfa_);
|
||||
|
||||
if (transition_ != sm_traits::npos())
|
||||
{
|
||||
observer_ptr<id_type> ptr_ = &dfa_.front() +
|
||||
((index_ + 1) * dfa_alphabet_);
|
||||
|
||||
// Prune abstemious transitions from end states.
|
||||
if (*ptr_ && !equivset_->_greedy) continue;
|
||||
|
||||
set_transitions(transition_, equivset_.get(), dfa_, ptr_,
|
||||
index_, eol_set_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fix_clashes(eol_set_, nl_id_, zero_id_, dfa_, dfa_alphabet_,
|
||||
compressed());
|
||||
append_dfa(charset_list_, internals_, sm_, dfa_index_, lookup());
|
||||
}
|
||||
|
||||
static void set_transitions(const id_type transition_, equivset *equivset_,
|
||||
typename internals::id_type_vector &dfa_, id_type *ptr_,
|
||||
const id_type index_, id_type_set &eol_set_)
|
||||
{
|
||||
for (typename equivset::index_vector::const_iterator
|
||||
equiv_iter_ = equivset_->_index_vector.begin(),
|
||||
equiv_end_ = equivset_->_index_vector.end();
|
||||
equiv_iter_ != equiv_end_; ++equiv_iter_)
|
||||
{
|
||||
const id_type i_ = *equiv_iter_;
|
||||
|
||||
if (i_ == parser::bol_token())
|
||||
{
|
||||
dfa_.front() = transition_;
|
||||
}
|
||||
else if (i_ == parser::eol_token())
|
||||
{
|
||||
ptr_[eol_index] = transition_;
|
||||
eol_set_.insert(index_ + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
ptr_[i_ + transitions_index] = transition_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uncompressed
|
||||
static void fix_clashes(const id_type_set &eol_set_,
|
||||
const id_type nl_id_, const id_type /*zero_id_*/,
|
||||
typename internals::id_type_vector &dfa_,
|
||||
const std::size_t dfa_alphabet_, const std::false_type &)
|
||||
{
|
||||
for (const auto &eol_ : eol_set_)
|
||||
{
|
||||
observer_ptr<id_type> ptr_ = &dfa_.front() + eol_ * dfa_alphabet_;
|
||||
const id_type eol_state_ = ptr_[eol_index];
|
||||
const id_type nl_state_ = ptr_[nl_id_ + transitions_index];
|
||||
|
||||
if (nl_state_)
|
||||
{
|
||||
ptr_[transitions_index + nl_id_] = 0;
|
||||
ptr_ = &dfa_.front() + eol_state_ * dfa_alphabet_;
|
||||
|
||||
if (ptr_[transitions_index + nl_id_] == 0)
|
||||
{
|
||||
ptr_[transitions_index + nl_id_] = nl_state_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compressed
|
||||
static void fix_clashes(const id_type_set &eol_set_,
|
||||
const id_type nl_id_, const id_type zero_id_,
|
||||
typename internals::id_type_vector &dfa_,
|
||||
const std::size_t dfa_alphabet_, const std::true_type &)
|
||||
{
|
||||
std::size_t i_ = 0;
|
||||
|
||||
for (const auto &eol_ : eol_set_)
|
||||
{
|
||||
observer_ptr<id_type> ptr_ = &dfa_.front() + eol_ * dfa_alphabet_;
|
||||
const id_type eol_state_ = ptr_[eol_index];
|
||||
id_type nl_state_ = 0;
|
||||
|
||||
for (; i_ < (sm_traits::char_24_bit ? 2 : 1); ++i_)
|
||||
{
|
||||
ptr_ = &dfa_.front() + ptr_[transitions_index + zero_id_] *
|
||||
dfa_alphabet_;
|
||||
}
|
||||
|
||||
nl_state_ = ptr_[transitions_index + nl_id_];
|
||||
|
||||
if (nl_state_)
|
||||
{
|
||||
ptr_ = &dfa_.front() + eol_state_ * dfa_alphabet_;
|
||||
|
||||
if (ptr_[transitions_index + zero_id_] != 0) continue;
|
||||
|
||||
ptr_[transitions_index + zero_id_] =
|
||||
static_cast<id_type>(dfa_.size() / dfa_alphabet_);
|
||||
dfa_.resize(dfa_.size() + dfa_alphabet_, 0);
|
||||
|
||||
for (i_ = 0; i_ < (sm_traits::char_24_bit ? 1 : 0); ++i_)
|
||||
{
|
||||
ptr_ = &dfa_.front() + dfa_.size() - dfa_alphabet_;
|
||||
ptr_[transitions_index + zero_id_] =
|
||||
static_cast<id_type>(dfa_.size() / dfa_alphabet_);
|
||||
dfa_.resize(dfa_.size() + dfa_alphabet_, 0);
|
||||
}
|
||||
|
||||
ptr_ = &dfa_.front() + dfa_.size() - dfa_alphabet_;
|
||||
ptr_[transitions_index + nl_id_] = nl_state_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// char_state_machine version
|
||||
static void append_dfa(const charset_list &charset_list_,
|
||||
const internals &internals_, sm &sm_, const id_type dfa_index_,
|
||||
const std::false_type &)
|
||||
{
|
||||
std::size_t size_ = charset_list_.size();
|
||||
typename sm::string_token_vector token_vector_;
|
||||
|
||||
token_vector_.reserve(size_);
|
||||
|
||||
for (const auto &charset_ : charset_list_)
|
||||
{
|
||||
token_vector_.push_back(charset_->_token);
|
||||
}
|
||||
|
||||
sm_.append(token_vector_, internals_, dfa_index_);
|
||||
}
|
||||
|
||||
// state_machine version
|
||||
static void append_dfa(const charset_list &, const internals &, sm &,
|
||||
const id_type, const std::true_type &)
|
||||
{
|
||||
// Nothing to do - will use create() instead
|
||||
}
|
||||
|
||||
// char_state_machine version
|
||||
static void create(internals &, sm &, const id_type_vector &,
|
||||
const std::false_type &)
|
||||
{
|
||||
// Nothing to do - will use append_dfa() instead
|
||||
}
|
||||
|
||||
// state_machine version
|
||||
static void create(internals &internals_, sm &sm_,
|
||||
const id_type_vector &features_, const std::true_type &)
|
||||
{
|
||||
for (std::size_t i_ = 0, size_ = internals_._dfa.size();
|
||||
i_ < size_; ++i_)
|
||||
{
|
||||
internals_._features |= features_[i_];
|
||||
}
|
||||
|
||||
if (internals_._dfa.size() > 1)
|
||||
{
|
||||
internals_._features |= multi_state_bit;
|
||||
}
|
||||
|
||||
sm_.data().swap(internals_);
|
||||
}
|
||||
|
||||
// NFA version
|
||||
static void partition_charsets(const charset_map &map_,
|
||||
charset_list &lhs_, const std::false_type &)
|
||||
{
|
||||
fill_rhs_list(map_, lhs_);
|
||||
}
|
||||
|
||||
// DFA version
|
||||
static void partition_charsets(const charset_map &map_,
|
||||
charset_list &lhs_, const std::true_type &)
|
||||
{
|
||||
charset_list rhs_;
|
||||
|
||||
fill_rhs_list(map_, rhs_);
|
||||
|
||||
if (!rhs_.empty())
|
||||
{
|
||||
typename charset_list::iterator iter_;
|
||||
typename charset_list::iterator end_;
|
||||
charset_ptr overlap_ = std::make_unique<charset>();
|
||||
|
||||
lhs_.emplace_back(std::move(rhs_.front()));
|
||||
rhs_.pop_front();
|
||||
|
||||
while (!rhs_.empty())
|
||||
{
|
||||
charset_ptr r_(rhs_.front().release());
|
||||
|
||||
rhs_.pop_front();
|
||||
iter_ = lhs_.begin();
|
||||
end_ = lhs_.end();
|
||||
|
||||
while (!r_->empty() && iter_ != end_)
|
||||
{
|
||||
auto l_iter_ = iter_;
|
||||
|
||||
(*l_iter_)->intersect(*r_.get(), *overlap_.get());
|
||||
|
||||
if (overlap_->empty())
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if ((*l_iter_)->empty())
|
||||
{
|
||||
l_iter_->reset(overlap_.release());
|
||||
overlap_ = std::make_unique<charset>();
|
||||
++iter_;
|
||||
}
|
||||
else if (r_->empty())
|
||||
{
|
||||
r_.reset(overlap_.release());
|
||||
overlap_ = std::make_unique<charset>();
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = lhs_.insert(++iter_, charset_ptr());
|
||||
iter_->reset(overlap_.release());
|
||||
overlap_ = std::make_unique<charset>();
|
||||
++iter_;
|
||||
end_ = lhs_.end();
|
||||
}
|
||||
}
|
||||
|
||||
if (!r_->empty())
|
||||
{
|
||||
lhs_.emplace_back(std::move(r_));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_rhs_list(const charset_map &map_, charset_list &list_)
|
||||
{
|
||||
for (const auto &pair_ : map_)
|
||||
{
|
||||
list_.emplace_back(std::make_unique<charset>
|
||||
(pair_.first, pair_.second));
|
||||
}
|
||||
}
|
||||
|
||||
static void build_set_mapping(const charset_list &charset_list_,
|
||||
internals &internals_, const id_type dfa_index_,
|
||||
index_set_vector &set_mapping_)
|
||||
{
|
||||
auto iter_ = charset_list_.cbegin();
|
||||
auto end_ = charset_list_.cend();
|
||||
|
||||
for (id_type index_ = 0; iter_ != end_; ++iter_, ++index_)
|
||||
{
|
||||
observer_ptr<const charset> cs_ = iter_->get();
|
||||
|
||||
fill_lookup(cs_->_token, &internals_._lookup[dfa_index_],
|
||||
index_, lookup());
|
||||
|
||||
for (const id_type i_ : cs_->_index_set)
|
||||
{
|
||||
set_mapping_[i_].insert(index_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// char_state_machine version
|
||||
static void fill_lookup(const string_token &, observer_ptr<id_type_vector> ,
|
||||
const id_type, const std::false_type &)
|
||||
{
|
||||
// Do nothing (lookup not used)
|
||||
}
|
||||
|
||||
// state_machine version
|
||||
static void fill_lookup(const string_token &charset_,
|
||||
observer_ptr<id_type_vector> lookup_, const id_type index_,
|
||||
const std::true_type &)
|
||||
{
|
||||
observer_ptr<id_type> ptr_ = &lookup_->front();
|
||||
|
||||
for (const auto &range_ : charset_._ranges)
|
||||
{
|
||||
for (typename char_traits::index_type char_ = range_.first;
|
||||
char_ < range_.second; ++char_)
|
||||
{
|
||||
// Note char_ must be unsigned
|
||||
ptr_[char_] = index_ + transitions_index;
|
||||
}
|
||||
|
||||
// Note range_.second must be unsigned
|
||||
ptr_[range_.second] = index_ + transitions_index;
|
||||
}
|
||||
}
|
||||
|
||||
// Turns the node collection followpos_ into a DFA state number.
// Returns npos for an empty collection. Otherwise returns the 1-based
// position of an identical node set already in seen_sets_, or registers
// the set as a new state, appends a zero-filled row of size_ cells to
// dfa_ (with the end-state data filled in when applicable) and returns
// the new state's number.
static id_type closure(const node_vector &followpos_,
    node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
    size_t_vector &hash_vector_, const id_type size_, id_type_vector &dfa_)
{
    if (followpos_.empty()) return sm_traits::npos();

    bool end_state_ = false;
    bool pop_dfa_ = false;
    id_type id_ = 0;
    id_type user_id_ = sm_traits::npos();
    id_type next_dfa_ = 0;
    id_type push_dfa_ = sm_traits::npos();
    std::size_t hash_ = 0;
    auto set_ptr_ = std::make_unique<node_set>();
    auto vector_ptr_ = std::make_unique<node_vector>();

    for (observer_ptr<node> node_ : followpos_)
    {
        closure_ex(node_, end_state_, id_, user_id_, next_dfa_,
            push_dfa_, pop_dfa_, *set_ptr_, *vector_ptr_, hash_);
    }

    // Look for an identical set; the hash is compared first as a cheap
    // filter before the sets themselves are compared.
    id_type index_ = 0;
    auto set_iter_ = seen_sets_.cbegin();

    for (const std::size_t seen_hash_ : hash_vector_)
    {
        ++index_;

        if (seen_hash_ == hash_ && *(*set_iter_) == *set_ptr_)
        {
            return index_;
        }

        ++set_iter_;
    }

    seen_sets_.emplace_back(std::move(set_ptr_));
    seen_vectors_.emplace_back(std::move(vector_ptr_));
    hash_vector_.push_back(hash_);
    // State 0 is the jam state...
    index_ = static_cast<id_type>(seen_sets_.size());

    const std::size_t old_size_ = dfa_.size();

    dfa_.resize(old_size_ + size_, 0);

    if (end_state_)
    {
        observer_ptr<id_type> row_ = &dfa_.front() + old_size_;

        *row_ |= end_state_bit;

        if (pop_dfa_)
        {
            *row_ |= pop_dfa_bit;
        }

        row_[id_index] = id_;
        row_[user_id_index] = user_id_;
        row_[push_dfa_index] = push_dfa_;
        row_[next_dfa_index] = next_dfa_;
    }

    return index_;
}
|
||||
|
||||
// Folds one node into the closure being built. The first end-state node
// encountered supplies id, user id, next/push dfa and the pop flag. A
// node not already in set_ptr_ is also appended to vector_ptr_ and its
// address is added to the running hash.
static void closure_ex(observer_ptr<node> node_, bool &end_state_,
    id_type &id_, id_type &user_id_, id_type &next_dfa_,
    id_type &push_dfa_, bool &pop_dfa_, node_set &set_ptr_,
    node_vector &vector_ptr_, std::size_t &hash_)
{
    if (node_->end_state() && !end_state_)
    {
        end_state_ = true;
        id_ = node_->id();
        user_id_ = node_->user_id();
        next_dfa_ = node_->next_dfa();
        push_dfa_ = node_->push_dfa();
        pop_dfa_ = node_->pop_dfa();
    }

    const bool inserted_ = set_ptr_.insert(node_).second;

    if (!inserted_) return;

    vector_ptr_.push_back(node_);
    hash_ += reinterpret_cast<std::size_t>(node_);
}
|
||||
|
||||
// NFA version
|
||||
static void build_equiv_list(const node_vector &vector_,
|
||||
const index_set_vector &set_mapping_, equivset_list &lhs_,
|
||||
const std::false_type &)
|
||||
{
|
||||
fill_rhs_list(vector_, set_mapping_, lhs_);
|
||||
}
|
||||
|
||||
// DFA version
|
||||
static void build_equiv_list(const node_vector &vector_,
|
||||
const index_set_vector &set_mapping_, equivset_list &lhs_,
|
||||
const std::true_type &)
|
||||
{
|
||||
equivset_list rhs_;
|
||||
|
||||
fill_rhs_list(vector_, set_mapping_, rhs_);
|
||||
|
||||
if (!rhs_.empty())
|
||||
{
|
||||
typename equivset_list::iterator iter_;
|
||||
typename equivset_list::iterator end_;
|
||||
equivset_ptr overlap_ = std::make_unique<equivset>();
|
||||
|
||||
lhs_.emplace_back(std::move(rhs_.front()));
|
||||
rhs_.pop_front();
|
||||
|
||||
while (!rhs_.empty())
|
||||
{
|
||||
equivset_ptr r_(rhs_.front().release());
|
||||
|
||||
rhs_.pop_front();
|
||||
iter_ = lhs_.begin();
|
||||
end_ = lhs_.end();
|
||||
|
||||
while (!r_->empty() && iter_ != end_)
|
||||
{
|
||||
auto l_iter_ = iter_;
|
||||
|
||||
(*l_iter_)->intersect(*r_.get(), *overlap_.get());
|
||||
|
||||
if (overlap_->empty())
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if ((*l_iter_)->empty())
|
||||
{
|
||||
l_iter_->reset(overlap_.release());
|
||||
overlap_ = std::make_unique<equivset>();
|
||||
++iter_;
|
||||
}
|
||||
else if (r_->empty())
|
||||
{
|
||||
r_.reset(overlap_.release());
|
||||
overlap_ = std::make_unique<equivset>();
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = lhs_.insert(++iter_, equivset_ptr());
|
||||
iter_->reset(overlap_.release());
|
||||
overlap_ = std::make_unique<equivset>();
|
||||
++iter_;
|
||||
end_ = lhs_.end();
|
||||
}
|
||||
}
|
||||
|
||||
if (!r_->empty())
|
||||
{
|
||||
lhs_.emplace_back(std::move(r_));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_rhs_list(const node_vector &vector_,
|
||||
const index_set_vector &set_mapping_, equivset_list &list_)
|
||||
{
|
||||
for (observer_ptr<const node> node_ : vector_)
|
||||
{
|
||||
if (!node_->end_state())
|
||||
{
|
||||
const id_type token_ = node_->token();
|
||||
|
||||
if (token_ != node::null_token())
|
||||
{
|
||||
if (token_ == parser::bol_token() ||
|
||||
token_ == parser::eol_token())
|
||||
{
|
||||
std::set<id_type> index_set_;
|
||||
|
||||
index_set_.insert(token_);
|
||||
list_.emplace_back
|
||||
(std::make_unique<equivset>(index_set_,
|
||||
token_, node_->greedy(), node_->followpos()));
|
||||
}
|
||||
else
|
||||
{
|
||||
list_.emplace_back(std::make_unique<equivset>
|
||||
(set_mapping_[token_], token_, node_->greedy(),
|
||||
node_->followpos()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using generator = basic_generator<rules, state_machine>;
|
||||
using wgenerator = basic_generator<wrules, wstate_machine>;
|
||||
using u32generator = basic_generator<u32rules, u32state_machine>;
|
||||
using char_generator = basic_generator<rules, char_state_machine>;
|
||||
using wchar_generator = basic_generator<wrules, wchar_state_machine>;
|
||||
using u32char_generator = basic_generator<u32rules, u32char_state_machine>;
|
||||
}
|
||||
|
||||
#endif
|
75
YACReaderLibrary/lexertl/internals.hpp
Normal file
75
YACReaderLibrary/lexertl/internals.hpp
Normal file
@ -0,0 +1,75 @@
|
||||
// internals.hpp
|
||||
// Copyright (c) 2009-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_INTERNALS_HPP
|
||||
#define LEXERTL_INTERNALS_HPP
|
||||
|
||||
#include "enums.hpp"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
struct basic_internals
|
||||
{
|
||||
using id_type_vector = std::vector<id_type>;
|
||||
using id_type_vector_vector = std::vector<id_type_vector>;
|
||||
|
||||
id_type _eoi;
|
||||
id_type_vector_vector _lookup;
|
||||
id_type_vector _dfa_alphabet;
|
||||
id_type _features;
|
||||
id_type_vector_vector _dfa;
|
||||
|
||||
basic_internals() :
|
||||
_eoi(0),
|
||||
_lookup(),
|
||||
_dfa_alphabet(),
|
||||
_features(0),
|
||||
_dfa()
|
||||
{
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
_eoi = 0;
|
||||
_lookup.clear();
|
||||
_dfa_alphabet.clear();
|
||||
_features = 0;
|
||||
_dfa.clear();
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return _dfa.empty();
|
||||
}
|
||||
|
||||
void add_states(const std::size_t num_)
|
||||
{
|
||||
for (std::size_t index_ = 0; index_ < num_; ++index_)
|
||||
{
|
||||
// lookup *always* has a size 256 now.
|
||||
_lookup.push_back(id_type_vector(256, dead_state_index));
|
||||
_dfa_alphabet.push_back(0);
|
||||
_dfa.push_back(id_type_vector());
|
||||
}
|
||||
}
|
||||
|
||||
void swap(basic_internals &internals_)
|
||||
{
|
||||
std::swap(_eoi, internals_._eoi);
|
||||
_lookup.swap(internals_._lookup);
|
||||
_dfa_alphabet.swap(internals_._dfa_alphabet);
|
||||
std::swap(_features, internals_._features);
|
||||
_dfa.swap(internals_._dfa);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
135
YACReaderLibrary/lexertl/iterator.hpp
Normal file
135
YACReaderLibrary/lexertl/iterator.hpp
Normal file
@ -0,0 +1,135 @@
|
||||
// iterator.hpp
|
||||
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_ITERATOR_HPP
|
||||
#define LEXERTL_ITERATOR_HPP
|
||||
|
||||
#include <iterator>
|
||||
#include "lookup.hpp"
|
||||
#include "state_machine.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename iter, typename sm_type, typename results>
|
||||
class iterator
|
||||
{
|
||||
public:
|
||||
using value_type = results;
|
||||
using difference_type = ptrdiff_t;
|
||||
using pointer = const value_type *;
|
||||
using reference = const value_type &;
|
||||
using iterator_category = std::forward_iterator_tag;
|
||||
|
||||
iterator() :
|
||||
_results(iter(), iter()),
|
||||
_sm(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
iterator(const iter &start_, const iter &end_, const sm_type &sm) :
|
||||
_results(start_, end_),
|
||||
_sm(&sm)
|
||||
{
|
||||
lookup();
|
||||
}
|
||||
|
||||
// Only need this because of warnings with gcc with -Weffc++
|
||||
iterator(const iterator &rhs_)
|
||||
{
|
||||
_results = rhs_._results;
|
||||
_sm = rhs_._sm;
|
||||
}
|
||||
|
||||
// Only need this because of warnings with gcc with -Weffc++
|
||||
iterator &operator =(const iterator &rhs_)
|
||||
{
|
||||
if (&rhs_ != this)
|
||||
{
|
||||
_results = rhs_._results;
|
||||
_sm = rhs_._sm;
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
iterator &operator ++()
|
||||
{
|
||||
lookup();
|
||||
return *this;
|
||||
}
|
||||
|
||||
iterator operator ++(int)
|
||||
{
|
||||
iterator iter_ = *this;
|
||||
|
||||
lookup();
|
||||
return iter_;
|
||||
}
|
||||
|
||||
const value_type &operator *() const
|
||||
{
|
||||
return _results;
|
||||
}
|
||||
|
||||
const value_type *operator ->() const
|
||||
{
|
||||
return &_results;
|
||||
}
|
||||
|
||||
bool operator ==(const iterator &rhs_) const
|
||||
{
|
||||
return _sm == rhs_._sm && (_sm == nullptr ? true :
|
||||
_results == rhs_._results);
|
||||
}
|
||||
|
||||
bool operator !=(const iterator &rhs_) const
|
||||
{
|
||||
return !(*this == rhs_);
|
||||
}
|
||||
|
||||
const sm_type &sm() const
|
||||
{
|
||||
return *_sm;
|
||||
}
|
||||
|
||||
private:
|
||||
value_type _results;
|
||||
const sm_type *_sm;
|
||||
|
||||
void lookup()
|
||||
{
|
||||
lexertl::lookup(*_sm, _results);
|
||||
|
||||
if (_results.first == _results.eoi)
|
||||
{
|
||||
_sm = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using siterator =
|
||||
iterator<std::string::const_iterator, state_machine, smatch>;
|
||||
using citerator = iterator<const char *, state_machine, cmatch>;
|
||||
using wsiterator =
|
||||
iterator<std::wstring::const_iterator, wstate_machine, wsmatch>;
|
||||
using wciterator = iterator<const wchar_t *, wstate_machine, wcmatch>;
|
||||
using u32siterator = iterator<std::u32string::const_iterator,
|
||||
u32state_machine, u32smatch>;
|
||||
using u32citerator = iterator<const char32_t *, u32state_machine, u32cmatch>;
|
||||
|
||||
using sriterator =
|
||||
iterator<std::string::const_iterator, state_machine, srmatch>;
|
||||
using criterator = iterator<const char *, state_machine, crmatch>;
|
||||
using wsriterator =
|
||||
iterator<std::wstring::const_iterator, wstate_machine, wsrmatch>;
|
||||
using wcriterator =
|
||||
iterator<const wchar_t *, wstate_machine, wcrmatch>;
|
||||
using u32sriterator = iterator<std::u32string::const_iterator,
|
||||
u32state_machine, u32srmatch>;
|
||||
using u32criterator = iterator<const char32_t *, u32state_machine, u32crmatch>;
|
||||
}
|
||||
|
||||
#endif
|
24
YACReaderLibrary/lexertl/licence_1_0.txt
Normal file
24
YACReaderLibrary/lexertl/licence_1_0.txt
Normal file
@ -0,0 +1,24 @@
|
||||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
491
YACReaderLibrary/lexertl/lookup.hpp
Normal file
491
YACReaderLibrary/lexertl/lookup.hpp
Normal file
@ -0,0 +1,491 @@
|
||||
// lookup.hpp
|
||||
// Copyright (c) 2009-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_LOOKUP_HPP
|
||||
#define LEXERTL_LOOKUP_HPP
|
||||
|
||||
#include <assert.h>
|
||||
#include "match_results.hpp"
|
||||
#include <type_traits>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// Beginning-of-line tracking disabled: stateless, the flag is ignored.
template<bool enabled>
struct bol_state
{
    bol_state(const bool /*bol_*/)
    {
    }
};

// Beginning-of-line tracking enabled: both flags start out as bol_.
template<>
struct bol_state<true>
{
    bool _bol;
    bool _end_bol;

    bol_state(const bool bol_) :
        _bol(bol_),
        _end_bol(_bol)
    {
    }
};
|
||||
|
||||
// End-of-line tracking disabled: nothing to store.
template<typename id_type, bool enabled>
struct eol_state
{
};

// End-of-line tracking enabled: holds the EOL state, initially 0.
template<typename id_type>
struct eol_state<id_type, true>
{
    id_type _EOL_state;

    eol_state() : _EOL_state(0)
    {
    }
};
|
||||
|
||||
// Single lexer state: the start state is not stored.
template<typename id_type, bool enabled>
struct multi_state_state
{
    multi_state_state(const id_type /*state_*/)
    {
    }
};

// Multiple lexer states: remembers the start state passed in.
template<typename id_type>
struct multi_state_state<id_type, true>
{
    id_type _start_state;

    multi_state_state(const id_type state_) : _start_state(state_)
    {
    }
};
|
||||
|
||||
template<typename id_type, bool>
|
||||
struct recursive_state
|
||||
{
|
||||
recursive_state(const id_type *)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<typename id_type>
|
||||
struct recursive_state<id_type, true>
|
||||
{
|
||||
bool _pop;
|
||||
id_type _push_dfa;
|
||||
|
||||
recursive_state(const id_type *ptr_) :
|
||||
_pop((*ptr_ & pop_dfa_bit) != 0),
|
||||
_push_dfa(*(ptr_ + push_dfa_index))
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<typename internals, typename id_type, typename index_type,
|
||||
std::size_t flags>
|
||||
struct lookup_state
|
||||
{
|
||||
const id_type *_lookup;
|
||||
id_type _dfa_alphabet;
|
||||
const id_type *_dfa;
|
||||
const id_type *_ptr;
|
||||
bool _end_state;
|
||||
id_type _id;
|
||||
id_type _uid;
|
||||
bol_state<(flags & bol_bit) != 0> _bol_state;
|
||||
eol_state<id_type, (flags & eol_bit) != 0> _eol_state;
|
||||
multi_state_state<id_type, (flags & multi_state_bit) != 0>
|
||||
_multi_state_state;
|
||||
recursive_state<id_type, (flags & recursive_bit) != 0> _recursive_state;
|
||||
|
||||
lookup_state(const internals &internals_, const bool bol_,
|
||||
const id_type state_) :
|
||||
_lookup(&internals_._lookup[state_][0]),
|
||||
_dfa_alphabet(internals_._dfa_alphabet[state_]),
|
||||
_dfa(&internals_._dfa[state_][0]),
|
||||
_ptr(_dfa + _dfa_alphabet),
|
||||
_end_state(*_ptr != 0),
|
||||
_id(*(_ptr + id_index)),
|
||||
_uid(*(_ptr + user_id_index)),
|
||||
_bol_state(bol_),
|
||||
_eol_state(),
|
||||
_multi_state_state(state_),
|
||||
_recursive_state(_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
void reset_recursive(const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void reset_recursive(const std::true_type &)
|
||||
{
|
||||
_recursive_state._pop = (*_ptr & pop_dfa_bit) != 0;
|
||||
_recursive_state._push_dfa = *(_ptr + push_dfa_index);
|
||||
}
|
||||
|
||||
void bol_start_state(const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void bol_start_state(const std::true_type &)
|
||||
{
|
||||
if (_bol_state._bol)
|
||||
{
|
||||
const id_type state_ = *_dfa;
|
||||
|
||||
if (state_)
|
||||
{
|
||||
_ptr = &_dfa[state_ * _dfa_alphabet];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
bool is_eol(const char_type, const std::false_type &)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
bool is_eol(const char_type curr_, const std::true_type &)
|
||||
{
|
||||
bool ret_ = false;
|
||||
|
||||
_eol_state._EOL_state = _ptr[eol_index];
|
||||
ret_ = _eol_state._EOL_state && (curr_ == '\r' || curr_ == '\n');
|
||||
|
||||
if (ret_)
|
||||
{
|
||||
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
|
||||
}
|
||||
|
||||
return ret_;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
id_type next_char(const char_type prev_char_, const std::false_type &)
|
||||
{
|
||||
const id_type state_= _ptr[_lookup
|
||||
[static_cast<index_type>(prev_char_)]];
|
||||
|
||||
if (state_ != 0)
|
||||
{
|
||||
_ptr = &_dfa[state_ * _dfa_alphabet];
|
||||
}
|
||||
|
||||
return state_;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
id_type next_char(const char_type prev_char_, const std::true_type &)
|
||||
{
|
||||
const std::size_t bytes_ = sizeof(char_type) < 3 ?
|
||||
sizeof(char_type) : 3;
|
||||
const std::size_t shift_[] = {0, 8, 16};
|
||||
id_type state_= 0;
|
||||
|
||||
for (std::size_t i_ = 0; i_ < bytes_; ++i_)
|
||||
{
|
||||
state_ = _ptr[_lookup[static_cast<unsigned char>((prev_char_ >>
|
||||
shift_[bytes_ - 1 - i_]) & 0xff)]];
|
||||
|
||||
if (state_ == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
_ptr = &_dfa[state_ * _dfa_alphabet];
|
||||
}
|
||||
|
||||
return state_;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
void bol(const char_type, const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
void bol(const char_type prev_char_, const std::true_type &)
|
||||
{
|
||||
_bol_state._bol = prev_char_ == '\n';
|
||||
}
|
||||
|
||||
void eol(const id_type, const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void eol(const id_type err_val_, const std::true_type &)
|
||||
{
|
||||
_eol_state._EOL_state = err_val_;
|
||||
}
|
||||
|
||||
void reset_start_state(const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void reset_start_state(const std::true_type &)
|
||||
{
|
||||
_multi_state_state._start_state = *(_ptr + next_dfa_index);
|
||||
}
|
||||
|
||||
void reset_end_bol(const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void reset_end_bol(const std::true_type &)
|
||||
{
|
||||
_bol_state._end_bol = _bol_state._bol;
|
||||
}
|
||||
|
||||
template<typename iter_type>
|
||||
void end_state(iter_type &end_token_, iter_type &curr_)
|
||||
{
|
||||
if (*_ptr)
|
||||
{
|
||||
_end_state = true;
|
||||
reset_end_bol
|
||||
(std::integral_constant<bool, (flags & bol_bit) != 0>());
|
||||
_id = *(_ptr + id_index);
|
||||
_uid = *(_ptr + user_id_index);
|
||||
reset_recursive
|
||||
(std::integral_constant<bool, (flags & recursive_bit) != 0>());
|
||||
reset_start_state(std::integral_constant<bool,
|
||||
(flags & multi_state_bit) != 0>());
|
||||
end_token_ = curr_;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iter_type, typename char_type>
|
||||
void check_eol(iter_type &, iter_type &, const id_type,
|
||||
const char_type, const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
template<typename iter_type, typename char_type>
|
||||
void check_eol(iter_type &end_token_, iter_type &curr_,
|
||||
const id_type npos, const char_type eoi_, const std::true_type &)
|
||||
{
|
||||
if (_eol_state._EOL_state != npos && curr_ == eoi_)
|
||||
{
|
||||
_eol_state._EOL_state = _ptr[eol_index];
|
||||
|
||||
if (_eol_state._EOL_state)
|
||||
{
|
||||
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
|
||||
end_state(end_token_, curr_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
void pop(results &, const std::false_type &)
|
||||
{
|
||||
// Nothing to do
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
void pop(results &results_, const std::true_type &)
|
||||
{
|
||||
if (_recursive_state._pop)
|
||||
{
|
||||
_multi_state_state._start_state = results_.stack.top().first;
|
||||
results_.stack.pop();
|
||||
}
|
||||
else if (_recursive_state._push_dfa != results::npos())
|
||||
{
|
||||
results_.stack.push(typename results::id_type_pair
|
||||
(_recursive_state._push_dfa, _id));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
bool is_id_eoi(const id_type eoi_, const results &, const std::false_type &)
|
||||
{
|
||||
return _id == eoi_;
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
bool is_id_eoi(const id_type eoi_, const results &results_,
|
||||
const std::true_type &)
|
||||
{
|
||||
return _id == eoi_ || (_recursive_state._pop &&
|
||||
!results_.stack.empty() && results_.stack.top().second == eoi_);
|
||||
}
|
||||
|
||||
void start_state(id_type &, const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void start_state(id_type &start_state_, const std::true_type &)
|
||||
{
|
||||
start_state_ = _multi_state_state._start_state;
|
||||
}
|
||||
|
||||
void bol(bool &, const std::false_type &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void bol(bool &end_bol_, const std::true_type &)
|
||||
{
|
||||
end_bol_ = _bol_state._end_bol;
|
||||
}
|
||||
};
|
||||
|
||||
// Advance flag not set: results_.second is left where it is.
template<typename results>
void inc_end(results &, const std::false_type &)
{
    // Do nothing
}

// Advance flag set: results_.second is moved on by one position.
template<typename results>
void inc_end(results &results_, const std::true_type &)
{
    auto &end_ = results_.second;

    ++end_;
}
|
||||
|
||||
template<typename sm_type, std::size_t flags, typename results,
|
||||
bool compressed, bool recursive>
|
||||
void next(const sm_type &sm_, results &results_,
|
||||
const std::integral_constant<bool, compressed> &compressed_,
|
||||
const std::integral_constant<bool, recursive> &recursive_,
|
||||
const std::forward_iterator_tag &)
|
||||
{
|
||||
using id_type = typename sm_type::id_type;
|
||||
const auto &internals_ = sm_.data();
|
||||
auto end_token_ = results_.second;
|
||||
|
||||
skip:
|
||||
auto curr_ = results_.second;
|
||||
|
||||
results_.first = curr_;
|
||||
|
||||
again:
|
||||
if (curr_ == results_.eoi)
|
||||
{
|
||||
results_.id = internals_._eoi;
|
||||
results_.user_id = results::npos();
|
||||
return;
|
||||
}
|
||||
|
||||
lookup_state<typename sm_type::internals, id_type,
|
||||
typename results::index_type, flags> lu_state_
|
||||
(internals_, results_.bol, results_.state);
|
||||
lu_state_.bol_start_state
|
||||
(std::integral_constant<bool, (flags & bol_bit) != 0>());
|
||||
|
||||
while (curr_ != results_.eoi)
|
||||
{
|
||||
if (!lu_state_.is_eol(*curr_,
|
||||
std::integral_constant<bool, (flags & eol_bit) != 0>()))
|
||||
{
|
||||
const auto prev_char_ = *curr_;
|
||||
const id_type state_ = lu_state_.next_char(prev_char_,
|
||||
compressed_);
|
||||
|
||||
++curr_;
|
||||
lu_state_.bol(prev_char_,
|
||||
std::integral_constant<bool, (flags & bol_bit) != 0>());
|
||||
|
||||
if (state_ == 0)
|
||||
{
|
||||
lu_state_.is_eol(results::npos(),
|
||||
std::integral_constant<bool, (flags & eol_bit) != 0>());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lu_state_.end_state(end_token_, curr_);
|
||||
}
|
||||
|
||||
lu_state_.check_eol(end_token_, curr_, results::npos(), results_.eoi,
|
||||
std::integral_constant<bool, (flags & eol_bit) != 0>());
|
||||
|
||||
if (lu_state_._end_state)
|
||||
{
|
||||
// Return longest match
|
||||
lu_state_.pop(results_, recursive_);
|
||||
|
||||
lu_state_.start_state(results_.state,
|
||||
std::integral_constant<bool, (flags & multi_state_bit) != 0>());
|
||||
lu_state_.bol(results_.bol,
|
||||
std::integral_constant<bool, (flags & bol_bit) != 0>());
|
||||
results_.second = end_token_;
|
||||
|
||||
if (lu_state_._id == sm_.skip()) goto skip;
|
||||
|
||||
if (lu_state_.is_id_eoi(internals_._eoi, results_, recursive_))
|
||||
{
|
||||
curr_ = end_token_;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
results_.second = end_token_;
|
||||
results_.bol = *results_.second == '\n';
|
||||
results_.first = results_.second;
|
||||
// No match causes char to be skipped
|
||||
inc_end(results_,
|
||||
std::integral_constant<bool, (flags & advance_bit) != 0>());
|
||||
lu_state_._id = results::npos();
|
||||
lu_state_._uid = results::npos();
|
||||
}
|
||||
|
||||
results_.id = lu_state_._id;
|
||||
results_.user_id = lu_state_._uid;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iter_type, typename sm_type, std::size_t flags>
|
||||
void lookup(const sm_type &sm_, match_results<iter_type,
|
||||
typename sm_type::id_type, flags> &results_)
|
||||
{
|
||||
using value_type = typename std::iterator_traits<iter_type>::value_type;
|
||||
using cat = typename std::iterator_traits<iter_type>::iterator_category;
|
||||
|
||||
// If this asserts, you have either not defined all the correct
|
||||
// flags, or you should be using recursive_match_results instead
|
||||
// of match_results.
|
||||
assert((sm_.data()._features & flags) == sm_.data()._features);
|
||||
detail::next<sm_type, flags>(sm_, results_,
|
||||
std::integral_constant<bool, (sizeof(value_type) > 1)>(),
|
||||
std::false_type(), cat());
|
||||
}
|
||||
|
||||
template<typename iter_type, typename sm_type, std::size_t flags>
|
||||
void lookup(const sm_type &sm_, recursive_match_results<iter_type,
|
||||
typename sm_type::id_type, flags> &results_)
|
||||
{
|
||||
using value_type = typename std::iterator_traits<iter_type>::value_type;
|
||||
using cat = typename std::iterator_traits<iter_type>::iterator_category;
|
||||
|
||||
// If this asserts, you have not defined all the correct flags
|
||||
assert((sm_.data()._features & flags) == sm_.data()._features);
|
||||
detail::next<sm_type, flags | recursive_bit>(sm_, results_,
|
||||
std::integral_constant<bool, (sizeof(value_type) > 1)>(),
|
||||
std::true_type(), cat());
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
171
YACReaderLibrary/lexertl/match_results.hpp
Normal file
171
YACReaderLibrary/lexertl/match_results.hpp
Normal file
@ -0,0 +1,171 @@
|
||||
// match_results.hpp
|
||||
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_MATCH_RESULTS_HPP
|
||||
#define LEXERTL_MATCH_RESULTS_HPP
|
||||
|
||||
#include "char_traits.hpp"
|
||||
#include "enums.hpp"
|
||||
#include <iterator>
|
||||
#include <stack>
|
||||
#include <string>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename iter, typename id_type = uint16_t,
|
||||
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
|
||||
multi_state_bit | advance_bit>
|
||||
struct match_results
|
||||
{
|
||||
using iter_type = iter;
|
||||
using char_type = typename std::iterator_traits<iter_type>::value_type;
|
||||
using index_type = typename basic_char_traits<char_type>::index_type;
|
||||
using string = std::basic_string<char_type>;
|
||||
|
||||
id_type id;
|
||||
id_type user_id;
|
||||
iter_type first;
|
||||
iter_type second;
|
||||
iter_type eoi;
|
||||
bool bol;
|
||||
id_type state;
|
||||
|
||||
match_results() :
|
||||
id(0),
|
||||
user_id(npos()),
|
||||
first(iter_type()),
|
||||
second(iter_type()),
|
||||
eoi(iter_type()),
|
||||
bol(true),
|
||||
state(0)
|
||||
{
|
||||
}
|
||||
|
||||
match_results(const iter_type &start_, const iter_type &end_) :
|
||||
id(0),
|
||||
user_id(npos()),
|
||||
first(start_),
|
||||
second(start_),
|
||||
eoi(end_),
|
||||
bol(true),
|
||||
state(0)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~match_results()
|
||||
{
|
||||
}
|
||||
|
||||
string str() const
|
||||
{
|
||||
return string(first, second);
|
||||
}
|
||||
|
||||
string substr(const std::size_t soffset_, const std::size_t eoffset_) const
|
||||
{
|
||||
return string(first + soffset_, second - eoffset_);
|
||||
}
|
||||
|
||||
virtual void clear()
|
||||
{
|
||||
id = 0;
|
||||
user_id = npos();
|
||||
first = eoi;
|
||||
second = eoi;
|
||||
bol = true;
|
||||
state = 0;
|
||||
}
|
||||
|
||||
virtual void reset(const iter_type &start_, const iter_type &end_)
|
||||
{
|
||||
id = 0;
|
||||
user_id = npos();
|
||||
first = start_;
|
||||
second = start_;
|
||||
eoi = end_;
|
||||
bol = true;
|
||||
state = 0;
|
||||
}
|
||||
|
||||
static id_type npos()
|
||||
{
|
||||
return static_cast<id_type>(~0);
|
||||
}
|
||||
|
||||
static id_type skip()
|
||||
{
|
||||
return static_cast<id_type>(~1);
|
||||
}
|
||||
|
||||
bool operator ==(const match_results &rhs_) const
|
||||
{
|
||||
return id == rhs_.id &&
|
||||
user_id == rhs_.user_id &&
|
||||
first == rhs_.first &&
|
||||
second == rhs_.second &&
|
||||
eoi == rhs_.eoi &&
|
||||
bol == rhs_.bol &&
|
||||
state == rhs_.state;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename iter, typename id_type = uint16_t,
|
||||
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
|
||||
multi_state_bit | recursive_bit | advance_bit>
|
||||
struct recursive_match_results : public match_results<iter, id_type, flags>
|
||||
{
|
||||
using id_type_pair = std::pair<id_type, id_type>;
|
||||
std::stack<id_type_pair> stack;
|
||||
|
||||
recursive_match_results() :
|
||||
match_results<iter, id_type, flags>(),
|
||||
stack()
|
||||
{
|
||||
}
|
||||
|
||||
recursive_match_results(const iter &start_, const iter &end_) :
|
||||
match_results<iter, id_type, flags>(start_, end_),
|
||||
stack()
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~recursive_match_results() override
|
||||
{
|
||||
}
|
||||
|
||||
virtual void clear() override
|
||||
{
|
||||
match_results<iter, id_type, flags>::clear();
|
||||
|
||||
while (!stack.empty()) stack.pop();
|
||||
}
|
||||
|
||||
virtual void reset(const iter &start_, const iter &end_) override
|
||||
{
|
||||
match_results<iter, id_type, flags>::reset(start_, end_);
|
||||
|
||||
while (!stack.empty()) stack.pop();
|
||||
}
|
||||
};
|
||||
|
||||
using smatch = match_results<std::string::const_iterator>;
|
||||
using cmatch = match_results<const char *>;
|
||||
using wsmatch = match_results<std::wstring::const_iterator>;
|
||||
using wcmatch = match_results<const wchar_t *>;
|
||||
using u32smatch = match_results<std::u32string::const_iterator>;
|
||||
using u32cmatch = match_results<const char32_t *>;
|
||||
|
||||
using srmatch =
|
||||
recursive_match_results<std::string::const_iterator>;
|
||||
using crmatch = recursive_match_results<const char *>;
|
||||
using wsrmatch =
|
||||
recursive_match_results<std::wstring::const_iterator>;
|
||||
using wcrmatch = recursive_match_results<const wchar_t *>;
|
||||
using u32srmatch =
|
||||
recursive_match_results<std::u32string::const_iterator>;
|
||||
using u32crmatch = recursive_match_results<const char32_t *>;
|
||||
}
|
||||
|
||||
#endif
|
138
YACReaderLibrary/lexertl/memory_file.hpp
Normal file
138
YACReaderLibrary/lexertl/memory_file.hpp
Normal file
@ -0,0 +1,138 @@
|
||||
// memory_file.hpp
|
||||
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
// Inspired by http://en.wikibooks.org/wiki/Optimizing_C%2B%2B/
|
||||
// General_optimization_techniques/Input/Output#Memory-mapped_file
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_MEMORY_FILE_HPP
|
||||
#define LEXERTL_MEMORY_FILE_HPP
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
|
||||
// Only files small enough to fit into memory are supported.
|
||||
namespace lexertl
|
||||
{
|
||||
// Read-only memory-mapped view of a whole file, exposed as an array of
// char_type. Only files small enough to fit into memory are supported.
//
// An object that has no file mapped reports data() == nullptr and
// size() == 0. Objects are neither copyable nor assignable.
template<typename char_type>
class basic_memory_file
{
public:
    // Creates an object with no file mapped.
    basic_memory_file()
    {
    }

    // Maps pathname_. On failure the object is left with no file mapped;
    // check data() to find out.
    basic_memory_file(const char *pathname_)
    {
        open(pathname_);
    }

    ~basic_memory_file()
    {
        close();
    }

    // Maps pathname_ read-only, releasing any previous mapping first.
    // On failure every handle acquired along the way is released again and
    // data() returns nullptr.
    void open(const char *pathname_)
    {
        // Unconditional: a previous open() may have failed after acquiring
        // a handle, and close() is safe on an unopened object.
        close();

#ifdef _WIN32
        _fh = ::CreateFileA(pathname_, GENERIC_READ, FILE_SHARE_READ, 0,
            OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);

        if (_fh != INVALID_HANDLE_VALUE)
        {
            _fmh = ::CreateFileMapping(_fh, 0, PAGE_READONLY, 0, 0, 0);

            if (_fmh != 0)
            {
                _data = static_cast<const char_type *>(::MapViewOfFile
                    (_fmh, FILE_MAP_READ, 0, 0, 0));

                if (_data) _size = ::GetFileSize(_fh, 0) / sizeof(char_type);
            }
        }
#else
        _fh = ::open(pathname_, O_RDONLY);

        if (_fh > -1)
        {
            struct stat sbuf_;

            if (::fstat(_fh, &sbuf_) > -1)
            {
                const std::size_t bytes_ =
                    static_cast<std::size_t>(sbuf_.st_size);
                void *map_ =
                    ::mmap(0, bytes_, PROT_READ, MAP_SHARED, _fh, 0);

                if (map_ != MAP_FAILED)
                {
                    _data = static_cast<const char_type *>(map_);
                    // munmap() needs the length in bytes, which differs
                    // from _size when sizeof(char_type) > 1.
                    _bytes = bytes_;
                    _size = bytes_ / sizeof(char_type);
                }
            }
        }
#endif

        // Do not keep a half-open file around after a failure.
        if (!_data) close();
    }

    // Start of the mapped file, or nullptr when no file is mapped.
    const char_type *data() const
    {
        return _data;
    }

    // Number of whole char_type elements in the mapped file.
    std::size_t size() const
    {
        return _size;
    }

    // Releases the mapping and the file handle, if any. Safe to call on an
    // object that has no file mapped.
    void close()
    {
#ifdef _WIN32
        if (_data) ::UnmapViewOfFile(_data);
        if (_fmh != 0) ::CloseHandle(_fmh);
        if (_fh != INVALID_HANDLE_VALUE) ::CloseHandle(_fh);

        _fmh = 0;
        _fh = INVALID_HANDLE_VALUE;
#else
        if (_data) ::munmap(const_cast<char_type *>(_data), _bytes);
        // Descriptor 0 is a valid descriptor (stdin), so "not open" has to
        // be represented by -1, never by 0.
        if (_fh > -1) ::close(_fh);

        _bytes = 0;
        _fh = -1;
#endif
        _data = nullptr;
        _size = 0;
    }

private:
    const char_type *_data = nullptr;
    std::size_t _size = 0;
#ifdef _WIN32
    HANDLE _fh = INVALID_HANDLE_VALUE;
    HANDLE _fmh = 0;
#else
    // Length of the mapping in bytes, as passed to mmap().
    std::size_t _bytes = 0;
    int _fh = -1;
#endif

    // No copy construction.
    basic_memory_file(const basic_memory_file &) = delete;
    // No assignment.
    basic_memory_file &operator =(const basic_memory_file &) = delete;
};
|
||||
|
||||
using memory_file = basic_memory_file<char>;
|
||||
using wmemory_file = basic_memory_file<wchar_t>;
|
||||
}
|
||||
|
||||
#endif
|
25
YACReaderLibrary/lexertl/narrow.hpp
Normal file
25
YACReaderLibrary/lexertl/narrow.hpp
Normal file
@ -0,0 +1,25 @@
|
||||
// narrow.hpp
|
||||
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_NARROW_HPP
|
||||
#define LEXERTL_NARROW_HPP
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Appends the NUL-terminated string str_ to ss_, converting each character
// with a plain cast to char.
template<typename char_type>
void narrow(const char_type *str_, std::ostringstream &ss_)
{
    // Safe to simply cast to char
    // when string only contains ASCII.
    for (const char_type *curr_ = str_; *curr_; ++curr_)
    {
        ss_ << static_cast<char>(*curr_);
    }
}
|
||||
}
|
||||
|
||||
#endif
|
16
YACReaderLibrary/lexertl/observer_ptr.hpp
Normal file
16
YACReaderLibrary/lexertl/observer_ptr.hpp
Normal file
@ -0,0 +1,16 @@
|
||||
// observer_ptr.hpp
|
||||
// Copyright (c) 2017-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_OBSERVER_PTR_HPP
|
||||
#define LEXERTL_OBSERVER_PTR_HPP
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename T>
|
||||
using observer_ptr = T *;
|
||||
}
|
||||
|
||||
#endif
|
926
YACReaderLibrary/lexertl/parser/parser.hpp
Normal file
926
YACReaderLibrary/lexertl/parser/parser.hpp
Normal file
@ -0,0 +1,926 @@
|
||||
// parser.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_PARSER_HPP
|
||||
#define LEXERTL_PARSER_HPP
|
||||
|
||||
#include <assert.h>
|
||||
#include <algorithm>
|
||||
#include "tree/end_node.hpp"
|
||||
#include "tree/iteration_node.hpp"
|
||||
#include "tree/leaf_node.hpp"
|
||||
#include <map>
|
||||
#include "tokeniser/re_tokeniser.hpp"
|
||||
#include "../runtime_error.hpp"
|
||||
#include "tree/selection_node.hpp"
|
||||
#include "tree/sequence_node.hpp"
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
/*
|
||||
General principles of regex parsing:
|
||||
- Every regex is a sequence of sub-regexes.
|
||||
- Regexes consist of operands and operators
|
||||
- All operators decompose to sequence, selection ('|') and iteration ('*')
|
||||
- Regex tokens are stored on a stack.
|
||||
- When a complete sequence of regex tokens is on the stack it is processed.
|
||||
|
||||
Grammar:
|
||||
|
||||
<REGEX> -> <OREXP>
|
||||
<OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
|
||||
<SEQUENCE> -> <SUB>
|
||||
<SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
|
||||
<EXPRESSION> -> <REPEAT>
|
||||
<REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
|
||||
<DUPLICATE> -> '?' | '??' | '*' | '*?' | '+' | '+?' | '{n[,[m]]}' |
|
||||
'{n[,[m]]}?'
|
||||
*/
|
||||
|
||||
template<typename rules_char_type, typename sm_traits>
|
||||
class basic_parser
|
||||
{
|
||||
public:
|
||||
enum {char_24_bit = sm_traits::char_24_bit};
|
||||
using char_type = typename sm_traits::char_type;
|
||||
using id_type = typename sm_traits::id_type;
|
||||
using end_node = basic_end_node<id_type>;
|
||||
using input_char_type = typename sm_traits::input_char_type;
|
||||
using input_string_token = basic_string_token<input_char_type>;
|
||||
using iteration_node = basic_iteration_node<id_type>;
|
||||
using leaf_node = basic_leaf_node<id_type>;
|
||||
using tokeniser =
|
||||
basic_re_tokeniser<rules_char_type, input_char_type, id_type>;
|
||||
using node = basic_node<id_type>;
|
||||
using node_ptr_vector = typename node::node_ptr_vector;
|
||||
using string = std::basic_string<rules_char_type>;
|
||||
using string_token = basic_string_token<char_type>;
|
||||
using selection_node = basic_selection_node<id_type>;
|
||||
using sequence_node = basic_sequence_node<id_type>;
|
||||
using charset_map = std::map<string_token, id_type>;
|
||||
using charset_pair = std::pair<string_token, id_type>;
|
||||
using compressed = std::integral_constant<bool, sm_traits::compressed>;
|
||||
using token = basic_re_token<rules_char_type, input_char_type>;
|
||||
static_assert(std::is_move_assignable<token>::value &&
|
||||
std::is_move_constructible<token>::value,
|
||||
"token is not movable.");
|
||||
using token_vector = std::vector<token>;
|
||||
|
||||
basic_parser(const std::locale &locale_,
|
||||
node_ptr_vector &node_ptr_vector_,
|
||||
charset_map &charset_map_, const id_type eoi_) :
|
||||
_locale(locale_),
|
||||
_node_ptr_vector(node_ptr_vector_),
|
||||
_charset_map(charset_map_),
|
||||
_eoi(eoi_),
|
||||
_token_stack(),
|
||||
_tree_node_stack()
|
||||
{
|
||||
}
|
||||
|
||||
// Parses the token sequence regex_ into a syntax tree and returns its root.
//
// Tokens are shifted onto _token_stack while the precedence of the stack top
// against the next token is '<' or '=', and reduce() is run when it is '>'.
// Once the token stack is empty, the single remaining tree is joined with an
// end node carrying id_, user_id_, next_dfa_, push_dfa_ and pop_dfa_.
// All nodes are owned by _node_ptr_vector; the returned pointer is
// non-owning.
//
// nl_id_ is forwarded to reduce(). When seen_bol_ is set the finished tree
// is passed through fixup_bol().
//
// Throws runtime_error on a syntax error, on an empty rule, and - unless
// flags_ contains match_zero_len - on a rule that can match zero characters.
observer_ptr<node> parse(const token_vector &regex_, const id_type id_,
    const id_type user_id_, const id_type next_dfa_,
    const id_type push_dfa_, const bool pop_dfa_,
    const std::size_t flags_, id_type &nl_id_, const bool seen_bol_)
{
    // There cannot be less than 2 tokens
    assert(regex_.size() >= 2);

    auto iter_ = regex_.cbegin();
    auto end_ = regex_.cend();
    observer_ptr<node> root_ = nullptr;
    observer_ptr<token> lhs_token_ = nullptr;
    auto rhs_token_ = std::make_unique<token>(*iter_++);
    char action_ = 0;

    _token_stack.emplace(std::move(rhs_token_));
    rhs_token_ = std::make_unique<token>(*iter_);

    // Never step past the last token: it keeps being re-read as the
    // look-ahead until the stack has been fully reduced.
    if (iter_ + 1 != end_) ++iter_;

    do
    {
        lhs_token_ = _token_stack.top().get();
        action_ = lhs_token_->precedence(rhs_token_->_type);

        switch (action_)
        {
        case '<':
        case '=':
            // Shift
            _token_stack.emplace(std::move(rhs_token_));
            rhs_token_ = std::make_unique<token>(*iter_);

            if (iter_ + 1 != end_) ++iter_;

            break;
        case '>':
            reduce(nl_id_);
            break;
        default:
        {
            std::ostringstream ss_;

            ss_ << "A syntax error occurred: '" <<
                lhs_token_->precedence_string() <<
                "' against '" << rhs_token_->precedence_string() <<
                "' in rule id " << id_ << '.';
            throw runtime_error(ss_.str());
        }
        }
    } while (!_token_stack.empty());

    if (_tree_node_stack.empty())
    {
        std::ostringstream ss_;

        ss_ << "Empty rules are not allowed in rule id " <<
            id_ << '.';
        throw runtime_error(ss_.str());
    }

    assert(_tree_node_stack.size() == 1);

    observer_ptr<node> lhs_node_ = _tree_node_stack.top();

    _tree_node_stack.pop();
    _node_ptr_vector.emplace_back(std::make_unique<end_node>
        (id_, user_id_, next_dfa_, push_dfa_, pop_dfa_));

    observer_ptr<node> rhs_node_ = _node_ptr_vector.back().get();

    _node_ptr_vector.emplace_back(std::make_unique<sequence_node>
        (lhs_node_, rhs_node_));
    root_ = _node_ptr_vector.back().get();

    if (seen_bol_)
    {
        fixup_bol(root_);
    }

    if ((flags_ & match_zero_len) == 0)
    {
        const auto &firstpos_ = root_->firstpos();

        // An end state among the first positions means the rule can
        // complete without consuming any input.
        for (observer_ptr<const node> node_ : firstpos_)
        {
            if (node_->end_state())
            {
                std::ostringstream ss_;

                ss_ << "Rules that match zero characters are not allowed "
                    "as this can cause an infinite loop in user code. The "
                    "match_zero_len flag overrides this check. Rule id " <<
                    id_ << '.';
                throw runtime_error(ss_.str());
            }
        }
    }

    return root_;
}
|
||||
|
||||
// Id stored in the leaf node created for a beginning-of-line token:
// ~1 converted to id_type.
static id_type bol_token()
{
    const id_type bol_id_ = static_cast<id_type>(~1);

    return bol_id_;
}
|
||||
|
||||
// Id stored in the leaf node created for an end-of-line token:
// ~2 converted to id_type.
static id_type eol_token()
{
    const id_type eol_id_ = static_cast<id_type>(~2);

    return eol_id_;
}
|
||||
|
||||
private:
|
||||
using input_range = typename input_string_token::range;
|
||||
using range = typename string_token::range;
|
||||
using string_token_vector = std::vector<std::unique_ptr<string_token>>;
|
||||
using token_stack = std::stack<std::unique_ptr<token>>;
|
||||
using tree_node_stack = typename node::node_stack;
|
||||
|
||||
const std::locale &_locale;
|
||||
node_ptr_vector &_node_ptr_vector;
|
||||
charset_map &_charset_map;
|
||||
id_type _eoi;
|
||||
token_stack _token_stack;
|
||||
tree_node_stack _tree_node_stack;
|
||||
|
||||
// Pops one handle off _token_stack and applies the grammar action selected
// by the type of the last token popped.
//
// Tokens are moved from _token_stack to a local handle stack for as long as
// the token below relates to the one just moved with '='. Throws
// runtime_error for a token type that has no action.
void reduce(id_type &nl_id_)
{
    token_stack handle_;
    observer_ptr<token> popped_ = nullptr;
    char action_ = 0;

    for (;;)
    {
        handle_.emplace(std::move(_token_stack.top()));
        _token_stack.pop();
        popped_ = handle_.top().get();

        if (_token_stack.empty()) break;

        action_ = _token_stack.top()->precedence(popped_->_type);

        if (action_ != '=') break;
    }

    assert(_token_stack.empty() || action_ == '<');

    // Pushes a freshly created token of the given type.
    const auto push_ = [this](const auto type_)
    {
        _token_stack.emplace(std::make_unique<token>(type_));
    };
    const auto type_ = popped_->_type;

    switch (type_)
    {
    case BEGIN:
        // finished processing so exit
    case REGEX:
        // finished parsing, nothing to do
        break;
    case OREXP:
        orexp(handle_);
        break;
    case SEQUENCE:
        push_(OREXP);
        break;
    case SUB:
        sub(handle_);
        break;
    case EXPRESSION:
        push_(SUB);
        break;
    case REPEAT:
        repeat(handle_);
        break;
    case BOL:
        bol(handle_);
        break;
    case EOL:
        eol(handle_, nl_id_);
        break;
    case CHARSET:
        charset(handle_, compressed());
        break;
    case OPENPAREN:
        openparen(handle_);
        break;
    case OPT:
    case AOPT:
        optional(type_ == OPT);
        push_(DUP);
        break;
    case ZEROORMORE:
    case AZEROORMORE:
        zero_or_more(type_ == ZEROORMORE);
        push_(DUP);
        break;
    case ONEORMORE:
    case AONEORMORE:
        one_or_more(type_ == ONEORMORE);
        push_(DUP);
        break;
    case REPEATN:
    case AREPEATN:
        repeatn(type_ == REPEATN, handle_.top().get());
        push_(DUP);
        break;
    default:
        throw runtime_error
            ("Internal error in regex_parser::reduce.");
    }
}
|
||||
|
||||
// Reduces an OREXP handle. A lone OREXP becomes REGEX; the three token form
// OREXP OR SEQUENCE combines the two topmost trees with perform_or() and
// becomes OREXP again.
void orexp(token_stack &handle_)
{
    assert(handle_.top()->_type == OREXP &&
        (handle_.size() == 1 || handle_.size() == 3));

    if (handle_.size() != 1)
    {
        handle_.pop();
        assert(handle_.top()->_type == OR);
        handle_.pop();
        assert(handle_.top()->_type == SEQUENCE);
        perform_or();
        _token_stack.emplace(std::make_unique<token>(OREXP));
        return;
    }

    _token_stack.emplace(std::make_unique<token>(REGEX));
}
|
||||
|
||||
void perform_or()
|
||||
{
|
||||
// perform or
|
||||
observer_ptr<node> rhs_ = _tree_node_stack.top();
|
||||
|
||||
_tree_node_stack.pop();
|
||||
|
||||
observer_ptr<node> lhs_ = _tree_node_stack.top();
|
||||
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<selection_node>(lhs_, rhs_));
|
||||
_tree_node_stack.top() = _node_ptr_vector.back().get();
|
||||
}
|
||||
|
||||
// Reduces a SUB handle. A lone SUB becomes SEQUENCE; the two token form
// SUB EXPRESSION joins the two topmost trees with sequence() and becomes
// SUB again.
void sub(token_stack &handle_)
{
    assert((handle_.top()->_type == SUB &&
        handle_.size() == 1) || handle_.size() == 2);

    if (handle_.size() != 1)
    {
        handle_.pop();
        assert(handle_.top()->_type == EXPRESSION);
        // perform join
        sequence();
        _token_stack.emplace(std::make_unique<token>(SUB));
        return;
    }

    _token_stack.emplace(std::make_unique<token>(SEQUENCE));
}
|
||||
|
||||
// Reduces a REPEAT handle. A lone REPEAT becomes EXPRESSION; a REPEAT
// followed by DUP becomes REPEAT again.
void repeat(token_stack &handle_)
{
    assert(handle_.top()->_type == REPEAT &&
        handle_.size() >= 1 && handle_.size() <= 3);

    const bool lone_ = handle_.size() == 1;

    if (!lone_)
    {
        handle_.pop();
        assert(handle_.top()->_type == DUP);
    }

    _token_stack.emplace(std::make_unique<token>
        (lone_ ? EXPRESSION : REPEAT));
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
void bol(token_stack &handle_)
|
||||
#else
|
||||
void bol(token_stack &)
|
||||
#endif
|
||||
{
|
||||
assert(handle_.top()->_type == BOL &&
|
||||
handle_.size() == 1);
|
||||
|
||||
// store charset
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<leaf_node>(bol_token(), true));
|
||||
_tree_node_stack.push(_node_ptr_vector.back().get());
|
||||
_token_stack.emplace(std::make_unique<token>(REPEAT));
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
void eol(token_stack &handle_, id_type &nl_id_)
|
||||
#else
|
||||
void eol(token_stack &, id_type &nl_id_)
|
||||
#endif
|
||||
{
|
||||
const string_token nl_('\n');
|
||||
const id_type temp_nl_id_ = lookup(nl_);
|
||||
|
||||
assert(handle_.top()->_type == EOL &&
|
||||
handle_.size() == 1);
|
||||
|
||||
if (temp_nl_id_ != ~static_cast<id_type>(0))
|
||||
{
|
||||
nl_id_ = temp_nl_id_;
|
||||
}
|
||||
|
||||
// store charset
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<leaf_node>(eol_token(), true));
|
||||
_tree_node_stack.push(_node_ptr_vector.back().get());
|
||||
_token_stack.emplace(std::make_unique<token>(REPEAT));
|
||||
}
|
||||
|
||||
// Uncompressed
|
||||
void charset(token_stack &handle_, const std::false_type &)
|
||||
{
|
||||
assert(handle_.top()->_type == CHARSET &&
|
||||
handle_.size() == 1);
|
||||
|
||||
const id_type id_ = lookup(handle_.top()->_str);
|
||||
|
||||
// store charset
|
||||
_node_ptr_vector.emplace_back(std::make_unique<leaf_node>(id_, true));
|
||||
_tree_node_stack.push(_node_ptr_vector.back().get());
|
||||
_token_stack.emplace(std::make_unique<token>(REPEAT));
|
||||
}
|
||||
|
||||
// Compressed
|
||||
void charset(token_stack &handle_, const std::true_type &)
|
||||
{
|
||||
assert(handle_.top()->_type == CHARSET &&
|
||||
handle_.size() == 1);
|
||||
|
||||
std::unique_ptr<token> token_(handle_.top().release());
|
||||
|
||||
handle_.pop();
|
||||
create_sequence(token_);
|
||||
}
|
||||
|
||||
// Slice wchar_t into sequence of char.
|
||||
void create_sequence(std::unique_ptr<token> &token_)
|
||||
{
|
||||
string_token_vector data_[char_24_bit ? 3 : 2];
|
||||
|
||||
for (const input_range &range_ : token_->_str._ranges)
|
||||
{
|
||||
slice_range(range_, data_,
|
||||
std::integral_constant<bool, char_24_bit>());
|
||||
}
|
||||
|
||||
push_ranges(data_, std::integral_constant<bool, char_24_bit>());
|
||||
|
||||
_token_stack.emplace(std::make_unique<token>(OPENPAREN));
|
||||
_token_stack.emplace(std::make_unique<token>(REGEX));
|
||||
_token_stack.emplace(std::make_unique<token>(CLOSEPAREN));
|
||||
}
|
||||
|
||||
// 16 bit unicode
|
||||
void slice_range(const input_range &range_, string_token_vector data_[2],
|
||||
const std::false_type &)
|
||||
{
|
||||
const unsigned char first_msb_ = static_cast<unsigned char>
|
||||
((range_.first >> 8) & 0xff);
|
||||
const unsigned char first_lsb_ = static_cast<unsigned char>
|
||||
(range_.first & 0xff);
|
||||
const unsigned char second_msb_ = static_cast<unsigned char>
|
||||
((range_.second >> 8) & 0xff);
|
||||
const unsigned char second_lsb_ = static_cast<unsigned char>
|
||||
(range_.second & 0xff);
|
||||
|
||||
if (first_msb_ == second_msb_)
|
||||
{
|
||||
insert_range(first_msb_, first_msb_, first_lsb_,
|
||||
second_lsb_, data_);
|
||||
}
|
||||
else
|
||||
{
|
||||
insert_range(first_msb_, first_msb_, first_lsb_, 0xff, data_);
|
||||
|
||||
if (second_msb_ > first_msb_ + 1)
|
||||
{
|
||||
insert_range(first_msb_ + 1, second_msb_ - 1, 0, 0xff, data_);
|
||||
}
|
||||
|
||||
insert_range(second_msb_, second_msb_, 0, second_lsb_, data_);
|
||||
}
|
||||
}
|
||||
|
||||
// 24 bit unicode
|
||||
void slice_range(const input_range &range_, string_token_vector data_[3],
|
||||
const std::true_type &)
|
||||
{
|
||||
const unsigned char first_msb_ = static_cast<unsigned char>
|
||||
((range_.first >> 16) & 0xff);
|
||||
const unsigned char first_mid_ = static_cast<unsigned char>
|
||||
((range_.first >> 8) & 0xff);
|
||||
const unsigned char first_lsb_ = static_cast<unsigned char>
|
||||
(range_.first & 0xff);
|
||||
const unsigned char second_msb_ = static_cast<unsigned char>
|
||||
((range_.second >> 16) & 0xff);
|
||||
const unsigned char second_mid_ = static_cast<unsigned char>
|
||||
((range_.second >> 8) & 0xff);
|
||||
const unsigned char second_lsb_ = static_cast<unsigned char>
|
||||
(range_.second & 0xff);
|
||||
|
||||
if (first_msb_ == second_msb_)
|
||||
{
|
||||
string_token_vector data2_[2];
|
||||
|
||||
// Re-use 16 bit slice function
|
||||
slice_range(range_, data2_, std::false_type());
|
||||
|
||||
for (std::size_t i_ = 0, size_ = data2_[0].size();
|
||||
i_ < size_; ++i_)
|
||||
{
|
||||
insert_range(string_token(first_msb_, first_msb_),
|
||||
*data2_[0][i_], *data2_[1][i_], data_);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
insert_range(first_msb_, first_msb_,
|
||||
first_mid_, first_mid_,
|
||||
first_lsb_, 0xff, data_);
|
||||
|
||||
if (first_mid_ != 0xff)
|
||||
{
|
||||
insert_range(first_msb_, first_msb_,
|
||||
first_mid_ + 1, 0xff,
|
||||
0, 0xff, data_);
|
||||
}
|
||||
|
||||
if (second_msb_ > first_msb_ + 1)
|
||||
{
|
||||
insert_range(first_mid_ + 1, second_mid_ - 1,
|
||||
0, 0xff,
|
||||
0, 0xff, data_);
|
||||
}
|
||||
|
||||
if (second_mid_ != 0)
|
||||
{
|
||||
insert_range(second_msb_, second_msb_,
|
||||
0, second_mid_ - 1,
|
||||
0, 0xff, data_);
|
||||
insert_range(second_msb_, second_msb_,
|
||||
second_mid_, second_mid_,
|
||||
0, second_lsb_, data_);
|
||||
}
|
||||
else
|
||||
{
|
||||
insert_range(second_msb_, second_msb_,
|
||||
0, second_mid_,
|
||||
0, second_lsb_, data_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 16 bit unicode
|
||||
void insert_range(const unsigned char first_, const unsigned char second_,
|
||||
const unsigned char first2_, const unsigned char second2_,
|
||||
string_token_vector data_[2])
|
||||
{
|
||||
const string_token token_(first_ > second_ ? second_ : first_,
|
||||
first_ > second_ ? first_ : second_);
|
||||
const string_token token2_(first2_ > second2_ ? second2_ : first2_,
|
||||
first2_ > second2_ ? first2_ : second2_);
|
||||
|
||||
insert_range(token_, token2_, data_);
|
||||
}
|
||||
|
||||
void insert_range(const string_token &token_, const string_token &token2_,
|
||||
string_token_vector data_[2])
|
||||
{
|
||||
typename string_token_vector::const_iterator iter_ =
|
||||
std::find_if(data_[0].begin(), data_[0].end(),
|
||||
[&token_](const std::unique_ptr<string_token> &rhs_)
|
||||
{
|
||||
return token_ == *rhs_.get();
|
||||
});
|
||||
|
||||
if (iter_ == data_[0].end())
|
||||
{
|
||||
data_[0].emplace_back(std::make_unique<string_token>(token_));
|
||||
data_[1].emplace_back(std::make_unique<string_token>(token2_));
|
||||
}
|
||||
else
|
||||
{
|
||||
const std::size_t index_ = iter_ - data_[0].begin();
|
||||
|
||||
data_[1][index_]->insert(token2_);
|
||||
}
|
||||
}
|
||||
|
||||
// 24 bit unicode
|
||||
void insert_range(const unsigned char first_, const unsigned char second_,
|
||||
const unsigned char first2_, const unsigned char second2_,
|
||||
const unsigned char first3_, const unsigned char second3_,
|
||||
string_token_vector data_[3])
|
||||
{
|
||||
const string_token token_(first_ > second_ ? second_ : first_,
|
||||
first_ > second_ ? first_ : second_);
|
||||
const string_token token2_(first2_ > second2_ ? second2_ : first2_,
|
||||
first2_ > second2_ ? first2_ : second2_);
|
||||
const string_token token3_(first3_ > second3_ ? second3_ : first3_,
|
||||
first3_ > second3_ ? first3_ : second3_);
|
||||
|
||||
insert_range(token_, token2_, token3_, data_);
|
||||
}
|
||||
|
||||
void insert_range(const string_token &token_, const string_token &token2_,
|
||||
const string_token &token3_, string_token_vector data_[3])
|
||||
{
|
||||
auto iter_ = data_[0].cbegin();
|
||||
auto end_ = data_[0].cend();
|
||||
bool finished_ = false;
|
||||
|
||||
do
|
||||
{
|
||||
iter_ = std::find_if(iter_, end_,
|
||||
[&token_](const std::unique_ptr<string_token> &rhs_)
|
||||
{
|
||||
return token_ == *rhs_.get();
|
||||
});
|
||||
|
||||
if (iter_ == end_)
|
||||
{
|
||||
data_[0].emplace_back(std::make_unique<string_token>(token_));
|
||||
data_[1].emplace_back(std::make_unique<string_token>(token2_));
|
||||
data_[2].emplace_back(std::make_unique<string_token>(token3_));
|
||||
finished_ = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
const std::size_t index_ = iter_ - data_[0].begin();
|
||||
|
||||
if (*data_[1][index_] == token2_)
|
||||
{
|
||||
data_[2][index_]->insert(token3_);
|
||||
finished_ = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
}
|
||||
} while (!finished_);
|
||||
}
|
||||
|
||||
// 16 bit unicode
|
||||
// Two-vector overload: for every index pushes the pair of ranges as a
// sequence, and ORs each sequence after the first with the result so far.
// The first entry is processed unconditionally, so data_[0] is expected
// to be non-empty.
void push_ranges(string_token_vector data_[2], const std::false_type &)
{
    auto lead_ = data_[0].cbegin();
    const auto lead_end_ = data_[0].cend();
    auto trail_ = data_[1].cbegin();
    bool first_pair_ = true;

    do
    {
        push_range(lead_->get());
        ++lead_;
        push_range(trail_->get());
        ++trail_;
        sequence();

        if (first_pair_)
        {
            first_pair_ = false;
        }
        else
        {
            perform_or();
        }
    } while (lead_ != lead_end_);
}
|
||||
|
||||
// 24 bit unicode
|
||||
// Three-vector overload: for every index pushes the three ranges as a
// left-nested sequence, and ORs each sequence after the first with the
// result so far. The first entry is processed unconditionally, so
// data_[0] is expected to be non-empty.
void push_ranges(string_token_vector data_[3], const std::true_type &)
{
    auto lead_ = data_[0].cbegin();
    const auto lead_end_ = data_[0].cend();
    auto middle_ = data_[1].cbegin();
    auto trail_ = data_[2].cbegin();
    bool first_triple_ = true;

    do
    {
        push_range(lead_->get());
        ++lead_;
        push_range(middle_->get());
        ++middle_;
        sequence();
        push_range(trail_->get());
        ++trail_;
        sequence();

        if (first_triple_)
        {
            first_triple_ = false;
        }
        else
        {
            perform_or();
        }
    } while (lead_ != lead_end_);
}
|
||||
|
||||
// Creates a leaf node for the charset id of *token_ (see lookup()) and
// pushes it onto the tree node stack. The node itself is owned by
// _node_ptr_vector.
void push_range(observer_ptr<const string_token> token_)
{
    _node_ptr_vector.emplace_back
        (std::make_unique<leaf_node>(lookup(*token_), true));
    _tree_node_stack.push(_node_ptr_vector.back().get());
}
|
||||
|
||||
// Returns the id associated with charset_ in _charset_map, registering
// the charset under the next free id (the current map size) when it has
// not been seen before.
// Throws runtime_error if the value of sm_traits::npos() does not survive
// conversion to id_type.
id_type lookup(const string_token &charset_)
{
    const std::size_t npos_ = sm_traits::npos();

    if (static_cast<id_type>(npos_) < npos_)
    {
        throw runtime_error("id_type is not large enough "
            "to hold all ids.");
    }

    const auto found_ = _charset_map.find(charset_);

    if (found_ != _charset_map.end())
    {
        return found_->second;
    }

    // Ids are handed out sequentially in order of first appearance.
    const std::size_t new_id_ = _charset_map.size();

    _charset_map.insert(charset_pair(charset_,
        static_cast<id_type>(new_id_)));
    return static_cast<id_type>(new_id_);
}
|
||||
|
||||
// Reduction for a parenthesised group. The handle is expected to hold
// exactly OPENPAREN, REGEX, CLOSEPAREN (top to bottom); this is checked
// with asserts only. Two entries are popped from the handle and a REPEAT
// token is pushed onto _token_stack.
void openparen(token_stack &handle_)
{
    assert(handle_.top()->_type == OPENPAREN &&
        handle_.size() == 3);
    handle_.pop();

    assert(handle_.top()->_type == REGEX);
    handle_.pop();

    assert(handle_.top()->_type == CLOSEPAREN);
    _token_stack.emplace(std::make_unique<token>(REPEAT));
}
|
||||
|
||||
void sequence()
|
||||
{
|
||||
observer_ptr<node> rhs_ = _tree_node_stack.top();
|
||||
|
||||
_tree_node_stack.pop();
|
||||
|
||||
observer_ptr<node> lhs_ = _tree_node_stack.top();
|
||||
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<sequence_node>(lhs_, rhs_));
|
||||
_tree_node_stack.top() = _node_ptr_vector.back().get();
|
||||
}
|
||||
|
||||
void optional(const bool greedy_)
|
||||
{
|
||||
// perform ?
|
||||
observer_ptr<node> lhs_ = _tree_node_stack.top();
|
||||
// Don't know if lhs_ is a leaf_node, so get firstpos.
|
||||
auto &firstpos_ = lhs_->firstpos();
|
||||
|
||||
for (observer_ptr<node> node_ : firstpos_)
|
||||
{
|
||||
// These are leaf_nodes!
|
||||
node_->greedy(greedy_);
|
||||
}
|
||||
|
||||
_node_ptr_vector.emplace_back(std::make_unique<leaf_node>
|
||||
(node::null_token(), greedy_));
|
||||
|
||||
observer_ptr<node> rhs_ = _node_ptr_vector.back().get();
|
||||
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<selection_node>(lhs_, rhs_));
|
||||
_tree_node_stack.top() = _node_ptr_vector.back().get();
|
||||
}
|
||||
|
||||
void zero_or_more(const bool greedy_)
|
||||
{
|
||||
// perform *
|
||||
observer_ptr<node> ptr_ = _tree_node_stack.top();
|
||||
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<iteration_node>(ptr_, greedy_));
|
||||
_tree_node_stack.top() = _node_ptr_vector.back().get();
|
||||
}
|
||||
|
||||
void one_or_more(const bool greedy_)
|
||||
{
|
||||
// perform +
|
||||
observer_ptr<node> lhs_ = _tree_node_stack.top();
|
||||
observer_ptr<node> copy_ = lhs_->copy(_node_ptr_vector);
|
||||
|
||||
_node_ptr_vector.emplace_back(std::make_unique<iteration_node>
|
||||
(copy_, greedy_));
|
||||
|
||||
observer_ptr<node> rhs_ = _node_ptr_vector.back().get();
|
||||
|
||||
_node_ptr_vector.emplace_back
|
||||
(std::make_unique<sequence_node>(lhs_, rhs_));
|
||||
_tree_node_stack.top() = _node_ptr_vector.back().get();
|
||||
}
|
||||
|
||||
// perform {n[,[m]]}
|
||||
// Semantic checks have already been performed.
|
||||
// {0,} = *
|
||||
// {0,1} = ?
|
||||
// {1,} = +
|
||||
// therefore we do not check for these cases.
|
||||
void repeatn(const bool greedy_, observer_ptr<const token> token_)
|
||||
{
|
||||
const rules_char_type *str_ = token_->_extra.c_str();
|
||||
std::size_t min_ = 0;
|
||||
bool comma_ = false;
|
||||
std::size_t max_ = 0;
|
||||
|
||||
while (*str_>= '0' && *str_ <= '9')
|
||||
{
|
||||
min_ *= 10;
|
||||
min_ += *str_ - '0';
|
||||
++str_;
|
||||
}
|
||||
|
||||
comma_ = *str_ == ',';
|
||||
|
||||
if (comma_) ++str_;
|
||||
|
||||
while (*str_>= '0' && *str_ <= '9')
|
||||
{
|
||||
max_ *= 10;
|
||||
max_ += *str_ - '0';
|
||||
++str_;
|
||||
}
|
||||
|
||||
if (!(min_ == 1 && !comma_))
|
||||
{
|
||||
const std::size_t top_ = min_ > 0 ? min_ : max_;
|
||||
|
||||
if (min_ == 0)
|
||||
{
|
||||
optional(greedy_);
|
||||
}
|
||||
|
||||
observer_ptr<node> prev_ = _tree_node_stack.top()->
|
||||
copy(_node_ptr_vector);
|
||||
observer_ptr<node> curr_ = nullptr;
|
||||
|
||||
for (std::size_t i_ = 2; i_ < top_; ++i_)
|
||||
{
|
||||
curr_ = prev_->copy(_node_ptr_vector);
|
||||
_tree_node_stack.push(prev_);
|
||||
sequence();
|
||||
prev_ = curr_;
|
||||
}
|
||||
|
||||
if (comma_ && min_ > 0)
|
||||
{
|
||||
if (min_ > 1)
|
||||
{
|
||||
curr_ = prev_->copy(_node_ptr_vector);
|
||||
_tree_node_stack.push(prev_);
|
||||
sequence();
|
||||
prev_ = curr_;
|
||||
}
|
||||
|
||||
if (comma_ && max_)
|
||||
{
|
||||
_tree_node_stack.push(prev_);
|
||||
optional(greedy_);
|
||||
prev_ = _tree_node_stack.top();
|
||||
_tree_node_stack.pop();
|
||||
|
||||
const std::size_t count_ = max_ - min_;
|
||||
|
||||
for (std::size_t i_ = 1; i_ < count_; ++i_)
|
||||
{
|
||||
curr_ = prev_->copy(_node_ptr_vector);
|
||||
_tree_node_stack.push(prev_);
|
||||
sequence();
|
||||
prev_ = curr_;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
_tree_node_stack.push(prev_);
|
||||
zero_or_more(greedy_);
|
||||
prev_ = _tree_node_stack.top();
|
||||
_tree_node_stack.pop();
|
||||
}
|
||||
}
|
||||
|
||||
_tree_node_stack.push(prev_);
|
||||
sequence();
|
||||
}
|
||||
}
|
||||
|
||||
// If no non-end-state node in root_'s firstpos set carries bol_token(),
// root_ is replaced by the sequence (bol | null) root_. Otherwise the
// tree is left unchanged.
void fixup_bol(observer_ptr<node> &root_)const
{
    for (observer_ptr<const node> candidate_ : root_->firstpos())
    {
        if (!candidate_->end_state() && candidate_->token() == bol_token())
        {
            // A leading BOL is already present.
            return;
        }
    }

    _node_ptr_vector.emplace_back
        (std::make_unique<leaf_node>(bol_token(), true));

    observer_ptr<node> bol_leaf_ = _node_ptr_vector.back().get();

    _node_ptr_vector.emplace_back
        (std::make_unique<leaf_node>(node::null_token(), true));

    observer_ptr<node> null_leaf_ = _node_ptr_vector.back().get();

    _node_ptr_vector.emplace_back
        (std::make_unique<selection_node>(bol_leaf_, null_leaf_));

    observer_ptr<node> either_ = _node_ptr_vector.back().get();

    _node_ptr_vector.emplace_back
        (std::make_unique<sequence_node>(either_, root_));
    root_ = _node_ptr_vector.back().get();
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
100
YACReaderLibrary/lexertl/parser/tokeniser/re_token.hpp
Normal file
100
YACReaderLibrary/lexertl/parser/tokeniser/re_token.hpp
Normal file
@ -0,0 +1,100 @@
|
||||
// re_token.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RE_TOKEN_HPP
|
||||
#define LEXERTL_RE_TOKEN_HPP
|
||||
|
||||
#include "../../string_token.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// Token kinds produced by the regex tokeniser. The enumerator values up
// to and including END index the precedence tables in basic_re_token.
// Note that tokens following END are never seen by parser.hpp.
enum token_type
{
    BEGIN, REGEX, OREXP, SEQUENCE, SUB, EXPRESSION, REPEAT, DUP,
    OR, CHARSET, BOL, EOL, MACRO, OPENPAREN, CLOSEPAREN,
    OPT, AOPT, ZEROORMORE, AZEROORMORE, ONEORMORE, AONEORMORE,
    REPEATN, AREPEATN,
    END, DIFF
};
|
||||
|
||||
template<typename input_char_type, typename char_type>
|
||||
struct basic_re_token
|
||||
{
|
||||
using string_token = basic_string_token<char_type>;
|
||||
using string = std::basic_string<input_char_type>;
|
||||
|
||||
token_type _type;
|
||||
string _extra;
|
||||
string_token _str;
|
||||
|
||||
basic_re_token(const token_type type_ = BEGIN) :
|
||||
_type(type_),
|
||||
_extra(),
|
||||
_str()
|
||||
{
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
_type = BEGIN;
|
||||
_extra.clear();
|
||||
_str.clear();
|
||||
}
|
||||
|
||||
void swap(basic_re_token &rhs_)
|
||||
{
|
||||
std::swap(_type, rhs_._type);
|
||||
_extra.swap(rhs_._extra);
|
||||
_str.swap(rhs_._str);
|
||||
}
|
||||
|
||||
char precedence(const token_type type_) const
|
||||
{
|
||||
// Moved in here for Solaris compiler.
|
||||
static const char precedence_table_[END + 1][END + 1] = {
|
||||
// BEG, REG, ORE, SEQ, SUB, EXP, RPT, DUP, | , CHR, BOL, EOL, MCR, ( , ) , ? , ?? , * , *? , + , +?, {n}?, {n}, END
|
||||
/*BEGIN*/{ ' ', '<', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/*REGEX*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/*OREXP*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* SEQ */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* SUB */{ ' ', ' ', ' ', ' ', ' ', '=', '<', ' ', '>', '<', '<', '<', '<', '<', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/*EXPRE*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* RPT */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', '>', '>', '>', '<', '<', '<', '<', '<', '<', '<', '<', '>' },
|
||||
/*DUPLI*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* | */{ ' ', ' ', ' ', '=', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' },
|
||||
/*CHARA*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
|
||||
/* BOL */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
|
||||
/* EOL */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
|
||||
/*MACRO*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
|
||||
/* ( */{ ' ', '=', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' },
|
||||
/* ) */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>' },
|
||||
/* ? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* ?? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* * */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* *? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* + */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* +? */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/*{n,m}*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/*{nm}?*/{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>' },
|
||||
/* END */{ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' }
|
||||
};
|
||||
|
||||
return precedence_table_[_type][type_];
|
||||
}
|
||||
|
||||
const char *precedence_string() const
|
||||
{
|
||||
// Moved in here for Solaris compiler.
|
||||
static const char *precedence_strings_[END + 1] =
|
||||
{"BEGIN", "REGEX", "OREXP", "SEQUENCE", "SUB", "EXPRESSION",
|
||||
"REPEAT", "DUPLICATE", "|", "CHARSET", "^", "$", "MACRO", "(", ")",
|
||||
"?", "??", "*", "*?", "+", "+?", "{n[,[m]]}", "{n[,[m]]}?", "END"};
|
||||
|
||||
return precedence_strings_[_type];
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
778
YACReaderLibrary/lexertl/parser/tokeniser/re_tokeniser.hpp
Normal file
778
YACReaderLibrary/lexertl/parser/tokeniser/re_tokeniser.hpp
Normal file
@ -0,0 +1,778 @@
|
||||
// re_tokeniser.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RE_TOKENISER_HPP
|
||||
#define LEXERTL_RE_TOKENISER_HPP
|
||||
|
||||
#include <cstring>
|
||||
#include "re_token.hpp"
|
||||
#include "../../runtime_error.hpp"
|
||||
#include <sstream>
|
||||
#include "../../string_token.hpp"
|
||||
#include "re_tokeniser_helper.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename rules_char_type, typename char_type, typename id_type>
|
||||
class basic_re_tokeniser
|
||||
{
|
||||
public:
|
||||
using re_token = basic_re_token<rules_char_type, char_type>;
|
||||
using tokeniser_helper =
|
||||
basic_re_tokeniser_helper<rules_char_type, char_type, id_type>;
|
||||
using char_state = typename tokeniser_helper::char_state;
|
||||
using state = typename tokeniser_helper::state;
|
||||
using string_token = basic_string_token<char_type>;
|
||||
|
||||
// Reads the next regex token from state_ into token_.
// lhs_ is the previously produced token; it is only consulted by the
// '{' handler (open_curly). Throws runtime_error on malformed input.
static void next(re_token &lhs_, state &state_, re_token &token_)
{
    rules_char_type ch_ = 0;
    bool eos_ = state_.next(ch_);
    bool skipped_ = false;

    token_.clear();

    // Consume anything that does not itself produce a token: quote
    // toggles, (?# ...) comments and, when skip_ws is set, whitespace and
    // C style comments. Repeat for as long as something was skipped.
    do
    {
        // string begin/end
        while (!eos_ && ch_ == '"')
        {
            state_._in_string ^= 1;
            eos_ = state_.next(ch_);
        }

        if (eos_) break;

        // (?# ...)
        skipped_ = comment(eos_, ch_, state_);

        if (eos_) break;

        // skip_ws set
        skipped_ |= skip(eos_, ch_, state_);
    } while (!eos_ && skipped_);

    if (eos_)
    {
        // An open string is reported in preference to an open paren.
        const char *missing_ = nullptr;

        if (state_._in_string)
        {
            missing_ = " (missing '\"')";
        }
        else if (state_._paren_count)
        {
            missing_ = " (missing ')')";
        }

        if (missing_)
        {
            std::ostringstream ss_;

            // Pointless returning index if at end of string
            state_.unexpected_end(ss_);
            ss_ << missing_;
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }

        token_._type = END;
        return;
    }

    if (ch_ == '\\')
    {
        // Even if we are in a string, respect escape sequences...
        token_._type = CHARSET;
        escape(state_, token_._str);
        return;
    }

    if (state_._in_string)
    {
        // All other meta characters lose their special meaning
        // inside a string.
        token_._type = CHARSET;
        add_char(ch_, state_, token_._str);
        return;
    }

    // Picks the lazy variant of a quantifier when it is directly followed
    // by '?', consuming that '?'.
    const auto quantifier_ = [&state_](const token_type lazy_,
        const token_type greedy_) -> token_type
    {
        if (!state_.eos() && *state_._curr == '?')
        {
            state_.increment();
            return lazy_;
        }

        return greedy_;
    };

    // Not an escape sequence and not inside a string, so
    // check for meta characters.
    switch (ch_)
    {
    case '(':
        token_._type = OPENPAREN;
        ++state_._paren_count;
        read_options(state_);
        break;
    case ')':
        if (--state_._paren_count < 0)
        {
            std::ostringstream ss_;

            ss_ << "Number of open parenthesis < 0 "
                "at index " << state_.index() - 1;
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }

        token_._type = CLOSEPAREN;

        // Restore the flags that were saved when the group was opened.
        if (!state_._flags_stack.empty())
        {
            state_._flags = state_._flags_stack.top();
            state_._flags_stack.pop();
        }

        break;
    case '?':
        token_._type = quantifier_(AOPT, OPT);
        break;
    case '*':
        token_._type = quantifier_(AZEROORMORE, ZEROORMORE);
        break;
    case '+':
        token_._type = quantifier_(AONEORMORE, ONEORMORE);
        break;
    case '{':
        open_curly(lhs_, state_, token_);
        break;
    case '|':
        token_._type = OR;
        break;
    case '^':
        // Only an anchor as the very first character of a non-macro
        // regex; otherwise a literal.
        if (!state_._macro_name &&
            state_._curr - 1 == state_._start)
        {
            token_._type = BOL;
        }
        else
        {
            token_._type = CHARSET;
            token_._str.insert(range(ch_, ch_));
        }

        break;
    case '$':
        // Only an anchor as the very last character of a non-macro
        // regex; otherwise a literal.
        if (!state_._macro_name && state_._curr == state_._end)
        {
            token_._type = EOL;
        }
        else
        {
            token_._type = CHARSET;
            token_._str.insert(range(ch_, ch_));
        }

        break;
    case '.':
    {
        token_._type = CHARSET;

        // Collect the excluded characters, then invert the set.
        if (state_._flags & dot_not_newline)
        {
            token_._str.insert(range('\n', '\n'));
        }
        else if (state_._flags & dot_not_cr_lf)
        {
            token_._str.insert(range('\n', '\n'));
            token_._str.insert(range('\r', '\r'));
        }

        token_._str.negate();
        break;
    }
    case '[':
    {
        token_._type = CHARSET;
        tokeniser_helper::charset(state_, token_._str);
        break;
    }
    case '/':
    {
        std::ostringstream ss_;

        ss_ << "Lookahead ('/') is not supported yet";
        state_.error(ss_);
        throw runtime_error(ss_.str());
    }
    default:
        token_._type = CHARSET;
        add_char(ch_, state_, token_._str);
        break;
    }
}
|
||||
|
||||
private:
|
||||
using range = typename string_token::range;
|
||||
|
||||
static bool comment(bool &eos_, rules_char_type &ch_, state &state_)
|
||||
{
|
||||
bool skipped_ = false;
|
||||
|
||||
if (!state_._in_string && ch_ == '(' && !state_.eos() &&
|
||||
*state_._curr == '?' && state_._curr + 1 < state_._end &&
|
||||
*(state_._curr + 1) == '#')
|
||||
{
|
||||
std::size_t paren_count_ = 1;
|
||||
|
||||
state_.increment();
|
||||
state_.increment();
|
||||
|
||||
do
|
||||
{
|
||||
eos_ = state_.next(ch_);
|
||||
|
||||
if (ch_ == '(')
|
||||
{
|
||||
++paren_count_;
|
||||
}
|
||||
else if (ch_ == ')')
|
||||
{
|
||||
--paren_count_;
|
||||
}
|
||||
} while (!eos_ && !(ch_ == ')' && paren_count_ == 0));
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
state_.unexpected_end(ss_);
|
||||
ss_ << " (unterminated comment)";
|
||||
state_.error(ss_);
|
||||
throw runtime_error(ss_.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
eos_ = state_.next(ch_);
|
||||
}
|
||||
|
||||
skipped_ = true;
|
||||
}
|
||||
|
||||
return skipped_;
|
||||
}
|
||||
|
||||
static bool skip(bool &eos_, rules_char_type &ch_, state &state_)
|
||||
{
|
||||
bool skipped_ = false;
|
||||
|
||||
if ((state_._flags & skip_ws) && !state_._in_string)
|
||||
{
|
||||
bool c_comment_ = false;
|
||||
bool skip_ws_ = false;
|
||||
|
||||
do
|
||||
{
|
||||
c_comment_ = ch_ == '/' && !state_.eos() &&
|
||||
*state_._curr == '*';
|
||||
skip_ws_ = !c_comment_ && (ch_ == ' ' || ch_ == '\t' ||
|
||||
ch_ == '\n' || ch_ == '\r' || ch_ == '\f' || ch_ == '\v');
|
||||
|
||||
if (c_comment_)
|
||||
{
|
||||
state_.increment();
|
||||
eos_ = state_.next(ch_);
|
||||
|
||||
while (!eos_ && !(ch_ == '*' && !state_.eos() &&
|
||||
*state_._curr == '/'))
|
||||
{
|
||||
eos_ = state_.next(ch_);
|
||||
}
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
state_.unexpected_end(ss_);
|
||||
ss_ << " (unterminated C style comment)";
|
||||
state_.error(ss_);
|
||||
throw runtime_error(ss_.str());
|
||||
}
|
||||
else
|
||||
{
|
||||
state_.increment();
|
||||
eos_ = state_.next(ch_);
|
||||
}
|
||||
|
||||
skipped_ = true;
|
||||
}
|
||||
else if (skip_ws_)
|
||||
{
|
||||
eos_ = state_.next(ch_);
|
||||
skipped_ = true;
|
||||
}
|
||||
} while (!eos_ && (c_comment_ || skip_ws_));
|
||||
}
|
||||
|
||||
return skipped_;
|
||||
}
|
||||
|
||||
// Called directly after a '(' has been read. If the group starts with
// '?', the current flags are saved on _flags_stack and the option letters
// up to ':' are applied to state_._flags ('-' negates the next letter):
//   i - icase, s - dot matches newline, x - skip_ws.
// For a plain group the flags are only saved when the stack is already
// non-empty. Throws runtime_error on an unknown option letter.
static void read_options(state &state_)
{
    if (state_.eos() || *state_._curr != '?')
    {
        if (!state_._flags_stack.empty())
        {
            state_._flags_stack.push(state_._flags);
        }

        return;
    }

    rules_char_type ch_ = 0;
    bool negate_ = false;

    state_.increment();

    bool eos_ = state_.next(ch_);

    state_._flags_stack.push(state_._flags);

    for (; !eos_ && ch_ != ':'; eos_ = state_.next(ch_))
    {
        switch (ch_)
        {
        case '-':
            negate_ = !negate_;
            break;
        case 'i':
            state_._flags = negate_ ?
                (state_._flags & ~icase) : (state_._flags | icase);
            negate_ = false;
            break;
        case 's':
            // 's' lets dot match line ends, so the "dot does not match"
            // flag is cleared by 's' and set by '-s'.
#ifdef _WIN32
            state_._flags = negate_ ?
                (state_._flags | dot_not_cr_lf) :
                (state_._flags & ~dot_not_cr_lf);
#else
            state_._flags = negate_ ?
                (state_._flags | dot_not_newline) :
                (state_._flags & ~dot_not_newline);
#endif
            negate_ = false;
            break;
        case 'x':
            state_._flags = negate_ ?
                (state_._flags & ~skip_ws) : (state_._flags | skip_ws);
            negate_ = false;
            break;
        default:
        {
            std::ostringstream ss_;

            ss_ << "Unknown option at index " <<
                state_.index() - 1;
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }
        }
    }

    // End of string handler will handle early termination
}
|
||||
|
||||
// Handles the escape sequence following a backslash. escape_sequence()
// either yields a single character in ch_ (null return) or a string of
// str_len_ characters describing a charset; in the latter case the text
// after its first character is parsed with tokeniser_helper::charset().
static void escape(state &state_, string_token &token_)
{
    char_type ch_ = 0;
    std::size_t str_len_ = 0;
    const char * const charset_str_ =
        tokeniser_helper::escape_sequence(state_, ch_, str_len_);

    if (!charset_str_)
    {
        add_char(ch_, state_, token_);
        return;
    }

    // A separate state over the returned text; id, flags and locale are
    // inherited from the enclosing state.
    char_state charset_state_(charset_str_ + 1, charset_str_ + str_len_,
        state_._id, state_._flags, state_._locale, 0);

    tokeniser_helper::charset(charset_state_, token_);
}
|
||||
|
||||
static void add_char(const char_type ch_, const state &state_,
|
||||
string_token &token_)
|
||||
{
|
||||
range range_(ch_, ch_);
|
||||
|
||||
token_.insert(range_);
|
||||
|
||||
if (state_._flags & icase)
|
||||
{
|
||||
string_token folded_;
|
||||
|
||||
tokeniser_helper::fold(range_, state_._locale,
|
||||
folded_, typename tokeniser_helper::template
|
||||
size<sizeof(char_type)>());
|
||||
|
||||
if (!folded_.empty())
|
||||
{
|
||||
token_.insert(folded_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handles the text following a '{':
//   {-} / {+}   - charset difference/union operator (DIFF token, with the
//                 operator character stored in _extra); the previous
//                 token lhs_ must be a CHARSET,
//   {digit...   - counted repeat, delegated to repeat_n(),
//   anything else - macro reference, delegated to macro().
// Throws runtime_error on malformed input.
static void open_curly(re_token &lhs_, state &state_,
    re_token &token_)
{
    // Shared reporter for input that ends before the closing '}'.
    const auto fail_at_end_ = [&state_]()
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        state_.unexpected_end(ss_);
        ss_ << " (missing '}')";
        state_.error(ss_);
        throw runtime_error(ss_.str());
    };

    if (state_.eos())
    {
        fail_at_end_();
    }
    else if (*state_._curr == '-' || *state_._curr == '+')
    {
        rules_char_type ch_ = 0;

        if (lhs_._type != CHARSET)
        {
            std::ostringstream ss_;

            // Stream the operator character itself. Streaming the
            // _curr pointer would print the whole remaining input for
            // char (relying on a terminator beyond _end) and an address
            // for wider character types. The character is known to be
            // '-' or '+' here, so narrowing to char is lossless.
            ss_ << "CHARSET must precede {" <<
                static_cast<char>(*state_._curr) << "} at index " <<
                state_.index() - 1;
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }

        state_.next(ch_);
        token_._type = DIFF;
        token_._extra = ch_;

        if (state_.next(ch_))
        {
            fail_at_end_();
        }

        if (ch_ != '}')
        {
            std::ostringstream ss_;

            ss_ << "Missing '}' at index " << state_.index() - 1;
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }
    }
    else if (*state_._curr >= '0' && *state_._curr <= '9')
    {
        repeat_n(state_, token_);
    }
    else
    {
        macro(state_, token_);
    }
}
|
||||
|
||||
// SYNTAX:
|
||||
// {n[,[n]]}
|
||||
// SEMANTIC RULES:
|
||||
// {0} - INVALID (throw exception)
|
||||
// {0,} = *
|
||||
// {0,0} - INVALID (throw exception)
|
||||
// {0,1} = ?
|
||||
// {1,} = +
|
||||
// {min,max} where min == max = {min}
|
||||
// {min,max} where max < min - INVALID (throw exception)
|
||||
// Tokenises a counted repeat; on entry the '{' has been consumed and the
// next character is a digit. The digits (and comma) are accumulated in
// token_._extra for the parser. Forms equivalent to '*', '+' and '?' are
// emitted as those tokens, {n,n} is reduced to {n}, and a trailing '?'
// selects the lazy variant of whichever token results.
// Throws runtime_error on syntax errors and on the invalid counts listed
// in the rules above.
static void repeat_n(state &state_, re_token &token_)
{
    // Input ended before the closing '}'.
    const auto fail_at_end_ = [&state_]()
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        state_.unexpected_end(ss_);
        ss_ << " (missing repeat terminator '}')";
        state_.error(ss_);
        throw runtime_error(ss_.str());
    };
    // Some other character appeared where '}' was required.
    const auto fail_no_terminator_ = [&state_]()
    {
        std::ostringstream ss_;

        ss_ << "Missing repeat terminator '}' at index " <<
            state_.index() - 1;
        state_.error(ss_);
        throw runtime_error(ss_.str());
    };
    const auto is_digit_ = [](const rules_char_type c_)
    {
        return c_ >= '0' && c_ <= '9';
    };
    // Consumes a directly following '?', reporting whether one was there.
    const auto lazy_follows_ = [&state_]()
    {
        if (!state_.eos() && *state_._curr == '?')
        {
            state_.increment();
            return true;
        }

        return false;
    };

    rules_char_type ch_ = 0;
    bool eos_ = state_.next(ch_);
    std::size_t min_ = 0;
    std::size_t max_ = 0;

    for (; !eos_ && is_digit_(ch_); eos_ = state_.next(ch_))
    {
        min_ = min_ * 10 + (ch_ - '0');
        token_._extra += ch_;
    }

    if (eos_)
    {
        fail_at_end_();
    }

    bool min_max_ = false;
    bool repeatn_ = true;

    if (ch_ == ',')
    {
        token_._extra += ch_;
        eos_ = state_.next(ch_);

        if (eos_)
        {
            fail_at_end_();
        }

        if (ch_ == '}')
        {
            // Small optimisation: Check for '*' equivalency.
            if (min_ == 0)
            {
                token_._type = ZEROORMORE;
                repeatn_ = false;
            }
            // Small optimisation: Check for '+' equivalency.
            else if (min_ == 1)
            {
                token_._type = ONEORMORE;
                repeatn_ = false;
            }
        }
        else
        {
            if (!is_digit_(ch_))
            {
                fail_no_terminator_();
            }

            min_max_ = true;

            do
            {
                max_ = max_ * 10 + (ch_ - '0');
                token_._extra += ch_;
                eos_ = state_.next(ch_);
            } while (!eos_ && is_digit_(ch_));

            if (eos_)
            {
                fail_at_end_();
            }

            // Small optimisation: Check for '?' equivalency.
            if (min_ == 0 && max_ == 1)
            {
                token_._type = OPT;
                repeatn_ = false;
            }
            // Small optimisation: if min == max, then min.
            else if (min_ == max_)
            {
                token_._extra.erase(token_._extra.find(','));
                min_max_ = false;
                max_ = 0;
            }
        }
    }

    if (ch_ != '}')
    {
        fail_no_terminator_();
    }

    if (repeatn_)
    {
        // SEMANTIC VALIDATION follows:
        // NOTE: {0,} has already become *
        // therefore we don't check for a comma.
        if (min_ == 0 && max_ == 0)
        {
            std::ostringstream ss_;

            ss_ << "Cannot have exactly zero repeats preceding index " <<
                state_.index();
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }

        if (min_max_ && max_ < min_)
        {
            std::ostringstream ss_;

            ss_ << "Max less than min preceding index " <<
                state_.index();
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }

        token_._type = lazy_follows_() ? AREPEATN : REPEATN;
        return;
    }

    // The repeat was rewritten as '*', '+' or '?': map it to its lazy
    // counterpart when a '?' follows.
    token_type lazy_type_ = token_._type;

    switch (token_._type)
    {
    case ZEROORMORE:
        lazy_type_ = AZEROORMORE;
        break;
    case ONEORMORE:
        lazy_type_ = AONEORMORE;
        break;
    case OPT:
        lazy_type_ = AOPT;
        break;
    default:
        return;
    }

    if (lazy_follows_())
    {
        token_._type = lazy_type_;
    }
}
|
||||
|
||||
// Tokenises a macro reference; on entry the '{' has been consumed.
// The name must start with a letter or '_' and may continue with letters,
// digits, '_' or '-'. It is stored in token_._extra and the token type is
// set to MACRO once the closing '}' has been read.
// Throws runtime_error on an invalid name or a missing '}'.
static void macro(state &state_, re_token &token_)
{
    rules_char_type ch_ = 0;

    state_.next(ch_);

    const bool valid_start_ = ch_ == '_' || (ch_ >= 'A' && ch_ <= 'Z') ||
        (ch_ >= 'a' && ch_ <= 'z');

    if (!valid_start_)
    {
        std::ostringstream ss_;

        ss_ << "Invalid MACRO name at index " << state_.index() - 1;
        state_.error(ss_);
        throw runtime_error(ss_.str());
    }

    for (;;)
    {
        token_._extra += ch_;

        if (state_.next(ch_))
        {
            std::ostringstream ss_;

            // Pointless returning index if at end of string
            state_.unexpected_end(ss_);
            ss_ << " (missing MACRO name terminator '}')";
            state_.error(ss_);
            throw runtime_error(ss_.str());
        }

        const bool name_char_ = ch_ == '_' || ch_ == '-' ||
            (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z') ||
            (ch_ >= '0' && ch_ <= '9');

        if (!name_char_)
        {
            break;
        }
    }

    if (ch_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing MACRO name terminator '}' at index " <<
            state_.index() - 1;
        state_.error(ss_);
        throw runtime_error(ss_.str());
    }

    token_._type = MACRO;
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
3157
YACReaderLibrary/lexertl/parser/tokeniser/re_tokeniser_helper.hpp
Normal file
3157
YACReaderLibrary/lexertl/parser/tokeniser/re_tokeniser_helper.hpp
Normal file
File diff suppressed because it is too large
Load Diff
136
YACReaderLibrary/lexertl/parser/tokeniser/re_tokeniser_state.hpp
Normal file
136
YACReaderLibrary/lexertl/parser/tokeniser/re_tokeniser_state.hpp
Normal file
@ -0,0 +1,136 @@
|
||||
// re_tokeniser_state.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RE_TOKENISER_STATE_HPP
|
||||
#define LEXERTL_RE_TOKENISER_STATE_HPP
|
||||
|
||||
#include "../../char_traits.hpp"
|
||||
#include "../../enums.hpp"
|
||||
#include <locale>
|
||||
#include "../../narrow.hpp"
|
||||
#include <stack>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename ch_type, typename id_type>
|
||||
struct basic_re_tokeniser_state
|
||||
{
|
||||
using char_type = ch_type;
|
||||
using index_type = typename basic_char_traits<char_type>::index_type;
|
||||
|
||||
const char_type * const _start;
|
||||
const char_type * const _end;
|
||||
const char_type *_curr;
|
||||
id_type _id;
|
||||
std::size_t _flags;
|
||||
std::stack<std::size_t> _flags_stack;
|
||||
std::locale _locale;
|
||||
const char_type *_macro_name;
|
||||
long _paren_count;
|
||||
bool _in_string;
|
||||
id_type _nl_id;
|
||||
|
||||
basic_re_tokeniser_state(const char_type *start_,
|
||||
const char_type * const end_, id_type id_, const std::size_t flags_,
|
||||
const std::locale locale_, const char_type *macro_name_) :
|
||||
_start(start_),
|
||||
_end(end_),
|
||||
_curr(start_),
|
||||
_id(id_),
|
||||
_flags(flags_),
|
||||
_flags_stack(),
|
||||
_locale(locale_),
|
||||
_macro_name(macro_name_),
|
||||
_paren_count(0),
|
||||
_in_string(false),
|
||||
_nl_id(static_cast<id_type>(~0))
|
||||
{
|
||||
}
|
||||
|
||||
basic_re_tokeniser_state(const basic_re_tokeniser_state &rhs_)
|
||||
{
|
||||
assign(rhs_);
|
||||
}
|
||||
|
||||
// prevent VC++ 7.1 warning:
|
||||
const basic_re_tokeniser_state &operator =
|
||||
(const basic_re_tokeniser_state &rhs_)
|
||||
{
|
||||
return assign(rhs_);
|
||||
}
|
||||
|
||||
basic_re_tokeniser_state &assign(const basic_re_tokeniser_state &rhs_)
|
||||
{
|
||||
_start = rhs_._start;
|
||||
_end = rhs_._end;
|
||||
_curr = rhs_._curr;
|
||||
_id = rhs_._id;
|
||||
_flags = rhs_._flags;
|
||||
_flags_stack = rhs_._flags_stack;
|
||||
_locale = rhs_._locale;
|
||||
_macro_name = rhs_._macro_name;
|
||||
_paren_count = rhs_._paren_count;
|
||||
_in_string = rhs_._in_string;
|
||||
_nl_id = rhs_._nl_id;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline bool next(char_type &ch_)
|
||||
{
|
||||
if (_curr >= _end)
|
||||
{
|
||||
ch_ = 0;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
ch_ = *_curr;
|
||||
increment();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
inline void increment()
|
||||
{
|
||||
++_curr;
|
||||
}
|
||||
|
||||
inline std::size_t index()
|
||||
{
|
||||
return _curr - _start;
|
||||
}
|
||||
|
||||
inline bool eos()
|
||||
{
|
||||
return _curr >= _end;
|
||||
}
|
||||
|
||||
inline void unexpected_end(std::ostringstream &ss_)
|
||||
{
|
||||
ss_ << "Unexpected end of regex";
|
||||
}
|
||||
|
||||
inline void error(std::ostringstream &ss_)
|
||||
{
|
||||
ss_ << " in ";
|
||||
|
||||
if (_macro_name)
|
||||
{
|
||||
ss_ << "MACRO '";
|
||||
narrow(_macro_name, ss_);
|
||||
ss_ << "'.";
|
||||
}
|
||||
else
|
||||
{
|
||||
ss_ << "rule id " << _id << '.';
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
111
YACReaderLibrary/lexertl/parser/tree/end_node.hpp
Normal file
111
YACReaderLibrary/lexertl/parser/tree/end_node.hpp
Normal file
@ -0,0 +1,111 @@
|
||||
// end_node.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_END_NODE_HPP
|
||||
#define LEXERTL_END_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_end_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
using node = basic_node<id_type>;
|
||||
using bool_stack = typename node::bool_stack;
|
||||
using const_node_stack = typename node::const_node_stack;
|
||||
using node_ptr_vector = typename node::node_ptr_vector;
|
||||
using node_stack = typename node::node_stack;
|
||||
using node_type = typename node::node_type;
|
||||
using node_vector = typename node::node_vector;
|
||||
|
||||
basic_end_node(const id_type id_, const id_type user_id_,
|
||||
const id_type next_dfa_, const id_type push_dfa_,
|
||||
const bool pop_dfa_) :
|
||||
node(false),
|
||||
_id(id_),
|
||||
_user_id(user_id_),
|
||||
_next_dfa(next_dfa_),
|
||||
_push_dfa(push_dfa_),
|
||||
_pop_dfa(pop_dfa_),
|
||||
_followpos()
|
||||
{
|
||||
node::_firstpos.push_back(this);
|
||||
node::_lastpos.push_back(this);
|
||||
}
|
||||
|
||||
virtual ~basic_end_node() override
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type() const override
|
||||
{
|
||||
return node::END;
|
||||
}
|
||||
|
||||
virtual bool traverse(const_node_stack &/*node_stack_*/,
|
||||
bool_stack &/*perform_op_stack_*/) const override
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual const node_vector &followpos() const override
|
||||
{
|
||||
// _followpos is always empty..!
|
||||
return _followpos;
|
||||
}
|
||||
|
||||
virtual bool end_state() const override
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual id_type id() const override
|
||||
{
|
||||
return _id;
|
||||
}
|
||||
|
||||
virtual id_type user_id() const override
|
||||
{
|
||||
return _user_id;
|
||||
}
|
||||
|
||||
virtual id_type next_dfa() const override
|
||||
{
|
||||
return _next_dfa;
|
||||
}
|
||||
|
||||
virtual id_type push_dfa() const override
|
||||
{
|
||||
return _push_dfa;
|
||||
}
|
||||
|
||||
virtual bool pop_dfa() const override
|
||||
{
|
||||
return _pop_dfa;
|
||||
}
|
||||
|
||||
private:
|
||||
id_type _id;
|
||||
id_type _user_id;
|
||||
id_type _next_dfa;
|
||||
id_type _push_dfa;
|
||||
bool _pop_dfa;
|
||||
node_vector _followpos;
|
||||
|
||||
virtual void copy_node(node_ptr_vector &/*node_ptr_vector_*/,
|
||||
node_stack &/*new_node_stack_*/, bool_stack &/*perform_op_stack_*/,
|
||||
bool &/*down_*/) const override
|
||||
{
|
||||
// Nothing to do, as end_nodes are not copied.
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
96
YACReaderLibrary/lexertl/parser/tree/iteration_node.hpp
Normal file
96
YACReaderLibrary/lexertl/parser/tree/iteration_node.hpp
Normal file
@ -0,0 +1,96 @@
|
||||
// iteration_node.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_ITERATION_NODE_HPP
|
||||
#define LEXERTL_ITERATION_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_iteration_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
using node = basic_node<id_type>;
|
||||
using bool_stack = typename node::bool_stack;
|
||||
using const_node_stack = typename node::const_node_stack;
|
||||
using node_ptr_vector = typename node::node_ptr_vector;
|
||||
using node_stack = typename node::node_stack;
|
||||
using node_type = typename node::node_type;
|
||||
using node_vector = typename node::node_vector;
|
||||
|
||||
basic_iteration_node(observer_ptr<node> next_, const bool greedy_) :
|
||||
node(true),
|
||||
_next(next_),
|
||||
_greedy(greedy_)
|
||||
{
|
||||
_next->append_firstpos(node::_firstpos);
|
||||
_next->append_lastpos(node::_lastpos);
|
||||
|
||||
for (observer_ptr<node> node_ : node::_lastpos)
|
||||
{
|
||||
node_->append_followpos(node::_firstpos);
|
||||
}
|
||||
|
||||
for (observer_ptr<node> node_ : node::_firstpos)
|
||||
{
|
||||
node_->greedy(greedy_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~basic_iteration_node() override
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type() const override
|
||||
{
|
||||
return node::ITERATION;
|
||||
}
|
||||
|
||||
virtual bool traverse(const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const override
|
||||
{
|
||||
perform_op_stack_.push(true);
|
||||
node_stack_.push(_next);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
observer_ptr<node> _next;
|
||||
bool _greedy;
|
||||
|
||||
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const override
|
||||
{
|
||||
if (perform_op_stack_.top())
|
||||
{
|
||||
observer_ptr<node> ptr_ = new_node_stack_.top();
|
||||
|
||||
node_ptr_vector_.emplace_back
|
||||
(std::make_unique<basic_iteration_node>(ptr_, _greedy));
|
||||
new_node_stack_.top() = node_ptr_vector_.back().get();
|
||||
}
|
||||
else
|
||||
{
|
||||
down_ = true;
|
||||
}
|
||||
|
||||
perform_op_stack_.pop();
|
||||
}
|
||||
|
||||
// No copy construction.
|
||||
basic_iteration_node(const basic_iteration_node &) = delete;
|
||||
// No assignment.
|
||||
const basic_iteration_node &operator =
|
||||
(const basic_iteration_node &) = delete;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
110
YACReaderLibrary/lexertl/parser/tree/leaf_node.hpp
Normal file
110
YACReaderLibrary/lexertl/parser/tree/leaf_node.hpp
Normal file
@ -0,0 +1,110 @@
|
||||
// leaf_node.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_LEAF_NODE_HPP
|
||||
#define LEXERTL_LEAF_NODE_HPP
|
||||
|
||||
#include "../../enums.hpp" // null_token
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_leaf_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
using node = basic_node<id_type>;
|
||||
using bool_stack = typename node::bool_stack;
|
||||
using const_node_stack = typename node::const_node_stack;
|
||||
using node_ptr_vector = typename node::node_ptr_vector;
|
||||
using node_stack = typename node::node_stack;
|
||||
using node_type = typename node::node_type;
|
||||
using node_vector = typename node::node_vector;
|
||||
|
||||
basic_leaf_node(const id_type token_, const bool greedy_) :
|
||||
node(token_ == node::null_token()),
|
||||
_token(token_),
|
||||
_set_greedy(!greedy_),
|
||||
_greedy(greedy_),
|
||||
_followpos()
|
||||
{
|
||||
if (!node::_nullable)
|
||||
{
|
||||
node::_firstpos.push_back(this);
|
||||
node::_lastpos.push_back(this);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~basic_leaf_node() override
|
||||
{
|
||||
}
|
||||
|
||||
virtual void append_followpos(const node_vector &followpos_) override
|
||||
{
|
||||
_followpos.insert(_followpos.end(),
|
||||
followpos_.begin(), followpos_.end());
|
||||
}
|
||||
|
||||
virtual node_type what_type() const override
|
||||
{
|
||||
return node::LEAF;
|
||||
}
|
||||
|
||||
virtual bool traverse(const_node_stack &/*node_stack_*/,
|
||||
bool_stack &/*perform_op_stack_*/) const override
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual id_type token() const override
|
||||
{
|
||||
return _token;
|
||||
}
|
||||
|
||||
virtual void greedy(const bool greedy_) override
|
||||
{
|
||||
if (!_set_greedy)
|
||||
{
|
||||
_greedy = greedy_;
|
||||
_set_greedy = true;
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool greedy() const override
|
||||
{
|
||||
return _greedy;
|
||||
}
|
||||
|
||||
virtual const node_vector &followpos() const override
|
||||
{
|
||||
return _followpos;
|
||||
}
|
||||
|
||||
virtual node_vector &followpos() override
|
||||
{
|
||||
return _followpos;
|
||||
}
|
||||
|
||||
private:
|
||||
id_type _token;
|
||||
bool _set_greedy;
|
||||
bool _greedy;
|
||||
node_vector _followpos;
|
||||
|
||||
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &/*perform_op_stack_*/,
|
||||
bool &/*down_*/) const override
|
||||
{
|
||||
node_ptr_vector_.emplace_back(std::make_unique<basic_leaf_node>
|
||||
(_token, _greedy));
|
||||
new_node_stack_.push(node_ptr_vector_.back().get());
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
242
YACReaderLibrary/lexertl/parser/tree/node.hpp
Normal file
242
YACReaderLibrary/lexertl/parser/tree/node.hpp
Normal file
@ -0,0 +1,242 @@
|
||||
// node.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_NODE_HPP
|
||||
#define LEXERTL_NODE_HPP
|
||||
|
||||
#include <assert.h>
|
||||
#include <memory>
|
||||
#include "../../observer_ptr.hpp"
|
||||
#include "../../runtime_error.hpp"
|
||||
#include <stack>
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_node
|
||||
{
|
||||
public:
|
||||
enum node_type {LEAF, SEQUENCE, SELECTION, ITERATION, END};
|
||||
|
||||
using bool_stack = std::stack<bool>;
|
||||
using node_stack = std::stack<observer_ptr<basic_node>>;
|
||||
using const_node_stack = std::stack<observer_ptr<const basic_node>>;
|
||||
using node_vector = std::vector<observer_ptr<basic_node>>;
|
||||
using node_ptr_vector = std::vector<std::unique_ptr<basic_node>>;
|
||||
|
||||
basic_node() :
|
||||
_nullable(false),
|
||||
_firstpos(),
|
||||
_lastpos()
|
||||
{
|
||||
}
|
||||
|
||||
basic_node(const bool nullable_) :
|
||||
_nullable(nullable_),
|
||||
_firstpos(),
|
||||
_lastpos()
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~basic_node()
|
||||
{
|
||||
}
|
||||
|
||||
static id_type null_token()
|
||||
{
|
||||
return static_cast<id_type>(~0);
|
||||
}
|
||||
|
||||
bool nullable() const
|
||||
{
|
||||
return _nullable;
|
||||
}
|
||||
|
||||
void append_firstpos(node_vector &firstpos_) const
|
||||
{
|
||||
firstpos_.insert(firstpos_.end(),
|
||||
_firstpos.begin(), _firstpos.end());
|
||||
}
|
||||
|
||||
void append_lastpos(node_vector &lastpos_) const
|
||||
{
|
||||
lastpos_.insert(lastpos_.end(),
|
||||
_lastpos.begin(), _lastpos.end());
|
||||
}
|
||||
|
||||
virtual void append_followpos(const node_vector &/*followpos_*/)
|
||||
{
|
||||
throw runtime_error("Internal error node::append_followpos().");
|
||||
}
|
||||
|
||||
observer_ptr<basic_node> copy(node_ptr_vector &node_ptr_vector_) const
|
||||
{
|
||||
observer_ptr<basic_node> new_root_ = nullptr;
|
||||
const_node_stack node_stack_;
|
||||
bool_stack perform_op_stack_;
|
||||
bool down_ = true;
|
||||
node_stack new_node_stack_;
|
||||
|
||||
node_stack_.push(this);
|
||||
|
||||
while (!node_stack_.empty())
|
||||
{
|
||||
while (down_)
|
||||
{
|
||||
down_ = node_stack_.top()->traverse(node_stack_,
|
||||
perform_op_stack_);
|
||||
}
|
||||
|
||||
while (!down_ && !node_stack_.empty())
|
||||
{
|
||||
observer_ptr<const basic_node> top_ = node_stack_.top();
|
||||
|
||||
top_->copy_node(node_ptr_vector_, new_node_stack_,
|
||||
perform_op_stack_, down_);
|
||||
|
||||
if (!down_) node_stack_.pop();
|
||||
}
|
||||
}
|
||||
|
||||
assert(new_node_stack_.size() == 1);
|
||||
new_root_ = new_node_stack_.top();
|
||||
new_node_stack_.pop();
|
||||
return new_root_;
|
||||
}
|
||||
|
||||
virtual node_type what_type() const = 0;
|
||||
|
||||
virtual bool traverse(const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const = 0;
|
||||
|
||||
node_vector &firstpos()
|
||||
{
|
||||
return _firstpos;
|
||||
}
|
||||
|
||||
const node_vector &firstpos() const
|
||||
{
|
||||
return _firstpos;
|
||||
}
|
||||
|
||||
// _lastpos modified externally, so not const &
|
||||
node_vector &lastpos()
|
||||
{
|
||||
return _lastpos;
|
||||
}
|
||||
|
||||
virtual bool end_state() const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual id_type id() const
|
||||
{
|
||||
throw runtime_error("Internal error node::id().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type user_id() const
|
||||
{
|
||||
throw runtime_error("Internal error node::user_id().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type next_dfa() const
|
||||
{
|
||||
throw runtime_error("Internal error node::next_dfa().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type push_dfa() const
|
||||
{
|
||||
throw runtime_error("Internal error node::push_dfa().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual bool pop_dfa() const
|
||||
{
|
||||
throw runtime_error("Internal error node::pop_dfa().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type token() const
|
||||
{
|
||||
throw runtime_error("Internal error node::token().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual void greedy(const bool /*greedy_*/)
|
||||
{
|
||||
throw runtime_error("Internal error node::greedy(bool).");
|
||||
}
|
||||
|
||||
virtual bool greedy() const
|
||||
{
|
||||
throw runtime_error("Internal error node::greedy().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual const node_vector &followpos() const
|
||||
{
|
||||
throw runtime_error("Internal error node::followpos().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return firstpos;
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual node_vector &followpos()
|
||||
{
|
||||
throw runtime_error("Internal error node::followpos().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return firstpos;
|
||||
#endif
|
||||
}
|
||||
|
||||
protected:
|
||||
const bool _nullable;
|
||||
node_vector _firstpos;
|
||||
node_vector _lastpos;
|
||||
|
||||
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const = 0;
|
||||
|
||||
private:
|
||||
// No copy construction.
|
||||
basic_node(const basic_node &) = delete;
|
||||
// No assignment.
|
||||
const basic_node &operator =(const basic_node &) = delete;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
104
YACReaderLibrary/lexertl/parser/tree/selection_node.hpp
Normal file
104
YACReaderLibrary/lexertl/parser/tree/selection_node.hpp
Normal file
@ -0,0 +1,104 @@
|
||||
// selection_node.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SELECTION_NODE_HPP
|
||||
#define LEXERTL_SELECTION_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_selection_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
using node = basic_node<id_type>;
|
||||
using bool_stack = typename node::bool_stack;
|
||||
using const_node_stack = typename node::const_node_stack;
|
||||
using node_ptr_vector = typename node::node_ptr_vector;
|
||||
using node_stack = typename node::node_stack;
|
||||
using node_type = typename node::node_type;
|
||||
|
||||
basic_selection_node(observer_ptr<node> left_, observer_ptr<node> right_) :
|
||||
node(left_->nullable() || right_->nullable()),
|
||||
_left(left_),
|
||||
_right(right_)
|
||||
{
|
||||
_left->append_firstpos(node::_firstpos);
|
||||
_right->append_firstpos(node::_firstpos);
|
||||
_left->append_lastpos(node::_lastpos);
|
||||
_right->append_lastpos(node::_lastpos);
|
||||
}
|
||||
|
||||
virtual ~basic_selection_node() override
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type() const override
|
||||
{
|
||||
return node::SELECTION;
|
||||
}
|
||||
|
||||
virtual bool traverse(const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const override
|
||||
{
|
||||
perform_op_stack_.push(true);
|
||||
|
||||
switch (_right->what_type())
|
||||
{
|
||||
case node::SEQUENCE:
|
||||
case node::SELECTION:
|
||||
case node::ITERATION:
|
||||
perform_op_stack_.push(false);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
node_stack_.push(_right);
|
||||
node_stack_.push(_left);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
observer_ptr<node> _left;
|
||||
observer_ptr<node> _right;
|
||||
|
||||
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const override
|
||||
{
|
||||
if (perform_op_stack_.top())
|
||||
{
|
||||
observer_ptr<node> rhs_ = new_node_stack_.top();
|
||||
|
||||
new_node_stack_.pop();
|
||||
|
||||
observer_ptr<node> lhs_ = new_node_stack_.top();
|
||||
|
||||
node_ptr_vector_.emplace_back
|
||||
(std::make_unique<basic_selection_node>(lhs_, rhs_));
|
||||
new_node_stack_.top() = node_ptr_vector_.back().get();
|
||||
}
|
||||
else
|
||||
{
|
||||
down_ = true;
|
||||
}
|
||||
|
||||
perform_op_stack_.pop();
|
||||
}
|
||||
|
||||
// No copy construction.
|
||||
basic_selection_node(const basic_selection_node &) = delete;
|
||||
// No assignment.
|
||||
const basic_selection_node &operator =
|
||||
(const basic_selection_node &) = delete;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
121
YACReaderLibrary/lexertl/parser/tree/sequence_node.hpp
Normal file
121
YACReaderLibrary/lexertl/parser/tree/sequence_node.hpp
Normal file
@ -0,0 +1,121 @@
|
||||
// sequence_node.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SEQUENCE_NODE_HPP
|
||||
#define LEXERTL_SEQUENCE_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_sequence_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
using node = basic_node<id_type>;
|
||||
using bool_stack = typename node::bool_stack;
|
||||
using const_node_stack = typename node::const_node_stack;
|
||||
using node_ptr_vector = typename node::node_ptr_vector;
|
||||
using node_stack = typename node::node_stack;
|
||||
using node_type = typename node::node_type;
|
||||
using node_vector = typename node::node_vector;
|
||||
|
||||
basic_sequence_node(observer_ptr<node> left_, observer_ptr<node> right_) :
|
||||
node(left_->nullable() && right_->nullable()),
|
||||
_left(left_),
|
||||
_right(right_)
|
||||
{
|
||||
_left->append_firstpos(node::_firstpos);
|
||||
|
||||
if (_left->nullable())
|
||||
{
|
||||
_right->append_firstpos(node::_firstpos);
|
||||
}
|
||||
|
||||
if (_right->nullable())
|
||||
{
|
||||
_left->append_lastpos(node::_lastpos);
|
||||
}
|
||||
|
||||
_right->append_lastpos(node::_lastpos);
|
||||
|
||||
node_vector &lastpos_ = _left->lastpos();
|
||||
const node_vector &firstpos_ = _right->firstpos();
|
||||
|
||||
for (observer_ptr<node> node_ : lastpos_)
|
||||
{
|
||||
node_->append_followpos(firstpos_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~basic_sequence_node() override
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type() const override
|
||||
{
|
||||
return node::SEQUENCE;
|
||||
}
|
||||
|
||||
virtual bool traverse(const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const override
|
||||
{
|
||||
perform_op_stack_.push(true);
|
||||
|
||||
switch (_right->what_type())
|
||||
{
|
||||
case node::SEQUENCE:
|
||||
case node::SELECTION:
|
||||
case node::ITERATION:
|
||||
perform_op_stack_.push(false);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
node_stack_.push(_right);
|
||||
node_stack_.push(_left);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
observer_ptr<node> _left;
|
||||
observer_ptr<node> _right;
|
||||
|
||||
virtual void copy_node(node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const override
|
||||
{
|
||||
if (perform_op_stack_.top())
|
||||
{
|
||||
observer_ptr<node> rhs_ = new_node_stack_.top();
|
||||
|
||||
new_node_stack_.pop();
|
||||
|
||||
observer_ptr<node> lhs_ = new_node_stack_.top();
|
||||
|
||||
node_ptr_vector_.emplace_back
|
||||
(std::make_unique<basic_sequence_node>(lhs_, rhs_));
|
||||
new_node_stack_.top() = node_ptr_vector_.back().get();
|
||||
}
|
||||
else
|
||||
{
|
||||
down_ = true;
|
||||
}
|
||||
|
||||
perform_op_stack_.pop();
|
||||
}
|
||||
|
||||
// No copy construction.
|
||||
basic_sequence_node(const basic_sequence_node &) = delete;
|
||||
// No assignment.
|
||||
const basic_sequence_node &operator =(const basic_sequence_node &) = delete;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
72
YACReaderLibrary/lexertl/partition/charset.hpp
Normal file
72
YACReaderLibrary/lexertl/partition/charset.hpp
Normal file
@ -0,0 +1,72 @@
|
||||
// charset.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_CHARSET_HPP
|
||||
#define LEXERTL_CHARSET_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <set>
|
||||
#include "../string_token.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename char_type, typename id_type>
|
||||
struct basic_charset
|
||||
{
|
||||
using token = basic_string_token<char_type>;
|
||||
using index_set = std::set<id_type>;
|
||||
|
||||
token _token;
|
||||
index_set _index_set;
|
||||
|
||||
basic_charset() :
|
||||
_token(),
|
||||
_index_set()
|
||||
{
|
||||
}
|
||||
|
||||
basic_charset(const token &token_, const id_type index_) :
|
||||
_token(token_),
|
||||
_index_set()
|
||||
{
|
||||
_index_set.insert(index_);
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return _token.empty() && _index_set.empty();
|
||||
}
|
||||
|
||||
void intersect(basic_charset &rhs_, basic_charset &overlap_)
|
||||
{
|
||||
_token.intersect(rhs_._token, overlap_._token);
|
||||
|
||||
if (!overlap_._token.empty())
|
||||
{
|
||||
std::merge(_index_set.begin(), _index_set.end(),
|
||||
rhs_._index_set.begin(), rhs_._index_set.end(),
|
||||
std::inserter(overlap_._index_set,
|
||||
overlap_._index_set.end()));
|
||||
|
||||
if (_token.empty())
|
||||
{
|
||||
_index_set.clear();
|
||||
}
|
||||
|
||||
if (rhs_._token.empty())
|
||||
{
|
||||
rhs_._index_set.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
135
YACReaderLibrary/lexertl/partition/equivset.hpp
Normal file
135
YACReaderLibrary/lexertl/partition/equivset.hpp
Normal file
@ -0,0 +1,135 @@
|
||||
// equivset.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_EQUIVSET_HPP
|
||||
#define LEXERTL_EQUIVSET_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include "../parser/tree/node.hpp"
|
||||
#include <set>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
struct basic_equivset
|
||||
{
|
||||
using index_set = std::set<id_type>;
|
||||
using index_vector = std::vector<id_type>;
|
||||
using node = basic_node<id_type>;
|
||||
using node_vector = std::vector<observer_ptr<node>>;
|
||||
|
||||
index_vector _index_vector;
|
||||
id_type _id;
|
||||
bool _greedy;
|
||||
node_vector _followpos;
|
||||
|
||||
basic_equivset() :
|
||||
_index_vector(),
|
||||
_id(0),
|
||||
_greedy(true),
|
||||
_followpos()
|
||||
{
|
||||
}
|
||||
|
||||
basic_equivset(const index_set &index_set_, const id_type id_,
|
||||
const bool greedy_, const node_vector &followpos_) :
|
||||
_index_vector(index_set_.begin(), index_set_.end()),
|
||||
_id(id_),
|
||||
_greedy(greedy_),
|
||||
_followpos(followpos_)
|
||||
{
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return _index_vector.empty() && _followpos.empty();
|
||||
}
|
||||
|
||||
void intersect(basic_equivset &rhs_, basic_equivset &overlap_)
|
||||
{
|
||||
intersect_indexes(rhs_._index_vector, overlap_._index_vector);
|
||||
|
||||
if (!overlap_._index_vector.empty())
|
||||
{
|
||||
// Note that the LHS takes priority in order to
|
||||
// respect rule ordering priority in the lex spec.
|
||||
overlap_._id = _id;
|
||||
overlap_._greedy = _greedy;
|
||||
overlap_._followpos = _followpos;
|
||||
|
||||
auto overlap_begin_ = overlap_._followpos.cbegin();
|
||||
auto overlap_end_ = overlap_._followpos.cend();
|
||||
|
||||
for (observer_ptr<node> node_ : rhs_._followpos)
|
||||
{
|
||||
if (std::find(overlap_begin_, overlap_end_, node_) ==
|
||||
overlap_end_)
|
||||
{
|
||||
overlap_._followpos.push_back(node_);
|
||||
overlap_begin_ = overlap_._followpos.begin();
|
||||
overlap_end_ = overlap_._followpos.end();
|
||||
}
|
||||
}
|
||||
|
||||
if (_index_vector.empty())
|
||||
{
|
||||
_followpos.clear();
|
||||
}
|
||||
|
||||
if (rhs_._index_vector.empty())
|
||||
{
|
||||
rhs_._followpos.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void intersect_indexes(index_vector &rhs_, index_vector &overlap_)
|
||||
{
|
||||
std::set_intersection(_index_vector.begin(), _index_vector.end(),
|
||||
rhs_.begin(), rhs_.end(), std::back_inserter(overlap_));
|
||||
|
||||
if (!overlap_.empty())
|
||||
{
|
||||
remove(overlap_, _index_vector);
|
||||
remove(overlap_, rhs_);
|
||||
}
|
||||
}
|
||||
|
||||
void remove(const index_vector &source_, index_vector &dest_)
|
||||
{
|
||||
auto inter_ = source_.begin();
|
||||
auto inter_end_ = source_.end();
|
||||
auto reader_ = std::find(dest_.begin(), dest_.end(), *inter_);
|
||||
auto writer_ = reader_;
|
||||
auto dest_end_ = dest_.end();
|
||||
|
||||
while (writer_ != dest_end_ && inter_ != inter_end_)
|
||||
{
|
||||
if (*reader_ == *inter_)
|
||||
{
|
||||
++inter_;
|
||||
++reader_;
|
||||
}
|
||||
else
|
||||
{
|
||||
*writer_++ = *reader_++;
|
||||
}
|
||||
}
|
||||
|
||||
while (reader_ != dest_end_)
|
||||
{
|
||||
*writer_++ = *reader_++;
|
||||
}
|
||||
|
||||
dest_.resize(dest_.size() - source_.size());
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
1018
YACReaderLibrary/lexertl/rules.hpp
Normal file
1018
YACReaderLibrary/lexertl/rules.hpp
Normal file
File diff suppressed because it is too large
Load Diff
23
YACReaderLibrary/lexertl/runtime_error.hpp
Normal file
23
YACReaderLibrary/lexertl/runtime_error.hpp
Normal file
@ -0,0 +1,23 @@
|
||||
// runtime_error.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RUNTIME_ERROR_HPP
|
||||
#define LEXERTL_RUNTIME_ERROR_HPP
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// lexertl's own exception type. It adds nothing to std::runtime_error
// beyond a distinct type, so handlers can catch either this class or the
// standard base.
class runtime_error : public std::runtime_error
{
public:
    // what_arg_ is forwarded unchanged and is returned by what().
    runtime_error(const std::string &what_arg_)
        : std::runtime_error(what_arg_)
    {
    }
};
|
||||
}
|
||||
|
||||
#endif
|
28
YACReaderLibrary/lexertl/serialise.hpp
Normal file
28
YACReaderLibrary/lexertl/serialise.hpp
Normal file
@ -0,0 +1,28 @@
|
||||
// serialise.hpp
|
||||
// Copyright (c) 2007-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SERIALISE_HPP
|
||||
#define LEXERTL_SERIALISE_HPP
|
||||
|
||||
#include "state_machine.hpp"
|
||||
#include <boost/serialization/vector.hpp>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// IMPORTANT! This won't work if you don't enable RTTI!
|
||||
template<typename CharT, typename id_type, class Archive>
|
||||
void serialise(basic_state_machine<CharT, id_type> &sm_, Archive &ar_)
|
||||
{
|
||||
detail::basic_internals<id_type> &internals_ = sm_.data();
|
||||
|
||||
ar_ & internals_._eoi;
|
||||
ar_ & *internals_._lookup;
|
||||
ar_ & internals_._dfa_alphabet;
|
||||
ar_ & internals_._features;
|
||||
ar_ & *internals_._dfa;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
53
YACReaderLibrary/lexertl/sm_to_csm.hpp
Normal file
53
YACReaderLibrary/lexertl/sm_to_csm.hpp
Normal file
@ -0,0 +1,53 @@
|
||||
// sm_to_csm.hpp
|
||||
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SM_TO_CSM_HPP
|
||||
#define LEXERTL_SM_TO_CSM_HPP
|
||||
|
||||
#include "enums.hpp"
|
||||
#include "observer_ptr.hpp"
|
||||
#include <cstddef>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename sm, typename char_state_machine>
|
||||
void sm_to_csm(const sm &sm_, char_state_machine &csm_)
|
||||
{
|
||||
using id_type = typename sm::traits::id_type;
|
||||
using internals = typename sm::internals;
|
||||
using string_token = typename char_state_machine::state::string_token;
|
||||
using index_type = typename string_token::index_type;
|
||||
using string_token_vector =
|
||||
typename char_state_machine::string_token_vector;
|
||||
const internals &internals_ = sm_.data();
|
||||
const std::size_t dfas_ = internals_._dfa.size();
|
||||
|
||||
for (id_type i_ = 0; i_ < dfas_; ++i_)
|
||||
{
|
||||
if (internals_._dfa_alphabet[i_] == 0) continue;
|
||||
|
||||
const std::size_t alphabet_ = internals_._dfa_alphabet[i_] -
|
||||
transitions_index;
|
||||
string_token_vector token_vector_(alphabet_, string_token());
|
||||
observer_ptr<const id_type> ptr_ = &internals_._lookup[i_].front();
|
||||
|
||||
for (std::size_t c_ = 0; c_ < 256; ++c_, ++ptr_)
|
||||
{
|
||||
if (*ptr_ >= transitions_index)
|
||||
{
|
||||
string_token &token_ = token_vector_
|
||||
[*ptr_ - transitions_index];
|
||||
|
||||
token_.insert(typename string_token::range
|
||||
(index_type(c_), index_type(c_)));
|
||||
}
|
||||
}
|
||||
|
||||
csm_.append(token_vector_, internals_, i_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
44
YACReaderLibrary/lexertl/sm_traits.hpp
Normal file
44
YACReaderLibrary/lexertl/sm_traits.hpp
Normal file
@ -0,0 +1,44 @@
|
||||
// sm_traits.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_SM_TRAITS_HPP
|
||||
#define LEXERTL_SM_TRAITS_HPP
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Compile-time description of a state machine flavour.
//   ch_type - character type of the input
//   sm_type - integer type used for ids and table cells
//   comp    - exposed as 'compressed'
//   look    - exposed as 'lookup'
//   dfa_nfa - exposed as 'is_dfa'
// Primary template: char_type is the same as the input character type.
template<typename ch_type, typename sm_type, bool comp, bool look,
    bool dfa_nfa>
struct basic_sm_traits
{
    enum
    {
        // True when the input character type is wider than two bytes.
        char_24_bit = sizeof(ch_type) > 2,
        compressed = comp,
        lookup = look,
        is_dfa = dfa_nfa
    };

    using input_char_type = ch_type;
    using char_type = ch_type;
    using id_type = sm_type;

    // All bits set, converted to id_type.
    static id_type npos()
    {
        return static_cast<id_type>(~0);
    }
};

// Partial specialisation for comp == true: identical to the primary
// template except that char_type is always unsigned char, while
// input_char_type still reports the original character type.
template<typename ch_type, typename sm_type, bool look, bool dfa_nfa>
struct basic_sm_traits<ch_type, sm_type, true, look, dfa_nfa>
{
    enum
    {
        // True when the input character type is wider than two bytes.
        char_24_bit = sizeof(ch_type) > 2,
        compressed = true,
        lookup = look,
        is_dfa = dfa_nfa
    };

    using input_char_type = ch_type;
    using char_type = unsigned char;
    using id_type = sm_type;

    // All bits set, converted to id_type.
    static id_type npos()
    {
        return static_cast<id_type>(~0);
    }
};
|
||||
}
|
||||
|
||||
#endif
|
521
YACReaderLibrary/lexertl/state_machine.hpp
Normal file
521
YACReaderLibrary/lexertl/state_machine.hpp
Normal file
@ -0,0 +1,521 @@
|
||||
// state_machine.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_STATE_MACHINE_HPP
|
||||
#define LEXERTL_STATE_MACHINE_HPP
|
||||
|
||||
// memcmp()
|
||||
#include <cstring>
|
||||
#include "internals.hpp"
|
||||
#include <map>
|
||||
#include "observer_ptr.hpp"
|
||||
#include <set>
|
||||
#include "sm_traits.hpp"
|
||||
#include "string_token.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename char_type, typename id_ty = uint16_t>
|
||||
class basic_state_machine
|
||||
{
|
||||
public:
|
||||
using id_type = id_ty;
|
||||
using traits =
|
||||
basic_sm_traits<char_type, id_type,
|
||||
(sizeof(char_type) > 1), true, true>;
|
||||
using internals = detail::basic_internals<id_type>;
|
||||
|
||||
// If you get a compile error here you have
|
||||
// failed to define an unsigned id type.
|
||||
static_assert(std::is_unsigned<id_type>::value, "Your id type is signed");
|
||||
|
||||
basic_state_machine() :
|
||||
_internals()
|
||||
{
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
_internals.clear();
|
||||
}
|
||||
|
||||
internals &data()
|
||||
{
|
||||
return _internals;
|
||||
}
|
||||
|
||||
const internals &data() const
|
||||
{
|
||||
return _internals;
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return _internals.empty();
|
||||
}
|
||||
|
||||
id_type eoi() const
|
||||
{
|
||||
return _internals._eoi;
|
||||
}
|
||||
|
||||
void minimise()
|
||||
{
|
||||
const id_type dfas_ = static_cast<id_type>(_internals._dfa.size());
|
||||
|
||||
for (id_type i_ = 0; i_ < dfas_; ++i_)
|
||||
{
|
||||
const id_type dfa_alphabet_ = _internals._dfa_alphabet[i_];
|
||||
id_type_vector &dfa_ = _internals._dfa[i_];
|
||||
|
||||
if (dfa_alphabet_ != 0)
|
||||
{
|
||||
std::size_t size_ = 0;
|
||||
|
||||
do
|
||||
{
|
||||
size_ = dfa_.size();
|
||||
minimise_dfa(dfa_alphabet_, dfa_, size_);
|
||||
} while (dfa_.size() != size_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static id_type npos()
|
||||
{
|
||||
return static_cast<id_type>(~0);
|
||||
}
|
||||
|
||||
static id_type skip()
|
||||
{
|
||||
return static_cast<id_type>(~1);
|
||||
}
|
||||
|
||||
void swap(basic_state_machine &rhs_)
|
||||
{
|
||||
_internals.swap(rhs_._internals);
|
||||
}
|
||||
|
||||
private:
|
||||
using id_type_vector = typename internals::id_type_vector;
|
||||
using index_set = std::set<id_type>;
|
||||
internals _internals;
|
||||
|
||||
void minimise_dfa(const id_type dfa_alphabet_,
|
||||
id_type_vector &dfa_, std::size_t size_)
|
||||
{
|
||||
observer_ptr<const id_type> first_ = &dfa_.front();
|
||||
observer_ptr<const id_type> end_ = first_ + size_;
|
||||
id_type index_ = 1;
|
||||
id_type new_index_ = 1;
|
||||
id_type_vector lookup_(size_ / dfa_alphabet_, npos());
|
||||
observer_ptr<id_type> lookup_ptr_ = &lookup_.front();
|
||||
index_set index_set_;
|
||||
const id_type bol_index_ = dfa_.front();
|
||||
|
||||
*lookup_ptr_ = 0;
|
||||
// Only one 'jam' state, so skip it.
|
||||
first_ += dfa_alphabet_;
|
||||
|
||||
for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
|
||||
{
|
||||
observer_ptr<const id_type> second_ = first_ + dfa_alphabet_;
|
||||
|
||||
for (id_type curr_index_ = index_ + 1; second_ < end_;
|
||||
++curr_index_, second_ += dfa_alphabet_)
|
||||
{
|
||||
if (index_set_.find(curr_index_) != index_set_.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Some systems have memcmp in namespace std.
|
||||
using namespace std;
|
||||
|
||||
if (memcmp(first_, second_, sizeof(id_type) *
|
||||
dfa_alphabet_) == 0)
|
||||
{
|
||||
index_set_.insert(curr_index_);
|
||||
lookup_ptr_[curr_index_] = new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (lookup_ptr_[index_] == npos())
|
||||
{
|
||||
lookup_ptr_[index_] = new_index_;
|
||||
++new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (!index_set_.empty())
|
||||
{
|
||||
observer_ptr<const id_type> front_ = &dfa_.front();
|
||||
id_type_vector new_dfa_(front_, front_ + dfa_alphabet_);
|
||||
auto set_end_ = index_set_.cend();
|
||||
observer_ptr<const id_type> ptr_ = front_ + dfa_alphabet_;
|
||||
observer_ptr<id_type> new_ptr_ = nullptr;
|
||||
|
||||
new_dfa_.resize(size_ - index_set_.size() * dfa_alphabet_, 0);
|
||||
new_ptr_ = &new_dfa_.front() + dfa_alphabet_;
|
||||
size_ /= dfa_alphabet_;
|
||||
|
||||
if (bol_index_)
|
||||
{
|
||||
new_dfa_.front() = lookup_ptr_[bol_index_];
|
||||
}
|
||||
|
||||
for (index_ = 1; index_ < size_; ++index_)
|
||||
{
|
||||
if (index_set_.find(index_) != set_end_)
|
||||
{
|
||||
ptr_ += dfa_alphabet_;
|
||||
continue;
|
||||
}
|
||||
|
||||
new_ptr_[end_state_index] = ptr_[end_state_index];
|
||||
new_ptr_[id_index] = ptr_[id_index];
|
||||
new_ptr_[user_id_index] = ptr_[user_id_index];
|
||||
new_ptr_[push_dfa_index] = ptr_[push_dfa_index];
|
||||
new_ptr_[next_dfa_index] = ptr_[next_dfa_index];
|
||||
new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
|
||||
new_ptr_ += transitions_index;
|
||||
ptr_ += transitions_index;
|
||||
|
||||
for (id_type i_ = transitions_index; i_ < dfa_alphabet_; ++i_)
|
||||
{
|
||||
*new_ptr_++ = lookup_ptr_[*ptr_++];
|
||||
}
|
||||
}
|
||||
|
||||
dfa_.swap(new_dfa_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using state_machine = basic_state_machine<char>;
|
||||
using wstate_machine = basic_state_machine<wchar_t>;
|
||||
using u32state_machine = basic_state_machine<char32_t>;
|
||||
|
||||
template<typename char_type, typename id_ty = uint16_t,
|
||||
bool is_dfa = true>
|
||||
struct basic_char_state_machine
|
||||
{
|
||||
using id_type = id_ty;
|
||||
using traits = basic_sm_traits<char_type, id_type, false, false, is_dfa>;
|
||||
using internals = detail::basic_internals<id_type>;
|
||||
using id_type_vector = typename internals::id_type_vector;
|
||||
|
||||
struct state
|
||||
{
|
||||
using string_token = basic_string_token<char_type>;
|
||||
using id_type_string_token_map = std::map<id_type, string_token>;
|
||||
using id_type_string_token_pair = std::pair<id_type, string_token>;
|
||||
enum push_pop_dfa {neither, push_dfa, pop_dfa};
|
||||
|
||||
bool _end_state;
|
||||
push_pop_dfa _push_pop_dfa;
|
||||
id_type _id;
|
||||
id_type _user_id;
|
||||
id_type _push_dfa;
|
||||
id_type _next_dfa;
|
||||
id_type _eol_index;
|
||||
id_type_string_token_map _transitions;
|
||||
|
||||
state() :
|
||||
_end_state(false),
|
||||
_push_pop_dfa(neither),
|
||||
_id(0),
|
||||
_user_id(traits::npos()),
|
||||
_push_dfa(traits::npos()),
|
||||
_next_dfa(0),
|
||||
_eol_index(traits::npos()),
|
||||
_transitions()
|
||||
{
|
||||
}
|
||||
|
||||
bool operator ==(const state rhs_) const
|
||||
{
|
||||
return _end_state == rhs_._end_state &&
|
||||
_push_pop_dfa == rhs_._push_pop_dfa &&
|
||||
_id == rhs_._id &&
|
||||
_user_id == rhs_._user_id &&
|
||||
_push_dfa == rhs_._push_dfa &&
|
||||
_next_dfa == rhs_._next_dfa &&
|
||||
_eol_index == rhs_._eol_index &&
|
||||
_transitions == rhs_._transitions;
|
||||
}
|
||||
};
|
||||
|
||||
using string_token = typename state::string_token;
|
||||
using state_vector = std::vector<state>;
|
||||
using string_token_vector = std::vector<string_token>;
|
||||
using id_type_string_token_pair =
|
||||
typename state::id_type_string_token_pair;
|
||||
|
||||
struct dfa
|
||||
{
|
||||
id_type _bol_index;
|
||||
state_vector _states;
|
||||
|
||||
dfa(const std::size_t size_) :
|
||||
_bol_index(traits::npos()),
|
||||
_states(state_vector(size_))
|
||||
{
|
||||
}
|
||||
|
||||
std::size_t size() const
|
||||
{
|
||||
return _states.size();
|
||||
}
|
||||
|
||||
void swap(dfa &rhs_)
|
||||
{
|
||||
std::swap(_bol_index, rhs_._bol_index);
|
||||
_states.swap(rhs_._states);
|
||||
}
|
||||
};
|
||||
|
||||
static_assert(std::is_move_assignable<dfa>::value &&
|
||||
std::is_move_constructible<dfa>::value, "dfa is not movable.");
|
||||
using dfa_vector = std::vector<dfa>;
|
||||
|
||||
static_assert(std::is_unsigned<id_type>::value, "Your id type is signed");
|
||||
dfa_vector _sm_vector;
|
||||
|
||||
basic_char_state_machine() :
|
||||
_sm_vector()
|
||||
{
|
||||
}
|
||||
|
||||
void append(const string_token_vector &token_vector_,
|
||||
const internals &internals_, const id_type dfa_index_)
|
||||
{
|
||||
const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[dfa_index_];
|
||||
const std::size_t alphabet_ = dfa_alphabet_ - transitions_index;
|
||||
const id_type_vector &source_dfa_ = internals_._dfa[dfa_index_];
|
||||
observer_ptr<const id_type> ptr_ = &source_dfa_.front();
|
||||
const std::size_t size_ = (source_dfa_.size() - dfa_alphabet_) /
|
||||
dfa_alphabet_;
|
||||
typename state::id_type_string_token_map::iterator trans_iter_;
|
||||
|
||||
_sm_vector.push_back(dfa(size_));
|
||||
|
||||
dfa &dest_dfa_ = _sm_vector.back();
|
||||
|
||||
if (*ptr_)
|
||||
{
|
||||
dest_dfa_._bol_index = *ptr_ - 1;
|
||||
}
|
||||
|
||||
ptr_ += dfa_alphabet_;
|
||||
|
||||
for (id_type i_ = 0; i_ < size_; ++i_)
|
||||
{
|
||||
state &state_ = dest_dfa_._states[i_];
|
||||
|
||||
state_._end_state = ptr_[end_state_index] != 0;
|
||||
|
||||
if (ptr_[push_dfa_index] != npos())
|
||||
{
|
||||
state_._push_pop_dfa = state::push_dfa;
|
||||
}
|
||||
else if (ptr_[end_state_index] & pop_dfa_bit)
|
||||
{
|
||||
state_._push_pop_dfa = state::pop_dfa;
|
||||
}
|
||||
|
||||
state_._id = ptr_[id_index];
|
||||
state_._user_id = ptr_[user_id_index];
|
||||
state_._push_dfa = ptr_[push_dfa_index];
|
||||
state_._next_dfa = ptr_[next_dfa_index];
|
||||
|
||||
if (ptr_[eol_index])
|
||||
{
|
||||
state_._eol_index = ptr_[eol_index] - 1;
|
||||
}
|
||||
|
||||
ptr_ += transitions_index;
|
||||
|
||||
for (id_type col_index_ = 0; col_index_ < alphabet_;
|
||||
++col_index_, ++ptr_)
|
||||
{
|
||||
const id_type next_ = *ptr_;
|
||||
|
||||
if (next_ > 0)
|
||||
{
|
||||
trans_iter_ = state_._transitions.find(next_ - 1);
|
||||
|
||||
if (trans_iter_ == state_._transitions.end())
|
||||
{
|
||||
trans_iter_ = state_._transitions.insert
|
||||
(id_type_string_token_pair(static_cast<id_type>
|
||||
(next_ - 1), token_vector_[col_index_])).first;
|
||||
}
|
||||
else
|
||||
{
|
||||
trans_iter_->second.insert(token_vector_[col_index_]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
_sm_vector.clear();
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return _sm_vector.empty();
|
||||
}
|
||||
|
||||
void minimise()
|
||||
{
|
||||
const id_type dfas_ = static_cast<id_type>(_sm_vector.size());
|
||||
|
||||
for (id_type i_ = 0; i_ < dfas_; ++i_)
|
||||
{
|
||||
observer_ptr<dfa> dfa_ = &_sm_vector[i_];
|
||||
|
||||
if (dfa_->size() > 0)
|
||||
{
|
||||
std::size_t size_ = 0;
|
||||
|
||||
do
|
||||
{
|
||||
size_ = dfa_->size();
|
||||
minimise_dfa(*dfa_, size_);
|
||||
} while (dfa_->size() != size_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static id_type npos()
|
||||
{
|
||||
return traits::npos();
|
||||
}
|
||||
|
||||
id_type size() const
|
||||
{
|
||||
return static_cast<id_type>(_sm_vector.size());
|
||||
}
|
||||
|
||||
static id_type skip()
|
||||
{
|
||||
return ~static_cast<id_type>(1);
|
||||
}
|
||||
|
||||
void swap(basic_char_state_machine &csm_)
|
||||
{
|
||||
_sm_vector.swap(csm_._sm_vector);
|
||||
}
|
||||
|
||||
private:
|
||||
using index_set = std::set<id_type>;
|
||||
|
||||
void minimise_dfa(dfa &dfa_, std::size_t size_)
|
||||
{
|
||||
observer_ptr<const state> first_ = &dfa_._states.front();
|
||||
observer_ptr<const state> end_ = first_ + size_;
|
||||
id_type index_ = 0;
|
||||
id_type new_index_ = 0;
|
||||
id_type_vector lookup_(size_, npos());
|
||||
observer_ptr<id_type> lookup_ptr_ = &lookup_.front();
|
||||
index_set index_set_;
|
||||
|
||||
for (; first_ != end_; ++first_, ++index_)
|
||||
{
|
||||
observer_ptr<const state> second_ = first_ + 1;
|
||||
|
||||
for (id_type curr_index_ = index_ + 1; second_ != end_;
|
||||
++curr_index_, ++second_)
|
||||
{
|
||||
if (index_set_.find(curr_index_) != index_set_.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (*first_ == *second_)
|
||||
{
|
||||
index_set_.insert(curr_index_);
|
||||
lookup_ptr_[curr_index_] = new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (lookup_ptr_[index_] == npos())
|
||||
{
|
||||
lookup_ptr_[index_] = new_index_;
|
||||
++new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (!index_set_.empty())
|
||||
{
|
||||
observer_ptr<const state> front_ = &dfa_._states.front();
|
||||
dfa new_dfa_(new_index_);
|
||||
auto set_end_ = index_set_.cend();
|
||||
observer_ptr<const state> ptr_ = front_;
|
||||
observer_ptr<state> new_ptr_ = &new_dfa_._states.front();
|
||||
|
||||
if (dfa_._bol_index != npos())
|
||||
{
|
||||
new_dfa_._bol_index = lookup_ptr_[dfa_._bol_index];
|
||||
}
|
||||
|
||||
for (index_ = 0; index_ < size_; ++index_)
|
||||
{
|
||||
if (index_set_.find(index_) != set_end_)
|
||||
{
|
||||
++ptr_;
|
||||
continue;
|
||||
}
|
||||
|
||||
new_ptr_->_end_state = ptr_->_end_state;
|
||||
new_ptr_->_id = ptr_->_end_state;
|
||||
new_ptr_->_user_id = ptr_->_user_id;
|
||||
new_ptr_->_next_dfa = ptr_->_next_dfa;
|
||||
|
||||
if (ptr_->_eol_index != npos())
|
||||
{
|
||||
new_ptr_->_eol_index = lookup_ptr_[ptr_->_eol_index];
|
||||
}
|
||||
|
||||
auto iter_ = ptr_->_transitions.cbegin();
|
||||
auto end_ = ptr_->_transitions.cend();
|
||||
typename state::id_type_string_token_map::iterator find_;
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
find_ = new_ptr_->_transitions.find
|
||||
(lookup_ptr_[iter_->first]);
|
||||
|
||||
if (find_ == new_ptr_->_transitions.end())
|
||||
{
|
||||
new_ptr_->_transitions.insert
|
||||
(id_type_string_token_pair
|
||||
(lookup_ptr_[iter_->first], iter_->second));
|
||||
}
|
||||
else
|
||||
{
|
||||
find_->second.insert(iter_->second);
|
||||
}
|
||||
}
|
||||
|
||||
++ptr_;
|
||||
++new_ptr_;
|
||||
}
|
||||
|
||||
dfa_.swap(new_dfa_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using char_state_machine = basic_char_state_machine<char>;
|
||||
using wchar_state_machine = basic_char_state_machine<wchar_t>;
|
||||
using u32char_state_machine = basic_char_state_machine<char32_t>;
|
||||
}
|
||||
|
||||
#endif
|
352
YACReaderLibrary/lexertl/stream_shared_iterator.hpp
Normal file
352
YACReaderLibrary/lexertl/stream_shared_iterator.hpp
Normal file
@ -0,0 +1,352 @@
|
||||
// stream_shared_iterator.hpp
|
||||
// Copyright (c) 2010-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_STREAM_SHARED_ITERATOR_HPP
|
||||
#define LEXERTL_STREAM_SHARED_ITERATOR_HPP
|
||||
|
||||
#include <algorithm>
|
||||
// memcpy, memmove
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include "runtime_error.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Forward iterator over a std::basic_istream that lets several iterator
// copies share one buffered window onto the stream.
//
// The iterator constructed from the stream is the "master". It owns no
// position until it is first dereferenced or incremented; copies of it
// start at the lowest position still referenced. A default-constructed
// iterator is the end iterator. All iterators made from the same master
// share one reference-counted 'shared' object holding the buffer; the
// buffer only discards data below the lowest position of any live
// iterator, and grows when nothing can be discarded.
//
// Dereferencing or incrementing an end iterator throws runtime_error.
template<typename char_type>
class basic_stream_shared_iterator
{
public:
    using istream = std::basic_istream<char_type>;
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::size_t;
    using value_type = char_type;
    using pointer = char_type *;
    using reference = char_type &;

    // Constructs the end iterator.
    basic_stream_shared_iterator() :
        _master(false),
        _live(false),
        _index(shared::npos()),
        _shared(nullptr)
    {
    }

    // Constructs the master iterator for stream_.
    //   buff_size_ - initial buffer size, filled immediately
    //   increment_ - amount the buffer grows by when it cannot shift
    basic_stream_shared_iterator(istream &stream_,
        const std::size_t buff_size_ = 1024,
        const std::size_t increment_ = 1024) :
        _master(true),
        _live(false),
        _index(shared::npos()),
        // For exception safety don't call new yet
        _shared(nullptr)
    {
        // Safe to call potentially throwing new now.
        _shared = new shared(stream_, buff_size_, increment_);
        ++_shared->_ref_count;
        _shared->_clients.push_back(this);
    }

    basic_stream_shared_iterator(const basic_stream_shared_iterator &rhs_) :
        _master(false),
        _live(false),
        _index(rhs_._master ? rhs_._shared->lowest() : rhs_._index),
        _shared(rhs_._shared)
    {
        if (_shared)
        {
            // New copy of an iterator.
            // The assumption is that any copy must be live
            // even if the rhs is not (otherwise we will never
            // have a record of the start of the current range!)
            ++_shared->_ref_count;
            _shared->_clients.push_back(this);
            _live = true;
        }
    }

    ~basic_stream_shared_iterator()
    {
        release();
    }

    // Copy assignment. The result has rhs_'s position and liveness and is
    // never a master.
    //
    // The reference on rhs_'s buffer is taken before the reference on the
    // current one is dropped, so the count is exact whether the two sides
    // share a buffer, use different buffers, or either has none. The
    // previous case-by-case code could count the same iterator twice,
    // skip the decrement (or the delete) on the old buffer, and register
    // 'this' as a client a second time.
    basic_stream_shared_iterator &operator =
        (const basic_stream_shared_iterator &rhs_)
    {
        if (this != &rhs_)
        {
            const std::size_t index_ = rhs_._master ?
                rhs_._shared->lowest() : rhs_._index;
            shared *shared_ = rhs_._shared;

            if (shared_)
            {
                ++shared_->_ref_count;
            }

            release();
            _master = false;
            _live = rhs_._live;
            _index = index_;
            _shared = shared_;

            if (_live)
            {
                // Only live iterators are tracked as clients.
                _shared->_clients.push_back(this);
            }
        }

        return *this;
    }

    // Two iterators are equal when they have the same position and either
    // share a buffer, or are both at end (npos) with at least one of them
    // having no buffer at all.
    bool operator ==(const basic_stream_shared_iterator &rhs_) const
    {
        const bool either_at_end_ = _index == shared::npos() ||
            rhs_._index == shared::npos();
        const bool either_null_ = !_shared || !rhs_._shared;

        return _index == rhs_._index &&
            (_shared == rhs_._shared || (either_at_end_ && either_null_));
    }

    bool operator !=(const basic_stream_shared_iterator &rhs_) const
    {
        return !(*this == rhs_);
    }

    const char_type &operator *()
    {
        check_master();
        return _shared->_buffer[_index];
    }

    basic_stream_shared_iterator &operator ++()
    {
        check_master();
        ++_index;
        update_state();
        return *this;
    }

    basic_stream_shared_iterator operator ++(int)
    {
        basic_stream_shared_iterator iter_ = *this;

        check_master();
        ++_index;
        update_state();
        return iter_;
    }

private:
    // Buffer and bookkeeping shared by every iterator on one stream.
    class shared
    {
    public:
        // Number of iterators whose _shared points at this object.
        std::size_t _ref_count;
        using char_vector = std::vector<char_type>;
        using iter_list = std::vector<basic_stream_shared_iterator *>;
        istream &_stream;
        std::size_t _increment;
        // Number of valid characters at the start of _buffer.
        std::size_t _len;
        char_vector _buffer;
        // Iterators whose _index must follow buffer shifts.
        iter_list _clients;

        shared(istream &stream_, const std::size_t buff_size_,
            const std::size_t increment_) :
            _ref_count(0),
            _stream(stream_),
            _increment(increment_),
            _len(0),
            _buffer(buff_size_),
            _clients()
        {
            // data() rather than front(): the buffer may be empty.
            _stream.read(_buffer.data(),
                static_cast<std::streamsize>(_buffer.size()));
            _len = static_cast<std::size_t>(_stream.gcount());
        }

        shared(const shared &) = delete;
        shared &operator =(const shared &) = delete;

        // Makes room for, and reads, more input. Returns false when the
        // stream produced no further characters.
        //
        // If some client is still at position 0 nothing can be thrown
        // away, so the buffer grows by _increment. Otherwise everything
        // below the lowest client position is discarded, the rest is
        // shifted to the front, and all client positions are lowered by
        // the same amount.
        bool reload_buffer()
        {
            const std::size_t lowest_ = lowest();
            std::size_t read_ = 0;

            if (lowest_ == 0)
            {
                // Resize buffer
                const std::size_t old_size_ = _buffer.size();

                _buffer.resize(old_size_ + _increment);
                _stream.read(_buffer.data() + old_size_,
                    static_cast<std::streamsize>(_increment));
                read_ = static_cast<std::size_t>(_stream.gcount());

                if (read_)
                {
                    read_ += old_size_;
                    _len = read_;
                }
            }
            else
            {
                // Characters kept (moved to the front) ...
                const std::size_t start_ = _buffer.size() - lowest_;
                // ... and the space freed up behind them.
                const std::size_t len_ = _buffer.size() - start_;

                // Source and destination overlap whenever more than half
                // of the buffer is kept, so this must be memmove, not
                // memcpy. lowest_ may equal size(), hence data() + offset
                // instead of indexing.
                std::memmove(_buffer.data(), _buffer.data() + lowest_,
                    start_ * sizeof(char_type));
                _stream.read(_buffer.data() + start_,
                    static_cast<std::streamsize>(len_));
                read_ = static_cast<std::size_t>(_stream.gcount());
                subtract(lowest_);

                if (read_)
                {
                    read_ += start_;
                    _len = read_;
                }
                else
                {
                    _len = highest();
                }
            }

            return read_ != 0;
        }

        // Stops tracking ptr_ (no effect if it is not a client).
        void erase(basic_stream_shared_iterator *ptr_)
        {
            auto iter_ = std::find(_clients.begin(), _clients.end(), ptr_);

            if (iter_ != _clients.end())
                _clients.erase(iter_);
        }

        // Lowest position of any client, or 0 if no client has one.
        std::size_t lowest() const
        {
            std::size_t lowest_ = npos();

            for (const basic_stream_shared_iterator *ptr_ : _clients)
            {
                if (ptr_->_index < lowest_)
                {
                    lowest_ = ptr_->_index;
                }
            }

            if (lowest_ == npos())
            {
                lowest_ = 0;
            }

            return lowest_;
        }

        // Highest position of any client, or 0 if no client has one.
        std::size_t highest() const
        {
            std::size_t highest_ = 0;

            for (const basic_stream_shared_iterator *ptr_ : _clients)
            {
                if (ptr_->_index != npos() && ptr_->_index > highest_)
                {
                    highest_ = ptr_->_index;
                }
            }

            return highest_;
        }

        // Lowers every client position by lowest_ after a buffer shift.
        void subtract(const std::size_t lowest_)
        {
            for (basic_stream_shared_iterator *ptr_ : _clients)
            {
                if (ptr_->_index != npos())
                {
                    ptr_->_index -= lowest_;
                }
            }
        }

        // Position value meaning "no position" / end.
        static std::size_t npos()
        {
            return ~static_cast<std::size_t>(0);
        }
    };

    // True only for the iterator constructed from the stream, until it is
    // first used.
    bool _master;
    // True while this iterator has a position in the buffer.
    bool _live;
    std::size_t _index;
    shared *_shared;

    // Drops this iterator's reference on its buffer (if any), destroying
    // the buffer when this was the last reference.
    void release()
    {
        if (_shared)
        {
            --_shared->_ref_count;
            _shared->erase(this);

            if (_shared->_ref_count == 0)
            {
                delete _shared;
            }

            _shared = nullptr;
        }
    }

    // Rejects use of an end iterator, and turns a so far unused master
    // into an ordinary live iterator at the lowest referenced position.
    void check_master()
    {
        if (!_shared)
        {
            throw runtime_error("Cannot manipulate null (end) "
                "stream_shared_iterators.");
        }

        if (_master)
        {
            _master = false;
            _live = true;
            _index = _shared->lowest();
        }
    }

    // Called after an increment: fetches more input when the position has
    // run off the valid data, and becomes an end iterator if there is
    // none.
    void update_state()
    {
        if (_index >= _shared->_len)
        {
            if (!_shared->reload_buffer())
            {
                _shared->erase(this);
                _index = shared::npos();
                _live = false;
            }
        }
    }
};
|
||||
|
||||
using stream_shared_iterator = basic_stream_shared_iterator<char>;
|
||||
using wstream_shared_iterator = basic_stream_shared_iterator<wchar_t>;
|
||||
}
|
||||
|
||||
#endif
|
439
YACReaderLibrary/lexertl/string_token.hpp
Normal file
439
YACReaderLibrary/lexertl/string_token.hpp
Normal file
@ -0,0 +1,439 @@
|
||||
// string_token.hpp
|
||||
// Copyright (c) 2005-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_STRING_TOKEN_HPP
|
||||
#define LEXERTL_STRING_TOKEN_HPP
|
||||
|
||||
#include "char_traits.hpp"
|
||||
#include <ios> // Needed by GCC 4.4
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename ch_type>
|
||||
struct basic_string_token
|
||||
{
|
||||
using char_type = ch_type;
|
||||
using char_traits = basic_char_traits<char_type>;
|
||||
using index_type = typename char_traits::index_type;
|
||||
using range = std::pair<index_type, index_type>;
|
||||
using range_vector = std::vector<range>;
|
||||
using string = std::basic_string<char_type>;
|
||||
using string_token = basic_string_token<char_type>;
|
||||
|
||||
range_vector _ranges;
|
||||
|
||||
basic_string_token() :
|
||||
_ranges()
|
||||
{
|
||||
}
|
||||
|
||||
basic_string_token(char_type ch_) :
|
||||
_ranges()
|
||||
{
|
||||
insert(range(ch_, ch_));
|
||||
}
|
||||
|
||||
basic_string_token(char_type first_, char_type second_) :
|
||||
_ranges()
|
||||
{
|
||||
insert(range(first_, second_));
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
_ranges.clear();
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return _ranges.empty();
|
||||
}
|
||||
|
||||
bool any() const
|
||||
{
|
||||
return _ranges.size() == 1 && _ranges.front().first == 0 &&
|
||||
_ranges.front().second == char_traits::max_val();
|
||||
}
|
||||
|
||||
bool operator <(const basic_string_token &rhs_) const
|
||||
{
|
||||
return _ranges < rhs_._ranges;
|
||||
}
|
||||
|
||||
bool operator ==(const basic_string_token &rhs_) const
|
||||
{
|
||||
return _ranges == rhs_._ranges;
|
||||
}
|
||||
|
||||
bool negatable() const
|
||||
{
|
||||
std::size_t size_ = 0;
|
||||
auto iter_ = _ranges.cbegin();
|
||||
auto end_ = _ranges.cend();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
size_ += static_cast<std::size_t>(iter_->second) + 1 -
|
||||
static_cast<std::size_t>(iter_->first);
|
||||
}
|
||||
|
||||
return size_ > static_cast<std::size_t>(char_traits::max_val()) / 2;
|
||||
}
|
||||
|
||||
void swap(basic_string_token &rhs_)
|
||||
{
|
||||
_ranges.swap(rhs_._ranges);
|
||||
}
|
||||
|
||||
void insert(const basic_string_token &rhs_)
|
||||
{
|
||||
auto iter_ = rhs_._ranges.cbegin();
|
||||
auto end_ = rhs_._ranges.cend();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
insert(*iter_);
|
||||
}
|
||||
}
|
||||
|
||||
// Deliberately pass by value - may modify
|
||||
typename range_vector::iterator insert(range rhs_)
|
||||
{
|
||||
bool insert_ = true;
|
||||
auto iter_ = _ranges.begin();
|
||||
auto end_ = _ranges.end();
|
||||
auto erase_iter_ = end_;
|
||||
|
||||
while (iter_ != end_)
|
||||
{
|
||||
// follows current item
|
||||
if (rhs_.first > iter_->second)
|
||||
{
|
||||
if (rhs_.first == iter_->second + 1)
|
||||
{
|
||||
// Auto normalise
|
||||
rhs_.first = iter_->first;
|
||||
}
|
||||
else
|
||||
{
|
||||
// No intersection, consider next
|
||||
++iter_;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Precedes current item
|
||||
else if (rhs_.second < iter_->first)
|
||||
{
|
||||
if (rhs_.second == iter_->first - 1)
|
||||
{
|
||||
// Auto normalise
|
||||
rhs_.second = iter_->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
// insert here
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// overlap (under)
|
||||
if (rhs_.first < iter_->first)
|
||||
{
|
||||
if (rhs_.second < iter_->second)
|
||||
{
|
||||
rhs_.second = iter_->second;
|
||||
}
|
||||
}
|
||||
// overlap (over)
|
||||
else if (rhs_.second > iter_->second)
|
||||
{
|
||||
if (rhs_.first > iter_->first)
|
||||
{
|
||||
rhs_.first = iter_->first;
|
||||
}
|
||||
}
|
||||
// subset
|
||||
else
|
||||
{
|
||||
insert_ = false;
|
||||
iter_ = _ranges.end();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Code minimisation: this always applies unless we have already
|
||||
// exited the loop, or "continue" executed.
|
||||
if (erase_iter_ == end_)
|
||||
{
|
||||
erase_iter_ = iter_;
|
||||
}
|
||||
|
||||
++iter_;
|
||||
}
|
||||
|
||||
if (erase_iter_ != end_)
|
||||
{
|
||||
if (insert_)
|
||||
{
|
||||
// Re-use obsolete location
|
||||
*erase_iter_ = rhs_;
|
||||
++erase_iter_;
|
||||
}
|
||||
|
||||
iter_ = _ranges.erase(erase_iter_, iter_);
|
||||
}
|
||||
else if (insert_)
|
||||
{
|
||||
iter_ = _ranges.insert(iter_, rhs_);
|
||||
}
|
||||
|
||||
return iter_;
|
||||
}
|
||||
|
||||
void negate()
|
||||
{
|
||||
index_type next_ = 0;
|
||||
const index_type max_ = char_traits::max_val();
|
||||
string_token temp_;
|
||||
auto iter_ = _ranges.cbegin();
|
||||
auto end_ = _ranges.cend();
|
||||
bool finished_ = false;
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
if (next_ < iter_->first)
|
||||
{
|
||||
temp_.insert(range(next_,
|
||||
static_cast<index_type>(iter_->first - 1)));
|
||||
}
|
||||
|
||||
if (iter_->second < max_)
|
||||
{
|
||||
next_ = iter_->second + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
finished_ = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!finished_)
|
||||
{
|
||||
temp_.insert(range(next_, max_));
|
||||
}
|
||||
|
||||
swap(temp_);
|
||||
}
|
||||
|
||||
// Moves everything shared by this token and rhs_ into overlap_.
// Each common sub-range is removed from both tokens (via adjust()) and
// inserted into overlap_.
void intersect(basic_string_token &rhs_, basic_string_token &overlap_)
{
    auto mine_ = _ranges.begin();
    auto mine_end_ = _ranges.end();
    auto theirs_ = rhs_._ranges.begin();
    auto theirs_end_ = rhs_._ranges.end();

    while (mine_ != mine_end_ && theirs_ != theirs_end_)
    {
        if (theirs_->first > mine_->second)
        {
            // Our range ends before theirs starts.
            ++mine_;
            continue;
        }

        if (theirs_->second < mine_->first)
        {
            // Their range ends before ours starts.
            ++theirs_;
            continue;
        }

        // The two ranges overlap: work out the shared part.
        range common_;

        common_.first = theirs_->first > mine_->first ?
            theirs_->first : mine_->first;
        common_.second = theirs_->second < mine_->second ?
            theirs_->second : mine_->second;

        adjust(common_, *this, mine_, mine_end_);
        adjust(common_, rhs_, theirs_, theirs_end_);
        overlap_.insert(common_);
    }
}
|
||||
|
||||
// Removes from this token every value that also appears in rhs_.
// rhs_ itself is only read here.
void remove(basic_string_token &rhs_)
{
    auto mine_ = _ranges.begin();
    auto mine_end_ = _ranges.end();
    auto theirs_ = rhs_._ranges.begin();
    auto theirs_end_ = rhs_._ranges.end();

    while (mine_ != mine_end_ && theirs_ != theirs_end_)
    {
        if (theirs_->first > mine_->second)
        {
            // Our range ends before theirs starts.
            ++mine_;
            continue;
        }

        if (theirs_->second < mine_->first)
        {
            // Their range ends before ours starts.
            ++theirs_;
            continue;
        }

        // The two ranges overlap: cut the shared part out of ours.
        range common_;

        common_.first = theirs_->first > mine_->first ?
            theirs_->first : mine_->first;
        common_.second = theirs_->second < mine_->second ?
            theirs_->second : mine_->second;

        adjust(common_, *this, mine_, mine_end_);
    }
}
|
||||
|
||||
// Returns a printable representation of ch_:
//  - well-known control characters, backslash and both quote characters
//    become a backslash followed by their usual escape letter;
//  - the escape character (27) becomes \x1b;
//  - any other value below 32 or above 126 becomes \x followed by its
//    value in hexadecimal;
//  - everything else is returned unchanged.
static string escape_char(const typename char_traits::index_type ch_)
{
    string out_;
    // Character that follows the backslash for the simple escapes.
    char letter_ = 0;

    switch (ch_)
    {
    case '\0':
        letter_ = '0';
        break;
    case '\a':
        letter_ = 'a';
        break;
    case '\b':
        letter_ = 'b';
        break;
    case '\f':
        letter_ = 'f';
        break;
    case '\n':
        letter_ = 'n';
        break;
    case '\r':
        letter_ = 'r';
        break;
    case '\t':
        letter_ = 't';
        break;
    case '\v':
        letter_ = 'v';
        break;
    case '\\':
        letter_ = '\\';
        break;
    case '"':
        letter_ = '"';
        break;
    case '\'':
        letter_ = '\'';
        break;
    case 27:
        out_ += '\\';
        out_ += 'x';
        out_ += '1';
        out_ += 'b';
        return out_;
    default:
        if (ch_ < 32 || ch_ > 126)
        {
            std::basic_stringstream<char_type> ss_;

            out_ += '\\';
            out_ += 'x';
            ss_ << std::hex <<
                static_cast<std::size_t>(ch_);
            out_ += ss_.str();
        }
        else
        {
            out_ += ch_;
        }

        return out_;
    }

    out_ += '\\';
    out_ += letter_;
    return out_;
}
|
||||
|
||||
private:
|
||||
void adjust(const range &range_, basic_string_token &token_,
|
||||
typename range_vector::iterator &iter_,
|
||||
typename range_vector::iterator &end_)
|
||||
{
|
||||
if (range_.first > iter_->first)
|
||||
{
|
||||
const index_type second_ = iter_->second;
|
||||
|
||||
iter_->second = range_.first - 1;
|
||||
|
||||
if (range_.second < second_)
|
||||
{
|
||||
range new_range_(static_cast<index_type>(range_.second + 1),
|
||||
second_);
|
||||
|
||||
iter_ = token_.insert(new_range_);
|
||||
end_ = token_._ranges.end();
|
||||
}
|
||||
}
|
||||
else if (range_.second < iter_->second)
|
||||
{
|
||||
iter_->first = range_.second + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = token_._ranges.erase(iter_);
|
||||
end_ = token_._ranges.end();
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
508
YACReaderLibrary/lexertl/utf_iterators.hpp
Normal file
508
YACReaderLibrary/lexertl/utf_iterators.hpp
Normal file
@ -0,0 +1,508 @@
|
||||
// utf_iterators.hpp
|
||||
// Copyright (c) 2015-2018 Ben Hanson (http://www.benhanson.net/)
|
||||
// Inspired by http://utfcpp.sourceforge.net/
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_UTF_ITERATORS_HPP
|
||||
#define LEXERTL_UTF_ITERATORS_HPP
|
||||
|
||||
#include <iterator>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Forward iterator adaptor that decodes UTF-8.
//
// Wraps a pair of iterators over UTF-8 code units and yields one decoded
// value of type char_type per step. The end of the input is remembered so
// that decoding never dereferences the end position and never reads
// continuation bytes beyond it (a truncated sequence simply stops at the
// end of the input).
//
// Equality compares the underlying position only.
template<typename char_iterator, typename char_type>
class basic_utf8_in_iterator
{
public:
    using value_type = char_type;
    using difference_type =
        typename std::iterator_traits<char_iterator>::difference_type;
    using iterator_category = std::forward_iterator_tag;
    // Same pointer/reference types the former std::iterator base supplied
    // (std::iterator is deprecated from C++17 onwards).
    using pointer = char_type *;
    using reference = char_type &;

    basic_utf8_in_iterator() :
        _it(char_iterator()),
        _end(char_iterator()),
        _eoi(char_iterator()),
        _char(0)
    {
    }

    // it_ is the position to start decoding at, end_ the end of the input.
    explicit basic_utf8_in_iterator(const char_iterator &it_,
        const char_iterator &end_) :
        _it(it_),
        _end(it_),
        _eoi(end_),
        _char(0)
    {
        if (it_ != end_)
        {
            next();
        }
    }

    // Returns the value decoded at the current position.
    char_type operator *() const
    {
        return _char;
    }

    bool operator ==(const basic_utf8_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator !=(const basic_utf8_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf8_in_iterator &operator ++()
    {
        _it = _end;
        next();
        return *this;
    }

    basic_utf8_in_iterator operator ++(int)
    {
        basic_utf8_in_iterator temp_ = *this;

        _it = _end;
        next();
        return temp_;
    }

    // Returns a copy advanced by count_ decoded values.
    basic_utf8_in_iterator operator +(const std::size_t count_) const
    {
        basic_utf8_in_iterator temp_ = *this;

        for (std::size_t i_ = 0; i_ < count_; ++i_)
        {
            ++temp_;
        }

        return temp_;
    }

    // Returns a copy moved back by count_ decoded values. The caller must
    // ensure that many values precede the current position.
    basic_utf8_in_iterator operator -(const std::size_t count_) const
    {
        basic_utf8_in_iterator temp_ = *this;

        for (std::size_t i_ = 0; i_ < count_; ++i_)
        {
            temp_._end = temp_._it;
            --temp_._it;

            // Skip continuation bytes to reach the lead byte.
            while ((*temp_._it & 0xc0) == 0x80) --temp_._it;
        }

        temp_.next();
        return temp_;
    }

private:
    // Start of the current sequence.
    char_iterator _it;
    // One past the current sequence (start of the next one).
    char_iterator _end;
    // End of the whole input.
    char_iterator _eoi;
    // Value decoded from the current sequence.
    char_type _char;

    // True when _end refers to a continuation byte inside the input.
    bool continuation() const
    {
        return _end != _eoi && (*_end & 0xc0) == 0x80;
    }

    // Decodes the sequence starting at _it into _char and sets _end.
    // A sequence cut short by a non-continuation byte or by the end of
    // the input yields the partially assembled value.
    void next()
    {
        _end = _it;

        if (_it == _eoi)
        {
            // Nothing left to decode; never dereference the end position.
            _char = 0;
            return;
        }

        const char len_ = len(_it);
        char_type ch_ = *_it & 0xff;

        ++_end;

        switch (len_)
        {
        case 2:
            if (!continuation()) break;

            ch_ = (ch_ << 6 & 0x7ff) | (*_end & 0x3f);
            ++_end;
            break;
        case 3:
            if (!continuation()) break;

            ch_ = (ch_ << 12 & 0xffff) | ((*_end & 0xff) << 6 & 0xfff);
            ++_end;

            if (!continuation()) break;

            ch_ |= *_end & 0x3f;
            ++_end;
            break;
        case 4:
            if (!continuation()) break;

            ch_ = (ch_ << 18 & 0x1fffff) | ((*_end & 0xff) << 12 & 0x3ffff);
            ++_end;

            if (!continuation()) break;

            ch_ |= (*_end & 0xff) << 6 & 0xfff;
            ++_end;

            if (!continuation()) break;

            ch_ |= *_end & 0x3f;
            ++_end;
            break;
        default:
            // Single byte (or an invalid lead byte treated as one).
            break;
        }

        _char = ch_;
    }

    // Sequence length implied by the lead byte at it_; invalid lead bytes
    // are reported as length 1.
    char len(const char_iterator &it_) const
    {
        const unsigned char ch_ = *it_;

        return ch_ < 0x80 ? 1 :
            ch_ >> 5 == 0x06 ? 2 :
            ch_ >> 4 == 0x0e ? 3 :
            ch_ >> 3 == 0x1e ? 4 :
            1;
    }
};
|
||||
|
||||
// Forward iterator adaptor that encodes to UTF-8.
//
// Wraps a pair of iterators over code point values and yields the UTF-8
// bytes of each value in turn. The end of the input is remembered so that
// stepping onto the end position never dereferences it.
//
// Equality compares the underlying position only, not the byte index
// within the current code point.
template<typename char_iterator>
class basic_utf8_out_iterator
{
public:
    using value_type = char;
    using difference_type =
        typename std::iterator_traits<char_iterator>::difference_type;
    using iterator_category = std::forward_iterator_tag;
    // Same pointer/reference types the former std::iterator base supplied
    // (std::iterator is deprecated from C++17 onwards).
    using pointer = char *;
    using reference = char &;

    basic_utf8_out_iterator() :
        _it(char_iterator()),
        _eoi(char_iterator()),
        _bytes(),
        _count(0),
        _index(0)
    {
    }

    // it_ is the position to start encoding at, end_ the end of the input.
    explicit basic_utf8_out_iterator(const char_iterator &it_,
        const char_iterator &end_) :
        _it(it_),
        _eoi(end_),
        _bytes(),
        _count(0),
        _index(0)
    {
        if (it_ != end_)
        {
            next();
        }
    }

    // Returns the current byte of the current code point's encoding.
    char operator *() const
    {
        return _bytes[_index];
    }

    bool operator ==(const basic_utf8_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator !=(const basic_utf8_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf8_out_iterator &operator ++()
    {
        advance();
        return *this;
    }

    basic_utf8_out_iterator operator ++(int)
    {
        basic_utf8_out_iterator temp_ = *this;

        advance();
        return temp_;
    }

private:
    // Current input position.
    char_iterator _it;
    // End of the whole input.
    char_iterator _eoi;
    // Encoding of the current code point; only the first _count are used.
    char _bytes[4];
    unsigned char _count;
    unsigned char _index;

    // Moves to the next byte, fetching the next code point when the
    // current encoding has been exhausted.
    void advance()
    {
        ++_index;

        if (_index >= _count)
        {
            ++_it;
            next();
        }
    }

    // Encodes the code point at _it into _bytes.
    void next()
    {
        _index = 0;

        if (_it == _eoi)
        {
            // Nothing left to encode; never dereference the end position.
            _count = 0;
            return;
        }

        const std::size_t ch_ = *_it;

        _count = static_cast<unsigned char>(len(ch_));

        switch (_count)
        {
        case 1:
            _bytes[0] = static_cast<char>(ch_);
            break;
        case 2:
            _bytes[0] = static_cast<char>((ch_ >> 6) | 0xc0);
            _bytes[1] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        case 3:
            _bytes[0] = static_cast<char>((ch_ >> 12) | 0xe0);
            _bytes[1] = static_cast<char>(((ch_ >> 6) & 0x3f) | 0x80);
            _bytes[2] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        case 4:
            _bytes[0] = static_cast<char>((ch_ >> 18) | 0xf0);
            _bytes[1] = static_cast<char>(((ch_ >> 12) & 0x3f) | 0x80);
            _bytes[2] = static_cast<char>(((ch_ >> 6) & 0x3f) | 0x80);
            _bytes[3] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        }
    }

    // Number of UTF-8 bytes needed for ch_.
    char len(const std::size_t ch_) const
    {
        return ch_ < 0x80 ? 1 :
            ch_ < 0x800 ? 2 :
            ch_ < 0x10000 ? 3 :
            4;
    }
};
|
||||
|
||||
// Forward iterator adaptor that decodes UTF-16.
//
// Wraps a pair of iterators over UTF-16 code units and yields one decoded
// value of type char_type per step. A high surrogate is combined with the
// following unit only when that unit exists and is a low surrogate;
// otherwise the lone surrogate value is returned on its own. The end of
// the input is remembered so that decoding never dereferences it.
//
// Equality compares the underlying position only.
template<typename char_iterator, typename char_type>
class basic_utf16_in_iterator
{
public:
    using value_type = char_type;
    using difference_type =
        typename std::iterator_traits<char_iterator>::difference_type;
    using iterator_category = std::forward_iterator_tag;
    // Same pointer/reference types the former std::iterator base supplied
    // (std::iterator is deprecated from C++17 onwards).
    using pointer = char_type *;
    using reference = char_type &;

    basic_utf16_in_iterator() :
        _it(char_iterator()),
        _end(char_iterator()),
        _eoi(char_iterator()),
        _char(0)
    {
    }

    // it_ is the position to start decoding at, end_ the end of the input.
    explicit basic_utf16_in_iterator(const char_iterator &it_,
        const char_iterator &end_) :
        _it(it_),
        _end(it_),
        _eoi(end_),
        _char(0)
    {
        if (it_ != end_)
        {
            next();
        }
    }

    // Returns the value decoded at the current position.
    char_type operator *() const
    {
        return _char;
    }

    bool operator ==(const basic_utf16_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator !=(const basic_utf16_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf16_in_iterator &operator ++()
    {
        _it = _end;
        next();
        return *this;
    }

    basic_utf16_in_iterator operator ++(int)
    {
        basic_utf16_in_iterator temp_ = *this;

        _it = _end;
        next();
        return temp_;
    }

    // Returns a copy advanced by count_ decoded values.
    basic_utf16_in_iterator operator +(const std::size_t count_) const
    {
        basic_utf16_in_iterator temp_ = *this;

        for (std::size_t i_ = 0; i_ < count_; ++i_)
        {
            ++temp_;
        }

        return temp_;
    }

    // Returns a copy moved back by count_ decoded values. The caller must
    // ensure that many values precede the current position.
    basic_utf16_in_iterator operator -(const std::size_t count_) const
    {
        basic_utf16_in_iterator temp_ = *this;

        for (std::size_t i_ = 0; i_ < count_; ++i_)
        {
            temp_._end = temp_._it;
            --temp_._it;

            // A low surrogate is the second half of a pair: step over it.
            if (*temp_._it >= 0xdc00 && *temp_._it <= 0xdfff) --temp_._it;
        }

        temp_.next();
        return temp_;
    }

private:
    // Start of the current value.
    char_iterator _it;
    // One past the current value (start of the next one).
    char_iterator _end;
    // End of the whole input.
    char_iterator _eoi;
    // Value decoded at the current position.
    char_type _char;

    // Decodes the value starting at _it into _char and sets _end.
    void next()
    {
        _end = _it;

        if (_it == _eoi)
        {
            // Nothing left to decode; never dereference the end position.
            _char = 0;
            return;
        }

        char_type ch_ = *_it & 0xffff;

        ++_end;

        if (ch_ >= 0xd800 && ch_ <= 0xdbff && _end != _eoi)
        {
            const char_type surrogate_ = *_end & 0xffff;

            // Only a genuine low surrogate completes the pair.
            if (surrogate_ >= 0xdc00 && surrogate_ <= 0xdfff)
            {
                ch_ = (((ch_ - 0xd800) << 10) | (surrogate_ - 0xdc00)) +
                    0x10000;
                ++_end;
            }
        }

        _char = ch_;
    }
};
|
||||
|
||||
// Forward iterator adaptor that encodes to UTF-16.
//
// Wraps a pair of iterators over code point values and yields the UTF-16
// code units of each value in turn: one unit for values up to 0xffff and
// a high/low surrogate pair for larger values. The end of the input is
// remembered so that stepping onto the end position never dereferences it.
//
// Equality compares the underlying position only, not the unit index
// within the current code point.
template<typename char_iterator>
class basic_utf16_out_iterator
{
public:
    using value_type = wchar_t;
    using difference_type =
        typename std::iterator_traits<char_iterator>::difference_type;
    using iterator_category = std::forward_iterator_tag;
    // Same pointer/reference types the former std::iterator base supplied
    // (std::iterator is deprecated from C++17 onwards).
    using pointer = wchar_t *;
    using reference = wchar_t &;

    basic_utf16_out_iterator() :
        _it(char_iterator()),
        _eoi(char_iterator()),
        _chars(),
        _count(0),
        _index(0)
    {
    }

    // it_ is the position to start encoding at, end_ the end of the input.
    explicit basic_utf16_out_iterator(const char_iterator &it_,
        const char_iterator &end_) :
        _it(it_),
        _eoi(end_),
        _chars(),
        _count(0),
        _index(0)
    {
        if (it_ != end_)
        {
            next();
        }
    }

    // Returns the current code unit of the current code point's encoding.
    wchar_t operator *() const
    {
        return _chars[_index];
    }

    bool operator ==(const basic_utf16_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator !=(const basic_utf16_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf16_out_iterator &operator ++()
    {
        advance();
        return *this;
    }

    basic_utf16_out_iterator operator ++(int)
    {
        basic_utf16_out_iterator temp_ = *this;

        advance();
        return temp_;
    }

private:
    // Current input position.
    char_iterator _it;
    // End of the whole input.
    char_iterator _eoi;
    // Encoding of the current code point; only the first _count are used.
    wchar_t _chars[2];
    unsigned char _count;
    unsigned char _index;

    // Moves to the next code unit, fetching the next code point when the
    // current encoding has been exhausted.
    void advance()
    {
        ++_index;

        if (_index >= _count)
        {
            ++_it;
            next();
        }
    }

    // Encodes the code point at _it into _chars.
    void next()
    {
        _index = 0;

        if (_it == _eoi)
        {
            // Nothing left to encode; never dereference the end position.
            _count = 0;
            return;
        }

        const std::size_t ch_ = *_it;

        _count = static_cast<unsigned char>(len(ch_));

        switch (_count)
        {
        case 1:
            _chars[0] = static_cast<wchar_t>(ch_);
            break;
        case 2:
            // High surrogate: 0xd800 + ((ch_ - 0x10000) >> 10).
            _chars[0] = static_cast<wchar_t>((ch_ >> 10) + 0xd800u -
                (0x10000 >> 10));
            // Low surrogate: 0xdc00 + the low ten bits.
            _chars[1] = static_cast<wchar_t>((ch_ & 0x3ff) + 0xdc00u);
            break;
        }
    }

    // Number of UTF-16 code units needed for ch_.
    char len(const std::size_t ch_) const
    {
        return ch_ > 0xffff ? 2 : 1;
    }
};
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user