*Depending on algorithm used internally.
*There are many slightly different dialects (Perl, ECMA, Posix).
std::regex re{"abc|[0-9]+"} // compile the RE only once;
bool match(std::string_view sv) {
return std::regex_match(sv.begin(), sv.end(), re);
}
const unsigned char * p = "^[0-9]{4,16}?[aA]";
int err = 0;
size_t errOff = 0;
pcre2_code * re = pcre2_compile(p, PCRE2_ZERO_TERMINATED, 0, &err, &errOff, NULL);
int match(const char * str, size_t sz) {
return pcre2_match(re, str, sz, 0, 0, NULL, NULL);
}
// TODO: don't forget to call pcre2_code_free at the end
You can write a finite state machine by yourself.
But that's hard to write properly.
You can easily introduce an error.
You can use an external utility to generate the parser.
bool match(std::string_view sv) noexcept {
return std::regex_match<"abc|[0-9]+">(sv);
}
This is possible with C++20's language support
for Non Type Template Parameters.
bool match(std::string_view sv) noexcept {
return ctre::match<"abc|[0-9]+">(sv);
}
Disabled for compilers without class NTTP support.
bool match(std::string_view sv) noexcept {
using namespace ctre::literals;
return "abc|[0-9]+"_ctre.match(sv);
}
* with N3599 language extension (in GCC & Clang)
static_assert("[0-9]+\\.[0-9]+"_ctre.match("123.456"sv));
static_assert("[0-9]+\\.([0-9]+)"_ctre.match("123.456"sv).get<1>() == "456"sv);
struct name { std::string_view first, family; };
std::optional<name> extract(std::string_view sv) noexcept {
using namespace ctre::literals;
if (auto r = "([A-Za-z]++),([A-Za-z]++)"_ctre.match(sv)) {
return name{ r.get<1>(), r.get<2>() };
} else {
return std::nullopt;
}
}
struct date { unsigned year, month, day; };
std::optional<date> extract(std::string_view sv) noexcept {
using namespace ctre::literals;
if (auto [re, y, m, d] = "([0-9]+)/([0-9]{1,2})/([0-9]{1,2})"_ctre.match(sv); re) {
return date{ conv(y), conv(m), conv(d) };
} else {
return std::nullopt;
}
}
And a complete std::regex implementation is included :)
return ")abc+"_ctre.match(sv);
In file included from example.cpp:1:
In file included from include/ctre.hpp:4:
ctre/literals.hpp:63:2: error: static_assert failed due to requirement 'correct()'
"Regular Expression contains syntax error."
static_assert(correct(), "Regular Expression syntax error.");
^ ~~~~~~~
example.cpp:5:18: note: in instantiation of function template specialization
'ctre::literals::operator""_ctre<char, ')', 'a', 'b', 'c'>' requested here
return ")abc"_ctre.match(sv);
( | ) | * | + | ? | \ | a | d | | | other | ε | |
---|---|---|---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | \ esc mod seq alt | a char mod seq alt | d char mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | \ esc mod seq alt | a char mod seq alt | d char mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | ||||||||
esc | a alpha | d digit | |||||||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε | ε | ε | ε |
seq0 | ( alt0 ) mod seq | \ esc mod seq | a char mod seq | d char mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod seq seq | ε | \ esc mod seq seq | a char mod seq seq | d char mod seq seq | ε | other char mod seq seq | ε | |||
( | pop | ||||||||||
) | pop | ||||||||||
* | pop | ||||||||||
+ | pop | ||||||||||
? | pop | ||||||||||
\ | pop | ||||||||||
a | pop | ||||||||||
d | pop | ||||||||||
| | pop | ||||||||||
other | pop | ||||||||||
Z0 | accept |
There are many ways to encode it,
but not many for the compile-time parsing.
struct my_grammar {
struct S {};
struct alt0 {};
struct alt {};
struct esc {};
struct mod {};
struct seq0 {};
struct seq {};
using start_symbol = S;
// ...
}
Just a bunch of empty types.
struct my_grammar { //... | |
f (symbol, character) → (...) | auto f (symbol, term<'character'>) -> list{...}; |
f (symbol, symbol) → pop_input | template <auto S> auto f (term<S>, term<S>) -> pop_input; |
f (symbol, character) → reject | auto f (...) -> reject; |
f (Z0, ε) → accept | auto f (empty_stack, epsilon) -> accept; |
}; |
The decision of which step of the LL(1) table to use is based
on the function overload resolution of the C++ language.
Return type is the value. There is no need for a function body.
template <typename Grammar, ...> struct parser {
//...
auto next_move = Grammar::f(top(stack), get_character<Pos>());
//...
}
It allows me to pass the grammar as an argument to the parser.
Just use (simplest) type list and a set of a few free functions:
template <typename... Ts> struct list { };
template <typename... Ts, typename... As>
constexpr auto push(list<Ts...>, As...) -> list<As..., Ts...>;
template <typename T, typename... As>
constexpr auto pop(list<T, Ts...>) -> list<Ts...>;
template <typename T, typename... Ts>
constexpr auto top(list<T, Ts...>) -> T;
struct empty_stack { };
constexpr auto top(list<>) -> empty_stack;
Compile-time programming starts to be easy when you think functionally.
template <typename CharT, size_t N> struct fixed_string {
std::array<CharT, N> data;
// constexpr constructor from const char[N]
constexpr auto operator[](size_t i) const noexcept { return data[i]; }
constexpr size_t size() const noexcept { return N; }
};
template <typename CharT, size_t N>
fixed_string(const CharT[N]) -> fixed_string<CharT, N>;
// more info about class NTTP in p0732 by Jeff Snyder and Louis Dionne
std::fixed_string will (probably) be part of C++20.
template <..., fixed_string Str> struct parser {
// ...
template <size_t Pos> constexpr auto get_character() const noexcept() {
if constexpr (Pos < Str.size()) return term<Str[Pos]>{};
else return epsilon{};
}
// ...
};
Formally, the symbol on input is either terminal or epsilon.
constexpr bool ok = parser<pcre_grammar, "^abc">::correct;
Think functionally, your tools are:
function overloading resolution and (tail) recursion…
template <typename Grammar, fixed_string Str> struct parser { static constexpr bool correct = parse(list<Grammar::start_symbol>());
// prepare each step and move to next
template <size_t Pos = 0, typename S> static constexpr bool parse(S stack) {
// reminder: all these types are empty structs
// and are all known during compile time
using next_stack = decltype( pop(stack) );
using symbol = decltype( top(stack) ); using current = decltype( get_character<Pos>() ); using next_move = decltype( Grammar::f(symbol{}, current{}) ); return move<Pos>(next_move{}, next_stack{}); }
//...
// pushing something to stack (epsilon case included)
template <size_t Pos, typename... Push, typename Stack> static constexpr bool move(list<Push...>, Stack stack) { using next_stack = decltype( push(stack, Push{}...) )
return parse<Pos>(next_stack{});}
// move to next character
template <size_t Pos, typename Stack> static constexpr bool move(pop_input, Stack stack) { return parse<Pos+1>(stack);}
//...
template <size_t Pos, typename Stack> static constexpr bool move(reject, Stack) { return false;
}
template <size_t Pos, typename Stack> static constexpr bool move(accept, Stack) { return true;
}
}; // end of Parser struct
Use the same parsing algorithm with an added argument
and new type of special symbols (called semantic actions)
in a grammar to modify the argument.
If you want to do calculation
the best type of argument is a type list (again).
( | ) | * | + | ? | \ | a | d | | | other | ε | |
---|---|---|---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | \ esc mod seq alt | a char mod seq alt | d char mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | \ esc mod seq alt | a char mod seq alt | d char mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | ||||||||
esc | a alpha | d digit | |||||||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε | ε | ε | ε |
seq0 | ( alt0 ) mod seq | \ esc mod seq | a char mod seq | d char mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod seq seq | ε | \ esc mod seq seq | a char mod seq seq | d char mod seq seq | ε | other char mod seq seq | ε | |||
( | pop | ||||||||||
) | pop | ||||||||||
* | pop | ||||||||||
+ | pop | ||||||||||
? | pop | ||||||||||
\ | pop | ||||||||||
a | pop | ||||||||||
d | pop | ||||||||||
| | pop | ||||||||||
other | pop | ||||||||||
Z0 | accept |
struct my_grammar {
// ...
struct _char: action {};
struct alpha: action {};
struct digit: action {};
struct seq: action {};
struct star: action {};
struct plus: action {};
struct opt: action {};
// ...
}
template <typename Grammar, fixed_string Str> struct parser {
template <size_t Pos = 0, typename S, typename T = list<>>
static constexpr auto parse(S stack, T subject = {}) {
using next_stack = decltype( pop(stack) );
using symbol = decltype( top(stack) ); if constexpr (SemanticAction<symbol>) {
using previous = decltype( prev_character<Pos>() ); using next_subject = decltype( modify(symbol{}, previous{}, subject) ); return parse<Pos>(next_stack(), next_subject{});
} else { using current = decltype( get_character<Pos>() );
using next_move = decltype( Grammar::f(symbol{}, current{}) );
return move<Pos>(next_move{}, next_stack{}, subject);
}
}
//...
// pushing something to stack (epsilon case included)
template <size_t Pos, typename... Push, typename Stack, typename T>
static constexpr auto move(list<Push...>, Stack stack, T subject) {
using next_stack = decltype( push(stack, Push{}...) )
return parse<Pos>(next_stack{}, subject);
}
// move to next character
template <size_t Pos, typename Stack, typename T>
static constexpr auto move(pop_input, Stack stack, T subject) {
return parse<Pos+1>(stack, subject);
}
//...
template <size_t Pos, typename Stack, typename T>
static constexpr auto move(reject, Stack, T subject) { return pair{false, subject};}
template <size_t Pos, typename Stack, typename T>
static constexpr auto move(accept, Stack, T subject) { return pair{true, subject};}
}; // end of Parser struct
template <auto C, typename... Ts>
auto modify(my_grammar::_char, term<C>, list<Ts...>) -> list<ch<C>, Ts...>;
(Notice the term and ch types.)
template <auto C, typename Opt, typename... Ts>
auto modify(my_grammar::opt, term<C>, list<Opt, Ts...>)
-> list<opt<Opt>, Ts...>;
template <auto C, typename Plus, typename... Ts>
auto modify(my_grammar::plus, term<C>, list<Plus, Ts...>)
-> list<plus<Plus>, Ts...>;
template <auto C, typename Star, typename... Ts>
auto modify(my_grammar::star, term<C>, list<Star, Ts...>)
-> list<star<Star>, Ts...>;
template <auto C, typename A, typename B, typename... Ts>
auto modify(my_grammar::seq, term<C>, list<B, A, Ts...>) -> list<seq<A, B>, Ts...>;
(Notice the switched order of A & B on the stack.)
template <auto C, typename... As, typename B, typename... Ts>
auto modify(my_grammar::seq, term<C>, list<B, seq<As...>, Ts...>)
-> list<seq<As..., B>, Ts...>;
template <auto C, typename A, typename B, typename... Ts>
auto modify(my_grammar::alt, term<C>, list<B, A, Ts...>) -> list<alt<A, B>, Ts...>;
(Notice the switched order of A & B on the stack.)
template <auto C, typename... As, typename B, typename... Ts>
auto modify(my_grammar::alt, term<C>, list<B, alt<As...>, Ts...>)
-> list<alt<As..., B>, Ts...>;
template <auto C, typename... Ts>
auto modify(my_grammar::alpha, term<C>, list<Ts...>) -> list<alpha, Ts...>;
template <auto C, typename... Ts>
auto modify(my_grammar::digit, term<C>, list<Ts...>) -> list<digit, Ts...>;
static_assert(std::is_same_v<
,
parser<my_grammar, "">::output_type
>);
template <fixed_string re> bool match(std::string_view sv) { static_assert(parser<my_grammar, re>::correct); using RE = parser<my_grammar, re>::output_type;
return match(sv.begin(), sv.begin(), sv.end(), list<RE>{}).success;}
match<"[a-z]\\d+">("a42"sv);
template <ForwardIterator It> struct pair {
ForwardIterator it;
bool success;
// constexpr constructor etc...
};
// and at the end... we need to check for the end :)
template <ForwardIterator It>
bool match(It begin, It it, It end, list<>{}) {
// if we are at the end input we should accept
return {it, it == end};
}
(Helper for finishing the regex matching.)
template <ForwardIterator It, auto C, typename... Ts>
pair<It> match(It begin, It it, It end, list<ch<C>, Ts...>) {
if (it == end) return {it, false}; if (*it != C) return {it, false};
return match(begin, std::next(it), end, list<Ts...>{});
}
template <ForwardIterator It, typename... Ts>
pair<It> match(It begin, It it, It end, list<digit, Ts...>) {
if (it == end) return {it, false}; if (!(*it >= '0' && *it <= '9')) return {it, false}; return match(begin, std::next(it), end, list<Ts...>{});}
template <ForwardIterator It, typename... Ts>
pair<It> match(It begin, It it, It end, list<alpha, Ts...>) {
if (it == end) return {it, false}; if (!(*it >= 'a' && *it <= 'z')) return {it, false}; return match(begin, std::next(it), end, list<Ts...>{});}
template <ForwardIterator It, typename... Seq, typename... Ts>
pair<It> match(It begin, It it, It end, list<seq<Seq...>, Ts...>) {
return match(begin, it, end, list<Seq..., Ts...>{});}
template <ForwardIterator It, typename... Opt, typename... Ts>
pair<It> match(It begin, It it, It end, list<opt<Opt...>, Ts...>) {
if (auto out = match(begin, it, end, list<Opt..., Ts...>{})) {
return out;
} else {
// try it without the content of opt<...>
return match(begin, it, end, list<Ts...>{}));
}
}
template <ForwardIterator It, typename Head, typename... Tail, typename... Ts>
pair<It> match(It begin, It it, It end, list<alt<Head, Tail...>, Ts...>) {
if (auto out = match(begin, it, end, list<Head, Ts...>{})) {
return out;
} else {
// try the next one
return match(begin, it, end, list<alt<Tail...>, Ts...>{}));
}
}
template <ForwardIterator It, typename... Ts>
pair<It> match(It begin, It it, It end, list<alt<>, Ts...>) {
// no option from alternation was successful
return {it, false};
}
template <ForwardIterator It, typename First, typename... Alt, typename... Ts>
pair<It> match(It begin, It it, It end, list<plus<Plus...>, Ts...>) {
for (;;) {
if (auto inner = match(begin, it, end, list<Plus..., end_of_cycle>{})) {
if (auto out = match(begin, it, end, list<Ts...>{})) {
return out;
} else {
it = inner.it;
}
} else return {it, false};
}
}
struct end_of_cycle {};
template <ForwardIterator It>
pair<It> match(It, It it, It, list<end_of_cycle>) {
return {it, true};
}
(The cycle is lazy.)
template <ForwardIterator It, typename First, typename... Alt, typename... Ts>
pair<It> match(It begin, It it, It end, list<star<Star...>, Ts...>) {
for (;;) { if (auto out = match(begin, it, end, list<Ts...>{})) {
return out; } else if (auto inner = match(begin, it, end, list<Star..., end_of_cycle>{})) {
if (auto out = match(begin, it, end, list<Ts...>{})) {
return out;
} else {
it = inner.it;
}
} else return {it, false};
}
}
(The cycle is lazy.)
int main (int argc, char ** argv) {
using namespace ctre::literals; constexpr auto re = "PATTERN"_ctre; std::ifstream stream{argv[1], std::ifstream::in};
std::string line;
while (std::getline(stream, line)) { if (re.search(line)) { std::cout << line << '\n';
}
}
}
int main (int argc, char ** argv) { std::regex re("PATTERN"); std::ifstream stream{argv[1], std::ifstream::in};
std::string line;
while (std::getline(stream, line)) { if (std::regex_search(line, re)) { std::cout << line << '\n';
}
}
}
int main (int argc, char ** argv) { auto * pattern = "PATTERN";
pcre2_code * re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0,
&errornumber, &erroroffset, nullptr);
std::ifstream stream{argv[1], std::ifstream::in};
std::string line;
while (std::getline(stream, line)) { if (pcre2_match(re, line.c_str(), line.length(), 0, 0, NULL, NULL) >= 0) { std::cout << line << '\n';
}
}
}
You can find the library at: cpp.fail/ctre
You can find the library at: cpp.fail/ctre