std::regex
std::regex
struct date {
std::string year;
std::string month;
std::string day;
};
std::optional<date> extract_date(std::string_view input) noexcept {
static std::regex pattern("([0-9]{4})/([0-9]{2})/([0-9]{2})");
std::smatch_results<std::string_view::iterator> captures;
if (!std::regex_match(input.begin(), input.end(), captures, pattern)) {
return std::nullopt;
}
return date{
captures[1].str(),
captures[2].str(),
captures[3].str()
};
}
struct date {
std::string year;
std::string month;
std::string day;
};
std::optional<date> extract_date(std::string_view input) noexcept {
auto [match, y, m, d] = ctre::match<"([0-9]{4})/([0-9]{2})/([0-9]{2})">(input);
if (!match) {
return std::nullopt;
}
return date{y.str(), m.str(), d.str()};
}
bool fnc(std::string_view view) {
return ctre::match<"(hello">(view);
}
In file included from test.cpp:1: In file included from include/ctre.hpp:5: include/ctre/functions.hpp:48:2: error: static_assert failed due to requirement 'correct' "Regular Expression contains syntax error." static_assert(correct, "Regular Expression contains syntax error."); ^ ~~~~~~~ test.cpp:2:15: note: in instantiation of function template specialization 'ctre::match<"(hello">' requested here return ctre::match<"(hello">(view); ^
std::optional<std::string_view> extract_sha256(std::string_view input) noexcept {
if (auto r = ctre::match<"[a-fA-F0-9]{64}">(input)) {
return r;
} else {
return std::nullopt;
}
}
int sum(std::string_view input) {
int output = 0;
for (const auto & capt: ctre::range<"^,?([0-9]+)">(input))
output += to_integer(capt.get<1>());
return output;
}
constexpr bool is_emoji_only(std::u32string_view input) noexcept {
return ctre::match<"\\p{emoji}+">(input);
}
static_assert( is_emoji_only(U"🤪🤡🥳"));
static_assert(!is_emoji_only(U"💩java"));
bool contains_date(std::string_view input) noexcept {
return ctre::fast_search<"[0-9]{4}/[0-9]{2}/[0-9]{2}">(input);
}
hello|aloha|guten tag|dobrý den|bonjour
[a-z]+[0-9]+
a
or {"a
"}),(ct)?re|[a-f]+
(ε|ct)re|(a|b|c|d|e|f)(a|b|c|d|e|f)*
(ct)?re|[a-f]+
// building blocks
struct epsilon { };
template <Character auto C> struct ch { };
// operations
template <typename...> struct concat { };
template <typename...> struct alt { };
template <typename> struct star { };
// for convenient usage
template <typename E> using opt = alternation<E, epsilon>;
template <typename E> using plus = concat<E, star<E>>;
( | ) | * | + | ? | | | other | ε | |
---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | |||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε |
seq0 | ( alt0 ) mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod seq seq | ε | ε | other char mod seq seq | ε | |||
( | pop | |||||||
) | pop | |||||||
* | pop | |||||||
+ | pop | |||||||
? | pop | |||||||
| | pop | |||||||
other | pop | |||||||
Z0 | accept |
( | ) | * | + | ? | | | other | ε | |
---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | |||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε |
seq0 | ( alt0 ) mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod seq seq | ε | ε | other char mod seq seq | ε | |||
terminal | pop | pop | pop | pop | pop | pop | pop | |
Z0 | accept |
struct S {};
struct alt0 {};
struct alt {};
struct mod {};
struct seq0 {};
struct seq {};
using start_symbol = S;
// f (symbol, char) → (...)
auto f(symbol, term<'c'>) -> list{...};
// f (symbol, symbol) → pop input
template <auto S> auto f(term<S>, term<S>) -> pop_input;
// f (symbol, char) → reject
auto f(...) -> reject;
// f (Z0, ε) → accept
auto f(empty_stack, epsilon) -> accept;
struct pcre {
struct S {};
struct alt0 {};
struct alt {};
// ...
using start_symbol = S;
auto f(...) -> reject;
// ...
}
constexpr bool ok = parser<pcre, "a+b+">::correct;
template <typename Grammar, ...> struct parser {
//...
auto next_move = Grammar::f(top_of_stack, current_term);
//...
}
template <typename CharT, size_t N> struct fixed_string {
CharT data[N+1];
// constexpr constructor from const char[N]
constexpr auto operator[](size_t i) const noexcept { return data[i]; }
constexpr size_t size() const noexcept { return N; }
constexpr auto operator<=>(const fixed_string &) = default;
};
template <typename CharT, size_t N>
fixed_string(const CharT[N]) -> fixed_string<CharT, N>;
// more info about class NTTP in p0732 by Jeff Snyder and Louis Dionne
template <typename... Ts> struct list { };
template <typename... Ts, typename... As>
constexpr auto push(list<Ts...>, As...) -> list<As..., Ts...>;
template <typename T, typename... As>
constexpr auto pop(list<T, Ts...>) -> list<Ts...>;
template <typename T, typename... Ts>
constexpr auto top(list<T, Ts...>) -> T;
struct empty { };
constexpr auto top(list<>) -> empty;
constexpr bool ok = parser<pcre, "a*b*">::correct;
template <typename Grammar, fixed_string Str> struct parser {
static constexpr bool correct = parse(list<Grammar::start_symbol>{});
// return current term
template <size_t Pos> constexpr auto get_character() const {
if constexpr (Pos < Str.size()) return term<Str[Pos]>{};
else return epsilon{};
}
// prepare each step and move to next
template <size_t Pos = 0, typename S> static constexpr bool parse(S stack) {
auto symbol = top(stack);
auto current_term = get_character<Pos>();
auto next_move = decltype(Grammar::f(symbol, current_term)){};
return move<Pos>(next_move, pop(stack));
}
// pushing something to stack (epsilon case included)
template <size_t Pos, typename... Push, typename Stack>
static constexpr bool move(list<Push...>, Stack stack) {
return parse<Pos>(push(stack, Push{}...));
}
// move to next character
template <size_t Pos, typename Stack>
static constexpr bool move(pop_input, Stack stack) {
return parse<Pos+1>(stack);
}
// reject
template <size_t Pos, typename Stack>
static constexpr bool move(reject, Stack) {
return false;
}
// accept
template <size_t Pos, typename Stack>
static constexpr bool move(accept, Stack) {
return true;
}
};
( | ) | * | + | ? | | | other | ε | |
---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | |||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε |
seq0 | ( alt0 ) mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod concat seq | ε | ε | other char mod concat seq | ε | |||
( | pop | |||||||
) | pop | |||||||
* | pop | |||||||
+ | pop | |||||||
? | pop | |||||||
| | pop | |||||||
other | pop | |||||||
Z0 | accept |
struct pcre {
struct _char: action {};
struct alpha: action {};
struct digit: action {};
struct seq: action {};
struct star: action {};
struct plus: action {};
struct opt: action {};
// ...
}
template <typename Grammar, fixed_string Str> struct parser {
template <typename Subject>
static constexpr auto output = parse(list<Grammar::start_symbol>{}, Subject{});
// return current term
template <size_t Pos> constexpr auto get_character() const {
if constexpr (Pos < Str.size()) return term<Str[Pos]>{};
else return epsilon{};
}
// prepare each step and move to next
template <size_t Pos = 0, typename S, typename T>
static constexpr auto parse(S stack, T subject) {
auto symbol = top(stack);
if constexpr (is_semantic_action(symbol)) {
auto previous_term = get_character<Pos-1>();
auto next_subject = decltype( modify(symbol, previous_term, subject) ){};
return parse<Pos>(pop(stack), next_subject);
} else {
auto current_term = get_character<Pos>();
auto next_move = decltype(Grammar::f(symbol, current_term));
return move<Pos>(next_move, pop(stack), subject);
}
}
// pushing something to stack (epsilon case included)
template <size_t Pos, typename... Push, typename Stack, typename Subject>
static constexpr bool move(list<Push...>, Stack stack, Subject subject) {
return parse<Pos>(push(stack, Push{}...), subject);
}
// move to next character
template <size_t Pos, typename Stack, typename Subject>
static constexpr bool move(pop_input, Stack stack, Subject subject) {
return parse<Pos+1>(stack, subject);
}
// reject
template <size_t Pos, typename Stack, typename Subject>
static constexpr auto move(reject, Stack, Subject subject) {
return std::pair{false, subject};
}
// accept
template <size_t Pos, typename Stack>
static constexpr auto move(accept, Stack, Subject subject) {
return std::pair{true, subject};
}
bool correct = output<list<>{}>.first;
using type = decltype( output<list<>{}>.second );
};
// pushing a character
template <Character auto C, typename... Ts>
auto modify(pcre::_char, term<C>, list<Ts...>)
-> list<ch<C>, Ts...>;
// concatenating a sequence (notice the switched order of A & B on the stack)
template <Character auto C, typename A, typename B, typename... Ts>
auto modify(pcre::seq, term<C>, list<B, A, Ts...>)
-> list<concat<A, B>, Ts...>;
// adding to a concatenated sequence
template <Character auto C, typename... As, typename B, typename... Ts>
auto modify(pcre::seq, term<C>, list<B, concat<As...>, Ts...>)
-> list<concat<As..., B>, Ts...>;
// making an alternate (again, switched order of A & B)
template <Character auto C, typename A, typename B, typename... Ts>
auto modify(pcre::alt, term<C>, list<B, A, Ts...>)
-> list<alt<A, B>, Ts...>;
// adding to an alternate
template <auto C, typename... As, typename B, typename... Ts>
auto modify(pcre::alt, term<C>, list<B, alt<As...>, Ts...>)
-> list<alt<As..., B>, Ts...>;
// making something optional
template <Character auto C, typename Opt, typename... Ts>
auto modify(pcre::opt, term<C>, list<Opt, Ts...>)
-> list<opt<Opt>, Ts...>;
// making a plus cycle
template <Character auto C, typename Plus, typename... Ts>
auto modify(pcre::plus, term<C>, list<Plus, Ts...>)
-> list<plus<Plus>, Ts...>;
// making a star cycle
template <Character auto C, typename Star, typename... Ts>
auto modify(pcre::star, term<C>, list<Star, Ts...>)
-> list<star<Star>, Ts...>;
static_assert(std::is_same_v<
,
decltype( top(parser<pcre,"">::type) )
>);
"And if thou gaze long at a finite automaton,
a finite automaton also gazes into thee."– Friedrich Nietzsche
(after taking a computer science class)
[0-9]+
set<int> int
(implicit)char32_t
set<tuple<int, int, char32_t>>
int(0)
set<int>
fixed_set
template <size_t Sz, typename T> class fixed_set {
T data[Sz];
public:
constexpr auto begin();
constexpr auto end();
constexpr auto begin() const;
constexpr auto end() const;
constexpr size_t size() const;
constexpr auto & operator[](size_t);
constexpr const auto & operator[](size_t) const;
constexpr auto insert(T);
template <typename K> constexpr auto erase(K);
constexpr auto erase(T *);
constexpr set();
// ...
};
finite_automaton
struct transition {
int source;
int target;
char32_t term;
constexpr transition(int, int, char32_t);
constexpr bool match(char32_t c) const;
// comparable with int against source
};
template <size_t Tr, size_t FSz> struct finite_automaton {
fixed_set<Tr, transition> transitions;
fixed_set<FSz, int> final_states;
// normal constructor / copy constructor ...
};
static constexpr auto empty = finite_automaton<0,0>{};
static constexpr auto epsilon = finite_automaton<0,1>{{}, {0}};
template <char32_t C> static constexpr auto one_char = finite_automaton<1,1>{
{transition(0, 1, C)},
{1}
};
template <finite_automaton Lhs, finite_automaton Rhs> struct concat_two {
constexpr static auto calculate() {
constexpr size_t tr_count = Lhs.transitions.size() + Rhs.transitions.size();
constexpr size_t tr_start = Rhs.count_transitions(0);
constexpr int prefix = Lhs.get_max_state() + 1;
finite_automaton<(tr_count + tr_start), Rhs.final_states.count()> out;
copy(Lhs.transitions.begin(), Lhs.transitions.end(), out.transitions.begin());
for (int f: Lhs.final_states) {
Rhs.foreach_transition_from(0, [&](transition t){
t.source = f;
t.target += prefix;
out.add_transition(t);
});
}
for (transition t: Rhs.transitions) {
t.source += prefix;
t.target += prefix;
out.add_transition(t);
}
for (int f: Rhs.final_states) out.mark_final(f + prefix);
return out;
}
static constexpr auto value = calculate();
};
Both other algorithms work the same,
pre-calculate the size and then populate the content.
template <finite_automaton... Fas>
static constexpr auto concat = apply<concat_two, Fas...>::value;
template <finite_automaton... Fas>
static constexpr auto alternation = apply<alternation_two, Fas...>::value;
template <finite_automaton... Fas>
static constexpr auto star = star_one<concat<Fas...>>::value;
const auto &
it's C++17 compatible!constexpr auto fa = concat<one_char<'a'>, one_char<'b'>>;
Remove unnecessary states and links by merging the same states.
Remove nondeterministic transitions, can generate 2n new states.
Remove nondeterministic transitions, can generate 2n new states.
Constexpr implementation must be iterative,
as the whole output can't be calculated in one step.
It's useful to minimize the FA after the determinization.
"There is no such thing as a zero-cost abstraction."– Chandler Carruth
template <typename T> struct identify_type;
"hello" =~ /aloha|[a-z]+/
ctre::match<"aloha|[a-z]+">("aloha"sv);
// search(x) is actually match(.*x.*)
ctre::search<"aloha|[a-z]+">("...aloha..."sv);
template <fixed_string re> bool fast_match(const Range auto & rng) {
static_assert(parser<pcre, re>::correct);
using RE = parser<pcre, re>::type;
constexpr auto dfa = fa::minimize<fa::determinize<nfa_from<RE>>>;
return match_state<dfa>(rng.begin(), rng.end());
}
namespace fa {
static constexpr auto empty = finite_automaton<0, 0>{};
}
template <typename RE>
static constexpr auto nfa_from = convert(fa::empty, list<RE>{});
RE
is a type, finite_automaton
is a value.
auto convert(const FinAutomaton auto seed, list<>) {
// for an empty RE seed is empty FA
return seed;
}
namespace fa {
// epsilon is FA with a start state final
static constexpr auto epsilon = finite_automaton<0, 1>{{}, {0}};
}
template <typename... Ts>
auto convert(const FinAutomaton auto lhs, list<epsilon, Ts...>) {
return convert(fa::concat<lhs, fa::epsilon>, list<Ts...>{});
}
namespace fa {
// epsilon is FA with a start state final
template <char32_t Term> static constexpr auto
character = finite_automaton<1, 1>{{transition{0, 1, Term}}, {1}};
}
template <char32_t Term, typename... Ts>
auto convert(const FinAutomaton auto lhs, list<character<Term>, Ts...>) {
return convert(fa::concat<lhs, fa::character<Term>>, list<Ts...>{});
}
template <typename... Content, typename... Ts>
auto convert(const FinAutomaton auto lhs, list<concat<Content...>, Ts...>) {
return convert(lhs, list<Content..., Ts...>{});
}
template <typename... Options, typename... Ts>
auto convert(const FinAutomaton auto lhs, list<alt<Options...>, Ts...>) {
constexpr auto inner = fa::alternative<convert(fa::empty, Options)...>;
return convert(fa::concat<lhs, inner>, list<Ts...>{});
}
template <typename Content, typename Ts...>
auto convert(const FinAutomaton auto seed, list<star<Content>, Ts...>) {
auto content = convert(fa::empty, Content);
return convert(fa::concat<lhs, fa::star<content>>, list<Ts...>{});
}
template <fixed_string re> bool fast_match(const Range auto & rng) {
static_assert(parser<pcre, re>::correct);
using RE = parser<pcre, re>::type;
constexpr auto dfa = fa::minimize<fa::determinize<nfa_from<RE>>>;
return match_state<dfa>(rng.begin(), rng.end());
}
template <finite_automaton DFA, int State = 0, typename Iterator>
constexpr bool match_state(Iterator it, Iterator end) noexcept {
// we can end correctly only if this state is final
if constexpr (DFA.is_final(State)) {
if (end == it) return true;
} else {
if (end == it) return false;
}
// transitions are sorted by source state
constexpr auto index = DFA.first_transition_index_of(State);
// tail-recursion
return choose_transition<DFA, index, State>(it, end);
}
template <finite_automaton DFA, int Index, int State, typename Iterator>
constexpr bool choose_transition(Iterator it, const Iterator end) noexcept {
// check if we are still in same state
if constexpr (Index >= DFA.transitions.size()
|| DFA.transitions[Index].source != State) {
return false;
}
if (DFA.transitions[Index].match(*it)) {
// this transition is the one, go to next state (tail recursion)
return match_state<DFA, DFA.transitions[Index].target>(std::next(it), end);
}
// try next one (also tail recursion)
return choose_transition<DFA, Index+1, State>(it, end);
}
n = length of the input, t = average number of transitions per state
[a-z0-9]+[0-9]
#include <everything>
int main(int argc, char ** argv) {
auto re = PREPARE("PATTERN");
auto lines = load_file_by_lines(argv[1]);
size_t count = 0;
// some warm-up
auto start = steady_clock::now();
for (string_view line: lines) {
if (re.SEARCH(line)) count++;
}
auto end = steady_clock::now();
// print results
}
[a-z0-9]+[0-9]
You can find slides & code at compile-time.re