(almost every language has a regex library)
[a-z0-9]+[a-z0-9]+abcd[a-z]
struct date {
std::string year;
std::string month;
std::string day;
};
std::optional<date> extract_date(std::string_view input) noexcept {
static std::regex pattern("([0-9]{4})/([0-9]{2})/([0-9]{2})");
std::smatch_results<std::string_view::iterator> captures;
if (!std::regex_match(input.begin(), input.end(), captures, pattern)) {
return std::nullopt;
}
return date{
captures[1].str(),
captures[2].str(),
captures[3].str()
};
}
struct date {
std::string year;
std::string month;
std::string day;
};
std::optional<date> extract_date(std::string_view input) noexcept {
auto [match, y, m, d] = ctre::match<"([0-9]{4})/([0-9]{2})/([0-9]{2})">(input);
if (!match) {
return std::nullopt;
}
return date{y.str(), m.str(), d.str()};
}
bool fnc(std::string_view view) {
return ctre::match<"(hello">(view);
}
In file included from test.cpp:1: In file included from include/ctre.hpp:5: include/ctre/functions.hpp:48:2: error: static_assert failed due to requirement 'correct' "Regular Expression contains syntax error." static_assert(correct, "Regular Expression contains syntax error."); ^ ~~~~~~~ test.cpp:2:15: note: in instantiation of function template specialization 'ctre::match<"(hello">' requested here return ctre::match<"(hello">(view); ^
constexpr bool is_cyrillic(std::u32string_view input) noexcept {
return ctre::match<"[\\p{cyrillic}\\s]+">(input);
}
static_assert(is_cyrillic(U"Нормально делай нормально будет"));
constexpr bool is_emoji_only(std::u32string_view input) noexcept {
return ctre::match<"[\\p{emoji]+">(input);
}
static_assert(is_emoji_only(U"😘☺️"));
bool contains_date(std::string_view input) noexcept {
return ctre::fast_search<"[0-9]{4}/[0-9]{2}/[0-9]{2}">(input);
}
hello|aloha|hallo|ahoj|bonjour
[a-z]+[0-9]+
a
or {"a
"}),(ct)?re|[a-f]+
(ε|ct)re|(a|b|c|d|e|f)(a|b|c|d|e|f)*
(ct)?re|[a-f]+
// building blocks
struct epsilon { };
template <character_like auto C> struct ch { };
// operations
template <typename...> struct concat { };
template <typename...> struct alt { };
template <typename> struct star { };
// for convenient usage
template <typename E> using opt = alternation<E, epsilon>;
template <typename E> using plus = concat<E, star<E>>;
( | ) | * | + | ? | | | other | ε | |
---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | |||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε |
seq0 | ( alt0 ) mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod seq seq | ε | ε | other char mod seq seq | ε | |||
( | pop | |||||||
) | pop | |||||||
* | pop | |||||||
+ | pop | |||||||
? | pop | |||||||
| | pop | |||||||
other | pop | |||||||
Z0 | accept |
( | ) | * | + | ? | | | other | ε | |
---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | |||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε |
seq0 | ( alt0 ) mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod seq seq | ε | ε | other char mod seq seq | ε | |||
terminal | pop | pop | pop | pop | pop | pop | pop | |
Z0 | accept |
struct S {};
struct alt0 {};
struct alt {};
struct mod {};
struct seq0 {};
struct seq {};
using start_symbol = S;
// f (symbol, char) → (...)
auto f(symbol, term<'c'>) -> list<...>;
// f (symbol, symbol) → pop input
template <auto S> auto f(term<S>, term<S>) -> pop_input;
// f (symbol, char) → reject
auto f(...) -> reject;
// f (Z0, ε) → accept
auto f(empty_stack, epsilon) -> accept;
struct pcre {
struct S {};
struct alt0 {};
struct alt {};
// ...
using start_symbol = S;
auto f(...) -> reject;
// ...
}
constexpr bool ok = parser<pcre, "a+b+">::correct;
template <typename Grammar, ...> struct parser {
//...
auto next_move = decltype( Grammar::f(top_of_stack, current_term) ){};
//...
}
template <typename CharT, size_t N> struct fixed_string {
CharT data[N+1];
// constexpr constructor from const char[N]
constexpr auto operator[](size_t i) const noexcept { return data[i]; }
constexpr size_t size() const noexcept { return N; }
constexpr auto operator<=>(const fixed_string &) = default;
};
template <typename CharT, size_t N>
fixed_string(const CharT[N]) -> fixed_string<CharT, N>;
template <typename... Ts> struct list { };
template <typename... Ts, typename... As>
constexpr auto push(list<Ts...>, As...) -> list<As..., Ts...>;
template <typename T, typename... Ts>
constexpr auto pop(list<T, Ts...>) -> list<Ts...>;
template <typename T, typename... Ts>
constexpr auto top(list<T, Ts...>) -> T;
struct empty { };
constexpr auto top(list<>) -> empty;
constexpr bool ok = parser<pcre, "a*b*">::correct;
template <typename Grammar, fixed_string Str> struct parser {
static constexpr bool correct = parse(list<Grammar::start_symbol>{});
// return current term
template <size_t Pos> constexpr auto get_character() const {
if constexpr (Pos < Str.size()) return term<Str[Pos]>{};
else return epsilon{};
}
// prepare each step and move to next
template <size_t Pos = 0, typename S> static constexpr bool parse(S stack) {
auto symbol = top(stack);
auto current_term = get_character<Pos>();
auto next_move = decltype( Grammar::f(symbol, current_term) ) {};
return move<Pos>(next_move, pop(stack));
}
// pushing something to stack (epsilon case included)
template <size_t Pos, typename... Push, typename Stack>
static constexpr bool move(list<Push...>, Stack stack) {
return parse<Pos>(push(stack, Push{}...));
}
// move to next character
template <size_t Pos, typename Stack>
static constexpr bool move(pop_input, Stack stack) {
return parse<Pos+1>(stack);
}
// reject
template <size_t Pos, typename Stack>
static constexpr bool move(reject, Stack) {
return false;
}
// accept
template <size_t Pos, typename Stack>
static constexpr bool move(accept, Stack) {
return true;
}
};
( | ) | * | + | ? | | | other | ε | |
---|---|---|---|---|---|---|---|---|
→ S | ( alt0 ) mod seq alt | other char mod seq alt | ε | |||||
alt0 | ( alt0 ) mod seq alt | other char mod seq alt | ||||||
alt | ε | | seq0 alt alt | ε | |||||
mod | ε | ε | * star | + plus | ? opt | ε | ε | ε |
seq0 | ( alt0 ) mod seq | other char mod seq | ||||||
seq | ( alt0 ) mod concat seq | ε | ε | other char mod concat seq | ε | |||
( | pop | |||||||
) | pop | |||||||
* | pop | |||||||
+ | pop | |||||||
? | pop | |||||||
| | pop | |||||||
other | pop | |||||||
Z0 | accept |
struct pcre {
struct _char: action {};
struct alpha: action {};
struct digit: action {};
struct seq: action {};
struct star: action {};
struct plus: action {};
struct opt: action {};
// ...
}
template <typename Grammar, fixed_string Str> struct parser {
template <typename Subject>
static constexpr auto output = parse(list<Grammar::start_symbol>{}, Subject{});
// return current term
template <size_t Pos> constexpr auto get_character() const {
if constexpr (Pos < Str.size()) return term<Str[Pos]>{};
else return epsilon{};
}
// prepare each step and move to next
template <size_t Pos = 0, typename S, typename T>
static constexpr auto parse(S stack, T subject) {
auto symbol = top(stack);
if constexpr (is_semantic_action(symbol)) {
auto previous_term = get_character<Pos-1>();
auto next_subject = decltype( modify(symbol, previous_term, subject) ){};
return parse<Pos>(pop(stack), next_subject);
} else {
auto current_term = get_character<Pos>();
auto next_move = decltype( Grammar::f(symbol, current_term) ){};
return move<Pos>(next_move, pop(stack), subject);
}
}
// pushing something to stack (epsilon case included)
template <size_t Pos, typename... Push, typename Stack, typename Subject>
static constexpr bool move(list<Push...>, Stack stack, Subject subject) {
return parse<Pos>(push(stack, Push{}...), subject);
}
// move to next character
template <size_t Pos, typename Stack, typename Subject>
static constexpr bool move(pop_input, Stack stack, Subject subject) {
return parse<Pos+1>(stack, subject);
}
// reject
template <size_t Pos, typename Stack, typename Subject>
static constexpr auto move(reject, Stack, Subject subject) {
return std::pair{false, subject};
}
// accept
template <size_t Pos, typename Stack>
static constexpr auto move(accept, Stack, Subject subject) {
return std::pair{true, subject};
}
bool correct = output<list<>>.first;
using type = decltype( output<list<>>.second );
};
// pushing a character
template <character_like auto C, typename... Ts>
auto modify(pcre::_char, term<C>, list<Ts...>)
-> list<ch<C>, Ts...>;
// concatenating a sequence (notice the switched order of A & B on the stack)
template <character_like auto C, typename A, typename B, typename... Ts>
auto modify(pcre::seq, term<C>, list<B, A, Ts...>)
-> list<concat<A, B>, Ts...>;
// adding to a concatenated sequence
template <character_like auto C, typename... As, typename B, typename... Ts>
auto modify(pcre::seq, term<C>, list<B, concat<As...>, Ts...>)
-> list<concat<As..., B>, Ts...>;
// making an alternate (again, switched order of A & B)
template <character_like auto C, typename A, typename B, typename... Ts>
auto modify(pcre::alt, term<C>, list<B, A, Ts...>)
-> list<alt<A, B>, Ts...>;
// adding to an alternate
template <auto C, typename... As, typename B, typename... Ts>
auto modify(pcre::alt, term<C>, list<B, alt<As...>, Ts...>)
-> list<alt<As..., B>, Ts...>;
// making something optional
template <character_like auto C, typename Opt, typename... Ts>
auto modify(pcre::opt, term<C>, list<Opt, Ts...>)
-> list<opt<Opt>, Ts...>;
// making a plus cycle
template <character_like auto C, typename Plus, typename... Ts>
auto modify(pcre::plus, term<C>, list<Plus, Ts...>)
-> list<plus<Plus>, Ts...>;
// making a star cycle
template <character_like auto C, typename Star, typename... Ts>
auto modify(pcre::star, term<C>, list<Star, Ts...>)
-> list<star<Star>, Ts...>;
static_assert(std::is_same_v<
,
decltype( top(parser<pcre,"">::type) )
>);
"And if thou gaze long at a finite automaton,
a finite automaton also gazes into thee."– Friedrich Nietzsche
[0-9]+
set<int> int
(implicit)char32_t
set<tuple<int, int, char32_t>>
int(0)
set<int>
fixed_set
template <size_t Sz, typename T> class fixed_set {
T data[Sz];
public:
constexpr auto begin();
constexpr auto end();
constexpr auto begin() const;
constexpr auto end() const;
constexpr size_t size() const;
constexpr auto & operator[](size_t);
constexpr const auto & operator[](size_t) const;
constexpr auto insert(T);
template <typename K> constexpr auto erase(K);
constexpr auto erase(T *);
constexpr set();
// ...
};
finite_automaton
struct transition {
int source;
int target;
char32_t term;
constexpr transition(int, int, char32_t);
constexpr bool match(char32_t c) const;
// comparable with int against source
};
template <size_t Tr, size_t FSz> struct finite_automaton {
fixed_set<Tr, transition> transitions; // sorted as source + target tuple
fixed_set<FSz, int> final_states;
// normal constructor / copy constructor ...
};
static constexpr auto empty = finite_automaton<0,0>{};
static constexpr auto epsilon = finite_automaton<0,1>{{}, {0}};
template <char32_t C> static constexpr auto one_char = finite_automaton<1,1>{
{transition(0, 1, C)},
{1}
};
template <finite_automaton Lhs, finite_automaton Rhs> struct concat_two {
constexpr static auto calculate() {
// precalculate size
constexpr size_t tr_count = Lhs.transitions.size() + Rhs.transitions.size();
constexpr size_t tr_start = Rhs.count_transitions(0);
// calculate the first available state id
constexpr int prefix = Lhs.get_max_state() + 1;
// create output object
finite_automaton<(tr_count + tr_start), Rhs.final_states.count()> out;
// copy left FA into output
copy(Lhs.transitions.begin(), Lhs.transitions.end(), out.transitions.begin());
// connect left final states to the right start state
for (int f: Lhs.final_states) {
Rhs.foreach_transition_from(0, [&](transition t){
t.source = f;
t.target += prefix;
out.add_transition(t);
});
}
// copy right FA into output
for (transition t: Rhs.transitions) {
t.source += prefix;
t.target += prefix;
out.add_transition(t);
}
// mark final states from right as the final states
for (int f: Rhs.final_states) out.mark_final(f + prefix);
return out;
}
// store the output
static constexpr auto value = calculate();
};
// shorthand
template <finite_automaton Lhs, finite_automaton Rhs>
static constexpr auto concat = concat_two<Lhs, Rhs>::value;
Both other algorithms work the same,
pre-calculate the size and then populate the content.
constexpr auto fa = concat<one_char<'a'>, one_char<'b'>>;
Remove nondeterministic transitions, can generate 2n new states.
Remove nondeterministic transitions, can generate 2n new states.
Constexpr implementation must be iterative,
as the whole output can't be calculated in one step.
It's useful to minimize the FA after the determinization.
"Abstraction good ... but also bad"– Chandler Carruth
template <typename T> struct identify_type;
constexpr JIT compilation*
*currently developed for clang
"hello" =~ /aloha|[a-z]+/
ctre::fast_match<"aloha|[a-z]+">("aloha"sv);
// search(x) is actually match(.*x.*)
ctre::fast_search<"aloha|[a-z]+">("something aloha something"sv);
template <fixed_string re> bool fast_match(const std::forward_range auto & rng) {
static_assert(parser<pcre, re>::correct);
using RE = parser<pcre, re>::type;
constexpr auto dfa = fa::minimize<fa::determinize<nfa_from<RE>>>;
return match_state<dfa>(rng.begin(), rng.end());
}
namespace fa {
static constexpr auto empty = finite_automaton<0, 0>{};
}
template <typename RE>
static constexpr auto nfa_from = build(fa::empty, list<RE>{});
RE
is a type, finite_automaton
is a value.
auto build(const fa_like auto seed, list<>) {
// for an empty RE seed is empty FA
return seed;
}
namespace fa {
// epsilon is FA with a start state final
static constexpr auto epsilon = finite_automaton<0, 1>{{}, {0}};
}
template <typename... Ts>
auto build(const fa_like auto lhs, list<epsilon, Ts...>) {
return build(fa::concat<lhs, fa::epsilon>, list<Ts...>{});
}
namespace fa {
// epsilon is FA with a start state final
template <char32_t Term> static constexpr auto
character = finite_automaton<1, 1>{{transition{0, 1, Term}}, {1}};
}
template <char32_t Term, typename... Ts>
auto build(const fa_like auto lhs, list<character<Term>, Ts...>) {
return build(fa::concat<lhs, fa::character<Term>>, list<Ts...>{});
}
template <typename... Content, typename... Ts>
auto build(const fa_like auto lhs, list<concat<Content...>, Ts...>) {
return build(lhs, list<Content..., Ts...>{});
}
template <typename... Options, typename... Ts>
auto build(const fa_like auto lhs, list<alt<Options...>, Ts...>) {
constexpr auto inner = fa::alternative<build(fa::empty, Options)...>;
return build(fa::concat<lhs, inner>, list<Ts...>{});
}
template <typename Content, typename Ts...>
auto build(const fa_like auto seed, list<star<Content>, Ts...>) {
auto content = build(fa::empty, Content);
return build(fa::concat<lhs, fa::star<content>>, list<Ts...>{});
}
template <fixed_string re> bool fast_match(const std::forward_range auto & rng) {
static_assert(parser<pcre, re>::correct);
using RE = parser<pcre, re>::type;
constexpr auto dfa = fa::minimize<fa::determinize<nfa_from<RE>>>;
return match_state<dfa>(rng.begin(), rng.end());
}
template <finite_automaton DFA, int State = 0, typename Iterator>
constexpr bool match_state(Iterator it, Iterator end) noexcept {
// we can end correctly only if this state is final
if constexpr (DFA.is_final(State)) {
if (end == it) return true;
} else {
if (end == it) return false;
}
// transitions are sorted by source state
constexpr auto index = DFA.first_transition_index_of(State);
// tail-recursion
return choose_transition<DFA, index, State>(it, end);
}
template <finite_automaton DFA, int Index, int State, typename Iterator>
constexpr bool choose_transition(Iterator it, const Iterator end) noexcept {
// check if we are still in same state
if constexpr (Index >= DFA.transitions.size()
|| DFA.transitions[Index].source != State) {
return false;
}
if (DFA.transitions[Index].match(*it)) {
// this transition is the one, go to next state (tail recursion)
return match_state<DFA, DFA.transitions[Index].target>(std::next(it), end);
}
// try next one (also tail recursion)
return choose_transition<DFA, Index+1, State>(it, end);
}
n = length of the input, t = average number of transitions per state
[a-z0-9]+[0-9]
#include <everything>
int main(int argc, char ** argv) {
auto re = PREPARE("PATTERN");
auto lines = load_file_by_lines(argv[1]);
size_t count = 0;
// some warm-up
auto start = steady_clock::now();
for (string_view line: lines) {
if (re.SEARCH(line)) count++;
}
auto end = steady_clock::now();
// print results
}
[a-z0-9]+[0-9]
Check Hal Finkel's CppCon talk:
Faster Compile Times and Better Performance:
Bringing Just-in-Time Compilation to C++
(Spoiler: Run-Time Compile Time Regular Expressions 😱)
💁🏻♀️ compile-time.re
Also thanks to my friends:
Lucie Mohelníková, Michal Vaner, Martin Doložílek,
Filip Slunečko, and Galina Alperovich