summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMica White <botahamec@outlook.com>2025-12-07 14:22:10 -0500
committerMica White <botahamec@outlook.com>2025-12-07 14:22:10 -0500
commit91fdd47a80156b942187c734c3473ede4e089c02 (patch)
treeed5f0669e9da431a0bcecda705302c3737fb76f0
First versionHEADmain
-rw-r--r--.gitignore2
-rw-r--r--Cargo.toml6
-rw-r--r--grammar.txt6
-rw-r--r--rustfmt.toml3
-rw-r--r--src/lib.rs507
5 files changed, 524 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..57315ce
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "deteregex"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
diff --git a/grammar.txt b/grammar.txt
new file mode 100644
index 0000000..f2a7827
--- /dev/null
+++ b/grammar.txt
@@ -0,0 +1,6 @@
+regex := alternate (PIPE alternate)*
+alternate := node*
+node := csetnode | grouping
+csetnode := TILDE? (CHARSET | union) STAR?
+union := LEFT_BRACKET CHARSET* RIGHT_BRACKET
+grouping := LEFT_PAREN regex RIGHT_PAREN
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..d3d0d1e
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,3 @@
+edition = "2024"
+hard_tabs = true
+newline_style = "Unix"
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..5271746
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,507 @@
+use std::{borrow::Cow, ops::Range, rc::Rc};
+
+#[derive(Clone)]
+enum CharSet {
+ StaticFn(fn(Option<char>) -> bool),
+ BoxedFn(Rc<dyn Fn(Option<char>) -> bool>),
+}
+
+#[derive(Clone)]
+enum RegexToken {
+ Pipe,
+ Label(Rc<str>),
+ Jump(Rc<str>),
+ CharacterSet(CharSet),
+ KleeneStar,
+ Tilde,
+ LeftParenthesis,
+ RightParenthesis,
+ LeftBracket,
+ RightBracket,
+}
+
+#[derive(Clone)]
+struct Union {
+ left_bracket: RegexToken,
+ charsets: Vec<CharSet>,
+ right_bracket: Option<RegexToken>,
+}
+
+enum CsetType {
+ Literal(CharSet),
+ Union(Union),
+}
+
+struct CharSetNode {
+ tilde: Option<RegexToken>,
+ cset: CsetType,
+ star: Option<RegexToken>,
+}
+
+struct Grouping {
+ left_parenthesis: RegexToken,
+ regex: Regex,
+ right_parenthesis: Option<RegexToken>,
+}
+
+enum RegexNode {
+ Grouping(Grouping),
+ CharSetNode(CharSetNode),
+}
+
+struct Alternate {
+ pipe: Option<RegexToken>,
+ nodes: Vec<RegexNode>,
+}
+
+pub struct Regex {
+ alternates: Vec<Alternate>,
+}
+
+pub struct Match<'h> {
+ haystack: &'h str,
+ start: usize,
+ end: usize,
+}
+
+impl Match<'_> {
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
+ pub fn range(&self) -> Range<usize> {
+ self.start..self.end
+ }
+
+ pub fn as_str(&self) -> &str {
+ &self.haystack[self.range()]
+ }
+}
+
+impl Regex {
+ fn tokenize(mut regex: &str) -> Vec<RegexToken> {
+ fn advance_if_starts_with(regex: &mut &str, pattern: &str) -> bool {
+ if regex.starts_with(pattern) {
+ *regex = regex.split_at(pattern.len()).1;
+ true
+ } else {
+ false
+ }
+ }
+
+ fn upto<'a>(regex: &mut &'a str, pattern: &str) -> Option<&'a str> {
+ regex.split_once(pattern).map(|(consumed, rest)| {
+ *regex = rest;
+ consumed
+ })
+ }
+
+ fn next_char(regex: &mut &str) -> Option<char> {
+ let char = regex.chars().next()?;
+ for i in 1..9 {
+ if regex.is_char_boundary(i) {
+ *regex = &regex[i..];
+ return Some(char);
+ }
+ }
+
+ None
+ }
+
+ let mut tokens = Vec::new();
+ while !regex.is_empty() {
+ if advance_if_starts_with(&mut regex, "|") {
+ tokens.push(RegexToken::Pipe);
+ } else if advance_if_starts_with(&mut regex, "<") {
+ let label = upto(&mut regex, ">").expect("labels should terminate");
+ tokens.push(RegexToken::Label(label.into()));
+ } else if advance_if_starts_with(&mut regex, "@") {
+ let label = upto(&mut regex, ">").expect("jumps should terminate");
+ tokens.push(RegexToken::Jump(label.into()));
+ } else if advance_if_starts_with(&mut regex, "~") {
+ tokens.push(RegexToken::Tilde);
+ } else if advance_if_starts_with(&mut regex, "[") {
+ tokens.push(RegexToken::LeftBracket);
+ } else if advance_if_starts_with(&mut regex, "]") {
+ tokens.push(RegexToken::RightBracket);
+ } else if advance_if_starts_with(&mut regex, "(") {
+ tokens.push(RegexToken::LeftParenthesis);
+ } else if advance_if_starts_with(&mut regex, ")") {
+ tokens.push(RegexToken::RightParenthesis);
+ } else if advance_if_starts_with(&mut regex, "*") {
+ tokens.push(RegexToken::KleeneStar);
+ } else if advance_if_starts_with(&mut regex, ".") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|_| true)));
+ } else if advance_if_starts_with(&mut regex, "$") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| c.is_none())));
+ } else if advance_if_starts_with(&mut regex, "\\d") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| {
+ c.map(|c| c.is_ascii_digit()).unwrap_or(false)
+ })))
+ } else if advance_if_starts_with(&mut regex, "\\D") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| {
+ c.map(|c| !c.is_ascii_digit()).unwrap_or(false)
+ })))
+ } else if advance_if_starts_with(&mut regex, "\\w") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| {
+ c.map(|c| c.is_ascii_alphanumeric() || c == '_')
+ .unwrap_or(false)
+ })))
+ } else if advance_if_starts_with(&mut regex, "\\W") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| {
+ c.map(|c| !(c.is_ascii_alphanumeric() || c == '_'))
+ .unwrap_or(false)
+ })))
+ } else if advance_if_starts_with(&mut regex, "\\s") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| {
+ c.map(|c| c.is_ascii_whitespace()).unwrap_or(false)
+ })))
+ } else if advance_if_starts_with(&mut regex, "\\S") {
+ tokens.push(RegexToken::CharacterSet(CharSet::StaticFn(|c| {
+ c.map(|c| !c.is_ascii_whitespace()).unwrap_or(false)
+ })))
+ } else if advance_if_starts_with(&mut regex, "\\") {
+ let char = next_char(&mut regex).expect("at least one character");
+ tokens.push(RegexToken::CharacterSet(CharSet::BoxedFn(Rc::new(
+ move |c| c.map(|c| c == char).unwrap_or(false),
+ ))));
+ } else {
+ let char = next_char(&mut regex).expect("at least one character");
+ tokens.push(RegexToken::CharacterSet(CharSet::BoxedFn(Rc::new(
+ move |c| c.map(|c| c == char).unwrap_or(false),
+ ))));
+ }
+ }
+
+ tokens
+ }
+
+ fn peek_token(tokens: &[RegexToken]) -> Option<&RegexToken> {
+ tokens.first()
+ }
+
+ fn next_token<'a>(tokens: &mut &'a [RegexToken]) -> Option<&'a RegexToken> {
+ let token = tokens.first()?;
+ *tokens = &tokens[1..];
+ Some(token)
+ }
+
+ fn parse_if<T>(
+ tokens: &mut &[RegexToken],
+ predicate: impl Fn(&RegexToken) -> bool,
+ parser: impl Fn(&mut &[RegexToken]) -> Result<T, Rc<str>>,
+ ) -> Option<Result<T, Rc<str>>> {
+ if !tokens.is_empty() && predicate(&tokens[0]) {
+ Some(parser(tokens))
+ } else {
+ None
+ }
+ }
+
+ fn parse_while<T>(
+ tokens: &mut &[RegexToken],
+ predicate: impl Fn(&RegexToken) -> bool,
+ parser: impl Fn(&mut &[RegexToken]) -> Result<T, Rc<str>>,
+ ) -> Result<Vec<T>, Rc<str>> {
+ let mut result = Vec::new();
+ while !tokens.is_empty() && predicate(&tokens[0]) {
+ result.push(parser(tokens)?);
+ }
+ Ok(result)
+ }
+
+ fn parse_union(tokens: &mut &[RegexToken]) -> Result<Union, Rc<str>> {
+ let Some(left_bracket) = Self::next_token(tokens).cloned() else {
+ return Err("expected a left bracket".into());
+ };
+ let charsets = Self::parse_while(
+ tokens,
+ |token| matches!(token, RegexToken::CharacterSet(..)),
+ |token| {
+ if let Some(RegexToken::CharacterSet(set)) = token.first() {
+ Ok(set.clone())
+ } else {
+ unreachable!("just saw a set")
+ }
+ },
+ )?;
+ let right_bracket = Self::next_token(tokens).cloned();
+
+ Ok(Union {
+ left_bracket,
+ charsets,
+ right_bracket,
+ })
+ }
+
+ fn parse_cset_type(tokens: &mut &[RegexToken]) -> Result<CsetType, Rc<str>> {
+ if let Some(cset) = Self::parse_if(
+ tokens,
+ |token| matches!(token, RegexToken::LeftBracket),
+ Self::parse_union,
+ ) {
+ Ok(CsetType::Union(cset?))
+ } else if let Some(cset) = Self::parse_if(
+ tokens,
+ |token| matches!(token, RegexToken::CharacterSet(..)),
+ |tokens| {
+ if let Some(RegexToken::CharacterSet(cset)) = Self::next_token(tokens) {
+ Ok(cset.clone())
+ } else {
+ unreachable!("just saw a char set")
+ }
+ },
+ ) {
+ Ok(CsetType::Literal(cset?))
+ } else {
+ Err("invalid char set".into())
+ }
+ }
+
+ fn parse_grouping(tokens: &mut &[RegexToken]) -> Result<Grouping, Rc<str>> {
+ let left_parenthesis = Self::next_token(tokens)
+ .expect("a left parenthesis")
+ .clone();
+ let regex = Self::parse_regex(tokens)?;
+ let right_parenthesis = Self::next_token(tokens).cloned();
+
+ Ok(Grouping {
+ left_parenthesis,
+ regex,
+ right_parenthesis,
+ })
+ }
+
+ fn parse_cset_node(tokens: &mut &[RegexToken]) -> Result<CharSetNode, Rc<str>> {
+ let tilde = Self::parse_if(
+ tokens,
+ |token| matches!(token, RegexToken::Tilde),
+ |tokens| {
+ Self::next_token(tokens)
+ .ok_or("bug: should have a tilde here".into())
+ .cloned()
+ },
+ )
+ .transpose()?;
+
+ let cset = Self::parse_cset_type(tokens)?;
+
+ let star = Self::parse_if(
+ tokens,
+ |token| matches!(token, RegexToken::KleeneStar),
+ |tokens| {
+ Ok(Self::next_token(tokens)
+ .expect("at least one token")
+ .clone())
+ },
+ )
+ .transpose()?;
+
+ Ok(CharSetNode { tilde, cset, star })
+ }
+
+ fn parse_node(tokens: &mut &[RegexToken]) -> Result<RegexNode, Rc<str>> {
+ let peek = Self::peek_token(tokens).ok_or_else(|| "expected a token".to_string())?;
+ if matches!(peek, RegexToken::LeftParenthesis) {
+ Ok(RegexNode::Grouping(Self::parse_grouping(tokens)?))
+ } else {
+ Ok(RegexNode::CharSetNode(Self::parse_cset_node(tokens)?))
+ }
+ }
+
+ fn parse_alternate(tokens: &mut &[RegexToken]) -> Result<Alternate, Rc<str>> {
+ let pipe = Self::parse_if(
+ tokens,
+ |token| matches!(token, RegexToken::Pipe),
+ |tokens| Self::next_token(tokens).cloned().ok_or(Rc::<str>::from("")),
+ )
+ .transpose()?;
+ let nodes = Self::parse_while(
+ tokens,
+ |token| {
+ !matches!(
+ token,
+ RegexToken::Pipe
+ | RegexToken::RightParenthesis
+ | RegexToken::KleeneStar
+ | RegexToken::RightBracket
+ )
+ },
+ Self::parse_node,
+ )?;
+
+ Ok(Alternate { pipe, nodes })
+ }
+
+ fn parse_regex(tokens: &mut &[RegexToken]) -> Result<Regex, Rc<str>> {
+ let alternates = Self::parse_while(
+ tokens,
+ |token| {
+ !matches!(
+ token,
+ RegexToken::RightParenthesis
+ | RegexToken::KleeneStar
+ | RegexToken::RightBracket
+ )
+ },
+ Self::parse_alternate,
+ )?;
+ Ok(Regex { alternates })
+ }
+
+ fn find_cset(cset: &CharSet, input: &str) -> bool {
+ let char = input.chars().next();
+ match cset {
+ CharSet::StaticFn(func) => func(char),
+ CharSet::BoxedFn(func) => func(char),
+ }
+ }
+
+ fn find_union(union: &Union, input: &str) -> bool {
+ for cset in &union.charsets {
+ if Self::find_cset(cset, input) {
+ return true;
+ }
+ }
+
+ false
+ }
+
+ fn find_cset_node(cset: &CharSetNode, input: &str) -> bool {
+ let mut is_match = false;
+ loop {
+ let match_found = match &cset.cset {
+ CsetType::Literal(cset) => Self::find_cset(cset, input),
+ CsetType::Union(cset) => Self::find_union(cset, input),
+ };
+
+ if match_found {
+ is_match = true;
+ } else {
+ break;
+ }
+
+ if cset.star.is_none() {
+ break;
+ }
+ }
+
+ if cset.tilde.is_some() {
+ is_match = !is_match;
+ }
+
+ is_match
+ }
+
+ fn find_node(node: &RegexNode, input: &str) -> Option<usize> {
+ match node {
+ RegexNode::CharSetNode(cset) => Self::find_cset_node(cset, input)
+ .then(|| input.chars().next().map(char::len_utf8))
+ .flatten(),
+ RegexNode::Grouping(grouping) => Self::find(&grouping.regex, input).map(|m| m.len()),
+ }
+ }
+
+ fn find_alternate<'s>(alternate: &Alternate, mut input: &'s str) -> Option<Match<'s>> {
+ let original = input;
+ for node in &alternate.nodes {
+ if let Some(match_len) = Self::find_node(node, input) {
+ input = &input[match_len..];
+ } else {
+ return None;
+ }
+ }
+
+ Some(Match {
+ haystack: original,
+ start: 0,
+ end: original.len() - input.len(),
+ })
+ }
+
+ pub fn new(regex: &str) -> Result<Self, Rc<str>> {
+ let tokens = Self::tokenize(regex);
+ let mut tokens = tokens.as_slice();
+ Self::parse_regex(&mut tokens)
+ }
+
+ pub fn find<'s>(&self, input: &'s str) -> Option<Match<'s>> {
+ for start in 0..input.len() {
+ if let Some(match_str) = self.find_at(input, start) {
+ return Some(match_str);
+ }
+ }
+
+ None
+ }
+
+ pub fn find_all<'s>(&self, input: &'s str) -> impl IntoIterator<Item = Match<'s>> {
+ let mut matches = Vec::new();
+ let mut i = 0;
+ while i < input.len() {
+ if let Some(match_str) = self.find_at(input, i) {
+ i = match_str.end;
+ matches.push(match_str);
+ } else {
+ i += 1;
+ }
+ }
+
+ matches
+ }
+
+ pub fn find_at<'s>(&self, input: &'s str, start: usize) -> Option<Match<'s>> {
+ for alternate in &self.alternates {
+ if let Some(match_str) = Self::find_alternate(alternate, &input[start..]) {
+ return Some(Match {
+ haystack: input,
+ start,
+ end: start + match_str.len(),
+ });
+ }
+ }
+
+ None
+ }
+
+ pub fn is_match_at(&self, input: &str, start: usize) -> bool {
+ self.find_at(input, start).is_some()
+ }
+
+ pub fn is_match(&self, input: &str) -> bool {
+ self.find(input).is_some()
+ }
+
+ pub fn replace<'s>(&self, input: &'s str, replacement: impl AsRef<str>) -> Cow<'s, str> {
+ if let Some(match_str) = self.find(input) {
+ input
+ .replace(match_str.as_str(), replacement.as_ref())
+ .into()
+ } else {
+ Cow::Borrowed(input)
+ }
+ }
+
+ pub fn replace_all<'s>(&self, input: &'s str, replacement: impl AsRef<str>) -> Cow<'s, str> {
+ let mut output = Cow::Borrowed(input);
+ for match_str in self.find_all(input) {
+ output = output
+ .replace(match_str.as_str(), replacement.as_ref())
+ .into();
+ }
+
+ output
+ }
+}