From 1035f6b1ff502eb5b1a5fc49a79f45971c772d47 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 11 Jun 2023 21:25:23 -0400 Subject: [PATCH] deps: initial migration steps to regex 1.9 This leaves the grep-regex crate in tatters. Pretty much the entire thing needs to be re-worked. The upshot is that it should result in some big simplifications. I hope. The idea here is to drop down and actually use regex-automata 0.3 instead of the regex crate itself. --- Cargo.lock | 78 +++--- Cargo.toml | 7 +- crates/core/args.rs | 2 +- crates/globset/Cargo.toml | 6 +- crates/globset/src/lib.rs | 4 +- crates/regex/Cargo.toml | 10 +- crates/regex/src/config.rs | 13 +- crates/regex/src/crlf.rs | 42 ++- crates/regex/src/literal.rs | 419 +-------------------------- crates/regex/src/literalold.rs | 466 +++++++++++++++++++++++++++++++ crates/regex/src/matcher.rs | 2 + crates/regex/src/multi.rs | 27 +- crates/regex/src/non_matching.rs | 54 ++-- crates/regex/src/strip.rs | 33 +-- crates/regex/src/util.rs | 1 + 15 files changed, 606 insertions(+), 558 deletions(-) create mode 100644 crates/regex/src/literalold.rs diff --git a/Cargo.lock b/Cargo.lock index 44bb161..a721f8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,18 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" -dependencies = [ - "memchr", -] - -[[package]] -name = "aho-corasick" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] @@ -40,7 +31,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5" dependencies = [ "memchr", "once_cell", - "regex-automata", + "regex-automata 0.1.10", "serde", ] @@ -131,7 +122,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" name = "globset" version = "0.4.10" dependencies = [ - "aho-corasick 0.7.20", + "aho-corasick", "bstr", "fnv", "glob", @@ -204,12 +195,12 @@ dependencies = [ name = "grep-regex" version = "0.1.11" dependencies = [ - "aho-corasick 0.7.20", + "aho-corasick", "bstr", "grep-matcher", "log", "regex", - "regex-syntax 0.6.29", + "regex-syntax", "thread_local", ] @@ -287,9 +278,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.144" +version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" +checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" [[package]] name = "libm" @@ -299,12 +290,9 @@ checksum = "7fc7aa29613bd6a620df431842069224d8bc9011086b1db4c0e0cd47fa03ec9a" [[package]] name = "log" -version = "0.4.17" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" [[package]] name = "memchr" @@ -323,9 +311,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "packed_simd_2" @@ -368,31 +356,30 @@ checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "proc-macro2" -version = "1.0.58" +version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" dependencies = [ "proc-macro2", ] [[package]] name = "regex" -version = "1.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81ca098a9821bd52d6b24fd8b10bd081f47d39c22778cafaa75a2857a62c6390" +version = "1.8.4" dependencies = [ - "aho-corasick 1.0.1", + "aho-corasick", "memchr", - "regex-syntax 0.7.2", + "regex-automata 0.3.0", + "regex-syntax", ] [[package]] @@ -402,16 +389,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +name = "regex-automata" +version = "0.3.0" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] [[package]] name = "regex-syntax" version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "ripgrep" @@ -449,18 +437,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.163" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.163" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", @@ -486,9 +474,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "syn" -version = "2.0.16" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 662773c..bb15e12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,11 @@ autotests = false edition = "2018" rust-version = "1.65" +[patch.crates-io] +regex = { path = "/home/andrew/rust/regex" } +regex-automata = { path = "/home/andrew/rust/regex/regex-automata" } +regex-syntax = { path = "/home/andrew/rust/regex/regex-syntax" } + [[bin]] bench = false path = "crates/core/main.rs" @@ -47,7 +52,7 @@ grep = { version = "0.2.12", path = "crates/grep" } ignore = { version = "0.4.19", path = "crates/ignore" } lazy_static = "1.1.0" log = "0.4.5" -regex = "1.3.5" +regex = "1.8.3" serde_json = "1.0.23" termcolor = "1.1.0" diff --git a/crates/core/args.rs b/crates/core/args.rs index 425ec19..10e6a64 100644 --- a/crates/core/args.rs +++ b/crates/core/args.rs @@ -1464,7 +1464,7 @@ impl ArgMatches { // own, but if the patterns are joined in a set of alternations, then // you wind up with `foo|`, which is currently invalid in Rust's regex // engine. - "(?:z{0})*".to_string() + "(?:)".to_string() } /// Converts an OsStr pattern to a String pattern. The pattern is escaped diff --git a/crates/globset/Cargo.toml b/crates/globset/Cargo.toml index 04d6597..e6ce307 100644 --- a/crates/globset/Cargo.toml +++ b/crates/globset/Cargo.toml @@ -20,11 +20,11 @@ name = "globset" bench = false [dependencies] -aho-corasick = "0.7.3" -bstr = { version = "1.1.0", default-features = false, features = ["std"] } +aho-corasick = "1.0.2" +bstr = { version = "1.5.0", default-features = false, features = ["std"] } fnv = "1.0.6" log = { version = "0.4.5", optional = true } -regex = { version = "1.1.5", default-features = false, features = ["perf", "std"] } +regex = { version = "1.8.3", default-features = false, features = ["perf", "std"] } serde = { version = "1.0.104", optional = true } [dev-dependencies] diff --git a/crates/globset/src/lib.rs b/crates/globset/src/lib.rs index c8072b2..8ea9af1 100644 --- a/crates/globset/src/lib.rs +++ b/crates/globset/src/lib.rs @@ -818,7 +818,7 @@ impl MultiStrategyBuilder { fn prefix(self) -> PrefixStrategy { PrefixStrategy { - matcher: AhoCorasick::new_auto_configured(&self.literals), + matcher: AhoCorasick::new(&self.literals).unwrap(), map: self.map, longest: self.longest, } @@ -826,7 +826,7 @@ impl MultiStrategyBuilder { fn suffix(self) -> SuffixStrategy { SuffixStrategy { - matcher: AhoCorasick::new_auto_configured(&self.literals), + matcher: AhoCorasick::new(&self.literals).unwrap(), map: self.map, longest: self.longest, } diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index 64157cf..4f9b295 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -14,10 +14,10 @@ license = "Unlicense OR MIT" edition = "2018" [dependencies] -aho-corasick = "0.7.3" -bstr = "1.1.0" +aho-corasick = "1.0.2" +bstr = "1.5.0" grep-matcher = { version = "0.1.6", path = "../matcher" } log = "0.4.5" -regex = "1.1" -regex-syntax = "0.6.5" -thread_local = "1.1.2" +regex = "1.8.3" +regex-syntax = "0.7.2" +thread_local = "1.1.7" diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index 5f40b8c..bb7430e 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -71,7 +71,7 @@ impl Config { let ast = self.ast(pattern)?; let analysis = self.analysis(&ast)?; let expr = hir::translate::TranslatorBuilder::new() - .allow_invalid_utf8(true) + .utf8(false) .case_insensitive(self.is_case_insensitive(&analysis)) .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) @@ -172,7 +172,12 @@ impl ConfiguredHIR { /// CRLF hack is enabled and the regex is line anchored at the end. In /// this case, matches that end with a `\r` have the `\r` stripped. pub fn needs_crlf_stripped(&self) -> bool { - self.config.crlf && self.expr.is_line_anchored_end() + self.config.crlf + && self + .expr + .properties() + .look_set_suffix_any() + .contains(hir::Look::EndLF) } /// Returns the line terminator configured on this expression. @@ -202,7 +207,7 @@ impl ConfiguredHIR { /// Returns true if and only if the underlying HIR has any text anchors. fn is_any_anchored(&self) -> bool { - self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end() + self.expr.properties().look_set().contains_anchor_haystack() } /// Builds a regular expression from this HIR expression. @@ -301,7 +306,7 @@ impl ConfiguredHIR { let expr = ::regex_syntax::ParserBuilder::new() .nest_limit(self.config.nest_limit) .octal(self.config.octal) - .allow_invalid_utf8(true) + .utf8(false) .multi_line(self.config.multi_line) .dot_matches_new_line(self.config.dot_matches_new_line) .unicode(self.config.unicode) diff --git a/crates/regex/src/crlf.rs b/crates/regex/src/crlf.rs index 45492bd..b0c85c8 100644 --- a/crates/regex/src/crlf.rs +++ b/crates/regex/src/crlf.rs @@ -124,32 +124,26 @@ pub fn adjust_match(haystack: &[u8], m: Match) -> Match { /// nicely in most cases, especially when a match is limited to a single line. pub fn crlfify(expr: Hir) -> Hir { match expr.into_kind() { - HirKind::Anchor(hir::Anchor::EndLine) => { - let concat = Hir::concat(vec![ - Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrOne, - greedy: false, - hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))), - }), - Hir::anchor(hir::Anchor::EndLine), - ]); - Hir::group(hir::Group { - kind: hir::GroupKind::NonCapturing, - hir: Box::new(concat), - }) - } + HirKind::Look(hir::Look::EndLF) => Hir::concat(vec![ + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: false, + sub: Box::new(Hir::literal("\r".as_bytes())), + }), + Hir::look(hir::Look::EndLF), + ]), HirKind::Empty => Hir::empty(), - HirKind::Literal(x) => Hir::literal(x), + HirKind::Literal(hir::Literal(x)) => Hir::literal(x), HirKind::Class(x) => Hir::class(x), - HirKind::Anchor(x) => Hir::anchor(x), - HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Look(x) => Hir::look(x), HirKind::Repetition(mut x) => { - x.hir = Box::new(crlfify(*x.hir)); + x.sub = Box::new(crlfify(*x.sub)); Hir::repetition(x) } - HirKind::Group(mut x) => { - x.hir = Box::new(crlfify(*x.hir)); - Hir::group(x) + HirKind::Capture(mut x) => { + x.sub = Box::new(crlfify(*x.sub)); + Hir::capture(x) } HirKind::Concat(xs) => { Hir::concat(xs.into_iter().map(crlfify).collect()) @@ -174,12 +168,12 @@ mod tests { #[test] fn various() { assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))"); - assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))"); + assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$)\r??(?m:$))"); assert_eq!( roundtrip(r"(?m)(?:foo$|bar$)"), - "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))" + "(?:(?:(?:foo)\r??(?m:$))|(?:(?:bar)\r??(?m:$)))" ); - assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a"); + assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$)a)"); // Not a multiline `$`, so no crlfifying occurs. assert_eq!(roundtrip(r"$"), "\\z"); diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs index db5ed3b..8058d61 100644 --- a/crates/regex/src/literal.rs +++ b/crates/regex/src/literal.rs @@ -1,43 +1,12 @@ -/* -This module is responsible for extracting *inner* literals out of the AST of a -regular expression. Normally this is the job of the regex engine itself, but -the regex engine doesn't look for inner literals. Since we're doing line based -searching, we can use them, so we need to do it ourselves. -*/ +use regex_syntax::hir::Hir; -use bstr::ByteSlice; -use regex_syntax::hir::literal::{Literal, Literals}; -use regex_syntax::hir::{self, Hir, HirKind}; - -use crate::util; - -/// Represents prefix, suffix and inner "required" literals for a regular -/// expression. -/// -/// Prefixes and suffixes are detected using regex-syntax. The inner required -/// literals are detected using something custom (but based on the code in -/// regex-syntax). #[derive(Clone, Debug)] -pub struct LiteralSets { - /// A set of prefix literals. - prefixes: Literals, - /// A set of suffix literals. - suffixes: Literals, - /// A set of literals such that at least one of them must appear in every - /// match. A literal in this set may be neither a prefix nor a suffix. - required: Literals, -} +pub struct LiteralSets {} impl LiteralSets { /// Create a set of literals from the given HIR expression. - pub fn new(expr: &Hir) -> LiteralSets { - let mut required = Literals::empty(); - union_required(expr, &mut required); - LiteralSets { - prefixes: Literals::prefixes(expr), - suffixes: Literals::suffixes(expr), - required, - } + pub fn new(_: &Hir) -> LiteralSets { + LiteralSets {} } /// If it is deemed advantageuous to do so (via various suspicious @@ -46,383 +15,7 @@ impl LiteralSets { /// generated these literal sets. The idea here is that the pattern /// returned by this method is much cheaper to search for. i.e., It is /// usually a single literal or an alternation of literals. - pub fn one_regex(&self, word: bool) -> Option { - // TODO: The logic in this function is basically inscrutable. It grew - // organically in the old grep 0.1 crate. Ideally, it would be - // re-worked. In fact, the entire inner literal extraction should be - // re-worked. Actually, most of regex-syntax's literal extraction - // should also be re-worked. Alas... only so much time in the day. - - if !word { - if self.prefixes.all_complete() && !self.prefixes.is_empty() { - log::debug!("literal prefixes detected: {:?}", self.prefixes); - // When this is true, the regex engine will do a literal scan, - // so we don't need to return anything. But we only do this - // if we aren't doing a word regex, since a word regex adds - // a `(?:\W|^)` to the beginning of the regex, thereby - // defeating the regex engine's literal detection. - return None; - } - } - - // Out of inner required literals, prefixes and suffixes, which one - // is the longest? We pick the longest to do fast literal scan under - // the assumption that a longer literal will have a lower false - // positive rate. - let pre_lcp = self.prefixes.longest_common_prefix(); - let pre_lcs = self.prefixes.longest_common_suffix(); - let suf_lcp = self.suffixes.longest_common_prefix(); - let suf_lcs = self.suffixes.longest_common_suffix(); - - let req_lits = self.required.literals(); - let req = match req_lits.iter().max_by_key(|lit| lit.len()) { - None => &[], - Some(req) => &***req, - }; - - let mut lit = pre_lcp; - if pre_lcs.len() > lit.len() { - lit = pre_lcs; - } - if suf_lcp.len() > lit.len() { - lit = suf_lcp; - } - if suf_lcs.len() > lit.len() { - lit = suf_lcs; - } - if req_lits.len() == 1 && req.len() > lit.len() { - lit = req; - } - - // Special case: if we detected an alternation of inner required - // literals and its longest literal is bigger than the longest - // prefix/suffix, then choose the alternation. In practice, this - // helps with case insensitive matching, which can generate lots of - // inner required literals. - let any_empty = req_lits.iter().any(|lit| lit.is_empty()); - let any_white = has_only_whitespace(&req_lits); - if req.len() > lit.len() - && req_lits.len() > 1 - && !any_empty - && !any_white - { - log::debug!("required literals found: {:?}", req_lits); - let alts: Vec = req_lits - .into_iter() - .map(|x| util::bytes_to_regex(x)) - .collect(); - // We're matching raw bytes, so disable Unicode mode. - Some(format!("(?-u:{})", alts.join("|"))) - } else if lit.is_empty() { - // If we're here, then we have no LCP. No LCS. And no detected - // inner required literals. In theory this shouldn't happen, but - // the inner literal detector isn't as nice as we hope and doesn't - // actually support returning a set of alternating required - // literals. (Instead, it only returns a set where EVERY literal - // in it is required. It cannot currently express "either P or Q - // is required.") - // - // In this case, it is possible that we still have meaningful - // prefixes or suffixes to use. So we look for the set of literals - // with the highest minimum length and use that to build our "fast" - // regex. - // - // This manifests in fairly common scenarios. e.g., - // - // rg -w 'foo|bar|baz|quux' - // - // Normally, without the `-w`, the regex engine itself would - // detect the prefix correctly. Unfortunately, the `-w` option - // turns the regex into something like this: - // - // rg '(^|\W)(foo|bar|baz|quux)($|\W)' - // - // Which will defeat all prefix and suffix literal optimizations. - // (Not in theory---it could be better. But the current - // implementation isn't good enough.) ... So we make up for it - // here. - if !word { - return None; - } - let p_min_len = self.prefixes.min_len(); - let s_min_len = self.suffixes.min_len(); - let lits = match (p_min_len, s_min_len) { - (None, None) => return None, - (Some(_), None) => { - log::debug!("prefix literals found"); - self.prefixes.literals() - } - (None, Some(_)) => { - log::debug!("suffix literals found"); - self.suffixes.literals() - } - (Some(p), Some(s)) => { - if p >= s { - log::debug!("prefix literals found"); - self.prefixes.literals() - } else { - log::debug!("suffix literals found"); - self.suffixes.literals() - } - } - }; - - log::debug!("prefix/suffix literals found: {:?}", lits); - if has_only_whitespace(lits) { - log::debug!("dropping literals because one was whitespace"); - return None; - } - let alts: Vec = - lits.into_iter().map(|x| util::bytes_to_regex(x)).collect(); - // We're matching raw bytes, so disable Unicode mode. - Some(format!("(?-u:{})", alts.join("|"))) - } else { - log::debug!("required literal found: {:?}", util::show_bytes(lit)); - if lit.chars().all(|c| c.is_whitespace()) { - log::debug!("dropping literal because one was whitespace"); - return None; - } - Some(format!("(?-u:{})", util::bytes_to_regex(&lit))) - } - } -} - -fn union_required(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0u8; 4]; - lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. }) => { - union_required(&**hir, lits); - } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => lits.cut(), - hir::RepetitionKind::ZeroOrMore => lits.cut(), - hir::RepetitionKind::OneOrMore => { - union_required(&x.hir, lits); - } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, - min, - max, - x.greedy, - lits, - union_required, - ); - } - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => { - union_required(&es[0], lits) - } - HirKind::Concat(ref es) => { - for e in es { - let mut lits2 = lits.to_empty(); - union_required(e, &mut lits2); - if lits2.is_empty() { - lits.cut(); - continue; - } - if lits2.contains_empty() || !is_simple(&e) { - lits.cut(); - } - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. Since we're - // short-circuiting, we also need to freeze every member. - lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, union_required); - } - _ => lits.cut(), - } -} - -fn repeat_range_literals( - e: &Hir, - min: u32, - _max: Option, - _greedy: bool, - lits: &mut Literals, - mut f: F, -) { - if min == 0 { - // This is a bit conservative. If `max` is set, then we could - // treat this as a finite set of alternations. For now, we - // just treat it as `e*`. - lits.cut(); - } else { - // We only extract literals from a single repetition, even though - // we could do more. e.g., `a{3}` will have `a` extracted instead of - // `aaa`. The reason is that inner literal extraction can't be unioned - // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}` - // is wrong. - f(e, lits); - lits.cut(); - } -} - -fn alternate_literals( - es: &[Hir], - lits: &mut Literals, - mut f: F, -) { - let mut lits2 = lits.to_empty(); - for e in es { - let mut lits3 = lits.to_empty(); - lits3.set_limit_size(lits.limit_size() / 5); - f(e, &mut lits3); - if lits3.is_empty() || !lits2.union(lits3) { - // If we couldn't find suffixes for *any* of the - // alternates, then the entire alternation has to be thrown - // away and any existing members must be frozen. Similarly, - // if the union couldn't complete, stop and freeze. - lits.cut(); - return; - } - } - // All we do at the moment is look for prefixes and suffixes. If both - // are empty, then we report nothing. We should be able to do better than - // this, but we'll need something more expressive than just a "set of - // literals." - let lcp = lits2.longest_common_prefix(); - let lcs = lits2.longest_common_suffix(); - if !lcp.is_empty() { - lits.cross_add(lcp); - } - lits.cut(); - if !lcs.is_empty() { - lits.add(Literal::empty()); - lits.add(Literal::new(lcs.to_vec())); - } -} - -fn is_simple(expr: &Hir) -> bool { - match *expr.kind() { - HirKind::Empty - | HirKind::Literal(_) - | HirKind::Class(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => true, - HirKind::Anchor(_) - | HirKind::WordBoundary(_) - | HirKind::Group(_) - | HirKind::Repetition(_) => false, - } -} - -/// Return the number of characters in the given class. -fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 { - cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() -} - -/// Return the number of bytes in the given class. -fn count_byte_class(cls: &hir::ClassBytes) -> u32 { - cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() -} - -/// Returns true if and only if any of the literals in the given set is -/// entirely whitespace. -fn has_only_whitespace(lits: &[Literal]) -> bool { - for lit in lits { - if lit.chars().all(|c| c.is_whitespace()) { - return true; - } - } - false -} - -#[cfg(test)] -mod tests { - use super::LiteralSets; - use regex_syntax::Parser; - - fn sets(pattern: &str) -> LiteralSets { - let hir = Parser::new().parse(pattern).unwrap(); - LiteralSets::new(&hir) - } - - fn one_regex(pattern: &str) -> Option { - sets(pattern).one_regex(false) - } - - // Put a pattern into the same format as the one returned by `one_regex`. - fn pat(pattern: &str) -> Option { - Some(format!("(?-u:{})", pattern)) - } - - #[test] - fn various() { - // Obviously no literals. - assert!(one_regex(r"\w").is_none()); - assert!(one_regex(r"\pL").is_none()); - - // Tantalizingly close. - assert!(one_regex(r"\w|foo").is_none()); - - // There's a literal, but it's better if the regex engine handles it - // internally. - assert!(one_regex(r"abc").is_none()); - - // Core use cases. - assert_eq!(one_regex(r"\wabc\w"), pat("abc")); - assert_eq!(one_regex(r"abc\w"), pat("abc")); - - // TODO: Make these pass. We're missing some potentially big wins - // without these. - // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz")); - // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz")); - } - - #[test] - fn regression_1064() { - // Regression from: - // https://github.com/BurntSushi/ripgrep/issues/1064 - // assert_eq!(one_regex(r"a.*c"), pat("a")); - assert_eq!(one_regex(r"a(.*c)"), pat("a")); - } - - #[test] - fn regression_1319() { - // Regression from: - // https://github.com/BurntSushi/ripgrep/issues/1319 - assert_eq!( - one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"), - pat("TTGAGTCCAGGAG"), - ); - } - - #[test] - fn regression_1537() { - // Regression from: - // https://github.com/BurntSushi/ripgrep/issues/1537 - assert_eq!(one_regex(r";(.*,)"), pat(";")); - assert_eq!(one_regex(r";((.*,))"), pat(";")); - assert_eq!(one_regex(r";(.*,)+"), pat(";"),); - assert_eq!(one_regex(r";(.*,){1}"), pat(";"),); + pub fn one_regex(&self, _word: bool) -> Option { + None } } diff --git a/crates/regex/src/literalold.rs b/crates/regex/src/literalold.rs new file mode 100644 index 0000000..df73a45 --- /dev/null +++ b/crates/regex/src/literalold.rs @@ -0,0 +1,466 @@ +/* +This module is responsible for extracting *inner* literals out of the AST of a +regular expression. Normally this is the job of the regex engine itself, but +the regex engine doesn't look for inner literals. Since we're doing line based +searching, we can use them, so we need to do it ourselves. +*/ + +use { + bstr::ByteSlice, + regex_syntax::hir::{ + self, + literal::{self, Literal, Seq}, + Hir, HirKind, + }, +}; + +use crate::util; + +/// Represents prefix, suffix and inner "required" literals for a regular +/// expression. +/// +/// Prefixes and suffixes are detected using regex-syntax. The inner required +/// literals are detected using something custom (but based on the code in +/// regex-syntax). +#[derive(Clone, Debug)] +pub struct LiteralSets { + /// A set of prefix literals. + prefixes: Seq, + /// A set of suffix literals. + suffixes: Seq, + /// A set of literals such that at least one of them must appear in every + /// match. A literal in this set may be neither a prefix nor a suffix. + required: Seq, +} + +impl LiteralSets { + /// Create a set of literals from the given HIR expression. + pub fn new(expr: &Hir) -> LiteralSets { + let mut required = Seq::singleton(Literal::exact(vec![])); + union_required(expr, &mut required); + LiteralSets { + prefixes: prefixes(expr), + suffixes: suffixes(expr), + required, + } + } + + /// If it is deemed advantageuous to do so (via various suspicious + /// heuristics), this will return a single regular expression pattern that + /// matches a subset of the language matched by the regular expression that + /// generated these literal sets. The idea here is that the pattern + /// returned by this method is much cheaper to search for. i.e., It is + /// usually a single literal or an alternation of literals. + pub fn one_regex(&self, word: bool) -> Option { + // TODO: The logic in this function is basically inscrutable. It grew + // organically in the old grep 0.1 crate. Ideally, it would be + // re-worked. In fact, the entire inner literal extraction should be + // re-worked. Actually, most of regex-syntax's literal extraction + // should also be re-worked. Alas... only so much time in the day. + + if !word { + if self.prefixes.is_exact() && !self.prefixes.is_empty() { + log::debug!("literal prefixes detected: {:?}", self.prefixes); + // When this is true, the regex engine will do a literal scan, + // so we don't need to return anything. But we only do this + // if we aren't doing a word regex, since a word regex adds + // a `(?:\W|^)` to the beginning of the regex, thereby + // defeating the regex engine's literal detection. + return None; + } + } + + // Out of inner required literals, prefixes and suffixes, which one + // is the longest? We pick the longest to do fast literal scan under + // the assumption that a longer literal will have a lower false + // positive rate. + let pre_lcp = self.prefixes.longest_common_prefix().unwrap_or(&[]); + let pre_lcs = self.prefixes.longest_common_suffix().unwrap_or(&[]); + let suf_lcp = self.suffixes.longest_common_prefix().unwrap_or(&[]); + let suf_lcs = self.suffixes.longest_common_suffix().unwrap_or(&[]); + + let req_lits = self.required.literals().unwrap_or(&[]); + let req = match req_lits.iter().max_by_key(|lit| lit.len()) { + None => &[], + Some(req) => req.as_bytes(), + }; + + let mut lit = pre_lcp; + if pre_lcs.len() > lit.len() { + lit = pre_lcs; + } + if suf_lcp.len() > lit.len() { + lit = suf_lcp; + } + if suf_lcs.len() > lit.len() { + lit = suf_lcs; + } + if req_lits.len() == 1 && req.len() > lit.len() { + lit = req; + } + + // Special case: if we detected an alternation of inner required + // literals and its longest literal is bigger than the longest + // prefix/suffix, then choose the alternation. In practice, this + // helps with case insensitive matching, which can generate lots of + // inner required literals. + let any_empty = req_lits.iter().any(|lit| lit.is_empty()); + let any_white = has_only_whitespace(&req_lits); + if req.len() > lit.len() + && req_lits.len() > 1 + && !any_empty + && !any_white + { + log::debug!("required literals found: {:?}", req_lits); + let alts: Vec = req_lits + .into_iter() + .map(|x| util::bytes_to_regex(x.as_bytes())) + .collect(); + // We're matching raw bytes, so disable Unicode mode. + Some(format!("(?-u:{})", alts.join("|"))) + } else if lit.is_empty() { + // If we're here, then we have no LCP. No LCS. And no detected + // inner required literals. In theory this shouldn't happen, but + // the inner literal detector isn't as nice as we hope and doesn't + // actually support returning a set of alternating required + // literals. (Instead, it only returns a set where EVERY literal + // in it is required. It cannot currently express "either P or Q + // is required.") + // + // In this case, it is possible that we still have meaningful + // prefixes or suffixes to use. So we look for the set of literals + // with the highest minimum length and use that to build our "fast" + // regex. + // + // This manifests in fairly common scenarios. e.g., + // + // rg -w 'foo|bar|baz|quux' + // + // Normally, without the `-w`, the regex engine itself would + // detect the prefix correctly. Unfortunately, the `-w` option + // turns the regex into something like this: + // + // rg '(^|\W)(foo|bar|baz|quux)($|\W)' + // + // Which will defeat all prefix and suffix literal optimizations. + // (Not in theory---it could be better. But the current + // implementation isn't good enough.) ... So we make up for it + // here. + if !word { + return None; + } + let p_min_len = self.prefixes.min_literal_len(); + let s_min_len = self.suffixes.min_literal_len(); + let lits = match (p_min_len, s_min_len) { + (None, None) => return None, + (Some(_), None) => { + log::debug!("prefix literals found"); + self.prefixes.literals().unwrap() + } + (None, Some(_)) => { + log::debug!("suffix literals found"); + self.suffixes.literals().unwrap() + } + (Some(p), Some(s)) => { + if p >= s { + log::debug!("prefix literals found"); + self.prefixes.literals().unwrap() + } else { + log::debug!("suffix literals found"); + self.suffixes.literals().unwrap() + } + } + }; + + log::debug!("prefix/suffix literals found: {:?}", lits); + if has_only_whitespace(lits) { + log::debug!("dropping literals because one was whitespace"); + return None; + } + let alts: Vec = lits + .into_iter() + .map(|x| util::bytes_to_regex(x.as_bytes())) + .collect(); + // We're matching raw bytes, so disable Unicode mode. + Some(format!("(?-u:{})", alts.join("|"))) + } else { + log::debug!("required literal found: {:?}", util::show_bytes(lit)); + if lit.chars().all(|c| c.is_whitespace()) { + log::debug!("dropping literal because one was whitespace"); + return None; + } + Some(format!("(?-u:{})", util::bytes_to_regex(&lit))) + } + } +} + +fn union_required(expr: &Hir, lits: &mut Seq) { + match *expr.kind() { + HirKind::Literal(hir::Literal(ref bytes)) => { + lits.cross_forward(&mut Seq::new([bytes])); + } + HirKind::Class(hir::Class::Unicode(_)) => { + lits.make_inexact(); + } + HirKind::Class(hir::Class::Bytes(_)) => { + lits.make_inexact(); + } + HirKind::Capture(hir::Capture { ref sub, .. }) => { + union_required(&**sub, lits); + } + HirKind::Repetition(hir::Repetition { min, max, greedy, ref sub }) => { + repeat_range_literals( + &sub, + min, + max, + greedy, + lits, + union_required, + ); + } + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => { + union_required(&es[0], lits) + } + HirKind::Concat(ref es) => { + for e in es { + let mut lits2 = Seq::singleton(Literal::exact(vec![])); + union_required(e, &mut lits2); + if lits2.len() == Some(1) && lits2.min_literal_len() == Some(0) + { + lits.make_inexact(); + continue; + } + if lits2.min_literal_len() == Some(0) || !is_simple(&e) { + lits.make_inexact(); + } + lits.cross_forward(&mut lits2); + if lits2.is_inexact() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.make_inexact(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, union_required); + } + _ => lits.make_inexact(), + } +} + +fn repeat_range_literals( + e: &Hir, + min: u32, + _max: Option, + _greedy: bool, + lits: &mut Seq, + mut f: F, +) { + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. + lits.make_inexact(); + } else { + // We only extract literals from a single repetition, even though + // we could do more. e.g., `a{3}` will have `a` extracted instead of + // `aaa`. The reason is that inner literal extraction can't be unioned + // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}` + // is wrong. + f(e, lits); + lits.make_inexact(); + } +} + +fn alternate_literals( + es: &[Hir], + lits: &mut Seq, + mut f: F, +) { + let mut lits2 = Seq::empty(); + for e in es { + let mut lits3 = Seq::empty(); + // FIXME + // lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() { + lits.make_inexact(); + return; + } + lits2.union(&mut lits3); + } + // All we do at the moment is look for prefixes and suffixes. If both + // are empty, then we report nothing. We should be able to do better than + // this, but we'll need something more expressive than just a "set of + // literals." + if let Some(lcp) = lits2.longest_common_prefix() { + lits.cross_forward(&mut Seq::new([lcp])); + } + lits.make_inexact(); + if let Some(lcs) = lits2.longest_common_suffix() { + lits.push(Literal::exact([])); + lits.push(Literal::exact(lcs)); + } + /* + let lcp = lits2.longest_common_prefix(); + let lcs = lits2.longest_common_suffix(); + if !lcp.is_empty() { + lits.cross_forward(lcp); + } + lits.make_inexact(); + if !lcs.is_empty() { + lits.push(Literal::exact([])); + lits.push(Literal::exact(lcs)); + } + */ +} + +fn is_simple(expr: &Hir) -> bool { + match *expr.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => true, + HirKind::Look(_) | HirKind::Capture(_) | HirKind::Repetition(_) => { + false + } + } +} + +/* +/// Return the number of characters in the given class. +fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + +/// Return the number of bytes in the given class. +fn count_byte_class(cls: &hir::ClassBytes) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} +*/ + +/// Returns true if and only if any of the literals in the given set is +/// entirely whitespace. +fn has_only_whitespace(lits: &[Literal]) -> bool { + for lit in lits { + if lit.as_bytes().chars().all(|c| c.is_whitespace()) { + return true; + } + } + false +} + +fn prefixes(hir: &Hir) -> Seq { + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Prefix); + let mut prefixes = extractor.extract(hir); + log::debug!( + "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + prefixes.optimize_for_prefix_by_preference(); + log::debug!( + "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + prefixes +} + +fn suffixes(hir: &Hir) -> Seq { + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Suffix); + let mut suffixes = extractor.extract(hir); + log::debug!( + "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + suffixes.optimize_for_suffix_by_preference(); + log::debug!( + "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + suffixes +} + +#[cfg(test)] +mod tests { + use super::LiteralSets; + use regex_syntax::Parser; + + fn sets(pattern: &str) -> LiteralSets { + let hir = Parser::new().parse(pattern).unwrap(); + LiteralSets::new(&hir) + } + + fn one_regex(pattern: &str) -> Option { + sets(pattern).one_regex(false) + } + + // Put a pattern into the same format as the one returned by `one_regex`. + fn pat(pattern: &str) -> Option { + Some(format!("(?-u:{})", pattern)) + } + + #[test] + fn various() { + // Obviously no literals. + assert!(one_regex(r"\w").is_none()); + assert!(one_regex(r"\pL").is_none()); + + // Tantalizingly close. + assert!(one_regex(r"\w|foo").is_none()); + + // There's a literal, but it's better if the regex engine handles it + // internally. + assert!(one_regex(r"abc").is_none()); + + // Core use cases. + assert_eq!(one_regex(r"\wabc\w"), pat("abc")); + assert_eq!(one_regex(r"abc\w"), pat("abc")); + + // TODO: Make these pass. We're missing some potentially big wins + // without these. + // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz")); + // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz")); + } + + #[test] + fn regression_1064() { + // Regression from: + // https://github.com/BurntSushi/ripgrep/issues/1064 + // assert_eq!(one_regex(r"a.*c"), pat("a")); + assert_eq!(one_regex(r"a(.*c)"), pat("a")); + } + + #[test] + fn regression_1319() { + // Regression from: + // https://github.com/BurntSushi/ripgrep/issues/1319 + assert_eq!( + one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"), + pat("TTGAGTCCAGGAG"), + ); + } + + #[test] + fn regression_1537() { + // Regression from: + // https://github.com/BurntSushi/ripgrep/issues/1537 + assert_eq!(one_regex(r";(.*,)"), pat(";")); + assert_eq!(one_regex(r";((.*,))"), pat(";")); + assert_eq!(one_regex(r";(.*,)+"), pat(";"),); + assert_eq!(one_regex(r";(.*,){1}"), pat(";"),); + } +} diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 725aae3..350cf0c 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -1036,7 +1036,9 @@ mod tests { } // Test that finding candidate lines works as expected. + // FIXME: Re-enable this test once inner literal extraction works. #[test] + #[ignore] fn candidate_lines() { fn is_confirmed(m: LineMatchKind) -> bool { match m { diff --git a/crates/regex/src/multi.rs b/crates/regex/src/multi.rs index d2d4af9..9d2b613 100644 --- a/crates/regex/src/multi.rs +++ b/crates/regex/src/multi.rs @@ -1,6 +1,6 @@ -use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use aho_corasick::{AhoCorasick, MatchKind}; use grep_matcher::{Match, Matcher, NoError}; -use regex_syntax::hir::Hir; +use regex_syntax::hir::{Hir, HirKind}; use crate::error::Error; use crate::matcher::RegexCaptures; @@ -23,10 +23,9 @@ impl MultiLiteralMatcher { pub fn new>( literals: &[B], ) -> Result { - let ac = AhoCorasickBuilder::new() + let ac = AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) - .auto_configure(literals) - .build_with_size::(literals) + .build(literals) .map_err(Error::regex)?; Ok(MultiLiteralMatcher { ac }) } @@ -79,13 +78,11 @@ impl Matcher for MultiLiteralMatcher { /// Alternation literals checks if the given HIR is a simple alternation of /// literals, and if so, returns them. Otherwise, this returns None. pub fn alternation_literals(expr: &Hir) -> Option>> { - use regex_syntax::hir::{HirKind, Literal}; - // This is pretty hacky, but basically, if `is_alternation_literal` is // true, then we can make several assumptions about the structure of our // HIR. This is what justifies the `unreachable!` statements below. - if !expr.is_alternation_literal() { + if !expr.properties().is_alternation_literal() { return None; } let alts = match *expr.kind() { @@ -93,26 +90,16 @@ pub fn alternation_literals(expr: &Hir) -> Option>> { _ => return None, // one literal isn't worth it }; - let extendlit = |lit: &Literal, dst: &mut Vec| match *lit { - Literal::Unicode(c) => { - let mut buf = [0; 4]; - dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); - } - Literal::Byte(b) => { - dst.push(b); - } - }; - let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { HirKind::Empty => {} - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(ref x) => lit.extend_from_slice(&x.0), HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(ref x) => lit.extend_from_slice(&x.0), _ => unreachable!("expected literal, got {:?}", e), } } diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs index 7f68e84..eb89082 100644 --- a/crates/regex/src/non_matching.rs +++ b/crates/regex/src/non_matching.rs @@ -1,6 +1,10 @@ -use grep_matcher::ByteSet; -use regex_syntax::hir::{self, Hir, HirKind}; -use regex_syntax::utf8::Utf8Sequences; +use { + grep_matcher::ByteSet, + regex_syntax::{ + hir::{self, Hir, HirKind, Look}, + utf8::Utf8Sequences, + }, +}; /// Return a confirmed set of non-matching bytes from the given expression. pub fn non_matching_bytes(expr: &Hir) -> ByteSet { @@ -13,18 +17,28 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet { /// the given expression. fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) { match *expr.kind() { - HirKind::Empty | HirKind::WordBoundary(_) => {} - HirKind::Anchor(_) => { + HirKind::Empty + // | HirKind::Look(Look::Start | Look::End) + | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate) + | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {} + HirKind::Look(Look::Start | Look::End) => { + // FIXME: This is wrong, but not doing this leads to incorrect + // results because of how anchored searches are implemented in + // the 'grep-searcher' crate. set.remove(b'\n'); } - HirKind::Literal(hir::Literal::Unicode(c)) => { - for &b in c.encode_utf8(&mut [0; 4]).as_bytes() { + HirKind::Look(Look::StartLF | Look::EndLF) => { + set.remove(b'\n'); + } + HirKind::Look(Look::StartCRLF | Look::EndCRLF) => { + set.remove(b'\r'); + set.remove(b'\n'); + } + HirKind::Literal(hir::Literal(ref lit)) => { + for &b in lit.iter() { set.remove(b); } } - HirKind::Literal(hir::Literal::Byte(b)) => { - set.remove(b); - } HirKind::Class(hir::Class::Unicode(ref cls)) => { for range in cls.iter() { // This is presumably faster than encoding every codepoint @@ -42,10 +56,10 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) { } } HirKind::Repetition(ref x) => { - remove_matching_bytes(&x.hir, set); + remove_matching_bytes(&x.sub, set); } - HirKind::Group(ref x) => { - remove_matching_bytes(&x.hir, set); + HirKind::Capture(ref x) => { + remove_matching_bytes(&x.sub, set); } HirKind::Concat(ref xs) => { for x in xs { @@ -62,17 +76,13 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) { #[cfg(test)] mod tests { - use grep_matcher::ByteSet; - use regex_syntax::ParserBuilder; + use {grep_matcher::ByteSet, regex_syntax::ParserBuilder}; use super::non_matching_bytes; fn extract(pattern: &str) -> ByteSet { - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(pattern) - .unwrap(); + let expr = + ParserBuilder::new().utf8(false).build().parse(pattern).unwrap(); non_matching_bytes(&expr) } @@ -131,9 +141,13 @@ mod tests { #[test] fn anchor() { + // FIXME: The first four tests below should correspond to a full set + // of bytes for the non-matching bytes I think. assert_eq!(sparse(&extract(r"^")), sparse_except(&[b'\n'])); assert_eq!(sparse(&extract(r"$")), sparse_except(&[b'\n'])); assert_eq!(sparse(&extract(r"\A")), sparse_except(&[b'\n'])); assert_eq!(sparse(&extract(r"\z")), sparse_except(&[b'\n'])); + assert_eq!(sparse(&extract(r"(?m)^")), sparse_except(&[b'\n'])); + assert_eq!(sparse(&extract(r"(?m)$")), sparse_except(&[b'\n'])); } } diff --git a/crates/regex/src/strip.rs b/crates/regex/src/strip.rs index f529f47..39bd131 100644 --- a/crates/regex/src/strip.rs +++ b/crates/regex/src/strip.rs @@ -42,17 +42,11 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result { Ok(match expr.into_kind() { HirKind::Empty => Hir::empty(), - HirKind::Literal(hir::Literal::Unicode(c)) => { - if c == chr { + HirKind::Literal(hir::Literal(lit)) => { + if lit.iter().find(|&&b| b == byte).is_some() { return invalid(); } - Hir::literal(hir::Literal::Unicode(c)) - } - HirKind::Literal(hir::Literal::Byte(b)) => { - if b as char == chr { - return invalid(); - } - Hir::literal(hir::Literal::Byte(b)) + Hir::literal(lit) } HirKind::Class(hir::Class::Unicode(mut cls)) => { let remove = hir::ClassUnicode::new(Some( @@ -74,15 +68,14 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result { } Hir::class(hir::Class::Bytes(cls)) } - HirKind::Anchor(x) => Hir::anchor(x), - HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Look(x) => Hir::look(x), HirKind::Repetition(mut x) => { - x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?); + x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?); Hir::repetition(x) } - HirKind::Group(mut x) => { - x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?); - Hir::group(x) + HirKind::Capture(mut x) => { + x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?); + Hir::capture(x) } HirKind::Concat(xs) => { let xs = xs @@ -131,11 +124,11 @@ mod tests { #[test] fn various() { - assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]"); - assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]"); - assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]"); - assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]"); - assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]"); + assert_eq!(roundtrip(r"[a\n]", b'\n'), "a"); + assert_eq!(roundtrip(r"[a\n]", b'a'), "\n"); + assert_eq!(roundtrip_crlf(r"[a\n]"), "a"); + assert_eq!(roundtrip_crlf(r"[a\r]"), "a"); + assert_eq!(roundtrip_crlf(r"[a\r\n]"), "a"); assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])"); assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])"); diff --git a/crates/regex/src/util.rs b/crates/regex/src/util.rs index 71b4ad7..aef4f13 100644 --- a/crates/regex/src/util.rs +++ b/crates/regex/src/util.rs @@ -1,5 +1,6 @@ /// Converts an arbitrary sequence of bytes to a literal suitable for building /// a regular expression. +#[allow(dead_code)] pub fn bytes_to_regex(bs: &[u8]) -> String { use regex_syntax::is_meta_character; use std::fmt::Write;