grep: upgrade to regex-syntax 0.5

This update brings with it many bug fixes: * Better error messages are printed overall. We also include explicit call out for unsupported features like backreferences and look-around. * Regexes like `\s*{` no longer emit incomprehensible errors. * Unicode escape sequences, such as `\u{..}` are now supported. For the most part, this upgrade was done in a straight-forward way. We resist the urge to refactor the `grep` crate, in anticipation of it being rewritten anyway. Note that we removed the `--fixed-strings` suggestion whenever a regex syntax error occurs. In practice, I've found that it results in a lot of false positives, and I believe that its use is not as paramount now that regex parse errors are much more readable. Closes #268, Closes #395, Closes #702, Closes #853
2018-03-13 20:38:50 -04:00
parent c2e97cd858
commit cd08707c7c
9 changed files with 152 additions and 159 deletions
--- a/grep/src/literals.rs
+++ b/grep/src/literals.rs
@@ -10,10 +10,8 @@ principled.
 use std::cmp;

 use regex::bytes::RegexBuilder;
-use syntax::{
-    Expr, Literals, Lit,
-    ByteClass, ByteRange, CharClass, ClassRange, Repeater,
-};
+use syntax::hir::{self, Hir, HirKind};
+use syntax::hir::literal::{Literal, Literals};

 #[derive(Clone, Debug)]
 pub struct LiteralSets {
@@ -23,12 +21,12 @@ pub struct LiteralSets {
 }

 impl LiteralSets {
-    pub fn create(expr: &Expr) -> Self {
+    pub fn create(expr: &Hir) -> Self {
        let mut required = Literals::empty();
        union_required(expr, &mut required);
        LiteralSets {
-            prefixes: expr.prefixes(),
-            suffixes: expr.suffixes(),
+            prefixes: Literals::prefixes(expr),
+            suffixes: Literals::suffixes(expr),
            required: required,
        }
    }
@@ -93,60 +91,52 @@ impl LiteralSets {
    }
 }

-fn union_required(expr: &Expr, lits: &mut Literals) {
-    use syntax::Expr::*;
-    match *expr {
-        Literal { ref chars, casei: false } => {
-            let s: String = chars.iter().cloned().collect();
-            lits.cross_add(s.as_bytes());
+fn union_required(expr: &Hir, lits: &mut Literals) {
+    match *expr.kind() {
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            let mut buf = [0u8; 4];
+            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
        }
-        Literal { ref chars, casei: true } => {
-            for &c in chars {
-                let cls = CharClass::new(vec![
-                    ClassRange { start: c, end: c },
-                ]).case_fold();
-                if !lits.add_char_class(&cls) {
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            lits.cross_add(&[b]);
+        }
+        HirKind::Class(hir::Class::Unicode(ref cls)) => {
+            if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
+                lits.cut();
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(ref cls)) => {
+            if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
+                lits.cut();
+            }
+        }
+        HirKind::Group(hir::Group { ref hir, .. }) => {
+            union_required(&**hir, lits);
+        }
+        HirKind::Repetition(ref x) => {
+            match x.kind {
+                hir::RepetitionKind::ZeroOrOne => lits.cut(),
+                hir::RepetitionKind::ZeroOrMore => lits.cut(),
+                hir::RepetitionKind::OneOrMore => {
+                    union_required(&x.hir, lits);
                    lits.cut();
-                    return;
+                }
+                hir::RepetitionKind::Range(ref rng) => {
+                    let (min, max) = match *rng {
+                        hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+                        hir::RepetitionRange::AtLeast(m) => (m, None),
+                        hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+                    };
+                    repeat_range_literals(
+                        &x.hir, min, max, x.greedy, lits, union_required);
                }
            }
        }
-        LiteralBytes { ref bytes, casei: false } => {
-            lits.cross_add(bytes);
+        HirKind::Concat(ref es) if es.is_empty() => {}
+        HirKind::Concat(ref es) if es.len() == 1 => {
+            union_required(&es[0], lits)
        }
-        LiteralBytes { ref bytes, casei: true } => {
-            for &b in bytes {
-                let cls = ByteClass::new(vec![
-                    ByteRange { start: b, end: b },
-                ]).case_fold();
-                if !lits.add_byte_class(&cls) {
-                    lits.cut();
-                    return;
-                }
-            }
-        }
-        Class(_) => {
-            lits.cut();
-        }
-        ClassBytes(_) => {
-            lits.cut();
-        }
-        Group { ref e, .. } => {
-            union_required(&**e, lits);
-        }
-        Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(),
-        Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
-        Repeat { ref e, r: Repeater::OneOrMore, .. } => {
-            union_required(&**e, lits);
-            lits.cut();
-        }
-        Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
-            repeat_range_literals(
-                &**e, min, max, greedy, lits, union_required);
-        }
-        Concat(ref es) if es.is_empty() => {}
-        Concat(ref es) if es.len() == 1 => union_required(&es[0], lits),
-        Concat(ref es) => {
+        HirKind::Concat(ref es) => {
            for e in es {
                let mut lits2 = lits.to_empty();
                union_required(e, &mut lits2);
@@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
                if lits2.contains_empty() {
                    lits.cut();
                }
-                // if !lits.union(lits2) {
                if !lits.cross_product(&lits2) {
                    // If this expression couldn't yield any literal that
                    // could be extended, then we need to quit. Since we're
@@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
                }
            }
        }
-        Alternate(ref es) => {
+        HirKind::Alternation(ref es) => {
            alternate_literals(es, lits, union_required);
        }
        _ => lits.cut(),
    }
 }

-fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
-    e: &Expr,
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
    min: u32,
    max: Option<u32>,
    _greedy: bool,
@@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
    }
 }

-fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
-    es: &[Expr],
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+    es: &[Hir],
    lits: &mut Literals,
    mut f: F,
 ) {
@@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
    }
    lits.cut();
    if !lcs.is_empty() {
-        lits.add(Lit::empty());
-        lits.add(Lit::new(lcs.to_vec()));
+        lits.add(Literal::empty());
+        lits.add(Literal::new(lcs.to_vec()));
    }
 }

+/// Return the number of characters in the given class.
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
+/// Return the number of bytes in the given class.
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
 /// Converts an arbitrary sequence of bytes to a literal suitable for building
 /// a regular expression.
 fn bytes_to_regex(bs: &[u8]) -> String {
--- a/grep/src/nonl.rs
+++ b/grep/src/nonl.rs
@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};

 use {Error, Result};

@@ -9,59 +9,66 @@ use {Error, Result};
 ///
 /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
 /// function panics.
-pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
-    // TODO(burntsushi): There is a bug in this routine where only `\n` is
-    // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
-    // to proper character classes instead of the special `AnyCharNoNL` and
-    // `AnyByteNoNL` classes.
-    use syntax::Expr::*;
+pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
    assert!(byte <= 0x7F);
    let chr = byte as char;
    assert!(chr.len_utf8() == 1);

-    Ok(match expr {
-        Literal { chars, casei } => {
-            if chars.iter().position(|&c| c == chr).is_some() {
+    Ok(match expr.into_kind() {
+        HirKind::Empty => Hir::empty(),
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            if c == chr {
                return Err(Error::LiteralNotAllowed(chr));
            }
-            Literal { chars: chars, casei: casei }
+            Hir::literal(hir::Literal::Unicode(c))
        }
-        LiteralBytes { bytes, casei } => {
-            if bytes.iter().position(|&b| b == byte).is_some() {
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            if b as char == chr {
                return Err(Error::LiteralNotAllowed(chr));
            }
-            LiteralBytes { bytes: bytes, casei: casei }
+            Hir::literal(hir::Literal::Byte(b))
        }
-        AnyChar => AnyCharNoNL,
-        AnyByte => AnyByteNoNL,
-        Class(mut cls) => {
-            cls.remove(chr);
-            Class(cls)
-        }
-        ClassBytes(mut cls) => {
-            cls.remove(byte);
-            ClassBytes(cls)
-        }
-        Group { e, i, name } => {
-            Group {
-                e: Box::new(remove(*e, byte)?),
-                i: i,
-                name: name,
+        HirKind::Class(hir::Class::Unicode(mut cls)) => {
+            let remove = hir::ClassUnicode::new(Some(
+                hir::ClassUnicodeRange::new(chr, chr),
+            ));
+            cls.difference(&remove);
+            if cls.iter().next().is_none() {
+                return Err(Error::LiteralNotAllowed(chr));
            }
+            Hir::class(hir::Class::Unicode(cls))
        }
-        Repeat { e, r, greedy } => {
-            Repeat {
-                e: Box::new(remove(*e, byte)?),
-                r: r,
-                greedy: greedy,
+        HirKind::Class(hir::Class::Bytes(mut cls)) => {
+            let remove = hir::ClassBytes::new(Some(
+                hir::ClassBytesRange::new(byte, byte),
+            ));
+            cls.difference(&remove);
+            if cls.iter().next().is_none() {
+                return Err(Error::LiteralNotAllowed(chr));
            }
+            Hir::class(hir::Class::Bytes(cls))
        }
-        Concat(exprs) => {
-            Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
+        HirKind::Anchor(x) => Hir::anchor(x),
+        HirKind::WordBoundary(x) => Hir::word_boundary(x),
+        HirKind::Repetition(mut x) => {
+            x.hir = Box::new(remove(*x.hir, byte)?);
+            Hir::repetition(x)
        }
-        Alternate(exprs) => {
-            Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
+        HirKind::Group(mut x) => {
+            x.hir = Box::new(remove(*x.hir, byte)?);
+            Hir::group(x)
+        }
+        HirKind::Concat(xs) => {
+            let xs = xs.into_iter()
+                .map(|e| remove(e, byte))
+                .collect::<Result<Vec<Hir>>>()?;
+            Hir::concat(xs)
+        }
+        HirKind::Alternation(xs) => {
+            let xs = xs.into_iter()
+                .map(|e| remove(e, byte))
+                .collect::<Result<Vec<Hir>>>()?;
+            Hir::alternation(xs)
        }
-        e => e,
    })
 }
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -1,10 +1,10 @@
 use memchr::{memchr, memrchr};
 use regex::bytes::{Regex, RegexBuilder};
-use syntax;

 use literals::LiteralSets;
 use nonl;
-use syntax::Expr;
+use syntax::ParserBuilder;
+use syntax::hir::Hir;
 use word_boundary::strip_unicode_word_boundaries;
 use Result;

@@ -166,7 +166,7 @@ impl GrepBuilder {

    /// Creates a new regex from the given expression with the current
    /// configuration.
-    fn regex(&self, expr: &Expr) -> Result<Regex> {
+    fn regex(&self, expr: &Hir) -> Result<Regex> {
        let mut builder = RegexBuilder::new(&expr.to_string());
        builder.unicode(true);
        self.regex_build(builder)
@@ -184,15 +184,16 @@ impl GrepBuilder {

    /// Parses the underlying pattern and ensures the pattern can never match
    /// the line terminator.
-    fn parse(&self) -> Result<syntax::Expr> {
-        let expr =
-            syntax::ExprBuilder::new()
-            .allow_bytes(true)
-            .unicode(true)
+    fn parse(&self) -> Result<Hir> {
+        let expr = ParserBuilder::new()
+            .allow_invalid_utf8(true)
            .case_insensitive(self.is_case_insensitive()?)
+            .multi_line(true)
+            .build()
            .parse(&self.pattern)?;
+        debug!("original regex HIR pattern:\n{}", expr);
        let expr = nonl::remove(expr, self.opts.line_terminator)?;
-        debug!("regex ast:\n{:#?}", expr);
+        debug!("transformed regex HIR pattern:\n{}", expr);
        Ok(expr)
    }

--- a/grep/src/word_boundary.rs
+++ b/grep/src/word_boundary.rs
@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};

 /// Strips Unicode word boundaries from the given expression.
 ///
@@ -8,7 +8,7 @@ use syntax::Expr;
 /// false negatives.
 ///
 /// If no word boundaries could be stripped, then None is returned.
-pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
+pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
    // The real reason we do this is because Unicode word boundaries are the
    // one thing that Rust's regex DFA engine can't handle. When it sees a
    // Unicode word boundary among non-ASCII text, it falls back to one of the
@@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
    // a regex to find candidate matches without a Unicode word boundary. We'll
    // only then use the full (and slower) regex to confirm a candidate as a
    // match or not during search.
-    use syntax::Expr::*;
-
-    match *expr {
-        Concat(ref es) if !es.is_empty() => {
+    //
+    // It looks like we only check the outer edges for `\b`? I guess this is
+    // an attempt to optimize for the `-w/--word-regexp` flag? ---AG
+    match *expr.kind() {
+        HirKind::Concat(ref es) if !es.is_empty() => {
            let first = is_unicode_word_boundary(&es[0]);
            let last = is_unicode_word_boundary(es.last().unwrap());
            // Be careful not to strip word boundaries if there are no other
            // expressions to match.
            match (first, last) {
                (true, false) if es.len() > 1 => {
-                    Some(Concat(es[1..].to_vec()))
+                    Some(Hir::concat(es[1..].to_vec()))
                }
                (false, true) if es.len() > 1 => {
-                    Some(Concat(es[..es.len() - 1].to_vec()))
+                    Some(Hir::concat(es[..es.len() - 1].to_vec()))
                }
                (true, true) if es.len() > 2 => {
-                    Some(Concat(es[1..es.len() - 1].to_vec()))
+                    Some(Hir::concat(es[1..es.len() - 1].to_vec()))
                }
                _ => None,
            }
@@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
 }

 /// Returns true if the given expression is a Unicode word boundary.
-fn is_unicode_word_boundary(expr: &Expr) -> bool {
-    use syntax::Expr::*;
-
-    match *expr {
-        WordBoundary => true,
-        NotWordBoundary => true,
-        Group { ref e, .. } => is_unicode_word_boundary(e),
+fn is_unicode_word_boundary(expr: &Hir) -> bool {
+    match *expr.kind() {
+        HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
+        HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
+        HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
        _ => false,
    }
 }