diff --git a/grep/Cargo.toml b/grep/Cargo.toml index 18371c9..e01e688 100644 --- a/grep/Cargo.toml +++ b/grep/Cargo.toml @@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" [dependencies] +log = "0.3" memchr = "0.1" memmap = "0.2" regex = "0.1.75" diff --git a/grep/src/lib.rs b/grep/src/lib.rs index d57b8fb..68f95a4 100644 --- a/grep/src/lib.rs +++ b/grep/src/lib.rs @@ -4,6 +4,8 @@ A fast line oriented regex searcher. */ +#[macro_use] +extern crate log; extern crate memchr; extern crate regex; extern crate regex_syntax as syntax; diff --git a/grep/src/literals.rs b/grep/src/literals.rs index 5408fae..f168527 100644 --- a/grep/src/literals.rs +++ b/grep/src/literals.rs @@ -1,13 +1,22 @@ +/*! +The literals module is responsible for extracting *inner* literals out of the +AST of a regular expression. Normally this is the job of the regex engine +itself, but the regex engine doesn't look for inner literals. Since we're doing +line based searching, we can use them, so we need to do it ourselves. + +Note that this implementation is incredibly suspicious. We need something more +principled. +*/ use std::cmp; use std::iter; use regex::bytes::Regex; use syntax::{ Expr, Literals, Lit, - Repeater, + ByteClass, ByteRange, CharClass, ClassRange, Repeater, }; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct LiteralSets { prefixes: Literals, suffixes: Literals, @@ -27,6 +36,7 @@ impl LiteralSets { pub fn to_regex(&self) -> Option { if self.prefixes.all_complete() && !self.prefixes.is_empty() { + debug!("literal prefixes detected: {:?}", self.prefixes); // When this is true, the regex engine will do a literal scan. return None; } @@ -56,13 +66,27 @@ impl LiteralSets { if suf_lcs.len() > lit.len() { lit = suf_lcs; } - if req.len() > lit.len() { + if req_lits.len() == 1 && req.len() > lit.len() { lit = req; } - if lit.is_empty() { + + // Special case: if we detected an alternation of inner required + // literals and its longest literal is bigger than the longest + // prefix/suffix, then choose the alternation. In practice, this + // helps with case insensitive matching, which can generate lots of + // inner required literals. + let any_empty = req_lits.iter().any(|lit| lit.is_empty()); + if req.len() > lit.len() && req_lits.len() > 1 && !any_empty { + debug!("required literals found: {:?}", req_lits); + let alts: Vec = + req_lits.into_iter().map(|x| bytes_to_regex(x)).collect(); + // Literals always compile. + Some(Regex::new(&alts.join("|")).unwrap()) + } else if lit.is_empty() { None } else { // Literals always compile. + debug!("required literal found: {:?}", show(lit)); Some(Regex::new(&bytes_to_regex(lit)).unwrap()) } } @@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) { let s: String = chars.iter().cloned().collect(); lits.cross_add(s.as_bytes()); } - Literal { casei: true, .. } => { - lits.cut(); + Literal { ref chars, casei: true } => { + for &c in chars { + let cls = CharClass::new(vec![ + ClassRange { start: c, end: c }, + ]).case_fold(); + if !lits.add_char_class(&cls) { + lits.cut(); + return; + } + } } LiteralBytes { ref bytes, casei: false } => { lits.cross_add(bytes); } - LiteralBytes { casei: true, .. } => { - lits.cut(); + LiteralBytes { ref bytes, casei: true } => { + for &b in bytes { + let cls = ByteClass::new(vec![ + ByteRange { start: b, end: b }, + ]).case_fold(); + if !lits.add_byte_class(&cls) { + lits.cut(); + return; + } + } } Class(_) => { lits.cut(); @@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String { } s } + +/// Converts arbitrary bytes to a nice string. +fn show(bs: &[u8]) -> String { + // Why aren't we using this to feed to the regex? Doesn't really matter + // I guess. ---AG + use std::ascii::escape_default; + use std::str; + + let mut nice = String::new(); + for &b in bs { + let part: Vec = escape_default(b).collect(); + nice.push_str(str::from_utf8(&part).unwrap()); + } + nice +} diff --git a/grep/src/search.rs b/grep/src/search.rs index 6451cc5..a26fb15 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -152,6 +152,7 @@ impl GrepBuilder { .unicode(true) .case_insensitive(self.opts.case_insensitive) .parse(&self.pattern)); + debug!("regex ast:\n{:#?}", expr); Ok(try!(nonl::remove(expr, self.opts.line_terminator))) } }