grep: add "perfect" smart case detection

This commit removes the previous smart case detection logic and replaces
it with detection based on the regex AST. This particular AST is a faithful
representation of the concrete syntax, which lets us be very precise in
how we handle it.

Closes #851
This commit is contained in:
Andrew Gallant
2018-03-13 21:43:23 -04:00
parent cd08707c7c
commit 42b8132d0a
4 changed files with 203 additions and 49 deletions

View File

@@ -1,10 +1,11 @@
use memchr::{memchr, memrchr};
use syntax::ParserBuilder;
use syntax::hir::Hir;
use regex::bytes::{Regex, RegexBuilder};
use literals::LiteralSets;
use nonl;
use syntax::ParserBuilder;
use syntax::hir::Hir;
use smart_case::Cased;
use word_boundary::strip_unicode_word_boundaries;
use Result;
@@ -205,7 +206,11 @@ impl GrepBuilder {
if !self.opts.case_smart {
return Ok(false);
}
Ok(!has_uppercase_literal(&self.pattern))
let cased = match Cased::from_pattern(&self.pattern) {
None => return Ok(false),
Some(cased) => cased,
};
Ok(cased.any_literal && !cased.any_uppercase)
}
}
@@ -311,44 +316,15 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
}
}
/// Report whether `pattern` contains an unescaped uppercase character, in
/// which case the smart-case option should not turn on case insensitivity.
///
/// This is a deliberately simple textual scan, not a regex parse: a '\' and
/// the single character immediately after it are both skipped, and every
/// other character is tested with `char::is_uppercase`. That keeps common
/// escapes such as 'foo\w' and 'foo\S' from counting as uppercase, but it
/// only skips one character, so the 'L' in '\p{Ll}' is still seen as an
/// uppercase literal. Invalid patterns are not diagnosed here.
fn has_uppercase_literal(pattern: &str) -> bool {
    // True when the previous character was an unescaped '\', meaning the
    // current character belongs to an escape sequence and must be ignored.
    let mut escaped = false;
    for ch in pattern.chars() {
        if escaped {
            escaped = false;
            continue;
        }
        if ch == '\\' {
            escaped = true;
        } else if ch.is_uppercase() {
            return true;
        }
    }
    false
}
#[cfg(test)]
mod tests {
#![allow(unused_imports)]
use memchr::{memchr, memrchr};
use regex::bytes::Regex;
use super::{GrepBuilder, Match, has_uppercase_literal};
use super::{GrepBuilder, Match};
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
// Debugging aid for tests; it may have no callers at any given time, hence
// the dead_code allowance.
#[allow(dead_code)]
/// Copy `bytes` into an owned `String`, panicking if they are not valid UTF-8.
fn s(bytes: &[u8]) -> String {
    let owned: Vec<u8> = Vec::from(bytes);
    String::from_utf8(owned).unwrap()
}
fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
let re = Regex::new(pat).unwrap();
let mut lines = vec![];
@@ -377,20 +353,4 @@ mod tests {
assert_eq!(expected.len(), got.len());
assert_eq!(expected, got);
}
#[test]
fn pattern_case() {
    // Each entry pairs a pattern with the result expected from
    // `has_uppercase_literal` for it.
    let cases: [(&str, bool); 12] = [
        ("", false),
        ("foo", false),
        ("Foo", true),
        ("foO", true),
        ("foo\\\\", false),
        ("foo\\w", false),
        ("foo\\S", false),
        ("foo\\p{Ll}", true),
        ("foo[a-z]", false),
        ("foo[A-Z]", true),
        ("foo[\\S\\t]", false),
        ("foo\\\\S", true),
    ];
    for &(pattern, expected) in cases.iter() {
        assert_eq!(has_uppercase_literal(pattern), expected);
    }
}
}