From 0473df1ef5721143941fb7f883e22b17292b35bb Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 28 Nov 2016 18:31:58 -0500 Subject: [PATCH] Disable Unicode mode for literal regex. When ripgrep detects literals, it emits them as raw hex-escaped byte sequences to Regex::new. This permits literal optimizations for arbitrary byte sequences (i.e., possibly invalid UTF-8). The problem is that Regex::new interprets hex-escaped byte sequences as *Unicode codepoints* by default, but we want them to actually stand for their raw byte values. Therefore, disable Unicode mode. This is OK, since the regex is composed entirely of literals and literal extraction does Unicode case folding. Fixes #251 --- grep/src/literals.rs | 4 ++-- grep/src/search.rs | 3 +-- tests/tests.rs | 9 +++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/grep/src/literals.rs b/grep/src/literals.rs index d931f13..3e68d24 100644 --- a/grep/src/literals.rs +++ b/grep/src/literals.rs @@ -79,12 +79,12 @@ impl LiteralSets { debug!("required literals found: {:?}", req_lits); let alts: Vec = req_lits.into_iter().map(|x| bytes_to_regex(x)).collect(); - Some(RegexBuilder::new(&alts.join("|"))) + Some(RegexBuilder::new(&alts.join("|")).unicode(false)) } else if lit.is_empty() { None } else { debug!("required literal found: {:?}", show(lit)); - Some(RegexBuilder::new(&bytes_to_regex(lit))) + Some(RegexBuilder::new(&bytes_to_regex(&lit)).unicode(false)) } } } diff --git a/grep/src/search.rs b/grep/src/search.rs index 850c8d6..cf1a4c3 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -167,14 +167,13 @@ impl GrepBuilder { /// Creates a new regex from the given expression with the current /// configuration. fn regex(&self, expr: &Expr) -> Result { - self.regex_build(RegexBuilder::new(&expr.to_string())) + self.regex_build(RegexBuilder::new(&expr.to_string()).unicode(true)) } /// Builds a new regex from the given builder using the caller's settings. 
fn regex_build(&self, builder: RegexBuilder) -> Result { builder .multi_line(true) - .unicode(true) .size_limit(self.opts.size_limit) .dfa_size_limit(self.opts.dfa_size_limit) .compile() diff --git a/tests/tests.rs b/tests/tests.rs index 5c152b9..876ee40 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -936,6 +936,15 @@ clean!(regression_229, "[E]conomie", ".", |wd: WorkDir, mut cmd: Command| { wd.assert_err(&mut cmd); }); +// See: https://github.com/BurntSushi/ripgrep/issues/251 +clean!(regression_251, "привет", ".", |wd: WorkDir, mut cmd: Command| { + wd.create("foo", "привет\nПривет\nПрИвЕт"); + cmd.arg("-i"); + + let lines: String = wd.stdout(&mut cmd); + assert_eq!(lines, "foo:привет\nfoo:Привет\nfoo:ПрИвЕт\n"); +}); + // See: https://github.com/BurntSushi/ripgrep/issues/7 sherlock!(feature_7, "-fpat", "sherlock", |wd: WorkDir, mut cmd: Command| { wd.create("pat", "Sherlock\nHolmes");