diff --git a/grep-printer/src/json.rs b/grep-printer/src/json.rs index 45d6d68..50387bf 100644 --- a/grep-printer/src/json.rs +++ b/grep-printer/src/json.rs @@ -817,7 +817,8 @@ impl<'a> SubMatches<'a> { #[cfg(test)] mod tests { - use grep_regex::RegexMatcher; + use grep_regex::{RegexMatcher, RegexMatcherBuilder}; + use grep_matcher::LineTerminator; use grep_searcher::SearcherBuilder; use super::{JSON, JSONBuilder}; @@ -918,4 +919,45 @@ and exhibited clearly, with a label attached.\ assert_eq!(got.lines().count(), 2); assert!(got.contains("begin") && got.contains("end")); } + + #[test] + fn missing_crlf() { + let haystack = "test\r\n".as_bytes(); + + let matcher = RegexMatcherBuilder::new() + .build("test") + .unwrap(); + let mut printer = JSONBuilder::new() + .build(vec![]); + SearcherBuilder::new() + .build() + .search_reader(&matcher, haystack, printer.sink(&matcher)) + .unwrap(); + let got = printer_contents(&mut printer); + assert_eq!(got.lines().count(), 3); + assert!( + got.lines().nth(1).unwrap().contains(r"test\r\n"), + r"missing 'test\r\n' in '{}'", + got.lines().nth(1).unwrap(), + ); + + let matcher = RegexMatcherBuilder::new() + .crlf(true) + .build("test") + .unwrap(); + let mut printer = JSONBuilder::new() + .build(vec![]); + SearcherBuilder::new() + .line_terminator(LineTerminator::crlf()) + .build() + .search_reader(&matcher, haystack, printer.sink(&matcher)) + .unwrap(); + let got = printer_contents(&mut printer); + assert_eq!(got.lines().count(), 3); + assert!( + got.lines().nth(1).unwrap().contains(r"test\r\n"), + r"missing 'test\r\n' in '{}'", + got.lines().nth(1).unwrap(), + ); + } } diff --git a/grep-regex/Cargo.toml b/grep-regex/Cargo.toml index e99edca..644043f 100644 --- a/grep-regex/Cargo.toml +++ b/grep-regex/Cargo.toml @@ -16,6 +16,6 @@ license = "Unlicense/MIT" log = "0.4.5" grep-matcher = { version = "0.1.1", path = "../grep-matcher" } regex = "1.1" -regex-syntax = "0.6.4" +regex-syntax = "0.6.5" thread_local = "0.3.6" utf8-ranges = "1.0.1" diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs index f3d1f1c..efed9d4 100644 --- a/grep-regex/src/config.rs +++ b/grep-regex/src/config.rs @@ -160,6 +160,14 @@ impl ConfiguredHIR { non_matching_bytes(&self.expr) } + /// Returns true if and only if this regex needs to have its match offsets + /// tweaked because of CRLF support. Specifically, this occurs when the + /// CRLF hack is enabled and the regex is line anchored at the end. In + /// this case, matches that end with a `\r` have the `\r` stripped. + pub fn needs_crlf_stripped(&self) -> bool { + self.config.crlf && self.expr.is_line_anchored_end() + } + /// Builds a regular expression from this HIR expression. pub fn regex(&self) -> Result { self.pattern_to_regex(&self.expr.to_string()) diff --git a/grep-regex/src/crlf.rs b/grep-regex/src/crlf.rs index ff6b15b..838a28f 100644 --- a/grep-regex/src/crlf.rs +++ b/grep-regex/src/crlf.rs @@ -1,5 +1,105 @@ +use std::collections::HashMap; + +use grep_matcher::{Match, Matcher, NoError}; +use regex::bytes::Regex; use regex_syntax::hir::{self, Hir, HirKind}; +use config::ConfiguredHIR; +use error::Error; +use matcher::RegexCaptures; + +/// A matcher for implementing "word match" semantics. +#[derive(Clone, Debug)] +pub struct CRLFMatcher { + /// The regex. + regex: Regex, + /// A map from capture group name to capture group index. + names: HashMap, +} + +impl CRLFMatcher { + /// Create a new matcher from the given pattern that strips `\r` from the + /// end of every match. + /// + /// This panics if the given expression doesn't need its CRLF stripped. + pub fn new(expr: &ConfiguredHIR) -> Result { + assert!(expr.needs_crlf_stripped()); + + let regex = expr.regex()?; + let mut names = HashMap::new(); + for (i, optional_name) in regex.capture_names().enumerate() { + if let Some(name) = optional_name { + names.insert(name.to_string(), i.checked_sub(1).unwrap()); + } + } + Ok(CRLFMatcher { regex, names }) + } +} + +impl Matcher for CRLFMatcher { + type Captures = RegexCaptures; + type Error = NoError; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, NoError> { + let m = match self.regex.find_at(haystack, at) { + None => return Ok(None), + Some(m) => Match::new(m.start(), m.end()), + }; + Ok(Some(adjust_match(haystack, m))) + } + + fn new_captures(&self) -> Result { + Ok(RegexCaptures::new(self.regex.capture_locations())) + } + + fn capture_count(&self) -> usize { + self.regex.captures_len().checked_sub(1).unwrap() + } + + fn capture_index(&self, name: &str) -> Option { + self.names.get(name).map(|i| *i) + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result { + caps.strip_crlf(false); + let r = self.regex.captures_read_at(caps.locations(), haystack, at); + if !r.is_some() { + return Ok(false); + } + + // If the end of our match includes a `\r`, then strip it from all + // capture groups ending at the same location. + let end = caps.locations().get(0).unwrap().1; + if end > 0 && haystack.get(end - 1) == Some(&b'\r') { + caps.strip_crlf(true); + } + Ok(true) + } + + // We specifically do not implement other methods like find_iter or + // captures_iter. Namely, the iter methods are guaranteed to be correct + // by virtue of implementing find_at and captures_at above. +} + +/// If the given match ends with a `\r`, then return a new match that ends +/// immediately before the `\r`. +pub fn adjust_match(haystack: &[u8], m: Match) -> Match { + if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') { + m.with_end(m.end() - 1) + } else { + m + } +} + /// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. /// /// This does not preserve the exact semantics of the given expression, diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs index 831d573..b15d824 100644 --- a/grep-regex/src/matcher.rs +++ b/grep-regex/src/matcher.rs @@ -6,6 +6,7 @@ use grep_matcher::{ use regex::bytes::{CaptureLocations, Regex}; use config::{Config, ConfiguredHIR}; +use crlf::CRLFMatcher; use error::Error; use word::WordMatcher; @@ -344,6 +345,11 @@ impl RegexMatcher { enum RegexMatcherImpl { /// The standard matcher used for all regular expressions. Standard(StandardMatcher), + /// A matcher that strips `\r` from the end of matches. + /// + /// This is only used when the CRLF hack is enabled and the regex is line + /// anchored at the end. + CRLF(CRLFMatcher), /// A matcher that only matches at word boundaries. This transforms the /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. /// Because of this, the WordMatcher provides its own implementation of @@ -358,6 +364,8 @@ impl RegexMatcherImpl { fn new(expr: &ConfiguredHIR) -> Result { if expr.config().word { Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?)) + } else if expr.needs_crlf_stripped() { + Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?)) } else { Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)) } @@ -379,6 +387,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find_at(haystack, at), + CRLF(ref m) => m.find_at(haystack, at), Word(ref m) => m.find_at(haystack, at), } } @@ -387,6 +396,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.new_captures(), + CRLF(ref m) => m.new_captures(), Word(ref m) => m.new_captures(), } } @@ -395,6 +405,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.capture_count(), + CRLF(ref m) => m.capture_count(), Word(ref m) => m.capture_count(), } } @@ -403,6 +414,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.capture_index(name), + CRLF(ref m) => m.capture_index(name), Word(ref m) => m.capture_index(name), } } @@ -411,6 +423,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find(haystack), + CRLF(ref m) => m.find(haystack), Word(ref m) => m.find(haystack), } } @@ -425,6 +438,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find_iter(haystack, matched), + CRLF(ref m) => m.find_iter(haystack, matched), Word(ref m) => m.find_iter(haystack, matched), } } @@ -439,6 +453,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.try_find_iter(haystack, matched), + CRLF(ref m) => m.try_find_iter(haystack, matched), Word(ref m) => m.try_find_iter(haystack, matched), } } @@ -451,6 +466,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures(haystack, caps), + CRLF(ref m) => m.captures(haystack, caps), Word(ref m) => m.captures(haystack, caps), } } @@ -466,6 +482,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures_iter(haystack, caps, matched), + CRLF(ref m) => m.captures_iter(haystack, caps, matched), Word(ref m) => m.captures_iter(haystack, caps, matched), } } @@ -481,6 +498,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.try_captures_iter(haystack, caps, matched), + CRLF(ref m) => m.try_captures_iter(haystack, caps, matched), Word(ref m) => m.try_captures_iter(haystack, caps, matched), } } @@ -494,6 +512,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures_at(haystack, at, caps), + CRLF(ref m) => m.captures_at(haystack, at, caps), Word(ref m) => m.captures_at(haystack, at, caps), } } @@ -509,6 +528,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.replace(haystack, dst, append), + CRLF(ref m) => m.replace(haystack, dst, append), Word(ref m) => m.replace(haystack, dst, append), } } @@ -527,6 +547,9 @@ impl Matcher for RegexMatcher { Standard(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } + CRLF(ref m) => { + m.replace_with_captures(haystack, caps, dst, append) + } Word(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } @@ -537,6 +560,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.is_match(haystack), + CRLF(ref m) => m.is_match(haystack), Word(ref m) => m.is_match(haystack), } } @@ -549,6 +573,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.is_match_at(haystack, at), + CRLF(ref m) => m.is_match_at(haystack, at), Word(ref m) => m.is_match_at(haystack, at), } } @@ -560,6 +585,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.shortest_match(haystack), + CRLF(ref m) => m.shortest_match(haystack), Word(ref m) => m.shortest_match(haystack), } } @@ -572,6 +598,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.shortest_match_at(haystack, at), + CRLF(ref m) => m.shortest_match_at(haystack, at), Word(ref m) => m.shortest_match_at(haystack, at), } } @@ -712,6 +739,9 @@ pub struct RegexCaptures { /// and the capturing groups must behave as if `(re)` is the `0`th capture /// group. offset: usize, + /// When enable, the end of a match has `\r` stripped from it, if one + /// exists. + strip_crlf: bool, } impl Captures for RegexCaptures { @@ -720,8 +750,25 @@ impl Captures for RegexCaptures { } fn get(&self, i: usize) -> Option { - let actual = i.checked_add(self.offset).unwrap(); - self.locs.pos(actual).map(|(s, e)| Match::new(s, e)) + if !self.strip_crlf { + let actual = i.checked_add(self.offset).unwrap(); + return self.locs.pos(actual).map(|(s, e)| Match::new(s, e)); + } + + // currently don't support capture offsetting with CRLF stripping + assert_eq!(self.offset, 0); + let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) { + None => return None, + Some(m) => m, + }; + // If the end position of this match corresponds to the end position + // of the overall match, then we apply our CRLF stripping. Otherwise, + // we cannot assume stripping is correct. + if i == 0 || m.end() == self.locs.pos(0).unwrap().1 { + Some(m.with_end(m.end() - 1)) + } else { + Some(m) + } } } @@ -734,12 +781,16 @@ impl RegexCaptures { locs: CaptureLocations, offset: usize, ) -> RegexCaptures { - RegexCaptures { locs, offset } + RegexCaptures { locs, offset, strip_crlf: false } } pub(crate) fn locations(&mut self) -> &mut CaptureLocations { &mut self.locs } + + pub(crate) fn strip_crlf(&mut self, yes: bool) { + self.strip_crlf = yes; + } } #[cfg(test)] diff --git a/grep-searcher/src/searcher/core.rs b/grep-searcher/src/searcher/core.rs index 21dbae3..c56f643 100644 --- a/grep-searcher/src/searcher/core.rs +++ b/grep-searcher/src/searcher/core.rs @@ -424,16 +424,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { } self.count_lines(buf, range.start()); let offset = self.absolute_byte_offset + range.start() as u64; - let linebuf = - if self.config.line_term.is_crlf() { - // Normally, a line terminator is never part of a match, but - // if the line terminator is CRLF, then it's possible for `\r` - // to end up in the match, which we generally don't want. So - // we strip it here. - lines::without_terminator(&buf[*range], self.config.line_term) - } else { - &buf[*range] - }; + let linebuf = &buf[*range]; let keepgoing = self.sink.matched( &self.searcher, &SinkMatch { diff --git a/tests/json.rs b/tests/json.rs index c0b064b..f49c906 100644 --- a/tests/json.rs +++ b/tests/json.rs @@ -153,7 +153,10 @@ rgtest!(basic, |dir: Dir, mut cmd: TestCommand| { msgs[1].unwrap_context(), Context { path: Some(Data::text("sherlock")), - lines: Data::text("Holmeses, success in the province of detective work must always\n"), + lines: Data::text( + "Holmeses, success in the province of \ + detective work must always\n", + ), line_number: Some(2), absolute_offset: 65, submatches: vec![], @@ -163,7 +166,10 @@ rgtest!(basic, |dir: Dir, mut cmd: TestCommand| { msgs[2].unwrap_match(), Match { path: Some(Data::text("sherlock")), - lines: Data::text("be, to a very large extent, the result of luck. Sherlock Holmes\n"), + lines: Data::text( + "be, to a very large extent, the result of luck. \ + Sherlock Holmes\n", + ), line_number: Some(3), absolute_offset: 129, submatches: vec![ @@ -212,7 +218,9 @@ rgtest!(notutf8, |dir: Dir, mut cmd: TestCommand| { let contents = &b"quux\xFFbaz"[..]; // APFS does not support creating files with invalid UTF-8 bytes, so just - // skip the test if we can't create our file. + // skip the test if we can't create our file. Presumably we don't need this + // check if we're already skipping it on macOS, but maybe other file + // systems won't like this test either? if !dir.try_create_bytes(OsStr::from_bytes(name), contents).is_ok() { return; } @@ -305,3 +313,52 @@ rgtest!(crlf, |dir: Dir, mut cmd: TestCommand| { }, ); }); + +// See: https://github.com/BurntSushi/ripgrep/issues/1095 +// +// This test checks that we don't drop the \r\n in a matching line when --crlf +// mode is enabled. +rgtest!(r1095_missing_crlf, |dir: Dir, mut cmd: TestCommand| { + dir.create("foo", "test\r\n"); + + // Check without --crlf flag. + let msgs = json_decode(&cmd.arg("--json").arg("test").stdout()); + assert_eq!(msgs.len(), 4); + assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n")); + + // Now check with --crlf flag. + let msgs = json_decode(&cmd.arg("--crlf").stdout()); + assert_eq!(msgs.len(), 4); + assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n")); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/1095 +// +// This test checks that we don't return empty submatches when matching a `\n` +// in CRLF mode. +rgtest!(r1095_crlf_empty_match, |dir: Dir, mut cmd: TestCommand| { + dir.create("foo", "test\r\n\n"); + + // Check without --crlf flag. + let msgs = json_decode(&cmd.arg("-U").arg("--json").arg("\n").stdout()); + assert_eq!(msgs.len(), 5); + + let m = msgs[1].unwrap_match(); + assert_eq!(m.lines, Data::text("test\r\n")); + assert_eq!(m.submatches[0].m, Data::text("\n")); + + let m = msgs[2].unwrap_match(); + assert_eq!(m.lines, Data::text("\n")); + assert_eq!(m.submatches[0].m, Data::text("\n")); + + // Now check with --crlf flag. + let msgs = json_decode(&cmd.arg("--crlf").stdout()); + + let m = msgs[1].unwrap_match(); + assert_eq!(m.lines, Data::text("test\r\n")); + assert_eq!(m.submatches[0].m, Data::text("\n")); + + let m = msgs[2].unwrap_match(); + assert_eq!(m.lines, Data::text("\n")); + assert_eq!(m.submatches[0].m, Data::text("\n")); +});