From 581a35e568c3acd32461d276a4cfe746524e17cd Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 29 May 2021 07:34:14 -0400
Subject: [PATCH] impl: fix --multiline anchored match bug

This fixes a bug where using \A or (?-m)^ in combination with
-U/--multiline would permit matches that aren't anchored to the
beginning of the file. The underlying cause was an optimization that
occurred when mmaps couldn't be used. Namely, ripgrep still tries to
read the input incrementally if it knows the pattern can't match through
a newline. But the detection logic was flawed, since it didn't account
for anchors. This commit fixes that.

Fixes #1878, Fixes #1879
---
 CHANGELOG.md                     |  2 ++
 crates/regex/src/non_matching.rs |  5 ++++-
 tests/regression.rs              | 23 +++++++++++++++++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d0e117..fc64451 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,8 @@ Bug fixes:
   Fix stdin detection when using PowerShell in UNIX environments.
 * [BUG #1866](https://github.com/BurntSushi/ripgrep/issues/1866#issuecomment-841635553):
   Fix bug when computing column numbers in `--vimgrep` mode.
+* [BUG #1878](https://github.com/BurntSushi/ripgrep/issues/1878):
+  Fix bug where `\A` could produce unanchored matches in multiline search.
 
 
 12.1.1 (2020-05-29)
diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs
index 2270f94..e2e0755 100644
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -13,7 +13,10 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
 /// the given expression.
 fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
     match *expr.kind() {
-        HirKind::Empty | HirKind::Anchor(_) | HirKind::WordBoundary(_) => {}
+        HirKind::Empty | HirKind::WordBoundary(_) => {}
+        HirKind::Anchor(_) => {
+            set.remove(b'\n');
+        }
         HirKind::Literal(hir::Literal::Unicode(c)) => {
             for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
                 set.remove(b);
diff --git a/tests/regression.rs b/tests/regression.rs
index 2935a43..9aba274 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -882,3 +882,26 @@ test:3:5:foo quux
 ";
     eqnice!(expected, cmd.stdout());
 });
+
+rgtest!(r1878, |dir: Dir, _: TestCommand| {
+    dir.create("test", "a\nbaz\nabc\n");
+
+    // Since ripgrep enables (?m) by default, '^' will match at the beginning
+    // of a line, even when -U/--multiline is used.
+    let args = &["-U", "--no-mmap", r"^baz", "test"];
+    eqnice!("baz\n", dir.command().args(args).stdout());
+    let args = &["-U", "--mmap", r"^baz", "test"];
+    eqnice!("baz\n", dir.command().args(args).stdout());
+
+    // But when (?m) is disabled via (?-m), or when \A is used, then there
+    // should be no matches that aren't anchored to the start of the file.
+    let args = &["-U", "--no-mmap", r"(?-m)^baz", "test"];
+    dir.command().args(args).assert_err();
+    let args = &["-U", "--mmap", r"(?-m)^baz", "test"];
+    dir.command().args(args).assert_err();
+
+    let args = &["-U", "--no-mmap", r"\Abaz", "test"];
+    dir.command().args(args).assert_err();
+    let args = &["-U", "--mmap", r"\Abaz", "test"];
+    dir.command().args(args).assert_err();
+});