refactor progress

This commit is contained in:
Andrew Gallant
2016-06-20 16:53:48 -04:00
parent 8d9d602945
commit 0163b39faa
9 changed files with 492 additions and 298 deletions

View File

@@ -1,201 +0,0 @@
use std::cmp;
use std::iter;
use regex::bytes::Regex;
use syntax::{
Expr, Literals, Lit,
Repeater,
};
/// The literal sets extracted from one parsed expression.
///
/// `LiteralSets::create` fills all three fields; `to_matcher` reads them to
/// pick a single literal to search for.
#[derive(Debug)]
pub struct LiteralSets {
/// The value returned by `Expr::prefixes` for the expression.
prefixes: Literals,
/// The value returned by `Expr::suffixes` for the expression.
suffixes: Literals,
/// The literals accumulated by `union_required` over the expression.
required: Literals,
}
impl LiteralSets {
    /// Extracts the prefix, suffix and required literal sets of `expr`.
    ///
    /// The required set starts out empty and is filled in by
    /// `union_required`; the other two come directly from the expression.
    pub fn create(expr: &Expr) -> Self {
        let mut required = Literals::empty();
        union_required(expr, &mut required);
        let prefixes = expr.prefixes();
        let suffixes = expr.suffixes();
        LiteralSets {
            prefixes: prefixes,
            suffixes: suffixes,
            required: required,
        }
    }

    /// Builds a regex matching the single longest literal found among the
    /// common prefixes/suffixes of the prefix and suffix sets and the
    /// members of the required set.
    ///
    /// Returns `None` when the prefix set is non-empty and entirely
    /// complete, or when no non-empty literal is available.
    pub fn to_matcher(&self) -> Option<Regex> {
        if self.prefixes.all_complete() && !self.prefixes.is_empty() {
            // When this is true, the regex engine will do a literal scan.
            return None;
        }

        // Candidates in priority order: on equal length the earlier one is
        // kept, because a candidate only wins when it is strictly longer.
        let common: [&[u8]; 4] = [
            self.prefixes.longest_common_prefix(),
            self.prefixes.longest_common_suffix(),
            self.suffixes.longest_common_prefix(),
            self.suffixes.longest_common_suffix(),
        ];
        let required = self.required.literals();
        let longest_required: &[u8] =
            match required.iter().max_by_key(|lit| lit.len()) {
                Some(longest) => &***longest,
                None => &[],
            };

        let mut best: &[u8] = &[];
        for &candidate in common.iter() {
            if candidate.len() > best.len() {
                best = candidate;
            }
        }
        if longest_required.len() > best.len() {
            best = longest_required;
        }

        if best.is_empty() {
            return None;
        }
        // Literals always compile.
        Some(Regex::new(&bytes_to_regex(best)).unwrap())
    }
}
/// Walks `expr` and accumulates the literals it finds into `lits`.
///
/// Case-sensitive literals are appended with `cross_add`. Groups recurse
/// into their inner expression, `e+` recurses once and then cuts, counted
/// repetitions go through `repeat_range_literals` and alternations through
/// `alternate_literals`. Everything else (case-insensitive literals,
/// classes, `e?`, `e*` and any variant without a dedicated arm) cuts `lits`.
fn union_required(expr: &Expr, lits: &mut Literals) {
    use syntax::Expr::*;
    match *expr {
        Literal { ref chars, casei: false } => {
            let text: String = chars.iter().cloned().collect();
            lits.cross_add(text.as_bytes());
        }
        LiteralBytes { ref bytes, casei: false } => {
            lits.cross_add(bytes);
        }
        Literal { casei: true, .. }
        | LiteralBytes { casei: true, .. }
        | Class(_)
        | ClassBytes(_)
        | Repeat { r: Repeater::ZeroOrOne, .. }
        | Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
        Group { ref e, .. } => {
            union_required(&**e, lits);
        }
        Repeat { ref e, r: Repeater::OneOrMore, .. } => {
            union_required(&**e, lits);
            lits.cut();
        }
        Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
            repeat_range_literals(&**e, min, max, greedy, lits, union_required);
        }
        Concat(ref es) => match es.len() {
            0 => {}
            1 => union_required(&es[0], lits),
            _ => {
                for piece in es {
                    let mut found = lits.to_empty();
                    union_required(piece, &mut found);
                    if found.is_empty() {
                        lits.cut();
                        continue;
                    }
                    if found.contains_empty() {
                        lits.cut();
                    }
                    if !lits.cross_product(&found) {
                        // If this expression couldn't yield any literal
                        // that could be extended, then we need to quit.
                        // Since we're short-circuiting, we also need to
                        // freeze every member.
                        lits.cut();
                        break;
                    }
                }
            }
        },
        Alternate(ref es) => {
            alternate_literals(es, lits, union_required);
        }
        _ => lits.cut(),
    }
}
/// Handles a counted repetition `e{min,max}` by feeding `f` a concatenation
/// of copies of `e`.
///
/// The number of copies is `min`, capped at `lits.limit_size()`. `lits` is
/// cut when `min` is zero, when the cap shortened the concatenation, and
/// when the repetition may run past `min` (`max` absent or greater than
/// `min`). `greedy` is accepted but not consulted.
fn repeat_range_literals<F>(
    e: &Expr,
    min: u32,
    max: Option<u32>,
    greedy: bool,
    lits: &mut Literals,
    mut f: F,
) where
    F: FnMut(&Expr, &mut Literals),
{
    use syntax::Expr::*;

    if min == 0 {
        // This is a bit conservative. If `max` is set, then we could
        // treat this as a finite set of alternations. For now, we
        // just treat it as `e*`.
        lits.cut();
        return;
    }

    let wanted = min as usize;
    let copies = cmp::min(lits.limit_size(), wanted);
    let repeated: Vec<Expr> = iter::repeat(e.clone()).take(copies).collect();
    f(&Concat(repeated), lits);
    if copies < wanted {
        lits.cut();
    }

    let stops_at_min = match max {
        Some(max) => min >= max,
        None => false,
    };
    if !stops_at_min {
        lits.cut();
    }
}
/// Handles an alternation by collecting literals for every branch with `f`
/// and folding what the branches share into `lits`.
///
/// Each branch is collected into a fresh set whose size limit is one fifth
/// of `lits`'s limit. If any branch yields nothing, or the branch sets
/// cannot be unioned, `lits` is cut and nothing else is recorded.
fn alternate_literals<F>(es: &[Expr], lits: &mut Literals, mut f: F)
where
    F: FnMut(&Expr, &mut Literals),
{
    let branch_limit = lits.limit_size() / 5;
    let mut merged = lits.to_empty();
    for branch in es {
        let mut found = lits.to_empty();
        found.set_limit_size(branch_limit);
        f(branch, &mut found);
        // If we couldn't find literals for *any* of the alternates, then
        // the entire alternation has to be thrown away and any existing
        // members must be frozen. Similarly, if the union couldn't
        // complete, stop and freeze.
        let usable = !found.is_empty() && merged.union(found);
        if !usable {
            lits.cut();
            return;
        }
    }

    // All we do at the moment is look for prefixes and suffixes. If both
    // are empty, then we report nothing. We should be able to do better
    // than this, but we'll need something more expressive than just a "set
    // of literals."
    let shared_prefix = merged.longest_common_prefix();
    let shared_suffix = merged.longest_common_suffix();
    if !shared_prefix.is_empty() {
        lits.cross_add(shared_prefix);
    }
    lits.cut();
    if !shared_suffix.is_empty() {
        lits.add(Lit::empty());
        lits.add(Lit::new(shared_suffix.to_vec()));
    }
}
/// Converts an arbitrary sequence of bytes to a literal suitable for building
/// a regular expression.
///
/// Every input byte is emitted as a `\xNN` escape with two lowercase hex
/// digits, so the result is always four characters per input byte and an
/// empty input produces an empty string.
fn bytes_to_regex(bs: &[u8]) -> String {
    use std::fmt::Write;

    // Each byte expands to exactly four characters (`\`, `x` and two hex
    // digits), so reserve the full output size up front.
    let mut s = String::with_capacity(4 * bs.len());
    for &b in bs {
        // Format straight into the buffer instead of allocating a temporary
        // `String` per byte.
        write!(s, "\\x{:02x}", b).expect("writing to a String cannot fail");
    }
    s
}

View File

@@ -1,6 +1,7 @@
#![allow(dead_code, unused_variables)]
extern crate docopt;
extern crate grep;
extern crate memchr;
extern crate memmap;
extern crate regex;
@@ -15,19 +16,13 @@ Options:
";
use std::error::Error;
use std::io::{self, BufRead, Write};
use std::io::{self, Write};
use std::process;
use std::result;
use docopt::Docopt;
use regex::bytes::Regex;
use literals::LiteralSets;
use search::{LineSearcher, LineSearcherBuilder};
mod literals;
mod nonl;
mod search;
use grep::{Grep, GrepBuilder};
pub type Result<T> = result::Result<T, Box<Error + Send + Sync>>;
@@ -53,36 +48,20 @@ fn main() {
fn run(args: &Args) -> Result<u64> {
if args.arg_file.is_empty() {
let expr = try!(parse(&args.arg_pattern));
let literals = LiteralSets::create(&expr);
let re = Regex::new(&expr.to_string()).unwrap();
let _stdin = io::stdin();
let stdin = _stdin.lock();
run_by_line(args, &re, stdin)
unimplemented!()
} else {
let searcher =
try!(LineSearcherBuilder::new(&args.arg_pattern).create());
if args.flag_count {
run_mmap_count_only(args, &searcher)
} else {
run_mmap(args, &searcher)
}
try!(GrepBuilder::new(&args.arg_pattern).create());
run_mmap(args, &searcher)
}
}
#[inline(never)]
fn run_mmap(args: &Args, searcher: &LineSearcher) -> Result<u64> {
use memmap::{Mmap, Protection};
assert!(args.arg_file.len() == 1);
let mut wtr = io::BufWriter::new(io::stdout());
let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read));
let text = unsafe { mmap.as_slice() };
let mut count = 0;
for m in searcher.search(text) {
try!(wtr.write(&text[m.start..m.end]));
try!(wtr.write(b"\n"));
fn run_mmap(args: &Args, searcher: &Grep) -> Result<u64> {
for m in searcher.iter(text) {
if !args.flag_count {
try!(wtr.write(&text[m.start()..m.end()]));
try!(wtr.write(b"\n"));
}
count += 1;
}
Ok(count)
@@ -100,36 +79,3 @@ fn run_mmap_count_only(args: &Args, searcher: &LineSearcher) -> Result<u64> {
try!(writeln!(wtr, "{}", count));
Ok(count)
}
/// Reads `rdr` one `\n`-delimited line at a time and copies every line that
/// `re` matches to stdout.
///
/// Lines are written exactly as read, including the trailing `\n` when
/// present. `args` is accepted but not consulted.
///
/// Returns the number of matching lines, or the first I/O error hit while
/// reading the input or writing the output.
fn run_by_line<B: BufRead>(
    args: &Args,
    re: &Regex,
    mut rdr: B,
) -> Result<u64> {
    let mut wtr = io::BufWriter::new(io::stdout());
    let mut count = 0;
    let mut line = vec![];
    loop {
        line.clear();
        let n = try!(rdr.read_until(b'\n', &mut line));
        if n == 0 {
            break;
        }
        if re.is_match(&line) {
            count += 1;
            // `write` is allowed to write only part of the buffer; use
            // `write_all` so a matching line is never truncated.
            try!(wtr.write_all(&line));
        }
    }
    // Flush explicitly: an error during the implicit flush on drop would be
    // silently discarded.
    try!(wtr.flush());
    Ok(count)
}
/// Parses `re` with byte-oriented syntax allowed and Unicode disabled, then
/// passes the resulting expression through `nonl::remove`.
///
/// Fails if the pattern does not parse or if `nonl::remove` rejects it.
fn parse(re: &str) -> Result<syntax::Expr> {
    let builder = syntax::ExprBuilder::new()
        .allow_bytes(true)
        .unicode(false);
    let parsed = try!(builder.parse(re));
    let without_newlines = try!(nonl::remove(parsed));
    Ok(without_newlines)
}

View File

@@ -1,55 +0,0 @@
use syntax::Expr;
use Result;
/// Rewrites `expr` so that it can never match `\n`.
///
/// Any-character and any-byte expressions are swapped for their "no newline"
/// variants, `\n` is removed from character and byte classes, and groups,
/// repetitions, concatenations and alternations are rewritten recursively.
/// All other expressions are returned unchanged.
///
/// If the expression contains a literal `\n`, then an error is returned.
pub fn remove(expr: Expr) -> Result<Expr> {
    use syntax::Expr::*;

    let rewritten = match expr {
        Literal { chars, casei } => {
            if chars.iter().any(|&c| c == '\n') {
                return Err(format!("Literal '\\n' are not allowed.").into());
            }
            Literal { chars: chars, casei: casei }
        }
        LiteralBytes { bytes, casei } => {
            if bytes.iter().any(|&b| b == b'\n') {
                return Err(format!("Literal '\\n' are not allowed.").into());
            }
            LiteralBytes { bytes: bytes, casei: casei }
        }
        AnyChar => AnyCharNoNL,
        AnyByte => AnyByteNoNL,
        Class(mut cls) => {
            cls.remove('\n');
            Class(cls)
        }
        ClassBytes(mut cls) => {
            cls.remove(b'\n');
            ClassBytes(cls)
        }
        Group { e, i, name } => {
            let inner = try!(remove(*e));
            Group { e: Box::new(inner), i: i, name: name }
        }
        Repeat { e, r, greedy } => {
            let inner = try!(remove(*e));
            Repeat { e: Box::new(inner), r: r, greedy: greedy }
        }
        Concat(exprs) => {
            // Collecting into a `Result` stops at the first failing part.
            let parts: Result<Vec<Expr>> =
                exprs.into_iter().map(remove).collect();
            Concat(try!(parts))
        }
        Alternate(exprs) => {
            let branches: Result<Vec<Expr>> =
                exprs.into_iter().map(remove).collect();
            Alternate(try!(branches))
        }
        other => other,
    };
    Ok(rewritten)
}

View File

@@ -1,168 +0,0 @@
use std::cmp;
use memchr::{memchr, memrchr};
use regex::bytes::Regex;
use syntax;
use literals::LiteralSets;
use nonl;
use Result;
/// A compiled line-oriented searcher, built by `LineSearcherBuilder::create`.
#[derive(Clone, Debug)]
pub struct LineSearcher {
/// The regex compiled from the (newline-stripped) pattern.
re: Regex,
/// Optional literal matcher from `LiteralSets::to_matcher`; when present,
/// the match iterator consults it before running `re` on a line.
required: Option<Regex>,
/// The options the builder was configured with.
opts: Options,
}
/// Builder that collects a pattern and options and turns them into a
/// `LineSearcher` via `create`.
#[derive(Clone, Debug)]
pub struct LineSearcherBuilder {
/// The pattern text as given to `new`; it is parsed in `create`.
pattern: String,
/// Options set through the builder methods; starts as `Options::default()`.
opts: Options,
}
/// Search options shared by `LineSearcherBuilder` and `LineSearcher`.
#[derive(Clone, Debug, Default)]
struct Options {
/// When true, `LineSearcherBuilder::create` prefixes the pattern with `(?i)`.
case_insensitive: bool,
/// Set by `LineSearcherBuilder::line_numbers`.
/// NOTE(review): stored but not read by any code in this file.
lines: bool,
/// Set by `LineSearcherBuilder::locations`.
/// NOTE(review): stored but not read by any code in this file.
locations: bool,
}
impl LineSearcherBuilder {
    /// Starts a builder for `pattern` with every option at its default.
    pub fn new(pattern: &str) -> LineSearcherBuilder {
        LineSearcherBuilder {
            opts: Options::default(),
            pattern: pattern.to_string(),
        }
    }

    /// Sets whether the pattern is compiled case-insensitively.
    pub fn case_insensitive(self, yes: bool) -> LineSearcherBuilder {
        let mut builder = self;
        builder.opts.case_insensitive = yes;
        builder
    }

    /// Sets the `lines` option.
    pub fn line_numbers(self, yes: bool) -> LineSearcherBuilder {
        let mut builder = self;
        builder.opts.lines = yes;
        builder
    }

    /// Sets the `locations` option.
    pub fn locations(self, yes: bool) -> LineSearcherBuilder {
        let mut builder = self;
        builder.opts.locations = yes;
        builder
    }

    /// Parses the pattern and compiles it into a `LineSearcher`.
    ///
    /// Returns an error if the pattern fails to parse or is rejected by
    /// `nonl::remove`.
    pub fn create(self) -> Result<LineSearcher> {
        let expr = try!(parse(&self.pattern));
        let literals = LiteralSets::create(&expr);

        let mut pat = String::new();
        if self.opts.case_insensitive {
            pat.push_str("(?i)");
        }
        pat.push_str(&expr.to_string());

        // We've already parsed the pattern, so we know it will compile.
        let re = Regex::new(&pat).unwrap();
        let required = literals.to_matcher();
        Ok(LineSearcher {
            re: re,
            required: required,
            opts: self.opts,
        })
    }
}
impl LineSearcher {
    /// Returns an iterator over the lines of `buf` that match this searcher.
    ///
    /// The iterator starts at offset 0 with its match counter at zero.
    pub fn search<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> {
        Iter {
            buf: buf,
            searcher: self,
            count: 0,
            start: 0,
        }
    }
}
/// One matching line produced by `Iter`.
pub struct Match {
/// Byte offset in the searched buffer where the matching line starts.
pub start: usize,
/// Byte offset where the matching line ends: the position of its `\n`, or
/// the buffer length when the line is not newline-terminated.
pub end: usize,
/// Number of matches the iterator had produced before this one.
pub count: u64,
/// Always `None` as produced by `Iter::next` in this file.
pub line: Option<usize>,
/// Always empty as produced by `Iter::next` in this file.
pub locations: Vec<(usize, usize)>,
}
/// Iterator over the matching lines of a buffer, created by
/// `LineSearcher::search`.
pub struct Iter<'b, 's> {
/// The searcher whose regex (and optional literal matcher) is applied.
searcher: &'s LineSearcher,
/// The bytes being searched.
buf: &'b [u8],
/// Offset in `buf` at which the next search begins.
start: usize,
/// Number of matches produced so far.
count: u64,
}
impl<'b, 's> Iter<'b, 's> {
    /// Finds the next matching line at or after `self.start`.
    ///
    /// Returns the `(start, end)` byte offsets of that line, where `end` is
    /// the offset of the line's `\n` (or the buffer length), or `None` once
    /// the buffer is exhausted or nothing else matches.
    ///
    /// When the searcher has a required-literal matcher, the literal is
    /// located first and the full regex is then run only on the line that
    /// contains it; lines that fail the full regex are skipped.
    #[inline(always)] // reduces constant overhead
    fn next_line_match(&mut self) -> Option<(usize, usize)> {
        // Once the whole buffer has been consumed there are no lines left.
        // Without this check the literal-free path below would keep
        // reporting an empty match at the end of the buffer forever for any
        // pattern that can match the empty string.
        if self.start >= self.buf.len() {
            return None;
        }
        if let Some(ref req) = self.searcher.required {
            while self.start < self.buf.len() {
                let e = match req.shortest_match(&self.buf[self.start..]) {
                    None => return None,
                    Some(e) => self.start + e,
                };
                let (prevnl, nextnl) = self.find_line(e, e);
                match self.searcher.re.shortest_match(&self.buf[prevnl..nextnl]) {
                    None => {
                        // The literal was present but the line does not
                        // match; resume after this line's terminator.
                        self.start = nextnl + 1;
                        continue;
                    }
                    Some(_) => return Some((prevnl, nextnl)),
                }
            }
            None
        } else {
            self.searcher.re
                .shortest_match(&self.buf[self.start..])
                .map(|e| self.find_line(self.start + e, self.start + e))
        }
    }

    /// Returns the bounds of the line(s) spanning offsets `s` through `e`:
    /// the line start for `s` and the line end for `e`.
    fn find_line(&self, s: usize, e: usize) -> (usize, usize) {
        (self.find_line_start(s), self.find_line_end(e))
    }

    /// Returns the offset just past the last `\n` before `pos`, or 0 when
    /// no `\n` precedes `pos`.
    fn find_line_start(&self, pos: usize) -> usize {
        memrchr(b'\n', &self.buf[0..pos]).map_or(0, |i| i + 1)
    }

    /// Returns the offset of the first `\n` at or after `pos`, or the buffer
    /// length when there is none.
    fn find_line_end(&self, pos: usize) -> usize {
        memchr(b'\n', &self.buf[pos..]).map_or(self.buf.len(), |i| pos + i)
    }
}
impl<'b, 's> Iterator for Iter<'b, 's> {
    type Item = Match;

    /// Yields the next matching line and advances past it.
    ///
    /// After a match, the search position moves to the byte following the
    /// line's end, clamped to the buffer length.
    #[inline(always)] // reduces constant overhead
    fn next(&mut self) -> Option<Match> {
        let (line_start, line_end) = match self.next_line_match() {
            Some(bounds) => bounds,
            None => return None,
        };
        let index = self.count;
        self.start = cmp::min(self.buf.len(), line_end + 1);
        self.count += 1;
        Some(Match {
            start: line_start,
            end: line_end,
            count: index,
            line: None,
            locations: vec![],
        })
    }
}
/// Parses `re` into an expression that cannot match `\n`.
///
/// The pattern is parsed with byte-oriented syntax allowed and Unicode
/// disabled, and the result is then rewritten by `nonl::remove`. Errors from
/// either step are returned to the caller.
fn parse(re: &str) -> Result<syntax::Expr> {
    let builder = syntax::ExprBuilder::new()
        .allow_bytes(true)
        .unicode(false);
    let parsed = try!(builder.parse(re));
    let cleaned = try!(nonl::remove(parsed));
    Ok(cleaned)
}