| 1 | use crate::util::{ |
| 2 | prefilter::PrefilterI, |
| 3 | search::{MatchKind, Span}, |
| 4 | }; |
| 5 | |
| 6 | #[derive (Clone, Debug)] |
| 7 | pub(crate) struct Teddy { |
| 8 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
| 9 | _unused: (), |
| 10 | /// The actual Teddy searcher. |
| 11 | /// |
| 12 | /// Technically, it's possible that Teddy doesn't actually get used, since |
| 13 | /// Teddy does require its haystack to at least be of a certain size |
| 14 | /// (usually around the size of whatever vector is being used, so ~16 |
| 15 | /// or ~32 bytes). For haystacks shorter than that, the implementation |
| 16 | /// currently uses Rabin-Karp. |
| 17 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 18 | searcher: aho_corasick::packed::Searcher, |
| 19 | /// When running an anchored search, the packed searcher can't handle it so |
| 20 | /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed |
| 21 | /// searchers to support anchored search would be difficult at worst and |
| 22 | /// annoying at best. Since packed searchers only apply to small numbers of |
| 23 | /// literals, we content ourselves that this is not much of an added cost. |
| 24 | /// (That packed searchers only work with a small number of literals is |
| 25 | /// also why we use a DFA here. Otherwise, the memory usage of a DFA would |
| 26 | /// likely be unacceptable.) |
| 27 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 28 | anchored_ac: aho_corasick::dfa::DFA, |
| 29 | /// The length of the smallest literal we look for. |
| 30 | /// |
| 31 | /// We use this as a heuristic to figure out whether this will be "fast" or |
| 32 | /// not. Generally, the longer the better, because longer needles are more |
| 33 | /// discriminating and thus reduce false positive rate. |
| 34 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 35 | minimum_len: usize, |
| 36 | } |
| 37 | |
| 38 | impl Teddy { |
| 39 | pub(crate) fn new<B: AsRef<[u8]>>( |
| 40 | kind: MatchKind, |
| 41 | needles: &[B], |
| 42 | ) -> Option<Teddy> { |
| 43 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
| 44 | { |
| 45 | None |
| 46 | } |
| 47 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 48 | { |
| 49 | // We only really support leftmost-first semantics. In |
| 50 | // theory we could at least support leftmost-longest, as the |
| 51 | // aho-corasick crate does, but regex-automata doesn't know about |
| 52 | // leftmost-longest currently. |
| 53 | // |
| 54 | // And like the aho-corasick prefilter, if we're using `All` |
| 55 | // semantics, then we can still use leftmost semantics for a |
| 56 | // prefilter. (This might be a suspicious choice for the literal |
| 57 | // engine, which uses a prefilter as a regex engine directly, but |
| 58 | // that only happens when using leftmost-first semantics.) |
| 59 | let (packed_match_kind, ac_match_kind) = match kind { |
| 60 | MatchKind::LeftmostFirst | MatchKind::All => ( |
| 61 | aho_corasick::packed::MatchKind::LeftmostFirst, |
| 62 | aho_corasick::MatchKind::LeftmostFirst, |
| 63 | ), |
| 64 | }; |
| 65 | let minimum_len = |
| 66 | needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); |
| 67 | let packed = aho_corasick::packed::Config::new() |
| 68 | .match_kind(packed_match_kind) |
| 69 | .builder() |
| 70 | .extend(needles) |
| 71 | .build()?; |
| 72 | let anchored_ac = aho_corasick::dfa::DFA::builder() |
| 73 | .match_kind(ac_match_kind) |
| 74 | .start_kind(aho_corasick::StartKind::Anchored) |
| 75 | .prefilter(false) |
| 76 | .build(needles) |
| 77 | .ok()?; |
| 78 | Some(Teddy { searcher: packed, anchored_ac, minimum_len }) |
| 79 | } |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | impl PrefilterI for Teddy { |
| 84 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| 85 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
| 86 | { |
| 87 | unreachable!() |
| 88 | } |
| 89 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 90 | { |
| 91 | let ac_span = |
| 92 | aho_corasick::Span { start: span.start, end: span.end }; |
| 93 | self.searcher |
| 94 | .find_in(haystack, ac_span) |
| 95 | .map(|m| Span { start: m.start(), end: m.end() }) |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| 100 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
| 101 | { |
| 102 | unreachable!() |
| 103 | } |
| 104 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 105 | { |
| 106 | use aho_corasick::automaton::Automaton; |
| 107 | let input = aho_corasick::Input::new(haystack) |
| 108 | .anchored(aho_corasick::Anchored::Yes) |
| 109 | .span(span.start..span.end); |
| 110 | self.anchored_ac |
| 111 | .try_find(&input) |
| 112 | // OK because we build the DFA with anchored support. |
| 113 | .expect("aho-corasick DFA should never fail" ) |
| 114 | .map(|m| Span { start: m.start(), end: m.end() }) |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | fn memory_usage(&self) -> usize { |
| 119 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
| 120 | { |
| 121 | unreachable!() |
| 122 | } |
| 123 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 124 | { |
| 125 | use aho_corasick::automaton::Automaton; |
| 126 | self.searcher.memory_usage() + self.anchored_ac.memory_usage() |
| 127 | } |
| 128 | } |
| 129 | |
| 130 | fn is_fast(&self) -> bool { |
| 131 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
| 132 | { |
| 133 | unreachable!() |
| 134 | } |
| 135 | #[cfg (feature = "perf-literal-multisubstring" )] |
| 136 | { |
| 137 | // Teddy is usually quite fast, but I have seen some cases where |
| 138 | // a large number of literals can overwhelm it and make it not so |
| 139 | // fast. We make an educated but conservative guess at a limit, at |
| 140 | // which point, we're not so comfortable thinking Teddy is "fast." |
| 141 | // |
| 142 | // Well... this used to incorporate a "limit" on the *number* |
| 143 | // of literals, but I have since changed it to a minimum on the |
| 144 | // *smallest* literal. Namely, when there is a very small literal |
| 145 | // (1 or 2 bytes), it is far more likely that it leads to a higher |
| 146 | // false positive rate. (Although, of course, not always. For |
| 147 | // example, 'zq' is likely to have a very low false positive rate.) |
| 148 | // But when we have 3 bytes, we have a really good chance of being |
| 149 | // quite discriminatory and thus fast. |
| 150 | // |
| 151 | // We may still want to add some kind of limit on the number of |
| 152 | // literals here, but keep in mind that Teddy already has its own |
| 153 | // somewhat small limit (64 at time of writing). The main issue |
| 154 | // here is that if 'is_fast' is false, it opens the door for the |
| 155 | // reverse inner optimization to kick in. We really only want to |
| 156 | // resort to the reverse inner optimization if we absolutely must. |
| 157 | self.minimum_len >= 3 |
| 158 | } |
| 159 | } |
| 160 | } |
| 161 | |