teddy.rs source code [crates/regex-automata-0.4.9/src/util/prefilter/teddy.rs]

1	use crate::util::{
2	prefilter::PrefilterI,
3	search::{MatchKind, Span},
4	};
5
6	#[derive(Clone, Debug)]
7	pub(crate) struct Teddy {
8	#[cfg(not(feature = "perf-literal-multisubstring"))]
9	_unused: (),
10	/// The actual Teddy searcher.
11	///
12	/// Technically, it's possible that Teddy doesn't actually get used, since
13	/// Teddy does require its haystack to at least be of a certain size
14	/// (usually around the size of whatever vector is being used, so ~16
15	/// or ~32 bytes). For haystacks shorter than that, the implementation
16	/// currently uses Rabin-Karp.
17	#[cfg(feature = "perf-literal-multisubstring")]
18	searcher: aho_corasick::packed::Searcher,
19	/// When running an anchored search, the packed searcher can't handle it so
20	/// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
21	/// searchers to support anchored search would be difficult at worst and
22	/// annoying at best. Since packed searchers only apply to small numbers of
23	/// literals, we content ourselves that this is not much of an added cost.
24	/// (That packed searchers only work with a small number of literals is
25	/// also why we use a DFA here. Otherwise, the memory usage of a DFA would
26	/// likely be unacceptable.)
27	#[cfg(feature = "perf-literal-multisubstring")]
28	anchored_ac: aho_corasick::dfa::DFA,
29	/// The length of the smallest literal we look for.
30	///
31	/// We use this as a heuristic to figure out whether this will be "fast" or
32	/// not. Generally, the longer the better, because longer needles are more
33	/// discriminating and thus reduce false positive rate.
34	#[cfg(feature = "perf-literal-multisubstring")]
35	minimum_len: usize,
36	}
37
38	impl Teddy {
39	pub(crate) fn new<B: AsRef<[u8]>>(
40	kind: MatchKind,
41	needles: &[B],
42	) -> Option<Teddy> {
43	#[cfg(not(feature = "perf-literal-multisubstring"))]
44	{
45	None
46	}
47	#[cfg(feature = "perf-literal-multisubstring")]
48	{
49	// We only really support leftmost-first semantics. In
50	// theory we could at least support leftmost-longest, as the
51	// aho-corasick crate does, but regex-automata doesn't know about
52	// leftmost-longest currently.
53	//
54	// And like the aho-corasick prefilter, if we're using `All`
55	// semantics, then we can still use leftmost semantics for a
56	// prefilter. (This might be a suspicious choice for the literal
57	// engine, which uses a prefilter as a regex engine directly, but
58	// that only happens when using leftmost-first semantics.)
59	let (packed_match_kind, ac_match_kind) = match kind {
60	MatchKind::LeftmostFirst \| MatchKind::All => (
61	aho_corasick::packed::MatchKind::LeftmostFirst,
62	aho_corasick::MatchKind::LeftmostFirst,
63	),
64	};
65	let minimum_len =
66	needles.iter().map(\|n\| n.as_ref().len()).min().unwrap_or(`0`);
67	let packed = aho_corasick::packed::Config::new()
68	.match_kind(packed_match_kind)
69	.builder()
70	.extend(needles)
71	.build()?;
72	let anchored_ac = aho_corasick::dfa::DFA::builder()
73	.match_kind(ac_match_kind)
74	.start_kind(aho_corasick::StartKind::Anchored)
75	.prefilter(`false`)
76	.build(needles)
77	.ok()?;
78	Some(Teddy { searcher: packed, anchored_ac, minimum_len })
79	}
80	}
81	}
82
83	impl PrefilterI for Teddy {
84	fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
85	#[cfg(not(feature = "perf-literal-multisubstring"))]
86	{
87	unreachable!()
88	}
89	#[cfg(feature = "perf-literal-multisubstring")]
90	{
91	let ac_span =
92	aho_corasick::Span { start: span.start, end: span.end };
93	self.searcher
94	.find_in(haystack, ac_span)
95	.map(\|m\| Span { start: m.start(), end: m.end() })
96	}
97	}
98
99	fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
100	#[cfg(not(feature = "perf-literal-multisubstring"))]
101	{
102	unreachable!()
103	}
104	#[cfg(feature = "perf-literal-multisubstring")]
105	{
106	use aho_corasick::automaton::Automaton;
107	let input = aho_corasick::Input::new(haystack)
108	.anchored(aho_corasick::Anchored::Yes)
109	.span(span.start..span.end);
110	self.anchored_ac
111	.try_find(&input)
112	// OK because we build the DFA with anchored support.
113	.expect("aho-corasick DFA should never fail")
114	.map(\|m\| Span { start: m.start(), end: m.end() })
115	}
116	}
117
118	fn memory_usage(&self) -> usize {
119	#[cfg(not(feature = "perf-literal-multisubstring"))]
120	{
121	unreachable!()
122	}
123	#[cfg(feature = "perf-literal-multisubstring")]
124	{
125	use aho_corasick::automaton::Automaton;
126	self.searcher.memory_usage() + self.anchored_ac.memory_usage()
127	}
128	}
129
130	fn is_fast(&self) -> bool {
131	#[cfg(not(feature = "perf-literal-multisubstring"))]
132	{
133	unreachable!()
134	}
135	#[cfg(feature = "perf-literal-multisubstring")]
136	{
137	// Teddy is usually quite fast, but I have seen some cases where
138	// a large number of literals can overwhelm it and make it not so
139	// fast. We make an educated but conservative guess at a limit, at
140	// which point, we're not so comfortable thinking Teddy is "fast."
141	//
142	// Well... this used to incorporate a "limit" on the number
143	// of literals, but I have since changed it to a minimum on the
144	// smallest* literal. Namely, when there is a very small literal*
145	// (1 or 2 bytes), it is far more likely that it leads to a higher
146	// false positive rate. (Although, of course, not always. For
147	// example, 'zq' is likely to have a very low false positive rate.)
148	// But when we have 3 bytes, we have a really good chance of being
149	// quite discriminatory and thus fast.
150	//
151	// We may still want to add some kind of limit on the number of
152	// literals here, but keep in mind that Teddy already has its own
153	// somewhat small limit (64 at time of writing). The main issue
154	// here is that if 'is_fast' is false, it opens the door for the
155	// reverse inner optimization to kick in. We really only want to
156	// resort to the reverse inner optimization if we absolutely must.
157	self.minimum_len >= `3`
158	}
159	}
160	}
161