mod.rs source code [crates/regex_syntax/src/hir/mod.rs]

1	/!*
2	Defines a high-level intermediate (HIR) representation for regular expressions.
3
4	The HIR is represented by the [`Hir`] type, and it principally constructed via
5	[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6	may use the smart constructors defined on `Hir` to build their own by hand. The
7	smart constructors simultaneously simplify and "optimize" the HIR, and are also
8	the same routines used by translation.
9
10	Most regex engines only have an HIR like this, and usually construct it
11	directly from the concrete syntax. This crate however first parses the
12	concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13	as mentioned above. It's done this way to facilitate better error reporting,
14	and to have a structured representation of a regex that faithfully represents
15	its concrete syntax. Namely, while an `Hir` value can be converted back to an
16	equivalent regex pattern string, it is unlikely to look like the original due
17	to its simplified structure.
18	*/
19
20	use core::{char, cmp};
21
22	use alloc::{
23	boxed::Box,
24	format,
25	string::{String, ToString},
26	vec,
27	vec::Vec,
28	};
29
30	use crate::{
31	ast::Span,
32	hir::interval::{Interval, IntervalSet, IntervalSetIter},
33	unicode,
34	};
35
36	pub use crate::{
37	hir::visitor::{visit, Visitor},
38	unicode::CaseFoldError,
39	};
40
41	mod interval;
42	pub mod literal;
43	pub mod print;
44	pub mod translate;
45	mod visitor;
46
47	/// An error that can occur while translating an `Ast` to a `Hir`.
48	#[derive(Clone, Debug, Eq, PartialEq)]
49	pub struct Error {
50	/// The kind of error.
51	kind: ErrorKind,
52	/// The original pattern that the translator's Ast was parsed from. Every
53	/// span in an error is a valid range into this string.
54	pattern: String,
55	/// The span of this error, derived from the Ast given to the translator.
56	span: Span,
57	}
58
59	impl Error {
60	/// Return the type of this error.
61	pub fn kind(&self) -> &ErrorKind {
62	&self.kind
63	}
64
65	/// The original pattern string in which this error occurred.
66	///
67	/// Every span reported by this error is reported in terms of this string.
68	pub fn pattern(&self) -> &str {
69	&self.pattern
70	}
71
72	/// Return the span at which this error occurred.
73	pub fn span(&self) -> &Span {
74	&self.span
75	}
76	}
77
78	/// The type of an error that occurred while building an `Hir`.
79	///
80	/// This error type is marked as `non_exhaustive`. This means that adding a
81	/// new variant is not considered a breaking change.
82	#[non_exhaustive]
83	#[derive(Clone, Debug, Eq, PartialEq)]
84	pub enum ErrorKind {
85	/// This error occurs when a Unicode feature is used when Unicode
86	/// support is disabled. For example `(?-u:\pL)` would trigger this error.
87	UnicodeNotAllowed,
88	/// This error occurs when translating a pattern that could match a byte
89	/// sequence that isn't UTF-8 and `utf8` was enabled.
90	InvalidUtf8,
91	/// This error occurs when one uses a non-ASCII byte for a line terminator,
92	/// but where Unicode mode is enabled and UTF-8 mode is disabled.
93	InvalidLineTerminator,
94	/// This occurs when an unrecognized Unicode property name could not
95	/// be found.
96	UnicodePropertyNotFound,
97	/// This occurs when an unrecognized Unicode property value could not
98	/// be found.
99	UnicodePropertyValueNotFound,
100	/// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101	/// `\d`) could not be found. This can occur when the `unicode-perl`
102	/// crate feature is not enabled.
103	UnicodePerlClassNotFound,
104	/// This occurs when the Unicode simple case mapping tables are not
105	/// available, and the regular expression required Unicode aware case
106	/// insensitivity.
107	UnicodeCaseUnavailable,
108	}
109
110	#[cfg(feature = "std")]
111	impl std::error::Error for Error {}
112
113	impl core::fmt::Display for Error {
114	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115	crate::error::Formatter::from(self).fmt(f)
116	}
117	}
118
119	impl core::fmt::Display for ErrorKind {
120	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121	use self::ErrorKind::*;
122
123	let msg: &'static str = match *self {
124	UnicodeNotAllowed => "Unicode not allowed here",
125	InvalidUtf8 => "pattern can match invalid UTF-8",
126	InvalidLineTerminator => "invalid line terminator, must be ASCII",
127	UnicodePropertyNotFound => "Unicode property not found",
128	UnicodePropertyValueNotFound => "Unicode property value not found",
129	UnicodePerlClassNotFound => {
130	"Unicode-aware Perl class not found \
131	(make sure the unicode-perl feature is enabled)"
132	}
133	UnicodeCaseUnavailable => {
134	"Unicode-aware case insensitivity matching is not available \
135	(make sure the unicode-case feature is enabled)"
136	}
137	};
138	f.write_str(data:msg)
139	}
140	}
141
142	/// A high-level intermediate representation (HIR) for a regular expression.
143	///
144	/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145	/// An `HirKind` indicates what kind of regular expression it is (a literal,
146	/// a repetition, a look-around assertion, etc.), where as a `Properties`
147	/// describes various facts about the regular expression. For example, whether
148	/// it matches UTF-8 or if it matches the empty string.
149	///
150	/// The HIR of a regular expression represents an intermediate step between
151	/// its abstract syntax (a structured description of the concrete syntax) and
152	/// an actual regex matcher. The purpose of HIR is to make regular expressions
153	/// easier to analyze. In particular, the AST is much more complex than the
154	/// HIR. For example, while an AST supports arbitrarily nested character
155	/// classes, the HIR will flatten all nested classes into a single set. The HIR
156	/// will also "compile away" every flag present in the concrete syntax. For
157	/// example, users of HIR expressions never need to worry about case folding;
158	/// it is handled automatically by the translator (e.g., by translating
159	/// `(?i:A)` to `[aA]`).
160	///
161	/// The specific type of an HIR expression can be accessed via its `kind`
162	/// or `into_kind` methods. This extra level of indirection exists for two
163	/// reasons:
164	///
165	/// 1. Construction of an HIR expression must* use the constructor methods on*
166	/// this `Hir` type instead of building the `HirKind` values directly. This
167	/// permits construction to enforce invariants like "concatenations always
168	/// consist of two or more sub-expressions."
169	/// 2. Every HIR expression contains attributes that are defined inductively,
170	/// and can be computed cheaply during the construction process. For example,
171	/// one such attribute is whether the expression must match at the beginning of
172	/// the haystack.
173	///
174	/// In particular, if you have an `HirKind` value, then there is intentionally
175	/// no way to build an `Hir` value from it. You instead need to do case
176	/// analysis on the `HirKind` value and build the `Hir` value using its smart
177	/// constructors.
178	///
179	/// # UTF-8
180	///
181	/// If the HIR was produced by a translator with
182	/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183	/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184	/// matches.
185	///
186	/// For empty matches, those can occur at any position. It is the
187	/// responsibility of the regex engine to determine whether empty matches are
188	/// permitted between the code units of a single codepoint.
189	///
190	/// # Stack space
191	///
192	/// This type defines its own destructor that uses constant stack space and
193	/// heap space proportional to the size of the HIR.
194	///
195	/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196	/// expression pattern string, and uses constant stack space and heap space
197	/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198	/// be _semantically_ equivalent to the original concrete syntax, but it may
199	/// look very different. (And potentially not practically readable by a human.)
200	///
201	/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202	/// stack space. The implementation will also suppress some details (such as
203	/// the `Properties` inlined into every `Hir` value to make it less noisy).
204	#[derive(Clone, Eq, PartialEq)]
205	pub struct Hir {
206	/// The underlying HIR kind.
207	kind: HirKind,
208	/// Analysis info about this HIR, computed during construction.
209	props: Properties,
210	}
211
212	/// Methods for accessing the underlying `HirKind` and `Properties`.
213	impl Hir {
214	/// Returns a reference to the underlying HIR kind.
215	pub fn kind(&self) -> &HirKind {
216	&self.kind
217	}
218
219	/// Consumes ownership of this HIR expression and returns its underlying
220	/// `HirKind`.
221	pub fn into_kind(mut self) -> HirKind {
222	core::mem::replace(&mut self.kind, HirKind::Empty)
223	}
224
225	/// Returns the properties computed for this `Hir`.
226	pub fn properties(&self) -> &Properties {
227	&self.props
228	}
229
230	/// Splits this HIR into its constituent parts.
231	///
232	/// This is useful because `let Hir { kind, props } = hir;` does not work
233	/// because of `Hir`'s custom `Drop` implementation.
234	fn into_parts(mut self) -> (HirKind, Properties) {
235	(
236	core::mem::replace(&mut self.kind, HirKind::Empty),
237	core::mem::replace(&mut self.props, Properties::empty()),
238	)
239	}
240	}
241
242	/// Smart constructors for HIR values.
243	///
244	/// These constructors are called "smart" because they do inductive work or
245	/// simplifications. For example, calling `Hir::repetition` with a repetition
246	/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247	/// since it is equivalent to an empty regex. Another example is calling
248	/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249	/// just get back the original `expr` since it's precisely equivalent.
250	///
251	/// Smart constructors enable maintaining invariants about the HIR data type
252	/// while also simulanteously keeping the representation as simple as possible.
253	impl Hir {
254	/// Returns an empty HIR expression.
255	///
256	/// An empty HIR expression always matches, including the empty string.
257	#[inline]
258	pub fn empty() -> Hir {
259	let props = Properties::empty();
260	Hir { kind: HirKind::Empty, props }
261	}
262
263	/// Returns an HIR expression that can never match anything. That is,
264	/// the size of the set of strings in the language described by the HIR
265	/// returned is `0`.
266	///
267	/// This is distinct from [`Hir::empty`] in that the empty string matches
268	/// the HIR returned by `Hir::empty`. That is, the set of strings in the
269	/// language describe described by `Hir::empty` is non-empty.
270	///
271	/// Note that currently, the HIR returned uses an empty character class to
272	/// indicate that nothing can match. An equivalent expression that cannot
273	/// match is an empty alternation, but all such "fail" expressions are
274	/// normalized (via smart constructors) to empty character classes. This is
275	/// because empty character classes can be spelled in the concrete syntax
276	/// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277	/// empty alternations cannot.
278	#[inline]
279	pub fn fail() -> Hir {
280	let class = Class::Bytes(ClassBytes::empty());
281	let props = Properties::class(&class);
282	// We can't just call Hir::class here because it defers to Hir::fail
283	// in order to canonicalize the Hir value used to represent "cannot
284	// match."
285	Hir { kind: HirKind::Class(class), props }
286	}
287
288	/// Creates a literal HIR expression.
289	///
290	/// This accepts anything that can be converted into a `Box<[u8]>`.
291	///
292	/// Note that there is no mechanism for storing a `char` or a `Box<str>`
293	/// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294	/// any HIR node) matches valid UTF-8 exclusively can be queried via
295	/// [`Properties::is_utf8`].
296	///
297	/// # Example
298	///
299	/// This example shows that concatenations of `Literal` HIR values will
300	/// automatically get flattened and combined together. So for example, even
301	/// if you concat multiple `Literal` values that are themselves not valid
302	/// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303	/// how "smart" Hir's smart constructors are.
304	///
305	/// ```
306	/// use regex_syntax::hir::{Hir, HirKind, Literal};
307	///
308	/// let literals = vec![
309	/// Hir::literal([`0xE2`]),
310	/// Hir::literal([`0x98`]),
311	/// Hir::literal([`0x83`]),
312	/// ];
313	/// // Each literal, on its own, is invalid UTF-8.
314	/// assert!(literals.iter().all(\|hir\| !hir.properties().is_utf8()));
315	///
316	/// let concat = Hir::concat(literals);
317	/// // But the concatenation is valid UTF-8!
318	/// assert!(concat.properties().is_utf8());
319	///
320	/// // And also notice that the literals have been concatenated into a
321	/// // single `Literal`, to the point where there is no explicit `Concat`!
322	/// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323	/// assert_eq!(&expected, concat.kind());
324	/// ```
325	///
326	/// # Example: building a literal from a `char`
327	///
328	/// This example shows how to build a single `Hir` literal from a `char`
329	/// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330	/// encode a `char` value:
331	///
332	/// ```
333	/// use regex_syntax::hir::{Hir, HirKind, Literal};
334	///
335	/// let ch = '☃';
336	/// let got = Hir::literal(ch.encode_utf8(&mut [`0`; `4`]).as_bytes());
337	///
338	/// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339	/// assert_eq!(&expected, got.kind());
340	/// ```
341	#[inline]
342	pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343	let bytes = lit.into();
344	if bytes.is_empty() {
345	return Hir::empty();
346	}
347
348	let lit = Literal(bytes);
349	let props = Properties::literal(&lit);
350	Hir { kind: HirKind::Literal(lit), props }
351	}
352
353	/// Creates a class HIR expression. The class may either be defined over
354	/// ranges of Unicode codepoints or ranges of raw byte values.
355	///
356	/// Note that an empty class is permitted. An empty class is equivalent to
357	/// `Hir::fail()`.
358	#[inline]
359	pub fn class(class: Class) -> Hir {
360	if class.is_empty() {
361	return Hir::fail();
362	} else if let Some(bytes) = class.literal() {
363	return Hir::literal(bytes);
364	}
365	let props = Properties::class(&class);
366	Hir { kind: HirKind::Class(class), props }
367	}
368
369	/// Creates a look-around assertion HIR expression.
370	#[inline]
371	pub fn look(look: Look) -> Hir {
372	let props = Properties::look(look);
373	Hir { kind: HirKind::Look(look), props }
374	}
375
376	/// Creates a repetition HIR expression.
377	#[inline]
378	pub fn repetition(mut rep: Repetition) -> Hir {
379	// If the sub-expression of a repetition can only match the empty
380	// string, then we force its maximum to be at most 1.
381	if rep.sub.properties().maximum_len() == Some(`0`) {
382	rep.min = cmp::min(rep.min, `1`);
383	rep.max = rep.max.map(\|n\| cmp::min(n, `1`)).or(Some(`1`));
384	}
385	// The regex 'a{0}' is always equivalent to the empty regex. This is
386	// true even when 'a' is an expression that never matches anything
387	// (like '\P{any}').
388	//
389	// Additionally, the regex 'a{1}' is always equivalent to 'a'.
390	if rep.min == `0` && rep.max == Some(`0`) {
391	return Hir::empty();
392	} else if rep.min == `1` && rep.max == Some(`1`) {
393	return *rep.sub;
394	}
395	let props = Properties::repetition(&rep);
396	Hir { kind: HirKind::Repetition(rep), props }
397	}
398
399	/// Creates a capture HIR expression.
400	///
401	/// Note that there is no explicit HIR value for a non-capturing group.
402	/// Since a non-capturing group only exists to override precedence in the
403	/// concrete syntax and since an HIR already does its own grouping based on
404	/// what is parsed, there is no need to explicitly represent non-capturing
405	/// groups in the HIR.
406	#[inline]
407	pub fn capture(capture: Capture) -> Hir {
408	let props = Properties::capture(&capture);
409	Hir { kind: HirKind::Capture(capture), props }
410	}
411
412	/// Returns the concatenation of the given expressions.
413	///
414	/// This attempts to flatten and simplify the concatenation as appropriate.
415	///
416	/// # Example
417	///
418	/// This shows a simple example of basic flattening of both concatenations
419	/// and literals.
420	///
421	/// ```
422	/// use regex_syntax::hir::Hir;
423	///
424	/// let hir = Hir::concat(vec![
425	/// Hir::concat(vec![
426	/// Hir::literal([b'a']),
427	/// Hir::literal([b'b']),
428	/// Hir::literal([b'c']),
429	/// ]),
430	/// Hir::concat(vec![
431	/// Hir::literal([b'x']),
432	/// Hir::literal([b'y']),
433	/// Hir::literal([b'z']),
434	/// ]),
435	/// ]);
436	/// let expected = Hir::literal("abcxyz".as_bytes());
437	/// assert_eq!(expected, hir);
438	/// ```
439	pub fn concat(subs: Vec<Hir>) -> Hir {
440	// We rebuild the concatenation by simplifying it. Would be nice to do
441	// it in place, but that seems a little tricky?
442	let mut new = vec![];
443	// This gobbles up any adjacent literals in a concatenation and smushes
444	// them together. Basically, when we see a literal, we add its bytes
445	// to 'prior_lit', and whenever we see anything else, we first take
446	// any bytes in 'prior_lit' and add it to the 'new' concatenation.
447	let mut prior_lit: Option<Vec<u8>> = None;
448	for sub in subs {
449	let (kind, props) = sub.into_parts();
450	match kind {
451	HirKind::Literal(Literal(bytes)) => {
452	if let Some(ref mut prior_bytes) = prior_lit {
453	prior_bytes.extend_from_slice(&bytes);
454	} else {
455	prior_lit = Some(bytes.to_vec());
456	}
457	}
458	// We also flatten concats that are direct children of another
459	// concat. We only need to do this one level deep since
460	// Hir::concat is the only way to build concatenations, and so
461	// flattening happens inductively.
462	HirKind::Concat(subs2) => {
463	for sub2 in subs2 {
464	let (kind2, props2) = sub2.into_parts();
465	match kind2 {
466	HirKind::Literal(Literal(bytes)) => {
467	if let Some(ref mut prior_bytes) = prior_lit {
468	prior_bytes.extend_from_slice(&bytes);
469	} else {
470	prior_lit = Some(bytes.to_vec());
471	}
472	}
473	kind2 => {
474	if let Some(prior_bytes) = prior_lit.take() {
475	new.push(Hir::literal(prior_bytes));
476	}
477	new.push(Hir { kind: kind2, props: props2 });
478	}
479	}
480	}
481	}
482	// We can just skip empty HIRs.
483	HirKind::Empty => {}
484	kind => {
485	if let Some(prior_bytes) = prior_lit.take() {
486	new.push(Hir::literal(prior_bytes));
487	}
488	new.push(Hir { kind, props });
489	}
490	}
491	}
492	if let Some(prior_bytes) = prior_lit.take() {
493	new.push(Hir::literal(prior_bytes));
494	}
495	if new.is_empty() {
496	return Hir::empty();
497	} else if new.len() == `1` {
498	return new.pop().unwrap();
499	}
500	let props = Properties::concat(&new);
501	Hir { kind: HirKind::Concat(new), props }
502	}
503
504	/// Returns the alternation of the given expressions.
505	///
506	/// This flattens and simplifies the alternation as appropriate. This may
507	/// include factoring out common prefixes or even rewriting the alternation
508	/// as a character class.
509	///
510	/// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511	/// is not possible for one to write an empty alternation, or even an
512	/// alternation with a single sub-expression, in the concrete syntax of a
513	/// regex.)
514	///
515	/// # Example
516	///
517	/// This is a simple example showing how an alternation might get
518	/// simplified.
519	///
520	/// ```
521	/// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522	///
523	/// let hir = Hir::alternation(vec![
524	/// Hir::literal([b'a']),
525	/// Hir::literal([b'b']),
526	/// Hir::literal([b'c']),
527	/// Hir::literal([b'd']),
528	/// Hir::literal([b'e']),
529	/// Hir::literal([b'f']),
530	/// ]);
531	/// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532	/// ClassUnicodeRange::new('a', 'f'),
533	/// ])));
534	/// assert_eq!(expected, hir);
535	/// ```
536	///
537	/// And another example showing how common prefixes might get factored
538	/// out.
539	///
540	/// ```
541	/// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542	///
543	/// let hir = Hir::alternation(vec![
544	/// Hir::concat(vec![
545	/// Hir::literal("abc".as_bytes()),
546	/// Hir::class(Class::Unicode(ClassUnicode::new([
547	/// ClassUnicodeRange::new('A', 'Z'),
548	/// ]))),
549	/// ]),
550	/// Hir::concat(vec![
551	/// Hir::literal("abc".as_bytes()),
552	/// Hir::class(Class::Unicode(ClassUnicode::new([
553	/// ClassUnicodeRange::new('a', 'z'),
554	/// ]))),
555	/// ]),
556	/// ]);
557	/// let expected = Hir::concat(vec![
558	/// Hir::literal("abc".as_bytes()),
559	/// Hir::alternation(vec![
560	/// Hir::class(Class::Unicode(ClassUnicode::new([
561	/// ClassUnicodeRange::new('A', 'Z'),
562	/// ]))),
563	/// Hir::class(Class::Unicode(ClassUnicode::new([
564	/// ClassUnicodeRange::new('a', 'z'),
565	/// ]))),
566	/// ]),
567	/// ]);
568	/// assert_eq!(expected, hir);
569	/// ```
570	///
571	/// Note that these sorts of simplifications are not guaranteed.
572	pub fn alternation(subs: Vec<Hir>) -> Hir {
573	// We rebuild the alternation by simplifying it. We proceed similarly
574	// as the concatenation case. But in this case, there's no literal
575	// simplification happening. We're just flattening alternations.
576	let mut new = Vec::with_capacity(subs.len());
577	for sub in subs {
578	let (kind, props) = sub.into_parts();
579	match kind {
580	HirKind::Alternation(subs2) => {
581	new.extend(subs2);
582	}
583	kind => {
584	new.push(Hir { kind, props });
585	}
586	}
587	}
588	if new.is_empty() {
589	return Hir::fail();
590	} else if new.len() == `1` {
591	return new.pop().unwrap();
592	}
593	// Now that it's completely flattened, look for the special case of
594	// 'char1\|char2\|...\|charN' and collapse that into a class. Note that
595	// we look for 'char' first and then bytes. The issue here is that if
596	// we find both non-ASCII codepoints and non-ASCII singleton bytes,
597	// then it isn't actually possible to smush them into a single class.
598	// (Because classes are either "all codepoints" or "all bytes." You
599	// can have a class that both matches non-ASCII but valid UTF-8 and
600	// invalid UTF-8.) So we look for all chars and then all bytes, and
601	// don't handle anything else.
602	if let Some(singletons) = singleton_chars(&new) {
603	let it = singletons
604	.into_iter()
605	.map(\|ch\| ClassUnicodeRange { start: ch, end: ch });
606	return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607	}
608	if let Some(singletons) = singleton_bytes(&new) {
609	let it = singletons
610	.into_iter()
611	.map(\|b\| ClassBytesRange { start: b, end: b });
612	return Hir::class(Class::Bytes(ClassBytes::new(it)));
613	}
614	// Similar to singleton chars, we can also look for alternations of
615	// classes. Those can be smushed into a single class.
616	if let Some(cls) = class_chars(&new) {
617	return Hir::class(cls);
618	}
619	if let Some(cls) = class_bytes(&new) {
620	return Hir::class(cls);
621	}
622	// Factor out a common prefix if we can, which might potentially
623	// simplify the expression and unlock other optimizations downstream.
624	// It also might generally make NFA matching and DFA construction
625	// faster by reducing the scope of branching in the regex.
626	new = match lift_common_prefix(new) {
627	Ok(hir) => return hir,
628	Err(unchanged) => unchanged,
629	};
630	let props = Properties::alternation(&new);
631	Hir { kind: HirKind::Alternation(new), props }
632	}
633
634	/// Returns an HIR expression for `.`.
635	///
636	/// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637	/// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638	/// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639	/// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640	/// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641	/// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642	///
643	/// # Example
644	///
645	/// Note that this is a convenience routine for constructing the correct
646	/// character class based on the value of `Dot`. There is no explicit "dot"
647	/// HIR value. It is just an abbreviation for a common character class.
648	///
649	/// ```
650	/// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651	///
652	/// let hir = Hir::dot(Dot::AnyByte);
653	/// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654	/// ClassBytesRange::new(`0x00`, `0xFF`),
655	/// ])));
656	/// assert_eq!(expected, hir);
657	/// ```
658	#[inline]
659	pub fn dot(dot: Dot) -> Hir {
660	match dot {
661	Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662	ClassUnicodeRange::new('`\0`', '`\u{10FFFF}`'),
663	]))),
664	Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665	ClassBytesRange::new(b'`\0`', b'`\xFF`'),
666	]))),
667	Dot::AnyCharExcept(ch) => {
668	let mut cls =
669	ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670	cls.negate();
671	Hir::class(Class::Unicode(cls))
672	}
673	Dot::AnyCharExceptLF => {
674	Hir::class(Class::Unicode(ClassUnicode::new([
675	ClassUnicodeRange::new('`\0`', '`\x09`'),
676	ClassUnicodeRange::new('`\x0B`', '`\u{10FFFF}`'),
677	])))
678	}
679	Dot::AnyCharExceptCRLF => {
680	Hir::class(Class::Unicode(ClassUnicode::new([
681	ClassUnicodeRange::new('`\0`', '`\x09`'),
682	ClassUnicodeRange::new('`\x0B`', '`\x0C`'),
683	ClassUnicodeRange::new('`\x0E`', '`\u{10FFFF}`'),
684	])))
685	}
686	Dot::AnyByteExcept(byte) => {
687	let mut cls =
688	ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689	cls.negate();
690	Hir::class(Class::Bytes(cls))
691	}
692	Dot::AnyByteExceptLF => {
693	Hir::class(Class::Bytes(ClassBytes::new([
694	ClassBytesRange::new(b'`\0`', b'`\x09`'),
695	ClassBytesRange::new(b'`\x0B`', b'`\xFF`'),
696	])))
697	}
698	Dot::AnyByteExceptCRLF => {
699	Hir::class(Class::Bytes(ClassBytes::new([
700	ClassBytesRange::new(b'`\0`', b'`\x09`'),
701	ClassBytesRange::new(b'`\x0B`', b'`\x0C`'),
702	ClassBytesRange::new(b'`\x0E`', b'`\xFF`'),
703	])))
704	}
705	}
706	}
707	}
708
709	/// The underlying kind of an arbitrary [`Hir`] expression.
710	///
711	/// An `HirKind` is principally useful for doing case analysis on the type
712	/// of a regular expression. If you're looking to build new `Hir` values,
713	/// then you _must_ use the smart constructors defined on `Hir`, like
714	/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
715	/// not expose any way of building an `Hir` directly from an `HirKind`.
716	#[derive(Clone, Debug, Eq, PartialEq)]
717	pub enum HirKind {
718	/// The empty regular expression, which matches everything, including the
719	/// empty string.
720	Empty,
721	/// A literalstring that matches exactly these bytes.
722	Literal(Literal),
723	/// A single character class that matches any of the characters in the
724	/// class. A class can either consist of Unicode scalar values as
725	/// characters, or it can use bytes.
726	///
727	/// A class may be empty. In which case, it matches nothing.
728	Class(Class),
729	/// A look-around assertion. A look-around match always has zero length.
730	Look(Look),
731	/// A repetition operation applied to a sub-expression.
732	Repetition(Repetition),
733	/// A capturing group, which contains a sub-expression.
734	Capture(Capture),
735	/// A concatenation of expressions.
736	///
737	/// A concatenation matches only if each of its sub-expressions match one
738	/// after the other.
739	///
740	/// Concatenations are guaranteed by `Hir`'s smart constructors to always
741	/// have at least two sub-expressions.
742	Concat(Vec<Hir>),
743	/// An alternation of expressions.
744	///
745	/// An alternation matches only if at least one of its sub-expressions
746	/// match. If multiple sub-expressions match, then the leftmost is
747	/// preferred.
748	///
749	/// Alternations are guaranteed by `Hir`'s smart constructors to always
750	/// have at least two sub-expressions.
751	Alternation(Vec<Hir>),
752	}
753
754	impl HirKind {
755	/// Returns a slice of this kind's sub-expressions, if any.
756	pub fn subs(&self) -> &[Hir] {
757	use core::slice::from_ref;
758
759	match *self {
760	HirKind::Empty
761	\| HirKind::Literal(_)
762	\| HirKind::Class(_)
763	\| HirKind::Look(_) => &[],
764	HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub),
765	HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub),
766	HirKind::Concat(ref subs: &Vec) => subs,
767	HirKind::Alternation(ref subs: &Vec) => subs,
768	}
769	}
770	}
771
772	impl core::fmt::Debug for Hir {
773	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774	self.kind.fmt(f)
775	}
776	}
777
778	/// Print a display representation of this Hir.
779	///
780	/// The result of this is a valid regular expression pattern string.
781	///
782	/// This implementation uses constant stack space and heap space proportional
783	/// to the size of the `Hir`.
784	impl core::fmt::Display for Hir {
785	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
786	crate::hir::print::Printer::new().print(self, wtr:f)
787	}
788	}
789
790	/// The high-level intermediate representation of a literal.
791	///
792	/// A literal corresponds to `0` or more bytes that should be matched
793	/// literally. The smart constructors defined on `Hir` will automatically
794	/// concatenate adjacent literals into one literal, and will even automatically
795	/// replace empty literals with `Hir::empty()`.
796	///
797	/// Note that despite a literal being represented by a sequence of bytes, its
798	/// `Debug` implementation will attempt to print it as a normal string. (That
799	/// is, not a sequence of decimal numbers.)
800	#[derive(Clone, Eq, PartialEq)]
801	pub struct Literal(pub Box<[u8]>);
802
803	impl core::fmt::Debug for Literal {
804	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
805	crate::debug::Bytes(&self.0).fmt(f)
806	}
807	}
808
809	/// The high-level intermediate representation of a character class.
810	///
811	/// A character class corresponds to a set of characters. A character is either
812	/// defined by a Unicode scalar value or a byte.
813	///
814	/// A character class, regardless of its character type, is represented by a
815	/// sequence of non-overlapping non-adjacent ranges of characters.
816	///
817	/// There are no guarantees about which class variant is used. Generally
818	/// speaking, the Unicode variat is used whenever a class needs to contain
819	/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
820	/// when Unicode mode is disabled. For example, at the time of writing, the
821	/// regex `(?-u:a\|\xc2\xa0)` will compile down to HIR for the Unicode class
822	/// `[a\u00A0]` due to optimizations.
823	///
824	/// Note that `Bytes` variant may be produced even when it exclusively matches
825	/// valid UTF-8. This is because a `Bytes` variant represents an intention by
826	/// the author of the regular expression to disable Unicode mode, which in turn
827	/// impacts the semantics of case insensitive matching. For example, `(?i)k`
828	/// and `(?i-u)k` will not match the same set of strings.
829	#[derive(Clone, Eq, PartialEq)]
830	pub enum Class {
831	/// A set of characters represented by Unicode scalar values.
832	Unicode(ClassUnicode),
833	/// A set of characters represented by arbitrary bytes (one byte per
834	/// character).
835	Bytes(ClassBytes),
836	}
837
838	impl Class {
839	/// Apply Unicode simple case folding to this character class, in place.
840	/// The character class will be expanded to include all simple case folded
841	/// character variants.
842	///
843	/// If this is a byte oriented character class, then this will be limited
844	/// to the ASCII ranges `A-Z` and `a-z`.
845	///
846	/// # Panics
847	///
848	/// This routine panics when the case mapping data necessary for this
849	/// routine to complete is unavailable. This occurs when the `unicode-case`
850	/// feature is not enabled and the underlying class is Unicode oriented.
851	///
852	/// Callers should prefer using `try_case_fold_simple` instead, which will
853	/// return an error instead of panicking.
854	pub fn case_fold_simple(&mut self) {
855	match *self {
856	Class::Unicode(ref mut x) => x.case_fold_simple(),
857	Class::Bytes(ref mut x) => x.case_fold_simple(),
858	}
859	}
860
861	/// Apply Unicode simple case folding to this character class, in place.
862	/// The character class will be expanded to include all simple case folded
863	/// character variants.
864	///
865	/// If this is a byte oriented character class, then this will be limited
866	/// to the ASCII ranges `A-Z` and `a-z`.
867	///
868	/// # Error
869	///
870	/// This routine returns an error when the case mapping data necessary
871	/// for this routine to complete is unavailable. This occurs when the
872	/// `unicode-case` feature is not enabled and the underlying class is
873	/// Unicode oriented.
874	pub fn try_case_fold_simple(
875	&mut self,
876	) -> core::result::Result<(), CaseFoldError> {
877	match *self {
878	Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
879	Class::Bytes(ref mut x) => x.case_fold_simple(),
880	}
881	Ok(())
882	}
883
884	/// Negate this character class in place.
885	///
886	/// After completion, this character class will contain precisely the
887	/// characters that weren't previously in the class.
888	pub fn negate(&mut self) {
889	match *self {
890	Class::Unicode(ref mut x) => x.negate(),
891	Class::Bytes(ref mut x) => x.negate(),
892	}
893	}
894
895	/// Returns true if and only if this character class will only ever match
896	/// valid UTF-8.
897	///
898	/// A character class can match invalid UTF-8 only when the following
899	/// conditions are met:
900	///
901	/// 1. The translator was configured to permit generating an expression
902	/// that can match invalid UTF-8. (By default, this is disabled.)
903	/// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
904	/// syntax or in the parser builder. By default, Unicode mode is
905	/// enabled.
906	pub fn is_utf8(&self) -> bool {
907	match *self {
908	Class::Unicode(_) => `true`,
909	Class::Bytes(ref x) => x.is_ascii(),
910	}
911	}
912
913	/// Returns the length, in bytes, of the smallest string matched by this
914	/// character class.
915	///
916	/// For non-empty byte oriented classes, this always returns `1`. For
917	/// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
918	/// `4`. For empty classes, `None` is returned. It is impossible for `0` to
919	/// be returned.
920	///
921	/// # Example
922	///
923	/// This example shows some examples of regexes and their corresponding
924	/// minimum length, if any.
925	///
926	/// ```
927	/// use regex_syntax::{hir::Properties, parse};
928	///
929	/// // The empty string has a min length of 0.
930	/// let hir = parse(r"")?;
931	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
932	/// // As do other types of regexes that only match the empty string.
933	/// let hir = parse(r"^$\b\B")?;
934	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
935	/// // A regex that can match the empty string but match more is still 0.
936	/// let hir = parse(r"a*")?;
937	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
938	/// // A regex that matches nothing has no minimum defined.
939	/// let hir = parse(r"[a&&b]")?;
940	/// assert_eq!(None, hir.properties().minimum_len());
941	/// // Character classes usually have a minimum length of 1.
942	/// let hir = parse(r"\w")?;
943	/// assert_eq!(Some(`1`), hir.properties().minimum_len());
944	/// // But sometimes Unicode classes might be bigger!
945	/// let hir = parse(r"\p{Cyrillic}")?;
946	/// assert_eq!(Some(`2`), hir.properties().minimum_len());
947	///
948	/// # Ok::<(), Box<dyn std::error::Error>>(())
949	/// ```
950	pub fn minimum_len(&self) -> Option<usize> {
951	match *self {
952	Class::Unicode(ref x) => x.minimum_len(),
953	Class::Bytes(ref x) => x.minimum_len(),
954	}
955	}
956
957	/// Returns the length, in bytes, of the longest string matched by this
958	/// character class.
959	///
960	/// For non-empty byte oriented classes, this always returns `1`. For
961	/// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
962	/// `4`. For empty classes, `None` is returned. It is impossible for `0` to
963	/// be returned.
964	///
965	/// # Example
966	///
967	/// This example shows some examples of regexes and their corresponding
968	/// maximum length, if any.
969	///
970	/// ```
971	/// use regex_syntax::{hir::Properties, parse};
972	///
973	/// // The empty string has a max length of 0.
974	/// let hir = parse(r"")?;
975	/// assert_eq!(Some(`0`), hir.properties().maximum_len());
976	/// // As do other types of regexes that only match the empty string.
977	/// let hir = parse(r"^$\b\B")?;
978	/// assert_eq!(Some(`0`), hir.properties().maximum_len());
979	/// // A regex that matches nothing has no maximum defined.
980	/// let hir = parse(r"[a&&b]")?;
981	/// assert_eq!(None, hir.properties().maximum_len());
982	/// // Bounded repeats work as you expect.
983	/// let hir = parse(r"x{2,10}")?;
984	/// assert_eq!(Some(`10`), hir.properties().maximum_len());
985	/// // An unbounded repeat means there is no maximum.
986	/// let hir = parse(r"x{2,}")?;
987	/// assert_eq!(None, hir.properties().maximum_len());
988	/// // With Unicode enabled, \w can match up to 4 bytes!
989	/// let hir = parse(r"\w")?;
990	/// assert_eq!(Some(`4`), hir.properties().maximum_len());
991	/// // Without Unicode enabled, \w matches at most 1 byte.
992	/// let hir = parse(r"(?-u)\w")?;
993	/// assert_eq!(Some(`1`), hir.properties().maximum_len());
994	///
995	/// # Ok::<(), Box<dyn std::error::Error>>(())
996	/// ```
997	pub fn maximum_len(&self) -> Option<usize> {
998	match *self {
999	Class::Unicode(ref x) => x.maximum_len(),
1000	Class::Bytes(ref x) => x.maximum_len(),
1001	}
1002	}
1003
1004	/// Returns true if and only if this character class is empty. That is,
1005	/// it has no elements.
1006	///
1007	/// An empty character can never match anything, including an empty string.
1008	pub fn is_empty(&self) -> bool {
1009	match *self {
1010	Class::Unicode(ref x) => x.ranges().is_empty(),
1011	Class::Bytes(ref x) => x.ranges().is_empty(),
1012	}
1013	}
1014
1015	/// If this class consists of exactly one element (whether a codepoint or a
1016	/// byte), then return it as a literal byte string.
1017	///
1018	/// If this class is empty or contains more than one element, then `None`
1019	/// is returned.
1020	pub fn literal(&self) -> Option<Vec<u8>> {
1021	match *self {
1022	Class::Unicode(ref x) => x.literal(),
1023	Class::Bytes(ref x) => x.literal(),
1024	}
1025	}
1026	}
1027
1028	impl core::fmt::Debug for Class {
1029	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1030	use crate::debug::Byte;
1031
1032	let mut fmter: DebugSet<'_, '_> = f.debug_set();
1033	match *self {
1034	Class::Unicode(ref cls: &ClassUnicode) => {
1035	for r: &ClassUnicodeRange in cls.ranges().iter() {
1036	fmter.entry(&(r.start..=r.end));
1037	}
1038	}
1039	Class::Bytes(ref cls: &ClassBytes) => {
1040	for r: &ClassBytesRange in cls.ranges().iter() {
1041	fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1042	}
1043	}
1044	}
1045	fmter.finish()
1046	}
1047	}
1048
1049	/// A set of characters represented by Unicode scalar values.
1050	#[derive(Clone, Debug, Eq, PartialEq)]
1051	pub struct ClassUnicode {
1052	set: IntervalSet<ClassUnicodeRange>,
1053	}
1054
1055	impl ClassUnicode {
1056	/// Create a new class from a sequence of ranges.
1057	///
1058	/// The given ranges do not need to be in any specific order, and ranges
1059	/// may overlap. Ranges will automatically be sorted into a canonical
1060	/// non-overlapping order.
1061	pub fn new<I>(ranges: I) -> ClassUnicode
1062	where
1063	I: IntoIterator<Item = ClassUnicodeRange>,
1064	{
1065	ClassUnicode { set: IntervalSet::new(ranges) }
1066	}
1067
1068	/// Create a new class with no ranges.
1069	///
1070	/// An empty class matches nothing. That is, it is equivalent to
1071	/// [`Hir::fail`].
1072	pub fn empty() -> ClassUnicode {
1073	ClassUnicode::new(vec![])
1074	}
1075
1076	/// Add a new range to this set.
1077	pub fn push(&mut self, range: ClassUnicodeRange) {
1078	self.set.push(range);
1079	}
1080
1081	/// Return an iterator over all ranges in this class.
1082	///
1083	/// The iterator yields ranges in ascending order.
1084	pub fn iter(&self) -> ClassUnicodeIter<'_> {
1085	ClassUnicodeIter(self.set.iter())
1086	}
1087
1088	/// Return the underlying ranges as a slice.
1089	pub fn ranges(&self) -> &[ClassUnicodeRange] {
1090	self.set.intervals()
1091	}
1092
1093	/// Expand this character class such that it contains all case folded
1094	/// characters, according to Unicode's "simple" mapping. For example, if
1095	/// this class consists of the range `a-z`, then applying case folding will
1096	/// result in the class containing both the ranges `a-z` and `A-Z`.
1097	///
1098	/// # Panics
1099	///
1100	/// This routine panics when the case mapping data necessary for this
1101	/// routine to complete is unavailable. This occurs when the `unicode-case`
1102	/// feature is not enabled.
1103	///
1104	/// Callers should prefer using `try_case_fold_simple` instead, which will
1105	/// return an error instead of panicking.
1106	pub fn case_fold_simple(&mut self) {
1107	self.set
1108	.case_fold_simple()
1109	.expect("unicode-case feature must be enabled");
1110	}
1111
1112	/// Expand this character class such that it contains all case folded
1113	/// characters, according to Unicode's "simple" mapping. For example, if
1114	/// this class consists of the range `a-z`, then applying case folding will
1115	/// result in the class containing both the ranges `a-z` and `A-Z`.
1116	///
1117	/// # Error
1118	///
1119	/// This routine returns an error when the case mapping data necessary
1120	/// for this routine to complete is unavailable. This occurs when the
1121	/// `unicode-case` feature is not enabled.
1122	pub fn try_case_fold_simple(
1123	&mut self,
1124	) -> core::result::Result<(), CaseFoldError> {
1125	self.set.case_fold_simple()
1126	}
1127
1128	/// Negate this character class.
1129	///
1130	/// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1131	/// set, then it will not be in this set after negation.
1132	pub fn negate(&mut self) {
1133	self.set.negate();
1134	}
1135
1136	/// Union this character class with the given character class, in place.
1137	pub fn union(&mut self, other: &ClassUnicode) {
1138	self.set.union(&other.set);
1139	}
1140
1141	/// Intersect this character class with the given character class, in
1142	/// place.
1143	pub fn intersect(&mut self, other: &ClassUnicode) {
1144	self.set.intersect(&other.set);
1145	}
1146
1147	/// Subtract the given character class from this character class, in place.
1148	pub fn difference(&mut self, other: &ClassUnicode) {
1149	self.set.difference(&other.set);
1150	}
1151
1152	/// Compute the symmetric difference of the given character classes, in
1153	/// place.
1154	///
1155	/// This computes the symmetric difference of two character classes. This
1156	/// removes all elements in this class that are also in the given class,
1157	/// but all adds all elements from the given class that aren't in this
1158	/// class. That is, the class will contain all elements in either class,
1159	/// but will not contain any elements that are in both classes.
1160	pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1161	self.set.symmetric_difference(&other.set);
1162	}
1163
1164	/// Returns true if and only if this character class will either match
1165	/// nothing or only ASCII bytes. Stated differently, this returns false
1166	/// if and only if this class contains a non-ASCII codepoint.
1167	pub fn is_ascii(&self) -> bool {
1168	self.set.intervals().last().map_or(`true`, \|r\| r.end <= '`\x7F`')
1169	}
1170
1171	/// Returns the length, in bytes, of the smallest string matched by this
1172	/// character class.
1173	///
1174	/// Returns `None` when the class is empty.
1175	pub fn minimum_len(&self) -> Option<usize> {
1176	let first = self.ranges().get(`0`)?;
1177	// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1178	Some(first.start.len_utf8())
1179	}
1180
1181	/// Returns the length, in bytes, of the longest string matched by this
1182	/// character class.
1183	///
1184	/// Returns `None` when the class is empty.
1185	pub fn maximum_len(&self) -> Option<usize> {
1186	let last = self.ranges().last()?;
1187	// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1188	Some(last.end.len_utf8())
1189	}
1190
1191	/// If this class consists of exactly one codepoint, then return it as
1192	/// a literal byte string.
1193	///
1194	/// If this class is empty or contains more than one codepoint, then `None`
1195	/// is returned.
1196	pub fn literal(&self) -> Option<Vec<u8>> {
1197	let rs = self.ranges();
1198	if rs.len() == `1` && rs[`0`].start == rs[`0`].end {
1199	Some(rs[`0`].start.encode_utf8(&mut [`0`; `4`]).to_string().into_bytes())
1200	} else {
1201	None
1202	}
1203	}
1204
1205	/// If this class consists of only ASCII ranges, then return its
1206	/// corresponding and equivalent byte class.
1207	pub fn to_byte_class(&self) -> Option<ClassBytes> {
1208	if !self.is_ascii() {
1209	return None;
1210	}
1211	Some(ClassBytes::new(self.ranges().iter().map(\|r\| {
1212	// Since we are guaranteed that our codepoint range is ASCII, the
1213	// 'u8::try_from' calls below are guaranteed to be correct.
1214	ClassBytesRange {
1215	start: u8::try_from(r.start).unwrap(),
1216	end: u8::try_from(r.end).unwrap(),
1217	}
1218	})))
1219	}
1220	}
1221
1222	/// An iterator over all ranges in a Unicode character class.
1223	///
1224	/// The lifetime `'a` refers to the lifetime of the underlying class.
1225	#[derive(Debug)]
1226	pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1227
1228	impl<'a> Iterator for ClassUnicodeIter<'a> {
1229	type Item = &'a ClassUnicodeRange;
1230
1231	fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1232	self.0.next()
1233	}
1234	}
1235
1236	/// A single range of characters represented by Unicode scalar values.
1237	///
1238	/// The range is closed. That is, the start and end of the range are included
1239	/// in the range.
1240	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1241	pub struct ClassUnicodeRange {
1242	start: char,
1243	end: char,
1244	}
1245
1246	impl core::fmt::Debug for ClassUnicodeRange {
1247	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1248	let start: String = if !self.start.is_whitespace() && !self.start.is_control()
1249	{
1250	self.start.to_string()
1251	} else {
1252	format!("0x{:X}", u32::from(self.start))
1253	};
1254	let end: String = if !self.end.is_whitespace() && !self.end.is_control() {
1255	self.end.to_string()
1256	} else {
1257	format!("0x{:X}", u32::from(self.end))
1258	};
1259	f&mut DebugStruct<'_, '_>.debug_struct("ClassUnicodeRange")
1260	.field("start", &start)
1261	.field(name:"end", &end)
1262	.finish()
1263	}
1264	}
1265
1266	impl Interval for ClassUnicodeRange {
1267	type Bound = char;
1268
1269	#[inline]
1270	fn lower(&self) -> char {
1271	self.start
1272	}
1273	#[inline]
1274	fn upper(&self) -> char {
1275	self.end
1276	}
1277	#[inline]
1278	fn set_lower(&mut self, bound: char) {
1279	self.start = bound;
1280	}
1281	#[inline]
1282	fn set_upper(&mut self, bound: char) {
1283	self.end = bound;
1284	}
1285
1286	/// Apply simple case folding to this Unicode scalar value range.
1287	///
1288	/// Additional ranges are appended to the given vector. Canonical ordering
1289	/// is not* maintained in the given vector.*
1290	fn case_fold_simple(
1291	&self,
1292	ranges: &mut Vec<ClassUnicodeRange>,
1293	) -> Result<(), unicode::CaseFoldError> {
1294	let mut folder = unicode::SimpleCaseFolder::new()?;
1295	if !folder.overlaps(self.start, self.end) {
1296	return Ok(());
1297	}
1298	let (start, end) = (u32::from(self.start), u32::from(self.end));
1299	for cp in (start..=end).filter_map(char::from_u32) {
1300	for &cp_folded in folder.mapping(cp) {
1301	ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1302	}
1303	}
1304	Ok(())
1305	}
1306	}
1307
1308	impl ClassUnicodeRange {
1309	/// Create a new Unicode scalar value range for a character class.
1310	///
1311	/// The returned range is always in a canonical form. That is, the range
1312	/// returned always satisfies the invariant that `start <= end`.
1313	pub fn new(start: char, end: char) -> ClassUnicodeRange {
1314	ClassUnicodeRange::create(start, end)
1315	}
1316
1317	/// Return the start of this range.
1318	///
1319	/// The start of a range is always less than or equal to the end of the
1320	/// range.
1321	pub fn start(&self) -> char {
1322	self.start
1323	}
1324
1325	/// Return the end of this range.
1326	///
1327	/// The end of a range is always greater than or equal to the start of the
1328	/// range.
1329	pub fn end(&self) -> char {
1330	self.end
1331	}
1332
1333	/// Returns the number of codepoints in this range.
1334	pub fn len(&self) -> usize {
1335	let diff = `1` + u32::from(self.end) - u32::from(self.start);
1336	// This is likely to panic in 16-bit targets since a usize can only fit
1337	// 2^16. It's not clear what to do here, other than to return an error
1338	// when building a Unicode class that contains a range whose length
1339	// overflows usize. (Which, to be honest, is probably quite common on
1340	// 16-bit targets. For example, this would imply that '.' and '\p{any}'
1341	// would be impossible to build.)
1342	usize::try_from(diff).expect("char class len fits in usize")
1343	}
1344	}
1345
1346	/// A set of characters represented by arbitrary bytes.
1347	///
1348	/// Each byte corresponds to one character.
1349	#[derive(Clone, Debug, Eq, PartialEq)]
1350	pub struct ClassBytes {
1351	set: IntervalSet<ClassBytesRange>,
1352	}
1353
1354	impl ClassBytes {
1355	/// Create a new class from a sequence of ranges.
1356	///
1357	/// The given ranges do not need to be in any specific order, and ranges
1358	/// may overlap. Ranges will automatically be sorted into a canonical
1359	/// non-overlapping order.
1360	pub fn new<I>(ranges: I) -> ClassBytes
1361	where
1362	I: IntoIterator<Item = ClassBytesRange>,
1363	{
1364	ClassBytes { set: IntervalSet::new(ranges) }
1365	}
1366
1367	/// Create a new class with no ranges.
1368	///
1369	/// An empty class matches nothing. That is, it is equivalent to
1370	/// [`Hir::fail`].
1371	pub fn empty() -> ClassBytes {
1372	ClassBytes::new(vec![])
1373	}
1374
1375	/// Add a new range to this set.
1376	pub fn push(&mut self, range: ClassBytesRange) {
1377	self.set.push(range);
1378	}
1379
1380	/// Return an iterator over all ranges in this class.
1381	///
1382	/// The iterator yields ranges in ascending order.
1383	pub fn iter(&self) -> ClassBytesIter<'_> {
1384	ClassBytesIter(self.set.iter())
1385	}
1386
1387	/// Return the underlying ranges as a slice.
1388	pub fn ranges(&self) -> &[ClassBytesRange] {
1389	self.set.intervals()
1390	}
1391
1392	/// Expand this character class such that it contains all case folded
1393	/// characters. For example, if this class consists of the range `a-z`,
1394	/// then applying case folding will result in the class containing both the
1395	/// ranges `a-z` and `A-Z`.
1396	///
1397	/// Note that this only applies ASCII case folding, which is limited to the
1398	/// characters `a-z` and `A-Z`.
1399	pub fn case_fold_simple(&mut self) {
1400	self.set.case_fold_simple().expect("ASCII case folding never fails");
1401	}
1402
1403	/// Negate this byte class.
1404	///
1405	/// For all `b` where `b` is a any byte, if `b` was in this set, then it
1406	/// will not be in this set after negation.
1407	pub fn negate(&mut self) {
1408	self.set.negate();
1409	}
1410
1411	/// Union this byte class with the given byte class, in place.
1412	pub fn union(&mut self, other: &ClassBytes) {
1413	self.set.union(&other.set);
1414	}
1415
1416	/// Intersect this byte class with the given byte class, in place.
1417	pub fn intersect(&mut self, other: &ClassBytes) {
1418	self.set.intersect(&other.set);
1419	}
1420
1421	/// Subtract the given byte class from this byte class, in place.
1422	pub fn difference(&mut self, other: &ClassBytes) {
1423	self.set.difference(&other.set);
1424	}
1425
1426	/// Compute the symmetric difference of the given byte classes, in place.
1427	///
1428	/// This computes the symmetric difference of two byte classes. This
1429	/// removes all elements in this class that are also in the given class,
1430	/// but all adds all elements from the given class that aren't in this
1431	/// class. That is, the class will contain all elements in either class,
1432	/// but will not contain any elements that are in both classes.
1433	pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1434	self.set.symmetric_difference(&other.set);
1435	}
1436
1437	/// Returns true if and only if this character class will either match
1438	/// nothing or only ASCII bytes. Stated differently, this returns false
1439	/// if and only if this class contains a non-ASCII byte.
1440	pub fn is_ascii(&self) -> bool {
1441	self.set.intervals().last().map_or(`true`, \|r\| r.end <= `0x7F`)
1442	}
1443
1444	/// Returns the length, in bytes, of the smallest string matched by this
1445	/// character class.
1446	///
1447	/// Returns `None` when the class is empty.
1448	pub fn minimum_len(&self) -> Option<usize> {
1449	if self.ranges().is_empty() {
1450	None
1451	} else {
1452	Some(`1`)
1453	}
1454	}
1455
1456	/// Returns the length, in bytes, of the longest string matched by this
1457	/// character class.
1458	///
1459	/// Returns `None` when the class is empty.
1460	pub fn maximum_len(&self) -> Option<usize> {
1461	if self.ranges().is_empty() {
1462	None
1463	} else {
1464	Some(`1`)
1465	}
1466	}
1467
1468	/// If this class consists of exactly one byte, then return it as
1469	/// a literal byte string.
1470	///
1471	/// If this class is empty or contains more than one byte, then `None`
1472	/// is returned.
1473	pub fn literal(&self) -> Option<Vec<u8>> {
1474	let rs = self.ranges();
1475	if rs.len() == `1` && rs[`0`].start == rs[`0`].end {
1476	Some(vec![rs[`0`].start])
1477	} else {
1478	None
1479	}
1480	}
1481
1482	/// If this class consists of only ASCII ranges, then return its
1483	/// corresponding and equivalent Unicode class.
1484	pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1485	if !self.is_ascii() {
1486	return None;
1487	}
1488	Some(ClassUnicode::new(self.ranges().iter().map(\|r\| {
1489	// Since we are guaranteed that our byte range is ASCII, the
1490	// 'char::from' calls below are correct and will not erroneously
1491	// convert a raw byte value into its corresponding codepoint.
1492	ClassUnicodeRange {
1493	start: char::from(r.start),
1494	end: char::from(r.end),
1495	}
1496	})))
1497	}
1498	}
1499
1500	/// An iterator over all ranges in a byte character class.
1501	///
1502	/// The lifetime `'a` refers to the lifetime of the underlying class.
1503	#[derive(Debug)]
1504	pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1505
1506	impl<'a> Iterator for ClassBytesIter<'a> {
1507	type Item = &'a ClassBytesRange;
1508
1509	fn next(&mut self) -> Option<&'a ClassBytesRange> {
1510	self.0.next()
1511	}
1512	}
1513
1514	/// A single range of characters represented by arbitrary bytes.
1515	///
1516	/// The range is closed. That is, the start and end of the range are included
1517	/// in the range.
1518	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1519	pub struct ClassBytesRange {
1520	start: u8,
1521	end: u8,
1522	}
1523
1524	impl Interval for ClassBytesRange {
1525	type Bound = u8;
1526
1527	#[inline]
1528	fn lower(&self) -> u8 {
1529	self.start
1530	}
1531	#[inline]
1532	fn upper(&self) -> u8 {
1533	self.end
1534	}
1535	#[inline]
1536	fn set_lower(&mut self, bound: u8) {
1537	self.start = bound;
1538	}
1539	#[inline]
1540	fn set_upper(&mut self, bound: u8) {
1541	self.end = bound;
1542	}
1543
1544	/// Apply simple case folding to this byte range. Only ASCII case mappings
1545	/// (for a-z) are applied.
1546	///
1547	/// Additional ranges are appended to the given vector. Canonical ordering
1548	/// is not* maintained in the given vector.*
1549	fn case_fold_simple(
1550	&self,
1551	ranges: &mut Vec<ClassBytesRange>,
1552	) -> Result<(), unicode::CaseFoldError> {
1553	if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1554	let lower = cmp::max(self.start, b'a');
1555	let upper = cmp::min(self.end, b'z');
1556	ranges.push(ClassBytesRange::new(lower - `32`, upper - `32`));
1557	}
1558	if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1559	let lower = cmp::max(self.start, b'A');
1560	let upper = cmp::min(self.end, b'Z');
1561	ranges.push(ClassBytesRange::new(lower + `32`, upper + `32`));
1562	}
1563	Ok(())
1564	}
1565	}
1566
1567	impl ClassBytesRange {
1568	/// Create a new byte range for a character class.
1569	///
1570	/// The returned range is always in a canonical form. That is, the range
1571	/// returned always satisfies the invariant that `start <= end`.
1572	pub fn new(start: u8, end: u8) -> ClassBytesRange {
1573	ClassBytesRange::create(start, end)
1574	}
1575
1576	/// Return the start of this range.
1577	///
1578	/// The start of a range is always less than or equal to the end of the
1579	/// range.
1580	pub fn start(&self) -> u8 {
1581	self.start
1582	}
1583
1584	/// Return the end of this range.
1585	///
1586	/// The end of a range is always greater than or equal to the start of the
1587	/// range.
1588	pub fn end(&self) -> u8 {
1589	self.end
1590	}
1591
1592	/// Returns the number of bytes in this range.
1593	pub fn len(&self) -> usize {
1594	usize::from(self.end.checked_sub(self.start).unwrap())
1595	.checked_add(`1`)
1596	.unwrap()
1597	}
1598	}
1599
1600	impl core::fmt::Debug for ClassBytesRange {
1601	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1602	f&mut DebugStruct<'_, '_>.debug_struct("ClassBytesRange")
1603	.field("start", &crate::debug::Byte(self.start))
1604	.field(name:"end", &crate::debug::Byte(self.end))
1605	.finish()
1606	}
1607	}
1608
1609	/// The high-level intermediate representation for a look-around assertion.
1610	///
1611	/// An assertion match is always zero-length. Also called an "empty match."
1612	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1613	pub enum Look {
1614	/// Match the beginning of text. Specifically, this matches at the starting
1615	/// position of the input.
1616	Start = `1` << `0`,
1617	/// Match the end of text. Specifically, this matches at the ending
1618	/// position of the input.
1619	End = `1` << `1`,
1620	/// Match the beginning of a line or the beginning of text. Specifically,
1621	/// this matches at the starting position of the input, or at the position
1622	/// immediately following a `\n` character.
1623	StartLF = `1` << `2`,
1624	/// Match the end of a line or the end of text. Specifically, this matches
1625	/// at the end position of the input, or at the position immediately
1626	/// preceding a `\n` character.
1627	EndLF = `1` << `3`,
1628	/// Match the beginning of a line or the beginning of text. Specifically,
1629	/// this matches at the starting position of the input, or at the position
1630	/// immediately following either a `\r` or `\n` character, but never after
1631	/// a `\r` when a `\n` follows.
1632	StartCRLF = `1` << `4`,
1633	/// Match the end of a line or the end of text. Specifically, this matches
1634	/// at the end position of the input, or at the position immediately
1635	/// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1636	/// precedes it.
1637	EndCRLF = `1` << `5`,
1638	/// Match an ASCII-only word boundary. That is, this matches a position
1639	/// where the left adjacent character and right adjacent character
1640	/// correspond to a word and non-word or a non-word and word character.
1641	WordAscii = `1` << `6`,
1642	/// Match an ASCII-only negation of a word boundary.
1643	WordAsciiNegate = `1` << `7`,
1644	/// Match a Unicode-aware word boundary. That is, this matches a position
1645	/// where the left adjacent character and right adjacent character
1646	/// correspond to a word and non-word or a non-word and word character.
1647	WordUnicode = `1` << `8`,
1648	/// Match a Unicode-aware negation of a word boundary.
1649	WordUnicodeNegate = `1` << `9`,
1650	/// Match the start of an ASCII-only word boundary. That is, this matches a
1651	/// position at either the beginning of the haystack or where the previous
1652	/// character is not a word character and the following character is a word
1653	/// character.
1654	WordStartAscii = `1` << `10`,
1655	/// Match the end of an ASCII-only word boundary. That is, this matches
1656	/// a position at either the end of the haystack or where the previous
1657	/// character is a word character and the following character is not a word
1658	/// character.
1659	WordEndAscii = `1` << `11`,
1660	/// Match the start of a Unicode word boundary. That is, this matches a
1661	/// position at either the beginning of the haystack or where the previous
1662	/// character is not a word character and the following character is a word
1663	/// character.
1664	WordStartUnicode = `1` << `12`,
1665	/// Match the end of a Unicode word boundary. That is, this matches a
1666	/// position at either the end of the haystack or where the previous
1667	/// character is a word character and the following character is not a word
1668	/// character.
1669	WordEndUnicode = `1` << `13`,
1670	/// Match the start half of an ASCII-only word boundary. That is, this
1671	/// matches a position at either the beginning of the haystack or where the
1672	/// previous character is not a word character.
1673	WordStartHalfAscii = `1` << `14`,
1674	/// Match the end half of an ASCII-only word boundary. That is, this
1675	/// matches a position at either the end of the haystack or where the
1676	/// following character is not a word character.
1677	WordEndHalfAscii = `1` << `15`,
1678	/// Match the start half of a Unicode word boundary. That is, this matches
1679	/// a position at either the beginning of the haystack or where the
1680	/// previous character is not a word character.
1681	WordStartHalfUnicode = `1` << `16`,
1682	/// Match the end half of a Unicode word boundary. That is, this matches
1683	/// a position at either the end of the haystack or where the following
1684	/// character is not a word character.
1685	WordEndHalfUnicode = `1` << `17`,
1686	}
1687
1688	impl Look {
1689	/// Flip the look-around assertion to its equivalent for reverse searches.
1690	/// For example, `StartLF` gets translated to `EndLF`.
1691	///
1692	/// Some assertions, such as `WordUnicode`, remain the same since they
1693	/// match the same positions regardless of the direction of the search.
1694	#[inline]
1695	pub const fn reversed(self) -> Look {
1696	match self {
1697	Look::Start => Look::End,
1698	Look::End => Look::Start,
1699	Look::StartLF => Look::EndLF,
1700	Look::EndLF => Look::StartLF,
1701	Look::StartCRLF => Look::EndCRLF,
1702	Look::EndCRLF => Look::StartCRLF,
1703	Look::WordAscii => Look::WordAscii,
1704	Look::WordAsciiNegate => Look::WordAsciiNegate,
1705	Look::WordUnicode => Look::WordUnicode,
1706	Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1707	Look::WordStartAscii => Look::WordEndAscii,
1708	Look::WordEndAscii => Look::WordStartAscii,
1709	Look::WordStartUnicode => Look::WordEndUnicode,
1710	Look::WordEndUnicode => Look::WordStartUnicode,
1711	Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1712	Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1713	Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1714	Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1715	}
1716	}
1717
1718	/// Return the underlying representation of this look-around enumeration
1719	/// as an integer. Giving the return value to the [`Look::from_repr`]
1720	/// constructor is guaranteed to return the same look-around variant that
1721	/// one started with within a semver compatible release of this crate.
1722	#[inline]
1723	pub const fn as_repr(self) -> u32 {
1724	// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725	// actual int.
1726	self as u32
1727	}
1728
1729	/// Given the underlying representation of a `Look` value, return the
1730	/// corresponding `Look` value if the representation is valid. Otherwise
1731	/// `None` is returned.
1732	#[inline]
1733	pub const fn from_repr(repr: u32) -> Option<Look> {
1734	match repr {
1735	`0b00_0000_0000_0000_0001` => Some(Look::Start),
1736	`0b00_0000_0000_0000_0010` => Some(Look::End),
1737	`0b00_0000_0000_0000_0100` => Some(Look::StartLF),
1738	`0b00_0000_0000_0000_1000` => Some(Look::EndLF),
1739	`0b00_0000_0000_0001_0000` => Some(Look::StartCRLF),
1740	`0b00_0000_0000_0010_0000` => Some(Look::EndCRLF),
1741	`0b00_0000_0000_0100_0000` => Some(Look::WordAscii),
1742	`0b00_0000_0000_1000_0000` => Some(Look::WordAsciiNegate),
1743	`0b00_0000_0001_0000_0000` => Some(Look::WordUnicode),
1744	`0b00_0000_0010_0000_0000` => Some(Look::WordUnicodeNegate),
1745	`0b00_0000_0100_0000_0000` => Some(Look::WordStartAscii),
1746	`0b00_0000_1000_0000_0000` => Some(Look::WordEndAscii),
1747	`0b00_0001_0000_0000_0000` => Some(Look::WordStartUnicode),
1748	`0b00_0010_0000_0000_0000` => Some(Look::WordEndUnicode),
1749	`0b00_0100_0000_0000_0000` => Some(Look::WordStartHalfAscii),
1750	`0b00_1000_0000_0000_0000` => Some(Look::WordEndHalfAscii),
1751	`0b01_0000_0000_0000_0000` => Some(Look::WordStartHalfUnicode),
1752	`0b10_0000_0000_0000_0000` => Some(Look::WordEndHalfUnicode),
1753	_ => None,
1754	}
1755	}
1756
1757	/// Returns a convenient single codepoint representation of this
1758	/// look-around assertion. Each assertion is guaranteed to be represented
1759	/// by a distinct character.
1760	///
1761	/// This is useful for succinctly representing a look-around assertion in
1762	/// human friendly but succinct output intended for a programmer working on
1763	/// regex internals.
1764	#[inline]
1765	pub const fn as_char(self) -> char {
1766	match self {
1767	Look::Start => 'A',
1768	Look::End => 'z',
1769	Look::StartLF => '^',
1770	Look::EndLF => '$',
1771	Look::StartCRLF => 'r',
1772	Look::EndCRLF => 'R',
1773	Look::WordAscii => 'b',
1774	Look::WordAsciiNegate => 'B',
1775	Look::WordUnicode => '𝛃',
1776	Look::WordUnicodeNegate => '𝚩',
1777	Look::WordStartAscii => '<',
1778	Look::WordEndAscii => '>',
1779	Look::WordStartUnicode => '〈',
1780	Look::WordEndUnicode => '〉',
1781	Look::WordStartHalfAscii => '◁',
1782	Look::WordEndHalfAscii => '▷',
1783	Look::WordStartHalfUnicode => '◀',
1784	Look::WordEndHalfUnicode => '▶',
1785	}
1786	}
1787	}
1788
1789	/// The high-level intermediate representation for a capturing group.
1790	///
1791	/// A capturing group always has an index and a child expression. It may
1792	/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1793	/// necessary.
1794	///
1795	/// Note that there is no explicit representation of a non-capturing group
1796	/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1797	/// the recursive structure of the `Hir` itself.
1798	#[derive(Clone, Debug, Eq, PartialEq)]
1799	pub struct Capture {
1800	/// The capture index of the capture.
1801	pub index: u32,
1802	/// The name of the capture, if it exists.
1803	pub name: Option<Box<str>>,
1804	/// The expression inside the capturing group, which may be empty.
1805	pub sub: Box<Hir>,
1806	}
1807
1808	/// The high-level intermediate representation of a repetition operator.
1809	///
1810	/// A repetition operator permits the repetition of an arbitrary
1811	/// sub-expression.
1812	#[derive(Clone, Debug, Eq, PartialEq)]
1813	pub struct Repetition {
1814	/// The minimum range of the repetition.
1815	///
1816	/// Note that special cases like `?`, `+` and `` all get translated into*
1817	/// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1818	///
1819	/// When `min` is zero, this expression can match the empty string
1820	/// regardless of what its sub-expression is.
1821	pub min: u32,
1822	/// The maximum range of the repetition.
1823	///
1824	/// Note that when `max` is `None`, `min` acts as a lower bound but where
1825	/// there is no upper bound. For something like `x{5}` where the min and
1826	/// max are equivalent, `min` will be set to `5` and `max` will be set to
1827	/// `Some(5)`.
1828	pub max: Option<u32>,
1829	/// Whether this repetition operator is greedy or not. A greedy operator
1830	/// will match as much as it can. A non-greedy operator will match as
1831	/// little as it can.
1832	///
1833	/// Typically, operators are greedy by default and are only non-greedy when
1834	/// a `?` suffix is used, e.g., `(expr)` is greedy while `(expr)?` is
1835	/// not. However, this can be inverted via the `U` "ungreedy" flag.
1836	pub greedy: bool,
1837	/// The expression being repeated.
1838	pub sub: Box<Hir>,
1839	}
1840
1841	impl Repetition {
1842	/// Returns a new repetition with the same `min`, `max` and `greedy`
1843	/// values, but with its sub-expression replaced with the one given.
1844	pub fn with(&self, sub: Hir) -> Repetition {
1845	Repetition {
1846	min: self.min,
1847	max: self.max,
1848	greedy: self.greedy,
1849	sub: Box::new(sub),
1850	}
1851	}
1852	}
1853
1854	/// A type describing the different flavors of `.`.
1855	///
1856	/// This type is meant to be used with [`Hir::dot`], which is a convenience
1857	/// routine for building HIR values derived from the `.` regex.
1858	#[non_exhaustive]
1859	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1860	pub enum Dot {
1861	/// Matches the UTF-8 encoding of any Unicode scalar value.
1862	///
1863	/// This is equivalent to `(?su:.)` and also `\p{any}`.
1864	AnyChar,
1865	/// Matches any byte value.
1866	///
1867	/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1868	AnyByte,
1869	/// Matches the UTF-8 encoding of any Unicode scalar value except for the
1870	/// `char` given.
1871	///
1872	/// This is equivalent to using `(?u-s:.)` with the line terminator set
1873	/// to a particular ASCII byte. (Because of peculiarities in the regex
1874	/// engines, a line terminator must be a single byte. It follows that when
1875	/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1876	/// value. That is, ti must be ASCII.)
1877	///
1878	/// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1879	/// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1880	AnyCharExcept(char),
1881	/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1882	///
1883	/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1884	AnyCharExceptLF,
1885	/// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1886	/// and `\n`.
1887	///
1888	/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1889	AnyCharExceptCRLF,
1890	/// Matches any byte value except for the `u8` given.
1891	///
1892	/// This is equivalent to using `(?-us:.)` with the line terminator set
1893	/// to a particular ASCII byte. (Because of peculiarities in the regex
1894	/// engines, a line terminator must be a single byte. It follows that when
1895	/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1896	/// value. That is, ti must be ASCII.)
1897	///
1898	/// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1899	/// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1900	AnyByteExcept(u8),
1901	/// Matches any byte value except for `\n`.
1902	///
1903	/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1904	AnyByteExceptLF,
1905	/// Matches any byte value except for `\r` and `\n`.
1906	///
1907	/// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1908	AnyByteExceptCRLF,
1909	}
1910
1911	/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1912	/// space but heap space proportional to the depth of the total `Hir`.
1913	impl Drop for Hir {
1914	fn drop(&mut self) {
1915	use core::mem;
1916
1917	match *self.kind() {
1918	HirKind::Empty
1919	\| HirKind::Literal(_)
1920	\| HirKind::Class(_)
1921	\| HirKind::Look(_) => return,
1922	HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1923	HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1924	return
1925	}
1926	HirKind::Concat(ref x) if x.is_empty() => return,
1927	HirKind::Alternation(ref x) if x.is_empty() => return,
1928	_ => {}
1929	}
1930
1931	let mut stack = vec![mem::replace(self, Hir::empty())];
1932	while let Some(mut expr) = stack.pop() {
1933	match expr.kind {
1934	HirKind::Empty
1935	\| HirKind::Literal(_)
1936	\| HirKind::Class(_)
1937	\| HirKind::Look(_) => {}
1938	HirKind::Capture(ref mut x) => {
1939	stack.push(mem::replace(&mut x.sub, Hir::empty()));
1940	}
1941	HirKind::Repetition(ref mut x) => {
1942	stack.push(mem::replace(&mut x.sub, Hir::empty()));
1943	}
1944	HirKind::Concat(ref mut x) => {
1945	stack.extend(x.drain(..));
1946	}
1947	HirKind::Alternation(ref mut x) => {
1948	stack.extend(x.drain(..));
1949	}
1950	}
1951	}
1952	}
1953	}
1954
1955	/// A type that collects various properties of an HIR value.
1956	///
1957	/// Properties are always scalar values and represent meta data that is
1958	/// computed inductively on an HIR value. Properties are defined for all
1959	/// HIR values.
1960	///
1961	/// All methods on a `Properties` value take constant time and are meant to
1962	/// be cheap to call.
1963	#[derive(Clone, Debug, Eq, PartialEq)]
1964	pub struct Properties(Box<PropertiesI>);
1965
1966	/// The property definition. It is split out so that we can box it, and
1967	/// there by make `Properties` use less stack size. This is kind-of important
1968	/// because every HIR value has a `Properties` attached to it.
1969	///
1970	/// This does have the unfortunate consequence that creating any HIR value
1971	/// always leads to at least one alloc for properties, but this is generally
1972	/// true anyway (for pretty much all HirKinds except for look-arounds).
1973	#[derive(Clone, Debug, Eq, PartialEq)]
1974	struct PropertiesI {
1975	minimum_len: Option<usize>,
1976	maximum_len: Option<usize>,
1977	look_set: LookSet,
1978	look_set_prefix: LookSet,
1979	look_set_suffix: LookSet,
1980	look_set_prefix_any: LookSet,
1981	look_set_suffix_any: LookSet,
1982	utf8: bool,
1983	explicit_captures_len: usize,
1984	static_explicit_captures_len: Option<usize>,
1985	literal: bool,
1986	alternation_literal: bool,
1987	}
1988
1989	impl Properties {
1990	/// Returns the length (in bytes) of the smallest string matched by this
1991	/// HIR.
1992	///
1993	/// A return value of `0` is possible and occurs when the HIR can match an
1994	/// empty string.
1995	///
1996	/// `None` is returned when there is no minimum length. This occurs in
1997	/// precisely the cases where the HIR matches nothing. i.e., The language
1998	/// the regex matches is empty. An example of such a regex is `\P{any}`.
1999	#[inline]
2000	pub fn minimum_len(&self) -> Option<usize> {
2001	self.0.minimum_len
2002	}
2003
2004	/// Returns the length (in bytes) of the longest string matched by this
2005	/// HIR.
2006	///
2007	/// A return value of `0` is possible and occurs when nothing longer than
2008	/// the empty string is in the language described by this HIR.
2009	///
2010	/// `None` is returned when there is no longest matching string. This
2011	/// occurs when the HIR matches nothing or when there is no upper bound on
2012	/// the length of matching strings. Example of such regexes are `\P{any}`
2013	/// (matches nothing) and `a+` (has no upper bound).
2014	#[inline]
2015	pub fn maximum_len(&self) -> Option<usize> {
2016	self.0.maximum_len
2017	}
2018
2019	/// Returns a set of all look-around assertions that appear at least once
2020	/// in this HIR value.
2021	#[inline]
2022	pub fn look_set(&self) -> LookSet {
2023	self.0.look_set
2024	}
2025
2026	/// Returns a set of all look-around assertions that appear as a prefix for
2027	/// this HIR value. That is, the set returned corresponds to the set of
2028	/// assertions that must be passed before matching any bytes in a haystack.
2029	///
2030	/// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2031	/// if and only if the HIR is fully anchored at the start.
2032	#[inline]
2033	pub fn look_set_prefix(&self) -> LookSet {
2034	self.0.look_set_prefix
2035	}
2036
2037	/// Returns a set of all look-around assertions that appear as a _possible_
2038	/// prefix for this HIR value. That is, the set returned corresponds to the
2039	/// set of assertions that _may_ be passed before matching any bytes in a
2040	/// haystack.
2041	///
2042	/// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2043	/// true if and only if it's possible for the regex to match through a
2044	/// anchored assertion before consuming any input.
2045	#[inline]
2046	pub fn look_set_prefix_any(&self) -> LookSet {
2047	self.0.look_set_prefix_any
2048	}
2049
2050	/// Returns a set of all look-around assertions that appear as a suffix for
2051	/// this HIR value. That is, the set returned corresponds to the set of
2052	/// assertions that must be passed in order to be considered a match after
2053	/// all other consuming HIR expressions.
2054	///
2055	/// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2056	/// if and only if the HIR is fully anchored at the end.
2057	#[inline]
2058	pub fn look_set_suffix(&self) -> LookSet {
2059	self.0.look_set_suffix
2060	}
2061
2062	/// Returns a set of all look-around assertions that appear as a _possible_
2063	/// suffix for this HIR value. That is, the set returned corresponds to the
2064	/// set of assertions that _may_ be passed before matching any bytes in a
2065	/// haystack.
2066	///
2067	/// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2068	/// true if and only if it's possible for the regex to match through a
2069	/// anchored assertion at the end of a match without consuming any input.
2070	#[inline]
2071	pub fn look_set_suffix_any(&self) -> LookSet {
2072	self.0.look_set_suffix_any
2073	}
2074
2075	/// Return true if and only if the corresponding HIR will always match
2076	/// valid UTF-8.
2077	///
2078	/// When this returns false, then it is possible for this HIR expression to
2079	/// match invalid UTF-8, including by matching between the code units of
2080	/// a single UTF-8 encoded codepoint.
2081	///
2082	/// Note that this returns true even when the corresponding HIR can match
2083	/// the empty string. Since an empty string can technically appear between
2084	/// UTF-8 code units, it is possible for a match to be reported that splits
2085	/// a codepoint which could in turn be considered matching invalid UTF-8.
2086	/// However, it is generally assumed that such empty matches are handled
2087	/// specially by the search routine if it is absolutely required that
2088	/// matches not split a codepoint.
2089	///
2090	/// # Example
2091	///
2092	/// This code example shows the UTF-8 property of a variety of patterns.
2093	///
2094	/// ```
2095	/// use regex_syntax::{ParserBuilder, parse};
2096	///
2097	/// // Examples of 'is_utf8() == true'.
2098	/// assert!(parse(r"a")?.properties().is_utf8());
2099	/// assert!(parse(r"[^a]")?.properties().is_utf8());
2100	/// assert!(parse(r".")?.properties().is_utf8());
2101	/// assert!(parse(r"\W")?.properties().is_utf8());
2102	/// assert!(parse(r"\b")?.properties().is_utf8());
2103	/// assert!(parse(r"\B")?.properties().is_utf8());
2104	/// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2105	/// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2106	/// // Unicode mode is enabled by default, and in
2107	/// // that mode, all \x hex escapes are treated as
2108	/// // codepoints. So this actually matches the UTF-8
2109	/// // encoding of U+00FF.
2110	/// assert!(parse(r"\xFF")?.properties().is_utf8());
2111	///
2112	/// // Now we show examples of 'is_utf8() == false'.
2113	/// // The only way to do this is to force the parser
2114	/// // to permit invalid UTF-8, otherwise all of these
2115	/// // would fail to parse!
2116	/// let parse = \|pattern\| {
2117	/// ParserBuilder::new().utf8(`false`).build().parse(pattern)
2118	/// };
2119	/// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2120	/// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2121	/// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2122	/// // Conversely to the equivalent example above,
2123	/// // when Unicode mode is disabled, \x hex escapes
2124	/// // are treated as their raw byte values.
2125	/// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2126	/// // Note that just because we disabled UTF-8 in the
2127	/// // parser doesn't mean we still can't use Unicode.
2128	/// // It is enabled by default, so \xFF is still
2129	/// // equivalent to matching the UTF-8 encoding of
2130	/// // U+00FF by default.
2131	/// assert!(parse(r"\xFF")?.properties().is_utf8());
2132	/// // Even though we use raw bytes that individually
2133	/// // are not valid UTF-8, when combined together, the
2134	/// // overall expression does* match valid UTF-8!*
2135	/// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2136	///
2137	/// # Ok::<(), Box<dyn std::error::Error>>(())
2138	/// ```
2139	#[inline]
2140	pub fn is_utf8(&self) -> bool {
2141	self.0.utf8
2142	}
2143
2144	/// Returns the total number of explicit capturing groups in the
2145	/// corresponding HIR.
2146	///
2147	/// Note that this does not include the implicit capturing group
2148	/// corresponding to the entire match that is typically included by regex
2149	/// engines.
2150	///
2151	/// # Example
2152	///
2153	/// This method will return `0` for `a` and `1` for `(a)`:
2154	///
2155	/// ```
2156	/// use regex_syntax::parse;
2157	///
2158	/// assert_eq!(`0`, parse("a")?.properties().explicit_captures_len());
2159	/// assert_eq!(`1`, parse("(a)")?.properties().explicit_captures_len());
2160	///
2161	/// # Ok::<(), Box<dyn std::error::Error>>(())
2162	/// ```
2163	#[inline]
2164	pub fn explicit_captures_len(&self) -> usize {
2165	self.0.explicit_captures_len
2166	}
2167
2168	/// Returns the total number of explicit capturing groups that appear in
2169	/// every possible match.
2170	///
2171	/// If the number of capture groups can vary depending on the match, then
2172	/// this returns `None`. That is, a value is only returned when the number
2173	/// of matching groups is invariant or "static."
2174	///
2175	/// Note that this does not include the implicit capturing group
2176	/// corresponding to the entire match.
2177	///
2178	/// # Example
2179	///
2180	/// This shows a few cases where a static number of capture groups is
2181	/// available and a few cases where it is not.
2182	///
2183	/// ```
2184	/// use regex_syntax::parse;
2185	///
2186	/// let len = \|pattern\| {
2187	/// parse(pattern).map(\|h\| {
2188	/// h.properties().static_explicit_captures_len()
2189	/// })
2190	/// };
2191	///
2192	/// assert_eq!(Some(`0`), len("a")?);
2193	/// assert_eq!(Some(`1`), len("(a)")?);
2194	/// assert_eq!(Some(`1`), len("(a)\|(b)")?);
2195	/// assert_eq!(Some(`2`), len("(a)(b)\|(c)(d)")?);
2196	/// assert_eq!(None, len("(a)\|b")?);
2197	/// assert_eq!(None, len("a\|(b)")?);
2198	/// assert_eq!(None, len("(b)*")?);
2199	/// assert_eq!(Some(`1`), len("(b)+")?);
2200	///
2201	/// # Ok::<(), Box<dyn std::error::Error>>(())
2202	/// ```
2203	#[inline]
2204	pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205	self.0.static_explicit_captures_len
2206	}
2207
2208	/// Return true if and only if this HIR is a simple literal. This is
2209	/// only true when this HIR expression is either itself a `Literal` or a
2210	/// concatenation of only `Literal`s.
2211	///
2212	/// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2213	/// the empty string are not (even though they contain sub-expressions that
2214	/// are literals).
2215	#[inline]
2216	pub fn is_literal(&self) -> bool {
2217	self.0.literal
2218	}
2219
2220	/// Return true if and only if this HIR is either a simple literal or an
2221	/// alternation of simple literals. This is only
2222	/// true when this HIR expression is either itself a `Literal` or a
2223	/// concatenation of only `Literal`s or an alternation of only `Literal`s.
2224	///
2225	/// For example, `f`, `foo`, `a\|b\|c`, and `foo\|bar\|baz` are alternation
2226	/// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2227	/// (even though that contain sub-expressions that are literals).
2228	#[inline]
2229	pub fn is_alternation_literal(&self) -> bool {
2230	self.0.alternation_literal
2231	}
2232
2233	/// Returns the total amount of heap memory usage, in bytes, used by this
2234	/// `Properties` value.
2235	#[inline]
2236	pub fn memory_usage(&self) -> usize {
2237	core::mem::size_of::<PropertiesI>()
2238	}
2239
2240	/// Returns a new set of properties that corresponds to the union of the
2241	/// iterator of properties given.
2242	///
2243	/// This is useful when one has multiple `Hir` expressions and wants
2244	/// to combine them into a single alternation without constructing the
2245	/// corresponding `Hir`. This routine provides a way of combining the
2246	/// properties of each `Hir` expression into one set of properties
2247	/// representing the union of those expressions.
2248	///
2249	/// # Example: union with HIRs that never match
2250	///
2251	/// This example shows that unioning properties together with one that
2252	/// represents a regex that never matches will "poison" certain attributes,
2253	/// like the minimum and maximum lengths.
2254	///
2255	/// ```
2256	/// use regex_syntax::{hir::Properties, parse};
2257	///
2258	/// let hir1 = parse("ab?c?")?;
2259	/// assert_eq!(Some(`1`), hir1.properties().minimum_len());
2260	/// assert_eq!(Some(`3`), hir1.properties().maximum_len());
2261	///
2262	/// let hir2 = parse(r"[a&&b]")?;
2263	/// assert_eq!(None, hir2.properties().minimum_len());
2264	/// assert_eq!(None, hir2.properties().maximum_len());
2265	///
2266	/// let hir3 = parse(r"wxy?z?")?;
2267	/// assert_eq!(Some(`2`), hir3.properties().minimum_len());
2268	/// assert_eq!(Some(`4`), hir3.properties().maximum_len());
2269	///
2270	/// let unioned = Properties::union([
2271	/// hir1.properties(),
2272	/// hir2.properties(),
2273	/// hir3.properties(),
2274	/// ]);
2275	/// assert_eq!(None, unioned.minimum_len());
2276	/// assert_eq!(None, unioned.maximum_len());
2277	///
2278	/// # Ok::<(), Box<dyn std::error::Error>>(())
2279	/// ```
2280	///
2281	/// The maximum length can also be "poisoned" by a pattern that has no
2282	/// upper bound on the length of a match. The minimum length remains
2283	/// unaffected:
2284	///
2285	/// ```
2286	/// use regex_syntax::{hir::Properties, parse};
2287	///
2288	/// let hir1 = parse("ab?c?")?;
2289	/// assert_eq!(Some(`1`), hir1.properties().minimum_len());
2290	/// assert_eq!(Some(`3`), hir1.properties().maximum_len());
2291	///
2292	/// let hir2 = parse(r"a+")?;
2293	/// assert_eq!(Some(`1`), hir2.properties().minimum_len());
2294	/// assert_eq!(None, hir2.properties().maximum_len());
2295	///
2296	/// let hir3 = parse(r"wxy?z?")?;
2297	/// assert_eq!(Some(`2`), hir3.properties().minimum_len());
2298	/// assert_eq!(Some(`4`), hir3.properties().maximum_len());
2299	///
2300	/// let unioned = Properties::union([
2301	/// hir1.properties(),
2302	/// hir2.properties(),
2303	/// hir3.properties(),
2304	/// ]);
2305	/// assert_eq!(Some(`1`), unioned.minimum_len());
2306	/// assert_eq!(None, unioned.maximum_len());
2307	///
2308	/// # Ok::<(), Box<dyn std::error::Error>>(())
2309	/// ```
2310	pub fn union<I, P>(props: I) -> Properties
2311	where
2312	I: IntoIterator<Item = P>,
2313	P: core::borrow::Borrow<Properties>,
2314	{
2315	let mut it = props.into_iter().peekable();
2316	// While empty alternations aren't possible, we still behave as if they
2317	// are. When we have an empty alternate, then clearly the look-around
2318	// prefix and suffix is empty. Otherwise, it is the intersection of all
2319	// prefixes and suffixes (respectively) of the branches.
2320	let fix = if it.peek().is_none() {
2321	LookSet::empty()
2322	} else {
2323	LookSet::full()
2324	};
2325	// And also, an empty alternate means we have 0 static capture groups,
2326	// but we otherwise start with the number corresponding to the first
2327	// alternate. If any subsequent alternate has a different number of
2328	// static capture groups, then we overall have a variation and not a
2329	// static number of groups.
2330	let static_explicit_captures_len =
2331	it.peek().and_then(\|p\| p.borrow().static_explicit_captures_len());
2332	// The base case is an empty alternation, which matches nothing.
2333	// Note though that empty alternations aren't possible, because the
2334	// Hir::alternation smart constructor rewrites those as empty character
2335	// classes.
2336	let mut props = PropertiesI {
2337	minimum_len: None,
2338	maximum_len: None,
2339	look_set: LookSet::empty(),
2340	look_set_prefix: fix,
2341	look_set_suffix: fix,
2342	look_set_prefix_any: LookSet::empty(),
2343	look_set_suffix_any: LookSet::empty(),
2344	utf8: `true`,
2345	explicit_captures_len: `0`,
2346	static_explicit_captures_len,
2347	literal: `false`,
2348	alternation_literal: `true`,
2349	};
2350	let (mut min_poisoned, mut max_poisoned) = (`false`, `false`);
2351	// Handle properties that need to visit every child hir.
2352	for prop in it {
2353	let p = prop.borrow();
2354	props.look_set.set_union(p.look_set());
2355	props.look_set_prefix.set_intersect(p.look_set_prefix());
2356	props.look_set_suffix.set_intersect(p.look_set_suffix());
2357	props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358	props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359	props.utf8 = props.utf8 && p.is_utf8();
2360	props.explicit_captures_len = props
2361	.explicit_captures_len
2362	.saturating_add(p.explicit_captures_len());
2363	if props.static_explicit_captures_len
2364	!= p.static_explicit_captures_len()
2365	{
2366	props.static_explicit_captures_len = None;
2367	}
2368	props.alternation_literal =
2369	props.alternation_literal && p.is_literal();
2370	if !min_poisoned {
2371	if let Some(xmin) = p.minimum_len() {
2372	if props.minimum_len.map_or(`true`, \|pmin\| xmin < pmin) {
2373	props.minimum_len = Some(xmin);
2374	}
2375	} else {
2376	props.minimum_len = None;
2377	min_poisoned = `true`;
2378	}
2379	}
2380	if !max_poisoned {
2381	if let Some(xmax) = p.maximum_len() {
2382	if props.maximum_len.map_or(`true`, \|pmax\| xmax > pmax) {
2383	props.maximum_len = Some(xmax);
2384	}
2385	} else {
2386	props.maximum_len = None;
2387	max_poisoned = `true`;
2388	}
2389	}
2390	}
2391	Properties(Box::new(props))
2392	}
2393	}
2394
2395	impl Properties {
2396	/// Create a new set of HIR properties for an empty regex.
2397	fn empty() -> Properties {
2398	let inner = PropertiesI {
2399	minimum_len: Some(`0`),
2400	maximum_len: Some(`0`),
2401	look_set: LookSet::empty(),
2402	look_set_prefix: LookSet::empty(),
2403	look_set_suffix: LookSet::empty(),
2404	look_set_prefix_any: LookSet::empty(),
2405	look_set_suffix_any: LookSet::empty(),
2406	// It is debatable whether an empty regex always matches at valid
2407	// UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2408	// it is clearly false. There are, for example, many empty strings
2409	// between the bytes encoding a '☃'.
2410	//
2411	// However, when Unicode mode is enabled, the fundamental atom
2412	// of matching is really a codepoint. And in that scenario, an
2413	// empty regex is defined to only match at valid UTF-8 boundaries
2414	// and to never split a codepoint. It just so happens that this
2415	// enforcement is somewhat tricky to do for regexes that match
2416	// the empty string inside regex engines themselves. It usually
2417	// requires some layer above the regex engine to filter out such
2418	// matches.
2419	//
2420	// In any case, 'true' is really the only coherent option. If it
2421	// were false, for example, then 'a' would also need to be false*
2422	// since it too can match the empty string.
2423	utf8: `true`,
2424	explicit_captures_len: `0`,
2425	static_explicit_captures_len: Some(`0`),
2426	literal: `false`,
2427	alternation_literal: `false`,
2428	};
2429	Properties(Box::new(inner))
2430	}
2431
2432	/// Create a new set of HIR properties for a literal regex.
2433	fn literal(lit: &Literal) -> Properties {
2434	let inner = PropertiesI {
2435	minimum_len: Some(lit.0.len()),
2436	maximum_len: Some(lit.0.len()),
2437	look_set: LookSet::empty(),
2438	look_set_prefix: LookSet::empty(),
2439	look_set_suffix: LookSet::empty(),
2440	look_set_prefix_any: LookSet::empty(),
2441	look_set_suffix_any: LookSet::empty(),
2442	utf8: core::str::from_utf8(&lit.0).is_ok(),
2443	explicit_captures_len: `0`,
2444	static_explicit_captures_len: Some(`0`),
2445	literal: `true`,
2446	alternation_literal: `true`,
2447	};
2448	Properties(Box::new(inner))
2449	}
2450
2451	/// Create a new set of HIR properties for a character class.
2452	fn class(class: &Class) -> Properties {
2453	let inner = PropertiesI {
2454	minimum_len: class.minimum_len(),
2455	maximum_len: class.maximum_len(),
2456	look_set: LookSet::empty(),
2457	look_set_prefix: LookSet::empty(),
2458	look_set_suffix: LookSet::empty(),
2459	look_set_prefix_any: LookSet::empty(),
2460	look_set_suffix_any: LookSet::empty(),
2461	utf8: class.is_utf8(),
2462	explicit_captures_len: `0`,
2463	static_explicit_captures_len: Some(`0`),
2464	literal: `false`,
2465	alternation_literal: `false`,
2466	};
2467	Properties(Box::new(inner))
2468	}
2469
2470	/// Create a new set of HIR properties for a look-around assertion.
2471	fn look(look: Look) -> Properties {
2472	let inner = PropertiesI {
2473	minimum_len: Some(`0`),
2474	maximum_len: Some(`0`),
2475	look_set: LookSet::singleton(look),
2476	look_set_prefix: LookSet::singleton(look),
2477	look_set_suffix: LookSet::singleton(look),
2478	look_set_prefix_any: LookSet::singleton(look),
2479	look_set_suffix_any: LookSet::singleton(look),
2480	// This requires a little explanation. Basically, we don't consider
2481	// matching an empty string to be equivalent to matching invalid
2482	// UTF-8, even though technically matching every empty string will
2483	// split the UTF-8 encoding of a single codepoint when treating a
2484	// UTF-8 encoded string as a sequence of bytes. Our defense here is
2485	// that in such a case, a codepoint should logically be treated as
2486	// the fundamental atom for matching, and thus the only valid match
2487	// points are between codepoints and not bytes.
2488	//
2489	// More practically, this is true here because it's also true
2490	// for 'Hir::empty()', otherwise something like 'a' would be*
2491	// considered to match invalid UTF-8. That in turn makes this
2492	// property borderline useless.
2493	utf8: `true`,
2494	explicit_captures_len: `0`,
2495	static_explicit_captures_len: Some(`0`),
2496	literal: `false`,
2497	alternation_literal: `false`,
2498	};
2499	Properties(Box::new(inner))
2500	}
2501
2502	/// Create a new set of HIR properties for a repetition.
2503	fn repetition(rep: &Repetition) -> Properties {
2504	let p = rep.sub.properties();
2505	let minimum_len = p.minimum_len().map(\|child_min\| {
2506	let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2507	child_min.saturating_mul(rep_min)
2508	});
2509	let maximum_len = rep.max.and_then(\|rep_max\| {
2510	let rep_max = usize::try_from(rep_max).ok()?;
2511	let child_max = p.maximum_len()?;
2512	child_max.checked_mul(rep_max)
2513	});
2514
2515	let mut inner = PropertiesI {
2516	minimum_len,
2517	maximum_len,
2518	look_set: p.look_set(),
2519	look_set_prefix: LookSet::empty(),
2520	look_set_suffix: LookSet::empty(),
2521	look_set_prefix_any: p.look_set_prefix_any(),
2522	look_set_suffix_any: p.look_set_suffix_any(),
2523	utf8: p.is_utf8(),
2524	explicit_captures_len: p.explicit_captures_len(),
2525	static_explicit_captures_len: p.static_explicit_captures_len(),
2526	literal: `false`,
2527	alternation_literal: `false`,
2528	};
2529	// If the repetition operator can match the empty string, then its
2530	// lookset prefix and suffixes themselves remain empty since they are
2531	// no longer required to match.
2532	if rep.min > `0` {
2533	inner.look_set_prefix = p.look_set_prefix();
2534	inner.look_set_suffix = p.look_set_suffix();
2535	}
2536	// If the static captures len of the sub-expression is not known or
2537	// is greater than zero, then it automatically propagates to the
2538	// repetition, regardless of the repetition. Otherwise, it might
2539	// change, but only when the repetition can match 0 times.
2540	if rep.min == `0`
2541	&& inner.static_explicit_captures_len.map_or(`false`, \|len\| len > `0`)
2542	{
2543	// If we require a match 0 times, then our captures len is
2544	// guaranteed to be zero. Otherwise, if we can* match the empty*
2545	// string, then it's impossible to know how many captures will be
2546	// in the resulting match.
2547	if rep.max == Some(`0`) {
2548	inner.static_explicit_captures_len = Some(`0`);
2549	} else {
2550	inner.static_explicit_captures_len = None;
2551	}
2552	}
2553	Properties(Box::new(inner))
2554	}
2555
2556	/// Create a new set of HIR properties for a capture.
2557	fn capture(capture: &Capture) -> Properties {
2558	let p = capture.sub.properties();
2559	Properties(Box::new(PropertiesI {
2560	explicit_captures_len: p.explicit_captures_len().saturating_add(`1`),
2561	static_explicit_captures_len: p
2562	.static_explicit_captures_len()
2563	.map(\|len\| len.saturating_add(`1`)),
2564	literal: `false`,
2565	alternation_literal: `false`,
2566	..*p.0.clone()
2567	}))
2568	}
2569
2570	/// Create a new set of HIR properties for a concatenation.
2571	fn concat(concat: &[Hir]) -> Properties {
2572	// The base case is an empty concatenation, which matches the empty
2573	// string. Note though that empty concatenations aren't possible,
2574	// because the Hir::concat smart constructor rewrites those as
2575	// Hir::empty.
2576	let mut props = PropertiesI {
2577	minimum_len: Some(`0`),
2578	maximum_len: Some(`0`),
2579	look_set: LookSet::empty(),
2580	look_set_prefix: LookSet::empty(),
2581	look_set_suffix: LookSet::empty(),
2582	look_set_prefix_any: LookSet::empty(),
2583	look_set_suffix_any: LookSet::empty(),
2584	utf8: `true`,
2585	explicit_captures_len: `0`,
2586	static_explicit_captures_len: Some(`0`),
2587	literal: `true`,
2588	alternation_literal: `true`,
2589	};
2590	// Handle properties that need to visit every child hir.
2591	for x in concat.iter() {
2592	let p = x.properties();
2593	props.look_set.set_union(p.look_set());
2594	props.utf8 = props.utf8 && p.is_utf8();
2595	props.explicit_captures_len = props
2596	.explicit_captures_len
2597	.saturating_add(p.explicit_captures_len());
2598	props.static_explicit_captures_len = p
2599	.static_explicit_captures_len()
2600	.and_then(\|len1\| {
2601	Some((len1, props.static_explicit_captures_len?))
2602	})
2603	.and_then(\|(len1, len2)\| Some(len1.saturating_add(len2)));
2604	props.literal = props.literal && p.is_literal();
2605	props.alternation_literal =
2606	props.alternation_literal && p.is_alternation_literal();
2607	if let Some(minimum_len) = props.minimum_len {
2608	match p.minimum_len() {
2609	None => props.minimum_len = None,
2610	Some(len) => {
2611	// We use saturating arithmetic here because the
2612	// minimum is just a lower bound. We can't go any
2613	// higher than what our number types permit.
2614	props.minimum_len =
2615	Some(minimum_len.saturating_add(len));
2616	}
2617	}
2618	}
2619	if let Some(maximum_len) = props.maximum_len {
2620	match p.maximum_len() {
2621	None => props.maximum_len = None,
2622	Some(len) => {
2623	props.maximum_len = maximum_len.checked_add(len)
2624	}
2625	}
2626	}
2627	}
2628	// Handle the prefix properties, which only requires visiting
2629	// child exprs until one matches more than the empty string.
2630	let mut it = concat.iter();
2631	while let Some(x) = it.next() {
2632	props.look_set_prefix.set_union(x.properties().look_set_prefix());
2633	props
2634	.look_set_prefix_any
2635	.set_union(x.properties().look_set_prefix_any());
2636	if x.properties().maximum_len().map_or(`true`, \|x\| x > `0`) {
2637	break;
2638	}
2639	}
2640	// Same thing for the suffix properties, but in reverse.
2641	let mut it = concat.iter().rev();
2642	while let Some(x) = it.next() {
2643	props.look_set_suffix.set_union(x.properties().look_set_suffix());
2644	props
2645	.look_set_suffix_any
2646	.set_union(x.properties().look_set_suffix_any());
2647	if x.properties().maximum_len().map_or(`true`, \|x\| x > `0`) {
2648	break;
2649	}
2650	}
2651	Properties(Box::new(props))
2652	}
2653
2654	/// Create a new set of HIR properties for a concatenation.
2655	fn alternation(alts: &[Hir]) -> Properties {
2656	Properties::union(alts.iter().map(\|hir\| hir.properties()))
2657	}
2658	}
2659
2660	/// A set of look-around assertions.
2661	///
2662	/// This is useful for efficiently tracking look-around assertions. For
2663	/// example, an [`Hir`] provides properties that return `LookSet`s.
2664	#[derive(Clone, Copy, Default, Eq, PartialEq)]
2665	pub struct LookSet {
2666	/// The underlying representation this set is exposed to make it possible
2667	/// to store it somewhere efficiently. The representation is that
2668	/// of a bitset, where each assertion occupies bit `i` where `i =
2669	/// Look::as_repr()`.
2670	///
2671	/// Note that users of this internal representation must permit the full
2672	/// range of `u16` values to be represented. For example, even if the
2673	/// current implementation only makes use of the 10 least significant bits,
2674	/// it may use more bits in a future semver compatible release.
2675	pub bits: u32,
2676	}
2677
2678	impl LookSet {
2679	/// Create an empty set of look-around assertions.
2680	#[inline]
2681	pub fn empty() -> LookSet {
2682	LookSet { bits: `0` }
2683	}
2684
2685	/// Create a full set of look-around assertions.
2686	///
2687	/// This set contains all possible look-around assertions.
2688	#[inline]
2689	pub fn full() -> LookSet {
2690	LookSet { bits: !`0` }
2691	}
2692
2693	/// Create a look-around set containing the look-around assertion given.
2694	///
2695	/// This is a convenience routine for creating an empty set and inserting
2696	/// one look-around assertions.
2697	#[inline]
2698	pub fn singleton(look: Look) -> LookSet {
2699	LookSet::empty().insert(look)
2700	}
2701
2702	/// Returns the total number of look-around assertions in this set.
2703	#[inline]
2704	pub fn len(self) -> usize {
2705	// OK because max value always fits in a u8, which in turn always
2706	// fits in a usize, regardless of target.
2707	usize::try_from(self.bits.count_ones()).unwrap()
2708	}
2709
2710	/// Returns true if and only if this set is empty.
2711	#[inline]
2712	pub fn is_empty(self) -> bool {
2713	self.len() == `0`
2714	}
2715
2716	/// Returns true if and only if the given look-around assertion is in this
2717	/// set.
2718	#[inline]
2719	pub fn contains(self, look: Look) -> bool {
2720	self.bits & look.as_repr() != `0`
2721	}
2722
2723	/// Returns true if and only if this set contains any anchor assertions.
2724	/// This includes both "start/end of haystack" and "start/end of line."
2725	#[inline]
2726	pub fn contains_anchor(&self) -> bool {
2727	self.contains_anchor_haystack() \|\| self.contains_anchor_line()
2728	}
2729
2730	/// Returns true if and only if this set contains any "start/end of
2731	/// haystack" anchors. This doesn't include "start/end of line" anchors.
2732	#[inline]
2733	pub fn contains_anchor_haystack(&self) -> bool {
2734	self.contains(Look::Start) \|\| self.contains(Look::End)
2735	}
2736
2737	/// Returns true if and only if this set contains any "start/end of line"
2738	/// anchors. This doesn't include "start/end of haystack" anchors. This
2739	/// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2740	#[inline]
2741	pub fn contains_anchor_line(&self) -> bool {
2742	self.contains(Look::StartLF)
2743	\|\| self.contains(Look::EndLF)
2744	\|\| self.contains(Look::StartCRLF)
2745	\|\| self.contains(Look::EndCRLF)
2746	}
2747
2748	/// Returns true if and only if this set contains any "start/end of line"
2749	/// anchors that only treat `\n` as line terminators. This does not include
2750	/// haystack anchors or CRLF aware line anchors.
2751	#[inline]
2752	pub fn contains_anchor_lf(&self) -> bool {
2753	self.contains(Look::StartLF) \|\| self.contains(Look::EndLF)
2754	}
2755
2756	/// Returns true if and only if this set contains any "start/end of line"
2757	/// anchors that are CRLF-aware. This doesn't include "start/end of
2758	/// haystack" or "start/end of line-feed" anchors.
2759	#[inline]
2760	pub fn contains_anchor_crlf(&self) -> bool {
2761	self.contains(Look::StartCRLF) \|\| self.contains(Look::EndCRLF)
2762	}
2763
2764	/// Returns true if and only if this set contains any word boundary or
2765	/// negated word boundary assertions. This include both Unicode and ASCII
2766	/// word boundaries.
2767	#[inline]
2768	pub fn contains_word(self) -> bool {
2769	self.contains_word_unicode() \|\| self.contains_word_ascii()
2770	}
2771
2772	/// Returns true if and only if this set contains any Unicode word boundary
2773	/// or negated Unicode word boundary assertions.
2774	#[inline]
2775	pub fn contains_word_unicode(self) -> bool {
2776	self.contains(Look::WordUnicode)
2777	\|\| self.contains(Look::WordUnicodeNegate)
2778	\|\| self.contains(Look::WordStartUnicode)
2779	\|\| self.contains(Look::WordEndUnicode)
2780	\|\| self.contains(Look::WordStartHalfUnicode)
2781	\|\| self.contains(Look::WordEndHalfUnicode)
2782	}
2783
2784	/// Returns true if and only if this set contains any ASCII word boundary
2785	/// or negated ASCII word boundary assertions.
2786	#[inline]
2787	pub fn contains_word_ascii(self) -> bool {
2788	self.contains(Look::WordAscii)
2789	\|\| self.contains(Look::WordAsciiNegate)
2790	\|\| self.contains(Look::WordStartAscii)
2791	\|\| self.contains(Look::WordEndAscii)
2792	\|\| self.contains(Look::WordStartHalfAscii)
2793	\|\| self.contains(Look::WordEndHalfAscii)
2794	}
2795
2796	/// Returns an iterator over all of the look-around assertions in this set.
2797	#[inline]
2798	pub fn iter(self) -> LookSetIter {
2799	LookSetIter { set: self }
2800	}
2801
2802	/// Return a new set that is equivalent to the original, but with the given
2803	/// assertion added to it. If the assertion is already in the set, then the
2804	/// returned set is equivalent to the original.
2805	#[inline]
2806	pub fn insert(self, look: Look) -> LookSet {
2807	LookSet { bits: self.bits \| look.as_repr() }
2808	}
2809
2810	/// Updates this set in place with the result of inserting the given
2811	/// assertion into this set.
2812	#[inline]
2813	pub fn set_insert(&mut self, look: Look) {
2814	*self = self.insert(look);
2815	}
2816
2817	/// Return a new set that is equivalent to the original, but with the given
2818	/// assertion removed from it. If the assertion is not in the set, then the
2819	/// returned set is equivalent to the original.
2820	#[inline]
2821	pub fn remove(self, look: Look) -> LookSet {
2822	LookSet { bits: self.bits & !look.as_repr() }
2823	}
2824
2825	/// Updates this set in place with the result of removing the given
2826	/// assertion from this set.
2827	#[inline]
2828	pub fn set_remove(&mut self, look: Look) {
2829	*self = self.remove(look);
2830	}
2831
2832	/// Returns a new set that is the result of subtracting the given set from
2833	/// this set.
2834	#[inline]
2835	pub fn subtract(self, other: LookSet) -> LookSet {
2836	LookSet { bits: self.bits & !other.bits }
2837	}
2838
2839	/// Updates this set in place with the result of subtracting the given set
2840	/// from this set.
2841	#[inline]
2842	pub fn set_subtract(&mut self, other: LookSet) {
2843	*self = self.subtract(other);
2844	}
2845
2846	/// Returns a new set that is the union of this and the one given.
2847	#[inline]
2848	pub fn union(self, other: LookSet) -> LookSet {
2849	LookSet { bits: self.bits \| other.bits }
2850	}
2851
2852	/// Updates this set in place with the result of unioning it with the one
2853	/// given.
2854	#[inline]
2855	pub fn set_union(&mut self, other: LookSet) {
2856	*self = self.union(other);
2857	}
2858
2859	/// Returns a new set that is the intersection of this and the one given.
2860	#[inline]
2861	pub fn intersect(self, other: LookSet) -> LookSet {
2862	LookSet { bits: self.bits & other.bits }
2863	}
2864
2865	/// Updates this set in place with the result of intersecting it with the
2866	/// one given.
2867	#[inline]
2868	pub fn set_intersect(&mut self, other: LookSet) {
2869	*self = self.intersect(other);
2870	}
2871
2872	/// Return a `LookSet` from the slice given as a native endian 32-bit
2873	/// integer.
2874	///
2875	/// # Panics
2876	///
2877	/// This panics if `slice.len() < 4`.
2878	#[inline]
2879	pub fn read_repr(slice: &[u8]) -> LookSet {
2880	let bits = u32::from_ne_bytes(slice[..`4`].try_into().unwrap());
2881	LookSet { bits }
2882	}
2883
2884	/// Write a `LookSet` as a native endian 32-bit integer to the beginning
2885	/// of the slice given.
2886	///
2887	/// # Panics
2888	///
2889	/// This panics if `slice.len() < 4`.
2890	#[inline]
2891	pub fn write_repr(self, slice: &mut [u8]) {
2892	let raw = self.bits.to_ne_bytes();
2893	slice[`0`] = raw[`0`];
2894	slice[`1`] = raw[`1`];
2895	slice[`2`] = raw[`2`];
2896	slice[`3`] = raw[`3`];
2897	}
2898	}
2899
2900	impl core::fmt::Debug for LookSet {
2901	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2902	if self.is_empty() {
2903	return write!(f, "∅");
2904	}
2905	for look: Look in self.iter() {
2906	write!(f, "{}", look.as_char())?;
2907	}
2908	Ok(())
2909	}
2910	}
2911
2912	/// An iterator over all look-around assertions in a [`LookSet`].
2913	///
2914	/// This iterator is created by [`LookSet::iter`].
2915	#[derive(Clone, Debug)]
2916	pub struct LookSetIter {
2917	set: LookSet,
2918	}
2919
2920	impl Iterator for LookSetIter {
2921	type Item = Look;
2922
2923	#[inline]
2924	fn next(&mut self) -> Option<Look> {
2925	if self.set.is_empty() {
2926	return None;
2927	}
2928	// We'll never have more than u8::MAX distinct look-around assertions,
2929	// so 'bit' will always fit into a u16.
2930	let bit: u16 = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2931	let look: Look = Look::from_repr(`1` << bit)?;
2932	self.set = self.set.remove(look);
2933	Some(look)
2934	}
2935	}
2936
2937	/// Given a sequence of HIR values where each value corresponds to a Unicode
2938	/// class (or an all-ASCII byte class), return a single Unicode class
2939	/// corresponding to the union of the classes found.
2940	fn class_chars(hirs: &[Hir]) -> Option<Class> {
2941	let mut cls: ClassUnicode = ClassUnicode::new(ranges:vec![]);
2942	for hir: &Hir in hirs.iter() {
2943	match *hir.kind() {
2944	HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2945	cls.union(cls2);
2946	}
2947	HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2948	cls.union(&cls2.to_unicode_class()?);
2949	}
2950	_ => return None,
2951	};
2952	}
2953	Some(Class::Unicode(cls))
2954	}
2955
2956	/// Given a sequence of HIR values where each value corresponds to a byte class
2957	/// (or an all-ASCII Unicode class), return a single byte class corresponding
2958	/// to the union of the classes found.
2959	fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2960	let mut cls: ClassBytes = ClassBytes::new(ranges:vec![]);
2961	for hir: &Hir in hirs.iter() {
2962	match *hir.kind() {
2963	HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2964	cls.union(&cls2.to_byte_class()?);
2965	}
2966	HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2967	cls.union(cls2);
2968	}
2969	_ => return None,
2970	};
2971	}
2972	Some(Class::Bytes(cls))
2973	}
2974
2975	/// Given a sequence of HIR values where each value corresponds to a literal
2976	/// that is a single `char`, return that sequence of `char`s. Otherwise return
2977	/// None. No deduplication is done.
2978	fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2979	let mut singletons: Vec = vec![];
2980	for hir: &Hir in hirs.iter() {
2981	let literal: &Box<[u8]> = match *hir.kind() {
2982	HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2983	_ => return None,
2984	};
2985	let ch: char = match crate::debug::utf8_decode(bytes:literal) {
2986	None => return None,
2987	Some(Err(_)) => return None,
2988	Some(Ok(ch: char)) => ch,
2989	};
2990	if literal.len() != ch.len_utf8() {
2991	return None;
2992	}
2993	singletons.push(ch);
2994	}
2995	Some(singletons)
2996	}
2997
2998	/// Given a sequence of HIR values where each value corresponds to a literal
2999	/// that is a single byte, return that sequence of bytes. Otherwise return
3000	/// None. No deduplication is done.
3001	fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3002	let mut singletons: Vec = vec![];
3003	for hir: &Hir in hirs.iter() {
3004	let literal: &Box<[u8]> = match *hir.kind() {
3005	HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
3006	_ => return None,
3007	};
3008	if literal.len() != `1` {
3009	return None;
3010	}
3011	singletons.push(literal[`0`]);
3012	}
3013	Some(singletons)
3014	}
3015
3016	/// Looks for a common prefix in the list of alternation branches given. If one
3017	/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3018	/// Otherwise, the original given list of branches is returned unmodified.
3019	///
3020	/// This is not quite as good as it could be. Right now, it requires that
3021	/// all branches are 'Concat' expressions. It also doesn't do well with
3022	/// literals. For example, given 'foofoo\|foobar', it will not refactor it to
3023	/// 'foo(?:foo\|bar)' because literals are flattened into their own special
3024	/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3025	/// instead of a string of bytes because of this. Otherwise, handling the
3026	/// current representation in this routine will be pretty gnarly. Sigh.)
3027	fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3028	if hirs.len() <= `1` {
3029	return Err(hirs);
3030	}
3031	let mut prefix = match hirs[`0`].kind() {
3032	HirKind::Concat(ref xs) => &**xs,
3033	_ => return Err(hirs),
3034	};
3035	if prefix.is_empty() {
3036	return Err(hirs);
3037	}
3038	for h in hirs.iter().skip(`1`) {
3039	let concat = match h.kind() {
3040	HirKind::Concat(ref xs) => xs,
3041	_ => return Err(hirs),
3042	};
3043	let common_len = prefix
3044	.iter()
3045	.zip(concat.iter())
3046	.take_while(\|(x, y)\| x == y)
3047	.count();
3048	prefix = &prefix[..common_len];
3049	if prefix.is_empty() {
3050	return Err(hirs);
3051	}
3052	}
3053	let len = prefix.len();
3054	assert_ne!(`0`, len);
3055	let mut prefix_concat = vec![];
3056	let mut suffix_alts = vec![];
3057	for h in hirs {
3058	let mut concat = match h.into_kind() {
3059	HirKind::Concat(xs) => xs,
3060	// We required all sub-expressions to be
3061	// concats above, so we're only here if we
3062	// have a concat.
3063	_ => unreachable!(),
3064	};
3065	suffix_alts.push(Hir::concat(concat.split_off(len)));
3066	if prefix_concat.is_empty() {
3067	prefix_concat = concat;
3068	}
3069	}
3070	let mut concat = prefix_concat;
3071	concat.push(Hir::alternation(suffix_alts));
3072	Ok(Hir::concat(concat))
3073	}
3074
3075	#[cfg(test)]
3076	mod tests {
3077	use super::*;
3078
3079	fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3080	let ranges: Vec<ClassUnicodeRange> = ranges
3081	.iter()
3082	.map(\|&(s, e)\| ClassUnicodeRange::new(s, e))
3083	.collect();
3084	ClassUnicode::new(ranges)
3085	}
3086
3087	fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3088	let ranges: Vec<ClassBytesRange> =
3089	ranges.iter().map(\|&(s, e)\| ClassBytesRange::new(s, e)).collect();
3090	ClassBytes::new(ranges)
3091	}
3092
3093	fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3094	cls.iter().map(\|x\| (x.start(), x.end())).collect()
3095	}
3096
3097	#[cfg(feature = "unicode-case")]
3098	fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3099	let mut cls_ = cls.clone();
3100	cls_.case_fold_simple();
3101	cls_
3102	}
3103
3104	fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105	let mut cls_ = cls1.clone();
3106	cls_.union(cls2);
3107	cls_
3108	}
3109
3110	fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3111	let mut cls_ = cls1.clone();
3112	cls_.intersect(cls2);
3113	cls_
3114	}
3115
3116	fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3117	let mut cls_ = cls1.clone();
3118	cls_.difference(cls2);
3119	cls_
3120	}
3121
3122	fn usymdifference(
3123	cls1: &ClassUnicode,
3124	cls2: &ClassUnicode,
3125	) -> ClassUnicode {
3126	let mut cls_ = cls1.clone();
3127	cls_.symmetric_difference(cls2);
3128	cls_
3129	}
3130
3131	fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3132	let mut cls_ = cls.clone();
3133	cls_.negate();
3134	cls_
3135	}
3136
3137	fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3138	cls.iter().map(\|x\| (x.start(), x.end())).collect()
3139	}
3140
3141	fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3142	let mut cls_ = cls.clone();
3143	cls_.case_fold_simple();
3144	cls_
3145	}
3146
3147	fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148	let mut cls_ = cls1.clone();
3149	cls_.union(cls2);
3150	cls_
3151	}
3152
3153	fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154	let mut cls_ = cls1.clone();
3155	cls_.intersect(cls2);
3156	cls_
3157	}
3158
3159	fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3160	let mut cls_ = cls1.clone();
3161	cls_.difference(cls2);
3162	cls_
3163	}
3164
3165	fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3166	let mut cls_ = cls1.clone();
3167	cls_.symmetric_difference(cls2);
3168	cls_
3169	}
3170
3171	fn bnegate(cls: &ClassBytes) -> ClassBytes {
3172	let mut cls_ = cls.clone();
3173	cls_.negate();
3174	cls_
3175	}
3176
3177	#[test]
3178	fn class_range_canonical_unicode() {
3179	let range = ClassUnicodeRange::new('`\u{00FF}`', '`\0`');
3180	assert_eq!('`\0`', range.start());
3181	assert_eq!('`\u{00FF}`', range.end());
3182	}
3183
3184	#[test]
3185	fn class_range_canonical_bytes() {
3186	let range = ClassBytesRange::new(b'`\xFF`', b'`\0`');
3187	assert_eq!(b'`\0`', range.start());
3188	assert_eq!(b'`\xFF`', range.end());
3189	}
3190
3191	#[test]
3192	fn class_canonicalize_unicode() {
3193	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3194	let expected = vec![('a', 'c'), ('x', 'z')];
3195	assert_eq!(expected, uranges(&cls));
3196
3197	let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3198	let expected = vec![('a', 'c'), ('x', 'z')];
3199	assert_eq!(expected, uranges(&cls));
3200
3201	let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3202	let expected = vec![('w', 'z')];
3203	assert_eq!(expected, uranges(&cls));
3204
3205	let cls = uclass(&[
3206	('c', 'f'),
3207	('a', 'g'),
3208	('d', 'j'),
3209	('a', 'c'),
3210	('m', 'p'),
3211	('l', 's'),
3212	]);
3213	let expected = vec![('a', 'j'), ('l', 's')];
3214	assert_eq!(expected, uranges(&cls));
3215
3216	let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3217	let expected = vec![('u', 'z')];
3218	assert_eq!(expected, uranges(&cls));
3219
3220	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`'), ('`\x00`', '`\u{10FFFF}`')]);
3221	let expected = vec![('`\x00`', '`\u{10FFFF}`')];
3222	assert_eq!(expected, uranges(&cls));
3223
3224	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3225	let expected = vec![('a', 'b')];
3226	assert_eq!(expected, uranges(&cls));
3227	}
3228
3229	#[test]
3230	fn class_canonicalize_bytes() {
3231	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3232	let expected = vec![(b'a', b'c'), (b'x', b'z')];
3233	assert_eq!(expected, branges(&cls));
3234
3235	let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3236	let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237	assert_eq!(expected, branges(&cls));
3238
3239	let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3240	let expected = vec![(b'w', b'z')];
3241	assert_eq!(expected, branges(&cls));
3242
3243	let cls = bclass(&[
3244	(b'c', b'f'),
3245	(b'a', b'g'),
3246	(b'd', b'j'),
3247	(b'a', b'c'),
3248	(b'm', b'p'),
3249	(b'l', b's'),
3250	]);
3251	let expected = vec![(b'a', b'j'), (b'l', b's')];
3252	assert_eq!(expected, branges(&cls));
3253
3254	let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3255	let expected = vec![(b'u', b'z')];
3256	assert_eq!(expected, branges(&cls));
3257
3258	let cls = bclass(&[(b'`\x00`', b'`\xFF`'), (b'`\x00`', b'`\xFF`')]);
3259	let expected = vec![(b'`\x00`', b'`\xFF`')];
3260	assert_eq!(expected, branges(&cls));
3261
3262	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3263	let expected = vec![(b'a', b'b')];
3264	assert_eq!(expected, branges(&cls));
3265	}
3266
3267	#[test]
3268	#[cfg(feature = "unicode-case")]
3269	fn class_case_fold_unicode() {
3270	let cls = uclass(&[
3271	('C', 'F'),
3272	('A', 'G'),
3273	('D', 'J'),
3274	('A', 'C'),
3275	('M', 'P'),
3276	('L', 'S'),
3277	('c', 'f'),
3278	]);
3279	let expected = uclass(&[
3280	('A', 'J'),
3281	('L', 'S'),
3282	('a', 'j'),
3283	('l', 's'),
3284	('`\u{17F}`', '`\u{17F}`'),
3285	]);
3286	assert_eq!(expected, ucasefold(&cls));
3287
3288	let cls = uclass(&[('A', 'Z')]);
3289	let expected = uclass(&[
3290	('A', 'Z'),
3291	('a', 'z'),
3292	('`\u{17F}`', '`\u{17F}`'),
3293	('`\u{212A}`', '`\u{212A}`'),
3294	]);
3295	assert_eq!(expected, ucasefold(&cls));
3296
3297	let cls = uclass(&[('a', 'z')]);
3298	let expected = uclass(&[
3299	('A', 'Z'),
3300	('a', 'z'),
3301	('`\u{17F}`', '`\u{17F}`'),
3302	('`\u{212A}`', '`\u{212A}`'),
3303	]);
3304	assert_eq!(expected, ucasefold(&cls));
3305
3306	let cls = uclass(&[('A', 'A'), ('_', '_')]);
3307	let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3308	assert_eq!(expected, ucasefold(&cls));
3309
3310	let cls = uclass(&[('A', 'A'), ('=', '=')]);
3311	let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3312	assert_eq!(expected, ucasefold(&cls));
3313
3314	let cls = uclass(&[('`\x00`', '`\x10`')]);
3315	assert_eq!(cls, ucasefold(&cls));
3316
3317	let cls = uclass(&[('k', 'k')]);
3318	let expected =
3319	uclass(&[('K', 'K'), ('k', 'k'), ('`\u{212A}`', '`\u{212A}`')]);
3320	assert_eq!(expected, ucasefold(&cls));
3321
3322	let cls = uclass(&[('@', '@')]);
3323	assert_eq!(cls, ucasefold(&cls));
3324	}
3325
3326	#[test]
3327	#[cfg(not(feature = "unicode-case"))]
3328	fn class_case_fold_unicode_disabled() {
3329	let mut cls = uclass(&[
3330	('C', 'F'),
3331	('A', 'G'),
3332	('D', 'J'),
3333	('A', 'C'),
3334	('M', 'P'),
3335	('L', 'S'),
3336	('c', 'f'),
3337	]);
3338	assert!(cls.try_case_fold_simple().is_err());
3339	}
3340
3341	#[test]
3342	#[should_panic]
3343	#[cfg(not(feature = "unicode-case"))]
3344	fn class_case_fold_unicode_disabled_panics() {
3345	let mut cls = uclass(&[
3346	('C', 'F'),
3347	('A', 'G'),
3348	('D', 'J'),
3349	('A', 'C'),
3350	('M', 'P'),
3351	('L', 'S'),
3352	('c', 'f'),
3353	]);
3354	cls.case_fold_simple();
3355	}
3356
3357	#[test]
3358	fn class_case_fold_bytes() {
3359	let cls = bclass(&[
3360	(b'C', b'F'),
3361	(b'A', b'G'),
3362	(b'D', b'J'),
3363	(b'A', b'C'),
3364	(b'M', b'P'),
3365	(b'L', b'S'),
3366	(b'c', b'f'),
3367	]);
3368	let expected =
3369	bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3370	assert_eq!(expected, bcasefold(&cls));
3371
3372	let cls = bclass(&[(b'A', b'Z')]);
3373	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3374	assert_eq!(expected, bcasefold(&cls));
3375
3376	let cls = bclass(&[(b'a', b'z')]);
3377	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378	assert_eq!(expected, bcasefold(&cls));
3379
3380	let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3381	let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3382	assert_eq!(expected, bcasefold(&cls));
3383
3384	let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3385	let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3386	assert_eq!(expected, bcasefold(&cls));
3387
3388	let cls = bclass(&[(b'`\x00`', b'`\x10`')]);
3389	assert_eq!(cls, bcasefold(&cls));
3390
3391	let cls = bclass(&[(b'k', b'k')]);
3392	let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3393	assert_eq!(expected, bcasefold(&cls));
3394
3395	let cls = bclass(&[(b'@', b'@')]);
3396	assert_eq!(cls, bcasefold(&cls));
3397	}
3398
3399	#[test]
3400	fn class_negate_unicode() {
3401	let cls = uclass(&[('a', 'a')]);
3402	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x62`', '`\u{10FFFF}`')]);
3403	assert_eq!(expected, unegate(&cls));
3404
3405	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3406	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x63`', '`\u{10FFFF}`')]);
3407	assert_eq!(expected, unegate(&cls));
3408
3409	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3410	let expected = uclass(&[
3411	('`\x00`', '`\x60`'),
3412	('`\x64`', '`\x77`'),
3413	('`\x7B`', '`\u{10FFFF}`'),
3414	]);
3415	assert_eq!(expected, unegate(&cls));
3416
3417	let cls = uclass(&[('`\x00`', 'a')]);
3418	let expected = uclass(&[('`\x62`', '`\u{10FFFF}`')]);
3419	assert_eq!(expected, unegate(&cls));
3420
3421	let cls = uclass(&[('a', '`\u{10FFFF}`')]);
3422	let expected = uclass(&[('`\x00`', '`\x60`')]);
3423	assert_eq!(expected, unegate(&cls));
3424
3425	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
3426	let expected = uclass(&[]);
3427	assert_eq!(expected, unegate(&cls));
3428
3429	let cls = uclass(&[]);
3430	let expected = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
3431	assert_eq!(expected, unegate(&cls));
3432
3433	let cls =
3434	uclass(&[('`\x00`', '`\u{10FFFD}`'), ('`\u{10FFFF}`', '`\u{10FFFF}`')]);
3435	let expected = uclass(&[('`\u{10FFFE}`', '`\u{10FFFE}`')]);
3436	assert_eq!(expected, unegate(&cls));
3437
3438	let cls = uclass(&[('`\x00`', '`\u{D7FF}`')]);
3439	let expected = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
3440	assert_eq!(expected, unegate(&cls));
3441
3442	let cls = uclass(&[('`\x00`', '`\u{D7FE}`')]);
3443	let expected = uclass(&[('`\u{D7FF}`', '`\u{10FFFF}`')]);
3444	assert_eq!(expected, unegate(&cls));
3445
3446	let cls = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
3447	let expected = uclass(&[('`\x00`', '`\u{D7FF}`')]);
3448	assert_eq!(expected, unegate(&cls));
3449
3450	let cls = uclass(&[('`\u{E001}`', '`\u{10FFFF}`')]);
3451	let expected = uclass(&[('`\x00`', '`\u{E000}`')]);
3452	assert_eq!(expected, unegate(&cls));
3453	}
3454
3455	#[test]
3456	fn class_negate_bytes() {
3457	let cls = bclass(&[(b'a', b'a')]);
3458	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x62`', b'`\xFF`')]);
3459	assert_eq!(expected, bnegate(&cls));
3460
3461	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3462	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x63`', b'`\xFF`')]);
3463	assert_eq!(expected, bnegate(&cls));
3464
3465	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3466	let expected = bclass(&[
3467	(b'`\x00`', b'`\x60`'),
3468	(b'`\x64`', b'`\x77`'),
3469	(b'`\x7B`', b'`\xFF`'),
3470	]);
3471	assert_eq!(expected, bnegate(&cls));
3472
3473	let cls = bclass(&[(b'`\x00`', b'a')]);
3474	let expected = bclass(&[(b'`\x62`', b'`\xFF`')]);
3475	assert_eq!(expected, bnegate(&cls));
3476
3477	let cls = bclass(&[(b'a', b'`\xFF`')]);
3478	let expected = bclass(&[(b'`\x00`', b'`\x60`')]);
3479	assert_eq!(expected, bnegate(&cls));
3480
3481	let cls = bclass(&[(b'`\x00`', b'`\xFF`')]);
3482	let expected = bclass(&[]);
3483	assert_eq!(expected, bnegate(&cls));
3484
3485	let cls = bclass(&[]);
3486	let expected = bclass(&[(b'`\x00`', b'`\xFF`')]);
3487	assert_eq!(expected, bnegate(&cls));
3488
3489	let cls = bclass(&[(b'`\x00`', b'`\xFD`'), (b'`\xFF`', b'`\xFF`')]);
3490	let expected = bclass(&[(b'`\xFE`', b'`\xFE`')]);
3491	assert_eq!(expected, bnegate(&cls));
3492	}
3493
3494	#[test]
3495	fn class_union_unicode() {
3496	let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3497	let cls2 = uclass(&[('a', 'z')]);
3498	let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3499	assert_eq!(expected, uunion(&cls1, &cls2));
3500	}
3501
3502	#[test]
3503	fn class_union_bytes() {
3504	let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3505	let cls2 = bclass(&[(b'a', b'z')]);
3506	let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3507	assert_eq!(expected, bunion(&cls1, &cls2));
3508	}
3509
3510	#[test]
3511	fn class_intersect_unicode() {
3512	let cls1 = uclass(&[]);
3513	let cls2 = uclass(&[('a', 'a')]);
3514	let expected = uclass(&[]);
3515	assert_eq!(expected, uintersect(&cls1, &cls2));
3516
3517	let cls1 = uclass(&[('a', 'a')]);
3518	let cls2 = uclass(&[('a', 'a')]);
3519	let expected = uclass(&[('a', 'a')]);
3520	assert_eq!(expected, uintersect(&cls1, &cls2));
3521
3522	let cls1 = uclass(&[('a', 'a')]);
3523	let cls2 = uclass(&[('b', 'b')]);
3524	let expected = uclass(&[]);
3525	assert_eq!(expected, uintersect(&cls1, &cls2));
3526
3527	let cls1 = uclass(&[('a', 'a')]);
3528	let cls2 = uclass(&[('a', 'c')]);
3529	let expected = uclass(&[('a', 'a')]);
3530	assert_eq!(expected, uintersect(&cls1, &cls2));
3531
3532	let cls1 = uclass(&[('a', 'b')]);
3533	let cls2 = uclass(&[('a', 'c')]);
3534	let expected = uclass(&[('a', 'b')]);
3535	assert_eq!(expected, uintersect(&cls1, &cls2));
3536
3537	let cls1 = uclass(&[('a', 'b')]);
3538	let cls2 = uclass(&[('b', 'c')]);
3539	let expected = uclass(&[('b', 'b')]);
3540	assert_eq!(expected, uintersect(&cls1, &cls2));
3541
3542	let cls1 = uclass(&[('a', 'b')]);
3543	let cls2 = uclass(&[('c', 'd')]);
3544	let expected = uclass(&[]);
3545	assert_eq!(expected, uintersect(&cls1, &cls2));
3546
3547	let cls1 = uclass(&[('b', 'c')]);
3548	let cls2 = uclass(&[('a', 'd')]);
3549	let expected = uclass(&[('b', 'c')]);
3550	assert_eq!(expected, uintersect(&cls1, &cls2));
3551
3552	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3553	let cls2 = uclass(&[('a', 'h')]);
3554	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3555	assert_eq!(expected, uintersect(&cls1, &cls2));
3556
3557	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3558	let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3560	assert_eq!(expected, uintersect(&cls1, &cls2));
3561
3562	let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3563	let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3564	let expected = uclass(&[]);
3565	assert_eq!(expected, uintersect(&cls1, &cls2));
3566
3567	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3568	let cls2 = uclass(&[('h', 'h')]);
3569	let expected = uclass(&[('h', 'h')]);
3570	assert_eq!(expected, uintersect(&cls1, &cls2));
3571
3572	let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3573	let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3574	let expected = uclass(&[]);
3575	assert_eq!(expected, uintersect(&cls1, &cls2));
3576
3577	let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3578	let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3579	let expected = uclass(&[('b', 'f')]);
3580	assert_eq!(expected, uintersect(&cls1, &cls2));
3581	}
3582
3583	#[test]
3584	fn class_intersect_bytes() {
3585	let cls1 = bclass(&[]);
3586	let cls2 = bclass(&[(b'a', b'a')]);
3587	let expected = bclass(&[]);
3588	assert_eq!(expected, bintersect(&cls1, &cls2));
3589
3590	let cls1 = bclass(&[(b'a', b'a')]);
3591	let cls2 = bclass(&[(b'a', b'a')]);
3592	let expected = bclass(&[(b'a', b'a')]);
3593	assert_eq!(expected, bintersect(&cls1, &cls2));
3594
3595	let cls1 = bclass(&[(b'a', b'a')]);
3596	let cls2 = bclass(&[(b'b', b'b')]);
3597	let expected = bclass(&[]);
3598	assert_eq!(expected, bintersect(&cls1, &cls2));
3599
3600	let cls1 = bclass(&[(b'a', b'a')]);
3601	let cls2 = bclass(&[(b'a', b'c')]);
3602	let expected = bclass(&[(b'a', b'a')]);
3603	assert_eq!(expected, bintersect(&cls1, &cls2));
3604
3605	let cls1 = bclass(&[(b'a', b'b')]);
3606	let cls2 = bclass(&[(b'a', b'c')]);
3607	let expected = bclass(&[(b'a', b'b')]);
3608	assert_eq!(expected, bintersect(&cls1, &cls2));
3609
3610	let cls1 = bclass(&[(b'a', b'b')]);
3611	let cls2 = bclass(&[(b'b', b'c')]);
3612	let expected = bclass(&[(b'b', b'b')]);
3613	assert_eq!(expected, bintersect(&cls1, &cls2));
3614
3615	let cls1 = bclass(&[(b'a', b'b')]);
3616	let cls2 = bclass(&[(b'c', b'd')]);
3617	let expected = bclass(&[]);
3618	assert_eq!(expected, bintersect(&cls1, &cls2));
3619
3620	let cls1 = bclass(&[(b'b', b'c')]);
3621	let cls2 = bclass(&[(b'a', b'd')]);
3622	let expected = bclass(&[(b'b', b'c')]);
3623	assert_eq!(expected, bintersect(&cls1, &cls2));
3624
3625	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3626	let cls2 = bclass(&[(b'a', b'h')]);
3627	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3628	assert_eq!(expected, bintersect(&cls1, &cls2));
3629
3630	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3631	let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3633	assert_eq!(expected, bintersect(&cls1, &cls2));
3634
3635	let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3636	let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3637	let expected = bclass(&[]);
3638	assert_eq!(expected, bintersect(&cls1, &cls2));
3639
3640	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3641	let cls2 = bclass(&[(b'h', b'h')]);
3642	let expected = bclass(&[(b'h', b'h')]);
3643	assert_eq!(expected, bintersect(&cls1, &cls2));
3644
3645	let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3646	let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3647	let expected = bclass(&[]);
3648	assert_eq!(expected, bintersect(&cls1, &cls2));
3649
3650	let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3651	let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3652	let expected = bclass(&[(b'b', b'f')]);
3653	assert_eq!(expected, bintersect(&cls1, &cls2));
3654	}
3655
3656	#[test]
3657	fn class_difference_unicode() {
3658	let cls1 = uclass(&[('a', 'a')]);
3659	let cls2 = uclass(&[('a', 'a')]);
3660	let expected = uclass(&[]);
3661	assert_eq!(expected, udifference(&cls1, &cls2));
3662
3663	let cls1 = uclass(&[('a', 'a')]);
3664	let cls2 = uclass(&[]);
3665	let expected = uclass(&[('a', 'a')]);
3666	assert_eq!(expected, udifference(&cls1, &cls2));
3667
3668	let cls1 = uclass(&[]);
3669	let cls2 = uclass(&[('a', 'a')]);
3670	let expected = uclass(&[]);
3671	assert_eq!(expected, udifference(&cls1, &cls2));
3672
3673	let cls1 = uclass(&[('a', 'z')]);
3674	let cls2 = uclass(&[('a', 'a')]);
3675	let expected = uclass(&[('b', 'z')]);
3676	assert_eq!(expected, udifference(&cls1, &cls2));
3677
3678	let cls1 = uclass(&[('a', 'z')]);
3679	let cls2 = uclass(&[('z', 'z')]);
3680	let expected = uclass(&[('a', 'y')]);
3681	assert_eq!(expected, udifference(&cls1, &cls2));
3682
3683	let cls1 = uclass(&[('a', 'z')]);
3684	let cls2 = uclass(&[('m', 'm')]);
3685	let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3686	assert_eq!(expected, udifference(&cls1, &cls2));
3687
3688	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3689	let cls2 = uclass(&[('a', 'z')]);
3690	let expected = uclass(&[]);
3691	assert_eq!(expected, udifference(&cls1, &cls2));
3692
3693	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3694	let cls2 = uclass(&[('d', 'v')]);
3695	let expected = uclass(&[('a', 'c')]);
3696	assert_eq!(expected, udifference(&cls1, &cls2));
3697
3698	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3699	let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3700	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3701	assert_eq!(expected, udifference(&cls1, &cls2));
3702
3703	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3704	let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3705	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3706	assert_eq!(expected, udifference(&cls1, &cls2));
3707
3708	let cls1 = uclass(&[('x', 'z')]);
3709	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3710	let expected = uclass(&[('x', 'z')]);
3711	assert_eq!(expected, udifference(&cls1, &cls2));
3712
3713	let cls1 = uclass(&[('a', 'z')]);
3714	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3715	let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3716	assert_eq!(expected, udifference(&cls1, &cls2));
3717	}
3718
3719	#[test]
3720	fn class_difference_bytes() {
3721	let cls1 = bclass(&[(b'a', b'a')]);
3722	let cls2 = bclass(&[(b'a', b'a')]);
3723	let expected = bclass(&[]);
3724	assert_eq!(expected, bdifference(&cls1, &cls2));
3725
3726	let cls1 = bclass(&[(b'a', b'a')]);
3727	let cls2 = bclass(&[]);
3728	let expected = bclass(&[(b'a', b'a')]);
3729	assert_eq!(expected, bdifference(&cls1, &cls2));
3730
3731	let cls1 = bclass(&[]);
3732	let cls2 = bclass(&[(b'a', b'a')]);
3733	let expected = bclass(&[]);
3734	assert_eq!(expected, bdifference(&cls1, &cls2));
3735
3736	let cls1 = bclass(&[(b'a', b'z')]);
3737	let cls2 = bclass(&[(b'a', b'a')]);
3738	let expected = bclass(&[(b'b', b'z')]);
3739	assert_eq!(expected, bdifference(&cls1, &cls2));
3740
3741	let cls1 = bclass(&[(b'a', b'z')]);
3742	let cls2 = bclass(&[(b'z', b'z')]);
3743	let expected = bclass(&[(b'a', b'y')]);
3744	assert_eq!(expected, bdifference(&cls1, &cls2));
3745
3746	let cls1 = bclass(&[(b'a', b'z')]);
3747	let cls2 = bclass(&[(b'm', b'm')]);
3748	let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3749	assert_eq!(expected, bdifference(&cls1, &cls2));
3750
3751	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3752	let cls2 = bclass(&[(b'a', b'z')]);
3753	let expected = bclass(&[]);
3754	assert_eq!(expected, bdifference(&cls1, &cls2));
3755
3756	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3757	let cls2 = bclass(&[(b'd', b'v')]);
3758	let expected = bclass(&[(b'a', b'c')]);
3759	assert_eq!(expected, bdifference(&cls1, &cls2));
3760
3761	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3762	let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3763	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3764	assert_eq!(expected, bdifference(&cls1, &cls2));
3765
3766	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3767	let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3768	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3769	assert_eq!(expected, bdifference(&cls1, &cls2));
3770
3771	let cls1 = bclass(&[(b'x', b'z')]);
3772	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3773	let expected = bclass(&[(b'x', b'z')]);
3774	assert_eq!(expected, bdifference(&cls1, &cls2));
3775
3776	let cls1 = bclass(&[(b'a', b'z')]);
3777	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3778	let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3779	assert_eq!(expected, bdifference(&cls1, &cls2));
3780	}
3781
3782	#[test]
3783	fn class_symmetric_difference_unicode() {
3784	let cls1 = uclass(&[('a', 'm')]);
3785	let cls2 = uclass(&[('g', 't')]);
3786	let expected = uclass(&[('a', 'f'), ('n', 't')]);
3787	assert_eq!(expected, usymdifference(&cls1, &cls2));
3788	}
3789
3790	#[test]
3791	fn class_symmetric_difference_bytes() {
3792	let cls1 = bclass(&[(b'a', b'm')]);
3793	let cls2 = bclass(&[(b'g', b't')]);
3794	let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3795	assert_eq!(expected, bsymdifference(&cls1, &cls2));
3796	}
3797
3798	// We use a thread with an explicit stack size to test that our destructor
3799	// for Hir can handle arbitrarily sized expressions in constant stack
3800	// space. In case we run on a platform without threads (WASM?), we limit
3801	// this test to Windows/Unix.
3802	#[test]
3803	#[cfg(any(unix, windows))]
3804	fn no_stack_overflow_on_drop() {
3805	use std::thread;
3806
3807	let run = \|\| {
3808	let mut expr = Hir::empty();
3809	for _ in `0`..`100` {
3810	expr = Hir::capture(Capture {
3811	index: `1`,
3812	name: None,
3813	sub: Box::new(expr),
3814	});
3815	expr = Hir::repetition(Repetition {
3816	min: `0`,
3817	max: Some(`1`),
3818	greedy: `true`,
3819	sub: Box::new(expr),
3820	});
3821
3822	expr = Hir {
3823	kind: HirKind::Concat(vec![expr]),
3824	props: Properties::empty(),
3825	};
3826	expr = Hir {
3827	kind: HirKind::Alternation(vec![expr]),
3828	props: Properties::empty(),
3829	};
3830	}
3831	assert!(!matches!(*expr.kind(), HirKind::Empty));
3832	};
3833
3834	// We run our test on a thread with a small stack size so we can
3835	// force the issue more easily.
3836	//
3837	// NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3838	// for context on the specific stack size chosen here.
3839	thread::Builder::new()
3840	.stack_size(`16` << `10`)
3841	.spawn(run)
3842	.unwrap()
3843	.join()
3844	.unwrap();
3845	}
3846
3847	#[test]
3848	fn look_set_iter() {
3849	let set = LookSet::empty();
3850	assert_eq!(`0`, set.iter().count());
3851
3852	let set = LookSet::full();
3853	assert_eq!(`18`, set.iter().count());
3854
3855	let set =
3856	LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3857	assert_eq!(`2`, set.iter().count());
3858
3859	let set = LookSet::empty().insert(Look::StartLF);
3860	assert_eq!(`1`, set.iter().count());
3861
3862	let set = LookSet::empty().insert(Look::WordAsciiNegate);
3863	assert_eq!(`1`, set.iter().count());
3864	}
3865
3866	#[test]
3867	fn look_set_debug() {
3868	let res = format!("{:?}", LookSet::empty());
3869	assert_eq!("∅", res);
3870	let res = format!("{:?}", LookSet::full());
3871	assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3872	}
3873	}
3874