aho_corasick/util/
prefilter.rs

1use core::{
2    cmp,
3    fmt::Debug,
4    panic::{RefUnwindSafe, UnwindSafe},
5    u8,
6};
7
8use alloc::{sync::Arc, vec, vec::Vec};
9
10use crate::{
11    packed,
12    util::{
13        alphabet::ByteSet,
14        search::{Match, MatchKind, Span},
15    },
16};
17
18/// A prefilter for accelerating a search.
19///
20/// This crate uses prefilters in the core search implementations to accelerate
21/// common cases. They typically only apply to cases where there are a small
22/// number of patterns (less than 100 or so), but when they do, thoughput can
23/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
24/// is active, it is used whenever a search enters an automaton's start state.
25///
26/// Currently, prefilters cannot be constructed by
27/// callers. A `Prefilter` can only be accessed via the
28/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
29/// method and used to execute a search. In other words, a prefilter can be
30/// used to optimize your own search implementation if necessary, but cannot do
31/// much else. If you have a use case for more APIs, please submit an issue.
32#[derive(Clone, Debug)]
33pub struct Prefilter {
34    finder: Arc<dyn PrefilterI>,
35    memory_usage: usize,
36}
37
38impl Prefilter {
39    /// Execute a search in the haystack within the span given. If a match or
40    /// a possible match is returned, then it is guaranteed to occur within
41    /// the bounds of the span.
42    ///
43    /// If the span provided is invalid for the given haystack, then behavior
44    /// is unspecified.
45    #[inline]
46    pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
47        self.finder.find_in(haystack, span)
48    }
49
50    #[inline]
51    pub(crate) fn memory_usage(&self) -> usize {
52        self.memory_usage
53    }
54}
55
56/// A candidate is the result of running a prefilter on a haystack at a
57/// particular position.
58///
59/// The result is either no match, a confirmed match or a possible match.
60///
61/// When no match is returned, the prefilter is guaranteeing that no possible
62/// match can be found in the haystack, and the caller may trust this. That is,
63/// all correct prefilters must never report false negatives.
64///
65/// In some cases, a prefilter can confirm a match very quickly, in which case,
66/// the caller may use this to stop what it's doing and report the match. In
67/// this case, prefilter implementations must never report a false positive.
68/// In other cases, the prefilter can only report a potential match, in which
69/// case the callers must attempt to confirm the match. In this case, prefilter
70/// implementations are permitted to return false positives.
71#[derive(Clone, Debug)]
72pub enum Candidate {
73    /// No match was found. Since false negatives are not possible, this means
74    /// the search can quit as it is guaranteed not to find another match.
75    None,
76    /// A confirmed match was found. Callers do not need to confirm it.
77    Match(Match),
78    /// The start of a possible match was found. Callers must confirm it before
79    /// reporting it as a match.
80    PossibleStartOfMatch(usize),
81}
82
83impl Candidate {
84    /// Convert this candidate into an option. This is useful when callers
85    /// do not distinguish between true positives and false positives (i.e.,
86    /// the caller must always confirm the match).
87    pub fn into_option(self) -> Option<usize> {
88        match self {
89            Candidate::None => None,
90            Candidate::Match(ref m) => Some(m.start()),
91            Candidate::PossibleStartOfMatch(start) => Some(start),
92        }
93    }
94}
95
96/// A prefilter describes the behavior of fast literal scanners for quickly
97/// skipping past bytes in the haystack that we know cannot possibly
98/// participate in a match.
99trait PrefilterI:
100    Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
101{
102    /// Returns the next possible match candidate. This may yield false
103    /// positives, so callers must confirm a match starting at the position
104    /// returned. This, however, must never produce false negatives. That is,
105    /// this must, at minimum, return the starting position of the next match
106    /// in the given haystack after or at the given position.
107    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
108}
109
110impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
111    #[inline(always)]
112    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
113        (**self).find_in(haystack, span)
114    }
115}
116
117/// A builder for constructing the best possible prefilter. When constructed,
118/// this builder will heuristically select the best prefilter it can build,
119/// if any, and discard the rest.
120#[derive(Debug)]
121pub(crate) struct Builder {
122    count: usize,
123    ascii_case_insensitive: bool,
124    start_bytes: StartBytesBuilder,
125    rare_bytes: RareBytesBuilder,
126    memmem: MemmemBuilder,
127    packed: Option<packed::Builder>,
128    // If we run across a condition that suggests we shouldn't use a prefilter
129    // at all (like an empty pattern), then disable prefilters entirely.
130    enabled: bool,
131}
132
133impl Builder {
134    /// Create a new builder for constructing the best possible prefilter.
135    pub(crate) fn new(kind: MatchKind) -> Builder {
136        let pbuilder = kind
137            .as_packed()
138            .map(|kind| packed::Config::new().match_kind(kind).builder());
139        Builder {
140            count: 0,
141            ascii_case_insensitive: false,
142            start_bytes: StartBytesBuilder::new(),
143            rare_bytes: RareBytesBuilder::new(),
144            memmem: MemmemBuilder::default(),
145            packed: pbuilder,
146            enabled: true,
147        }
148    }
149
150    /// Enable ASCII case insensitivity. When set, byte strings added to this
151    /// builder will be interpreted without respect to ASCII case.
152    pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
153        self.ascii_case_insensitive = yes;
154        self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
155        self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
156        self
157    }
158
159    /// Return a prefilter suitable for quickly finding potential matches.
160    ///
161    /// All patterns added to an Aho-Corasick automaton should be added to this
162    /// builder before attempting to construct the prefilter.
163    pub(crate) fn build(&self) -> Option<Prefilter> {
164        if !self.enabled {
165            debug!("prefilter not enabled, skipping");
166            return None;
167        }
168        // If we only have one pattern, then deferring to memmem is always
169        // the best choice. This is kind of a weird case, because, well, why
170        // use Aho-Corasick if you only have one pattern? But maybe you don't
171        // know exactly how many patterns you'll get up front, and you need to
172        // support the option of multiple patterns. So instead of relying on
173        // the caller to branch and use memmem explicitly, we just do it for
174        // them.
175        if !self.ascii_case_insensitive {
176            if let Some(pre) = self.memmem.build() {
177                debug!("using memmem prefilter");
178                return Some(pre);
179            }
180        }
181        let (packed, patlen, minlen) = if self.ascii_case_insensitive {
182            (None, usize::MAX, 0)
183        } else {
184            let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
185            let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
186            let packed =
187                self.packed.as_ref().and_then(|b| b.build()).map(|s| {
188                    let memory_usage = s.memory_usage();
189                    debug!(
190                        "built packed prefilter (len: {}, \
191                         minimum pattern len: {}, memory usage: {}) \
192                         for consideration",
193                        patlen, minlen, memory_usage,
194                    );
195                    Prefilter { finder: Arc::new(Packed(s)), memory_usage }
196                });
197            (packed, patlen, minlen)
198        };
199        match (self.start_bytes.build(), self.rare_bytes.build()) {
200            // If we could build both start and rare prefilters, then there are
201            // a few cases in which we'd want to use the start-byte prefilter
202            // over the rare-byte prefilter, since the former has lower
203            // overhead.
204            (prestart @ Some(_), prerare @ Some(_)) => {
205                debug!(
206                    "both start (len={}, rank={}) and \
207                     rare (len={}, rank={}) byte prefilters \
208                     are available",
209                    self.start_bytes.count,
210                    self.start_bytes.rank_sum,
211                    self.rare_bytes.count,
212                    self.rare_bytes.rank_sum,
213                );
214                if patlen <= 16
215                    && minlen >= 2
216                    && self.start_bytes.count >= 3
217                    && self.rare_bytes.count >= 3
218                {
219                    debug!(
220                        "start and rare byte prefilters available, but \
221                             they're probably slower than packed so using \
222                             packed"
223                    );
224                    return packed;
225                }
226                // If the start-byte prefilter can scan for a smaller number
227                // of bytes than the rare-byte prefilter, then it's probably
228                // faster.
229                let has_fewer_bytes =
230                    self.start_bytes.count < self.rare_bytes.count;
231                // Otherwise, if the combined frequency rank of the detected
232                // bytes in the start-byte prefilter is "close" to the combined
233                // frequency rank of the rare-byte prefilter, then we pick
234                // the start-byte prefilter even if the rare-byte prefilter
235                // heuristically searches for rare bytes. This is because the
236                // rare-byte prefilter has higher constant costs, so we tend to
237                // prefer the start-byte prefilter when we can.
238                let has_rarer_bytes =
239                    self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
240                if has_fewer_bytes {
241                    debug!(
242                        "using start byte prefilter because it has fewer
243                         bytes to search for than the rare byte prefilter",
244                    );
245                    prestart
246                } else if has_rarer_bytes {
247                    debug!(
248                        "using start byte prefilter because its byte \
249                         frequency rank was determined to be \
250                         \"good enough\" relative to the rare byte prefilter \
251                         byte frequency rank",
252                    );
253                    prestart
254                } else {
255                    debug!("using rare byte prefilter");
256                    prerare
257                }
258            }
259            (prestart @ Some(_), None) => {
260                if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
261                    debug!(
262                        "start byte prefilter available, but \
263                         it's probably slower than packed so using \
264                         packed"
265                    );
266                    return packed;
267                }
268                debug!(
269                    "have start byte prefilter but not rare byte prefilter, \
270                     so using start byte prefilter",
271                );
272                prestart
273            }
274            (None, prerare @ Some(_)) => {
275                if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
276                    debug!(
277                        "rare byte prefilter available, but \
278                         it's probably slower than packed so using \
279                         packed"
280                    );
281                    return packed;
282                }
283                debug!(
284                    "have rare byte prefilter but not start byte prefilter, \
285                     so using rare byte prefilter",
286                );
287                prerare
288            }
289            (None, None) if self.ascii_case_insensitive => {
290                debug!(
291                    "no start or rare byte prefilter and ASCII case \
292                     insensitivity was enabled, so skipping prefilter",
293                );
294                None
295            }
296            (None, None) => {
297                if packed.is_some() {
298                    debug!("falling back to packed prefilter");
299                } else {
300                    debug!("no prefilter available");
301                }
302                packed
303            }
304        }
305    }
306
307    /// Add a literal string to this prefilter builder.
308    pub(crate) fn add(&mut self, bytes: &[u8]) {
309        if bytes.is_empty() {
310            self.enabled = false;
311        }
312        if !self.enabled {
313            return;
314        }
315        self.count += 1;
316        self.start_bytes.add(bytes);
317        self.rare_bytes.add(bytes);
318        self.memmem.add(bytes);
319        if let Some(ref mut pbuilder) = self.packed {
320            pbuilder.add(bytes);
321        }
322    }
323}
324
325/// A type that wraps a packed searcher and implements the `Prefilter`
326/// interface.
327#[derive(Clone, Debug)]
328struct Packed(packed::Searcher);
329
330impl PrefilterI for Packed {
331    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
332        self.0
333            .find_in(&haystack, span)
334            .map_or(Candidate::None, Candidate::Match)
335    }
336}
337
338/// A builder for constructing a prefilter that uses memmem.
339#[derive(Debug, Default)]
340struct MemmemBuilder {
341    /// The number of patterns that have been added.
342    count: usize,
343    /// The singular pattern to search for. This is only set when count==1.
344    one: Option<Vec<u8>>,
345}
346
347impl MemmemBuilder {
348    fn build(&self) -> Option<Prefilter> {
349        #[cfg(all(feature = "std", feature = "perf-literal"))]
350        fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
351            let pattern = builder.one.as_ref()?;
352            assert_eq!(1, builder.count);
353            let finder = Arc::new(Memmem(
354                memchr::memmem::Finder::new(pattern).into_owned(),
355            ));
356            let memory_usage = pattern.len();
357            Some(Prefilter { finder, memory_usage })
358        }
359
360        #[cfg(not(all(feature = "std", feature = "perf-literal")))]
361        fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
362            None
363        }
364
365        imp(self)
366    }
367
368    fn add(&mut self, bytes: &[u8]) {
369        self.count += 1;
370        if self.count == 1 {
371            self.one = Some(bytes.to_vec());
372        } else {
373            self.one = None;
374        }
375    }
376}
377
378/// A type that wraps a SIMD accelerated single substring search from the
379/// `memchr` crate for use as a prefilter.
380///
381/// Currently, this prefilter is only active for Aho-Corasick searchers with
382/// a single pattern. In theory, this could be extended to support searchers
383/// that have a common prefix of more than one byte (for one byte, we would use
384/// memchr), but it's not clear if it's worth it or not.
385///
386/// Also, unfortunately, this currently also requires the 'std' feature to
387/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
388/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
389/// disabled. But there should be an 'alloc' feature that brings in APIs like
390/// Finder::into_owned but doesn't use std-only features like runtime CPU
391/// feature detection.
392#[cfg(all(feature = "std", feature = "perf-literal"))]
393#[derive(Clone, Debug)]
394struct Memmem(memchr::memmem::Finder<'static>);
395
396#[cfg(all(feature = "std", feature = "perf-literal"))]
397impl PrefilterI for Memmem {
398    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
399        use crate::util::primitives::PatternID;
400
401        self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
402            let start = span.start + i;
403            let end = start + self.0.needle().len();
404            // N.B. We can declare a match and use a fixed pattern ID here
405            // because a Memmem prefilter is only ever created for searchers
406            // with exactly one pattern. Thus, every match is always a match
407            // and it is always for the first and only pattern.
408            Candidate::Match(Match::new(PatternID::ZERO, start..end))
409        })
410    }
411}
412
413/// A builder for constructing a rare byte prefilter.
414///
415/// A rare byte prefilter attempts to pick out a small set of rare bytes that
416/// occurr in the patterns, and then quickly scan to matches of those rare
417/// bytes.
418#[derive(Clone, Debug)]
419struct RareBytesBuilder {
420    /// Whether this prefilter should account for ASCII case insensitivity or
421    /// not.
422    ascii_case_insensitive: bool,
423    /// A set of rare bytes, indexed by byte value.
424    rare_set: ByteSet,
425    /// A set of byte offsets associated with bytes in a pattern. An entry
426    /// corresponds to a particular bytes (its index) and is only non-zero if
427    /// the byte occurred at an offset greater than 0 in at least one pattern.
428    ///
429    /// If a byte's offset is not representable in 8 bits, then the rare bytes
430    /// prefilter becomes inert.
431    byte_offsets: RareByteOffsets,
432    /// Whether this is available as a prefilter or not. This can be set to
433    /// false during construction if a condition is seen that invalidates the
434    /// use of the rare-byte prefilter.
435    available: bool,
436    /// The number of bytes set to an active value in `byte_offsets`.
437    count: usize,
438    /// The sum of frequency ranks for the rare bytes detected. This is
439    /// intended to give a heuristic notion of how rare the bytes are.
440    rank_sum: u16,
441}
442
443/// A set of byte offsets, keyed by byte.
444#[derive(Clone, Copy)]
445struct RareByteOffsets {
446    /// Each entry corresponds to the maximum offset of the corresponding
447    /// byte across all patterns seen.
448    set: [RareByteOffset; 256],
449}
450
451impl RareByteOffsets {
452    /// Create a new empty set of rare byte offsets.
453    pub(crate) fn empty() -> RareByteOffsets {
454        RareByteOffsets { set: [RareByteOffset::default(); 256] }
455    }
456
457    /// Add the given offset for the given byte to this set. If the offset is
458    /// greater than the existing offset, then it overwrites the previous
459    /// value and returns false. If there is no previous value set, then this
460    /// sets it and returns true.
461    pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
462        self.set[byte as usize].max =
463            cmp::max(self.set[byte as usize].max, off.max);
464    }
465}
466
467impl core::fmt::Debug for RareByteOffsets {
468    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
469        let mut offsets = vec![];
470        for off in self.set.iter() {
471            if off.max > 0 {
472                offsets.push(off);
473            }
474        }
475        f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
476    }
477}
478
479/// Offsets associated with an occurrence of a "rare" byte in any of the
480/// patterns used to construct a single Aho-Corasick automaton.
481#[derive(Clone, Copy, Debug)]
482struct RareByteOffset {
483    /// The maximum offset at which a particular byte occurs from the start
484    /// of any pattern. This is used as a shift amount. That is, when an
485    /// occurrence of this byte is found, the candidate position reported by
486    /// the prefilter is `position_of_byte - max`, such that the automaton
487    /// will begin its search at a position that is guaranteed to observe a
488    /// match.
489    ///
490    /// To avoid accidentally quadratic behavior, a prefilter is considered
491    /// ineffective when it is asked to start scanning from a position that it
492    /// has already scanned past.
493    ///
494    /// Using a `u8` here means that if we ever see a pattern that's longer
495    /// than 255 bytes, then the entire rare byte prefilter is disabled.
496    max: u8,
497}
498
499impl Default for RareByteOffset {
500    fn default() -> RareByteOffset {
501        RareByteOffset { max: 0 }
502    }
503}
504
505impl RareByteOffset {
506    /// Create a new rare byte offset. If the given offset is too big, then
507    /// None is returned. In that case, callers should render the rare bytes
508    /// prefilter inert.
509    fn new(max: usize) -> Option<RareByteOffset> {
510        if max > u8::MAX as usize {
511            None
512        } else {
513            Some(RareByteOffset { max: max as u8 })
514        }
515    }
516}
517
518impl RareBytesBuilder {
519    /// Create a new builder for constructing a rare byte prefilter.
520    fn new() -> RareBytesBuilder {
521        RareBytesBuilder {
522            ascii_case_insensitive: false,
523            rare_set: ByteSet::empty(),
524            byte_offsets: RareByteOffsets::empty(),
525            available: true,
526            count: 0,
527            rank_sum: 0,
528        }
529    }
530
531    /// Enable ASCII case insensitivity. When set, byte strings added to this
532    /// builder will be interpreted without respect to ASCII case.
533    fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
534        self.ascii_case_insensitive = yes;
535        self
536    }
537
538    /// Build the rare bytes prefilter.
539    ///
540    /// If there are more than 3 distinct rare bytes found, or if heuristics
541    /// otherwise determine that this prefilter should not be used, then `None`
542    /// is returned.
543    fn build(&self) -> Option<Prefilter> {
544        #[cfg(feature = "perf-literal")]
545        fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
546            if !builder.available || builder.count > 3 {
547                return None;
548            }
549            let (mut bytes, mut len) = ([0; 3], 0);
550            for b in 0..=255 {
551                if builder.rare_set.contains(b) {
552                    bytes[len] = b as u8;
553                    len += 1;
554                }
555            }
556            let finder: Arc<dyn PrefilterI> = match len {
557                0 => return None,
558                1 => Arc::new(RareBytesOne {
559                    byte1: bytes[0],
560                    offset: builder.byte_offsets.set[bytes[0] as usize],
561                }),
562                2 => Arc::new(RareBytesTwo {
563                    offsets: builder.byte_offsets,
564                    byte1: bytes[0],
565                    byte2: bytes[1],
566                }),
567                3 => Arc::new(RareBytesThree {
568                    offsets: builder.byte_offsets,
569                    byte1: bytes[0],
570                    byte2: bytes[1],
571                    byte3: bytes[2],
572                }),
573                _ => unreachable!(),
574            };
575            Some(Prefilter { finder, memory_usage: 0 })
576        }
577
578        #[cfg(not(feature = "perf-literal"))]
579        fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
580            None
581        }
582
583        imp(self)
584    }
585
586    /// Add a byte string to this builder.
587    ///
588    /// All patterns added to an Aho-Corasick automaton should be added to this
589    /// builder before attempting to construct the prefilter.
590    fn add(&mut self, bytes: &[u8]) {
591        // If we've already given up, then do nothing.
592        if !self.available {
593            return;
594        }
595        // If we've already blown our budget, then don't waste time looking
596        // for more rare bytes.
597        if self.count > 3 {
598            self.available = false;
599            return;
600        }
601        // If the pattern is too long, then our offset table is bunk, so
602        // give up.
603        if bytes.len() >= 256 {
604            self.available = false;
605            return;
606        }
607        let mut rarest = match bytes.get(0) {
608            None => return,
609            Some(&b) => (b, freq_rank(b)),
610        };
611        // The idea here is to look for the rarest byte in each pattern, and
612        // add that to our set. As a special exception, if we see a byte that
613        // we've already added, then we immediately stop and choose that byte,
614        // even if there's another rare byte in the pattern. This helps us
615        // apply the rare byte optimization in more cases by attempting to pick
616        // bytes that are in common between patterns. So for example, if we
617        // were searching for `Sherlock` and `lockjaw`, then this would pick
618        // `k` for both patterns, resulting in the use of `memchr` instead of
619        // `memchr2` for `k` and `j`.
620        let mut found = false;
621        for (pos, &b) in bytes.iter().enumerate() {
622            self.set_offset(pos, b);
623            if found {
624                continue;
625            }
626            if self.rare_set.contains(b) {
627                found = true;
628                continue;
629            }
630            let rank = freq_rank(b);
631            if rank < rarest.1 {
632                rarest = (b, rank);
633            }
634        }
635        if !found {
636            self.add_rare_byte(rarest.0);
637        }
638    }
639
640    fn set_offset(&mut self, pos: usize, byte: u8) {
641        // This unwrap is OK because pos is never bigger than our max.
642        let offset = RareByteOffset::new(pos).unwrap();
643        self.byte_offsets.set(byte, offset);
644        if self.ascii_case_insensitive {
645            self.byte_offsets.set(opposite_ascii_case(byte), offset);
646        }
647    }
648
649    fn add_rare_byte(&mut self, byte: u8) {
650        self.add_one_rare_byte(byte);
651        if self.ascii_case_insensitive {
652            self.add_one_rare_byte(opposite_ascii_case(byte));
653        }
654    }
655
656    fn add_one_rare_byte(&mut self, byte: u8) {
657        if !self.rare_set.contains(byte) {
658            self.rare_set.add(byte);
659            self.count += 1;
660            self.rank_sum += freq_rank(byte) as u16;
661        }
662    }
663}
664
665/// A prefilter for scanning for a single "rare" byte.
666#[cfg(feature = "perf-literal")]
667#[derive(Clone, Debug)]
668struct RareBytesOne {
669    byte1: u8,
670    offset: RareByteOffset,
671}
672
673#[cfg(feature = "perf-literal")]
674impl PrefilterI for RareBytesOne {
675    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
676        memchr::memchr(self.byte1, &haystack[span])
677            .map(|i| {
678                let pos = span.start + i;
679                cmp::max(
680                    span.start,
681                    pos.saturating_sub(usize::from(self.offset.max)),
682                )
683            })
684            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
685    }
686}
687
688/// A prefilter for scanning for two "rare" bytes.
689#[cfg(feature = "perf-literal")]
690#[derive(Clone, Debug)]
691struct RareBytesTwo {
692    offsets: RareByteOffsets,
693    byte1: u8,
694    byte2: u8,
695}
696
697#[cfg(feature = "perf-literal")]
698impl PrefilterI for RareBytesTwo {
699    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
700        memchr::memchr2(self.byte1, self.byte2, &haystack[span])
701            .map(|i| {
702                let pos = span.start + i;
703                let offset = self.offsets.set[usize::from(haystack[pos])].max;
704                cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
705            })
706            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
707    }
708}
709
710/// A prefilter for scanning for three "rare" bytes.
711#[cfg(feature = "perf-literal")]
712#[derive(Clone, Debug)]
713struct RareBytesThree {
714    offsets: RareByteOffsets,
715    byte1: u8,
716    byte2: u8,
717    byte3: u8,
718}
719
720#[cfg(feature = "perf-literal")]
721impl PrefilterI for RareBytesThree {
722    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
723        memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
724            .map(|i| {
725                let pos = span.start + i;
726                let offset = self.offsets.set[usize::from(haystack[pos])].max;
727                cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
728            })
729            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
730    }
731}
732
733/// A builder for constructing a starting byte prefilter.
734///
735/// A starting byte prefilter is a simplistic prefilter that looks for possible
736/// matches by reporting all positions corresponding to a particular byte. This
737/// generally only takes affect when there are at most 3 distinct possible
738/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
739/// distinct starting bytes (`f` and `b`), and this prefilter returns all
740/// occurrences of either `f` or `b`.
741///
742/// In some cases, a heuristic frequency analysis may determine that it would
743/// be better not to use this prefilter even when there are 3 or fewer distinct
744/// starting bytes.
745#[derive(Clone, Debug)]
746struct StartBytesBuilder {
747    /// Whether this prefilter should account for ASCII case insensitivity or
748    /// not.
749    ascii_case_insensitive: bool,
750    /// The set of starting bytes observed.
751    byteset: Vec<bool>,
752    /// The number of bytes set to true in `byteset`.
753    count: usize,
754    /// The sum of frequency ranks for the rare bytes detected. This is
755    /// intended to give a heuristic notion of how rare the bytes are.
756    rank_sum: u16,
757}
758
759impl StartBytesBuilder {
760    /// Create a new builder for constructing a start byte prefilter.
761    fn new() -> StartBytesBuilder {
762        StartBytesBuilder {
763            ascii_case_insensitive: false,
764            byteset: vec![false; 256],
765            count: 0,
766            rank_sum: 0,
767        }
768    }
769
770    /// Enable ASCII case insensitivity. When set, byte strings added to this
771    /// builder will be interpreted without respect to ASCII case.
772    fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
773        self.ascii_case_insensitive = yes;
774        self
775    }
776
777    /// Build the starting bytes prefilter.
778    ///
779    /// If there are more than 3 distinct starting bytes, or if heuristics
780    /// otherwise determine that this prefilter should not be used, then `None`
781    /// is returned.
782    fn build(&self) -> Option<Prefilter> {
783        #[cfg(feature = "perf-literal")]
784        fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
785            if builder.count > 3 {
786                return None;
787            }
788            let (mut bytes, mut len) = ([0; 3], 0);
789            for b in 0..256 {
790                if !builder.byteset[b] {
791                    continue;
792                }
793                // We don't handle non-ASCII bytes for now. Getting non-ASCII
794                // bytes right is trickier, since we generally don't want to put
795                // a leading UTF-8 code unit into a prefilter that isn't ASCII,
796                // since they can frequently. Instead, it would be better to use a
797                // continuation byte, but this requires more sophisticated analysis
798                // of the automaton and a richer prefilter API.
799                if b > 0x7F {
800                    return None;
801                }
802                bytes[len] = b as u8;
803                len += 1;
804            }
805            let finder: Arc<dyn PrefilterI> = match len {
806                0 => return None,
807                1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
808                2 => Arc::new(StartBytesTwo {
809                    byte1: bytes[0],
810                    byte2: bytes[1],
811                }),
812                3 => Arc::new(StartBytesThree {
813                    byte1: bytes[0],
814                    byte2: bytes[1],
815                    byte3: bytes[2],
816                }),
817                _ => unreachable!(),
818            };
819            Some(Prefilter { finder, memory_usage: 0 })
820        }
821
822        #[cfg(not(feature = "perf-literal"))]
823        fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
824            None
825        }
826
827        imp(self)
828    }
829
830    /// Add a byte string to this builder.
831    ///
832    /// All patterns added to an Aho-Corasick automaton should be added to this
833    /// builder before attempting to construct the prefilter.
834    fn add(&mut self, bytes: &[u8]) {
835        if self.count > 3 {
836            return;
837        }
838        if let Some(&byte) = bytes.get(0) {
839            self.add_one_byte(byte);
840            if self.ascii_case_insensitive {
841                self.add_one_byte(opposite_ascii_case(byte));
842            }
843        }
844    }
845
846    fn add_one_byte(&mut self, byte: u8) {
847        if !self.byteset[byte as usize] {
848            self.byteset[byte as usize] = true;
849            self.count += 1;
850            self.rank_sum += freq_rank(byte) as u16;
851        }
852    }
853}
854
855/// A prefilter for scanning for a single starting byte.
856#[cfg(feature = "perf-literal")]
857#[derive(Clone, Debug)]
858struct StartBytesOne {
859    byte1: u8,
860}
861
862#[cfg(feature = "perf-literal")]
863impl PrefilterI for StartBytesOne {
864    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
865        memchr::memchr(self.byte1, &haystack[span])
866            .map(|i| span.start + i)
867            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
868    }
869}
870
871/// A prefilter for scanning for two starting bytes.
872#[cfg(feature = "perf-literal")]
873#[derive(Clone, Debug)]
874struct StartBytesTwo {
875    byte1: u8,
876    byte2: u8,
877}
878
879#[cfg(feature = "perf-literal")]
880impl PrefilterI for StartBytesTwo {
881    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
882        memchr::memchr2(self.byte1, self.byte2, &haystack[span])
883            .map(|i| span.start + i)
884            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
885    }
886}
887
888/// A prefilter for scanning for three starting bytes.
889#[cfg(feature = "perf-literal")]
890#[derive(Clone, Debug)]
891struct StartBytesThree {
892    byte1: u8,
893    byte2: u8,
894    byte3: u8,
895}
896
897#[cfg(feature = "perf-literal")]
898impl PrefilterI for StartBytesThree {
899    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
900        memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
901            .map(|i| span.start + i)
902            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
903    }
904}
905
906/// If the given byte is an ASCII letter, then return it in the opposite case.
907/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
908/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
909pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
910    if b'A' <= b && b <= b'Z' {
911        b.to_ascii_lowercase()
912    } else if b'a' <= b && b <= b'z' {
913        b.to_ascii_uppercase()
914    } else {
915        b
916    }
917}
918
919/// Return the frequency rank of the given byte. The higher the rank, the more
920/// common the byte (heuristically speaking).
921fn freq_rank(b: u8) -> u8 {
922    use crate::util::byte_frequencies::BYTE_FREQUENCIES;
923    BYTE_FREQUENCIES[b as usize]
924}