encoding_rs_io/
util.rs

1use std::cmp;
2use std::io;
3
4use encoding_rs::{CoderResult, Decoder, Encoding};
5
6/// This is the minimum amount of space that a decoder-to-utf8-with-replacement
7/// will use for any state and any input.
8const TINY_BUFFER_SIZE: usize = 7;
9
10/// A tiny transcoder performs transcoding incrementally even when a caller
11/// provided buffer is not large enough.
12///
13/// This use case comes up when implementing streaming transcoding in cases
14/// where it is permissible to provide incomplete UTF-8 sequences to the
15/// caller (e.g., when decoding into a `&[u8]` where the caller must be capable
16/// of handling invalid UTF-8). In particular, this type specifically handles
17/// cases where a caller provided buffer is too small to store a full UTF-8
18/// sequence. Thus, this type should be used in cases where the caller provided
19/// buffer has length 3 or fewer.
20///
21/// This could likely be done with better performance by allocating a larger
22/// buffer for these cases, but we instead opt to handle this without
23/// allocation under the assumption that tiny caller provided buffers are
24/// probably a pathological case.
25#[derive(Clone, Debug)]
26pub struct TinyTranscoder {
27    /// This is where we store the results of a transcoding. Since we are
28    /// always decoding to UTF-8, 7 bytes is sufficient to represent any
29    /// codepoint.
30    partial: [u8; TINY_BUFFER_SIZE],
31    /// The number of bytes written in `partial`.
32    len: usize,
33    /// The position in `partial` at which the next byte should be read.
34    pos: usize,
35}
36
37impl TinyTranscoder {
38    /// Create a new tiny transcoder that is ready for use.
39    pub fn new() -> TinyTranscoder {
40        TinyTranscoder { partial: [0; TINY_BUFFER_SIZE], len: 0, pos: 0 }
41    }
42
43    /// Transcode the contents of `src` into this buffer using the provided
44    /// decoder, and return the number of bytes consumed in `src` and the
45    /// number of bytes written to this transcoder.
46    ///
47    /// The results of transcoding can be read using the TinyTranscoder's
48    /// `io::Read` implementation.
49    ///
50    /// If `last` is true, then this signals to the decoder that we've reached
51    /// EOF and `src` must be empty. Otherwise, if `last` is false, then
52    /// `src` must be non-empty. Violating either of these constraits will
53    /// cause a panic.
54    ///
55    /// Finally, if this transcoder still has unconsumed bytes from a previous
56    /// transcode, then this panics. Callers must consume all bytes from a
57    /// previous transcoding before performing another one.
58    pub fn transcode(
59        &mut self,
60        decoder: &mut Decoder,
61        src: &[u8],
62        last: bool,
63    ) -> (usize, usize) {
64        assert!(self.as_slice().is_empty(), "transcoder has unconsumed bytes");
65        if last {
66            assert!(src.is_empty(), "src must be empty when last==true");
67        }
68        let (res, nin, nout, _) =
69            decoder.decode_to_utf8(src, &mut self.partial[..], last);
70        if last {
71            assert_eq!(
72                res,
73                CoderResult::InputEmpty,
74                "input should be exhausted",
75            );
76        }
77        self.pos = 0;
78        self.len = nout;
79        (nin, nout)
80    }
81
82    /// Return the the bytes remaining to be read as a slice.
83    fn as_slice(&self) -> &[u8] {
84        &self.partial[self.pos..self.len]
85    }
86}
87
88impl io::Read for TinyTranscoder {
89    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
90        if self.pos >= self.len {
91            return Ok(0);
92        }
93        let mut count = 0;
94        for (src, dst) in self.as_slice().iter().zip(buf) {
95            *dst = *src;
96            count += 1;
97        }
98        self.pos += count;
99        Ok(count)
100    }
101}
102
103/// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also
104/// providing a peek at the BOM if one exists. Peeking at the BOM does not
105/// advance the reader.
106#[derive(Debug)]
107pub struct BomPeeker<R> {
108    rdr: R,
109    strip: bool,
110    bom: Option<PossibleBom>,
111    nread: usize,
112}
113
114impl<R: io::Read> BomPeeker<R> {
115    /// Create a new BomPeeker that includes the BOM in calls to `read`.
116    ///
117    /// The first three bytes can be read using the `peek_bom` method, but
118    /// will not advance the reader.
119    pub fn with_bom(rdr: R) -> BomPeeker<R> {
120        BomPeeker { rdr: rdr, strip: false, bom: None, nread: 0 }
121    }
122
123    /// Create a new BomPeeker that never includes the BOM in calls to `read`.
124    pub fn without_bom(rdr: R) -> BomPeeker<R> {
125        BomPeeker { rdr: rdr, strip: true, bom: None, nread: 0 }
126    }
127
128    /// Peek at the first three bytes of the underlying reader.
129    ///
130    /// This does not advance the reader provided by `BomPeeker`.
131    ///
132    /// If the underlying reader does not have at least two bytes available,
133    /// then `None` is returned.
134    pub fn peek_bom(&mut self) -> io::Result<PossibleBom> {
135        if let Some(bom) = self.bom {
136            return Ok(bom);
137        }
138        // If the underlying reader fails or panics, make sure we set at least
139        // an empty BOM so that we don't end up here again..
140        self.bom = Some(PossibleBom::new());
141
142        // OK, try to read the BOM.
143        let mut buf = [0u8; 3];
144        let bom_len = read_full(&mut self.rdr, &mut buf)?;
145        self.bom = Some(PossibleBom { bytes: buf, len: bom_len });
146        Ok(self.bom.unwrap())
147    }
148}
149
150impl<R: io::Read> io::Read for BomPeeker<R> {
151    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
152        if self.nread < 3 {
153            let bom = self.peek_bom()?;
154
155            // If we don't have a valid BOM (e.g., no encoding for it), then
156            // we always pass through the first 3 bytes. Otherwise, if we have
157            // a valid BOM, we only pass it thru if we don't want to strip it.
158            let bom = bom.as_slice(!self.strip);
159            if self.nread < bom.len() {
160                let rest = &bom[self.nread..];
161                let len = cmp::min(buf.len(), rest.len());
162                buf[..len].copy_from_slice(&rest[..len]);
163                self.nread += len;
164                return Ok(len);
165            }
166        }
167        let nread = self.rdr.read(buf)?;
168        self.nread += nread;
169        Ok(nread)
170    }
171}
172
173/// A PossibleBom is a sequence of bytes at the beginning of a stream that
174/// may represent an actual BOM. To detect the BOM, this must contain at
175/// least 3 bytes.
176///
177/// If this is a valid UTF-8 or UTF-16 BOM, then an encoding_rs decoder can
178/// be built from the BOM.
179#[derive(Clone, Copy, Debug, Eq, PartialEq)]
180pub struct PossibleBom {
181    bytes: [u8; 3],
182    len: usize,
183}
184
185impl PossibleBom {
186    /// Build a new empty BOM.
187    fn new() -> PossibleBom {
188        PossibleBom { bytes: [0; 3], len: 0 }
189    }
190
191    /// Return the BOM as a normal slice.
192    ///
193    /// If `bom` is true, then this includes any leading BOM bytes. Otherwise,
194    /// this only includes non-BOM bytes.
195    fn as_slice(&self, bom: bool) -> &[u8] {
196        let slice = &self.bytes[0..self.len];
197        if bom || slice.len() <= 1 {
198            slice
199        } else if &slice[0..2] == b"\xFF\xFE" || &slice[0..2] == b"\xFE\xFF" {
200            &slice[2..]
201        } else if slice == b"\xEF\xBB\xBF" {
202            &[]
203        } else {
204            slice
205        }
206    }
207
208    /// If this is a valid UTF-8 or UTF-16 BOM, return its corresponding
209    /// encoding. Otherwise, return `None`.
210    pub fn encoding(&self) -> Option<&'static Encoding> {
211        let bom = self.as_slice(true);
212        if bom.len() < 3 {
213            return None;
214        }
215        if let Some((enc, _)) = Encoding::for_bom(bom) {
216            return Some(enc);
217        }
218        None
219    }
220}
221
/// Fill `buf` from `rdr` as far as possible and report how many bytes were
/// read.
///
/// This behaves like `io::Read::read_exact`, except that hitting EOF before
/// `buf` is full is not an error: the count of bytes read so far is returned
/// instead of `UnexpectedEof`. Reads that fail with
/// `io::ErrorKind::Interrupted` are retried; any other error is returned
/// as is.
pub fn read_full<R: io::Read>(
    mut rdr: R,
    mut buf: &mut [u8],
) -> io::Result<usize> {
    let wanted = buf.len();
    while !buf.is_empty() {
        let n = match rdr.read(buf) {
            // EOF: stop with whatever has been collected.
            Ok(0) => break,
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // Advance past the bytes that were just filled in.
        let unfilled = buf;
        buf = &mut unfilled[n..];
    }
    // Whatever is no longer part of `buf` has been written.
    Ok(wanted - buf.len())
}
244
#[cfg(test)]
mod tests {
    use super::{BomPeeker, PossibleBom, TinyTranscoder};
    use encoding_rs::Encoding;
    use std::io::Read;

    /// Pull `want.len()` bytes out of `rdr` through a one-byte buffer,
    /// checking that every read yields exactly one byte with the expected
    /// value.
    fn expect_bytewise<R: Read>(rdr: &mut R, want: &[u8]) {
        let mut one = [0u8; 1];
        for &b in want {
            assert_eq!(rdr.read(&mut one).unwrap(), 1);
            assert_eq!(one, [b; 1]);
        }
    }

    /// Issue a single `read` with a 100-byte buffer and return the bytes it
    /// produced.
    fn read_once<R: Read>(rdr: &mut R) -> Vec<u8> {
        let mut tmp = [0u8; 100];
        let n = rdr.read(&mut tmp).unwrap();
        tmp[..n].to_vec()
    }

    /// The `PossibleBom` a peek should report when `prefix` (at most three
    /// bytes) was available from the reader.
    fn peeked(prefix: &[u8]) -> PossibleBom {
        let mut bytes = [0u8; 3];
        bytes[..prefix.len()].copy_from_slice(prefix);
        PossibleBom { bytes, len: prefix.len() }
    }

    #[test]
    fn tiny_utf16_normal() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let input = &b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00"[..];
        let mut tiny = TinyTranscoder::new();

        // The first pass fills the whole internal buffer.
        let (nin, nout) = tiny.transcode(&mut dec, input, false);
        assert_eq!((nin, nout), (14, 7));
        let rest = &input[nin..];
        expect_bytewise(&mut tiny, b"foobarb");

        // The second pass picks up the remaining two code units.
        let (nin, nout) = tiny.transcode(&mut dec, rest, false);
        assert_eq!((nin, nout), (4, 2));
        let rest = &rest[nin..];
        expect_bytewise(&mut tiny, b"az");

        // Signalling EOF produces nothing further.
        let (nin, nout) = tiny.transcode(&mut dec, rest, true);
        assert_eq!((nin, nout), (0, 0));

        let mut tmp = [0u8; 1];
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    #[test]
    fn tiny_utf16_invalid() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let input = &b"\x00"[..];
        let mut tiny = TinyTranscoder::new();
        let mut tmp = [0u8; 1];

        // Half a code unit is consumed but yields no output yet.
        let (nin, nout) = tiny.transcode(&mut dec, input, false);
        assert_eq!((nin, nout), (1, 0));
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
        let rest = &input[nin..];

        // At EOF the dangling byte turns into a replacement character.
        let (nin, nout) = tiny.transcode(&mut dec, rest, true);
        assert_eq!((nin, nout), (0, 3));

        expect_bytewise(&mut tiny, b"\xEF\xBF\xBD");
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    #[test]
    fn peeker_empty() {
        let buf: [u8; 0] = [];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(PossibleBom::new(), peeker.peek_bom().unwrap());

        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one() {
        let buf = [1u8];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(peeked(&[1]), peeker.peek_bom().unwrap());

        assert_eq!(read_once(&mut peeker), [1u8]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_two() {
        let buf = [1u8, 2];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(peeked(&[1, 2]), peeker.peek_bom().unwrap());

        assert_eq!(read_once(&mut peeker), [1u8, 2]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_three() {
        let buf = [1u8, 2, 3];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(peeked(&[1, 2, 3]), peeker.peek_bom().unwrap());

        assert_eq!(read_once(&mut peeker), [1u8, 2, 3]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_four() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(peeked(&[1, 2, 3]), peeker.peek_bom().unwrap());

        // The peeked prefix comes back first, then the rest of the stream.
        assert_eq!(read_once(&mut peeker), [1u8, 2, 3]);
        assert_eq!(read_once(&mut peeker), [4u8]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one_at_a_time() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);

        // A zero-length read yields nothing and leaves the buffer alone.
        let mut tmp = [0u8; 1];
        assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
        assert_eq!(0, tmp[0]);

        expect_bytewise(&mut peeker, &[1, 2, 3, 4]);
    }

    #[test]
    fn peeker_without_bom() {
        let buf = [b'\xEF', b'\xBB', b'\xBF', b'a'];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(peeked(b"\xEF\xBB\xBF"), peeker.peek_bom().unwrap());

        // The UTF-8 BOM is stripped; only the payload is read.
        assert_eq!(read_once(&mut peeker), [b'a']);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_without_bom_nobom() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(peeked(&[1, 2, 3]), peeker.peek_bom().unwrap());

        // Without a recognized BOM, nothing is stripped.
        assert_eq!(read_once(&mut peeker), [1u8, 2, 3]);
        assert_eq!(read_once(&mut peeker), [4u8]);
        assert!(read_once(&mut peeker).is_empty());
    }
}