1use std::cmp;
2use std::io;
3
4use encoding_rs::{CoderResult, Decoder, Encoding};
5
6const TINY_BUFFER_SIZE: usize = 7;
9
10#[derive(Clone, Debug)]
26pub struct TinyTranscoder {
27 partial: [u8; TINY_BUFFER_SIZE],
31 len: usize,
33 pos: usize,
35}
36
37impl TinyTranscoder {
38 pub fn new() -> TinyTranscoder {
40 TinyTranscoder { partial: [0; TINY_BUFFER_SIZE], len: 0, pos: 0 }
41 }
42
43 pub fn transcode(
59 &mut self,
60 decoder: &mut Decoder,
61 src: &[u8],
62 last: bool,
63 ) -> (usize, usize) {
64 assert!(self.as_slice().is_empty(), "transcoder has unconsumed bytes");
65 if last {
66 assert!(src.is_empty(), "src must be empty when last==true");
67 }
68 let (res, nin, nout, _) =
69 decoder.decode_to_utf8(src, &mut self.partial[..], last);
70 if last {
71 assert_eq!(
72 res,
73 CoderResult::InputEmpty,
74 "input should be exhausted",
75 );
76 }
77 self.pos = 0;
78 self.len = nout;
79 (nin, nout)
80 }
81
82 fn as_slice(&self) -> &[u8] {
84 &self.partial[self.pos..self.len]
85 }
86}
87
88impl io::Read for TinyTranscoder {
89 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
90 if self.pos >= self.len {
91 return Ok(0);
92 }
93 let mut count = 0;
94 for (src, dst) in self.as_slice().iter().zip(buf) {
95 *dst = *src;
96 count += 1;
97 }
98 self.pos += count;
99 Ok(count)
100 }
101}
102
103#[derive(Debug)]
107pub struct BomPeeker<R> {
108 rdr: R,
109 strip: bool,
110 bom: Option<PossibleBom>,
111 nread: usize,
112}
113
114impl<R: io::Read> BomPeeker<R> {
115 pub fn with_bom(rdr: R) -> BomPeeker<R> {
120 BomPeeker { rdr: rdr, strip: false, bom: None, nread: 0 }
121 }
122
123 pub fn without_bom(rdr: R) -> BomPeeker<R> {
125 BomPeeker { rdr: rdr, strip: true, bom: None, nread: 0 }
126 }
127
128 pub fn peek_bom(&mut self) -> io::Result<PossibleBom> {
135 if let Some(bom) = self.bom {
136 return Ok(bom);
137 }
138 self.bom = Some(PossibleBom::new());
141
142 let mut buf = [0u8; 3];
144 let bom_len = read_full(&mut self.rdr, &mut buf)?;
145 self.bom = Some(PossibleBom { bytes: buf, len: bom_len });
146 Ok(self.bom.unwrap())
147 }
148}
149
150impl<R: io::Read> io::Read for BomPeeker<R> {
151 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
152 if self.nread < 3 {
153 let bom = self.peek_bom()?;
154
155 let bom = bom.as_slice(!self.strip);
159 if self.nread < bom.len() {
160 let rest = &bom[self.nread..];
161 let len = cmp::min(buf.len(), rest.len());
162 buf[..len].copy_from_slice(&rest[..len]);
163 self.nread += len;
164 return Ok(len);
165 }
166 }
167 let nread = self.rdr.read(buf)?;
168 self.nread += nread;
169 Ok(nread)
170 }
171}
172
173#[derive(Clone, Copy, Debug, Eq, PartialEq)]
180pub struct PossibleBom {
181 bytes: [u8; 3],
182 len: usize,
183}
184
185impl PossibleBom {
186 fn new() -> PossibleBom {
188 PossibleBom { bytes: [0; 3], len: 0 }
189 }
190
191 fn as_slice(&self, bom: bool) -> &[u8] {
196 let slice = &self.bytes[0..self.len];
197 if bom || slice.len() <= 1 {
198 slice
199 } else if &slice[0..2] == b"\xFF\xFE" || &slice[0..2] == b"\xFE\xFF" {
200 &slice[2..]
201 } else if slice == b"\xEF\xBB\xBF" {
202 &[]
203 } else {
204 slice
205 }
206 }
207
208 pub fn encoding(&self) -> Option<&'static Encoding> {
211 let bom = self.as_slice(true);
212 if bom.len() < 3 {
213 return None;
214 }
215 if let Some((enc, _)) = Encoding::for_bom(bom) {
216 return Some(enc);
217 }
218 None
219 }
220}
221
/// Reads from `rdr` until `buf` is full or the reader reports end of input.
///
/// Reads that fail with `io::ErrorKind::Interrupted` are retried. Returns
/// the number of bytes placed at the front of `buf`, which is smaller than
/// `buf.len()` only when the reader ran out of data first.
///
/// # Errors
///
/// Returns the first error from `rdr` whose kind is not `Interrupted`.
pub fn read_full<R: io::Read>(
    mut rdr: R,
    buf: &mut [u8],
) -> io::Result<usize> {
    let mut filled = 0;
    // `!=` rather than `<`: a reader that claims to have produced more bytes
    // than it was given room for trips the slice bounds check below instead
    // of being silently accepted.
    while filled != buf.len() {
        match rdr.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            Err(err) => {
                if err.kind() != io::ErrorKind::Interrupted {
                    return Err(err);
                }
            }
        }
    }
    Ok(filled)
}
244
#[cfg(test)]
mod tests {
    use super::{BomPeeker, PossibleBom, TinyTranscoder};
    use encoding_rs::Encoding;
    use std::io::Read;

    /// Reads through a one-byte buffer; `None` means the reader returned 0.
    fn next_byte<R: Read>(rdr: &mut R) -> Option<u8> {
        let mut one = [0u8; 1];
        if rdr.read(&mut one).unwrap() == 0 {
            None
        } else {
            Some(one[0])
        }
    }

    /// Performs a single `read` into a 100-byte buffer and returns exactly
    /// the bytes that call produced.
    fn read_chunk<R: Read>(rdr: &mut R) -> Vec<u8> {
        let mut tmp = [0u8; 100];
        let n = rdr.read(&mut tmp).unwrap();
        tmp[..n].to_vec()
    }

    // Nine UTF-16LE code units do not fit in the tiny buffer at once, so the
    // input has to be transcoded in two rounds plus a final flush.
    #[test]
    fn tiny_utf16_normal() {
        let mut decoder = Encoding::for_label(b"utf-16le")
            .unwrap()
            .new_decoder_with_bom_removal();
        let mut input = &b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00"[..];
        let mut tiny = TinyTranscoder::new();

        assert_eq!(tiny.transcode(&mut decoder, input, false), (14, 7));
        input = &input[14..];
        for &want in b"foobarb".iter() {
            assert_eq!(next_byte(&mut tiny), Some(want));
        }

        assert_eq!(tiny.transcode(&mut decoder, input, false), (4, 2));
        input = &input[4..];
        for &want in b"az".iter() {
            assert_eq!(next_byte(&mut tiny), Some(want));
        }

        assert_eq!(tiny.transcode(&mut decoder, input, true), (0, 0));
        assert_eq!(next_byte(&mut tiny), None);
    }

    // A lone byte is half a code unit: nothing is emitted until the final
    // flush, which produces the three bytes EF BF BD.
    #[test]
    fn tiny_utf16_invalid() {
        let mut decoder = Encoding::for_label(b"utf-16le")
            .unwrap()
            .new_decoder_with_bom_removal();
        let mut input = &b"\x00"[..];
        let mut tiny = TinyTranscoder::new();

        assert_eq!(tiny.transcode(&mut decoder, input, false), (1, 0));
        assert_eq!(next_byte(&mut tiny), None);
        input = &input[1..];

        assert_eq!(tiny.transcode(&mut decoder, input, true), (0, 3));
        for &want in b"\xEF\xBF\xBD".iter() {
            assert_eq!(next_byte(&mut tiny), Some(want));
        }
        assert_eq!(next_byte(&mut tiny), None);
    }

    #[test]
    fn peeker_empty() {
        let src: [u8; 0] = [];
        let mut peeker = BomPeeker::with_bom(&src[..]);
        assert_eq!(PossibleBom::new(), peeker.peek_bom().unwrap());

        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one() {
        let src = [1u8];
        let mut peeker = BomPeeker::with_bom(&src[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 0, 0], len: 1 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1u8]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_two() {
        let src = [1u8, 2];
        let mut peeker = BomPeeker::with_bom(&src[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 0], len: 2 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1u8, 2]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_three() {
        let src = [1u8, 2, 3];
        let mut peeker = BomPeeker::with_bom(&src[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1u8, 2, 3]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    // The peeked prefix and the remainder arrive in separate reads.
    #[test]
    fn peeker_four() {
        let src = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&src[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1u8, 2, 3]);
        assert_eq!(read_chunk(&mut peeker), vec![4u8]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one_at_a_time() {
        let src = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&src[..]);

        // A zero-length read must neither consume nor write anything.
        let mut one = [0u8; 1];
        assert_eq!(0, peeker.read(&mut one[..0]).unwrap());
        assert_eq!(0, one[0]);

        for want in 1u8..5 {
            assert_eq!(next_byte(&mut peeker), Some(want));
        }
    }

    #[test]
    fn peeker_without_bom() {
        let src = [b'\xEF', b'\xBB', b'\xBF', b'a'];
        let mut peeker = BomPeeker::without_bom(&src[..]);
        assert_eq!(
            PossibleBom { bytes: [b'\xEF', b'\xBB', b'\xBF'], len: 3 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![b'a']);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    // Bytes that are not a BOM are passed through even when stripping.
    #[test]
    fn peeker_without_bom_nobom() {
        let src = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::without_bom(&src[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1u8, 2, 3]);
        assert_eq!(read_chunk(&mut peeker), vec![4u8]);
        assert!(read_chunk(&mut peeker).is_empty());
    }
}