├── .gitignore
├── Cargo.toml
├── src
│   ├── lib.rs
│   ├── iter.rs
│   ├── csv.rs
│   └── buffers.rs
├── UNLICENSE.txt
├── README.md
├── test_data
│   └── fr
│       └── sample.conllx
└── benches
    └── toy_parsers.rs

/.gitignore:
--------------------------------------------------------------------------------
/target
/Cargo.lock
.*.swp

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]

name = "rust-streaming"
version = "0.0.1"
authors = ["Eric Kidd"]

[lib]
name = "streaming"
doc = false

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! Experimental Rust utilities for writing fast, streaming parsers without
//! allocating memory.

#![license = "Public domain (Unlicense)"]
#![experimental]
// Feel free to disable these if they become too annoying.
#![deny(missing_doc)]
#![deny(warnings)]

#![feature(macro_rules)]

#[cfg(test)] extern crate test;

// Want to share your experiments, hacks, etc.? Just add a module.

pub mod csv;
pub mod iter;
pub mod buffers;

--------------------------------------------------------------------------------
/src/iter.rs:
--------------------------------------------------------------------------------
//! A replacement for `Iterator` that can return borrowed items.

#![macro_escape]

/// Like `Iterator`, but it allows you to store temporary data in the
/// iterator itself, and return temporary references from `next`.
///
/// Massive thanks to Sharp for figuring out how to do this.
pub trait StreamingIterator<'a, T> {
    /// Return either the next item in the sequence, or `None` if all items
    /// have been consumed.
    fn next(&'a mut self) -> Option<T>;
}

/// Similar to `for`, but doesn't enforce any trait restrictions on the
/// iterator.
#[macro_export]
macro_rules! streaming_for {
    ($var:pat in $expr:expr, $b:stmt) => {
        {
            // Only evaluate `$expr` once!
            let ref mut iter = &mut $expr;
            loop {
                match iter.next() {
                    None => { break; }
                    Some($var) => { $b }
                }
            }
        }
    };
}
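
// A minimal usage sketch, added for illustration (not in the original
// file): a streaming iterator that hands out `&str` slices borrowed from
// caller-owned data -- exactly the kind of borrow a plain `Iterator`
// cannot return. It mirrors `ZeroCopyParser` in `benches/toy_parsers.rs`;
// the `Words` type below is hypothetical.
#[cfg(test)]
mod example {
    use super::StreamingIterator;

    struct Words<'a> {
        remaining: &'a str,
    }

    impl<'a> StreamingIterator<'a, &'a str> for Words<'a> {
        fn next(&mut self) -> Option<&str> {
            if self.remaining.is_empty() { return None; }
            let end = self.remaining.find(' ')
                .unwrap_or(self.remaining.len());
            let word = self.remaining.slice_to(end);
            self.remaining = if end == self.remaining.len() {
                ""
            } else {
                self.remaining.slice_from(end + 1)
            };
            Some(word)
        }
    }

    #[test]
    fn streaming_for_visits_each_word() {
        let text = "foo bar baz".to_string();
        let mut words = Words{remaining: text.as_slice()};
        let mut count = 0u;
        streaming_for!(word in words, {
            // `word` borrows from `text` and is only valid in this block.
            assert_eq!(word.len(), 3);
            count += 1;
        });
        assert_eq!(count, 3u);
    }
}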

--------------------------------------------------------------------------------
/UNLICENSE.txt:
--------------------------------------------------------------------------------
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EXPERIMENTAL: Zero-Allocation Streaming Parsers in Rust

Here there be hacks. No APIs are stable. Code may not do what the
comments claim.

Key goal:

* Build a `StreamingIterator` type that can return references to internal
  state, such as I/O buffers and the output buffers of libraries like
  [`flate2`](https://github.com/alexcrichton/flate2-rs). This prevents
  implementing `collect`, but why can't we have `map`, `filter` and `fold`?

Target applications:

* [rust-csv](https://github.com/BurntSushi/rust-csv).
* Multicore map/reduce of
  [Snappy](https://code.google.com/p/snappy/)-compressed records.
* Anybody else who needs to iterate over a data stream without allocating.

Random useful things to read:

* [Higher-kinded types and why they're important](https://github.com/aturon/rfcs/blob/collections-conventions/text/0000-collection-conventions.md#lack-of-iterator-methods).
* [Emulating higher-kinded types with proposed associated types & lifetimes](https://github.com/rust-lang/rfcs/blob/master/text/0195-associated-items.md#encoding-higher-kinded-types).
* [Iterating short-lived objects](http://discuss.rust-lang.org/t/iterating-short-lived-objects/274)
* [Borrow scopes should not always be lexical](https://github.com/rust-lang/rust/issues/6393) (aka, "why we have one line of `unsafe`")
* [Borrow checker gets confused by conditionally returned borrows](https://github.com/rust-lang/rust/issues/12147) (same as above, but clearer)
* [Iterator returning items by reference, lifetime issue](http://stackoverflow.com/questions/24574741/iterator-returning-items-by-reference-lifetime-issue) (`Iterator` works the way it does for reasons explained here)

We beg for help:

* [Can I write zero-copy parsers with Iterator? It looks like it might get me a 47x speedup here.](http://www.reddit.com/r/rust/comments/2i6xry/can_i_write_zerocopy_parsers_with_iterator_it/)
* [Rust struct can borrow “&'a mut self” twice, so why can't a trait?](http://stackoverflow.com/questions/26192564/rust-struct-can-borrow-a-mut-self-twice-so-why-cant-a-trait)
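
To get a feel for the API difference, here is a sketch adapted from
`benches/toy_parsers.rs` (illustrative only, not compiled as part of these
docs; `reader` and `reader2` stand in for `BufferedReader`s from that file):

```rust
// Today's `Iterator`: every record comes back as three freshly
// allocated `String`s.
let mut parser = CopyingParser::new(&mut reader);
for (a, b, c) in parser {
    // a, b, c: String
}

// `StreamingIterator`: nothing is allocated, but each record borrows
// the reader's internal I/O buffer and is only valid inside the block.
let mut parser = ZeroCopyParser::new(&mut reader2);
streaming_for!((a, b, c) in parser, {
    // a, b, c: &str slices into the I/O buffer.
});
```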

--------------------------------------------------------------------------------
/src/csv.rs:
--------------------------------------------------------------------------------
#![allow(missing_doc)]
#![allow(dead_code)]
#![allow(unused_variable)]

use std::path::BytesContainer;

trait StreamIterator<A> {
    fn next_item<'a>(&'a mut self) -> Option<&'a A>;
}

struct CsvRdr;
struct CsvWtr;

impl CsvRdr {
    /// Returns `true` when the underlying data stream has been exhausted.
    fn done(&self) -> bool { false }
}

/// An iterator over fields in the current record.
///
/// When the end of the record is reached, the iterator yields `None`.
/// Subsequent invocations of the iterator yield fields from the next
/// record. If the underlying data stream has been exhausted (or if there
/// was an error parsing the data), `None` is returned indefinitely.
impl StreamIterator<[u8]> for CsvRdr {
    fn next_item<'a>(&'a mut self) -> Option<&'a [u8]> {
        // In real usage, this would return a slice of bytes from the CSV's
        // underlying data stream.
        // The slow version is allocating a new `Vec<u8>` and yielding that
        // instead.
        // The advantage of this approach is that it does not require an
        // allocation.
        None
    }
}

impl CsvWtr {
    /// Writes a single record to the CSV data.
    ///
    /// The input is an iterator of things that can produce a `&[u8]`.
    fn write_record<S: BytesContainer, I: StreamIterator<S>>
                   (&mut self, it: I) -> Result<(), String> {
        Ok(())
    }

    // A dummy impl to make the code below compile.
    fn write_record_regular_iter<S: BytesContainer, I: Iterator<S>>
                                (&mut self, it: I) -> Result<(), String> {
        Ok(())
    }
}

// A dummy impl to make the code below compile.
impl<'a> Iterator<&'a [u8]> for CsvRdr {
    fn next(&mut self) -> Option<&'a [u8]> { None }
}

/// The payoff.
///
/// Crucially, a "streaming iterator" puts the choice of allocation in the
/// hands of the caller. This is important because it lets the caller do
/// transformations either without allocating at all or without allocating
/// space for an entire record.
///
/// For example, consider the task of reading CSV data with 100 columns and
/// transforming it to CSV data with only 2 columns. A forced allocation here
/// can be quite costly. But if the caller is left to choose, then they can
/// "select" their two fields to write to new CSV data.
fn main() {
    let rdr = CsvRdr;
    let mut wtr = CsvWtr;

    while !rdr.done() {
        // This should be `wtr.write_record`.
        wtr.write_record_regular_iter(
            // None of these methods work on `StreamIterator`, but AFAIK,
            // there is no *fundamental* reason why they can't. It just may
            // not be expressible in Rust.
            rdr.enumerate()
               .filter(|&(i, _)| i == 4 || i == 58)
               .map(|(_, field)| field)).unwrap();
    }
}
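
// For contrast, a sketch added for illustration (this helper is
// hypothetical, not part of the original design): doing the same
// "keep columns 4 and 58" selection through an iterator that yields
// *owned* fields forces one `Vec<u8>` allocation per field -- plus one
// for the collected record -- even though 98 of the 100 fields are
// thrown away immediately.
fn select_two_with_copying<I: Iterator<Vec<u8>>>(record: I) -> Vec<Vec<u8>> {
    record.enumerate()
          .filter(|&(i, _)| i == 4 || i == 58)
          .map(|(_, field)| field)
          .collect()
}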

--------------------------------------------------------------------------------
/test_data/fr/sample.conllx:
--------------------------------------------------------------------------------
1 chapitre chapitre N NC _ 0 root _ _
2 premier premier A ADJ _ 1 mod _ _

1 les le D DET _ 3 det _ _
2 trois *trois A ADJ _ 3 dep _ _
3 présents présent A ADJ _ 0 root _ _
4 de de P P _ 3 dep _ _
5 m *m N NC _ 4 obj _ _
6 . . PONCT PONCT _ 3 ponct _ _
7 d' de P P _ 3 dep _ _
8 artagnan *artagnan D DET _ 9 det _ _
9 père père N NC _ 7 obj _ _

1 Le le D DET _ 3 det _ _
2 premier premier A ADJ _ 3 mod _ _
3 lundi lundi N NC _ 25 suj _ _
4 du *du P+D P+D _ 3 dep _ _
5 mois mois N NC _ 4 obj _ _
6 d' de P P _ 5 dep _ _
7 avril avril N NC _ 6 obj _ _
8 1625 *1625 A ADJ _ 7 mod _ _
9 , , PONCT PONCT _ 25 ponct _ _
10 le le D DET _ 11 det _ _
11 bourg bourg N NC _ 25 suj _ _
12 de de P P _ 11 dep _ _
13 Meung *Meung N NPP _ 12 obj _ _
14 , , PONCT PONCT _ 11 ponct _ _
15 où où PRO PROREL _ 16 suj _ _
16 naquit naître V V _ 11 mod_rel _ _
17 l' le D DET _ 18 det _ _
18 auteur auteur N NC _ 16 obj _ _
19 du *du P+D P+D _ 18 dep _ _
20 Roman Roman N NPP _ 19 obj _ _
21 de de P P _ 20 dep _ _
22 la le D DET _ 23 det _ _
23 Rose Rose N NPP _ 21 obj _ _
24 , , PONCT PONCT _ 25 ponct _ _
25 semblait sembler V V _ 0 root _ _
26 être être V VINF _ 25 ats _ _
27 dans dans P P _ 26 mod _ _
28 une un D DET _ 29 det _ _
29 révolution révolution N NC _ 27 obj _ _
30 aussi aussi ADV ADV _ 31 mod _ _
31 entière entier A ADJ _ 29 mod _ _
32 que que C CS _ 26 obj _ _
33 si si C CS _ 38 mod _ _
34 les le D DET _ 35 det _ _
35 huguenots huguenot N NC _ 38 suj _ _
36 en en P P _ 38 mod _ _
37 fussent *fussent V VPR _ 38 aux_tps _ _
38 venus venir V VPP _ 32 obj _ _
39 faire faire V VINF _ 38 obj _ _
40 une un D DET _ 42 det _ _
41 seconde seconde A ADJ _ 42 mod _ _
42 Rochelle Rochelle N NPP _ 39 obj _ _
43 . . PONCT PONCT _ 25 ponct _ _

1 Plusieurs plusieurs D DET _ 2 det _ _
2 bourgeois bourgeois N NC _ 18 suj _ _
3 , , PONCT PONCT _ 2 ponct _ _
4 voyant voir V VPR _ 2 mod _ _
5 s' clr CL CLR _ 6 aff _ _
6 enfuir enfuir V VINF _ 4 obj _ _
7 les le D DET _ 8 det _ _
8 femmes femme N NC _ 6 obj _ _
9 du *du P+D P+D _ 8 dep _ _
10 côté côté N NC _ 9 obj _ _
11 de de P P _ 10 dep _ _
12 la le D DET _ 13 det _ _
13 Grande-Rue *Grande-Rue N NPP _ 11 obj _ _
14 , , PONCT PONCT _ 2 ponct _ _
15 entendant entendre V VPR _ 2 mod _ _
16 les le D DET _ 17 det _ _
17 enfants enfant N NC _ 15 obj _ _
18 crier crier V VINF _ 26 suj _ _
19 sur sur P P _ 18 mod _ _
20 le le D DET _ 21 det _ _
21 seuil seuil N NC _ 19 obj _ _
22 des *des P+D P+D _ 21 dep _ _
23 portes porte N NC _ 22 obj _ _
24 , , PONCT PONCT _ 26 ponct _ _
25 se clr CL CLR _ 26 aff _ _
26 hâtaient hâter V V _ 0 root _ _
27 d' de P P _ 26 obj _ _
28 endosser endosser V VINF _ 27 obj _ _
29 la le D DET _ 30 det _ _
30 cuirasse cuirasse N NC _ 28 obj _ _
31 et et C CC _ 26 coord _ _
32 , , PONCT PONCT _ 31 ponct _ _
33 appuyant appuyer V VPR _ 31 dep_coord _ _
34 leur son D DET _ 35 det _ _
35 contenance contenance N NC _ 33 obj _ _
36 quelque quelque A ADJ _ 35 mod _ _
37 peu peu ADV ADV _ 38 mod _ _
38 incertaine incertain A ADJ _ 35 mod _ _
39 d' de P P _ 35 dep _ _
40 un un D DET _ 41 det _ _
41 mousquet mousquet N NC _ 39 obj _ _
42 ou ou C CC _ 39 coord _ _
43 d' de P P _ 42 dep_coord _ _
44 une un D DET _ 45 det _ _
45 pertuisane pertuisane N NC _ 43 obj _ _
46 , , PONCT PONCT _ 48 ponct _ _
47 se clr CL CLR _ 48 aff _ _
48 dirigeaient diriger V V _ 26 mod _ _
49 vers vers P P _ 48 mod _ _
50 l' le D DET _ 51 det _ _
51 hôtellerie hôtellerie N NC _ 49 obj _ _
52 du *du P+D P+D _ 51 dep _ _
53 Franc Franc N NPP _ 52 obj _ _
54 Meunier *Meunier N NPP _ 53 mod _ _
55 , , PONCT PONCT _ 48 ponct _ _
56 devant devant P P _ 59 mod _ _
57 laquelle lequel PRO PROREL _ 56 obj _ _
58 s' clr CL CLR _ 59 aff _ _
59 empressait empresser V V _ 48 mod _ _
60 , , PONCT PONCT _ 59 ponct _ _
61 en en P P _ 59 mod _ _
62 grossissant grossir V VPR _ 61 obj _ _
63 de de P P _ 62 de_obj _ _
64 minute minute N NC _ 63 obj _ _
65 en en P P _ 62 mod _ _
66 minute minute N NC _ 65 obj _ _
67 , , PONCT PONCT _ 62 ponct _ _
68 un un D DET _ 69 det _ _
69 groupe groupe N NC _ 62 obj _ _
70 compact compact A ADJ _ 69 mod _ _
71 , , PONCT PONCT _ 69 ponct _ _
72 bruyant bruyant A ADJ _ 69 mod _ _
73 et et C CC _ 72 coord _ _
74 plein plein A ADJ _ 73 dep_coord _ _
75 de de P P _ 69 dep _ _
76 curiosité curiosité N NC _ 75 obj _ _
77 . . PONCT PONCT _ 26 ponct _ _

--------------------------------------------------------------------------------
/benches/toy_parsers.rs:
--------------------------------------------------------------------------------
//! I need the advice of some more experienced Rust hackers.
//!
//! I want to use Rust to write high-speed, un-pwn-able parsers. This
//! seems like a great application for libraries written in Rust. Below, I
//! compare the performance and API of copying parsers versus zero-copy
//! parsers. Anyway, here's the key benchmark:
//!
//! ```
//! test copying_parser   ... bench: 90432 ns/iter (+/- 5644) = 26 MB/s
//! test zero_copy_parser ... bench:  1926 ns/iter (+/- 135) = 1246 MB/s
//! ```
//!
//! My goal: Can we make zero_copy_parser an instance of Iterator? Or does
//! Iterator force us to use something like copying_parser?
//!
//! And if Iterator does force us to copy, is there some way to change
//! Iterator that allows us to use zero_copy_parser without causing ugly
//! design issues elsewhere?

#![feature(phase)]

#[cfg(test)] extern crate test;
#[phase(plugin, link)] extern crate streaming;

use std::iter::range;
use streaming::iter::StreamingIterator;


//=========================================================================
// Infrastructure

static LINE: &'static str = "foo bar baz\n";

/// This is our raw data source. Pretend it's on disk somewhere, and it's
/// too big to load into memory all at once.
pub fn make_pretend_file() -> String {
    let mut result: String = String::new();
    for _ in range(0u, 200) { result.push_str(LINE); }
    result
}

/// This is our stand-in for Buffer. Here, `next_line` is a simpler
/// stand-in for `fill_buf`, `consume`, etc.
pub trait Buffer {
    fn next_line<'a>(&'a mut self) -> Option<&'a str>;
}

/// This is our stand-in for a smart implementation of the Buffer trait.
/// In the real world, it has an internal buffer of some sort, and it has
/// some magic to finesse buffer boundaries for us (in an amortized
/// fashion), so we always get all the data associated with a given
/// iteration.
pub struct BufferedReader<'a> {
    // This represents a file, a network connection, a streaming Snappy
    // decompressor, etc.
    file: &'a str,
    offset: uint,
    // This represents the I/O buffer inside a real BufferedReader. No fair
    // taking this out!
    buffer: String
}

impl<'a> BufferedReader<'a> {
    /// Create a new BufferedReader.
    pub fn new(file: &'a str) -> BufferedReader<'a> {
        BufferedReader{file: file, offset: 0,
                       buffer: String::with_capacity(LINE.len())}
    }
}

impl<'a> Buffer for BufferedReader<'a> {
    /// Return a line with no allocations. Again, a massive
    /// oversimplification: We're assuming our return value points into an
    /// I/O buffer. The analogous real-world function is Buffer::fill_buf,
    /// plus some custom magic to get us complete lines.
    #[inline]
    fn next_line<'a>(&'a mut self) -> Option<&'a str> {
        if self.offset == self.file.len() { return None; }
        let result = self.file.slice(self.offset, self.offset + LINE.len());
        self.buffer.clear();
        self.buffer.push_str(result);
        self.offset += LINE.len();
        Some(self.buffer.as_slice())
    }
}


//=========================================================================
// CopyingParser

pub struct CopyingParser<'a> {
    reader: &'a mut BufferedReader<'a>
}

impl<'a> CopyingParser<'a> {
    pub fn new(reader: &'a mut BufferedReader<'a>) -> CopyingParser<'a> {
        CopyingParser{reader: reader}
    }
}

impl<'a> Iterator<(String,String,String)> for CopyingParser<'a> {
    // We can use the iterator protocol here, but we need to copy.
    fn next(&mut self) -> Option<(String,String,String)> {
        match self.reader.next_line() {
            None => None,
            Some(line) => {
                Some((line.slice(0, 3).to_string(),
                      line.slice(4, 7).to_string(),
                      line.slice(8, 11).to_string()))
            }
        }
    }
}

#[bench]
fn copying_parser(b: &mut test::Bencher) {
    let file = make_pretend_file();
    b.bytes = file.len() as u64;
    b.iter(|| {
        let mut reader = BufferedReader::new(file.as_slice());
        let mut parser = CopyingParser::new(&mut reader);
        // This looks nice, but it's really slow.
        for result in parser {
            test::black_box(result);
        }
    });
}


//=========================================================================
// ZeroCopyParser

pub struct ZeroCopyParser<'a> {
    reader: &'a mut BufferedReader<'a>,
}

impl<'a> ZeroCopyParser<'a> {
    pub fn new(reader: &'a mut BufferedReader<'a>) -> ZeroCopyParser<'a> {
        ZeroCopyParser{reader: reader}
    }
}

impl<'a> StreamingIterator<'a, (&'a str, &'a str, &'a str)>
    for ZeroCopyParser<'a> {

    fn next(&mut self) -> Option<(&str, &str, &str)> {
        match self.reader.next_line() {
            None => None,
            Some(ref line) => {
                // Like above, but keep our strings in BufferedReader's
                // internal buffer.
                Some((line.slice(0, 3),
                      line.slice(4, 7),
                      line.slice(8, 11)))
            }
        }
    }
}

#[bench]
fn zero_copy_parser(b: &mut test::Bencher) {
    let file = make_pretend_file();
    b.bytes = file.len() as u64;
    b.iter(|| -> () {
        let mut reader = BufferedReader::new(file.as_slice());
        let mut parser = ZeroCopyParser::new(&mut reader);

        streaming_for!(line in parser, {
            test::black_box(line);
        });
    });
}

--------------------------------------------------------------------------------
/src/buffers.rs:
--------------------------------------------------------------------------------
//! Custom buffer support.
//!
//! WARNING: Don't believe the 'boundary' parameter. It's a lie.

use std::cmp::min;
use std::iter::range;
use std::io::{Buffer,EndOfFile,IoError,IoResult};
use std::mem::transmute;
use std::rand::{Rng,task_rng};

#[cfg(test)] use std::io::{File,MemReader};
#[cfg(test)] use std::str::from_utf8;


/// An internal trait with some convenience functions.
pub trait SliceContains {
    /// Does `needle` appear in this buffer?
    fn contains_slice(&self, needle: &[u8]) -> bool;
    /// At what location does `needle` appear in this buffer?
    fn contains_slice_pos(&self, needle: &[u8]) -> Option<uint>;
}

impl<'a> SliceContains for &'a [u8] {
    #[inline(never)]
    fn contains_slice(&self, needle: &[u8]) -> bool {
        self.contains_slice_pos(needle).is_some()
    }

    // XXX - Ignores _needle for now, hardcoded for speed.
    #[inline(never)]
    fn contains_slice_pos(&self, _needle: &[u8]) -> Option<uint> {
        // This will burn 50% of our total program execution time if we let
        // it.
        //self.windows(needle.len()).position(|w| w == needle)
        //if self.len() < needle.len() { return None; }
        //'outer: for i in range(0, self.len()-(needle.len()+1)) {
        //    if self[i] == needle[0] {
        //        for j in range(0, needle.len()) {
        //            if self[i+j] != needle[j] { continue 'outer; }
        //        }
        //        return Some(i);
        //    }
        //}
        //return None;

        // XXX - Hardcoded for performance.
        if self.len() < 2 { return None; }
        for i in range(0, self.len()-1) {
            if self[i] == 10 && self[i+1] == 10 { return Some(i); }
        }
        None
    }
}
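
// A small test sketch, added for illustration: what the hardcoded search
// above actually finds is a blank line -- two consecutive `\n` bytes
// (10, 10), the sentence separator in test_data/fr/sample.conllx -- no
// matter what `needle` the caller passes. Hence the module-level warning.
#[test]
fn contains_slice_pos_is_hardcoded_to_blank_lines() {
    let data = "abc\n\ndef".as_bytes();
    // The needle is ignored; only the `\n\n` starting at byte 3 is found.
    assert_eq!(data.contains_slice_pos("anything".as_bytes()), Some(3u));
    assert!(data.contains_slice("anything".as_bytes()));
}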

/// Used for testing other buffers. Dribbles bytes through in small,
/// random increments.
pub struct DribbleBuffer<'a, T: Buffer+'a> {
    input: &'a mut T
}

impl<'a,T: Buffer+'a> DribbleBuffer<'a, T> {
    /// Create a new wrapper around `input`.
    pub fn new(input: &'a mut T) -> DribbleBuffer<'a, T> {
        DribbleBuffer{input: input}
    }
}

impl<'a,T: Buffer+'a> Reader for DribbleBuffer<'a,T> {
    fn read(&mut self, buf: &mut [u8]) -> IoResult<uint> {
        self.input.read(buf)
    }
}

impl<'a,T: Buffer+'a> Buffer for DribbleBuffer<'a,T> {
    fn fill_buf<'a>(&'a mut self) -> IoResult<&'a [u8]> {
        let original = try!(self.input.fill_buf());
        let limit = task_rng().gen::<uint>() % 6;
        Ok(original[..min(original.len(), limit)])
    }

    fn consume(&mut self, amt: uint) {
        self.input.consume(amt)
    }
}

#[cfg(test)]
fn test_data() -> Vec<u8> {
    let path = "test_data/fr/sample.conllx";
    File::open(&Path::new(path)).read_to_end().unwrap()
}

#[test]
fn dribble_buffer_read_to_string() {
    let data = test_data();
    let mut reader = MemReader::new(data.clone());
    let mut buffer = DribbleBuffer::new(&mut reader);
    let lines: Vec<String> = buffer.lines().map(|l| l.unwrap()).collect();
    let via_buffer = lines.concat();
    assert_eq!(from_utf8(data.as_slice()).unwrap(), via_buffer.as_slice());
}

/// A buffer which breaks chunks only after the specified boundary
/// sequence, or at the end of a file, but nowhere else.
pub struct ChunkBuffer<'a, T: Buffer+'a> {
    input: &'a mut T,
    boundary: Vec<u8>,
    buffer: Vec<u8>
}

impl<'a, T: Buffer+'a> ChunkBuffer<'a,T> {
    /// Create a new `ChunkBuffer` wrapping `input` and breaking at
    /// `boundary`.
    pub fn new(input: &'a mut T, boundary: &[u8]) -> ChunkBuffer<'a,T> {
        ChunkBuffer{input: input, boundary: boundary.to_vec(),
                    buffer: vec![]}
    }

    // Called internally to make `buffer` valid. This is where all our
    // evil magic lives.
    fn top_up<'b>(&'b mut self) -> IoResult<&'b [u8]> {
        assert!(!self.buffer.as_slice()
                     .contains_slice(self.boundary.as_slice()));
        loop {
            let (consumed, done) = {
                let read_or_err = self.input.fill_buf();
                match read_or_err {
                    Err(IoError{kind: EndOfFile, ..}) => {
                        // Exit 1: We're at the end of the file, so use
                        // whatever we've got.
                        return Ok(self.buffer.as_slice())
                    },
                    Err(err) => {
                        // Exit 2: We've got a hard error.
                        return Err(err)
                    },
                    Ok(read) => {
                        // Try to grab enough so that we know we have a
                        // chunk.
                        match read.contains_slice_pos(self.boundary.as_slice()) {
                            Some(pos) => {
                                let bytes = pos + self.boundary.len();
                                self.buffer.push_all(read[..bytes]);
                                (bytes, true)
                            }
                            None => {
                                let buf_len = self.buffer.len();
                                let bound_len = self.boundary.len();
                                // We'll look here for a split boundary token.
                                let scan_start =
                                    buf_len - min(buf_len, bound_len-1);
                                let scan_end = min(buf_len + (bound_len-1),
                                                   buf_len + read.len());
                                self.buffer.push_all(read);
                                let check =
                                    self.buffer.slice(scan_start, scan_end);
                                (read.len(),
                                 check.contains_slice(self.boundary.as_slice()))
                            }
                        }
                    }
                }
            };
            self.input.consume(consumed);
            if done {
                // Exit 3: We've got at least one boundary in our buffer.
                assert!(self.buffer.as_slice()
                             .contains_slice(self.boundary.as_slice()));
                return Ok(self.buffer.as_slice())
            }
        }
    }
}

impl<'a,T: Buffer+'a> Reader for ChunkBuffer<'a,T> {
    fn read(&mut self, _buf: &mut [u8]) -> IoResult<uint> {
        // We need to drain our internal buffer first, then our external
        // buffer.
        fail!("Not yet implemented");
    }
}

impl<'a,T: Buffer+'a> Buffer for ChunkBuffer<'a,T> {
    fn fill_buf<'a>(&'a mut self) -> IoResult<&'a [u8]> {
        if self.buffer.as_slice().contains_slice(self.boundary.as_slice()) {
            // Exit 1: Valid data in our local buffer.
            Ok(self.buffer.as_slice())
        } else if self.buffer.len() > 0 {
            // Exit 2: Add some more data to our local buffer so that it's
            // valid (see invariants for top_up).
            self.top_up()
        } else {
            {
                let read_or_err = self.input.fill_buf();
                // Exit 3: Error when reading underlying buffer.
                match read_or_err {
                    Err(err) => { return Err(err); }
                    Ok(read) => {
                        if read.contains_slice(self.boundary.as_slice()) {
                            // Exit 4: We can return this directly, but see
                            // https://github.com/rust-lang/rust/issues/6393
                            // https://github.com/rust-lang/rust/issues/12147
                            // for a discussion of why we need unsafe here.
                            // Basically, we need to break the lifetime
                            // propagation between `read` and our return
                            // value, so `read` can be allowed to lapse
                            // when we leave this lexical scope.
                            return Ok(unsafe { transmute(read) });
                        }
                    }
                }
            }

            // Exit 5: Accumulate sufficient data in our local buffer (see
            // invariants for top_up).
            self.top_up()
        }
    }

    fn consume(&mut self, amt: uint) {
        if self.buffer.len() > 0 {
            assert!(amt <= self.buffer.len());
            let keeping = self.buffer.len() - amt;
            // Shift the unconsumed bytes to the front of the buffer,
            // preserving their order, then drop the consumed prefix.
            for i in range(0, keeping) {
                let byte = self.buffer.as_slice()[amt+i];
                self.buffer.as_mut_slice()[i] = byte;
            }
            self.buffer.truncate(keeping);
        } else {
            self.input.consume(amt);
        }
    }
}

#[cfg(test)]
fn read_chunks<T: Buffer>(chunked: &mut T, boundary: &[u8]) -> Vec<u8> {
    let boundary_len = boundary.len();
    let mut read = vec![];
    loop {
        let consumed = {
            match chunked.fill_buf() {
                Ok(ref data) => {
                    read.push_all(data.as_slice());
                    let data_len = data.len();
                    assert!(data_len >= boundary_len);
                    assert!(data.contains_slice(boundary));
                    data_len
                }
                Err(IoError{kind: EndOfFile, ..}) => { break; }
                Err(err) => { fail!("{}", err); }
            }
        };
        chunked.consume(consumed);
    }
    read
}

#[test]
fn reading_chunks() {
    let data = test_data();
    let mut reader = MemReader::new(data.clone());
    let mut chunked = ChunkBuffer::new(&mut reader, &[10, 10]);
    let read = read_chunks(&mut chunked, &[10, 10]);
    assert_eq!(data, read);
}

#[test]
fn reading_chunks_via_dribble() {
    let data = test_data();
    let mut reader = MemReader::new(data.clone());
    let mut dribble = DribbleBuffer::new(&mut reader);
    let mut chunked = ChunkBuffer::new(&mut dribble, &[10, 10]);
    let read = read_chunks(&mut chunked, &[10, 10]);
    assert_eq!(data, read);
}

--------------------------------------------------------------------------------