├── .gitignore ├── tange-collection ├── .gitignore ├── Cargo.toml ├── src │ ├── collection │ │ ├── mod.rs │ │ ├── memory.rs │ │ └── disk.rs │ ├── utils.rs │ ├── lib.rs │ ├── partitioned.rs │ └── interfaces.rs └── README.md ├── tange-core ├── .gitignore ├── Cargo.toml ├── src │ ├── task.rs │ ├── lib.rs │ ├── graph.rs │ ├── deferred.rs │ └── scheduler.rs └── README.md ├── Cargo.toml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target/ 3 | -------------------------------------------------------------------------------- /tange-collection/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | -------------------------------------------------------------------------------- /tange-core/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target/ 3 | **/*.rs.bk 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "tange-core", 4 | "tange-collection", 5 | ] 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Tange 2 | --- 3 | 4 | Tange is a Task-based parallelization library written for Rust. It currently comes with two flavors: 5 | 6 | 1. [tange-core](tange-core): This contains the primitives for constructing and executing a task graphs 7 | 2. [tange-collection](tange-collection): This contains a higher level Dataflow interface for convenient munging of data. 8 | -------------------------------------------------------------------------------- /tange-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tange" 3 | version = "0.1.1" 4 | authors = ["Andrew Stanton "] 5 | description = "Scalable Task-based Parallelism Framework" 6 | license = "Apache-2.0/MIT" 7 | repository = "https://github.com/Refefer/tange/tree/master/tange-core" 8 | readme = "README.md" 9 | keywords = ["parallel", "thread", "concurrency", "dataflow", "performance"] 10 | categories = ["concurrency"] 11 | 12 | [dependencies] 13 | log = "0.4" 14 | priority-queue = "0.5.1" 15 | jobpool = "0.3.8" 16 | num_cpus = "1.0" 17 | 18 | [lib] 19 | name = "tange" 20 | path = "src/lib.rs" 21 | -------------------------------------------------------------------------------- /tange-collection/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tange-collection" 3 | version = "0.1.2" 4 | authors = ["Andrew Stanton "] 5 | description = "Dataflow computation" 6 | license = "Apache-2.0/MIT" 7 | repository = "https://github.com/Refefer/tange/tree/master/tange-collection" 8 | readme = "README.md" 9 | keywords = ["parallel", "thread", "concurrency", "dataflow", "performance"] 10 | categories = ["concurrency", "algorithms"] 11 | 12 | [dependencies] 13 | tange = { path = "../tange-core" } 14 | bincode = "1.0" 15 | serde = "1.0" 16 | serde_derive = "1.0" 17 | uuid = { version = "0.6", features = ["v4"] } 18 | snap = "0.2.5" 19 | 20 | [lib] 21 | name = "tange_collection" 22 | path = "src/lib.rs" 23 | -------------------------------------------------------------------------------- /tange-collection/src/collection/mod.rs: 
-------------------------------------------------------------------------------- 1 | //! Contains the two main primitives: MemoryCollection and DiskCollection 2 | 3 | /// Defines MemoryCollection and assorted functions 4 | pub mod memory; 5 | 6 | /// Defines DiskCollection and assorted functions 7 | pub mod disk; 8 | 9 | use std::any::Any; 10 | 11 | use tange::deferred::{Deferred, batch_apply}; 12 | use interfaces::{Accumulator,ValueWriter,Stream}; 13 | 14 | fn emit< 15 | A, 16 | Col: Any + Send + Sync + Clone + Stream, 17 | B: Any + Send + Sync + Clone, 18 | F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ()), 19 | Acc: 'static + Accumulator 20 | >(defs: &[Deferred], acc: Acc, f: F) -> Vec>::VW as ValueWriter>::Out>> { 21 | 22 | batch_apply(&defs, move |_idx, vs| { 23 | let mut out = acc.writer(); 24 | for v in vs.stream().into_iter() { 25 | f(&v, &mut |r| out.add(r)); 26 | } 27 | out.finish() 28 | }) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /tange-core/src/task.rs: -------------------------------------------------------------------------------- 1 | use std::any::Any; 2 | use std::marker::PhantomData; 3 | 4 | pub type BASS = Box; 5 | pub enum DynArgs<'a> { 6 | One(&'a BASS), 7 | Two(&'a BASS, &'a BASS) 8 | } 9 | 10 | pub trait DynRun: Send + Sync { 11 | fn eval(&self, val: DynArgs) -> Option; 12 | } 13 | 14 | pub struct DynFn B>(F,PhantomData,PhantomData); 15 | 16 | impl B> DynFn { 17 | pub fn new(f: F) -> Self { 18 | DynFn(f, PhantomData, PhantomData) 19 | } 20 | } 21 | 22 | impl B> DynRun for DynFn { 23 | 24 | fn eval(&self, val: DynArgs) -> Option { 25 | match val { 26 | DynArgs::One(v) => v.downcast_ref::().map(|a| { 27 | let b = self.0(a); 28 | let bx: BASS = Box::new(b); 29 | bx 30 | }), 31 | _ => None 32 | } 33 | } 34 | } 35 | 36 | pub struct DynFn2 C>(F,PhantomData,PhantomData,PhantomData); 37 | 38 | impl C> DynFn2 { 39 | pub fn new(f: F) -> Self { 40 | DynFn2(f, PhantomData, PhantomData, PhantomData) 41 | } 42 | } 43 | 44 | impl C> DynRun for DynFn2 { 45 | 46 | fn eval(&self, val: DynArgs) -> Option { 47 | match val { 48 | DynArgs::Two(a, b) => { 49 | a.downcast_ref::().and_then(|a| { 50 | b.downcast_ref::().map(|b| { 51 | let c = self.0(a, b); 52 | let cx: BASS = Box::new(c); 53 | cx 54 | }) 55 | }) 56 | }, 57 | _ => None 58 | } 59 | } 60 | } 61 | 62 | 63 | -------------------------------------------------------------------------------- /tange-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! tange-core 2 | //! 3 | //! `tange-core` provides primitives for building and running task-based computations. 4 | //! 5 | //! What is it? 6 | //! --- 7 | //! 8 | //! `Tange` is a framework that makes it easy to write defered, data parallel computations that are executed concurrently across a local machine. It can scale up to millions of tasks per Graph and can be useful for a number of different applications: 9 | //! 10 | //! * Data processing. 11 | //! * All-Reduce operations. 12 | //! * Distributed machine learning algorithms. 13 | //! * General parallel computing. 14 | //! 15 | //! How to Use It? 16 | //! --- 17 | //! 18 | //! Tange defines a `Deferred` struct which represents a computation. `Deferred` objects are accessed with three simple functions: 19 | //! 20 | //! 1. `lift` - Lift takes a concrete value and lifts it into a Deferred object 21 | //! 2. `apply` - Apply applies a function to a Deferred, producing a new Deferred object. 22 | //! 3. 
`join` - Join combines two Deferred objects with a joiner function, producing a new Deferred. 23 | //! 24 | //! Example - Hello World! 25 | //! --- 26 | //! ```rust 27 | //! use tange::deferred::Deferred; 28 | //! use tange::scheduler::GreedyScheduler; 29 | //! 30 | //! let hello = Deferred::lift("Hello".to_owned(), None); 31 | //! let world = Deferred::lift("World".to_owned(), None); 32 | //! let world_exclaim = world.apply(|w| format!("{}!", w)); 33 | //! let hello_world = hello.join(&world_exclaim, |h, w| format!("{} {}", h, w)); 34 | //! assert_eq!(hello_world.run(&GreedyScheduler::new()), Some("Hello World!".into())); 35 | //! ``` 36 | //! 37 | //! 38 | //! 39 | 40 | #![warn(missing_docs)] 41 | 42 | #[macro_use] 43 | extern crate log; 44 | 45 | /// Contains Deferred primitive and function definitions 46 | pub mod deferred; 47 | 48 | /// Contains Scheduler trait definition and implementations 49 | pub mod scheduler; 50 | 51 | /// Internal Graph implementation 52 | mod graph; 53 | 54 | /// Internal task definitions 55 | mod task; 56 | 57 | -------------------------------------------------------------------------------- /tange-collection/src/utils.rs: -------------------------------------------------------------------------------- 1 | //! Utilities for creating collections 2 | use std::io::prelude::*; 3 | use std::io::{SeekFrom,BufReader,Error}; 4 | use std::fs::{File,metadata}; 5 | 6 | use tange::deferred::{Deferred, batch_apply}; 7 | 8 | use collection::memory::MemoryCollection; 9 | 10 | #[derive(Clone)] 11 | struct Chunk { path: String, start: u64, end: u64 } 12 | 13 | /// Reads a new-line delimited text file, creating a new partition every `chunk_size` 14 | pub fn read_text(path: &str, chunk_size: u64) -> Result,Error> { 15 | // Read the file size 16 | let file_size = metadata(path)?.len(); 17 | let mut dfs = Vec::new(); 18 | let mut cur_offset = 0u64; 19 | while cur_offset < file_size { 20 | let chunk = Chunk { 21 | path: path.into(), 22 | start: cur_offset, 23 | end: cur_offset + chunk_size 24 | }; 25 | dfs.push(Deferred::lift(chunk, 26 | Some(&format!("File: {}, start: {}", path, cur_offset)))); 27 | cur_offset += chunk_size; 28 | } 29 | 30 | Ok(MemoryCollection::from_defs(batch_apply(&dfs, read))) 31 | } 32 | 33 | fn read(_idx: usize, chunk: &Chunk) -> Vec { 34 | let f = File::open(&chunk.path) 35 | .expect("Error when opening file"); 36 | let mut reader = BufReader::new(f); 37 | reader.seek(SeekFrom::Start(chunk.start)) 38 | .expect("Error when reading file!"); 39 | 40 | let mut start = if chunk.start > 0 { 41 | // Skip first line, which is likely a partial line 42 | let mut s = Vec::new(); 43 | let size = reader.read_until(b'\n', &mut s) 44 | .expect("Error reading line from file!"); 45 | chunk.start + size as u64 46 | } else { 47 | 0 48 | }; 49 | 50 | let total = chunk.end; 51 | let mut lines = Vec::new(); 52 | loop { 53 | let mut s = String::new(); 54 | match reader.read_line(&mut s) { 55 | Ok(0) => break, 56 | Ok(size) => { 57 | start += size as u64; 58 | s.shrink_to_fit(); 59 | lines.push(s); 60 | }, 61 | _ => break 62 | }; 63 | if start > total { break; } 64 | } 65 | lines.shrink_to_fit(); 66 | lines 67 | } 68 | -------------------------------------------------------------------------------- /tange-core/src/graph.rs: -------------------------------------------------------------------------------- 1 | //! 2 | //! Graph definition libraries. These are typically not used directly, instead accessed 3 | //! via Deferred objects. 4 | //! 
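//!
//! How `Deferred` operations map onto graph nodes (an illustrative sketch of
//! the wiring in `deferred.rs`, not a public API):
//!
//! * `Deferred::lift(a, name)` -> `Graph::create_input(Lift(a), name)`
//! * `d.apply(f)` -> `Graph::create_task(FnArgs::Single(d.graph), DynFn::new(f), "Apply")`
//! * `d.join(&e, f)` -> `Graph::create_task(FnArgs::Join(d.graph, e.graph), DynFn2::new(f), "Join")`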
5 | use std::sync::Arc; 6 | use std::sync::atomic::{AtomicUsize, ATOMIC_USIZE_INIT, Ordering}; 7 | 8 | use task::{BASS,DynRun}; 9 | 10 | static GLOBAL_HANDLE_COUNT: AtomicUsize = ATOMIC_USIZE_INIT; 11 | 12 | /// Interface for providing inputs into the graph, such as reading a file 13 | pub trait Input: Send + Sync { 14 | fn read(&self) -> BASS; 15 | } 16 | 17 | /// Unique values representing a task in a Graph. Handles are globally unique and 18 | /// defined by order. 19 | #[derive(Debug,Clone,PartialEq,Eq,Hash)] 20 | pub struct Handle(String, usize); 21 | 22 | impl Handle { 23 | /// Creates a new handle. 24 | fn new(name: String) -> Self { 25 | Handle(name, GLOBAL_HANDLE_COUNT.fetch_add(1, Ordering::SeqCst)) 26 | } 27 | } 28 | 29 | /// ADT for handling either Tasks or reading data into the graph 30 | pub enum Task { 31 | 32 | /// Node which consumes down stream data to produce new data 33 | Function(Box), 34 | 35 | /// Node which generates data 36 | Input(Box) 37 | } 38 | 39 | /// Holds references to the number of arguments to pass into a Task 40 | #[derive(Clone)] 41 | pub enum FnArgs { 42 | 43 | /// Single argument 44 | Single(Arc), 45 | 46 | /// Used for joining two separate task outputs 47 | Join(Arc, Arc) 48 | } 49 | 50 | /// Graphs contain the computational pieces needed to represent the data flow 51 | /// between multiple different tasks, their combination, and eventual output. 52 | #[derive(Clone)] 53 | pub struct Graph { 54 | 55 | /// Pointer to the underlying Handle representing this computation 56 | pub handle: Arc, 57 | 58 | /// Task to run 59 | pub task: Arc, 60 | 61 | /// Arguments consumed by defined Task 62 | pub args: Option 63 | 64 | } 65 | 66 | impl Graph { 67 | 68 | /// Adds a new input into the Graph 69 | pub fn create_input(input: I, name: &str) -> Arc { 70 | let i_name = format!("Input", name); 71 | let handle = Arc::new(Handle::new(i_name)); 72 | let inp = Arc::new(Task::Input(Box::new(input))); 73 | Arc::new(Graph { 74 | handle: handle, 75 | task: inp, 76 | args: None 77 | }) 78 | } 79 | 80 | /// Adds a task to the dataset with the given inputs. No effort is made to ensure the 81 | /// handles exist within the graph. 82 | pub fn create_task(inputs: FnArgs, t: D, name: &str) -> Arc { 83 | // Get new handle 84 | let h_name = format!("Task", name); 85 | let handle = Arc::new(Handle::new(h_name)); 86 | let task = Arc::new(Task::Function(Box::new(t))); 87 | Arc::new(Graph { 88 | handle: handle, 89 | task: task, 90 | args: Some(inputs) 91 | }) 92 | } 93 | 94 | } 95 | 96 | -------------------------------------------------------------------------------- /tange-collection/README.md: -------------------------------------------------------------------------------- 1 | Tange-Collection 2 | --- 3 | Tange-Collection is a medium-level dataflow library for high speed data processing. 4 | 5 | What is it? 6 | --- 7 | Tange-Collection provides dataflow operatores for quickly executing data processing tasks. It uses task-based parallelization for construction of complex computation graphs, scalable to hundreds of millions of independent stages. 8 | 9 | It was created to solve the same sort of processing tasks as Dask and Spark, with a higher 10 | emphasis on batch processing rather than analytics. 
11 | 12 | API 13 | --- 14 | 15 | * [Overall](https://docs.rs/tange-collection/0.1.0/tange_collection/) 16 | * [MemoryCollection](https://docs.rs/tange-collection/0.1.0/tange_collection/collection/memory/struct.MemoryCollection.html) 17 | * [DiskCollection](https://docs.rs/tange-collection/0.1.0/tange_collection/collection/disk/struct.DiskCollection.html) 18 | 19 | Example - Word Count 20 | --- 21 | 22 | ```rust 23 | extern crate tange; 24 | extern crate tange_collection; 25 | 26 | use tange::scheduler::GreedyScheduler; 27 | use tange_collection::utils::read_text; 28 | 29 | use std::env::args; 30 | 31 | fn main() { 32 | let path = args().nth(1).unwrap(); 33 | let col = read_text(&path, 4_000_000) 34 | .expect("File missing"); 35 | 36 | let graph = col 37 | .map(|line| line.split_whitespace().fold(0usize, |a,_x| a + 1)) 38 | .fold_by(|_count| 1, 39 | || 0usize, 40 | |acc, c| { *acc += c }, 41 | |acc1, acc2| { *acc1 += acc2 }, 42 | 1); 43 | 44 | if let Some(counts) = graph.run(&GreedyScheduler::new()) { 45 | println!("Counts: {:?}", counts); 46 | } 47 | } 48 | ``` 49 | Example - IDF count 50 | --- 51 | ```rust 52 | extern crate tange; 53 | extern crate tange_collection; 54 | 55 | use tange::scheduler::GreedyScheduler; 56 | use tange_collection::utils::read_text; 57 | 58 | use std::env::args; 59 | use std::collections::HashSet; 60 | 61 | fn main() { 62 | env_logger::init(); 63 | 64 | let path = args().nth(1).unwrap(); 65 | let col = read_text(&path, 64_000_000) 66 | .expect("File missing"); 67 | 68 | let total_lines = col.count(); 69 | let word_freq = col 70 | .emit_to_disk("/tmp".into(), |line, emitter| { 71 | let unique: HashSet<_> = line.split_whitespace().map(|p| p.to_lowercase()).collect(); 72 | for word in unique { 73 | emitter(word); 74 | } 75 | }) 76 | .frequencies(16); 77 | 78 | // Cross product 79 | let idfs = total_lines.join_on( 80 | &word_freq.to_memory(), 81 | |_c| 1, 82 | |_wc| 1, 83 | |total, (word, count)| { 84 | (word.clone(), (1f64 + (*total as f64 / *count as f64)).ln()) 85 | }, 86 | 1 87 | ) 88 | .map(|(_k, x)| x.clone()) 89 | .sort_by(|(word, _count)| word.clone()); 90 | 91 | if let Some(word_idf) = idfs.run(&GreedyScheduler::new()) { 92 | for (w, idf) in word_idf { 93 | println!("{}: {}", w, idf); 94 | } 95 | } 96 | } 97 | ``` 98 | 99 | 100 | -------------------------------------------------------------------------------- /tange-collection/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Tange-Collection 2 | //! --- 3 | //! Tange-Collection is a medium-level dataflow library for high speed data processing. 4 | //! 5 | //! What is it? 6 | //! --- 7 | //! Tange-Collection provides dataflow operatores for quickly executing data processing tasks. It uses task-based parallelization for construction of complex computation graphs, scalable to hundreds of millions of independent stages. 8 | //! 9 | //! It was created to solve the same sort of processing tasks as Dask and Spark, with a higher 10 | //! emphasis on batch processing rather than analytics. 11 | //! 12 | //! Example - Word Count 13 | //! --- 14 | //! 15 | //! ```ignore 16 | //! extern crate tange; 17 | //! extern crate tange_collection; 18 | //! 19 | //! use tange::scheduler::GreedyScheduler; 20 | //! use tange_collection::utils::read_text; 21 | //! 22 | //! use std::env::args; 23 | //! 24 | //! fn main() { 25 | //! let path = args().nth(1).unwrap(); 26 | //! let col = read_text(&path, 4_000_000) 27 | //! .expect("File missing"); 28 | //! 29 | //! 
let graph = col 30 | //! .map(|line| line.split_whitespace().fold(0usize, |a,_x| a + 1)) 31 | //! .fold_by(|_count| 1, 32 | //! || 0usize, 33 | //! |acc, c| { *acc += c }, 34 | //! |acc1, acc2| { *acc1 += acc2 }, 35 | //! 1); 36 | //! 37 | //! if let Some(counts) = graph.run(&GreedyScheduler::new()) { 38 | //! println!("Counts: {:?}", counts); 39 | //! } 40 | //! } 41 | //! ``` 42 | //! Example - IDF count 43 | //! --- 44 | //! ```ignore 45 | //! extern crate tange; 46 | //! extern crate tange_collection; 47 | //! 48 | //! use tange::scheduler::GreedyScheduler; 49 | //! use tange_collection::utils::read_text; 50 | //! 51 | //! use std::env::args; 52 | //! use std::collections::HashSet; 53 | //! 54 | //! fn main() { 55 | //! 56 | //! let path = args().nth(1).unwrap(); 57 | //! let col = read_text(&path, 64_000_000) 58 | //! .expect("File missing"); 59 | //! 60 | //! let total_lines = col.count(); 61 | //! let word_freq = col 62 | //! .emit_to_disk("/tmp".into(), |line, emitter| { 63 | //! let unique: HashSet<_> = line.split_whitespace().map(|p| p.to_lowercase()).collect(); 64 | //! for word in unique { 65 | //! emitter(word); 66 | //! } 67 | //! }) 68 | //! .frequencies(16); 69 | //! 70 | //! // Cross product 71 | //! let idfs = total_lines.join_on( 72 | //! &word_freq.to_memory(), 73 | //! |_c| 1, 74 | //! |_wc| 1, 75 | //! |total, (word, count)| { 76 | //! (word.clone(), (1f64 + (*total as f64 / *count as f64)).ln()) 77 | //! }, 78 | //! 1 79 | //! ) 80 | //! .map(|(_k, x)| x.clone()) 81 | //! .sort_by(|(word, _count)| word.clone()); 82 | //! 83 | //! if let Some(word_idf) = idfs.run(&GreedyScheduler::new()) { 84 | //! for (w, idf) in word_idf { 85 | //! println!("{}: {}", w, idf); 86 | //! } 87 | //! } 88 | //! } 89 | //! ``` 90 | 91 | #![warn(missing_docs)] 92 | 93 | extern crate tange; 94 | 95 | /// Defines useful utilities, such as reading files 96 | pub mod utils; 97 | 98 | /// Describes basic interfaces for storing and consuming data 99 | pub mod interfaces; 100 | 101 | /// Defines the two major primitives: MemoryColleciton and DiskCollection 102 | pub mod collection; 103 | 104 | mod partitioned; 105 | 106 | -------------------------------------------------------------------------------- /tange-core/README.md: -------------------------------------------------------------------------------- 1 | Tange 2 | === 3 | 4 | A Task-based parallelization framework. 5 | 6 | What is it? 7 | --- 8 | 9 | `Tange` is a framework that makes it easy to write defered, data parallel computations that are executed concurrently across a local machine. It can scale up to millions of tasks per Graph and can be useful for a number of different applications: 10 | 11 | * Data processing. 12 | * All-Reduce operations. 13 | * Distributed machine learning algorithms. 14 | * General parallel computing. 15 | 16 | How to Use It? 17 | --- 18 | 19 | Tange defines a `Deferred` struct which represents a computation. `Deferred` objects are accessed with three simple functions: 20 | 21 | 1. `lift` - Lift takes a concrete value and lifts it into a Deferred object 22 | 2. `apply` - Apply applies a function to a Deferred, producing a new Deferred object. 23 | 3. `join` - Join combines two Deferred objects with a joiner function, producing a new Deferred. 
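As an illustrative sketch using only the three operations above, here is a small diamond-shaped graph where one `Deferred` feeds two computations that are joined back together:

```rust
use tange::deferred::Deferred;
use tange::scheduler::GreedyScheduler;

// One input, two applies, one join: a diamond-shaped task graph.
let xs = Deferred::lift(vec![1, 2, 3], None);
let sum = xs.apply(|v| v.iter().sum::<i32>());
let len = xs.apply(|v| v.len() as i32);
let avg = sum.join(&len, |s, n| *s as f64 / *n as f64);
assert_eq!(avg.run(&GreedyScheduler::new()), Some(2.0));
```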
24 | 25 | Example - Hello World 26 | --- 27 | 28 | ```rust 29 | use tange::deferred::Deferred; 30 | use tange::scheduler::GreedyScheduler; 31 | 32 | // Create two Deferred object 33 | let hello = Deferred::lift("Hello".to_owned(), None); 34 | let world = Deferred::lift("World".to_owned(), None); 35 | 36 | // Add an exclamation mark to "World" 37 | let world_exclaim = world.apply(|w| format!("{}!", w)); 38 | 39 | // Join the words! 40 | let hello_world = hello.join(&world_exclaim, |h, w| format!("{} {}", h, w)); 41 | 42 | assert_eq!(hello_world.run(&GreedyScheduler::new()), Some("Hello World!".into())); 43 | ``` 44 | 45 | Example 46 | --- 47 | 48 | Let's count all the words across a directory. 49 | 50 | ```rust 51 | extern crate tange; 52 | 53 | use tange::scheduler::GreedyScheduler; 54 | use tange::deferred::{Deferred,batch_apply,tree_reduce}; 55 | 56 | use std::io::{BufReader,BufRead}; 57 | use std::env::args; 58 | 59 | use std::io; 60 | use std::fs::{File, read_dir}; 61 | use std::path::Path; 62 | 63 | fn read_files(dir: &Path, buffer: &mut Vec>) -> io::Result<()> { 64 | if dir.is_dir() { 65 | for entry in read_dir(dir)? { 66 | let entry = entry?; 67 | let path = entry.path(); 68 | if path.is_dir() { 69 | read_files(&path, buffer)?; 70 | } else { 71 | let p = path.to_string_lossy().into_owned(); 72 | buffer.push(Deferred::lift(p, None)); 73 | } 74 | } 75 | } 76 | Ok(()) 77 | } 78 | 79 | fn main() { 80 | let mut defs = Vec::new(); 81 | for path in args().skip(1) { 82 | read_files(&Path::new(&path), &mut defs).expect("Error reading directory!"); 83 | } 84 | 85 | if defs.len() == 0 { 86 | panic!("No files to count!"); 87 | } 88 | 89 | // Read a file and count the number of words, split by white space 90 | let counts = batch_apply(&defs, |_idx, fname| { 91 | let mut count = 0usize; 92 | if let Ok(f) = File::open(&fname) { 93 | let mut br = BufReader::new(f); 94 | for maybe_line in br.lines() { 95 | if let Ok(line) = maybe_line { 96 | for p in line.split_whitespace() { 97 | if p.len() > 0 { 98 | count += 1; 99 | } 100 | } 101 | } else { 102 | eprintln!("Error reading {}, skipping rest of file...", fname); 103 | break 104 | } 105 | } 106 | }; 107 | count 108 | }); 109 | 110 | // Sum the counts 111 | let total = tree_reduce(&counts, |left, right| left + right) 112 | .expect("Can't reduce if there are no files in the directory!"); 113 | 114 | let count = total.run(&GreedyScheduler::new()).unwrap(); 115 | println!("Found {} words", count); 116 | } 117 | ``` 118 | -------------------------------------------------------------------------------- /tange-collection/src/partitioned.rs: -------------------------------------------------------------------------------- 1 | extern crate tange; 2 | 3 | use std::any::Any; 4 | use std::hash::{Hasher,Hash}; 5 | use std::collections::hash_map::DefaultHasher; 6 | use std::collections::HashMap; 7 | use std::sync::Arc; 8 | 9 | use tange::deferred::{Deferred, batch_apply, tree_reduce}; 10 | use interfaces::*; 11 | 12 | pub fn block_reduce< 13 | A, 14 | B, 15 | Col: Any + Sync + Send + Clone + Stream, 16 | K: Any + Sync + Send + Clone + Hash + Eq, 17 | C: Any + Sync + Send + Clone, 18 | D: 'static + Sync + Send + Clone + Fn() -> B, 19 | F: 'static + Sync + Send + Clone + Fn(&A) -> K, 20 | O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (), 21 | M: 'static + Sync + Send + Clone + Fn(HashMap) -> C, 22 | >( 23 | defs: &[Deferred], 24 | key: F, 25 | default: D, 26 | binop: O, 27 | map: M 28 | ) -> Vec> { 29 | batch_apply(defs, move |_idx, vs| { 30 | let mut reducer 
= HashMap::new(); 31 | for v in vs.stream().into_iter() { 32 | let k = key(&v); 33 | let e = reducer.entry(k).or_insert_with(&default); 34 | binop(e, &v); 35 | } 36 | map(reducer) 37 | }) 38 | } 39 | 40 | pub fn split_by_key< 41 | Col: Any + Sync + Send + Clone + Accumulator + Stream, 42 | A: Clone, 43 | F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize 44 | >( 45 | defs: &[Deferred], 46 | partitions: usize, 47 | hash_function: F 48 | ) -> Vec>> 49 | where Col::VW: ValueWriter { 50 | 51 | // Group into buckets 52 | let stage1 = batch_apply(&defs, move |_idx, vs| { 53 | let mut parts: Vec<_> = (0..partitions).map(|_| vs.writer()).collect(); 54 | for (idx, x) in vs.stream().into_iter().enumerate() { 55 | let p = hash_function(idx, &x) % partitions; 56 | parts[p].add(x.clone()); 57 | } 58 | parts.into_iter().map(|x| x.finish()).collect::>() 59 | }); 60 | 61 | // For each partition in each chunk, pull out at index and regroup. 62 | // Tree reduce to concatenate 63 | let mut splits = Vec::with_capacity(partitions); 64 | for idx in 0usize..partitions { 65 | let mut partition = Vec::with_capacity(stage1.len()); 66 | 67 | for s in stage1.iter() { 68 | partition.push(s.apply(move |parts| parts[idx].copy())); 69 | } 70 | splits.push(partition); 71 | } 72 | splits 73 | } 74 | 75 | pub fn partition< 76 | Col: Any + Sync + Send + Clone + Accumulator + Stream, 77 | A: Any + Send + Sync + Clone, 78 | F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize 79 | >( 80 | defs: &[Deferred], 81 | partitions: usize, 82 | key: F 83 | ) -> Vec> 84 | where Col::VW: ValueWriter { 85 | 86 | let groups = split_by_key(defs, partitions, key); 87 | 88 | let mut new_chunks = Vec::with_capacity(groups.len()); 89 | for group in groups { 90 | if let Some(d) = concat(&group) { 91 | new_chunks.push(d); 92 | } 93 | } 94 | new_chunks 95 | } 96 | 97 | pub fn fold_by< 98 | A: Clone, 99 | C1: Any + Sync + Send + Clone + Accumulator + Stream, 100 | B: Any + Sync + Send + Clone, 101 | K: Any + Sync + Send + Clone + Hash + Eq, 102 | D: 'static + Sync + Send + Clone + Fn() -> B, 103 | F: 'static + Sync + Send + Clone + Fn(&A) -> K, 104 | O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (), 105 | R: 'static + Sync + Send + Clone + Fn(&mut B, &B) -> (), 106 | Acc: 'static + Accumulator<(K, B)> + Stream<(K,B)> 107 | >( 108 | defs: &[Deferred], 109 | key: F, 110 | default: D, 111 | binop: O, 112 | reduce: R, 113 | acc: Acc, 114 | partitions: usize 115 | ) -> Vec>::VW as ValueWriter<(K, B)>>::Out>> 116 | where Acc::VW: ValueWriter<(K, B),Out=Acc> { 117 | 118 | let acc2 = Arc::new(acc); 119 | let am = acc2.clone(); 120 | let stage1 = block_reduce(defs, key, default, binop, move |x| { 121 | let mut out = am.writer(); 122 | out.extend(&mut x.into_iter()); 123 | out.finish() 124 | }); 125 | 126 | // Split into chunks 127 | let chunks = partition_by_key::(&stage1, partitions, |x| x.0.clone()); 128 | 129 | // partition reduce 130 | let am = acc2.clone(); 131 | let concat: Vec<_> = chunks.into_iter().map(move |chunk| { 132 | let am = am.clone(); 133 | batch_apply(&chunk, move |_idx, vs| { 134 | let mut hm = HashMap::new(); 135 | for (k, v) in vs.stream() { 136 | hm.insert(k, v); 137 | } 138 | let mut out = am.writer(); 139 | out.extend(&mut hm.into_iter()); 140 | out.finish() 141 | }) 142 | }).collect(); 143 | 144 | let mut reduction = Vec::new(); 145 | let rm = Arc::new(reduce); 146 | for group in concat { 147 | let amc = acc2.clone(); 148 | let ri = rm.clone(); 149 | 150 | let out = tree_reduce(&group, move |left, right| { 151 | 
let mut nl = HashMap::new(); 152 | for (k, v) in left.stream() { 153 | nl.insert(k, v); 154 | } 155 | for (k, v) in right.stream() { 156 | if !nl.contains_key(&k) { 157 | nl.insert(k, v); 158 | } else { 159 | nl.entry(k) 160 | .and_modify(|e| ri(e, &v)) 161 | .or_insert_with(|| v); 162 | } 163 | } 164 | let mut out = amc.writer(); 165 | 166 | for item in nl.into_iter() { 167 | out.add(item); 168 | } 169 | out.finish() 170 | }); 171 | reduction.push(out.unwrap()); 172 | } 173 | reduction 174 | } 175 | 176 | pub fn partition_by_key< 177 | C: Any + Sync + Send + Clone + Accumulator + Stream, 178 | A: Clone, 179 | K: Any + Sync + Send + Clone + Hash + Eq, 180 | F: 'static + Sync + Send + Clone + Fn(&A) -> K 181 | >( 182 | defs: &[Deferred], 183 | n_chunks: usize, 184 | key: F 185 | ) -> Vec>> 186 | where C::VW: ValueWriter { 187 | split_by_key(defs, n_chunks, move |_idx, v| { 188 | let k = key(v); 189 | let mut hasher = DefaultHasher::new(); 190 | k.hash(&mut hasher); 191 | hasher.finish() as usize 192 | }) 193 | } 194 | 195 | pub fn concat< 196 | Col: Any + Sync + Send + Accumulator + Stream, 197 | A: Clone, 198 | >( 199 | defs: &[Deferred] 200 | ) -> Option> 201 | where Col::VW: ValueWriter { 202 | 203 | tree_reduce(&defs, |x, y| { 204 | let mut out = x.writer(); 205 | for xi in x.stream() { 206 | out.add(xi); 207 | } 208 | for yi in y.stream() { 209 | out.add(yi); 210 | } 211 | out.finish() 212 | }) 213 | } 214 | 215 | pub fn join_on_key< 216 | A, 217 | B, 218 | Col1: Any + Sync + Send + Clone + Stream<(K, A)>, 219 | Col2: Any + Sync + Send + Clone + Stream<(K, B)>, 220 | K: Any + Send + Sync + Clone + Hash + Eq, 221 | C: Any + Sync + Send + Clone, 222 | J: 'static + Sync + Send + Clone + Fn(&A, &B) -> C, 223 | Acc: 'static + Accumulator<(K, C)> 224 | >( 225 | d1: &Deferred, 226 | d2: &Deferred, 227 | acc: Acc, 228 | joiner: J 229 | ) -> Deferred<<>::VW as ValueWriter<(K, C)>>::Out> { 230 | 231 | d1.join(d2, move |left, right| { 232 | // Slurp up left into a hashmap 233 | let mut hm = HashMap::new(); 234 | for (k, lv) in left.stream() { 235 | let e = hm.entry(k).or_insert_with(|| Vec::with_capacity(1)); 236 | e.push(lv); 237 | } 238 | let mut ret = acc.writer(); 239 | for (k, rv) in right.stream() { 240 | if let Some(lvs) = hm.get(&k) { 241 | for lv in lvs.iter() { 242 | ret.add((k.clone(), joiner(&lv, &rv))) 243 | } 244 | } 245 | } 246 | ret.finish() 247 | }) 248 | } 249 | 250 | -------------------------------------------------------------------------------- /tange-collection/src/interfaces.rs: -------------------------------------------------------------------------------- 1 | //! Defines the internal collections traits and objects.. 
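//!
//! As a quick sketch (not part of the original module docs), the in-memory
//! implementations round-trip values like this: an `Accumulator` builds a
//! `ValueWriter`, the writer collects items, and the finished store is read
//! back via `Stream`:
//!
//! ```
//! use tange_collection::interfaces::{Accumulator, Memory, Stream, ValueWriter};
//!
//! let mut w = Memory.writer();
//! w.add(1usize);
//! w.add(2);
//! w.add(3);
//! let store = w.finish();              // here, a Vec<usize>
//! assert_eq!(store.stream(), vec![1, 2, 3]);
//! ```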
2 | extern crate serde; 3 | extern crate bincode; 4 | extern crate uuid; 5 | extern crate snap; 6 | 7 | use std::any::Any; 8 | use std::fs::{File,remove_file,create_dir_all}; 9 | use std::io::{BufReader,BufWriter}; 10 | use std::marker::PhantomData; 11 | use std::sync::Arc; 12 | 13 | use self::snap::{Writer,Reader}; 14 | use self::serde::{Serialize,Deserialize}; 15 | use self::bincode::{serialize_into, deserialize_from,ErrorKind}; 16 | use self::uuid::Uuid; 17 | 18 | /// Accumulators are object which can create 'Writers', using effectively the Builder 19 | /// pattern 20 | pub trait Accumulator: Send + Sync + Clone { 21 | 22 | /// ValueWriter created 23 | type VW: ValueWriter; 24 | 25 | /// Create a new ValueWriter 26 | fn writer(&self) -> Self::VW; 27 | 28 | /// Convert a Vec into a ValueWriter output 29 | fn write_vec(&self, vs: Vec) -> <>::VW as ValueWriter>::Out { 30 | let mut out = self.writer(); 31 | for a in vs { 32 | out.add(a) 33 | } 34 | out.finish() 35 | } 36 | } 37 | 38 | /// ValueWriters write Values into some internal state. When finished, yields some 39 | /// construct that 'contains' the output. 40 | pub trait ValueWriter: Sized { 41 | /// Value Store 42 | type Out: Accumulator; 43 | 44 | /// Add an element to the ValueWriter 45 | fn add(&mut self, item: A) -> (); 46 | 47 | /// Writes an iterator to the ValueWriter 48 | fn extend>(&mut self, i: &mut I) -> () { 49 | for item in i { 50 | self.add(item); 51 | } 52 | } 53 | 54 | /// Close the ValueWriter, returning the store 55 | fn finish(self) -> Self::Out; 56 | } 57 | 58 | /// Defines an Accumulator that writes values in memory, using Vec as the store. 59 | #[derive(Clone)] 60 | pub struct Memory; 61 | 62 | impl Accumulator for Memory { 63 | type VW = Vec; 64 | 65 | fn writer(&self) -> Self::VW { 66 | Vec::new() 67 | } 68 | } 69 | 70 | impl Accumulator for Vec { 71 | type VW = Vec; 72 | 73 | fn writer(&self) -> Self::VW { 74 | Vec::new() 75 | } 76 | } 77 | 78 | impl ValueWriter for Vec { 79 | type Out = Vec; 80 | 81 | fn add(&mut self, item: A) -> () { 82 | self.push(item); 83 | } 84 | 85 | fn finish(mut self) -> Self::Out { 86 | self.shrink_to_fit(); 87 | self 88 | } 89 | } 90 | 91 | /// Uniform API for reading Values from a Store 92 | pub trait Stream { 93 | /// Iterator, yielding owned value 94 | type Iter: IntoIterator; 95 | 96 | /// Returns an iterator with owned values. 97 | fn stream(&self) -> Self::Iter; 98 | 99 | /// Returns a copy of the store. 
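/// A copy is taken when one store feeds several downstream tasks; for
/// example, `split_by_key` in `partitioned.rs` copies one bucket per
/// output partition.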
100 | fn copy(&self) -> Self; 101 | } 102 | 103 | impl Stream for Vec { 104 | type Iter = Vec; 105 | 106 | fn stream(&self) -> Self::Iter { 107 | self.clone() 108 | } 109 | 110 | fn copy(&self) -> Self { 111 | self.clone() 112 | } 113 | } 114 | 115 | /// Writes values to a directory 116 | #[derive(Clone)] 117 | pub struct Disk(pub Arc); 118 | 119 | impl Disk { 120 | /// Creates a new Disk object from a path 121 | pub fn from_str(s: &str) -> Self { 122 | Disk(Arc::new(s.to_owned())) 123 | } 124 | } 125 | 126 | /// An open buffer for writing records to disk 127 | pub struct DiskBuffer { 128 | root_path: Arc, 129 | name: String, 130 | pd: PhantomData, 131 | out: Writer> 132 | } 133 | 134 | impl DiskBuffer { 135 | fn new(path: Arc) -> Self { 136 | let name = format!("{}/tange-{}", &path, Uuid::new_v4()); 137 | { 138 | let p: &str = &path; 139 | create_dir_all(p).expect("Unable to create directory!"); 140 | } 141 | let fd = File::create(&name).expect("Can't create file!"); 142 | let bw = BufWriter::new(fd); 143 | let encoder = Writer::new(bw); 144 | DiskBuffer { 145 | root_path: path, 146 | name: name, 147 | pd: PhantomData, 148 | out: encoder 149 | } 150 | } 151 | } 152 | 153 | /// Contains a root path for storing temporary files 154 | #[derive(Clone)] 155 | pub struct FileStore { 156 | root_path: Arc, 157 | name: Option, 158 | pd: PhantomData 159 | } 160 | 161 | impl FileStore { 162 | 163 | /// Create an empty FileStore at the given path 164 | pub fn empty(path: Arc) -> Self { 165 | FileStore { 166 | root_path: path, 167 | name: None, 168 | pd: PhantomData 169 | } 170 | } 171 | } 172 | 173 | // Delete the temporary file on disk when dropped 174 | impl Drop for FileStore { 175 | fn drop(&mut self) { 176 | if let Some(ref name) = self.name { 177 | if let Err(e) = remove_file(name) { 178 | eprintln!("Error Deleting {}: {:?}J", name, e); 179 | } 180 | } 181 | } 182 | } 183 | 184 | impl Accumulator for Disk { 185 | type VW = DiskBuffer; 186 | 187 | fn writer(&self) -> Self::VW { 188 | DiskBuffer::new(self.0.clone()) 189 | } 190 | } 191 | 192 | impl Accumulator for Arc> { 193 | type VW = DiskBuffer; 194 | 195 | fn writer(&self) -> Self::VW { 196 | DiskBuffer::new(self.root_path.clone()) 197 | } 198 | } 199 | 200 | impl ValueWriter for DiskBuffer { 201 | type Out = Arc>; 202 | 203 | fn add(&mut self, item: A) -> () { 204 | serialize_into(&mut self.out, &item).expect("Couldn't write record!"); 205 | } 206 | 207 | fn finish(self) -> Self::Out { 208 | Arc::new(FileStore { 209 | root_path: self.root_path.clone(), 210 | name: Some(self.name), 211 | pd: PhantomData 212 | }) 213 | } 214 | } 215 | 216 | 217 | impl Deserialize<'de>> Stream for Arc> { 218 | type Iter = RecordFile; 219 | 220 | fn stream(&self) -> Self::Iter { 221 | RecordFile(self.name.clone(), PhantomData) 222 | } 223 | 224 | fn copy(&self) -> Self { self.clone() } 225 | } 226 | 227 | /// Streams records from an optional File. 
If the file is none, returns the Empty iterator 228 | pub struct RecordFile(Option, PhantomData); 229 | 230 | impl Deserialize<'de>> IntoIterator for RecordFile { 231 | type Item = A; 232 | type IntoIter = RecordStreamer; 233 | 234 | fn into_iter(self) -> Self::IntoIter { 235 | if let Some(ref n) = self.0 { 236 | let fd = File::open(n).expect("File didn't exist on open!"); 237 | let brfd = BufReader::new(fd); 238 | let decoder = Reader::new(brfd); 239 | RecordStreamer(Some(decoder), PhantomData) 240 | } else { 241 | RecordStreamer(None, PhantomData) 242 | } 243 | } 244 | } 245 | 246 | /// Stream Records from an open file 247 | pub struct RecordStreamer(Option>>, PhantomData); 248 | 249 | impl Deserialize<'de>> Iterator for RecordStreamer { 250 | type Item = A; 251 | 252 | fn next(&mut self) -> Option { 253 | if let Some(ref mut bw) = self.0 { 254 | //deserialize_from(bw).expect("Failure on deserialization!") 255 | match deserialize_from(bw) { 256 | Ok(record) => Some(record), 257 | Err(e) => { 258 | let ek: &ErrorKind = &e; 259 | match ek { 260 | &ErrorKind::DeserializeAnyNotSupported => { 261 | eprintln!("Bincode doesn't work with certain types!"); 262 | panic!(); 263 | }, 264 | _ => None 265 | } 266 | } 267 | } 268 | } else { 269 | None 270 | } 271 | } 272 | } 273 | -------------------------------------------------------------------------------- /tange-core/src/deferred.rs: -------------------------------------------------------------------------------- 1 | //! Defines the Deferred primitive 2 | //! 3 | use std::marker::PhantomData; 4 | use std::sync::Arc; 5 | use std::any::Any; 6 | 7 | use task::{DynFn,DynFn2,BASS}; 8 | use graph::*; 9 | use scheduler::Scheduler; 10 | 11 | struct Lift(A); 12 | 13 | impl Input for Lift { 14 | fn read(&self) -> BASS { 15 | Box::new(self.0.clone()) 16 | } 17 | } 18 | 19 | /// A `Deferred` is the core struct defining how computations are composed 20 | /// The type parameter indicates the type of data contained within the `Deferred` 21 | #[derive(Clone)] 22 | pub struct Deferred { 23 | 24 | /// Dependency graph required to evaluate to the given A 25 | graph: Arc, 26 | 27 | /// Phantom type for Any 28 | items: PhantomData 29 | } 30 | 31 | impl Deferred { 32 | 33 | /// Applies a function to a Deferred, returning a new Deferred. This is effectively 34 | /// a Functor. 35 | /// 36 | /// ``` 37 | /// use tange::deferred::Deferred; 38 | /// use tange::scheduler::GreedyScheduler; 39 | /// 40 | /// let def = Deferred::lift(vec![1u8, 2, 3, 4], "Vector".into()); 41 | /// let size = def.apply(|v| v.len()); 42 | /// let results = size.run(&GreedyScheduler::new()); 43 | /// assert_eq!(results, Some(4usize)); 44 | /// ``` 45 | /// 46 | pub fn apply B>(&self, f: F) -> Deferred { 47 | let ng = Graph::create_task( 48 | FnArgs::Single(self.graph.clone()), DynFn::new(f), "Apply"); 49 | Deferred { 50 | graph: ng, 51 | items: PhantomData 52 | } 53 | 54 | } 55 | 56 | /// Joins two Deferred objects with a function, creating a new Deferred object. 
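/// Both inputs are fully evaluated before the joiner function is applied.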
57 | /// 58 | /// ``` 59 | /// use tange::deferred::Deferred; 60 | /// use tange::scheduler::GreedyScheduler; 61 | /// 62 | /// let left = Deferred::lift(vec![1f32, 2., 3., 4.], "Vector".into()); 63 | /// let right = Deferred::lift(10f32, "Num".into()); 64 | /// let multiplied: Deferred> = left.join(&right, 65 | /// |l,r| l.iter().map(|x| x * r).collect()); 66 | /// let results = multiplied.run(&GreedyScheduler::new()); 67 | /// assert_eq!(results, Some(vec![10., 20., 30., 40.])); 68 | /// ``` 69 | /// 70 | pub fn join C>(&self, other: &Deferred, f: F) -> Deferred { 71 | let ng = Graph::create_task( 72 | FnArgs::Join(self.graph.clone(), other.graph.clone()), 73 | DynFn2::new(f), "Join"); 74 | 75 | Deferred { 76 | graph: ng, 77 | items: PhantomData 78 | } 79 | 80 | } 81 | } 82 | 83 | impl Deferred { 84 | /// Lifts a value into a Deferred object. 85 | /// ``` 86 | /// use tange::deferred::Deferred; 87 | /// use tange::scheduler::GreedyScheduler; 88 | /// 89 | /// let id = Deferred::lift("Some String".to_owned(), "String".into()); 90 | /// assert_eq!(id.run(&GreedyScheduler::new()), Some("Some String".into())); 91 | /// 92 | /// ``` 93 | pub fn lift(a: A, name: Option<&str>) -> Self { 94 | let graph = Graph::create_input(Lift(a), name.unwrap_or("Input")); 95 | Deferred { 96 | graph: graph, 97 | items: PhantomData 98 | } 99 | } 100 | 101 | /// Evaluates the Deferred object and dependency graph, returning the result 102 | /// of the computation. 103 | /// 104 | /// ``` 105 | /// use tange::deferred::Deferred; 106 | /// use tange::scheduler::GreedyScheduler; 107 | /// 108 | /// let a = Deferred::lift(1usize, "a".into()); 109 | /// let b = Deferred::lift(2usize, "b".into()); 110 | /// let c = a.join(&b, |x, y| x + y); 111 | /// assert_eq!(c.run(&GreedyScheduler::new()), Some(3usize)); 112 | /// 113 | /// ``` 114 | 115 | pub fn run(&self, s: &S) -> Option { 116 | s.compute(self.graph.clone()).and_then(|v| { 117 | Arc::try_unwrap(v).ok().and_then(|ab| { 118 | ab.downcast_ref::().map(|x| x.clone()) 119 | }) 120 | }) 121 | } 122 | } 123 | 124 | /// `batch_apply` is a convenience method that takes a set of homogenous `Deferred`s 125 | /// and applies a function to each, returning a new set of `Deferred`s. Unlike 126 | /// `Deferred::apply`, `batch_apply` passes in an order index. 127 | /// ``` 128 | /// use tange::deferred::{Deferred, batch_apply}; 129 | /// use tange::scheduler::GreedyScheduler; 130 | /// 131 | /// let vec: Vec<_> = (0usize..10) 132 | /// .map(|v| Deferred::lift(v, None)).collect(); 133 | /// let out = batch_apply(&vec, |idx, v| idx + v); 134 | /// assert_eq!(out[1].run(&GreedyScheduler::new()), Some(2)); 135 | /// assert_eq!(out[5].run(&GreedyScheduler::new()), Some(10)); 136 | /// ``` 137 | /// 138 | pub fn batch_apply< 139 | A: Any + Send + Sync + Clone, 140 | B: Any + Send + Sync, 141 | F: 'static + Sync + Send + Clone + Fn(usize, &A) -> B 142 | >(defs: &[Deferred], f: F) 143 | -> Vec> { 144 | let mut nps = Vec::with_capacity(defs.len()); 145 | let fa = Arc::new(f); 146 | for (idx, p) in defs.iter().enumerate() { 147 | let mf = fa.clone(); 148 | let np = p.apply(move |vs| { mf(idx, vs) }); 149 | nps.push(np); 150 | } 151 | nps 152 | } 153 | 154 | /// Often times, we want to combine a set of Deferred objects into a single Deferred. 155 | /// `tree_reduce` combines pairs of Deferred recursively using `f`, building a dependency 156 | /// tree which attempts to maximize parallelism. 
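/// Each pass joins adjacent pairs, so the resulting tree has logarithmic
/// depth in the number of input `Deferred`s.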
157 | /// ``` 158 | /// use tange::deferred::{Deferred, tree_reduce}; 159 | /// use tange::scheduler::LeveledScheduler; 160 | /// 161 | /// let vec: Vec<_> = (0usize..10) 162 | /// .map(|v| Deferred::lift(v, None)).collect(); 163 | /// let out = tree_reduce(&vec, |left, right| left + right).unwrap(); 164 | /// let expected = (0usize..10).fold(0, |acc, x| acc + x); 165 | /// assert_eq!(out.run(&LeveledScheduler), Some(expected)); 166 | /// ``` 167 | pub fn tree_reduce A 169 | >( 170 | defs: &[Deferred], 171 | f: F 172 | ) -> Option> { 173 | tree_reduce_until(defs, 1, f).map(|mut defs| { 174 | defs.remove(0) 175 | }) 176 | } 177 | 178 | /// `tree_reduce_until` is similar to `tree_reduce` except that it will stop reducing 179 | /// when the number of `Deferred`s left is less than or equal to `parts`. 180 | /// 181 | /// ``` 182 | /// use tange::deferred::{Deferred, tree_reduce_until}; 183 | /// use tange::scheduler::GreedyScheduler; 184 | /// 185 | /// let vec: Vec<_> = (0usize..8) 186 | /// .map(|v| Deferred::lift(v, None)).collect(); 187 | /// let out = tree_reduce_until(&vec, 2, |left, right| left + right).unwrap(); 188 | /// assert_eq!(out.len(), 2); 189 | /// assert_eq!(out[0].run(&GreedyScheduler::new()), Some(0+1+2+3)); 190 | /// ``` 191 | pub fn tree_reduce_until A 193 | >( 194 | defs: &[Deferred], 195 | parts: usize, 196 | f: F 197 | ) -> Option>> { 198 | if defs.len() == 0 { 199 | None 200 | } else if defs.len() <= parts { 201 | Some(defs.clone().to_vec()) 202 | } else { 203 | // First pass 204 | let mut pass = Vec::new(); 205 | for i in (0..defs.len() - 1).step_by(2) { 206 | pass.push(defs[i].join(&defs[i+1], f.clone())); 207 | } 208 | if defs.len() % 2 == 1 { 209 | pass.push(defs[defs.len() - 1].clone()); 210 | } 211 | tree_reduce_until(&pass, parts, f) 212 | } 213 | } 214 | 215 | #[cfg(test)] 216 | mod def_test { 217 | use super::*; 218 | use scheduler::{LeveledScheduler,GreedyScheduler}; 219 | 220 | #[test] 221 | fn test_tree_reduce() { 222 | let v: Vec<_> = (0..999usize).into_iter() 223 | .map(|x| Deferred::lift(x, None)) 224 | .map(|d| d.apply(|x| x + 1)) 225 | .collect(); 226 | 227 | let res = (1..1000usize).sum(); 228 | 229 | let agg = tree_reduce(&v, |x, y| x + y).unwrap(); 230 | let results = agg.run(&LeveledScheduler); 231 | assert_eq!(results, Some(res)); 232 | } 233 | 234 | #[test] 235 | fn test_tree_reduce_greedy() { 236 | let v: Vec<_> = (0..2usize).into_iter() 237 | .map(|x| Deferred::lift(x, None)) 238 | .collect(); 239 | 240 | let res = (0..2usize).sum(); 241 | 242 | let agg = tree_reduce(&v, |x, y| x + y).unwrap(); 243 | let results = agg.run(&GreedyScheduler::new()); 244 | assert_eq!(results, Some(res)); 245 | } 246 | 247 | } 248 | -------------------------------------------------------------------------------- /tange-core/src/scheduler.rs: -------------------------------------------------------------------------------- 1 | //! Contains all the runtimes scheduling Graphs for execution. 
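//!
//! Both built-in schedulers compute the same value for a given graph; they
//! differ only in execution order. An illustrative sketch (not from the
//! original docs):
//!
//! ```
//! use tange::deferred::Deferred;
//! use tange::scheduler::{GreedyScheduler, LeveledScheduler};
//!
//! let d = Deferred::lift(21usize, None).apply(|x| x * 2);
//! assert_eq!(d.run(&GreedyScheduler::new()), Some(42));
//! assert_eq!(d.run(&LeveledScheduler), Some(42));
//! ```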
2 | extern crate num_cpus; 3 | extern crate log; 4 | extern crate priority_queue; 5 | extern crate jobpool; 6 | 7 | use std::sync::{Mutex,Arc,mpsc}; 8 | use std::collections::{HashMap, HashSet}; 9 | use std::hash::Hash; 10 | 11 | use log::Level::{Trace,Debug as LDebug}; 12 | use self::priority_queue::PriorityQueue; 13 | use self::jobpool::JobPool; 14 | 15 | use task::{BASS,DynArgs}; 16 | use graph::{Graph,Task,Handle,FnArgs}; 17 | 18 | type DepGraph = HashMap, HashSet>>; 19 | type ChainGraph = HashMap>, HashSet>>; 20 | 21 | // Keeps track of data that are needed by downstream computations 22 | #[derive(Debug)] 23 | struct DataStore { 24 | data: HashMap, 25 | counts: HashMap 26 | } 27 | 28 | impl DataStore { 29 | // Creates a new DataStore 30 | // `Counts` are the number of times a piece of data will be consumed. 31 | fn new( 32 | data: HashMap, 33 | counts: HashMap 34 | ) -> Self { 35 | DataStore {data: data, counts: counts} 36 | } 37 | 38 | // Gets a piece of data from the DataStore. If the key doesn't exist, 39 | // returns None 40 | fn get(&mut self, handle: &K) -> Option { 41 | let count = self.counts.get_mut(handle).map(|c| { 42 | *c -= 1; 43 | *c 44 | }).unwrap_or(0); 45 | 46 | if count == 0 { 47 | self.data.remove(handle) 48 | } else { 49 | self.data.get(handle).map(|x| x.clone()) 50 | } 51 | } 52 | 53 | // Adds a key/value to the datastore. 54 | fn insert(&mut self, handle: K, data: V) { 55 | self.data.insert(handle, data); 56 | } 57 | } 58 | 59 | /// Defines the Scheduler object. Schedulers take in Graphs and return the result 60 | /// of their computation. 61 | pub trait Scheduler { 62 | /// Compute the given Graph, returning the value. 63 | fn compute(&self, graph: Arc) -> Option>; 64 | } 65 | 66 | enum Limbo { 67 | One(Arc), 68 | Two(Arc, Arc) 69 | } 70 | 71 | struct DAG { 72 | 73 | /// Output handle to task 74 | pub tasks: HashMap, Arc>, 75 | 76 | /// Dependencies between tasks 77 | pub dependencies: HashMap, Option> 78 | 79 | } 80 | 81 | impl DAG { 82 | /// Converts a Graph into a Directed Acyclic Graph. 
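/// Walks the graph with an explicit stack, deduplicating shared nodes by
/// `Handle` so that diamond-shaped dependencies are recorded only once.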
83 | fn new(g: Arc) -> Self { 84 | let mut tasks = HashMap::new(); 85 | let mut dependencies = HashMap::new(); 86 | 87 | let mut stack = vec![g]; 88 | 89 | let mut hs = HashSet::new(); 90 | 91 | while !stack.is_empty() { 92 | trace!("Stack size: {}", stack.len()); 93 | let ag = stack.pop().unwrap(); 94 | if !hs.contains(&ag.handle) { 95 | hs.insert(ag.handle.clone()); 96 | tasks.insert(ag.handle.clone(), ag.task.clone()); 97 | dependencies.insert(ag.handle.clone(), ag.args.clone()); 98 | if let Some(ref fns) = ag.args { 99 | match fns { 100 | FnArgs::Single(g) => stack.push(g.clone()), 101 | FnArgs::Join(g1, g2) => { 102 | stack.push(g1.clone()); 103 | stack.push(g2.clone()); 104 | } 105 | }; 106 | } 107 | } 108 | } 109 | DAG { 110 | tasks: tasks, 111 | dependencies: dependencies 112 | } 113 | } 114 | } 115 | 116 | /// Reads out dependencies into a Limbo object, which is really just a simple Enum 117 | fn get_fnargs(ds: &mut DataStore,Arc>, fa: &FnArgs) -> Option { 118 | match fa { 119 | &FnArgs::Single(ref g) => { 120 | ds.get(&g.handle).map(|args| { 121 | Limbo::One(args) 122 | }) 123 | }, 124 | &FnArgs::Join(ref lg, ref rg) => { 125 | ds.get(&lg.handle).and_then(|left| { 126 | ds.get(&rg.handle).map(|right| { 127 | Limbo::Two(left, right) 128 | }) 129 | }) 130 | } 131 | } 132 | } 133 | 134 | // Converts a flattened graph into a dependency list 135 | fn build_dep_graph(graph: &DAG) -> (DepGraph, DepGraph) { 136 | // Build out dependencies 137 | let mut inbound: DepGraph = HashMap::new(); 138 | let mut outbound: DepGraph = HashMap::new(); 139 | for (output, ref inputs) in graph.dependencies.iter() { 140 | // Only track unique handles 141 | let mut hs = HashSet::new(); 142 | if let Some(inp) = inputs { 143 | let fna: &FnArgs = &inp; 144 | match fna { 145 | &FnArgs::Single(ref h) => hs.insert(h.handle.clone()), 146 | &FnArgs::Join(ref h1, ref h2) => { 147 | hs.insert(h1.handle.clone()); 148 | hs.insert(h2.handle.clone()) 149 | }, 150 | }; 151 | } 152 | // Add outbound 153 | for h in hs.iter() { 154 | let e = outbound.entry(h.clone()).or_insert_with(|| HashSet::with_capacity(1)); 155 | e.insert(output.clone()); 156 | } 157 | inbound.insert(output.clone(), hs); 158 | } 159 | inbound.shrink_to_fit(); 160 | outbound.shrink_to_fit(); 161 | (inbound, outbound) 162 | } 163 | 164 | // Constructs a set of nodes that have no dependencies between them 165 | fn generate_levels(collapsed: ChainGraph) -> Vec>>> { 166 | // Create outbound 167 | let mut outbound = HashMap::new(); 168 | for (nodes, deps) in collapsed.iter() { 169 | for d in deps.iter() { 170 | let e = outbound.entry(d).or_insert_with(|| HashSet::with_capacity(1)); 171 | e.insert(nodes); 172 | } 173 | } 174 | let mut inbound = collapsed.clone(); 175 | // Compute task levels 176 | let mut levels = Vec::new(); 177 | let mut cur_level: Vec>> = inbound.iter() 178 | .filter(|(_, v)| v.is_empty()) 179 | .map(|(k, _)| k.clone()) 180 | .collect(); 181 | 182 | loop { 183 | 184 | if cur_level.is_empty() { 185 | break; 186 | } 187 | 188 | // Remove nodes from graph 189 | for handles in cur_level.iter() { 190 | inbound.remove(handles); 191 | } 192 | 193 | // Update dependencies 194 | let mut next_level = Vec::new(); 195 | for hs in cur_level.iter() { 196 | // Get outbound nodes 197 | let last = &hs[hs.len() - 1]; 198 | if let Some(node_set) = outbound.get(last) { 199 | for node in node_set.iter() { 200 | if let Some(set) = inbound.get_mut(*node) { 201 | set.remove(last); 202 | if set.is_empty() { 203 | next_level.push((*node).clone()); 204 | } 205 | } 
206 | } 207 | } 208 | } 209 | 210 | levels.push(cur_level); 211 | cur_level = next_level; 212 | } 213 | if log_enabled!(LDebug) { 214 | let mut max_con = 0usize; 215 | for (i, l) in levels.iter().enumerate() { 216 | max_con = max_con.max(l.len()); 217 | debug!("Level: {}, Tasks: {}", i, l.len()); 218 | } 219 | debug!("Max Concurrency: {}", max_con); 220 | } 221 | levels 222 | } 223 | 224 | fn run_task( 225 | graph: &DAG, 226 | chain: &[Arc], 227 | dsam: Arc, Arc>>> 228 | ) { 229 | // Pull out arguments from the datasource 230 | trace!("Reading dependencies for chain {:?}", chain[0]); 231 | let ot = graph.dependencies.get(&chain[0]); 232 | let mut largs = { 233 | let ds: &mut DataStore<_,_> = &mut *dsam.lock().unwrap(); 234 | // Get inputs 235 | match ot { 236 | Some(Some(ar)) => get_fnargs(ds, &ar), 237 | _ => None 238 | } 239 | }; 240 | 241 | for handle in chain { 242 | trace!("Processing handle: {:?}", handle); 243 | let out = match graph.tasks.get(handle) { 244 | Some(ref task) => { 245 | let task_ref: &Task = &task; 246 | match task_ref { 247 | Task::Input(ref input) => Some(input.read()), 248 | Task::Function(ref t) => { 249 | match largs { 250 | Some(Limbo::One(ref a)) => { 251 | t.eval(DynArgs::One(a)) 252 | }, 253 | Some(Limbo::Two(ref a, ref b)) => { 254 | t.eval(DynArgs::Two(a, b)) 255 | }, 256 | None => None 257 | } 258 | } 259 | } 260 | }, 261 | None => None 262 | }; 263 | if let Some(bass) = out { 264 | largs = Some(Limbo::One(Arc::new(bass))); 265 | } 266 | } 267 | 268 | if let Some(Limbo::One(d)) = largs { 269 | let mut ds = dsam.lock().unwrap(); 270 | ds.insert(chain[chain.len() - 1].clone(), d); 271 | } 272 | } 273 | 274 | // Finds chains of tasks that can be collapsed into a single task. While this isn't 275 | // strictly needed, both the LeveledScheduler and GreedyScheduler benefit from it in 276 | // different ways: 277 | use std::fmt::Debug; 278 | fn collapse_graph( 279 | mut nodes: HashMap> 280 | ) -> HashMap, HashSet> { 281 | 282 | // Generate outbound edges 283 | let mut outbound = HashMap::new(); 284 | let mut roots = Vec::new(); 285 | let mut inbound: HashMap> = HashMap::new(); 286 | for (node, deps) in nodes.iter() { 287 | if !outbound.contains_key(node) { 288 | outbound.insert(node.clone(), Vec::new()); 289 | } 290 | 291 | for d in deps.iter() { 292 | let e = outbound.entry(d.clone()).or_insert(Vec::new()); 293 | e.push(node.clone()); 294 | } 295 | 296 | if deps.is_empty() { 297 | roots.push(vec![node.clone()]); 298 | } 299 | 300 | inbound.insert(node.clone(), deps.iter().cloned().collect()); 301 | } 302 | 303 | let mut new_nodes = HashMap::new(); 304 | let mut seen = HashSet::new(); 305 | while !roots.is_empty() { 306 | if let Some(mut chain) = roots.pop() { 307 | let link = { 308 | let tail = &chain[chain.len() - 1]; 309 | 310 | // If outbound == 1 and that refernce only has one inbound 311 | if outbound[tail].len() == 1 && inbound[&outbound[tail][0]].len() == 1 { 312 | // We found a link in a chain 313 | // Add the node to the current list 314 | Some(outbound[tail][0].clone()) 315 | } else { 316 | None 317 | // Our chain is finished, emit it 318 | } 319 | }; 320 | 321 | if let Some(node) = link { 322 | chain.push(node); 323 | roots.push(chain); 324 | } else { 325 | // If current chain is ended, add the outbound nodes 326 | { 327 | let tail = &chain[chain.len() - 1]; 328 | for node in outbound[tail].iter() { 329 | if !seen.contains(node) { 330 | roots.push(vec![node.clone()]); 331 | seen.insert(node.clone()); 332 | } 333 | } 334 | } 335 | // Emit current chain 
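// (keyed by the full chain; its dependencies are those of the chain's head)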
336 | let deps = nodes.remove(&chain[0]).unwrap(); 337 | new_nodes.insert(chain, deps); 338 | } 339 | } 340 | } 341 | 342 | new_nodes 343 | } 344 | 345 | /// LeveledScheduler computes sets of mutually exclusive tasks that can be run 346 | /// concurrently. Unlike GreedyScheduler, which will immediately consume the next 347 | /// available task regardless of level, LeveledScheduler will wait for an entire level 348 | /// to finish computation before moving to the next one. 349 | /// 350 | /// This has some small benefits when it to reproducibility: it natually is more 351 | /// deterministic than the GreedyScheduler, though potentially slower in some cases 352 | /// a set of tasks on a level are slower. 353 | pub struct LeveledScheduler; 354 | 355 | impl Scheduler for LeveledScheduler{ 356 | 357 | fn compute( 358 | &self, 359 | graph: Arc 360 | ) -> Option> { 361 | 362 | let out_handle = graph.handle.clone(); 363 | let dag = Arc::new(DAG::new(graph)); 364 | debug!("Number of Tasks Specified: {}", dag.tasks.len()); 365 | 366 | let (inbound, _outbound) = build_dep_graph(&dag); 367 | 368 | let collapsed = collapse_graph(inbound); 369 | 370 | debug!("Number of Tasks to Run: {}", collapsed.len()); 371 | 372 | // Build the counts 373 | let mut counts: HashMap,_> = HashMap::new(); 374 | for (_k, vs) in collapsed.iter() { 375 | for v in vs.iter() { 376 | let e = counts.entry(v.clone()).or_insert(0usize); 377 | *e += 1; 378 | } 379 | } 380 | 381 | // Build out the levels 382 | let levels = generate_levels(collapsed); 383 | 384 | // Load up the inputs 385 | let data: HashMap,Arc> = HashMap::new(); 386 | 387 | // Add all handles 388 | let raw_ds: DataStore, Arc> = DataStore::new(data, counts); 389 | let dsam = Arc::new(Mutex::new(raw_ds)); 390 | 391 | for (i, level) in levels.into_iter().enumerate() { 392 | let mut pool = JobPool::new(num_cpus::get()); 393 | debug!("Running level: {}", i); 394 | for chain in level { 395 | let g = dag.clone(); 396 | let c = chain.clone(); 397 | let d = dsam.clone(); 398 | pool.queue(move || { run_task(&g, &c, d); }); 399 | } 400 | 401 | // block until all are done 402 | pool.shutdown(); 403 | } 404 | 405 | debug!("Finished"); 406 | let ret = { 407 | dsam.lock().unwrap().get(&out_handle) 408 | }; 409 | ret 410 | } 411 | } 412 | 413 | /// GreedyScheduler is the recommend scheduler for Tange-Core. After computing the DAG 414 | /// from the Graph, it uses a priority heap to determine which task to execute next, 415 | /// biasing toward reduction. That is, joins are preferred over an apply since it reduces 416 | /// the number of thunks by one. Inputs are preferred last. 417 | /// 418 | pub struct GreedyScheduler(usize); 419 | 420 | impl GreedyScheduler { 421 | 422 | /// Creates a new GreedyScheduler with the default number of threads. 423 | pub fn new() -> Self { 424 | GreedyScheduler(num_cpus::get()) 425 | } 426 | 427 | /// Sets the number of threads to use. By default, uses one thread per core. 
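///
/// ```
/// use tange::scheduler::GreedyScheduler;
///
/// let mut s = GreedyScheduler::new();
/// s.set_threads(2);
/// ```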
428 |     pub fn set_threads(&mut self, n_threads: usize) -> () {
429 |         self.0 = n_threads;
430 |     }
431 | }
432 | 
433 | impl Scheduler for GreedyScheduler {
434 | 
435 |     fn compute(
436 |         &self,
437 |         graph: Arc
438 |     ) -> Option> {
439 | 
440 |         let out_handle = graph.handle.clone();
441 | 
442 |         trace!("Building Dag...");
443 |         let dag = Arc::new(DAG::new(graph));
444 | 
445 |         debug!("Number of Tasks Specified: {}", dag.tasks.len());
446 | 
447 |         let (inbound, mut outbound) = build_dep_graph(&dag);
448 | 
449 |         let collapsed = collapse_graph(inbound);
450 | 
451 |         let total_jobs = collapsed.len();
452 |         debug!("Number of Tasks to Run: {}", total_jobs);
453 | 
454 |         // Build the counts
455 |         let mut counts: HashMap,_> = HashMap::new();
456 |         let mut queue = PriorityQueue::new();
457 |         for (chain, deps) in collapsed.iter() {
458 |             // Add the inputs
459 |             if deps.len() == 0 {
460 |                 trace!("Adding initial chain: {:?}, Priority: {}", chain, 0usize);
461 |                 queue.push(chain.clone(), 0usize);
462 |             }
463 | 
464 |             for d in deps.iter() {
465 |                 let e = counts.entry(d.clone()).or_insert(0usize);
466 |                 *e += 1;
467 |             }
468 |         }
469 | 
470 |         // Make the graph a bit easier to work with
471 |         let mut head_map: HashMap<_,_> = collapsed.into_iter().map(|(chain, deps)| {
472 |             (chain[0].clone(), (chain, deps.len(), deps))
473 |         }).collect();
474 | 
475 |         // Load up the inputs
476 |         let data: HashMap,Arc> = HashMap::new();
477 | 
478 |         // Initialize an empty data store
479 |         let raw_ds: DataStore, Arc> = DataStore::new(data, counts);
480 |         let dsam = Arc::new(Mutex::new(raw_ds));
481 | 
482 |         // Start the loop!
483 | 
484 |         if log_enabled!(Trace) {
485 |             for (ref index, &(ref chain, ref _priority, ref deps)) in head_map.iter() {
486 |                 trace!("Index: {:?}, Chain: {:?}, Deps: {:?}", index, chain, deps);
487 |             }
488 |         }
489 |         debug!("Starting tasks...");
490 |         let mut jobs_done = 0usize;
491 |         {
492 |             let mut pool = JobPool::new(self.0);
493 |             let mut free_threads = self.0;
494 |             let (tx, rx) = mpsc::channel();
495 |             loop {
496 |                 // Queue up all free items
497 |                 while free_threads > 0 && !queue.is_empty() {
498 |                     if let Some((chain, priority)) = queue.pop() {
499 |                         trace!("Starting chain: {:?}, Priority: {}", chain, priority);
500 |                         let g = dag.clone();
501 |                         let c = chain.clone();
502 |                         let d = dsam.clone();
503 |                         let thread_tx = tx.clone();
504 |                         pool.queue(move || {
505 |                             run_task(&g, &c, d);
506 |                             thread_tx.send(c[c.len() - 1].clone())
507 |                                 .expect("Error sending thread!");
508 |                         });
509 |                         free_threads -= 1;
510 |                     }
511 |                 }
512 | 
513 |                 // Wait for the next finished task
514 |                 let handle = rx.recv().unwrap();
515 |                 // Remove it from the remaining tasks' dependency sets
516 |                 trace!("{:?} finished", handle);
517 |                 free_threads += 1;
518 |                 if let Some(out) = outbound.remove(&handle) {
519 |                     for out_handle in out {
520 |                         trace!("Updating {:?}", out_handle);
521 |                         if let Some((chain, p, deps)) = head_map.get_mut(&out_handle) {
522 |                             trace!("Updating {:?}", out_handle);
523 |                             deps.remove(&handle);
524 |                             if deps.is_empty() {
525 |                                 trace!("Adding new chain: {:?}, Priority: {}", chain, p);
526 |                                 queue.push(chain.clone(), *p);
527 |                             } else {
528 |                                 trace!("Remaining Deps: {:?}", deps);
529 |                             }
530 |                         }
531 |                     }
532 |                 }
533 | 
534 |                 jobs_done += 1;
535 |                 if total_jobs > 10 && jobs_done % (total_jobs as f64 / 10.) as usize == 0 {
536 |                     debug!("Finished {}/{} of jobs", jobs_done, total_jobs);
537 |                     if log_enabled!(Trace) {
538 |                         let ds = dsam.lock().unwrap();
539 |                         trace!("Data Chunks in memory: {}", ds.data.len());
540 |                     }
541 | 
542 |                 }
543 |                 // Are we done yet?
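                // All workers are idle and nothing is queued: every reachable
                // task has run, so the computation is complete.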
544 | if free_threads == self.0 && queue.is_empty() { 545 | break 546 | } 547 | } 548 | pool.shutdown(); 549 | } 550 | 551 | if log_enabled!(Trace) { 552 | let ds = dsam.lock().unwrap(); 553 | trace!("Still Holding data for:"); 554 | for (k, _v) in ds.data.iter() { 555 | trace!("- {:?}", k); 556 | } 557 | } 558 | 559 | debug!("Finished"); 560 | let ret = { 561 | dsam.lock().unwrap().get(&out_handle) 562 | }; 563 | ret 564 | } 565 | } 566 | 567 | #[cfg(test)] 568 | mod size_test { 569 | use super::*; 570 | 571 | #[test] 572 | fn test_graph_collapse() { 573 | /* 574 | 1 -> 2 -> 3 575 | \ 576 | 4 -> 5 577 | 578 | We should collapse 1 -> 2 and 4 -> 5 579 | */ 580 | let one_deps = HashSet::new(); 581 | let mut two_deps = HashSet::new(); 582 | two_deps.insert(1usize); 583 | 584 | let mut three_deps = HashSet::new(); 585 | three_deps.insert(2usize); 586 | 587 | let mut four_deps = HashSet::new(); 588 | four_deps.insert(2usize); 589 | 590 | let mut five_deps = HashSet::new(); 591 | five_deps.insert(4usize); 592 | 593 | let mut deps = HashMap::new(); 594 | deps.insert(1usize, one_deps); 595 | deps.insert(2usize, two_deps); 596 | deps.insert(3usize, three_deps); 597 | deps.insert(4usize, four_deps); 598 | deps.insert(5usize, five_deps); 599 | 600 | let out = collapse_graph(deps); 601 | let mut res = HashMap::new(); 602 | res.insert(vec![1, 2], vec![].iter().cloned().collect()); 603 | res.insert(vec![3], vec![2].iter().cloned().collect()); 604 | res.insert(vec![4, 5], vec![2].iter().cloned().collect()); 605 | 606 | assert_eq!(out, res); 607 | } 608 | 609 | #[test] 610 | fn test_graph_collapse_2() { 611 | /* 612 | 2 -> 4 613 | / | 614 | 1 ---> 3 615 | 616 | */ 617 | let one_deps = HashSet::new(); 618 | let mut two_deps = HashSet::new(); 619 | two_deps.insert(1usize); 620 | 621 | let mut three_deps = HashSet::new(); 622 | three_deps.insert(1usize); 623 | 624 | let mut four_deps = HashSet::new(); 625 | four_deps.insert(2usize); 626 | four_deps.insert(3usize); 627 | 628 | let mut deps = HashMap::new(); 629 | deps.insert(1usize, one_deps); 630 | deps.insert(2usize, two_deps); 631 | deps.insert(3usize, three_deps); 632 | deps.insert(4usize, four_deps); 633 | 634 | let res = deps.clone().into_iter().map(|(k, v)| (vec![k], v)).collect(); 635 | let out = collapse_graph(deps); 636 | 637 | assert_eq!(out, res); 638 | } 639 | 640 | } 641 | -------------------------------------------------------------------------------- /tange-collection/src/collection/memory.rs: -------------------------------------------------------------------------------- 1 | //! MemoryCollection 2 | //! --- 3 | //! MemoryCollection provides a variety of dataflow operators for consuming and mutating 4 | //! data. Unlike its Disk-based counterpart, DiskCollection, MemoryCollection keeps all 5 | //! data in memory, maximizing speed. 6 | //! 
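//!
//! A minimal end-to-end sketch, assembled from the operators documented below:
//!
//! ```rust
//! extern crate tange;
//! extern crate tange_collection;
//! use tange::scheduler::GreedyScheduler;
//! use tange_collection::collection::memory::MemoryCollection;
//!
//! let col = MemoryCollection::from_vec(vec![1, 2, 3, 4usize]);
//! let out = col.map(|x| x * 2).filter(|x| *x > 2);
//! assert_eq!(out.run(&GreedyScheduler::new()), Some(vec![4, 6, 8]));
//! ```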
7 | 
8 | extern crate serde;
9 | use std::fs;
10 | use std::any::Any;
11 | use std::io::prelude::*;
12 | use std::io::BufWriter;
13 | use std::hash::Hash;
14 | use std::sync::Arc;
15 | 
16 | use self::serde::{Deserialize,Serialize};
17 | 
18 | use collection::disk::DiskCollection;
19 | use tange::deferred::{Deferred, batch_apply, tree_reduce};
20 | use tange::scheduler::{Scheduler,GreedyScheduler};
21 | use partitioned::{join_on_key as jok, partition, partition_by_key, fold_by, concat};
22 | use interfaces::{Memory,Disk};
23 | use super::emit;
24 | 
25 | 
26 | /// MemoryCollection struct
27 | #[derive(Clone)]
28 | pub struct MemoryCollection {
29 |     partitions: Vec>>
30 | }
31 | 
32 | impl MemoryCollection {
33 | 
34 |     /// Creates a MemoryCollection from a set of Deferred objects.
35 |     pub fn from_defs(vs: Vec>>) -> MemoryCollection {
36 |         MemoryCollection {
37 |             partitions: vs
38 |         }
39 |     }
40 | 
41 |     /// Provides raw access to the underlying Deferred objects
42 |     pub fn to_defs(&self) -> &Vec>> {
43 |         &self.partitions
44 |     }
45 | 
46 |     /// Creates a new MemoryCollection from a Vec of items
47 |     /// ```rust
48 |     /// extern crate tange;
49 |     /// extern crate tange_collection;
50 |     /// use tange::scheduler::GreedyScheduler;
51 |     /// use tange_collection::collection::memory::MemoryCollection;
52 |     ///
53 |     /// let col = MemoryCollection::from_vec(vec![1,2,3usize]);
54 |     /// assert_eq!(col.run(&GreedyScheduler::new()), Some(vec![1,2,3usize]));
55 |     /// ```
56 |     pub fn from_vec(vs: Vec) -> MemoryCollection {
57 |         MemoryCollection {
58 |             partitions: vec![Deferred::lift(vs, None)],
59 |         }
60 |     }
61 | 
62 |     /// Returns the current number of data partitions
63 |     pub fn n_partitions(&self) -> usize {
64 |         self.partitions.len()
65 |     }
66 | 
67 |     /// Concatenates two collections into a single Collection
68 |     /// ```rust
69 |     /// extern crate tange;
70 |     /// extern crate tange_collection;
71 |     /// use tange::scheduler::GreedyScheduler;
72 |     /// use tange_collection::collection::memory::MemoryCollection;
73 |     ///
74 |     /// let one = MemoryCollection::from_vec(vec![1,2,3usize]);
75 |     /// let two = MemoryCollection::from_vec(vec![4usize, 5, 6]);
76 |     /// let cat = one.concat(&two);
77 |     /// assert_eq!(cat.run(&GreedyScheduler::new()), Some(vec![1,2,3,4,5,6]));
78 |     /// ```
79 |     pub fn concat(&self, other: &MemoryCollection) -> MemoryCollection {
80 |         let mut nps: Vec<_> = self.partitions.iter()
81 |             .map(|p| (*p).clone()).collect();
82 | 
83 |         for p in other.partitions.iter() {
84 |             nps.push(p.clone());
85 |         }
86 | 
87 |         MemoryCollection { partitions: nps }
88 |     }
89 | 
90 |     /// Maps a function over the values in the MemoryCollection, returning a new MemoryCollection
91 |     /// ```rust
92 |     /// extern crate tange;
93 |     /// extern crate tange_collection;
94 |     /// use tange::scheduler::GreedyScheduler;
95 |     /// use tange_collection::collection::memory::MemoryCollection;
96 |     ///
97 |     /// let one = MemoryCollection::from_vec(vec![1,2,3usize]);
98 |     /// let strings = one.map(|i| format!("{}", i));
99 |     /// assert_eq!(strings.run(&GreedyScheduler::new()),
100 |     ///            Some(vec!["1".into(),"2".into(),"3".into()]));
101 |     /// ```
102 |     pub fn map<
103 |         B: Any + Send + Sync + Clone,
104 |         F: 'static + Sync + Send + Clone + Fn(&A) -> B
105 |     >(&self, f: F) -> MemoryCollection {
106 |         self.emit(move |x, emitter| {
107 |             emitter(f(x))
108 |         })
109 |     }
110 | 
111 |     /// Filters out items in the collection that fail the predicate.
112 | /// ```rust 113 | /// extern crate tange; 114 | /// extern crate tange_collection; 115 | /// use tange::scheduler::GreedyScheduler; 116 | /// use tange_collection::collection::memory::MemoryCollection; 117 | /// 118 | /// let col = MemoryCollection::from_vec(vec![1,2,3usize]); 119 | /// let odds = col.filter(|x| x % 2 == 1); 120 | /// assert_eq!(odds.run(&GreedyScheduler::new()), 121 | /// Some(vec![1, 3usize])); 122 | /// ``` 123 | 124 | pub fn filter< 125 | F: 'static + Sync + Send + Clone + Fn(&A) -> bool 126 | >(&self, f: F) -> MemoryCollection { 127 | self.emit(move |x, emitter| { 128 | if f(x) { 129 | emitter(x.clone()) 130 | } 131 | }) 132 | } 133 | 134 | /// Re-partitions a collection by the number of provided chunks. It uniformly distributes data from each old partition into each new partition. 135 | /// ```rust 136 | /// extern crate tange; 137 | /// extern crate tange_collection; 138 | /// use tange::scheduler::GreedyScheduler; 139 | /// use tange_collection::collection::memory::MemoryCollection; 140 | /// 141 | /// let col = MemoryCollection::from_vec(vec![1,2,3usize]); 142 | /// assert_eq!(col.n_partitions(), 1); 143 | /// let two = col.split(2); 144 | /// assert_eq!(two.n_partitions(), 2); 145 | /// ``` 146 | pub fn split(&self, n_chunks: usize) -> MemoryCollection { 147 | self.partition(n_chunks, |idx, _k| idx) 148 | } 149 | 150 | /// Maps over all items in a collection, optionally emitting new values. It can be used 151 | /// to efficiently fuse a number of map/filter/flat_map functions into a single method. 152 | /// ```rust 153 | /// extern crate tange; 154 | /// extern crate tange_collection; 155 | /// use tange::scheduler::GreedyScheduler; 156 | /// use tange_collection::collection::memory::MemoryCollection; 157 | /// 158 | /// let col = MemoryCollection::from_vec(vec![1,2,3usize]); 159 | /// let new = col.emit(|item, emitter| { 160 | /// if item % 2 == 0 { 161 | /// emitter(format!("{}!", item)); 162 | /// } 163 | /// }); 164 | /// assert_eq!(new.run(&GreedyScheduler::new()), Some(vec!["2!".into()])); 165 | /// ``` 166 | 167 | pub fn emit< 168 | B: Any + Send + Sync + Clone, 169 | F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ()) 170 | >(&self, f: F) -> MemoryCollection { 171 | let parts = emit(&self.partitions, Memory, f); 172 | 173 | MemoryCollection { partitions: parts } 174 | } 175 | 176 | /// Maps over all items in a collection, emitting new values. It can be used 177 | /// to efficiently fuse a number of map/filter/flat_map functions into a single method. 178 | /// `emit_to_disk` differs from the original `emit` by writing the emitted values directly 179 | /// to disk, returning a DiskCollection instead of MemoryCollection. This makes it convenient to switch to out-of-core when needed. 
180 |     /// ```rust
181 |     /// extern crate tange;
182 |     /// extern crate tange_collection;
183 |     /// use tange::scheduler::GreedyScheduler;
184 |     /// use tange_collection::collection::memory::MemoryCollection;
185 |     ///
186 |     /// let col = MemoryCollection::from_vec(vec![1,2,3usize]);
187 |     /// let new = col.emit_to_disk("/tmp".into(), |item, emitter| {
188 |     ///     if item % 2 == 0 {
189 |     ///         emitter(format!("{}!", item));
190 |     ///     }
191 |     /// });
192 |     /// assert_eq!(new.run(&GreedyScheduler::new()), Some(vec!["2!".into()]));
193 |     /// ```
194 | 
195 |     pub fn emit_to_disk<
196 |         B: Any + Send + Sync + Clone + Serialize + for<'de>Deserialize<'de>,
197 |         F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ())
198 |     >(&self, path: String, f: F) -> DiskCollection {
199 |         let parts = emit(&self.partitions, Disk::from_str(&path), f);
200 | 
201 |         DiskCollection::from_stores(path, parts)
202 |     }
203 | 
204 |     /// Re-partitions data into N new partitions by the given function. The user-provided
205 |     /// function acts as a hash function, mapping each returned value to a partition index.
206 |     /// This makes it useful for controlling which partition data ends up in!
207 |     /// ```rust
208 |     /// extern crate tange;
209 |     /// extern crate tange_collection;
210 |     /// use tange::scheduler::GreedyScheduler;
211 |     /// use tange_collection::collection::memory::MemoryCollection;
212 |     ///
213 |     /// let col = MemoryCollection::from_vec(vec![1,2,3,4usize]);
214 |     /// let new_col = col.partition(2, |idx, x| if *x < 3 { 1 } else { 2 });
215 |     ///
216 |     /// assert_eq!(new_col.n_partitions(), 2);
217 |     /// assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![3, 4, 1, 2]));
218 |     /// ```
219 |     pub fn partition<
220 |         F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize
221 |     >(&self, partitions: usize, f: F) -> MemoryCollection {
222 |         let new_chunks = partition(&self.partitions,
223 |                                    partitions,
224 |                                    f);
225 |         // Loop over each bucket
226 |         MemoryCollection { partitions: new_chunks }
227 |     }
228 | 
229 |     /// Folds and accumulates values across multiple partitions into K new partitions.
230 |     /// This is also known as a "group by" with a following reducer.
231 |     ///
232 |     /// MemoryCollection first performs a block aggregation: that is, it combines values
233 |     /// within each partition first using the `binop` function. It then hashes
234 |     /// each key to a new partition index, where it will then aggregate all keys using the
235 |     /// `reduce` function.
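    /// As an additional sketch, word counting phrased as a `fold_by`:
    /// ```rust
    /// extern crate tange;
    /// extern crate tange_collection;
    /// use tange::scheduler::GreedyScheduler;
    /// use tange_collection::collection::memory::MemoryCollection;
    ///
    /// let words = MemoryCollection::from_vec(vec!["a", "b", "a"]);
    /// let counts = words.fold_by(|w| w.to_string(),   // key
    ///                            || 0usize,           // per-key default
    ///                            |acc, _w| *acc += 1, // within a partition
    ///                            |l, r| *l += *r,     // across partitions
    ///                            1)
    ///                   .sort_by(|x| x.0.clone());
    /// assert_eq!(counts.run(&GreedyScheduler::new()),
    ///            Some(vec![("a".to_string(), 2), ("b".to_string(), 1)]));
    /// ```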
236 |     ///
237 |     /// ```rust
238 |     /// extern crate tange;
239 |     /// extern crate tange_collection;
240 |     /// use tange::scheduler::GreedyScheduler;
241 |     /// use tange_collection::collection::memory::MemoryCollection;
242 |     ///
243 |     /// let col = MemoryCollection::from_vec(vec![1,2,3,4,5usize]);
244 |     /// // Sum all odds and evens together
245 |     /// let group_sum = col.fold_by(|x| x % 2,
246 |     ///                             || 0usize,
247 |     ///                             |block_acc, item| {*block_acc += *item},
248 |     ///                             |part_acc1, part_acc2| {*part_acc1 += *part_acc2},
249 |     ///                             1)
250 |     ///                    .sort_by(|x| x.0);
251 |     ///
252 |     /// assert_eq!(group_sum.n_partitions(), 1);
253 |     /// assert_eq!(group_sum.run(&GreedyScheduler::new()), Some(vec![(0, 6), (1, 9)]));
254 |     /// ```
255 | 
256 |     pub fn fold_by<K: Any + Sync + Send + Clone + Hash + Eq,
257 |                    B: Any + Sync + Send + Clone,
258 |                    D: 'static + Sync + Send + Clone + Fn() -> B,
259 |                    F: 'static + Sync + Send + Clone + Fn(&A) -> K,
260 |                    O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (),
261 |                    R: 'static + Sync + Send + Clone + Fn(&mut B, &B) -> ()>(
262 |         &self, key: F, default: D, binop: O, reduce: R, partitions: usize
263 |     ) -> MemoryCollection<(K,B)> {
264 |         let results = fold_by(&self.partitions, key, default, binop,
265 |                               reduce, Vec::with_capacity(0), partitions);
266 |         MemoryCollection { partitions: results }
267 |     }
268 | 
269 |     /// Simple function to re-partition values by a given key. The returned key is hashed
270 |     /// and taken modulo the new partition count to determine where each value will end up.
271 |     /// ```rust
272 |     /// extern crate tange;
273 |     /// extern crate tange_collection;
274 |     /// use tange::scheduler::GreedyScheduler;
275 |     /// use tange_collection::collection::memory::MemoryCollection;
276 |     ///
277 |     /// let col = MemoryCollection::from_vec(vec![1,2,3,4usize]);
278 |     /// let new_col = col.partition_by_key(2, |x| format!("{}", x));
279 |     ///
280 |     /// assert_eq!(new_col.n_partitions(), 2);
281 |     /// assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 1, 2, 3]));
282 |     /// ```
283 |     pub fn partition_by_key<
284 |         K: Any + Sync + Send + Clone + Hash + Eq,
285 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
286 |     >(&self, n_chunks: usize, key: F) -> MemoryCollection {
287 |         let results = partition_by_key(&self.partitions, n_chunks, key);
288 |         let groups = results.into_iter().map(|part| concat(&part).unwrap()).collect();
289 |         MemoryCollection {partitions: groups}
290 |     }
291 | 
292 |     /// Sorts values within each partition by a key function. If a global sort is desired,
293 |     /// the collection needs to be re-partitioned into a single partition.
294 |     /// ```rust
295 |     /// extern crate tange;
296 |     /// extern crate tange_collection;
297 |     /// use tange::scheduler::GreedyScheduler;
298 |     /// use tange_collection::collection::memory::MemoryCollection;
299 |     ///
300 |     /// let col = MemoryCollection::from_vec(vec![1,2,3,4i32]);
301 |     /// let new_col = col.sort_by(|x| -*x);
302 |     ///
303 |     /// assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 3, 2, 1]));
304 |     /// ```
305 |     pub fn sort_by<
306 |         K: Ord,
307 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
308 |     >(&self, key: F) -> MemoryCollection {
309 |         let nps = batch_apply(&self.partitions, move |_idx, vs| {
310 |             let mut v2: Vec<_> = vs.clone();
311 |             v2.sort_by_key(|v| key(v));
312 |             v2
313 |         });
314 |         MemoryCollection { partitions: nps }
315 |     }
316 | 
317 |     /// Inner joins two collections by the provided key functions.
318 |     /// If multiple values share the same key, the join emits the cross product of the
319 |     /// matching pairs.
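    /// For example, if two records on the left and three on the right share a key,
    /// six joined pairs are emitted for that key.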
320 | /// ```rust 321 | /// extern crate tange; 322 | /// extern crate tange_collection; 323 | /// use tange::scheduler::GreedyScheduler; 324 | /// use tange_collection::collection::memory::MemoryCollection; 325 | /// 326 | /// let name_age: Vec<(String,u32)> = vec![("Andrew".into(), 33), ("Leah".into(), 12)]; 327 | /// let name_money: Vec<(String,f32)> = vec![("Leah".into(), 20.50)]; 328 | /// 329 | /// let na = MemoryCollection::from_vec(name_age); 330 | /// let nm = MemoryCollection::from_vec(name_money); 331 | /// let joined = na.join_on(&nm, 332 | /// |nax| nax.0.clone(), 333 | /// |nmx| nmx.0.clone(), 334 | /// |nax, nmx| (nax.0.clone(), nax.1, nmx.1), 335 | /// 1); 336 | /// assert_eq!(joined.run(&GreedyScheduler::new()), 337 | /// Some(vec![("Leah".into(), ("Leah".into(), 12, 20.50))])); 338 | /// ``` 339 | 340 | pub fn join_on< 341 | K: Any + Sync + Send + Clone + Hash + Eq, 342 | B: Any + Sync + Send + Clone, 343 | C: Any + Sync + Send + Clone, 344 | KF1: 'static + Sync + Send + Clone + Fn(&A) -> K, 345 | KF2: 'static + Sync + Send + Clone + Fn(&B) -> K, 346 | J: 'static + Sync + Send + Clone + Fn(&A, &B) -> C, 347 | >( 348 | &self, 349 | other: &MemoryCollection, 350 | key1: KF1, 351 | key2: KF2, 352 | joiner: J, 353 | partitions: usize, 354 | ) -> MemoryCollection<(K,C)> { 355 | // Group each by a common key 356 | let p1 = self.map(move |x| (key1(x), x.clone())) 357 | .partition_by_key(partitions, |x| x.0.clone()); 358 | let p2 = other.map(move |x| (key2(x), x.clone())) 359 | .partition_by_key(partitions, |x| x.0.clone()); 360 | 361 | let mut new_parts = Vec::with_capacity(p1.partitions.len()); 362 | for (l, r) in p1.partitions.iter().zip(p2.partitions.iter()) { 363 | new_parts.push(jok(l, r, Memory, joiner.clone())); 364 | } 365 | 366 | MemoryCollection { partitions: new_parts } 367 | } 368 | 369 | /// Executes the Collection, returning the result of the computation 370 | pub fn run(&self, s: &S) -> Option> { 371 | let cat = tree_reduce(&self.partitions, |x, y| { 372 | let mut v1: Vec<_> = (*x).clone(); 373 | for yi in y { 374 | v1.push(yi.clone()); 375 | } 376 | v1 377 | }); 378 | cat.and_then(|x| x.run(s)) 379 | } 380 | 381 | /// Executes the Collection, returning the result of the computation 382 | pub fn eval(&self) -> Option> { 383 | self.run(&GreedyScheduler::new()) 384 | } 385 | 386 | } 387 | 388 | impl MemoryCollection> { 389 | 390 | /// Flattens a vector of values 391 | /// ```rust 392 | /// extern crate tange; 393 | /// extern crate tange_collection; 394 | /// use tange::scheduler::GreedyScheduler; 395 | /// use tange_collection::collection::memory::MemoryCollection; 396 | /// 397 | /// let col = MemoryCollection::from_vec(vec![vec![1usize,2],vec![3,4]]); 398 | /// let flattened = col.flatten(); 399 | /// assert_eq!(flattened.run(&GreedyScheduler::new()), Some(vec![1, 2, 3, 4])); 400 | /// ``` 401 | 402 | pub fn flatten(&self) -> MemoryCollection { 403 | self.emit(move |x, emitter| { 404 | for xi in x { 405 | emitter(xi.clone()); 406 | } 407 | }) 408 | } 409 | } 410 | 411 | impl MemoryCollection { 412 | 413 | /// Returns the number of items in the collection. 
414 |     /// ```rust
415 |     /// extern crate tange;
416 |     /// extern crate tange_collection;
417 |     /// use tange::scheduler::GreedyScheduler;
418 |     /// use tange_collection::collection::memory::MemoryCollection;
419 |     ///
420 |     /// let col = MemoryCollection::from_vec(vec![vec![1usize,2],vec![3,4]]);
421 |     /// assert_eq!(col.count().run(&GreedyScheduler::new()), Some(vec![2]));
422 |     /// let flattened = col.flatten();
423 |     /// assert_eq!(flattened.count().run(&GreedyScheduler::new()), Some(vec![4]));
424 |     /// ```
425 |     pub fn count(&self) -> MemoryCollection {
426 |         let nps = batch_apply(&self.partitions, |_idx, vs| vs.len());
427 |         let count = tree_reduce(&nps, |x, y| x + y).unwrap();
428 |         let out = count.apply(|x| vec![*x]);
429 |         MemoryCollection { partitions: vec![out] }
430 |     }
431 | }
432 | 
433 | impl MemoryCollection {
434 | 
435 |     /// Computes the frequencies of the items in the collection.
436 |     /// ```rust
437 |     /// extern crate tange;
438 |     /// extern crate tange_collection;
439 |     /// use tange::scheduler::GreedyScheduler;
440 |     /// use tange_collection::collection::memory::MemoryCollection;
441 |     ///
442 |     /// let col = MemoryCollection::from_vec(vec![1, 2, 1, 5, 1, 2]);
443 |     /// let freqs = col.frequencies(1).sort_by(|x| x.0);
444 |     /// assert_eq!(freqs.run(&GreedyScheduler::new()), Some(vec![(1, 3), (2, 2), (5, 1)]));
445 |     /// ```
446 |     pub fn frequencies(&self, partitions: usize) -> MemoryCollection<(A, usize)> {
447 |         //self.partition(chunks, |x| x);
448 |         self.fold_by(|s| s.clone(),
449 |                      || 0usize,
450 |                      |acc, _l| *acc += 1,
451 |                      |x, y| *x += *y,
452 |                      partitions)
453 |     }
454 | }
455 | 
456 | // Writes out data
457 | impl MemoryCollection {
458 | 
459 |     /// Writes each record in a collection to disk, newline delimited.
460 |     /// MemoryCollection will create a new file within the path for each partition.
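    /// A sketch of expected usage (the path is illustrative):
    /// ```rust
    /// extern crate tange;
    /// extern crate tange_collection;
    /// use tange::scheduler::GreedyScheduler;
    /// use tange_collection::collection::memory::MemoryCollection;
    ///
    /// let col = MemoryCollection::from_vec(vec!["a".to_string(), "b".to_string()]);
    /// // `sink` returns the number of records written per partition.
    /// let written = col.sink("/tmp/tange-sink-example");
    /// assert_eq!(written.run(&GreedyScheduler::new()), Some(vec![2]));
    /// ```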
461 | pub fn sink(&self, path: &str) -> MemoryCollection { 462 | let p: Arc = Arc::new(path.to_owned()); 463 | let pats = batch_apply(&self.partitions, move |idx, vs| { 464 | let p2: Arc = p.clone(); 465 | let local: &str = &p2; 466 | fs::create_dir_all(local) 467 | .expect("Welp, something went terribly wrong when creating directory"); 468 | 469 | let file = fs::File::create(&format!("{}/{}", local, idx)) 470 | .expect("Issues opening file!"); 471 | let mut bw = BufWriter::new(file); 472 | 473 | let size = vs.len(); 474 | for line in vs { 475 | bw.write(line.as_bytes()).expect("Error writing out line"); 476 | bw.write(b"\n").expect("Error writing out line"); 477 | } 478 | 479 | vec![size] 480 | }); 481 | 482 | MemoryCollection { partitions: pats } 483 | } 484 | } 485 | 486 | impl Deserialize<'de>> MemoryCollection { 487 | 488 | /// Copies the MemoryCollection to disk, returning a DiskCollection 489 | pub fn to_disk(&self, path: String) -> DiskCollection { 490 | DiskCollection::from_memory(path, &self.partitions) 491 | } 492 | } 493 | 494 | #[cfg(test)] 495 | mod test_lib { 496 | use super::*; 497 | use tange::scheduler::LeveledScheduler; 498 | 499 | #[test] 500 | fn test_fold_by() { 501 | let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]); 502 | let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 1); 503 | let mut results = out.run(&mut LeveledScheduler).unwrap(); 504 | results.sort(); 505 | assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]); 506 | } 507 | 508 | #[test] 509 | fn test_fold_by_parts() { 510 | let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]); 511 | let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 2); 512 | assert_eq!(out.partitions.len(), 2); 513 | let mut results = out.run(&mut LeveledScheduler).unwrap(); 514 | results.sort(); 515 | assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]); 516 | } 517 | 518 | #[test] 519 | fn test_partition_by_key() { 520 | let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]); 521 | let computed = col.partition_by_key(2, |x| *x) 522 | .sort_by(|x| *x); 523 | assert_eq!(computed.partitions.len(), 2); 524 | let results = computed.run(&mut LeveledScheduler).unwrap(); 525 | assert_eq!(results, vec![2, 2, 3, 1, 1]); 526 | } 527 | 528 | #[test] 529 | fn test_partition() { 530 | let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]); 531 | let computed = col.partition(2, |_idx, x| x % 2) 532 | .sort_by(|x| *x); 533 | assert_eq!(computed.partitions.len(), 2); 534 | let results = computed.run(&mut LeveledScheduler).unwrap(); 535 | assert_eq!(results, vec![2, 2, 1, 1, 3]); 536 | } 537 | 538 | #[test] 539 | fn test_count() { 540 | let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]); 541 | let results = col.split(3).count().run(&mut LeveledScheduler).unwrap(); 542 | assert_eq!(results, vec![5]); 543 | } 544 | 545 | #[test] 546 | fn test_join() { 547 | let col1 = MemoryCollection::from_vec(vec![1,2,3,1,2usize]); 548 | let col2 = MemoryCollection::from_vec( 549 | vec![(2, 1.23f64), (3usize, 2.34)]); 550 | let out = col1.join_on(&col2, |x| *x, |y| y.0, |x, y| { 551 | (*x, y.1) 552 | }, 5).split(1).sort_by(|x| x.0); 553 | let results = out.run(&mut LeveledScheduler).unwrap(); 554 | let expected = vec![(2, (2, 1.23)), (2, (2, 1.23)), (3, (3, 2.34))]; 555 | assert_eq!(results, expected); 556 | } 557 | 558 | #[test] 559 | fn test_emit() { 560 | let results = MemoryCollection::from_vec(vec![1,2,3usize]) 561 | .emit(|num, emitter| { 562 | for i in 0..*num { 563 | emitter(i); 564 | } 565 | }) 566 | 
.sort_by(|x| *x)
567 |             .run(&mut LeveledScheduler).unwrap();
568 |         let expected = vec![0, 0, 0, 1, 1, 2];
569 |         assert_eq!(results, expected);
570 |     }
571 | 
572 |     #[test]
573 |     fn test_sort() {
574 |         let results = MemoryCollection::from_vec(vec![1, 3, 2usize])
575 |             .sort_by(|x| *x)
576 |             .run(&mut LeveledScheduler).unwrap();
577 |         let expected = vec![1, 2, 3];
578 |         assert_eq!(results, expected);
579 |     }
580 | 
581 | }
582 | 
--------------------------------------------------------------------------------
/tange-collection/src/collection/disk.rs:
--------------------------------------------------------------------------------
1 | //! Disk Collections
2 | //! ---
3 | //! This module defines the Dataflow interfaces for Out-Of-Core data processing.
4 | //! `DiskCollection` is intended to be used for processing datasets that might not fit
5 | //! in memory.
6 | //!
7 | //! All partitions are written to disk for every application, cleaning up the files when
8 | //! finished. This allows DiskCollection to keep only the currently executing task
9 | //! in memory. However, it also means there is a fair amount of serialization/deserialization.
10 | //! Under the surface, we use bincode to serialize data quickly to minimize the penalty.
11 | //!
12 | 
13 | extern crate serde;
14 | use std::fs;
15 | use std::any::Any;
16 | use std::io::prelude::*;
17 | use std::io::BufWriter;
18 | use std::hash::Hash;
19 | use std::sync::Arc;
20 | 
21 | use self::serde::Deserialize;
22 | use self::serde::Serialize;
23 | 
24 | use tange::deferred::{Deferred, batch_apply, tree_reduce};
25 | use tange::scheduler::{Scheduler,GreedyScheduler};
26 | 
27 | use collection::memory::MemoryCollection;
28 | use partitioned::{join_on_key as jok, partition, partition_by_key, fold_by, concat};
29 | use interfaces::*;
30 | use super::emit;
31 | 
32 | 
33 | /// DiskCollection struct.
34 | #[derive(Clone)]
35 | pub struct DiskCollection {
36 |     path: Arc,
37 |     partitions: Vec>>>
38 | }
39 | 
40 | impl Deserialize<'de>> DiskCollection {
41 | 
42 |     /// Creates a new DiskCollection from a Vector of objects.
43 |     /// ```rust
44 |     /// extern crate tange;
45 |     /// extern crate tange_collection;
46 |     /// use tange::scheduler::GreedyScheduler;
47 |     /// use tange_collection::collection::disk::DiskCollection;
48 |     ///
49 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
50 |     /// assert_eq!(col.run(&GreedyScheduler::new()), Some(vec![1,2,3usize]));
51 |     /// ```
52 |     pub fn from_vec(path: String, vec: Vec) -> DiskCollection {
53 |         MemoryCollection::from_vec(vec).to_disk(path)
54 |     }
55 | 
56 |     /// Converts a collection of Deferred objects into a DiskCollection.
57 |     /// This is usually best used from the `MemoryCollection` side.
58 |     pub fn from_memory(path: String, mc: &Vec>>) -> DiskCollection {
59 |         ::std::fs::create_dir_all(&path).expect("Unable to create directory!");
60 |         let shared = Arc::new(path);
61 |         let acc = Arc::new(FileStore::empty(shared.clone()));
62 |         let defs = batch_apply(&mc, move |_idx, vs| {
63 |             acc.write_vec(vs.clone())
64 |         });
65 |         DiskCollection { path: shared, partitions: defs }
66 |     }
67 | 
68 |     /// Creates a DiskCollection from a set of FileStores.
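    /// This is primarily used by operators that have already written their output
    /// through a `FileStore`, such as `MemoryCollection::emit_to_disk`.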
69 |     pub fn from_stores(path: String, fs: Vec>>>) -> DiskCollection {
70 |         DiskCollection { path: Arc::new(path), partitions: fs }
71 |     }
72 | 
73 |     /// Provides raw access to the underlying partitions
74 |     pub fn to_defs(&self) -> &Vec>>> {
75 |         &self.partitions
76 |     }
77 | 
78 |     /// Converts a DiskCollection to a MemoryCollection
79 |     pub fn to_memory(&self) -> MemoryCollection {
80 |         let defs = batch_apply(&self.partitions, |_idx, vs| {
81 |             vs.stream().into_iter().collect()
82 |         });
83 |         MemoryCollection::from_defs(defs)
84 |     }
85 | 
86 |     /// Returns the current number of data partitions
87 |     pub fn n_partitions(&self) -> usize {
88 |         self.partitions.len()
89 |     }
90 | 
91 |     fn from_defs(&self, defs: Vec>>>) -> DiskCollection {
92 |         DiskCollection { path: self.path.clone(), partitions: defs }
93 |     }
94 | 
95 |     /// Concatenates two collections into a single Collection
96 |     /// ```rust
97 |     /// extern crate tange;
98 |     /// extern crate tange_collection;
99 |     /// use tange::scheduler::GreedyScheduler;
100 |     /// use tange_collection::collection::disk::DiskCollection;
101 |     ///
102 |     /// let one = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
103 |     /// let two = DiskCollection::from_vec("/tmp".into(), vec![4usize, 5, 6]);
104 |     /// let cat = one.concat(&two);
105 |     /// assert_eq!(cat.run(&GreedyScheduler::new()), Some(vec![1,2,3,4,5,6]));
106 |     /// ```
107 |     pub fn concat(&self, other: &DiskCollection) -> DiskCollection {
108 |         let mut nps: Vec<_> = self.partitions.iter()
109 |             .map(|p| (*p).clone()).collect();
110 | 
111 |         for p in other.partitions.iter() {
112 |             nps.push(p.clone());
113 |         }
114 | 
115 |         self.from_defs(nps)
116 |     }
117 | 
118 |     /// Maps a function over the values in the DiskCollection, returning a new DiskCollection
119 |     /// ```rust
120 |     /// extern crate tange;
121 |     /// extern crate tange_collection;
122 |     /// use tange::scheduler::GreedyScheduler;
123 |     /// use tange_collection::collection::disk::DiskCollection;
124 |     ///
125 |     /// let one = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
126 |     /// let strings = one.map(|i| format!("{}", i));
127 |     /// assert_eq!(strings.run(&GreedyScheduler::new()),
128 |     ///            Some(vec!["1".into(),"2".into(),"3".into()]));
129 |     /// ```
130 |     pub fn map<
131 |         B: Any + Send + Sync + Clone + Serialize,
132 |         F: 'static + Sync + Send + Clone + Fn(&A) -> B
133 |     >(&self, f: F) -> DiskCollection {
134 |         self.emit(move |x, emitter| {
135 |             emitter(f(x))
136 |         })
137 |     }
138 | 
139 |     /// Filters out items in the collection that fail the predicate.
140 |     /// ```rust
141 |     /// extern crate tange;
142 |     /// extern crate tange_collection;
143 |     /// use tange::scheduler::GreedyScheduler;
144 |     /// use tange_collection::collection::disk::DiskCollection;
145 |     ///
146 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
147 |     /// let odds = col.filter(|x| x % 2 == 1);
148 |     /// assert_eq!(odds.run(&GreedyScheduler::new()),
149 |     ///            Some(vec![1, 3usize]));
150 |     /// ```
151 | 
152 |     pub fn filter<
153 |         F: 'static + Sync + Send + Clone + Fn(&A) -> bool
154 |     >(&self, f: F) -> DiskCollection {
155 |         self.emit(move |x, emitter| {
156 |             if f(x) {
157 |                 emitter(x.clone())
158 |             }
159 |         })
160 |     }
161 | 
162 |     /// Re-partitions a collection by the number of provided chunks. It uniformly distributes data from each old partition into each new partition.
163 |     /// ```rust
164 |     /// extern crate tange;
165 |     /// extern crate tange_collection;
166 |     /// use tange::scheduler::GreedyScheduler;
167 |     /// use tange_collection::collection::disk::DiskCollection;
168 |     ///
169 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
170 |     /// assert_eq!(col.n_partitions(), 1);
171 |     /// let two = col.split(2);
172 |     /// assert_eq!(two.n_partitions(), 2);
173 |     /// let three = col.split(3);
174 |     /// assert_eq!(three.n_partitions(), 3);
175 |     /// ```
176 | 
177 |     pub fn split(&self, n_chunks: usize) -> DiskCollection {
178 |         self.partition(n_chunks, |idx, _k| idx)
179 |     }
180 | 
181 |     /// Maps over all items in a collection, optionally emitting new values. It can be used
182 |     /// to efficiently fuse a number of map/filter/flat_map functions into a single method.
183 |     /// ```rust
184 |     /// extern crate tange;
185 |     /// extern crate tange_collection;
186 |     /// use tange::scheduler::GreedyScheduler;
187 |     /// use tange_collection::collection::disk::DiskCollection;
188 |     ///
189 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
190 |     /// let new = col.emit(|item, emitter| {
191 |     ///     if item % 2 == 0 {
192 |     ///         emitter(format!("{}!", item));
193 |     ///     }
194 |     /// });
195 |     /// assert_eq!(new.run(&GreedyScheduler::new()), Some(vec!["2!".into()]));
196 |     /// ```
197 |     pub fn emit<
198 |         B: Any + Send + Sync + Clone + Serialize,
199 |         F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ())
200 |     >(&self, f: F) -> DiskCollection {
201 | 
202 |         let parts = emit(&self.partitions, Disk(self.path.clone()), f);
203 | 
204 |         self.from_defs(parts)
205 |     }
206 | 
207 |     /// Re-partitions data into N new partitions by the given function. The user-provided
208 |     /// function acts as a hash function, mapping each returned value to a partition index.
209 |     /// This makes it useful for controlling which partition data ends up in!
210 |     /// ```rust
211 |     /// extern crate tange;
212 |     /// extern crate tange_collection;
213 |     /// use tange::scheduler::GreedyScheduler;
214 |     /// use tange_collection::collection::disk::DiskCollection;
215 |     ///
216 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4usize]);
217 |     /// let new_col = col.partition(2, |idx, x| if *x < 3 { 1 } else { 2 });
218 |     ///
219 |     /// assert_eq!(new_col.n_partitions(), 2);
220 |     /// assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![3, 4, 1, 2]));
221 |     /// ```
222 | 
223 |     pub fn partition<
224 |         F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize
225 |     >(&self, partitions: usize, f: F) -> DiskCollection {
226 |         let new_chunks = partition(&self.partitions,
227 |                                    partitions,
228 |                                    f);
229 |         // Loop over each bucket
230 |         self.from_defs(new_chunks)
231 |     }
232 | 
233 |     /// Folds and accumulates values across multiple partitions into K new partitions.
234 |     /// This is also known as a "group by" with a following reducer.
235 |     ///
236 |     /// DiskCollection first performs a block aggregation: that is, it combines values
237 |     /// within each partition first using the `binop` function. It then hashes
238 |     /// each key to a new partition index, where it will then aggregate all keys using the
239 |     /// `reduce` function.
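    /// Unlike the MemoryCollection version, the key and accumulator types must also
    /// implement `Serialize`/`Deserialize` so intermediate results can be written to disk.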
240 | /// 241 | /// ```rust 242 | /// extern crate tange; 243 | /// extern crate tange_collection; 244 | /// use tange::scheduler::GreedyScheduler; 245 | /// use tange_collection::collection::disk::DiskCollection; 246 | /// 247 | /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4,5usize]); 248 | /// // Sum all odds and evens together 249 | /// let group_sum = col.fold_by(|x| x % 2, 250 | /// || 0usize, 251 | /// |block_acc, item| {*block_acc += *item}, 252 | /// |part_acc1, part_acc2| {*part_acc1 += *part_acc2}, 253 | /// 1) 254 | /// .sort_by(|x| x.0); 255 | /// 256 | /// assert_eq!(group_sum.n_partitions(), 1); 257 | /// assert_eq!(group_sum.run(&GreedyScheduler::new()), Some(vec![(0, 6), (1, 9)])); 258 | /// ``` 259 | 260 | pub fn fold_by Deserialize<'de>, 261 | B: Any + Sync + Send + Clone + Serialize + for<'de> Deserialize<'de>, 262 | D: 'static + Sync + Send + Clone + Fn() -> B, 263 | F: 'static + Sync + Send + Clone + Fn(&A) -> K, 264 | O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (), 265 | R: 'static + Sync + Send + Clone + Fn(&mut B, &B) -> ()>( 266 | &self, key: F, default: D, binop: O, reduce: R, partitions: usize 267 | ) -> DiskCollection<(K,B)> { 268 | let fs = Arc::new(FileStore::empty(self.path.clone())); 269 | let results = fold_by(&self.partitions, key, default, binop, 270 | reduce, fs, partitions); 271 | self.from_defs(results) 272 | } 273 | 274 | /// Simple function to re-partition values by a given key. The return key is hashed 275 | /// and moduloed by the new partition count to determine where it will end up. 276 | /// ```rust 277 | /// extern crate tange; 278 | /// extern crate tange_collection; 279 | /// use tange::scheduler::GreedyScheduler; 280 | /// use tange_collection::collection::disk::DiskCollection; 281 | /// 282 | /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4usize]); 283 | /// let new_col = col.partition_by_key(2, |x| format!("{}", x)); 284 | /// 285 | /// assert_eq!(new_col.n_partitions(), 2); 286 | /// assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 1, 2, 3])); 287 | /// ``` 288 | 289 | pub fn partition_by_key< 290 | K: Any + Sync + Send + Clone + Hash + Eq, 291 | F: 'static + Sync + Send + Clone + Fn(&A) -> K 292 | >(&self, n_chunks: usize, key: F) -> DiskCollection { 293 | let results = partition_by_key(&self.partitions, n_chunks, key); 294 | let groups = results.into_iter().map(|part| concat(&part).unwrap()).collect(); 295 | self.from_defs(groups) 296 | } 297 | 298 | /// Sorts values within each partition by a key function. 
If a global sort is desired,
299 |     /// the collection needs to be re-partitioned into a single partition.
300 |     /// ```rust
301 |     /// extern crate tange;
302 |     /// extern crate tange_collection;
303 |     /// use tange::scheduler::GreedyScheduler;
304 |     /// use tange_collection::collection::disk::DiskCollection;
305 |     ///
306 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4i32]);
307 |     /// let new_col = col.sort_by(|x| -*x);
308 |     ///
309 |     /// assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 3, 2, 1]));
310 |     /// ```
311 |     pub fn sort_by<
312 |         K: Ord,
313 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
314 |     >(&self, key: F) -> DiskCollection {
315 |         let acc = Arc::new(FileStore::empty(self.path.clone()));
316 |         let nps = batch_apply(&self.partitions, move |_idx, vs| {
317 |             let mut out = acc.writer();
318 |             let mut v2: Vec<_> = vs.stream().into_iter().collect();
319 |             v2.sort_by_key(|v| key(v));
320 |             for vi in v2 {
321 |                 out.add(vi);
322 |             }
323 |             out.finish()
324 |         });
325 |         self.from_defs(nps)
326 |     }
327 | 
328 |     /// Inner joins two collections by the provided key functions.
329 |     /// If multiple values share the same key, the join emits the cross product of the
330 |     /// matching pairs.
331 |     /// ```rust
332 |     /// extern crate tange;
333 |     /// extern crate tange_collection;
334 |     /// use tange::scheduler::GreedyScheduler;
335 |     /// use tange_collection::collection::disk::DiskCollection;
336 |     /// let name_age: Vec<(String,u32)> = vec![("Andrew".into(), 33), ("Leah".into(), 12)];
337 |     /// let name_money: Vec<(String,f32)> = vec![("Leah".into(), 20.50)];
338 |     ///
339 |     /// let na = DiskCollection::from_vec("/tmp".into(), name_age);
340 |     /// let nm = DiskCollection::from_vec("/tmp".into(), name_money);
341 |     /// let joined = na.join_on(&nm,
342 |     ///                         |nax| nax.0.clone(),
343 |     ///                         |nmx| nmx.0.clone(),
344 |     ///                         |nax, nmx| (nax.0.clone(), nax.1, nmx.1),
345 |     ///                         1);
346 |     /// assert_eq!(joined.run(&GreedyScheduler::new()),
347 |     ///            Some(vec![("Leah".into(), ("Leah".into(), 12, 20.50))]));
348 |     /// ```
349 |     pub fn join_on<
350 |         K: Any + Sync + Send + Clone + Hash + Eq + Serialize + for<'de> Deserialize<'de>,
351 |         B: Any + Sync + Send + Clone + Serialize + for<'de> Deserialize<'de>,
352 |         C: Any + Sync + Send + Clone + Serialize,
353 |         KF1: 'static + Sync + Send + Clone + Fn(&A) -> K,
354 |         KF2: 'static + Sync + Send + Clone + Fn(&B) -> K,
355 |         J: 'static + Sync + Send + Clone + Fn(&A, &B) -> C,
356 |     >(
357 |         &self,
358 |         other: &DiskCollection,
359 |         key1: KF1,
360 |         key2: KF2,
361 |         joiner: J,
362 |         partitions: usize,
363 |     ) -> DiskCollection<(K,C)> {
364 |         // Group each by a common key
365 |         let p1 = self.map(move |x| (key1(x), x.clone()))
366 |             .partition_by_key(partitions, |x| x.0.clone());
367 |         let p2 = other.map(move |x| (key2(x), x.clone()))
368 |             .partition_by_key(partitions, |x| x.0.clone());
369 | 
370 |         let mut new_parts = Vec::with_capacity(p1.partitions.len());
371 |         for (l, r) in p1.partitions.iter().zip(p2.partitions.iter()) {
372 |             let acc = Arc::new(FileStore::empty(self.path.clone()));
373 |             new_parts.push(jok(l, r, acc, joiner.clone()));
374 |         }
375 | 
376 |         self.from_defs(new_parts)
377 |     }
378 | 
379 |     /// Executes the Collection, returning the result of the computation
380 |     pub fn run(&self, s: &S) -> Option> {
381 |         let defs = batch_apply(&self.partitions, |_idx, vs| {
382 |             vs.stream().into_iter().collect::>()
383 |         });
384 |         let cat = tree_reduce(&defs, |x, y| {
385 |             let mut v1: Vec<_> = (*x).clone();
386 |             for yi in y {
387 |                 v1.push(yi.clone());
388 |             }
389 |             v1
390 |         });
391 |         cat.and_then(|x| x.run(s))
392 |     }
393 | 
394 |     /// Executes the Collection, returning the result of the computation
395 |     pub fn eval(&self) -> Option> {
396 |         self.run(&GreedyScheduler::new())
397 |     }
398 | 
399 | }
400 | 
401 | impl Deserialize<'de>> DiskCollection> {
402 |     /// Flattens a vector of values
403 |     /// ```rust
404 |     /// extern crate tange;
405 |     /// extern crate tange_collection;
406 |     /// use tange::scheduler::GreedyScheduler;
407 |     /// use tange_collection::collection::disk::DiskCollection;
408 |     ///
409 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![vec![1usize,2],vec![3,4]]);
410 |     /// let flattened = col.flatten();
411 |     /// assert_eq!(flattened.run(&GreedyScheduler::new()), Some(vec![1, 2, 3, 4]));
412 |     /// ```
413 |     pub fn flatten(&self) -> DiskCollection {
414 |         self.emit(move |x, emitter| {
415 |             for xi in x {
416 |                 emitter(xi.clone());
417 |             }
418 |         })
419 |     }
420 | }
421 | 
422 | impl Deserialize<'de>> DiskCollection {
423 |     /// Returns the number of items in the collection
424 |     /// ```rust
425 |     /// extern crate tange;
426 |     /// extern crate tange_collection;
427 |     /// use tange::scheduler::GreedyScheduler;
428 |     /// use tange_collection::collection::disk::DiskCollection;
429 |     ///
430 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![vec![1usize,2],vec![3,4]]);
431 |     /// assert_eq!(col.count().run(&GreedyScheduler::new()), Some(vec![2]));
432 |     /// let flattened = col.flatten();
433 |     /// assert_eq!(flattened.count().run(&GreedyScheduler::new()), Some(vec![4]));
434 |     /// ```
435 |     pub fn count(&self) -> DiskCollection {
436 |         let nps = batch_apply(&self.partitions, |_idx, vs| {
437 |             vs.stream().into_iter().map(|_| 1usize).sum::()
438 |         });
439 |         let count = tree_reduce(&nps, |x, y| x + y).unwrap();
440 |         let acc = Arc::new(FileStore::empty(self.path.clone()));
441 |         let out = count.apply(move |x| {
442 |             acc.write_vec(vec![*x])
443 |         });
444 |         self.from_defs(vec![out])
445 |     }
446 | }
447 | 
448 | impl Deserialize<'de>> DiskCollection {
449 | 
450 |     /// Computes the frequencies of the items in the collection.
451 |     /// ```rust
452 |     /// extern crate tange;
453 |     /// extern crate tange_collection;
454 |     /// use tange::scheduler::GreedyScheduler;
455 |     /// use tange_collection::collection::disk::DiskCollection;
456 |     ///
457 |     /// let col = DiskCollection::from_vec("/tmp".into(), vec![1, 2, 1, 5, 1, 2]);
458 |     /// let freqs = col.frequencies(1).sort_by(|x| x.0);
459 |     /// assert_eq!(freqs.run(&GreedyScheduler::new()), Some(vec![(1, 3), (2, 2), (5, 1)]));
460 |     /// ```
461 |     pub fn frequencies(&self, partitions: usize) -> DiskCollection<(A, usize)> {
462 |         //self.partition(chunks, |x| x);
463 |         self.fold_by(|s| s.clone(),
464 |                      || 0usize,
465 |                      |acc, _l| *acc += 1,
466 |                      |x, y| *x += *y,
467 |                      partitions)
468 |     }
469 | }
470 | 
471 | // Writes out data
472 | impl DiskCollection {
473 |     /// Writes each record in a collection to disk, newline delimited.
474 |     /// DiskCollection will create a new file within the path for each partition written.
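    /// A sketch of expected usage (the path is illustrative):
    /// ```rust
    /// extern crate tange;
    /// extern crate tange_collection;
    /// use tange::scheduler::GreedyScheduler;
    /// use tange_collection::collection::disk::DiskCollection;
    ///
    /// let col = DiskCollection::from_vec("/tmp".into(), vec!["a".to_string(), "b".to_string()]);
    /// // `sink` returns the number of records written per partition.
    /// let written = col.sink("/tmp/tange-disk-sink-example");
    /// assert_eq!(written.run(&GreedyScheduler::new()), Some(vec![2]));
    /// ```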
475 | pub fn sink(&self, path: &str) -> DiskCollection { 476 | let acc = Arc::new(FileStore::empty(self.path.clone())); 477 | let p: Arc = Arc::new(path.to_owned()); 478 | let pats = batch_apply(&self.partitions, move |idx, vs| { 479 | let p2 = p.clone(); 480 | let local: &str = &p2; 481 | fs::create_dir_all(local) 482 | .expect("Welp, something went terribly wrong when creating directory"); 483 | 484 | let file = fs::File::create(&format!("{}/{}", local, idx)) 485 | .expect("Issues opening file!"); 486 | let mut bw = BufWriter::new(file); 487 | 488 | let mut size = 0usize; 489 | for line in vs.stream() { 490 | bw.write(line.as_bytes()).expect("Error writing out line"); 491 | bw.write(b"\n").expect("Error writing out line"); 492 | size += 1; 493 | } 494 | 495 | acc.write_vec(vec![size]) 496 | }); 497 | 498 | self.from_defs(pats) 499 | } 500 | } 501 | 502 | #[cfg(test)] 503 | mod test_lib { 504 | use super::*; 505 | use tange::scheduler::{GreedyScheduler,LeveledScheduler}; 506 | 507 | fn make_col() -> DiskCollection { 508 | DiskCollection::from_vec("/tmp".into(), vec![1,2,3,1,2usize]) 509 | } 510 | 511 | #[test] 512 | fn test_fold_by() { 513 | let col = make_col(); 514 | let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 1); 515 | let mut results = out.run(&LeveledScheduler).unwrap(); 516 | results.sort(); 517 | assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]); 518 | } 519 | 520 | #[test] 521 | fn test_fold_by_parts() { 522 | let col = make_col(); 523 | let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 2); 524 | assert_eq!(out.partitions.len(), 2); 525 | let mut results = out.run(&LeveledScheduler).unwrap(); 526 | results.sort(); 527 | assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]); 528 | } 529 | 530 | #[test] 531 | fn test_partition_by_key() { 532 | let col = make_col(); 533 | let computed = col.partition_by_key(2, |x| *x) 534 | .sort_by(|x| *x); 535 | assert_eq!(computed.partitions.len(), 2); 536 | let results = computed.run(&LeveledScheduler).unwrap(); 537 | assert_eq!(results, vec![2, 2, 3, 1, 1]); 538 | } 539 | 540 | #[test] 541 | fn test_partition() { 542 | let col = make_col(); 543 | let computed = col.partition(2, |_idx, x| x % 2) 544 | .sort_by(|x| *x); 545 | assert_eq!(computed.partitions.len(), 2); 546 | let results = computed.run(&GreedyScheduler::new()).unwrap(); 547 | assert_eq!(results, vec![2, 2, 1, 1, 3]); 548 | } 549 | 550 | #[test] 551 | fn test_count() { 552 | let col = make_col(); 553 | let results = col.split(3).count().run(&mut LeveledScheduler).unwrap(); 554 | assert_eq!(results, vec![5]); 555 | } 556 | 557 | #[test] 558 | fn test_join() { 559 | let col1 = make_col(); 560 | let col2 = DiskCollection::from_vec("/tmp".into(), 561 | vec![(2, 1.23f64), (3usize, 2.34)]); 562 | let out = col1.join_on(&col2, |x| *x, |y| y.0, |x, y| { 563 | (*x, y.1) 564 | }, 5).split(1).sort_by(|x| x.0); 565 | let results = out.run(&LeveledScheduler).unwrap(); 566 | let expected = vec![(2, (2, 1.23)), (2, (2, 1.23)), (3, (3, 2.34))]; 567 | assert_eq!(results, expected); 568 | } 569 | 570 | #[test] 571 | fn test_emit() { 572 | let results = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]) 573 | .emit(|num, emitter| { 574 | for i in 0..*num { 575 | emitter(i); 576 | } 577 | }) 578 | .sort_by(|x| *x) 579 | .run(&LeveledScheduler).unwrap(); 580 | let expected = vec![0, 0, 0, 1, 1, 2]; 581 | assert_eq!(results, expected); 582 | } 583 | 584 | #[test] 585 | fn test_sort() { 586 | let results = DiskCollection::from_vec("/tmp".into(), vec![1, 3, 
2usize]) 587 | .sort_by(|x| *x) 588 | .run(&LeveledScheduler).unwrap(); 589 | let expected = vec![1, 2, 3]; 590 | assert_eq!(results, expected); 591 | } 592 | 593 | } 594 | --------------------------------------------------------------------------------