├── .gitignore
├── tange-collection
    ├── .gitignore
    ├── Cargo.toml
    ├── src
    │   ├── collection
    │   │   ├── mod.rs
    │   │   ├── memory.rs
    │   │   └── disk.rs
    │   ├── utils.rs
    │   ├── lib.rs
    │   ├── partitioned.rs
    │   └── interfaces.rs
    └── README.md
├── tange-core
    ├── .gitignore
    ├── Cargo.toml
    ├── src
    │   ├── task.rs
    │   ├── lib.rs
    │   ├── graph.rs
    │   ├── deferred.rs
    │   └── scheduler.rs
    └── README.md
├── Cargo.toml
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | Cargo.lock
2 | target/
3 | 


--------------------------------------------------------------------------------
/tange-collection/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | 


--------------------------------------------------------------------------------
/tange-core/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | /target/
3 | **/*.rs.bk
4 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = [
3 |     "tange-core",
4 |     "tange-collection",
5 | ]
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Tange
2 | ---
3 | 
4 | Tange is a Task-based parallelization library written for Rust.  It currently comes with two flavors:
5 | 
6 | 1. [tange-core](tange-core): This contains the primitives for constructing and executing a task graphs
7 | 2. [tange-collection](tange-collection): This contains a higher level Dataflow interface for convenient munging of data.
8 | 


--------------------------------------------------------------------------------
/tange-core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "tange"
 3 | version = "0.1.1"
 4 | authors = ["Andrew Stanton <refefer@gmail.com>"]
 5 | description = "Scalable Task-based Parallelism Framework"
 6 | license = "Apache-2.0/MIT"
 7 | repository = "https://github.com/Refefer/tange/tree/master/tange-core"
 8 | readme = "README.md"
 9 | keywords = ["parallel", "thread", "concurrency", "dataflow", "performance"]
10 | categories = ["concurrency"]
11 | 
12 | [dependencies]
13 | log = "0.4"
14 | priority-queue = "0.5.1"
15 | jobpool = "0.3.8"
16 | num_cpus = "1.0"
17 | 
18 | [lib]
19 | name = "tange"
20 | path = "src/lib.rs"
21 | 


--------------------------------------------------------------------------------
/tange-collection/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "tange-collection"
 3 | version = "0.1.2"
 4 | authors = ["Andrew Stanton <refefer@gmail.com>"]
 5 | description = "Dataflow computation"
 6 | license = "Apache-2.0/MIT"
 7 | repository = "https://github.com/Refefer/tange/tree/master/tange-collection"
 8 | readme = "README.md"
 9 | keywords = ["parallel", "thread", "concurrency", "dataflow", "performance"]
10 | categories = ["concurrency", "algorithms"]
11 | 
12 | [dependencies]
13 | tange = { path = "../tange-core" }
14 | bincode = "1.0"
15 | serde = "1.0"
16 | serde_derive = "1.0"
17 | uuid = { version = "0.6", features = ["v4"] }
18 | snap = "0.2.5"
19 | 
20 | [lib]
21 | name = "tange_collection"
22 | path = "src/lib.rs"
23 | 


--------------------------------------------------------------------------------
/tange-collection/src/collection/mod.rs:
--------------------------------------------------------------------------------
 1 | //! Contains the two main primitives: MemoryCollection and DiskCollection
 2 | 
 3 | /// Defines MemoryCollection and assorted functions
 4 | pub mod memory;
 5 | 
 6 | /// Defines DiskCollection and assorted functions
 7 | pub mod disk;
 8 | 
 9 | use std::any::Any;
10 | 
11 | use tange::deferred::{Deferred, batch_apply};
12 | use interfaces::{Accumulator,ValueWriter,Stream};
13 | 
14 | fn emit<
15 |     A,
16 |     Col: Any + Send + Sync + Clone + Stream<A>,
17 |     B: Any + Send + Sync + Clone,
18 |     F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ()),
19 |     Acc: 'static + Accumulator<B>
20 | >(defs: &[Deferred<Col>], acc: Acc, f: F) -> Vec<Deferred<<<Acc as Accumulator<B>>::VW as ValueWriter<B>>::Out>> {
21 | 
22 |     batch_apply(&defs, move |_idx, vs| {
23 |         let mut out = acc.writer();
24 |         for v in vs.stream().into_iter() {
25 |             f(&v, &mut |r| out.add(r));
26 |         }
27 |         out.finish()
28 |     })
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/tange-core/src/task.rs:
--------------------------------------------------------------------------------
 1 | use std::any::Any;
 2 | use std::marker::PhantomData;
 3 | 
 4 | pub type BASS = Box<Any + Send + Sync>;
 5 | pub enum DynArgs<'a> {
 6 |     One(&'a BASS),
 7 |     Two(&'a BASS, &'a BASS)
 8 | }
 9 | 
10 | pub trait DynRun: Send + Sync {
11 |     fn eval(&self, val: DynArgs) -> Option<BASS>;
12 | }
13 | 
14 | pub struct DynFn<A,B,F: Fn(&A) -> B>(F,PhantomData<A>,PhantomData<B>);
15 | 
16 | impl <A,B,F: Fn(&A) -> B> DynFn<A,B,F> {
17 |     pub fn new(f: F) -> Self {
18 |         DynFn(f, PhantomData, PhantomData)
19 |     }
20 | }
21 | 
22 | impl <A: Any + Send + Sync, B: Any + Send + Sync, F: Send + Sync + Fn(&A) -> B> DynRun for DynFn<A,B,F> {
23 | 
24 |     fn eval(&self, val: DynArgs) -> Option<BASS> {
25 |         match val {
26 |             DynArgs::One(v) => v.downcast_ref::<A>().map(|a| {
27 |                 let b = self.0(a);
28 |                 let bx: BASS = Box::new(b);
29 |                 bx
30 |             }),
31 |             _ => None
32 |         }
33 |     }
34 | }
35 | 
36 | pub struct DynFn2<A,B,C,F: Fn(&A, &B) -> C>(F,PhantomData<A>,PhantomData<B>,PhantomData<C>);
37 | 
38 | impl <A,B,C,F: Fn(&A, &B) -> C> DynFn2<A,B,C,F> {
39 |     pub fn new(f: F) -> Self {
40 |         DynFn2(f, PhantomData, PhantomData, PhantomData)
41 |     }
42 | }
43 | 
44 | impl <A: Any + Send + Sync, B: Any + Send + Sync, C: Any + Send + Sync, F: Send + Sync + Fn(&A, &B) -> C> DynRun for DynFn2<A,B,C,F> {
45 | 
46 |     fn eval(&self, val: DynArgs) -> Option<BASS> {
47 |         match val {
48 |             DynArgs::Two(a, b) => {
49 |                 a.downcast_ref::<A>().and_then(|a| {
50 |                     b.downcast_ref::<B>().map(|b| {
51 |                         let c = self.0(a, b);
52 |                         let cx: BASS = Box::new(c);
53 |                         cx
54 |                     })
55 |                 })
56 |             },
57 |             _ => None
58 |         }
59 |     }
60 | }
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/tange-core/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! tange-core
 2 | //!
 3 | //! `tange-core` provides primitives for building and running task-based computations.
 4 | //!  
 5 | //! What is it?
 6 | //! ---
 7 | //! 
 8 | //! `Tange` is a framework that makes it easy to write defered, data parallel computations that are executed concurrently across a local machine.  It can scale up to millions of tasks per Graph and can be useful for a number of different applications:
 9 | //! 
10 | //! * Data processing.
11 | //! * All-Reduce operations.
12 | //! * Distributed machine learning algorithms.
13 | //! * General parallel computing.
14 | //! 
15 | //! How to Use It?
16 | //! ---
17 | //! 
18 | //! Tange defines a `Deferred` struct which represents a computation.  `Deferred` objects are accessed with three simple functions:
19 | //! 
20 | //! 1. `lift` - Lift takes a concrete value and lifts it into a Deferred object
21 | //! 2. `apply` - Apply applies a function to a Deferred, producing a new Deferred object.
22 | //! 3. `join` -  Join combines two Deferred objects with a joiner function, producing a new Deferred.
23 | //! 
24 | //! Example - Hello World!
25 | //! ---
26 | //! ```rust
27 | //! use tange::deferred::Deferred;
28 | //! use tange::scheduler::GreedyScheduler;
29 | //! 
30 | //! let hello = Deferred::lift("Hello".to_owned(), None);
31 | //! let world = Deferred::lift("World".to_owned(), None);
32 | //! let world_exclaim = world.apply(|w| format!("{}!", w));
33 | //! let hello_world = hello.join(&world_exclaim, |h, w| format!("{} {}", h, w));
34 | //! assert_eq!(hello_world.run(&GreedyScheduler::new()), Some("Hello World!".into()));
35 | //! ```
36 | //! 
37 | //! 
38 | //! 
39 | 
40 | #![warn(missing_docs)]
41 | 
42 | #[macro_use]
43 | extern crate log;
44 | 
45 | /// Contains Deferred primitive and function definitions
46 | pub mod deferred;
47 | 
48 | /// Contains Scheduler trait definition and implementations
49 | pub mod scheduler;
50 | 
51 | /// Internal Graph implementation
52 | mod graph;
53 | 
54 | /// Internal task definitions
55 | mod task;
56 | 
57 | 


--------------------------------------------------------------------------------
/tange-collection/src/utils.rs:
--------------------------------------------------------------------------------
 1 | //! Utilities for creating collections
 2 | use std::io::prelude::*;
 3 | use std::io::{SeekFrom,BufReader,Error};
 4 | use std::fs::{File,metadata};
 5 | 
 6 | use tange::deferred::{Deferred, batch_apply};
 7 | 
 8 | use collection::memory::MemoryCollection;
 9 | 
10 | #[derive(Clone)]
11 | struct Chunk { path: String, start: u64, end: u64 }
12 | 
13 | /// Reads a new-line delimited text file, creating a new partition every `chunk_size`
14 | pub fn read_text(path: &str, chunk_size: u64) -> Result<MemoryCollection<String>,Error> {
15 |     // Read the file size
16 |     let file_size = metadata(path)?.len();
17 |     let mut dfs = Vec::new();
18 |     let mut cur_offset = 0u64;
19 |     while cur_offset < file_size {
20 |         let chunk = Chunk {
21 |             path: path.into(),
22 |             start: cur_offset,
23 |             end: cur_offset + chunk_size
24 |         };
25 |         dfs.push(Deferred::lift(chunk, 
26 |                                 Some(&format!("File: {}, start: {}", path, cur_offset))));
27 |         cur_offset += chunk_size;
28 |     }
29 | 
30 |     Ok(MemoryCollection::from_defs(batch_apply(&dfs, read)))
31 | }
32 | 
33 | fn read(_idx: usize, chunk: &Chunk) -> Vec<String> {
34 |     let f = File::open(&chunk.path)
35 |         .expect("Error when opening file");
36 |     let mut reader = BufReader::new(f);
37 |     reader.seek(SeekFrom::Start(chunk.start))
38 |         .expect("Error when reading file!");
39 | 
40 |     let mut start = if chunk.start > 0 { 
41 |         // Skip first line, which is likely a partial line
42 |         let mut s = Vec::new();
43 |         let size = reader.read_until(b'\n', &mut s)
44 |             .expect("Error reading line from file!");
45 |         chunk.start + size as u64
46 |     } else {
47 |         0
48 |     };
49 | 
50 |     let total = chunk.end;
51 |     let mut lines = Vec::new();
52 |     loop {
53 |         let mut s = String::new();
54 |         match reader.read_line(&mut s) {
55 |             Ok(0) => break,
56 |             Ok(size) => {
57 |                 start += size as u64;
58 |                 s.shrink_to_fit();
59 |                 lines.push(s);
60 |             },
61 |             _ => break
62 |         };
63 |         if start > total { break; }
64 |     }
65 |     lines.shrink_to_fit();
66 |     lines
67 | }
68 | 


--------------------------------------------------------------------------------
/tange-core/src/graph.rs:
--------------------------------------------------------------------------------
 1 | //!
 2 | //! Graph definition libraries.  These are typically not used directly, instead accessed
 3 | //! via Deferred objects.
 4 | //!
 5 | use std::sync::Arc;
 6 | use std::sync::atomic::{AtomicUsize, ATOMIC_USIZE_INIT, Ordering};
 7 | 
 8 | use task::{BASS,DynRun};
 9 | 
10 | static GLOBAL_HANDLE_COUNT: AtomicUsize = ATOMIC_USIZE_INIT;
11 | 
12 | /// Interface for providing inputs into the graph, such as reading a file
13 | pub trait Input: Send + Sync {
14 |     fn read(&self) -> BASS;
15 | }
16 | 
17 | /// Unique values representing a task in a Graph.  Handles are globally unique and
18 | /// defined by order.
19 | #[derive(Debug,Clone,PartialEq,Eq,Hash)]
20 | pub struct Handle(String, usize);
21 | 
22 | impl Handle {
23 |     /// Creates a new handle.  
24 |     fn new(name: String) -> Self {
25 |         Handle(name, GLOBAL_HANDLE_COUNT.fetch_add(1, Ordering::SeqCst))
26 |     }
27 | }
28 | 
29 | /// ADT for handling either Tasks or reading data into the graph
30 | pub enum Task {
31 | 
32 |     /// Node which consumes down stream data to produce new data
33 |     Function(Box<DynRun>),
34 |     
35 |     /// Node which generates data
36 |     Input(Box<Input>)
37 | }
38 | 
39 | /// Holds references to the number of arguments to pass into a Task
40 | #[derive(Clone)]
41 | pub enum FnArgs {
42 | 
43 |     /// Single argument
44 |     Single(Arc<Graph>),
45 | 
46 |     /// Used for joining two separate task outputs
47 |     Join(Arc<Graph>, Arc<Graph>)
48 | }
49 | 
50 | /// Graphs contain the computational pieces needed to represent the data flow
51 | /// between multiple different tasks, their combination, and eventual output.
52 | #[derive(Clone)]
53 | pub struct Graph {
54 | 
55 |     /// Pointer to the underlying Handle representing this computation
56 |     pub handle: Arc<Handle>,
57 | 
58 |     /// Task to run
59 |     pub task: Arc<Task>,
60 | 
61 |     /// Arguments consumed by defined Task
62 |     pub args: Option<FnArgs>
63 | 
64 | }
65 | 
66 | impl Graph {
67 | 
68 |     /// Adds a new input into the Graph
69 |     pub fn create_input<I: Input + 'static>(input: I, name: &str) -> Arc<Graph> {
70 |         let i_name = format!("Input<name={}>", name);
71 |         let handle = Arc::new(Handle::new(i_name));
72 |         let inp = Arc::new(Task::Input(Box::new(input)));
73 |         Arc::new(Graph {
74 |             handle: handle,
75 |             task: inp,
76 |             args: None
77 |         })
78 |     }
79 | 
80 |     /// Adds a task to the dataset with the given inputs.  No effort is made to ensure the
81 |     /// handles exist within the graph.
82 |     pub fn create_task<D: 'static + DynRun>(inputs: FnArgs, t: D, name: &str) -> Arc<Graph> {
83 |         // Get new handle
84 |         let h_name = format!("Task<name={}>", name);
85 |         let handle = Arc::new(Handle::new(h_name));
86 |         let task = Arc::new(Task::Function(Box::new(t)));
87 |         Arc::new(Graph {
88 |             handle: handle,
89 |             task: task,
90 |             args: Some(inputs)
91 |         })
92 |     }
93 | 
94 | }
95 | 
96 | 


--------------------------------------------------------------------------------
/tange-collection/README.md:
--------------------------------------------------------------------------------
  1 | Tange-Collection
  2 | ---
  3 | Tange-Collection is a medium-level dataflow library for high speed data processing.
  4 | 
  5 | What is it?
  6 | ---
  7 | Tange-Collection provides dataflow operatores for quickly executing data processing tasks.  It uses task-based parallelization for construction of complex computation graphs, scalable to hundreds of millions of independent stages.
  8 | 
  9 | It was created to solve the same sort of processing tasks as Dask and Spark, with a higher
 10 | emphasis on batch processing rather than analytics.
 11 | 
 12 | API
 13 | ---
 14 | 
 15 | * [Overall](https://docs.rs/tange-collection/0.1.0/tange_collection/)
 16 | * [MemoryCollection](https://docs.rs/tange-collection/0.1.0/tange_collection/collection/memory/struct.MemoryCollection.html)
 17 | * [DiskCollection](https://docs.rs/tange-collection/0.1.0/tange_collection/collection/disk/struct.DiskCollection.html)
 18 | 
 19 | Example - Word Count
 20 | ---
 21 | 
 22 | ```rust
 23 | extern crate tange;
 24 | extern crate tange_collection;
 25 | 
 26 | use tange::scheduler::GreedyScheduler;
 27 | use tange_collection::utils::read_text;
 28 | 
 29 | use std::env::args;
 30 | 
 31 | fn main() {
 32 |     let path = args().nth(1).unwrap();
 33 |     let col = read_text(&path, 4_000_000)
 34 |         .expect("File missing");
 35 | 
 36 |     let graph = col 
 37 |         .map(|line| line.split_whitespace().fold(0usize, |a,_x| a + 1)) 
 38 |         .fold_by(|_count| 1,
 39 |                  || 0usize,
 40 |                  |acc, c| { *acc += c },
 41 |                  |acc1, acc2| { *acc1 += acc2 },
 42 |                  1);
 43 |     
 44 |     if let Some(counts) = graph.run(&GreedyScheduler::new()) {
 45 |         println!("Counts: {:?}", counts);
 46 |     }   
 47 | }
 48 | ```
 49 | Example - IDF count
 50 | ---
 51 | ```rust
 52 | extern crate tange;
 53 | extern crate tange_collection;
 54 | 
 55 | use tange::scheduler::GreedyScheduler;
 56 | use tange_collection::utils::read_text;
 57 | 
 58 | use std::env::args;
 59 | use std::collections::HashSet;
 60 | 
 61 | fn main() {
 62 |     env_logger::init();
 63 |      
 64 |     let path = args().nth(1).unwrap();
 65 |     let col = read_text(&path, 64_000_000)
 66 |         .expect("File missing");
 67 | 
 68 |     let total_lines = col.count();
 69 |     let word_freq = col
 70 |         .emit_to_disk("/tmp".into(), |line, emitter| {
 71 |             let unique: HashSet<_> = line.split_whitespace().map(|p| p.to_lowercase()).collect();
 72 |             for word in unique {
 73 |                 emitter(word);
 74 |             }
 75 |         })
 76 |         .frequencies(16);
 77 | 
 78 |     // Cross product
 79 |     let idfs = total_lines.join_on(
 80 |             &word_freq.to_memory(),
 81 |             |_c| 1,
 82 |             |_wc| 1,
 83 |             |total, (word, count)| {
 84 |                 (word.clone(), (1f64 + (*total as f64 / *count as f64)).ln())
 85 |             },  
 86 |             1   
 87 |         )
 88 |         .map(|(_k, x)| x.clone())
 89 |         .sort_by(|(word, _count)| word.clone());
 90 | 
 91 |     if let Some(word_idf) = idfs.run(&GreedyScheduler::new()) {
 92 |         for (w, idf) in word_idf {
 93 |             println!("{}: {}", w, idf);
 94 |         }
 95 |     }
 96 | }
 97 | ```
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/tange-collection/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! Tange-Collection
  2 | //! ---
  3 | //! Tange-Collection is a medium-level dataflow library for high speed data processing.
  4 | //!
  5 | //! What is it?
  6 | //! ---
  7 | //! Tange-Collection provides dataflow operatores for quickly executing data processing tasks.  It uses task-based parallelization for construction of complex computation graphs, scalable to hundreds of millions of independent stages.
  8 | //! 
  9 | //! It was created to solve the same sort of processing tasks as Dask and Spark, with a higher
 10 | //! emphasis on batch processing rather than analytics.
 11 | //!
 12 | //! Example - Word Count
 13 | //! ---
 14 | //!
 15 | //! ```ignore
 16 | //! extern crate tange;
 17 | //! extern crate tange_collection;
 18 | //! 
 19 | //! use tange::scheduler::GreedyScheduler;
 20 | //! use tange_collection::utils::read_text;
 21 | //! 
 22 | //! use std::env::args;
 23 | //! 
 24 | //! fn main() {
 25 | //!     let path = args().nth(1).unwrap();
 26 | //!     let col = read_text(&path, 4_000_000)
 27 | //!         .expect("File missing");
 28 | //! 
 29 | //!     let graph = col 
 30 | //!         .map(|line| line.split_whitespace().fold(0usize, |a,_x| a + 1)) 
 31 | //!         .fold_by(|_count| 1,
 32 | //!                  || 0usize,
 33 | //!                  |acc, c| { *acc += c },
 34 | //!                  |acc1, acc2| { *acc1 += acc2 },
 35 | //!                  1);
 36 | //!     
 37 | //!     if let Some(counts) = graph.run(&GreedyScheduler::new()) {
 38 | //!         println!("Counts: {:?}", counts);
 39 | //!     }   
 40 | //! }
 41 | //! ```
 42 | //! Example - IDF count
 43 | //! ---
 44 | //! ```ignore
 45 | //! extern crate tange;
 46 | //! extern crate tange_collection;
 47 | //! 
 48 | //! use tange::scheduler::GreedyScheduler;
 49 | //! use tange_collection::utils::read_text;
 50 | //! 
 51 | //! use std::env::args;
 52 | //! use std::collections::HashSet;
 53 | //! 
 54 | //! fn main() {
 55 | //!      
 56 | //!     let path = args().nth(1).unwrap();
 57 | //!     let col = read_text(&path, 64_000_000)
 58 | //!         .expect("File missing");
 59 | //! 
 60 | //!     let total_lines = col.count();
 61 | //!     let word_freq = col
 62 | //!         .emit_to_disk("/tmp".into(), |line, emitter| {
 63 | //!             let unique: HashSet<_> = line.split_whitespace().map(|p| p.to_lowercase()).collect();
 64 | //!             for word in unique {
 65 | //!                 emitter(word);
 66 | //!             }
 67 | //!         })
 68 | //!         .frequencies(16);
 69 | //! 
 70 | //!     // Cross product
 71 | //!     let idfs = total_lines.join_on(
 72 | //!             &word_freq.to_memory(),
 73 | //!             |_c| 1,
 74 | //!             |_wc| 1,
 75 | //!             |total, (word, count)| {
 76 | //!                 (word.clone(), (1f64 + (*total as f64 / *count as f64)).ln())
 77 | //!             },  
 78 | //!             1   
 79 | //!         )
 80 | //!         .map(|(_k, x)| x.clone())
 81 | //!         .sort_by(|(word, _count)| word.clone());
 82 | //! 
 83 | //!     if let Some(word_idf) = idfs.run(&GreedyScheduler::new()) {
 84 | //!         for (w, idf) in word_idf {
 85 | //!             println!("{}: {}", w, idf);
 86 | //!         }
 87 | //!     }
 88 | //! }
 89 | //! ```
 90 | 
 91 | #![warn(missing_docs)]
 92 | 
 93 | extern crate tange;
 94 | 
 95 | /// Defines useful utilities, such as reading files
 96 | pub mod utils;
 97 | 
 98 | /// Describes basic interfaces for storing and consuming data
 99 | pub mod interfaces;
100 | 
101 | /// Defines the two major primitives: MemoryColleciton and DiskCollection
102 | pub mod collection;
103 | 
104 | mod partitioned;
105 | 
106 | 


--------------------------------------------------------------------------------
/tange-core/README.md:
--------------------------------------------------------------------------------
  1 | Tange
  2 | ===
  3 | 
  4 | A Task-based parallelization framework.
  5 | 
  6 | What is it?
  7 | ---
  8 | 
  9 | `Tange` is a framework that makes it easy to write defered, data parallel computations that are executed concurrently across a local machine.  It can scale up to millions of tasks per Graph and can be useful for a number of different applications:
 10 | 
 11 | * Data processing.
 12 | * All-Reduce operations.
 13 | * Distributed machine learning algorithms.
 14 | * General parallel computing.
 15 | 
 16 | How to Use It?
 17 | ---
 18 | 
 19 | Tange defines a `Deferred` struct which represents a computation.  `Deferred` objects are accessed with three simple functions:
 20 | 
 21 | 1. `lift` - Lift takes a concrete value and lifts it into a Deferred object
 22 | 2. `apply` - Apply applies a function to a Deferred, producing a new Deferred object.
 23 | 3. `join` -  Join combines two Deferred objects with a joiner function, producing a new Deferred.
 24 | 
 25 | Example - Hello World
 26 | ---
 27 | 
 28 | ```rust
 29 | use tange::deferred::Deferred;
 30 | use tange::scheduler::GreedyScheduler;
 31 | 
 32 | // Create two Deferred object
 33 | let hello = Deferred::lift("Hello".to_owned(), None);
 34 | let world = Deferred::lift("World".to_owned(), None);
 35 | 
 36 | // Add an exclamation mark to "World"
 37 | let world_exclaim = world.apply(|w| format!("{}!", w));
 38 | 
 39 | // Join the words!
 40 | let hello_world = hello.join(&world_exclaim, |h, w| format!("{} {}", h, w));
 41 | 
 42 | assert_eq!(hello_world.run(&GreedyScheduler::new()), Some("Hello World!".into()));
 43 | ```
 44 | 
 45 | Example
 46 | ---
 47 | 
 48 | Let's count all the words across a directory.
 49 | 
 50 | ```rust
 51 | extern crate tange;
 52 | 
 53 | use tange::scheduler::GreedyScheduler;
 54 | use tange::deferred::{Deferred,batch_apply,tree_reduce};
 55 | 
 56 | use std::io::{BufReader,BufRead};
 57 | use std::env::args;
 58 | 
 59 | use std::io;
 60 | use std::fs::{File, read_dir};
 61 | use std::path::Path;
 62 | 
 63 | fn read_files(dir: &Path, buffer: &mut Vec<Deferred<String>>) -> io::Result<()> {
 64 |     if dir.is_dir() {
 65 |         for entry in read_dir(dir)? {
 66 |             let entry = entry?;
 67 |             let path = entry.path();
 68 |             if path.is_dir() {
 69 |                 read_files(&path, buffer)?;
 70 |             } else {
 71 |                 let p = path.to_string_lossy().into_owned();
 72 |                 buffer.push(Deferred::lift(p, None));
 73 |             }
 74 |         }
 75 |     }
 76 |     Ok(())
 77 | }
 78 | 
 79 | fn main() {
 80 |     let mut defs = Vec::new();
 81 |     for path in args().skip(1) {
 82 |         read_files(&Path::new(&path), &mut defs).expect("Error reading directory!");
 83 |     }
 84 | 
 85 |     if defs.len() == 0 {
 86 |         panic!("No files to count!");
 87 |     }
 88 | 
 89 |     // Read a file and count the number of words, split by white space
 90 |     let counts = batch_apply(&defs, |_idx, fname| {
 91 |         let mut count = 0usize;
 92 |         if let Ok(f) = File::open(&fname) {
 93 |             let mut br = BufReader::new(f);
 94 |             for maybe_line in br.lines() {
 95 |                 if let Ok(line) = maybe_line {
 96 |                     for p in line.split_whitespace() {
 97 |                         if p.len() > 0 {
 98 |                             count += 1;
 99 |                         }
100 |                     }
101 |                 } else {
102 |                     eprintln!("Error reading {}, skipping rest of file...", fname);
103 |                     break
104 |                 }
105 |             }
106 |         };
107 |         count
108 |     });
109 | 
110 |     // Sum the counts
111 |     let total = tree_reduce(&counts, |left, right| left + right)
112 |         .expect("Can't reduce if there are no files in the directory!");
113 | 
114 |     let count = total.run(&GreedyScheduler::new()).unwrap();
115 |     println!("Found {} words", count);
116 | }
117 | ```
118 | 


--------------------------------------------------------------------------------
/tange-collection/src/partitioned.rs:
--------------------------------------------------------------------------------
  1 | extern crate tange;
  2 | 
  3 | use std::any::Any;
  4 | use std::hash::{Hasher,Hash};
  5 | use std::collections::hash_map::DefaultHasher;
  6 | use std::collections::HashMap;
  7 | use std::sync::Arc;
  8 | 
  9 | use tange::deferred::{Deferred, batch_apply, tree_reduce};
 10 | use interfaces::*;
 11 | 
 12 | pub fn block_reduce<
 13 |     A,
 14 |     B,
 15 |     Col: Any + Sync + Send + Clone + Stream<A>,
 16 |     K: Any + Sync + Send + Clone + Hash + Eq,
 17 |     C: Any + Sync + Send + Clone,
 18 |     D: 'static + Sync + Send + Clone + Fn() -> B, 
 19 |     F: 'static + Sync + Send + Clone + Fn(&A) -> K, 
 20 |     O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (),
 21 |     M: 'static + Sync + Send + Clone + Fn(HashMap<K,B>) -> C,
 22 | >(
 23 |     defs: &[Deferred<Col>], 
 24 |     key: F, 
 25 |     default: D, 
 26 |     binop: O,
 27 |     map: M
 28 | ) -> Vec<Deferred<C>> {
 29 |     batch_apply(defs, move |_idx, vs| {
 30 |         let mut reducer = HashMap::new();
 31 |         for v in vs.stream().into_iter() {
 32 |             let k = key(&v);
 33 |             let e = reducer.entry(k).or_insert_with(&default);
 34 |             binop(e, &v);
 35 |         }
 36 |         map(reducer)
 37 |     })
 38 | }
 39 | 
 40 | pub fn split_by_key<
 41 |     Col: Any + Sync + Send + Clone + Accumulator<A> + Stream<A>,
 42 |     A: Clone,
 43 |     F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize
 44 | >(
 45 |     defs: &[Deferred<Col>], 
 46 |     partitions: usize, 
 47 |     hash_function: F
 48 | ) -> Vec<Vec<Deferred<Col>>> 
 49 |         where Col::VW: ValueWriter<A,Out=Col> {
 50 | 
 51 |     // Group into buckets 
 52 |     let stage1 = batch_apply(&defs, move |_idx, vs| {
 53 |         let mut parts: Vec<_> = (0..partitions).map(|_| vs.writer()).collect();
 54 |         for (idx, x) in vs.stream().into_iter().enumerate() {
 55 |             let p = hash_function(idx, &x) % partitions;
 56 |             parts[p].add(x.clone());
 57 |         }
 58 |         parts.into_iter().map(|x| x.finish()).collect::<Vec<_>>()
 59 |     });
 60 | 
 61 |     // For each partition in each chunk, pull out at index and regroup.
 62 |     // Tree reduce to concatenate
 63 |     let mut splits = Vec::with_capacity(partitions);
 64 |     for idx in 0usize..partitions {
 65 |         let mut partition = Vec::with_capacity(stage1.len());
 66 | 
 67 |         for s in stage1.iter() {
 68 |             partition.push(s.apply(move |parts| parts[idx].copy()));
 69 |         }
 70 |         splits.push(partition);
 71 |     }
 72 |     splits
 73 | }
 74 | 
 75 | pub fn partition<
 76 |     Col: Any + Sync + Send + Clone + Accumulator<A> + Stream<A>,
 77 |     A: Any + Send + Sync + Clone,
 78 |     F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize
 79 | >(
 80 |     defs: &[Deferred<Col>], 
 81 |     partitions: usize, 
 82 |     key: F
 83 | ) -> Vec<Deferred<Col>>
 84 |         where Col::VW: ValueWriter<A,Out=Col> {
 85 |     
 86 |     let groups = split_by_key(defs, partitions, key);
 87 |     
 88 |     let mut new_chunks = Vec::with_capacity(groups.len());
 89 |     for group in groups {
 90 |         if let Some(d) = concat(&group) {
 91 |             new_chunks.push(d);
 92 |         }
 93 |     }
 94 |     new_chunks
 95 | }
 96 | 
 97 | pub fn fold_by<
 98 |     A: Clone,
 99 |     C1: Any + Sync + Send + Clone + Accumulator<A> + Stream<A>,
100 |     B: Any + Sync + Send + Clone,
101 |     K: Any + Sync + Send + Clone + Hash + Eq,
102 |     D: 'static + Sync + Send + Clone + Fn() -> B, 
103 |     F: 'static + Sync + Send + Clone + Fn(&A) -> K, 
104 |     O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (),
105 |     R: 'static + Sync + Send + Clone + Fn(&mut B, &B) -> (),
106 |     Acc: 'static + Accumulator<(K, B)> + Stream<(K,B)>
107 | >(
108 |     defs: &[Deferred<C1>],
109 |     key: F, 
110 |     default: D, 
111 |     binop: O, 
112 |     reduce: R, 
113 |     acc: Acc,
114 |     partitions: usize
115 | ) -> Vec<Deferred<<<Acc as Accumulator<(K, B)>>::VW as ValueWriter<(K, B)>>::Out>>
116 |         where Acc::VW: ValueWriter<(K, B),Out=Acc> {
117 | 
118 |     let acc2 = Arc::new(acc);
119 |     let am = acc2.clone();
120 |     let stage1 = block_reduce(defs, key, default, binop, move |x| {
121 |         let mut out = am.writer();
122 |         out.extend(&mut x.into_iter());
123 |         out.finish()
124 |     });
125 | 
126 |     // Split into chunks
127 |     let chunks = partition_by_key::<Acc,_,_,_>(&stage1, partitions, |x| x.0.clone());
128 | 
129 |     // partition reduce
130 |     let am = acc2.clone();
131 |     let concat: Vec<_> = chunks.into_iter().map(move |chunk| {
132 |         let am = am.clone();
133 |         batch_apply(&chunk, move |_idx, vs| {
134 |             let mut hm = HashMap::new();
135 |             for (k, v) in vs.stream() {
136 |                 hm.insert(k, v);
137 |             }
138 |             let mut out = am.writer();
139 |             out.extend(&mut hm.into_iter());
140 |             out.finish()
141 |         })
142 |     }).collect();
143 | 
144 |     let mut reduction = Vec::new();
145 |     let rm = Arc::new(reduce);
146 |     for group in concat {
147 |         let amc = acc2.clone();
148 |         let ri = rm.clone();
149 | 
150 |         let out = tree_reduce(&group, move |left, right| {
151 |             let mut nl = HashMap::new();
152 |             for (k, v) in left.stream() {
153 |                 nl.insert(k, v);
154 |             }
155 |             for (k, v) in right.stream() {
156 |                 if !nl.contains_key(&k) {
157 |                     nl.insert(k, v);
158 |                 } else {
159 |                     nl.entry(k)
160 |                         .and_modify(|e| ri(e, &v))
161 |                         .or_insert_with(|| v); 
162 |                 }
163 |             }
164 |             let mut out = amc.writer();
165 | 
166 |             for item in nl.into_iter() {
167 |                 out.add(item);
168 |             }
169 |             out.finish()
170 |         });
171 |         reduction.push(out.unwrap());
172 |     }
173 |     reduction
174 | }
175 | 
176 | pub fn partition_by_key<
177 |     C: Any + Sync + Send + Clone + Accumulator<A> + Stream<A>,
178 |     A: Clone,
179 |     K: Any + Sync + Send + Clone + Hash + Eq,
180 |     F: 'static + Sync + Send + Clone + Fn(&A) -> K
181 | >(
182 |     defs: &[Deferred<C>], 
183 |     n_chunks: usize, 
184 |     key: F
185 | ) -> Vec<Vec<Deferred<C>>>
186 |         where C::VW: ValueWriter<A,Out=C> {
187 |     split_by_key(defs, n_chunks, move |_idx, v| {
188 |         let k = key(v);
189 |         let mut hasher = DefaultHasher::new();
190 |         k.hash(&mut hasher);
191 |         hasher.finish() as usize
192 |     })
193 | }
194 | 
195 | pub fn concat<
196 |     Col: Any + Sync + Send + Accumulator<A> + Stream<A>,
197 |     A: Clone,
198 | >(
199 |     defs: &[Deferred<Col>]
200 | ) -> Option<Deferred<Col>>
201 |         where  Col::VW: ValueWriter<A,Out=Col> {
202 | 
203 |     tree_reduce(&defs, |x, y| {
204 |         let mut out = x.writer();
205 |         for xi in x.stream() {
206 |             out.add(xi);
207 |         }
208 |         for yi in y.stream() {
209 |             out.add(yi);
210 |         }
211 |         out.finish()
212 |     })
213 | }
214 | 
215 | pub fn join_on_key<
216 |     A, 
217 |     B,
218 |     Col1: Any + Sync + Send + Clone + Stream<(K, A)>,
219 |     Col2: Any + Sync + Send + Clone + Stream<(K, B)>,
220 |     K: Any + Send + Sync + Clone + Hash + Eq,
221 |     C: Any + Sync + Send + Clone,
222 |     J: 'static + Sync + Send + Clone + Fn(&A, &B) -> C,
223 |     Acc: 'static + Accumulator<(K, C)>
224 | >(
225 |     d1: &Deferred<Col1>, 
226 |     d2: &Deferred<Col2>, 
227 |     acc: Acc,
228 |     joiner: J
229 | ) -> Deferred<<<Acc as Accumulator<(K, C)>>::VW as ValueWriter<(K, C)>>::Out> {
230 | 
231 |     d1.join(d2, move |left, right| {
232 |         // Slurp up left into a hashmap
233 |         let mut hm = HashMap::new();
234 |         for (k, lv) in left.stream() {
235 |             let e = hm.entry(k).or_insert_with(|| Vec::with_capacity(1)); 
236 |             e.push(lv);
237 |         }
238 |         let mut ret = acc.writer();
239 |         for (k, rv) in right.stream() {
240 |             if let Some(lvs) = hm.get(&k) {
241 |                 for lv in lvs.iter() {
242 |                     ret.add((k.clone(), joiner(&lv, &rv)))
243 |                 }
244 |             }
245 |         }
246 |         ret.finish()
247 |     })
248 | }
249 | 
250 | 


--------------------------------------------------------------------------------
/tange-collection/src/interfaces.rs:
--------------------------------------------------------------------------------
  1 | //! Defines the internal collections traits and objects..
  2 | extern crate serde;
  3 | extern crate bincode;
  4 | extern crate uuid;
  5 | extern crate snap;
  6 | 
  7 | use std::any::Any;
  8 | use std::fs::{File,remove_file,create_dir_all};
  9 | use std::io::{BufReader,BufWriter};
 10 | use std::marker::PhantomData;
 11 | use std::sync::Arc;
 12 | 
 13 | use self::snap::{Writer,Reader};
 14 | use self::serde::{Serialize,Deserialize};
 15 | use self::bincode::{serialize_into, deserialize_from,ErrorKind};
 16 | use self::uuid::Uuid;
 17 | 
 18 | /// Accumulators are object which can create 'Writers', using effectively the Builder
 19 | /// pattern
 20 | pub trait Accumulator<A>: Send + Sync + Clone  {
 21 | 
 22 |     /// ValueWriter created
 23 |     type VW: ValueWriter<A>;
 24 |     
 25 |     /// Create a new ValueWriter
 26 |     fn writer(&self) -> Self::VW;
 27 | 
 28 |     /// Convert a Vec into a ValueWriter output
 29 |     fn write_vec(&self, vs: Vec<A>) -> <<Self as Accumulator<A>>::VW as ValueWriter<A>>::Out {
 30 |         let mut out = self.writer();
 31 |         for a in vs {
 32 |             out.add(a)
 33 |         }
 34 |         out.finish()
 35 |     }
 36 | }
 37 | 
 38 | /// ValueWriters write Values into some internal state.  When finished, yields some
 39 | /// construct that 'contains' the output.
 40 | pub trait ValueWriter<A>: Sized {
 41 |     /// Value Store
 42 |     type Out: Accumulator<A>;
 43 | 
 44 |     /// Add an element to the ValueWriter
 45 |     fn add(&mut self, item: A) -> ();
 46 | 
 47 |     /// Writes an iterator to the ValueWriter
 48 |     fn extend<I: Iterator<Item=A>>(&mut self, i: &mut I) -> () {
 49 |         for item in i {
 50 |             self.add(item);
 51 |         }
 52 |     }
 53 | 
 54 |     /// Close the ValueWriter, returning the store
 55 |     fn finish(self) -> Self::Out;
 56 | }
 57 | 
 58 | /// Defines an Accumulator that writes values in memory, using Vec as the store.
 59 | #[derive(Clone)]
 60 | pub struct Memory;
 61 | 
 62 | impl <A: Any + Send + Sync + Clone> Accumulator<A> for Memory {
 63 |     type VW = Vec<A>;
 64 | 
 65 |     fn writer(&self) -> Self::VW {
 66 |         Vec::new()
 67 |     }
 68 | }
 69 | 
 70 | impl <A: Any + Send + Sync + Clone> Accumulator<A> for Vec<A> {
 71 |     type VW = Vec<A>;
 72 | 
 73 |     fn writer(&self) -> Self::VW {
 74 |         Vec::new()
 75 |     }
 76 | }
 77 | 
 78 | impl <A: Any + Send + Sync + Clone> ValueWriter<A> for Vec<A> {
 79 |     type Out = Vec<A>;
 80 | 
 81 |     fn add(&mut self, item: A) -> () {
 82 |         self.push(item);
 83 |     }
 84 | 
 85 |     fn finish(mut self) -> Self::Out {
 86 |         self.shrink_to_fit();
 87 |         self
 88 |     }
 89 | }
 90 | 
 91 | /// Uniform API for reading Values from a Store
 92 | pub trait Stream<A> {
 93 |     /// Iterator, yielding owned value
 94 |     type Iter: IntoIterator<Item=A>;
 95 | 
 96 |     /// Returns an iterator with owned values.
 97 |     fn stream(&self) -> Self::Iter;
 98 | 
 99 |     /// Returns a copy of the store.
100 |     fn copy(&self) -> Self;
101 | }
102 | 
103 | impl <A: Clone> Stream<A> for Vec<A> {
104 |     type Iter = Vec<A>;
105 | 
106 |     fn stream(&self) -> Self::Iter {
107 |         self.clone()
108 |     }
109 | 
110 |     fn copy(&self) -> Self {
111 |         self.clone()
112 |     }
113 | }
114 | 
115 | /// Writes values to a directory
116 | #[derive(Clone)]
117 | pub struct Disk(pub Arc<String>);
118 | 
119 | impl Disk {
120 |     /// Creates a new Disk object from a path
121 |     pub fn from_str(s: &str) -> Self {
122 |         Disk(Arc::new(s.to_owned()))
123 |     }
124 | }
125 | 
126 | /// An open buffer for writing records to disk
127 | pub struct DiskBuffer<A> {
128 |     root_path: Arc<String>, 
129 |     name: String,
130 |     pd: PhantomData<A>,
131 |     out: Writer<BufWriter<File>>
132 | }
133 | 
134 | impl <A> DiskBuffer<A> {
135 |     fn new(path: Arc<String>) -> Self {
136 |         let name = format!("{}/tange-{}", &path, Uuid::new_v4());
137 |         {
138 |             let p: &str = &path;
139 |             create_dir_all(p).expect("Unable to create directory!");
140 |         }
141 |         let fd = File::create(&name).expect("Can't create file!");
142 |         let bw = BufWriter::new(fd);
143 |         let encoder = Writer::new(bw);
144 |         DiskBuffer { 
145 |             root_path: path, 
146 |             name: name, 
147 |             pd: PhantomData,
148 |             out: encoder
149 |         }
150 |     }
151 | }
152 | 
153 | /// Contains a root path for storing temporary files
154 | #[derive(Clone)]
155 | pub struct FileStore<A: Clone + Send + Sync> {
156 |     root_path: Arc<String>, 
157 |     name: Option<String>,
158 |     pd: PhantomData<A>
159 | }
160 | 
161 | impl <A: Clone + Send + Sync> FileStore<A> {
162 | 
163 |     /// Create an empty FileStore at the given path
164 |     pub fn empty(path: Arc<String>) -> Self {
165 |         FileStore {
166 |             root_path: path,
167 |             name: None,
168 |             pd: PhantomData
169 |         }
170 |     }
171 | }
172 | 
173 | // Delete the temporary file on disk when dropped
174 | impl <A: Clone + Send + Sync> Drop for FileStore<A> {
175 |     fn drop(&mut self) {
176 |         if let Some(ref name) = self.name {
177 |             if let Err(e) = remove_file(name) {
178 |                 eprintln!("Error Deleting {}: {:?}J", name, e);
179 |             }
180 |         }
181 |     }
182 | }
183 | 
184 | impl <A: Serialize + Clone + Send + Sync> Accumulator<A> for Disk {
185 |     type VW = DiskBuffer<A>;
186 | 
187 |     fn writer(&self) -> Self::VW {
188 |         DiskBuffer::new(self.0.clone())
189 |     }
190 | }
191 | 
192 | impl <A: Serialize + Clone + Send + Sync> Accumulator<A> for Arc<FileStore<A>> {
193 |     type VW = DiskBuffer<A>;
194 | 
195 |     fn writer(&self) -> Self::VW {
196 |         DiskBuffer::new(self.root_path.clone())
197 |     }
198 | }
199 | 
200 | impl <A: Serialize + Clone + Send + Sync> ValueWriter<A> for DiskBuffer<A> {
201 |     type Out = Arc<FileStore<A>>;
202 | 
203 |     fn add(&mut self, item: A) -> () {
204 |         serialize_into(&mut self.out, &item).expect("Couldn't write record!");
205 |     }
206 | 
207 |     fn finish(self) -> Self::Out {
208 |         Arc::new(FileStore { 
209 |             root_path: self.root_path.clone(), 
210 |             name: Some(self.name), 
211 |             pd: PhantomData
212 |         })
213 |     }
214 | }
215 | 
216 | 
217 | impl <A: Clone + Send + Sync + for<'de> Deserialize<'de>> Stream<A> for Arc<FileStore<A>> {
218 |     type Iter = RecordFile<A>;
219 | 
220 |     fn stream(&self) -> Self::Iter {
221 |         RecordFile(self.name.clone(), PhantomData)
222 |     }
223 | 
224 |     fn copy(&self) -> Self { self.clone() }
225 | }
226 | 
227 | /// Streams records from an optional File.  If the file is none, returns the Empty iterator
228 | pub struct RecordFile<A>(Option<String>, PhantomData<A>);
229 | 
230 | impl <A: Clone + Send + Sync + for<'de> Deserialize<'de>> IntoIterator for RecordFile<A> {
231 |     type Item = A;
232 |     type IntoIter = RecordStreamer<A>;
233 | 
234 |     fn into_iter(self) -> Self::IntoIter {
235 |         if let Some(ref n) = self.0 {
236 |             let fd = File::open(n).expect("File didn't exist on open!");
237 |             let brfd = BufReader::new(fd);
238 |             let decoder = Reader::new(brfd);
239 |             RecordStreamer(Some(decoder), PhantomData)
240 |         } else {
241 |             RecordStreamer(None, PhantomData)
242 |         }
243 |     }
244 | }
245 | 
246 | /// Stream Records from an open file
247 | pub struct RecordStreamer<A>(Option<Reader<BufReader<File>>>, PhantomData<A>);
248 | 
249 | impl <A: Clone + Send + Sync + for<'de> Deserialize<'de>> Iterator for RecordStreamer<A> {
250 |     type Item = A;
251 | 
252 |     fn next(&mut self) -> Option<Self::Item> {
253 |         if let Some(ref mut bw) = self.0 {
254 |             //deserialize_from(bw).expect("Failure on deserialization!")
255 |             match deserialize_from(bw) {
256 |                 Ok(record) => Some(record),
257 |                 Err(e) => {
258 |                     let ek: &ErrorKind = &e;
259 |                     match ek {
260 |                         &ErrorKind::DeserializeAnyNotSupported => {
261 |                             eprintln!("Bincode doesn't work with certain types!");
262 |                             panic!();
263 |                         },
264 |                         _ => None
265 |                     }
266 |                 }
267 |             }
268 |         } else {
269 |             None
270 |         }
271 |     }
272 | }
273 | 


--------------------------------------------------------------------------------
/tange-core/src/deferred.rs:
--------------------------------------------------------------------------------
  1 | //! Defines the Deferred primitive
  2 | //!
  3 | use std::marker::PhantomData;
  4 | use std::sync::Arc;
  5 | use std::any::Any;
  6 | 
  7 | use task::{DynFn,DynFn2,BASS};
  8 | use graph::*;
  9 | use scheduler::Scheduler;
 10 | 
 11 | struct Lift<A>(A);
 12 | 
 13 | impl <A: Any + Send + Sync + Clone> Input for Lift<A> {
 14 |     fn read(&self) -> BASS {
 15 |         Box::new(self.0.clone())
 16 |     }
 17 | }
 18 | 
 19 | /// A `Deferred` is the core struct defining how computations are composed
 20 | /// The type parameter indicates the type of data contained within the `Deferred`
 21 | #[derive(Clone)]
 22 | pub struct Deferred<A> {
 23 | 
 24 |     /// Dependency graph required to evaluate to the given A
 25 |     graph: Arc<Graph>,
 26 | 
 27 |     /// Phantom type for Any
 28 |     items: PhantomData<A>
 29 | }
 30 | 
 31 | impl <A: Any + Send + Sync> Deferred<A> {
 32 |     
 33 |     /// Applies a function to a Deferred, returning a new Deferred.  This is effectively
 34 |     /// a Functor.
 35 |     ///
 36 |     /// ```
 37 |     /// use tange::deferred::Deferred;
 38 |     /// use tange::scheduler::GreedyScheduler;
 39 |     ///
 40 |     /// let def = Deferred::lift(vec![1u8, 2, 3, 4], "Vector".into());
 41 |     /// let size = def.apply(|v| v.len());
 42 |     /// let results = size.run(&GreedyScheduler::new());
 43 |     /// assert_eq!(results, Some(4usize));
 44 |     /// ```
 45 |     ///
 46 |     pub fn apply<B: Any + Send + Sync, F: Send + Sync + 'static + Fn(&A) -> B>(&self, f: F) -> Deferred<B> {
 47 |         let ng = Graph::create_task(
 48 |             FnArgs::Single(self.graph.clone()), DynFn::new(f), "Apply");
 49 |         Deferred {
 50 |             graph: ng,
 51 |             items: PhantomData
 52 |         }
 53 | 
 54 |     }
 55 | 
 56 |     /// Joins two Deferred objects with a function, creating a new Deferred object.
 57 |     ///
 58 |     /// ```
 59 |     /// use tange::deferred::Deferred;
 60 |     /// use tange::scheduler::GreedyScheduler;
 61 |     ///
 62 |     /// let left  = Deferred::lift(vec![1f32, 2., 3., 4.], "Vector".into());
 63 |     /// let right  = Deferred::lift(10f32, "Num".into());
 64 |     /// let multiplied: Deferred<Vec<f32>> = left.join(&right, 
 65 |     ///        |l,r| l.iter().map(|x| x * r).collect());
 66 |     /// let results = multiplied.run(&GreedyScheduler::new());
 67 |     /// assert_eq!(results, Some(vec![10., 20., 30., 40.]));
 68 |     /// ```
 69 |     ///
 70 |     pub fn join<B: Any + Send + Sync, C: Any + Send + Sync, F: Send + Sync + 'static + Fn(&A, &B) -> C>(&self, other: &Deferred<B>, f: F) -> Deferred<C> {
 71 |         let ng = Graph::create_task(
 72 |             FnArgs::Join(self.graph.clone(), other.graph.clone()), 
 73 |             DynFn2::new(f), "Join");
 74 | 
 75 |         Deferred {
 76 |             graph: ng,
 77 |             items: PhantomData
 78 |         }
 79 | 
 80 |     }
 81 | }
 82 | 
 83 | impl <A: Any + Send + Sync + Clone> Deferred<A> {
 84 |     /// Lifts a value into a Deferred object.
 85 |     /// ```
 86 |     /// use tange::deferred::Deferred;
 87 |     /// use tange::scheduler::GreedyScheduler;
 88 |     ///
 89 |     /// let id = Deferred::lift("Some String".to_owned(), "String".into());
 90 |     /// assert_eq!(id.run(&GreedyScheduler::new()), Some("Some String".into()));
 91 |     /// 
 92 |     /// ```
 93 |     pub fn lift(a: A, name: Option<&str>) -> Self {
 94 |         let graph = Graph::create_input(Lift(a), name.unwrap_or("Input"));
 95 |         Deferred {
 96 |             graph: graph,
 97 |             items: PhantomData
 98 |         }
 99 |     }
100 | 
101 |     /// Evaluates the Deferred object and dependency graph, returning the result 
102 |     /// of the computation.  
103 |     /// 
104 |     /// ```
105 |     /// use tange::deferred::Deferred;
106 |     /// use tange::scheduler::GreedyScheduler;
107 |     ///
108 |     /// let a = Deferred::lift(1usize, "a".into());
109 |     /// let b = Deferred::lift(2usize, "b".into());
110 |     /// let c = a.join(&b, |x, y| x + y);
111 |     /// assert_eq!(c.run(&GreedyScheduler::new()), Some(3usize));
112 |     /// 
113 |     /// ```
114 | 
115 |     pub fn run<S: Scheduler>(&self, s: &S) -> Option<A> {
116 |         s.compute(self.graph.clone()).and_then(|v| { 
117 |             Arc::try_unwrap(v).ok().and_then(|ab| {
118 |                 ab.downcast_ref::<A>().map(|x| x.clone())
119 |             })
120 |         })
121 |     }
122 | }
123 | 
124 | /// `batch_apply` is a convenience method that takes a set of homogenous `Deferred`s
125 | /// and applies a function to each, returning a new set of `Deferred`s.  Unlike 
126 | /// `Deferred::apply`, `batch_apply` passes in an order index. 
127 | /// ```
128 | /// use tange::deferred::{Deferred, batch_apply};
129 | /// use tange::scheduler::GreedyScheduler;
130 | ///
131 | /// let vec: Vec<_> = (0usize..10)
132 | ///     .map(|v| Deferred::lift(v, None)).collect();
133 | /// let out = batch_apply(&vec, |idx, v| idx + v);
134 | /// assert_eq!(out[1].run(&GreedyScheduler::new()), Some(2));
135 | /// assert_eq!(out[5].run(&GreedyScheduler::new()), Some(10));
136 | /// ```
137 | ///
138 | pub fn batch_apply<
139 |     A: Any + Send + Sync + Clone, 
140 |     B: Any + Send + Sync, 
141 |     F: 'static + Sync + Send + Clone + Fn(usize, &A) -> B
142 |     >(defs: &[Deferred<A>], f: F) 
143 | -> Vec<Deferred<B>> {
144 |     let mut nps = Vec::with_capacity(defs.len());
145 |     let fa = Arc::new(f);
146 |     for (idx, p) in defs.iter().enumerate() {
147 |         let mf = fa.clone();
148 |         let np = p.apply(move |vs| { mf(idx, vs) }); 
149 |         nps.push(np);
150 |     }   
151 |     nps 
152 | }
153 | 
154 | /// Often times, we want to combine a set of Deferred objects into a single Deferred.
155 | /// `tree_reduce` combines pairs of Deferred recursively using `f`, building a dependency
156 | /// tree which attempts to maximize parallelism.
157 | /// ```
158 | /// use tange::deferred::{Deferred, tree_reduce};
159 | /// use tange::scheduler::LeveledScheduler;
160 | ///
161 | /// let vec: Vec<_> = (0usize..10)
162 | ///     .map(|v| Deferred::lift(v, None)).collect();
163 | /// let out = tree_reduce(&vec, |left, right| left + right).unwrap();
164 | /// let expected = (0usize..10).fold(0, |acc, x| acc + x);
165 | /// assert_eq!(out.run(&LeveledScheduler), Some(expected));
166 | /// ```
167 | pub fn tree_reduce<A: Any + Send + Sync + Clone, 
168 |                    F: 'static + Sync + Send + Clone + Fn(&A, &A) -> A
169 | >(
170 |     defs: &[Deferred<A>], 
171 |     f: F
172 | ) -> Option<Deferred<A>> {
173 |     tree_reduce_until(defs, 1, f).map(|mut defs| {
174 |         defs.remove(0)
175 |     })
176 | }
177 | 
178 | /// `tree_reduce_until` is similar to `tree_reduce` except that it will stop reducing
179 | /// when the number of `Deferred`s left is less than or equal to `parts`.
180 | ///
181 | /// ```
182 | /// use tange::deferred::{Deferred, tree_reduce_until};
183 | /// use tange::scheduler::GreedyScheduler;
184 | ///
185 | /// let vec: Vec<_> = (0usize..8)
186 | ///     .map(|v| Deferred::lift(v, None)).collect();
187 | /// let out = tree_reduce_until(&vec, 2, |left, right| left + right).unwrap();
188 | /// assert_eq!(out.len(), 2);
189 | /// assert_eq!(out[0].run(&GreedyScheduler::new()), Some(0+1+2+3));
190 | /// ```
191 | pub fn tree_reduce_until<A: Any + Send + Sync + Clone, 
192 |                    F: 'static + Sync + Send + Clone + Fn(&A, &A) -> A
193 | >(
194 |     defs: &[Deferred<A>], 
195 |     parts: usize, 
196 |     f: F
197 | ) -> Option<Vec<Deferred<A>>> {
198 |     if defs.len() == 0 {
199 |         None
200 |     } else if defs.len() <= parts {
201 |         Some(defs.clone().to_vec())
202 |     } else {
203 |         // First pass
204 |         let mut pass = Vec::new();
205 |         for i in (0..defs.len() - 1).step_by(2) {
206 |             pass.push(defs[i].join(&defs[i+1], f.clone()));
207 |         }
208 |         if defs.len() % 2 == 1 {
209 |             pass.push(defs[defs.len() - 1].clone());
210 |         }
211 |         tree_reduce_until(&pass, parts, f)
212 |     }
213 | }
214 | 
215 | #[cfg(test)]
216 | mod def_test {
217 |     use super::*;
218 |     use scheduler::{LeveledScheduler,GreedyScheduler};
219 | 
220 |     #[test]
221 |     fn test_tree_reduce() {
222 |         let v: Vec<_> = (0..999usize).into_iter()
223 |             .map(|x| Deferred::lift(x, None))
224 |             .map(|d| d.apply(|x| x + 1))
225 |             .collect();
226 | 
227 |         let res = (1..1000usize).sum();
228 | 
229 |         let agg = tree_reduce(&v, |x, y| x + y).unwrap();
230 |         let results = agg.run(&LeveledScheduler);
231 |         assert_eq!(results, Some(res));
232 |     }
233 | 
234 |     #[test]
235 |     fn test_tree_reduce_greedy() {
236 |         let v: Vec<_> = (0..2usize).into_iter()
237 |             .map(|x| Deferred::lift(x, None))
238 |             .collect();
239 | 
240 |         let res = (0..2usize).sum();
241 | 
242 |         let agg = tree_reduce(&v, |x, y| x + y).unwrap();
243 |         let results = agg.run(&GreedyScheduler::new());
244 |         assert_eq!(results, Some(res));
245 |     }
246 | 
247 | }
248 | 


--------------------------------------------------------------------------------
/tange-core/src/scheduler.rs:
--------------------------------------------------------------------------------
  1 | //! Contains all the runtimes scheduling Graphs for execution.
  2 | extern crate num_cpus;
  3 | extern crate log;
  4 | extern crate priority_queue;
  5 | extern crate jobpool;
  6 | 
  7 | use std::sync::{Mutex,Arc,mpsc};
  8 | use std::collections::{HashMap, HashSet};
  9 | use std::hash::Hash;
 10 | 
 11 | use log::Level::{Trace,Debug as LDebug};
 12 | use self::priority_queue::PriorityQueue;
 13 | use self::jobpool::JobPool;
 14 | 
 15 | use task::{BASS,DynArgs};
 16 | use graph::{Graph,Task,Handle,FnArgs};
 17 | 
 18 | type DepGraph = HashMap<Arc<Handle>, HashSet<Arc<Handle>>>; 
 19 | type ChainGraph = HashMap<Vec<Arc<Handle>>, HashSet<Arc<Handle>>>; 
 20 | 
 21 | // Keeps track of data that are needed by downstream computations
 22 | #[derive(Debug)]
 23 | struct DataStore<K: PartialEq + Hash + Eq, V> {
 24 |     data: HashMap<K, V>,
 25 |     counts: HashMap<K, usize>
 26 | }
 27 | 
 28 | impl <K: PartialEq + Hash + Eq, V: Clone> DataStore<K,V> {
 29 |     // Creates a new DataStore
 30 |     // `Counts` are the number of times a piece of data will be consumed.
 31 |     fn new(
 32 |         data: HashMap<K, V>, 
 33 |         counts: HashMap<K, usize>
 34 |     ) -> Self {
 35 |         DataStore {data: data, counts: counts}
 36 |     }
 37 | 
 38 |     // Gets a piece of data from the DataStore.  If the key doesn't exist,
 39 |     // returns None
 40 |     fn get(&mut self, handle: &K) -> Option<V> {
 41 |         let count = self.counts.get_mut(handle).map(|c| {
 42 |             *c -= 1;
 43 |             *c
 44 |         }).unwrap_or(0);
 45 | 
 46 |         if count == 0 {
 47 |             self.data.remove(handle)
 48 |         } else {
 49 |             self.data.get(handle).map(|x| x.clone())
 50 |         }
 51 |     }
 52 | 
 53 |     // Adds a key/value to the datastore.
 54 |     fn insert(&mut self, handle: K, data: V) {
 55 |         self.data.insert(handle, data);
 56 |     }
 57 | }
 58 | 
 59 | /// Defines the Scheduler object.  Schedulers take in Graphs and return the result
 60 | /// of their computation.
 61 | pub trait Scheduler {
 62 |     /// Compute the given Graph, returning the value.
 63 |     fn compute(&self, graph: Arc<Graph>) -> Option<Arc<BASS>>; 
 64 | }
 65 | 
 66 | enum Limbo {
 67 |     One(Arc<BASS>),
 68 |     Two(Arc<BASS>, Arc<BASS>)
 69 | }
 70 | 
 71 | struct DAG {
 72 |    
 73 |     /// Output handle to task
 74 |     pub tasks: HashMap<Arc<Handle>, Arc<Task>>,
 75 | 
 76 |     /// Dependencies between tasks
 77 |     pub dependencies: HashMap<Arc<Handle>, Option<FnArgs>>
 78 |  
 79 | }
 80 | 
 81 | impl DAG {
 82 |     /// Converts a Graph into a Directed Acyclic Graph.
 83 |     fn new(g: Arc<Graph>) -> Self {
 84 |         let mut tasks = HashMap::new();
 85 |         let mut dependencies = HashMap::new();
 86 | 
 87 |         let mut stack = vec![g];
 88 | 
 89 |         let mut hs = HashSet::new();
 90 | 
 91 |         while !stack.is_empty() {
 92 |             trace!("Stack size: {}", stack.len());
 93 |             let ag = stack.pop().unwrap();
 94 |             if !hs.contains(&ag.handle) {
 95 |                 hs.insert(ag.handle.clone());
 96 |                 tasks.insert(ag.handle.clone(), ag.task.clone());
 97 |                 dependencies.insert(ag.handle.clone(), ag.args.clone());
 98 |                 if let Some(ref fns) = ag.args {
 99 |                     match fns {
100 |                         FnArgs::Single(g) => stack.push(g.clone()),
101 |                         FnArgs::Join(g1, g2) => {
102 |                             stack.push(g1.clone());
103 |                             stack.push(g2.clone());
104 |                         }
105 |                     };
106 |                 }
107 |             }
108 |         }
109 |         DAG {
110 |             tasks: tasks,
111 |             dependencies: dependencies
112 |         }
113 |     }
114 | }
115 | 
116 | /// Reads out dependencies into a Limbo object, which is really just a simple Enum
117 | fn get_fnargs(ds: &mut DataStore<Arc<Handle>,Arc<BASS>>, fa: &FnArgs) -> Option<Limbo> {
118 |     match fa {
119 |         &FnArgs::Single(ref g) => {
120 |             ds.get(&g.handle).map(|args| {
121 |                 Limbo::One(args)
122 |             })
123 |         },
124 |         &FnArgs::Join(ref lg, ref rg) => {
125 |             ds.get(&lg.handle).and_then(|left| {
126 |                 ds.get(&rg.handle).map(|right| {
127 |                     Limbo::Two(left, right)
128 |                 })
129 |             })
130 |         }
131 |     }
132 | }
133 | 
134 | // Converts a flattened graph into a dependency list
135 | fn build_dep_graph(graph: &DAG) -> (DepGraph, DepGraph) {
136 |     // Build out dependencies
137 |     let mut inbound: DepGraph = HashMap::new();
138 |     let mut outbound: DepGraph = HashMap::new();
139 |     for (output, ref inputs) in graph.dependencies.iter() {
140 |         // Only track unique handles
141 |         let mut hs = HashSet::new();
142 |         if let Some(inp) = inputs {
143 |             let fna: &FnArgs = &inp;
144 |             match fna {
145 |                 &FnArgs::Single(ref h) => hs.insert(h.handle.clone()),
146 |                 &FnArgs::Join(ref h1, ref h2) => {
147 |                     hs.insert(h1.handle.clone());
148 |                     hs.insert(h2.handle.clone())
149 |                 },
150 |             };
151 |         }
152 |         // Add outbound
153 |         for h in hs.iter() {
154 |             let e = outbound.entry(h.clone()).or_insert_with(|| HashSet::with_capacity(1));
155 |             e.insert(output.clone());
156 |         }
157 |         inbound.insert(output.clone(), hs);
158 |     }
159 |     inbound.shrink_to_fit();
160 |     outbound.shrink_to_fit();
161 |     (inbound, outbound)
162 | }
163 | 
164 | // Constructs a set of nodes that have no dependencies between them
165 | fn generate_levels(collapsed: ChainGraph) -> Vec<Vec<Vec<Arc<Handle>>>> {
166 |     // Create outbound
167 |     let mut outbound = HashMap::new();
168 |     for (nodes, deps) in collapsed.iter() {
169 |         for d in deps.iter() {
170 |             let e = outbound.entry(d).or_insert_with(|| HashSet::with_capacity(1));
171 |             e.insert(nodes);
172 |         }
173 |     }
174 |     let mut inbound = collapsed.clone();
175 |     // Compute task levels
176 |     let mut levels = Vec::new();
177 |     let mut cur_level: Vec<Vec<Arc<Handle>>> = inbound.iter()
178 |             .filter(|(_, v)| v.is_empty())
179 |             .map(|(k, _)| k.clone())
180 |             .collect();
181 | 
182 |     loop {
183 |         
184 |         if cur_level.is_empty() {
185 |             break;
186 |         }
187 | 
188 |         // Remove nodes from graph
189 |         for handles in cur_level.iter() {
190 |             inbound.remove(handles);
191 |         }
192 | 
193 |         // Update dependencies
194 |         let mut next_level = Vec::new();
195 |         for hs in cur_level.iter() {
196 |             // Get outbound nodes
197 |             let last = &hs[hs.len() - 1];
198 |             if let Some(node_set) = outbound.get(last) {
199 |                 for node in node_set.iter() {
200 |                     if let Some(set) = inbound.get_mut(*node) {
201 |                         set.remove(last);
202 |                         if set.is_empty() {
203 |                             next_level.push((*node).clone());
204 |                         }
205 |                     }
206 |                 }
207 |             }
208 |         }
209 | 
210 |         levels.push(cur_level);
211 |         cur_level = next_level;
212 |     }
213 |     if log_enabled!(LDebug) {
214 |         let mut max_con = 0usize;
215 |         for (i, l) in levels.iter().enumerate() {
216 |             max_con = max_con.max(l.len());
217 |             debug!("Level: {}, Tasks: {}", i, l.len());
218 |         }
219 |         debug!("Max Concurrency: {}", max_con);
220 |     }
221 |     levels
222 | }
223 | 
224 | fn run_task(
225 |     graph: &DAG, 
226 |     chain: &[Arc<Handle>], 
227 |     dsam: Arc<Mutex<DataStore<Arc<Handle>, Arc<BASS>>>> 
228 | ) {
229 |     // Pull out arguments from the datasource
230 |     trace!("Reading dependencies for chain {:?}", chain[0]);
231 |     let ot = graph.dependencies.get(&chain[0]);
232 |     let mut largs = {
233 |         let ds: &mut DataStore<_,_> = &mut *dsam.lock().unwrap();
234 |         // Get inputs
235 |         match ot {
236 |             Some(Some(ar)) => get_fnargs(ds, &ar),
237 |             _              => None
238 |         }
239 |     };
240 | 
241 |     for handle in chain {
242 |         trace!("Processing handle: {:?}", handle);
243 |         let out = match graph.tasks.get(handle) {
244 |             Some(ref task) => {
245 |                 let task_ref: &Task = &task;
246 |                 match task_ref {
247 |                     Task::Input(ref input) => Some(input.read()),
248 |                     Task::Function(ref t) => {
249 |                         match largs {
250 |                             Some(Limbo::One(ref a)) => {
251 |                                 t.eval(DynArgs::One(a))
252 |                             },
253 |                             Some(Limbo::Two(ref a, ref b)) => {
254 |                                 t.eval(DynArgs::Two(a, b))
255 |                             },
256 |                             None => None
257 |                         }
258 |                     }
259 |                 }
260 |             },
261 |             None => None
262 |         };
263 |         if let Some(bass) = out {
264 |             largs = Some(Limbo::One(Arc::new(bass)));
265 |         }
266 |     }
267 | 
268 |     if let Some(Limbo::One(d)) = largs {
269 |         let mut ds = dsam.lock().unwrap();
270 |         ds.insert(chain[chain.len() - 1].clone(), d);
271 |     } 
272 | }
273 | 
274 | // Finds chains of tasks that can be collapsed into a single task.  While this isn't
275 | // strictly needed, both the LeveledScheduler and GreedyScheduler benefit from it in
276 | // different ways: 
277 | use std::fmt::Debug;
278 | fn collapse_graph<K: Hash + Eq + Debug + Clone>(
279 |     mut nodes: HashMap<K, HashSet<K>>
280 | ) -> HashMap<Vec<K>, HashSet<K>> {
281 | 
282 |     // Generate outbound edges
283 |     let mut outbound = HashMap::new();
284 |     let mut roots = Vec::new();
285 |     let mut inbound: HashMap<K, Vec<K>> = HashMap::new();
286 |     for (node, deps) in nodes.iter() {
287 |         if !outbound.contains_key(node) {
288 |             outbound.insert(node.clone(), Vec::new());
289 |         }
290 | 
291 |         for d in deps.iter() {
292 |             let e = outbound.entry(d.clone()).or_insert(Vec::new());
293 |             e.push(node.clone());
294 |         }
295 | 
296 |         if deps.is_empty() {
297 |             roots.push(vec![node.clone()]);
298 |         }
299 | 
300 |         inbound.insert(node.clone(), deps.iter().cloned().collect());
301 |     }
302 | 
303 |     let mut new_nodes = HashMap::new();
304 |     let mut seen = HashSet::new();
305 |     while !roots.is_empty() {
306 |         if let Some(mut chain) = roots.pop() {
307 |             let link = {
308 |                 let tail = &chain[chain.len() - 1];
309 | 
310 |                 // If outbound == 1 and that refernce only has one inbound
311 |                 if outbound[tail].len() == 1 && inbound[&outbound[tail][0]].len() == 1 {
312 |                     // We found a link in a chain
313 |                     // Add the node to the current list
314 |                     Some(outbound[tail][0].clone())
315 |                 } else {
316 |                     None
317 |                     // Our chain is finished, emit it
318 |                 }
319 |             };
320 | 
321 |             if let Some(node) = link {
322 |                 chain.push(node);
323 |                 roots.push(chain);
324 |             } else {
325 |                 // If current chain is ended, add the outbound nodes
326 |                 {
327 |                     let tail = &chain[chain.len() - 1];
328 |                     for node in outbound[tail].iter() {
329 |                         if !seen.contains(node) {
330 |                             roots.push(vec![node.clone()]);
331 |                             seen.insert(node.clone());
332 |                         }
333 |                     }
334 |                 }
335 |                 // Emit current chain
336 |                 let deps = nodes.remove(&chain[0]).unwrap();
337 |                 new_nodes.insert(chain, deps);
338 |             }
339 |         }
340 |     }
341 | 
342 |     new_nodes
343 | }
344 | 
345 | /// LeveledScheduler computes sets of mutually exclusive tasks that can be run
346 | /// concurrently.  Unlike GreedyScheduler, which will immediately consume the next
347 | /// available task regardless of level, LeveledScheduler will wait for an entire level
348 | /// to finish computation before moving to the next one.
349 | ///
350 | /// This has some small benefits when it to reproducibility: it natually is more
351 | /// deterministic than the GreedyScheduler, though potentially slower in some cases 
352 | /// a set of tasks on a level are slower.
353 | pub struct LeveledScheduler;
354 | 
355 | impl Scheduler for LeveledScheduler{
356 | 
357 |     fn compute(
358 |         &self, 
359 |         graph: Arc<Graph>
360 |     ) -> Option<Arc<BASS>> {
361 |         
362 |         let out_handle = graph.handle.clone();
363 |         let dag = Arc::new(DAG::new(graph));
364 |         debug!("Number of Tasks Specified: {}", dag.tasks.len());
365 | 
366 |         let (inbound, _outbound) = build_dep_graph(&dag);
367 | 
368 |         let collapsed = collapse_graph(inbound);
369 | 
370 |         debug!("Number of Tasks to Run: {}", collapsed.len());
371 |         
372 |         // Build the counts
373 |         let mut counts: HashMap<Arc<Handle>,_> = HashMap::new();
374 |         for (_k, vs) in collapsed.iter() {
375 |             for v in vs.iter() {
376 |                 let e = counts.entry(v.clone()).or_insert(0usize);
377 |                 *e += 1;
378 |             }
379 |         }
380 | 
381 |         // Build out the levels
382 |         let levels = generate_levels(collapsed);
383 |         
384 |         // Load up the inputs
385 |         let data: HashMap<Arc<Handle>,Arc<BASS>> = HashMap::new();
386 | 
387 |         // Add all handles
388 |         let raw_ds: DataStore<Arc<Handle>, Arc<BASS>> = DataStore::new(data, counts);
389 |         let dsam = Arc::new(Mutex::new(raw_ds));
390 | 
391 |         for (i, level) in levels.into_iter().enumerate() {
392 |             let mut pool = JobPool::new(num_cpus::get());
393 |             debug!("Running level: {}", i);
394 |             for chain in level {
395 |                 let g = dag.clone();
396 |                 let c = chain.clone();
397 |                 let d = dsam.clone();
398 |                 pool.queue(move || { run_task(&g, &c, d); });
399 |             }
400 | 
401 |             // block until all are done
402 |             pool.shutdown();
403 |         }
404 | 
405 |         debug!("Finished");
406 |         let ret = {
407 |             dsam.lock().unwrap().get(&out_handle)
408 |         };
409 |         ret
410 |     }
411 | }
412 | 
413 | /// GreedyScheduler is the recommend scheduler for Tange-Core.  After computing the DAG
414 | /// from the Graph, it uses a priority heap to determine which task to execute next,
415 | /// biasing toward reduction.  That is, joins are preferred over an apply since it reduces
416 | /// the number of thunks by one.  Inputs are preferred last.
417 | ///
418 | pub struct GreedyScheduler(usize);
419 | 
420 | impl GreedyScheduler {
421 | 
422 |     /// Creates a new GreedyScheduler with the default number of threads.
423 |     pub fn new() -> Self {
424 |         GreedyScheduler(num_cpus::get())
425 |     }
426 | 
427 |     /// Sets the number of threads to use.  By default, uses one thread per core.
428 |     pub fn set_threads(&mut self, n_threads: usize) -> () {
429 |          self.0 = n_threads;
430 |     }
431 | }
432 | 
433 | impl Scheduler for GreedyScheduler {
434 | 
435 |     fn compute(
436 |         &self, 
437 |         graph: Arc<Graph>
438 |     ) -> Option<Arc<BASS>> {
439 |         
440 |         let out_handle = graph.handle.clone();
441 | 
442 |         trace!("Building Dag...");
443 |         let dag = Arc::new(DAG::new(graph));
444 |         
445 |         debug!("Number of Tasks Specified: {}", dag.tasks.len());
446 | 
447 |         let (inbound, mut outbound) = build_dep_graph(&dag);
448 | 
449 |         let collapsed = collapse_graph(inbound);
450 | 
451 |         let total_jobs = collapsed.len();
452 |         debug!("Number of Tasks to Run: {}", total_jobs);
453 |         
454 |         // Build the counts
455 |         let mut counts: HashMap<Arc<Handle>,_> = HashMap::new();
456 |         let mut queue = PriorityQueue::new();
457 |         for (chain, deps) in collapsed.iter() {
458 |             // Add the inputs
459 |             if deps.len() == 0 {
460 |                 trace!("Adding intial chain: {:?}, Priority: {}", chain, 0usize);
461 |                 queue.push(chain.clone(), 0usize);
462 |             }
463 | 
464 |             for d in deps.iter() {
465 |                 let e = counts.entry(d.clone()).or_insert(0usize);
466 |                 *e += 1;
467 |             }
468 |         }
469 | 
470 |         // Make the graph a bit easier to work with
471 |         let mut head_map: HashMap<_,_> = collapsed.into_iter().map(|(chain, deps)| {
472 |             (chain[0].clone(), (chain, deps.len(), deps))
473 |         }).collect();
474 | 
475 |         // Load up the inputs
476 |         let data: HashMap<Arc<Handle>,Arc<BASS>> = HashMap::new();
477 | 
478 |         // Initialize an empty data store
479 |         let raw_ds: DataStore<Arc<Handle>, Arc<BASS>> = DataStore::new(data, counts);
480 |         let dsam = Arc::new(Mutex::new(raw_ds));
481 | 
482 |         // Start the loop!
483 | 
484 |         if log_enabled!(Trace) {
485 |             for (ref index, &(ref chain, ref _priority, ref deps)) in head_map.iter() {
486 |                 trace!("Index: {:?}, Chain: {:?}, Deps: {:?}", index, chain, deps);
487 |             }
488 |         }
489 |         debug!("Starting tasks...");
490 |         let mut jobs_done = 0usize;
491 |         {
492 |             let mut pool = JobPool::new(self.0);
493 |             let mut free_threads = self.0;
494 |             let (tx, rx) = mpsc::channel();
495 |             loop {
496 |                 // Queue up all free items
497 |                 while free_threads > 0 && !queue.is_empty(){
498 |                     if let Some((chain, priority)) = queue.pop() {
499 |                         trace!("Training chain: {:?}, Priority: {}", chain, priority);
500 |                         let g = dag.clone();
501 |                         let c = chain.clone();
502 |                         let d = dsam.clone();
503 |                         let thread_tx = tx.clone();
504 |                         pool.queue(move || {
505 |                             run_task(&g, &c, d);
506 |                             thread_tx.send(c[c.len() - 1].clone())
507 |                                 .expect("Error sending thread!");
508 |                         });
509 |                         free_threads -= 1;
510 |                     } 
511 |                 }
512 | 
513 |                 // Eat!
514 |                 let handle = rx.recv().unwrap(); 
515 |                 // Remove it as deps from remaining tasks
516 |                 trace!("{:?} finished", handle);
517 |                 free_threads += 1;
518 |                 if let Some(out) = outbound.remove(&handle) {
519 |                     for out_handle in out {
520 |                         trace!("Updating {:?}", out_handle);
521 |                         if let Some((chain, p, deps)) = head_map.get_mut(&out_handle) {
522 |                             trace!("Updating {:?}", out_handle);
523 |                             deps.remove(&handle);
524 |                             if deps.is_empty() {
525 |                                 trace!("Adding new chain: {:?}, Priority: {}", chain, p);
526 |                                 queue.push(chain.clone(), *p);
527 |                             } else {
528 |                                 trace!("Remaining Deps: {:?}", deps);
529 |                             }
530 |                         }
531 |                     }
532 |                 }
533 | 
534 |                 jobs_done += 1;
535 |                 if total_jobs > 10 && jobs_done % (total_jobs as f64 / 10.) as usize == 0 {
536 |                     debug!("Finished {}/{} of jobs", jobs_done, total_jobs);
537 |                     if log_enabled!(Trace) {
538 |                         let ds = dsam.lock().unwrap();
539 |                         trace!("Data Chunks in memory: {}", ds.data.len());
540 |                     }
541 | 
542 |                 }
543 |                 // Are we done yet?
544 |                 if free_threads == self.0 && queue.is_empty() {
545 |                     break
546 |                 }
547 |             }
548 |             pool.shutdown();
549 |         }
550 | 
551 |         if log_enabled!(Trace) {
552 |             let ds = dsam.lock().unwrap();
553 |             trace!("Still Holding data for:");
554 |             for (k, _v) in ds.data.iter() {
555 |                 trace!("- {:?}", k);
556 |             }
557 |         }
558 | 
559 |         debug!("Finished");
560 |         let ret = {
561 |             dsam.lock().unwrap().get(&out_handle)
562 |         };
563 |         ret
564 |     }
565 | }
566 | 
567 | #[cfg(test)]
568 | mod size_test {
569 |     use super::*;
570 | 
571 |     #[test]
572 |     fn test_graph_collapse() {
573 |         /*
574 |         1 -> 2 -> 3
575 |               \
576 |                4 -> 5
577 | 
578 |         We should collapse 1 -> 2 and 4 -> 5
579 |         */
580 |         let one_deps = HashSet::new();
581 |         let mut two_deps = HashSet::new();
582 |         two_deps.insert(1usize);
583 | 
584 |         let mut three_deps = HashSet::new();
585 |         three_deps.insert(2usize);
586 | 
587 |         let mut four_deps = HashSet::new();
588 |         four_deps.insert(2usize);
589 | 
590 |         let mut five_deps = HashSet::new();
591 |         five_deps.insert(4usize);
592 | 
593 |         let mut deps = HashMap::new();
594 |         deps.insert(1usize, one_deps);
595 |         deps.insert(2usize, two_deps);
596 |         deps.insert(3usize, three_deps);
597 |         deps.insert(4usize, four_deps);
598 |         deps.insert(5usize, five_deps);
599 | 
600 |         let out = collapse_graph(deps);
601 |         let mut res = HashMap::new();
602 |         res.insert(vec![1, 2], vec![].iter().cloned().collect());
603 |         res.insert(vec![3], vec![2].iter().cloned().collect());
604 |         res.insert(vec![4, 5], vec![2].iter().cloned().collect());
605 | 
606 |         assert_eq!(out, res);
607 |     }
608 | 
609 |     #[test]
610 |     fn test_graph_collapse_2() {
611 |         /*
612 |              2 -> 4
613 |             /     |
614 |            1 ---> 3
615 | 
616 |         */
617 |         let one_deps = HashSet::new();
618 |         let mut two_deps = HashSet::new();
619 |         two_deps.insert(1usize);
620 | 
621 |         let mut three_deps = HashSet::new();
622 |         three_deps.insert(1usize);
623 | 
624 |         let mut four_deps = HashSet::new();
625 |         four_deps.insert(2usize);
626 |         four_deps.insert(3usize);
627 | 
628 |         let mut deps = HashMap::new();
629 |         deps.insert(1usize, one_deps);
630 |         deps.insert(2usize, two_deps);
631 |         deps.insert(3usize, three_deps);
632 |         deps.insert(4usize, four_deps);
633 | 
634 |         let res = deps.clone().into_iter().map(|(k, v)| (vec![k], v)).collect();
635 |         let out = collapse_graph(deps);
636 | 
637 |         assert_eq!(out, res);
638 |     }
639 | 
640 | }
641 | 


--------------------------------------------------------------------------------
/tange-collection/src/collection/memory.rs:
--------------------------------------------------------------------------------
  1 | //! MemoryCollection
  2 | //! ---
  3 | //! MemoryCollection provides a variety of dataflow operators for consuming and mutating
  4 | //! data.  Unlike its Disk-based counterpart, DiskCollection, MemoryCollection keeps all
  5 | //! data in memory, maximizing speed.
  6 | //!
  7 | 
  8 | extern crate serde;
  9 | use std::fs;
 10 | use std::any::Any;
 11 | use std::io::prelude::*;
 12 | use std::io::BufWriter;
 13 | use std::hash::Hash;
 14 | use std::sync::Arc;
 15 | 
 16 | use self::serde::{Deserialize,Serialize};
 17 | 
 18 | use collection::disk::DiskCollection;
 19 | use tange::deferred::{Deferred, batch_apply, tree_reduce};
 20 | use tange::scheduler::{Scheduler,GreedyScheduler};
 21 | use partitioned::{join_on_key as jok, partition, partition_by_key, fold_by, concat};
 22 | use interfaces::{Memory,Disk};
 23 | use super::emit;
 24 | 
 25 | 
 26 | /// MemoryCollection struct
 27 | #[derive(Clone)]
 28 | pub struct MemoryCollection<A>  {
 29 |     partitions: Vec<Deferred<Vec<A>>>
 30 | }
 31 | 
 32 | impl <A: Any + Send + Sync + Clone> MemoryCollection<A> {
 33 | 
 34 |     /// Creates a MemoryCollection from a set of Deferred objects.
 35 |     pub fn from_defs(vs: Vec<Deferred<Vec<A>>>) -> MemoryCollection<A> {
 36 |         MemoryCollection {
 37 |             partitions: vs
 38 |         }
 39 |     }
 40 | 
 41 |     /// Provides raw access to the underlying Deferred objects
 42 |     pub fn to_defs(&self) -> &Vec<Deferred<Vec<A>>> {
 43 |         &self.partitions
 44 |     }
 45 | 
 46 |     /// Creates a new MemoryCollection from a Vec of items
 47 |     /// ```rust
 48 |     ///   extern crate tange;
 49 |     ///   extern crate tange_collection;
 50 |     ///   use tange::scheduler::GreedyScheduler;
 51 |     ///   use tange_collection::collection::memory::MemoryCollection;
 52 |     ///   
 53 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3usize]);
 54 |     ///   assert_eq!(col.run(&GreedyScheduler::new()), Some(vec![1,2,3usize]));
 55 |     /// ```
 56 |     pub fn from_vec(vs: Vec<A>) -> MemoryCollection<A> {
 57 |         MemoryCollection {
 58 |             partitions: vec![Deferred::lift(vs, None)],
 59 |         }
 60 |     }
 61 | 
 62 |     /// Returns the current number of data partitions 
 63 |     pub fn n_partitions(&self) -> usize {
 64 |         self.partitions.len()
 65 |     }
 66 | 
 67 |     /// Concatentates two collections into a single Collection
 68 |     /// ```rust
 69 |     ///   extern crate tange;
 70 |     ///   extern crate tange_collection;
 71 |     ///   use tange::scheduler::GreedyScheduler;
 72 |     ///   use tange_collection::collection::memory::MemoryCollection;
 73 |     ///   
 74 |     ///   let one = MemoryCollection::from_vec(vec![1,2,3usize]);
 75 |     ///   let two = MemoryCollection::from_vec(vec![4usize, 5, 6]);
 76 |     ///   let cat = one.concat(&two);
 77 |     ///   assert_eq!(cat.run(&GreedyScheduler::new()), Some(vec![1,2,3,4,5,6]));
 78 |     /// ```
 79 |     pub fn concat(&self, other: &MemoryCollection<A>) -> MemoryCollection<A> {
 80 |         let mut nps: Vec<_> = self.partitions.iter()
 81 |             .map(|p| (*p).clone()).collect();
 82 | 
 83 |         for p in other.partitions.iter() {
 84 |             nps.push(p.clone());
 85 |         }
 86 | 
 87 |         MemoryCollection { partitions: nps }
 88 |     }
 89 |     
 90 |     /// Maps a function over the values in the DiskCollection, returning a new DiskCollection
 91 |     /// ```rust
 92 |     ///   extern crate tange;
 93 |     ///   extern crate tange_collection;
 94 |     ///   use tange::scheduler::GreedyScheduler;
 95 |     ///   use tange_collection::collection::memory::MemoryCollection;
 96 |     ///   
 97 |     ///   let one = MemoryCollection::from_vec(vec![1,2,3usize]);
 98 |     ///   let strings = one.map(|i| format!("{}", i));
 99 |     ///   assert_eq!(strings.run(&GreedyScheduler::new()), 
100 |     ///     Some(vec!["1".into(),"2".into(),"3".into()]));
101 |     /// ```
102 |     pub fn map<
103 |         B: Any + Send + Sync + Clone, 
104 |         F: 'static + Sync + Send + Clone + Fn(&A) -> B
105 |     >(&self, f: F) -> MemoryCollection<B> {
106 |         self.emit(move |x, emitter| {
107 |             emitter(f(x))
108 |         })
109 |     }
110 | 
111 |     /// Filters out items in the collection that fail the predicate.
112 |     /// ```rust
113 |     ///   extern crate tange;
114 |     ///   extern crate tange_collection;
115 |     ///   use tange::scheduler::GreedyScheduler;
116 |     ///   use tange_collection::collection::memory::MemoryCollection;
117 |     ///   
118 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3usize]);
119 |     ///   let odds = col.filter(|x| x % 2 == 1);
120 |     ///   assert_eq!(odds.run(&GreedyScheduler::new()), 
121 |     ///     Some(vec![1, 3usize]));
122 |     /// ```
123 | 
124 |     pub fn filter<
125 |         F: 'static + Sync + Send + Clone + Fn(&A) -> bool
126 |     >(&self, f: F) -> MemoryCollection<A> {
127 |         self.emit(move |x, emitter| {
128 |             if f(x) { 
129 |                 emitter(x.clone())
130 |             }
131 |         })
132 |     }
133 |     
134 |     /// Re-partitions a collection by the number of provided chunks.  It uniformly distributes data from each old partition into each new partition.
135 |     /// ```rust
136 |     ///   extern crate tange;
137 |     ///   extern crate tange_collection;
138 |     ///   use tange::scheduler::GreedyScheduler;
139 |     ///   use tange_collection::collection::memory::MemoryCollection;
140 |     ///   
141 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3usize]);
142 |     ///   assert_eq!(col.n_partitions(), 1);
143 |     ///   let two = col.split(2);
144 |     ///   assert_eq!(two.n_partitions(), 2);
145 |     /// ```
146 |     pub fn split(&self, n_chunks: usize) -> MemoryCollection<A> {
147 |         self.partition(n_chunks, |idx, _k| idx)
148 |     }
149 | 
150 |     /// Maps over all items in a collection, optionally emitting new values.  It can be used
151 |     /// to efficiently fuse a number of map/filter/flat_map functions into a single method.
152 |     /// ```rust
153 |     ///   extern crate tange;
154 |     ///   extern crate tange_collection;
155 |     ///   use tange::scheduler::GreedyScheduler;
156 |     ///   use tange_collection::collection::memory::MemoryCollection;
157 |     ///   
158 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3usize]);
159 |     ///   let new = col.emit(|item, emitter| {
160 |     ///     if item % 2 == 0 {
161 |     ///         emitter(format!("{}!", item));
162 |     ///     }
163 |     ///   });
164 |     ///   assert_eq!(new.run(&GreedyScheduler::new()), Some(vec!["2!".into()]));
165 |     /// ```
166 | 
167 |     pub fn emit<
168 |         B: Any + Send + Sync + Clone,
169 |         F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ())
170 |     >(&self, f: F) -> MemoryCollection<B> {
171 |         let parts = emit(&self.partitions, Memory, f);
172 | 
173 |         MemoryCollection { partitions: parts }
174 |     }
175 | 
176 |     /// Maps over all items in a collection, emitting new values.  It can be used
177 |     /// to efficiently fuse a number of map/filter/flat_map functions into a single method.
178 |     /// `emit_to_disk` differs from the original `emit` by writing the emitted values directly
179 |     /// to disk, returning a DiskCollection instead of MemoryCollection.  This makes it convenient to switch to out-of-core when needed.
180 |     /// ```rust
181 |     ///   extern crate tange;
182 |     ///   extern crate tange_collection;
183 |     ///   use tange::scheduler::GreedyScheduler;
184 |     ///   use tange_collection::collection::memory::MemoryCollection;
185 |     ///   
186 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3usize]);
187 |     ///   let new = col.emit_to_disk("/tmp".into(), |item, emitter| {
188 |     ///     if item % 2 == 0 {
189 |     ///         emitter(format!("{}!", item));
190 |     ///     }
191 |     ///   });
192 |     ///   assert_eq!(new.run(&GreedyScheduler::new()), Some(vec!["2!".into()]));
193 |     /// ```
194 | 
195 |     pub fn emit_to_disk<
196 |         B: Any + Send + Sync + Clone + Serialize + for<'de>Deserialize<'de>,
197 |         F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ())
198 |     >(&self, path: String, f: F) -> DiskCollection<B> {
199 |         let parts = emit(&self.partitions, Disk::from_str(&path), f);
200 | 
201 |         DiskCollection::from_stores(path, parts)
202 |     }
203 | 
204 |     /// Re-partitions data into N new partitions by the given function.  The user provided
205 |     /// function is used as a hash function, mapping the returned value to a partition index.
206 |     /// This makes it useful for managing which partition data ends up!
207 |     /// ```rust
208 |     ///   extern crate tange;
209 |     ///   extern crate tange_collection;
210 |     ///   use tange::scheduler::GreedyScheduler;
211 |     ///   use tange_collection::collection::memory::MemoryCollection;
212 |     ///   
213 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3,4usize]);
214 |     ///   let new_col = col.partition(2, |idx, x| if *x < 3 { 1 } else { 2 });
215 |     ///   
216 |     ///   assert_eq!(new_col.n_partitions(), 2);
217 |     ///   assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![3, 4, 1, 2]));
218 |     /// ```
219 |     pub fn partition<
220 |         F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize
221 |     >(&self, partitions: usize, f: F) -> MemoryCollection<A> {
222 |         let new_chunks = partition(&self.partitions, 
223 |                                    partitions, 
224 |                                    f);
225 |         // Loop over each bucket
226 |         MemoryCollection { partitions: new_chunks }
227 |     }
228 | 
229 |     /// Folds and accumulates values across multiple partitions into K new partitions.
230 |     /// This is also known as a "group by" with a following reducer.
231 |     ///
232 |     /// MemoryCollection first performs a block aggregation: that is, it combines values
233 |     /// within each partition first using the `binop` function.  It then hashes
234 |     /// each key to a new partition index, where it will then aggregate all keys using the
235 |     /// `reduce` function.
236 |     ///
237 |     /// ```rust
238 |     ///   extern crate tange;
239 |     ///   extern crate tange_collection;
240 |     ///   use tange::scheduler::GreedyScheduler;
241 |     ///   use tange_collection::collection::memory::MemoryCollection;
242 |     ///   
243 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3,4,5usize]);
244 |     ///   // Sum all odds and evens together
245 |     ///   let group_sum = col.fold_by(|x| x % 2,
246 |     ///                               || 0usize,
247 |     ///                               |block_acc, item| {*block_acc += *item},
248 |     ///                               |part_acc1, part_acc2| {*part_acc1 += *part_acc2},
249 |     ///                               1)
250 |     ///                   .sort_by(|x| x.0);
251 |     ///   
252 |     ///   assert_eq!(group_sum.n_partitions(), 1);
253 |     ///   assert_eq!(group_sum.run(&GreedyScheduler::new()), Some(vec![(0, 6), (1, 9)]));
254 |     /// ```
255 | 
256 |     pub fn fold_by<K: Any + Sync + Send + Clone + Hash + Eq,
257 |                    B: Any + Sync + Send + Clone,
258 |                    D: 'static + Sync + Send + Clone + Fn() -> B, 
259 |                    F: 'static + Sync + Send + Clone + Fn(&A) -> K, 
260 |                    O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (),
261 |                    R: 'static + Sync + Send + Clone + Fn(&mut B, &B) -> ()>(
262 |         &self, key: F, default: D, binop: O, reduce: R, partitions: usize
263 |     ) -> MemoryCollection<(K,B)> {
264 |         let results = fold_by(&self.partitions, key, default, binop, 
265 |                               reduce, Vec::with_capacity(0), partitions);
266 |         MemoryCollection { partitions: results }
267 |     }
268 | 
269 |     /// Simple function to re-partition values by a given key.  The return key is hashed
270 |     /// and moduloed by the new partition count to determine where it will end up.
271 |     /// ```rust
272 |     ///   extern crate tange;
273 |     ///   extern crate tange_collection;
274 |     ///   use tange::scheduler::GreedyScheduler;
275 |     ///   use tange_collection::collection::memory::MemoryCollection;
276 |     ///   
277 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3,4usize]);
278 |     ///   let new_col = col.partition_by_key(2, |x| format!("{}", x));
279 |     ///   
280 |     ///   assert_eq!(new_col.n_partitions(), 2);
281 |     ///   assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 1, 2, 3]));
282 |     /// ```
283 |     pub fn partition_by_key<
284 |         K: Any + Sync + Send + Clone + Hash + Eq,
285 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
286 |     >(&self, n_chunks: usize, key: F) -> MemoryCollection<A> {
287 |         let results = partition_by_key(&self.partitions, n_chunks, key);
288 |         let groups = results.into_iter().map(|part| concat(&part).unwrap()).collect();
289 |         MemoryCollection {partitions: groups}
290 |     }
291 | 
292 |     /// Sorts values within each partition by a key function.  If a global sort is desired,
293 |     /// the collection needs to be re-partitioned into a single partition
294 |     /// ```rust
295 |     ///   extern crate tange;
296 |     ///   extern crate tange_collection;
297 |     ///   use tange::scheduler::GreedyScheduler;
298 |     ///   use tange_collection::collection::memory::MemoryCollection;
299 |     ///   
300 |     ///   let col = MemoryCollection::from_vec(vec![1,2,3,4i32]);
301 |     ///   let new_col = col.sort_by(|x| -*x);
302 |     ///   
303 |     ///   assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 3, 2, 1]));
304 |     /// ```
305 |     pub fn sort_by<
306 |         K: Ord,
307 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
308 |     >(&self, key: F) -> MemoryCollection<A> {
309 |         let nps = batch_apply(&self.partitions, move |_idx, vs| {
310 |             let mut v2: Vec<_> = vs.clone();
311 |             v2.sort_by_key(|v| key(v));
312 |             v2
313 |         });
314 |         MemoryCollection { partitions: nps }
315 |     }
316 | 
317 |     /// Inner Joins two collections by the provided key function.
318 |     /// If multiple values of the same key are found, they will be cross product for each
319 |     /// pair found.
320 |     /// ```rust
321 |     ///   extern crate tange;
322 |     ///   extern crate tange_collection;
323 |     ///   use tange::scheduler::GreedyScheduler;
324 |     ///   use tange_collection::collection::memory::MemoryCollection;
325 |     ///
326 |     ///   let name_age: Vec<(String,u32)> = vec![("Andrew".into(), 33), ("Leah".into(), 12)];
327 |     ///   let name_money: Vec<(String,f32)> = vec![("Leah".into(), 20.50)];
328 |     ///   
329 |     ///   let na = MemoryCollection::from_vec(name_age);
330 |     ///   let nm = MemoryCollection::from_vec(name_money);
331 |     ///   let joined = na.join_on(&nm,
332 |     ///                           |nax| nax.0.clone(),
333 |     ///                           |nmx| nmx.0.clone(),
334 |     ///                           |nax, nmx| (nax.0.clone(), nax.1, nmx.1),
335 |     ///                           1);
336 |     ///   assert_eq!(joined.run(&GreedyScheduler::new()), 
337 |     ///           Some(vec![("Leah".into(), ("Leah".into(), 12, 20.50))]));
338 |     /// ```
339 | 
340 |     pub fn join_on<
341 |         K: Any + Sync + Send + Clone + Hash + Eq,
342 |         B: Any + Sync + Send + Clone,
343 |         C: Any + Sync + Send + Clone,
344 |         KF1: 'static + Sync + Send + Clone + Fn(&A) -> K,
345 |         KF2: 'static + Sync + Send + Clone + Fn(&B) -> K,
346 |         J:   'static + Sync + Send + Clone + Fn(&A, &B) -> C,
347 |     >(
348 |         &self, 
349 |         other: &MemoryCollection<B>, 
350 |         key1: KF1, 
351 |         key2: KF2,
352 |         joiner: J,
353 |         partitions: usize, 
354 |     ) -> MemoryCollection<(K,C)> {
355 |         // Group each by a common key
356 |         let p1 = self.map(move |x| (key1(x), x.clone()))
357 |             .partition_by_key(partitions, |x| x.0.clone());
358 |         let p2 = other.map(move |x| (key2(x), x.clone()))
359 |            .partition_by_key(partitions, |x| x.0.clone());
360 | 
361 |         let mut new_parts = Vec::with_capacity(p1.partitions.len());
362 |         for (l, r) in p1.partitions.iter().zip(p2.partitions.iter()) {
363 |             new_parts.push(jok(l, r, Memory, joiner.clone()));
364 |         }
365 | 
366 |         MemoryCollection { partitions: new_parts }
367 |     }
368 | 
369 |     /// Executes the Collection, returning the result of the computation
370 |     pub fn run<S: Scheduler>(&self, s: &S) -> Option<Vec<A>> {
371 |         let cat = tree_reduce(&self.partitions, |x, y| {
372 |             let mut v1: Vec<_> = (*x).clone();
373 |             for yi in y {
374 |                 v1.push(yi.clone());
375 |             }
376 |             v1
377 |         });
378 |         cat.and_then(|x| x.run(s))
379 |     }
380 |     
381 |     /// Executes the Collection, returning the result of the computation
382 |     pub fn eval(&self) -> Option<Vec<A>> {
383 |         self.run(&GreedyScheduler::new())
384 |     }
385 | 
386 | }
387 | 
388 | impl <A: Any + Send + Sync + Clone> MemoryCollection<Vec<A>> {
389 | 
390 |     /// Flattens a vector of values
391 |     /// ```rust
392 |     ///   extern crate tange;
393 |     ///   extern crate tange_collection;
394 |     ///   use tange::scheduler::GreedyScheduler;
395 |     ///   use tange_collection::collection::memory::MemoryCollection;
396 |     ///   
397 |     ///   let col = MemoryCollection::from_vec(vec![vec![1usize,2],vec![3,4]]);
398 |     ///   let flattened = col.flatten();
399 |     ///   assert_eq!(flattened.run(&GreedyScheduler::new()), Some(vec![1, 2, 3, 4]));
400 |     /// ```
401 | 
402 |     pub fn flatten(&self) -> MemoryCollection<A> {
403 |         self.emit(move |x, emitter| {
404 |             for xi in x {
405 |                 emitter(xi.clone());
406 |             }
407 |         })
408 |     }
409 | }
410 | 
411 | impl <A: Any + Send + Sync + Clone> MemoryCollection<A> {
412 | 
413 |     /// Returns the number of items in the collection.
414 |     /// ```rust
415 |     ///   extern crate tange;
416 |     ///   extern crate tange_collection;
417 |     ///   use tange::scheduler::GreedyScheduler;
418 |     ///   use tange_collection::collection::memory::MemoryCollection;
419 |     ///   
420 |     ///   let col = MemoryCollection::from_vec(vec![vec![1usize,2],vec![3,4]]);
421 |     ///   assert_eq!(col.count().run(&GreedyScheduler::new()), Some(vec![2]));
422 |     ///   let flattened = col.flatten();
423 |     ///   assert_eq!(flattened.count().run(&GreedyScheduler::new()), Some(vec![4]));
424 |     /// ```
425 |     pub fn count(&self) -> MemoryCollection<usize> {
426 |         let nps = batch_apply(&self.partitions, |_idx, vs| vs.len());
427 |         let count = tree_reduce(&nps, |x, y| x + y).unwrap();
428 |         let out = count.apply(|x| vec![*x]);
429 |         MemoryCollection { partitions: vec![out] }
430 |     }
431 | }
432 | 
433 | impl <A: Any + Send + Sync + Clone + PartialEq + Hash + Eq> MemoryCollection<A> {
434 | 
435 |     /// Computes the frequencies of the items in collection.
436 |     /// ```rust
437 |     ///   extern crate tange;
438 |     ///   extern crate tange_collection;
439 |     ///   use tange::scheduler::GreedyScheduler;
440 |     ///   use tange_collection::collection::memory::MemoryCollection;
441 |     ///   
442 |     ///   let col = MemoryCollection::from_vec(vec![1, 2, 1, 5, 1, 2]);
443 |     ///   let freqs = col.frequencies(1).sort_by(|x| x.0);
444 |     ///   assert_eq!(freqs.run(&GreedyScheduler::new()), Some(vec![(1, 3), (2, 2), (5, 1)]));
445 |     /// ```
446 | pub fn frequencies(&self, partitions: usize) -> MemoryCollection<(A, usize)> {
447 |         //self.partition(chunks, |x| x);
448 |         self.fold_by(|s| s.clone(), 
449 |                      || 0usize, 
450 |                      |acc, _l| *acc += 1, 
451 |                      |x, y| *x += *y, 
452 |                      partitions)
453 |     }
454 | }
455 | 
456 | // Writes out data
457 | impl MemoryCollection<String> {
458 | 
459 |     /// Writes each record in a collection to disk, newline delimited.
460 |     /// MemoryCollection will create a new file within the path for each partition.
461 |     pub fn sink(&self, path: &str) -> MemoryCollection<usize> {
462 |         let p: Arc<String> = Arc::new(path.to_owned());
463 |         let pats = batch_apply(&self.partitions, move |idx, vs| {
464 |             let p2: Arc<String> = p.clone();
465 |             let local: &str = &p2;
466 |             fs::create_dir_all(local)
467 |                 .expect("Welp, something went terribly wrong when creating directory");
468 | 
469 |             let file = fs::File::create(&format!("{}/{}", local, idx))
470 |                 .expect("Issues opening file!");
471 |             let mut bw = BufWriter::new(file);
472 | 
473 |             let size = vs.len();
474 |             for line in vs {
475 |                 bw.write(line.as_bytes()).expect("Error writing out line");
476 |                 bw.write(b"\n").expect("Error writing out line");
477 |             }
478 | 
479 |             vec![size]
480 |         });
481 |         
482 |         MemoryCollection { partitions: pats }
483 |     }
484 | }
485 | 
486 | impl <A: Any + Send + Sync + Clone + Serialize + for<'de>Deserialize<'de>> MemoryCollection<A> {
487 | 
488 |     /// Copies the MemoryCollection to disk, returning a DiskCollection
489 |     pub fn to_disk(&self, path: String) -> DiskCollection<A> {
490 |         DiskCollection::from_memory(path, &self.partitions)
491 |     }
492 | }
493 | 
494 | #[cfg(test)]
495 | mod test_lib {
496 |     use super::*;
497 |     use tange::scheduler::LeveledScheduler;
498 | 
499 |     #[test]
500 |     fn test_fold_by() {
501 |         let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]);
502 |         let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 1);
503 |         let mut results = out.run(&mut LeveledScheduler).unwrap();
504 |         results.sort();
505 |         assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]);
506 |     }
507 | 
508 |     #[test]
509 |     fn test_fold_by_parts() {
510 |         let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]);
511 |         let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 2);
512 |         assert_eq!(out.partitions.len(), 2);
513 |         let mut results = out.run(&mut LeveledScheduler).unwrap();
514 |         results.sort();
515 |         assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]);
516 |     }
517 | 
518 |     #[test]
519 |     fn test_partition_by_key() {
520 |         let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]);
521 |         let computed = col.partition_by_key(2, |x| *x)
522 |             .sort_by(|x| *x);
523 |         assert_eq!(computed.partitions.len(), 2);
524 |         let results = computed.run(&mut LeveledScheduler).unwrap();
525 |         assert_eq!(results, vec![2, 2, 3, 1, 1]);
526 |     }
527 | 
528 |     #[test]
529 |     fn test_partition() {
530 |         let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]);
531 |         let computed = col.partition(2, |_idx, x| x % 2)
532 |             .sort_by(|x| *x);
533 |         assert_eq!(computed.partitions.len(), 2);
534 |         let results = computed.run(&mut LeveledScheduler).unwrap();
535 |         assert_eq!(results, vec![2, 2, 1, 1, 3]);
536 |     }
537 | 
538 |     #[test]
539 |     fn test_count() {
540 |         let col = MemoryCollection::from_vec(vec![1,2,3,1,2usize]);
541 |         let results = col.split(3).count().run(&mut LeveledScheduler).unwrap();
542 |         assert_eq!(results, vec![5]);
543 |     }
544 | 
545 |     #[test]
546 |     fn test_join() {
547 |         let col1 = MemoryCollection::from_vec(vec![1,2,3,1,2usize]);
548 |         let col2 = MemoryCollection::from_vec(
549 |             vec![(2, 1.23f64), (3usize, 2.34)]);
550 |         let out = col1.join_on(&col2, |x| *x, |y| y.0, |x, y| {
551 |             (*x, y.1)
552 |         }, 5).split(1).sort_by(|x| x.0);
553 |         let results = out.run(&mut LeveledScheduler).unwrap();
554 |         let expected = vec![(2, (2, 1.23)), (2, (2, 1.23)), (3, (3, 2.34))];
555 |         assert_eq!(results, expected);
556 |     }
557 | 
558 |     #[test]
559 |     fn test_emit() {
560 |         let results = MemoryCollection::from_vec(vec![1,2,3usize])
561 |             .emit(|num, emitter| {
562 |                 for i in 0..*num {
563 |                     emitter(i);
564 |                 }
565 |             })
566 |             .sort_by(|x| *x)
567 |             .run(&mut LeveledScheduler).unwrap();
568 |         let expected = vec![0, 0, 0, 1, 1, 2];
569 |         assert_eq!(results, expected);
570 |     }
571 | 
572 |     #[test]
573 |     fn test_sort() {
574 |         let results = MemoryCollection::from_vec(vec![1, 3, 2usize])
575 |             .sort_by(|x| *x)
576 |             .run(&mut LeveledScheduler).unwrap();
577 |         let expected = vec![1, 2, 3];
578 |         assert_eq!(results, expected);
579 |     }
580 | 
581 | }
582 | 


--------------------------------------------------------------------------------
/tange-collection/src/collection/disk.rs:
--------------------------------------------------------------------------------
  1 | //! Disk Collections
  2 | //! ---
  3 | //! This module defines the Dataflow interfaces for Out-Of-Core data processing.
  4 | //! `DiskCollection` is intended to be used for processing datasets that might not fit
  5 | //! in memory.
  6 | //!
  7 | //! All partitions are written to disk for every application, cleaning up the file when
  8 | //! finished.  This allows DiskCollection to only need the currently executing task in
  9 | //! in memory.  However, this also means there is going to be a fair amount of serialization/deserialization.
 10 | //! Under the surface, we use bincode to serialize dat quickly to minmize the penalty.
 11 | //!
 12 | 
 13 | extern crate serde;
 14 | use std::fs;
 15 | use std::any::Any;
 16 | use std::io::prelude::*;
 17 | use std::io::BufWriter;
 18 | use std::hash::Hash;
 19 | use std::sync::Arc;
 20 | 
 21 | use self::serde::Deserialize;
 22 | use self::serde::Serialize;
 23 | 
 24 | use tange::deferred::{Deferred, batch_apply, tree_reduce};
 25 | use tange::scheduler::{Scheduler,GreedyScheduler};
 26 | 
 27 | use collection::memory::MemoryCollection;
 28 | use partitioned::{join_on_key as jok, partition, partition_by_key, fold_by, concat};
 29 | use interfaces::*;
 30 | use super::emit;
 31 | 
 32 | 
 33 | /// DiskCollection struct.
 34 | #[derive(Clone)]
 35 | pub struct DiskCollection<A: Clone + Send + Sync>  {
 36 |     path: Arc<String>,
 37 |     partitions: Vec<Deferred<Arc<FileStore<A>>>>
 38 | }
 39 | 
 40 | impl <A: Any + Send + Sync + Clone + Serialize + for<'de>Deserialize<'de>> DiskCollection<A> {
 41 | 
 42 |     /// Create a new DiskCollection form a Vector of objects.
 43 |     /// ```rust
 44 |     ///   extern crate tange;
 45 |     ///   extern crate tange_collection;
 46 |     ///   use tange::scheduler::GreedyScheduler;
 47 |     ///   use tange_collection::collection::disk::DiskCollection;
 48 |     ///   
 49 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
 50 |     ///   assert_eq!(col.run(&GreedyScheduler::new()), Some(vec![1,2,3usize]));
 51 |     /// ```
 52 |     pub fn from_vec(path: String, vec: Vec<A>) -> DiskCollection<A> {
 53 |         MemoryCollection::from_vec(vec).to_disk(path)
 54 |     }
 55 | 
 56 |     /// Converts a collection of Deferred objects into a DiskCollection
 57 |     /// This is usually best used from the `MemoryCollection`
 58 |     pub fn from_memory(path: String, mc: &Vec<Deferred<Vec<A>>>) -> DiskCollection<A> {
 59 |         ::std::fs::create_dir_all(&path).expect("Unable to create directory!");
 60 |         let shared = Arc::new(path);
 61 |         let acc = Arc::new(FileStore::empty(shared.clone()));
 62 |         let defs = batch_apply(&mc, move |_idx, vs| {
 63 |             acc.write_vec(vs.clone())
 64 |         });
 65 |         DiskCollection { path: shared, partitions: defs }
 66 |     }
 67 | 
 68 |     /// Creats a DiskCollection for a set of FileStores.
 69 |     pub fn from_stores(path: String, fs: Vec<Deferred<Arc<FileStore<A>>>>) -> DiskCollection<A> {
 70 |         DiskCollection { path: Arc::new(path), partitions: fs }
 71 |     }
 72 | 
 73 |     /// Provides raw access to the underlying partitions
 74 |     pub fn to_defs(&self) -> &Vec<Deferred<Arc<FileStore<A>>>> {
 75 |         &self.partitions
 76 |     }
 77 | 
 78 |     /// Converts a DiskCollection to a MemoryCollection
 79 |     pub fn to_memory(&self) -> MemoryCollection<A> {
 80 |         let defs = batch_apply(&self.partitions, |_idx, vs| {
 81 |             vs.stream().into_iter().collect()
 82 |         });
 83 |         MemoryCollection::from_defs(defs)
 84 |     }
 85 | 
 86 |     /// Returns the current number of data partitions 
 87 |     pub fn n_partitions(&self) -> usize {
 88 |         self.partitions.len()
 89 |     }
 90 | 
 91 |     fn from_defs<B: Clone + Send + Sync>(&self, defs: Vec<Deferred<Arc<FileStore<B>>>>) -> DiskCollection<B> {
 92 |         DiskCollection { path: self.path.clone(), partitions: defs }
 93 |     }
 94 | 
 95 |     /// Concatentates two collections into a single Collection
 96 |     /// ```rust
 97 |     ///   extern crate tange;
 98 |     ///   extern crate tange_collection;
 99 |     ///   use tange::scheduler::GreedyScheduler;
100 |     ///   use tange_collection::collection::disk::DiskCollection;
101 |     ///   
102 |     ///   let one = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
103 |     ///   let two = DiskCollection::from_vec("/tmp".into(), vec![4usize, 5, 6]);
104 |     ///   let cat = one.concat(&two);
105 |     ///   assert_eq!(cat.run(&GreedyScheduler::new()), Some(vec![1,2,3,4,5,6]));
106 |     /// ```
107 |     pub fn concat(&self, other: &DiskCollection<A>) -> DiskCollection<A> {
108 |         let mut nps: Vec<_> = self.partitions.iter()
109 |             .map(|p| (*p).clone()).collect();
110 | 
111 |         for p in other.partitions.iter() {
112 |             nps.push(p.clone());
113 |         }
114 | 
115 |         self.from_defs(nps)
116 |     }
117 |     
118 |     /// Maps a function over the values in the DiskCollection, returning a new DiskCollection
119 |     /// ```rust
120 |     ///   extern crate tange;
121 |     ///   extern crate tange_collection;
122 |     ///   use tange::scheduler::GreedyScheduler;
123 |     ///   use tange_collection::collection::disk::DiskCollection;
124 |     ///   
125 |     ///   let one = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
126 |     ///   let strings = one.map(|i| format!("{}", i));
127 |     ///   assert_eq!(strings.run(&GreedyScheduler::new()), 
128 |     ///     Some(vec!["1".into(),"2".into(),"3".into()]));
129 |     /// ```
130 |     pub fn map<
131 |         B: Any + Send + Sync + Clone + Serialize, 
132 |         F: 'static + Sync + Send + Clone + Fn(&A) -> B
133 |     >(&self, f: F) -> DiskCollection<B> {
134 |         self.emit(move |x, emitter| {
135 |             emitter(f(x))
136 |         })
137 |     }
138 | 
139 |     /// Filters out items in the collection that fail the predicate.
140 |     /// ```rust
141 |     ///   extern crate tange;
142 |     ///   extern crate tange_collection;
143 |     ///   use tange::scheduler::GreedyScheduler;
144 |     ///   use tange_collection::collection::disk::DiskCollection;
145 |     ///   
146 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
147 |     ///   let odds = col.filter(|x| x % 2 == 1);
148 |     ///   assert_eq!(odds.run(&GreedyScheduler::new()), 
149 |     ///     Some(vec![1, 3usize]));
150 |     /// ```
151 | 
152 |     pub fn filter<
153 |         F: 'static + Sync + Send + Clone + Fn(&A) -> bool
154 |     >(&self, f: F) -> DiskCollection<A> {
155 |         self.emit(move |x, emitter| {
156 |             if f(x) { 
157 |                 emitter(x.clone())
158 |             }
159 |         })
160 |     }
161 |     
162 |     /// Re-partitions a collection by the number of provided chunks.  It uniformly distributes data from each old partition into each new partition.
163 |     /// ```rust
164 |     ///   extern crate tange;
165 |     ///   extern crate tange_collection;
166 |     ///   use tange::scheduler::GreedyScheduler;
167 |     ///   use tange_collection::collection::disk::DiskCollection;
168 |     ///   
169 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
170 |     ///   assert_eq!(col.n_partitions(), 1);
171 |     ///   let two = col.split(2);
172 |     ///   assert_eq!(two.n_partitions(), 2);
173 |     ///   let two = col.split(3);
174 |     ///   assert_eq!(two.n_partitions(), 3);
175 |     /// ```
176 | 
177 |     pub fn split(&self, n_chunks: usize) -> DiskCollection<A> {
178 |         self.partition(n_chunks, |idx, _k| idx)
179 |     }
180 | 
181 |     /// Maps over all items in a collection, optionally emitting new values.  It can be used
182 |     /// to efficiently fuse a number of map/filter/flat_map functions into a single method.
183 |     /// ```rust
184 |     ///   extern crate tange;
185 |     ///   extern crate tange_collection;
186 |     ///   use tange::scheduler::GreedyScheduler;
187 |     ///   use tange_collection::collection::disk::DiskCollection;
188 |     ///   
189 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize]);
190 |     ///   let new = col.emit(|item, emitter| {
191 |     ///     if item % 2 == 0 {
192 |     ///         emitter(format!("{}!", item));
193 |     ///     }
194 |     ///   });
195 |     ///   assert_eq!(new.run(&GreedyScheduler::new()), Some(vec!["2!".into()]));
196 |     /// ```
197 |     pub fn emit<
198 |         B: Any + Send + Sync + Clone + Serialize,
199 |         F: 'static + Sync + Send + Clone + Fn(&A, &mut FnMut(B) -> ())
200 |     >(&self, f: F) -> DiskCollection<B> {
201 | 
202 |         let parts = emit(&self.partitions, Disk(self.path.clone()), f);
203 | 
204 |         self.from_defs(parts)
205 |     }
206 | 
207 |     /// Re-partitions data into N new partitions by the given function.  The user provided
208 |     /// function is used as a hash function, mapping the returned value to a partition index.
209 |     /// This makes it useful for managing which partition data ends up!
210 |     /// ```rust
211 |     ///   extern crate tange;
212 |     ///   extern crate tange_collection;
213 |     ///   use tange::scheduler::GreedyScheduler;
214 |     ///   use tange_collection::collection::disk::DiskCollection;
215 |     ///   
216 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4usize]);
217 |     ///   let new_col = col.partition(2, |idx, x| if *x < 3 { 1 } else { 2 });
218 |     ///   
219 |     ///   assert_eq!(new_col.n_partitions(), 2);
220 |     ///   assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![3, 4, 1, 2]));
221 |     /// ```
222 | 
223 |     pub fn partition<
224 |         F: 'static + Sync + Send + Clone + Fn(usize, &A) -> usize
225 |     >(&self, partitions: usize, f: F) -> DiskCollection<A> {
226 |         let new_chunks = partition(&self.partitions, 
227 |                                    partitions, 
228 |                                    f);
229 |         // Loop over each bucket
230 |         self.from_defs(new_chunks)
231 |     }
232 | 
233 |     /// Folds and accumulates values across multiple partitions into K new partitions.
234 |     /// This is also known as a "group by" with a following reducer.
235 |     ///
236 |     /// DiskCollection first performs a block aggregation: that is, it combines values
237 |     /// within each partition first using the `binop` function.  It then hashes
238 |     /// each key to a new partition index, where it will then aggregate all keys using the
239 |     /// `reduce` function.
240 |     ///
241 |     /// ```rust
242 |     ///   extern crate tange;
243 |     ///   extern crate tange_collection;
244 |     ///   use tange::scheduler::GreedyScheduler;
245 |     ///   use tange_collection::collection::disk::DiskCollection;
246 |     ///   
247 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4,5usize]);
248 |     ///   // Sum all odds and evens together
249 |     ///   let group_sum = col.fold_by(|x| x % 2,
250 |     ///                               || 0usize,
251 |     ///                               |block_acc, item| {*block_acc += *item},
252 |     ///                               |part_acc1, part_acc2| {*part_acc1 += *part_acc2},
253 |     ///                               1)
254 |     ///                   .sort_by(|x| x.0);
255 |     ///   
256 |     ///   assert_eq!(group_sum.n_partitions(), 1);
257 |     ///   assert_eq!(group_sum.run(&GreedyScheduler::new()), Some(vec![(0, 6), (1, 9)]));
258 |     /// ```
259 | 
260 |     pub fn fold_by<K: Any + Sync + Send + Clone + Hash + Eq + Serialize + for<'de> Deserialize<'de>,
261 |                    B: Any + Sync + Send + Clone + Serialize + for<'de> Deserialize<'de>,
262 |                    D: 'static + Sync + Send + Clone + Fn() -> B,
263 |                    F: 'static + Sync + Send + Clone + Fn(&A) -> K, 
264 |                    O: 'static + Sync + Send + Clone + Fn(&mut B, &A) -> (),
265 |                    R: 'static + Sync + Send + Clone + Fn(&mut B, &B) -> ()>(
266 |         &self, key: F, default: D, binop: O, reduce: R, partitions: usize
267 |     ) -> DiskCollection<(K,B)> {
268 |         let fs = Arc::new(FileStore::empty(self.path.clone()));
269 |         let results = fold_by(&self.partitions, key, default, binop, 
270 |                               reduce, fs, partitions);
271 |         self.from_defs(results)
272 |     }
273 | 
274 |     /// Simple function to re-partition values by a given key.  The return key is hashed
275 |     /// and moduloed by the new partition count to determine where it will end up.
276 |     /// ```rust
277 |     ///   extern crate tange;
278 |     ///   extern crate tange_collection;
279 |     ///   use tange::scheduler::GreedyScheduler;
280 |     ///   use tange_collection::collection::disk::DiskCollection;
281 |     ///   
282 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4usize]);
283 |     ///   let new_col = col.partition_by_key(2, |x| format!("{}", x));
284 |     ///   
285 |     ///   assert_eq!(new_col.n_partitions(), 2);
286 |     ///   assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 1, 2, 3]));
287 |     /// ```
288 | 
289 |     pub fn partition_by_key<
290 |         K: Any + Sync + Send + Clone + Hash + Eq,
291 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
292 |     >(&self, n_chunks: usize, key: F) -> DiskCollection<A> {
293 |         let results = partition_by_key(&self.partitions, n_chunks, key);
294 |         let groups = results.into_iter().map(|part| concat(&part).unwrap()).collect();
295 |         self.from_defs(groups)
296 |     }
297 | 
298 |     /// Sorts values within each partition by a key function.  If a global sort is desired,
299 |     /// the collection needs to be re-partitioned into a single partition
300 |     /// ```rust
301 |     ///   extern crate tange;
302 |     ///   extern crate tange_collection;
303 |     ///   use tange::scheduler::GreedyScheduler;
304 |     ///   use tange_collection::collection::disk::DiskCollection;
305 |     ///   
306 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1,2,3,4i32]);
307 |     ///   let new_col = col.sort_by(|x| -*x);
308 |     ///   
309 |     ///   assert_eq!(new_col.run(&GreedyScheduler::new()), Some(vec![4, 3, 2, 1]));
310 |     /// ```
311 | pub fn sort_by<
312 |         K: Ord,
313 |         F: 'static + Sync + Send + Clone + Fn(&A) -> K
314 |     >(&self, key: F) -> DiskCollection<A> {
315 |         let acc = Arc::new(FileStore::empty(self.path.clone()));
316 |         let nps = batch_apply(&self.partitions, move |_idx, vs| {
317 |             let mut out = acc.writer();
318 |             let mut v2: Vec<_> = vs.stream().into_iter().collect();
319 |             v2.sort_by_key(|v| key(v));
320 |             for vi in v2 {
321 |                 out.add(vi);
322 |             }
323 |             out.finish()
324 |         });
325 |         self.from_defs(nps)
326 |     }
327 | 
328 |     /// Inner Joins two collections by the provided key function.
329 |     /// If multiple values of the same key are found, they will be cross product for each
330 |     /// pair found.
331 |     /// ```rust
332 |     ///   extern crate tange;
333 |     ///   extern crate tange_collection;
334 |     ///   use tange::scheduler::GreedyScheduler;
335 |     ///   use tange_collection::collection::disk::DiskCollection;
336 |     ///   let name_age: Vec<(String,u32)> = vec![("Andrew".into(), 33), ("Leah".into(), 12)];
337 |     ///   let name_money: Vec<(String,f32)> = vec![("Leah".into(), 20.50)];
338 |     ///   
339 |     ///   let na = DiskCollection::from_vec("/tmp".into(), name_age);
340 |     ///   let nm = DiskCollection::from_vec("/tmp".into(), name_money);
341 |     ///   let joined = na.join_on(&nm,
342 |     ///                           |nax| nax.0.clone(),
343 |     ///                           |nmx| nmx.0.clone(),
344 |     ///                           |nax, nmx| (nax.0.clone(), nax.1, nmx.1),
345 |     ///                           1);
346 |     ///   assert_eq!(joined.run(&GreedyScheduler::new()), 
347 |     ///           Some(vec![("Leah".into(), ("Leah".into(), 12, 20.50))]));
348 |     /// ```
349 |     pub fn join_on<
350 |         K: Any + Sync + Send + Clone + Hash + Eq + Serialize + for<'de> Deserialize<'de>,
351 |         B: Any + Sync + Send + Clone + Serialize + for<'de> Deserialize<'de>,
352 |         C: Any + Sync + Send + Clone + Serialize,
353 |         KF1: 'static + Sync + Send + Clone + Fn(&A) -> K,
354 |         KF2: 'static + Sync + Send + Clone + Fn(&B) -> K,
355 |         J:   'static + Sync + Send + Clone + Fn(&A, &B) -> C,
356 |     >(
357 |         &self, 
358 |         other: &DiskCollection<B>, 
359 |         key1: KF1, 
360 |         key2: KF2,
361 |         joiner: J,
362 |         partitions: usize, 
363 |     ) -> DiskCollection<(K,C)> {
364 |         // Group each by a common key
365 |         let p1 = self.map(move |x| (key1(x), x.clone()))
366 |             .partition_by_key(partitions, |x| x.0.clone());
367 |         let p2 = other.map(move |x| (key2(x), x.clone()))
368 |             .partition_by_key(partitions, |x| x.0.clone());
369 | 
370 |         let mut new_parts = Vec::with_capacity(p1.partitions.len());
371 |         for (l, r) in p1.partitions.iter().zip(p2.partitions.iter()) {
372 |             let acc = Arc::new(FileStore::empty(self.path.clone()));
373 |             new_parts.push(jok(l, r, acc, joiner.clone()));
374 |         }
375 | 
376 |         self.from_defs(new_parts)
377 |     }
378 | 
379 |     /// Executes the Collection, returning the result of the computation
380 |     pub fn run<S: Scheduler>(&self, s: &S) -> Option<Vec<A>> {
381 |         let defs = batch_apply(&self.partitions, |_idx, vs| {
382 |             vs.stream().into_iter().collect::<Vec<_>>()
383 |         });
384 |         let cat = tree_reduce(&defs, |x, y| {
385 |             let mut v1: Vec<_> = (*x).clone();
386 |             for yi in y {
387 |                 v1.push(yi.clone());
388 |             }
389 |             v1
390 |         });
391 |         cat.and_then(|x| x.run(s))
392 |     }
393 |     
394 |     /// Executes the Collection, returning the result of the computation
395 |     pub fn eval(&self) -> Option<Vec<A>> {
396 |         self.run(&GreedyScheduler::new())
397 |     }
398 | 
399 | }
400 | 
401 | impl <A: Any + Send + Sync + Clone + Serialize + for<'de>Deserialize<'de>> DiskCollection<Vec<A>> {
402 |     /// Flattens a vector of values
403 |     /// ```rust
404 |     ///   extern crate tange;
405 |     ///   extern crate tange_collection;
406 |     ///   use tange::scheduler::GreedyScheduler;
407 |     ///   use tange_collection::collection::disk::DiskCollection;
408 |     ///   
409 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![vec![1usize,2],vec![3,4]]);
410 |     ///   let flattened = col.flatten();
411 |     ///   assert_eq!(flattened.run(&GreedyScheduler::new()), Some(vec![1, 2, 3, 4]));
412 |     /// ```
413 |     pub fn flatten(&self) -> DiskCollection<A> {
414 |         self.emit(move |x, emitter| {
415 |             for xi in x {
416 |                 emitter(xi.clone());
417 |             }
418 |         })
419 |     }
420 | }
421 | 
422 | impl <A: Any + Send + Sync + Clone + Serialize + for<'de>Deserialize<'de>> DiskCollection<A> {
423 |     /// Returns the number of items in the collection
424 |     /// ```rust
425 |     ///   extern crate tange;
426 |     ///   extern crate tange_collection;
427 |     ///   use tange::scheduler::GreedyScheduler;
428 |     ///   use tange_collection::collection::disk::DiskCollection;
429 |     ///   
430 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![vec![1usize,2],vec![3,4]]);
431 |     ///   assert_eq!(col.count().run(&GreedyScheduler::new()), Some(vec![2]));
432 |     ///   let flattened = col.flatten();
433 |     ///   assert_eq!(flattened.count().run(&GreedyScheduler::new()), Some(vec![4]));
434 |     /// ```
435 |     pub fn count(&self) -> DiskCollection<usize> {
436 |         let nps = batch_apply(&self.partitions, |_idx, vs| {
437 |             vs.stream().into_iter().map(|_| 1usize).sum::<usize>()
438 |         });
439 |         let count = tree_reduce(&nps, |x, y| x + y).unwrap();
440 |         let acc = Arc::new(FileStore::empty(self.path.clone()));
441 |         let out = count.apply(move |x| {
442 |             acc.write_vec(vec![*x])
443 |         });
444 |         self.from_defs(vec![out])
445 |     }
446 | }
447 | 
448 | impl <A: Any + Send + Sync + Clone + PartialEq + Hash + Eq + Serialize + for<'de>Deserialize<'de>> DiskCollection<A> {
449 | 
450 |     /// Computes the frequencies of the items in collection.
451 |     /// ```rust
452 |     ///   extern crate tange;
453 |     ///   extern crate tange_collection;
454 |     ///   use tange::scheduler::GreedyScheduler;
455 |     ///   use tange_collection::collection::disk::DiskCollection;
456 |     ///   
457 |     ///   let col = DiskCollection::from_vec("/tmp".into(), vec![1, 2, 1, 5, 1, 2]);
458 |     ///   let freqs = col.frequencies(1).sort_by(|x| x.0);
459 |     ///   assert_eq!(freqs.run(&GreedyScheduler::new()), Some(vec![(1, 3), (2, 2), (5, 1)]));
460 |     /// ```
461 | pub fn frequencies(&self, partitions: usize) -> DiskCollection<(A, usize)> {
462 |         //self.partition(chunks, |x| x);
463 |         self.fold_by(|s| s.clone(), 
464 |                      || 0usize, 
465 |                      |acc, _l| *acc += 1, 
466 |                      |x, y| *x += *y, 
467 |                      partitions)
468 |     }
469 | }
470 | 
471 | // Writes out data
472 | impl DiskCollection<String> {
473 |     /// Writes each record in a collection to disk, newline delimited.
474 |     /// DiskCollection will create anew file within the path for each partition written.
475 |     pub fn sink(&self, path: &str) -> DiskCollection<usize> {
476 |         let acc = Arc::new(FileStore::empty(self.path.clone()));
477 |         let p: Arc<String> = Arc::new(path.to_owned());
478 |         let pats = batch_apply(&self.partitions, move |idx, vs| {
479 |             let p2 = p.clone();
480 |             let local: &str = &p2;
481 |             fs::create_dir_all(local)
482 |                 .expect("Welp, something went terribly wrong when creating directory");
483 | 
484 |             let file = fs::File::create(&format!("{}/{}", local, idx))
485 |                 .expect("Issues opening file!");
486 |             let mut bw = BufWriter::new(file);
487 | 
488 |             let mut size = 0usize;
489 |             for line in vs.stream() {
490 |                 bw.write(line.as_bytes()).expect("Error writing out line");
491 |                 bw.write(b"\n").expect("Error writing out line");
492 |                 size += 1;
493 |             }
494 | 
495 |             acc.write_vec(vec![size])
496 |         });
497 |         
498 |         self.from_defs(pats)
499 |     }
500 | }
501 | 
502 | #[cfg(test)]
503 | mod test_lib {
504 |     use super::*;
505 |     use tange::scheduler::{GreedyScheduler,LeveledScheduler};
506 | 
507 |     fn make_col() -> DiskCollection<usize> {
508 |         DiskCollection::from_vec("/tmp".into(), vec![1,2,3,1,2usize])
509 |     }
510 | 
511 |     #[test]
512 |     fn test_fold_by() {
513 |         let col = make_col();
514 |         let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 1);
515 |         let mut results = out.run(&LeveledScheduler).unwrap();
516 |         results.sort();
517 |         assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]);
518 |     }
519 | 
520 |     #[test]
521 |     fn test_fold_by_parts() {
522 |         let col = make_col();
523 |         let out = col.fold_by(|x| *x, || 0, |x, _y| *x += 1, |x, y| *x += y, 2);
524 |         assert_eq!(out.partitions.len(), 2);
525 |         let mut results = out.run(&LeveledScheduler).unwrap();
526 |         results.sort();
527 |         assert_eq!(results, vec![(1, 2), (2, 2), (3, 1)]);
528 |     }
529 | 
530 |     #[test]
531 |     fn test_partition_by_key() {
532 |         let col = make_col();
533 |         let computed = col.partition_by_key(2, |x| *x)
534 |             .sort_by(|x| *x);
535 |         assert_eq!(computed.partitions.len(), 2);
536 |         let results = computed.run(&LeveledScheduler).unwrap();
537 |         assert_eq!(results, vec![2, 2, 3, 1, 1]);
538 |     }
539 | 
540 |     #[test]
541 |     fn test_partition() {
542 |         let col = make_col();
543 |         let computed = col.partition(2, |_idx, x| x % 2)
544 |             .sort_by(|x| *x);
545 |         assert_eq!(computed.partitions.len(), 2);
546 |         let results = computed.run(&GreedyScheduler::new()).unwrap();
547 |         assert_eq!(results, vec![2, 2, 1, 1, 3]);
548 |     }
549 | 
550 |     #[test]
551 |     fn test_count() {
552 |         let col = make_col();
553 |         let results = col.split(3).count().run(&mut LeveledScheduler).unwrap();
554 |         assert_eq!(results, vec![5]);
555 |     }
556 | 
557 |     #[test]
558 |     fn test_join() {
559 |         let col1 = make_col();
560 |         let col2 = DiskCollection::from_vec("/tmp".into(),
561 |             vec![(2, 1.23f64), (3usize, 2.34)]);
562 |         let out = col1.join_on(&col2, |x| *x, |y| y.0, |x, y| {
563 |             (*x, y.1)
564 |         }, 5).split(1).sort_by(|x| x.0);
565 |         let results = out.run(&LeveledScheduler).unwrap();
566 |         let expected = vec![(2, (2, 1.23)), (2, (2, 1.23)), (3, (3, 2.34))];
567 |         assert_eq!(results, expected);
568 |     }
569 | 
570 |     #[test]
571 |     fn test_emit() {
572 |         let results = DiskCollection::from_vec("/tmp".into(), vec![1,2,3usize])
573 |             .emit(|num, emitter| {
574 |                 for i in 0..*num {
575 |                     emitter(i);
576 |                 }
577 |             })
578 |             .sort_by(|x| *x)
579 |             .run(&LeveledScheduler).unwrap();
580 |         let expected = vec![0, 0, 0, 1, 1, 2];
581 |         assert_eq!(results, expected);
582 |     }
583 | 
584 |     #[test]
585 |     fn test_sort() {
586 |         let results = DiskCollection::from_vec("/tmp".into(), vec![1, 3, 2usize])
587 |             .sort_by(|x| *x)
588 |             .run(&LeveledScheduler).unwrap();
589 |         let expected = vec![1, 2, 3];
590 |         assert_eq!(results, expected);
591 |     }
592 | 
593 | }
594 | 


--------------------------------------------------------------------------------