├── .gitignore
├── settings.gradle
├── release.sh
├── assets
    ├── executionContext.png
    ├── executionContext2.png
    ├── executionContext3.png
    └── executionContext4.png
├── gradle
    └── wrapper
    │   ├── gradle-wrapper.jar
    │   └── gradle-wrapper.properties
├── src
    ├── main
    │   └── java
    │   │   └── io
    │   │       └── tenmax
    │   │           └── poppy
    │   │               ├── RandomAccessDataFrame.java
    │   │               ├── exceptions
    │   │                   ├── ReflectionException.java
    │   │                   ├── ColumnNotFoundException.java
    │   │                   ├── ColumnNotSortableException.java
    │   │                   └── DuplicatedColumnException.java
    │   │               ├── DataSource.java
    │   │               ├── DataSink.java
    │   │               ├── DataColumn.java
    │   │               ├── SortSpec.java
    │   │               ├── dataframes
    │   │                   ├── ExecutionContext.java
    │   │                   ├── PeekDataFrame.java
    │   │                   ├── CacheDataFrame.java
    │   │                   ├── SourceDataFrame.java
    │   │                   ├── FilterDataFrame.java
    │   │                   ├── SortDataFrame.java
    │   │                   ├── DistinctDataFrame.java
    │   │                   ├── ProjectDataFrame.java
    │   │                   ├── AggregateDataFrame.java
    │   │                   └── BaseDataFrame.java
    │   │               ├── datasources
    │   │                   ├── SimpleDataSource.java
    │   │                   └── ReflectionDataSource.java
    │   │               ├── ProjectColumnSpec.java
    │   │               ├── AggregateColumnSpec.java
    │   │               ├── datasinks
    │   │                   └── DebugDataSink.java
    │   │               ├── iterators
    │   │                   ├── SequantialIterator.java
    │   │                   └── ParallelIterator.java
    │   │               ├── DataRow.java
    │   │               ├── DataFrame.java
    │   │               └── SpecUtils.java
    └── test
    │   └── java
    │       └── io
    │           └── tenmax
    │               └── poppy
    │                   ├── StudentReport.java
    │                   ├── GradeRoom.java
    │                   ├── Student.java
    │                   ├── DataFrameExceptionalTest.java
    │                   ├── DataFrameParallelTest.java
    │                   └── DataFrameTest.java
├── gradle.properties
├── gradlew.bat
├── README.md
└── gradlew


/.gitignore:
--------------------------------------------------------------------------------
1 | sftp-config.json
2 | slides.key
3 | 


--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'poppy'
2 | 


--------------------------------------------------------------------------------
/release.sh:
--------------------------------------------------------------------------------
1 | gradle clean build bintrayUpload -PdryRun=false
2 | 


--------------------------------------------------------------------------------
/assets/executionContext.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext.png


--------------------------------------------------------------------------------
/assets/executionContext2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext2.png


--------------------------------------------------------------------------------
/assets/executionContext3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext3.png


--------------------------------------------------------------------------------
/assets/executionContext4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext4.png


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tenmax/poppy/HEAD/gradle/wrapper/gradle-wrapper.jar


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/RandomAccessDataFrame.java:
--------------------------------------------------------------------------------
1 | package io.tenmax.poppy;
2 | 
3 | /**
4 |  * A {@link DataFrame} whose rows can additionally be counted and fetched by position.
5 |  */
6 | public interface RandomAccessDataFrame extends DataFrame {
7 | 
8 |     /**
9 |      * @param row position of the requested row
10 |      * @return the row stored at that position
11 |      */
12 |     DataRow getRow(int row);
13 | 
14 |     /**
15 |      * @return the number of rows held by this data frame
16 |      */
17 |     int size();
18 | }
19 | 


--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
1 | LIBRARY_VERSION=0.1.8
2 | USER_ORG=tenmax
3 | MAVEN_GROUP=io.tenmax
4 | MAVEN_ARTIFACT=poppy
5 | DESCRIPTION=A dataframe library for java
6 | WEBSITE=https://github.com/tenmax/poppy
7 | BINTRAY_REPO=io.tenmax
8 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/exceptions/ReflectionException.java:
--------------------------------------------------------------------------------
1 | package io.tenmax.poppy.exceptions;
2 | 
3 | /**
4 |  * Unchecked exception that carries another {@link Throwable} as its cause.
5 |  */
6 | public class ReflectionException extends RuntimeException {
7 | 
8 |     /**
9 |      * @param cause the underlying failure being wrapped
10 |      */
11 |     public ReflectionException(Throwable cause) {
12 |         super(cause);
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Thu Apr 28 08:36:07 CST 2016
2 | distributionBase=GRADLE_USER_HOME
3 | distributionPath=wrapper/dists
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.12-bin.zip
7 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/exceptions/ColumnNotFoundException.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.exceptions;
 2 | 
 3 | /**
 4 |  * Unchecked exception whose message names a column that could not be found.
 5 |  */
 6 | public class ColumnNotFoundException extends RuntimeException {
 7 | 
 8 |     private static final String MESSAGE_PREFIX = "Column not found: ";
 9 | 
10 |     /**
11 |      * @param column name of the missing column; appended to the message
12 |      */
13 |     public ColumnNotFoundException(String column) {
14 |         super(MESSAGE_PREFIX + column);
15 |     }
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/DataSource.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | import java.util.Iterator;
 4 | 
 5 | /**
 6 |  * Supplies partitioned source items of type {@code T} together with the
 7 |  * columns that can be read from each item.
 8 |  *
 9 |  * @param <T> type of the items produced by the partitions
10 |  */
11 | public interface DataSource<T> {
12 | 
13 |     /**
14 |      * @return the columns exposed by this source
15 |      */
16 |     DataColumn[] getColumns();
17 | 
18 |     /**
19 |      * @return how many partitions this source offers
20 |      */
21 |     int getPartitionCount();
22 | 
23 |     /**
24 |      * @param index which partition to open
25 |      * @return an iterator over the items of that partition
26 |      */
27 |     Iterator<T> getPartition(int index);
28 | 
29 |     /**
30 |      * Reads one column value out of a source item.
31 |      *
32 |      * @param data       the source item
33 |      * @param columnName name of the column to read
34 |      * @return the value of that column for {@code data}
35 |      */
36 |     Object get(T data, String columnName);
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/exceptions/ColumnNotSortableException.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.exceptions;
 2 | 
 3 | /**
 4 |  * Unchecked exception whose message names a column that is not sortable.
 5 |  *
 6 |  * Created by popcorny on 4/20/16.
 7 |  */
 8 | public class ColumnNotSortableException extends RuntimeException {
 9 | 
10 |     private static final String MESSAGE_PREFIX = "Column not sortable: ";
11 | 
12 |     /**
13 |      * @param column name of the offending column; appended to the message
14 |      */
15 |     public ColumnNotSortableException(String column) {
16 |         super(MESSAGE_PREFIX + column);
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/exceptions/DuplicatedColumnException.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.exceptions;
 2 | 
 3 | /**
 4 |  * Unchecked exception whose message names a column that is duplicated.
 5 |  *
 6 |  * Created by popcorny on 4/20/16.
 7 |  */
 8 | public class DuplicatedColumnException extends RuntimeException {
 9 | 
10 |     private static final String MESSAGE_PREFIX = "Column Duplicated : ";
11 | 
12 |     /**
13 |      * @param column name of the duplicated column; appended to the message
14 |      */
15 |     public DuplicatedColumnException(String column) {
16 |         super(MESSAGE_PREFIX + column);
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/DataSink.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | public interface DataSink {
 4 | 
 5 |     default void sinkStart(int partitionCount, DataColumn[] columns){}
 6 | 
 7 |     default void sinkComplete(){}
 8 | 
 9 |     default void partitionStart(int partition){}
10 | 
11 |     default void partitionRow(int partition, DataRow row){}
12 | 
13 |     default void partitionComplete(int partition){}
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/DataColumn.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | /**
 4 |  * Immutable pairing of a column name with the column's value type.
 5 |  */
 6 | public class DataColumn {
 7 |     private final Class type;
 8 |     private final String name;
 9 | 
10 |     /**
11 |      * @param name column name
12 |      * @param type class of the column's values
13 |      */
14 |     public DataColumn(String name, Class type) {
15 |         this.type = type;
16 |         this.name = name;
17 |     }
18 | 
19 |     /**
20 |      * @return the class given at construction
21 |      */
22 |     public Class getType() {
23 |         return this.type;
24 |     }
25 | 
26 |     /**
27 |      * @return the name given at construction
28 |      */
29 |     public String getName() {
30 |         return this.name;
31 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/SortSpec.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | /**
 4 |  * Immutable description of one sort key: a column name plus a direction.
 5 |  */
 6 | public class SortSpec {
 7 | 
 8 |     /** Sort direction. */
 9 |     public enum Order {
10 |         ASC,
11 |         DESC
12 |     }
13 | 
14 |     private final Order order;
15 |     private final String column;
16 | 
17 |     /**
18 |      * @param column name of the column to sort by
19 |      * @param order  direction to sort in
20 |      */
21 |     public SortSpec(String column, Order order) {
22 |         this.order = order;
23 |         this.column = column;
24 |     }
25 | 
26 |     /**
27 |      * @return the direction given at construction
28 |      */
29 |     public Order getOrder() {
30 |         return this.order;
31 |     }
32 | 
33 |     /**
34 |      * @return the column name given at construction
35 |      */
36 |     public String getColumn() {
37 |         return this.column;
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/ExecutionContext.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.dataframes;
 2 | 
 3 | /**
 4 |  * Mutable execution settings: a thread count (initially 1) and a closed flag
 5 |  * (initially unset).
 6 |  */
 7 | public class ExecutionContext {
 8 |     private int numThreads = 1;
 9 |     private boolean closed;
10 | 
11 |     /**
12 |      * @return {@code true} once {@link #close()} has been called
13 |      */
14 |     public boolean isClosed() {
15 |         return this.closed;
16 |     }
17 | 
18 |     /**
19 |      * Sets the closed flag; there is no way to reset it.
20 |      */
21 |     public void close() {
22 |         closed = true;
23 |     }
24 | 
25 |     /**
26 |      * @return the currently configured thread count
27 |      */
28 |     public int getNumThreads() {
29 |         return this.numThreads;
30 |     }
31 | 
32 |     /**
33 |      * @param numThreads new thread count; must be positive
34 |      * @throws IllegalArgumentException if {@code numThreads} is zero or negative
35 |      */
36 |     public void setNumThreads(int numThreads) {
37 |         if (numThreads > 0) {
38 |             this.numThreads = numThreads;
39 |             return;
40 |         }
41 |         throw new IllegalArgumentException("numThreads should be greater than 0");
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/datasources/SimpleDataSource.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.datasources;
 2 | 
 3 | import java.util.Iterator;
 4 | 
 5 | /**
 6 |  * SimpleDataSource uses Java reflection to define the columns and the
 7 |  * JavaBean convention to read the value of a column (both inherited from
 8 |  * {@link ReflectionDataSource}). Each {@link Iterable} passed to the
 9 |  * constructor is exposed as one partition.
10 |  *
11 |  * @param <T> The source data type.
12 |  */
13 | public class SimpleDataSource<T> extends ReflectionDataSource<T> {
14 |     private final Iterable<T>[] iterables;
15 | 
16 |     /**
17 |      * @param clazz     bean class the columns are derived from
18 |      * @param iterables one iterable per partition; the array is copied, so
19 |      *                  later changes to the caller's array are not observed
20 |      */
21 |     @SafeVarargs
22 |     public SimpleDataSource(Class<T> clazz, Iterable<T>... iterables) {
23 |         super(clazz);
24 |         // Defensive copy: never retain the caller-owned varargs array.
25 |         this.iterables = iterables.clone();
26 |     }
27 | 
28 |     /**
29 |      * @return the number of iterables given at construction
30 |      */
31 |     @Override
32 |     public int getPartitionCount() {
33 |         return iterables.length;
34 |     }
35 | 
36 |     /**
37 |      * @param index zero-based position of the iterable to open
38 |      * @return a fresh iterator obtained from that iterable
39 |      */
40 |     @Override
41 |     public Iterator<T> getPartition(int index) {
42 |         return iterables[index].iterator();
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/ProjectColumnSpec.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | import java.util.function.Function;
 4 | 
 5 | /**
 6 |  * Describes one projected column: its name, its value type and the function
 7 |  * that computes its value from a {@link DataRow}.
 8 |  *
 9 |  * @param <T> value type of the projected column
10 |  */
11 | public class ProjectColumnSpec<T> {
12 |     private final String column;
13 |     private final Class<T> type;
14 |     private final Function<DataRow, T> mapper;
15 | 
16 |     /**
17 |      * @param column name of the projected column
18 |      * @param type   value type of the projected column
19 |      * @param mapper computes the value from a whole row; stored as given
20 |      */
21 |     public ProjectColumnSpec(String column, Class<T> type, Function<DataRow, T> mapper) {
22 |         this.mapper = mapper;
23 |         this.type = type;
24 |         this.column = column;
25 |     }
26 | 
27 |     /**
28 |      * Builds a spec that derives its value from the single column {@code from}.
29 |      *
30 |      * @param column name of the projected column
31 |      * @param from   name of the column read from each row
32 |      * @param type   value type of the projected column
33 |      * @param mapper converts the value read from {@code from}; when {@code null}
34 |      *               the value is cast to {@code T} unchanged
35 |      */
36 |     public ProjectColumnSpec(String column, String from, Class<T> type, Function<? super Object, T> mapper) {
37 |         final Function<DataRow, T> rowMapper;
38 |         if (mapper != null) {
39 |             rowMapper = (row) -> mapper.apply(row.get(from));
40 |         } else {
41 |             rowMapper = (row) -> (T) row.get(from);
42 |         }
43 | 
44 |         this.column = column;
45 |         this.type = type;
46 |         this.mapper = rowMapper;
47 |     }
48 | 
49 |     /**
50 |      * @return the function that computes this column's value from a row
51 |      */
52 |     public Function<DataRow, T> getMapper() {
53 |         return this.mapper;
54 |     }
55 | 
56 |     /**
57 |      * @return the value type given at construction
58 |      */
59 |     public Class<T> getType() {
60 |         return this.type;
61 |     }
62 | 
63 |     /**
64 |      * @return the projected column's name
65 |      */
66 |     public String getColumn() {
67 |         return this.column;
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/src/test/java/io/tenmax/poppy/StudentReport.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | /**
 4 |  * Created by popcorny on 4/24/16.
 5 |  */
 6 | public class StudentReport {
 7 |     int grade;
 8 |     int room;
 9 |     double weight;
10 |     double height;
11 | 
12 |     public int getGrade() {
13 |         return grade;
14 |     }
15 | 
16 |     public void setGrade(int grade) {
17 |         this.grade = grade;
18 |     }
19 | 
20 |     public int getRoom() {
21 |         return room;
22 |     }
23 | 
24 |     public void setRoom(int room) {
25 |         this.room = room;
26 |     }
27 | 
28 |     public double getWeight() {
29 |         return weight;
30 |     }
31 | 
32 |     public void setWeight(double weight) {
33 |         this.weight = weight;
34 |     }
35 | 
36 |     public double getHeight() {
37 |         return height;
38 |     }
39 | 
40 |     public void setHeight(double height) {
41 |         this.height = height;
42 |     }
43 | 
44 |     @Override
45 |     public String toString() {
46 |         return "StudentReport{" +
47 |                 "grade=" + grade +
48 |                 ", room=" + room +
49 |                 ", weight=" + weight +
50 |                 ", height=" + height +
51 |                 '}';
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/AggregateColumnSpec.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | import java.util.stream.Collector;
 4 | 
 5 | public class AggregateColumnSpec<T> {
 6 |     private final String column;
 7 |     private final Class<T> type;
 8 |     private final String typeFromColumn;
 9 |     private final Collector<DataRow, ?, T> collector;
10 | 
11 |     public AggregateColumnSpec(String column, Class<T> type, Collector<DataRow, ?, T> collector) {
12 |         this.column = column;
13 |         this.type = type;
14 |         this.typeFromColumn = null;
15 |         this.collector = collector;
16 |     }
17 | 
18 |     public AggregateColumnSpec(String column, String typeFromColumn, Collector<DataRow, ?, T> collector) {
19 |         this.column = column;
20 |         this.type = null;
21 |         this.typeFromColumn = typeFromColumn;
22 |         this.collector = collector;
23 |     }
24 | 
25 |     public String getColumn() {
26 |         return column;
27 |     }
28 | 
29 |     public Class<T> getType() {
30 |         return type;
31 |     }
32 | 
33 |     public String getTypeFromColumn() {
34 |         return typeFromColumn;
35 |     }
36 | 
37 |     public Collector<DataRow, ?, T> getCollector() {
38 |         return collector;
39 |     }
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/datasinks/DebugDataSink.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.datasinks;
 2 | 
 3 | import io.tenmax.poppy.DataColumn;
 4 | import io.tenmax.poppy.DataRow;
 5 | import io.tenmax.poppy.DataSink;
 6 | 
 7 | public class DebugDataSink implements DataSink{
 8 |     @Override
 9 |     public void sinkStart(int partitionCount, DataColumn[] columns) {
10 |         System.out.printf("[sinkStart] partitionCount=%d\n", partitionCount);
11 |         for (DataColumn column : columns) {
12 |             System.out.printf("\t%s\t%s\n", column.getType().getName(),column.getName());
13 |         }
14 |     }
15 | 
16 |     @Override
17 |     public void sinkComplete() {
18 |         System.out.printf("[sinkComplete]\n");
19 |     }
20 | 
21 |     @Override
22 |     public void partitionStart(int partiton) {
23 |         System.out.printf("[partitionStart] partition=%d\n", partiton);
24 |     }
25 | 
26 |     @Override
27 |     public void partitionRow(int partition, DataRow row) {
28 |         System.out.printf("[partitionRow] partition=%d\n", partition);
29 |         System.out.printf("    %s\n", row);
30 |     }
31 | 
32 |     @Override
33 |     public void partitionComplete(int partiton) {
34 |         System.out.printf("[partitionComplete] partition=%d\n", partiton);
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/iterators/SequantialIterator.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.iterators;
 2 | 
 3 | import io.tenmax.poppy.DataRow;
 4 | import io.tenmax.poppy.dataframes.BaseDataFrame;
 5 | 
 6 | import java.util.Iterator;
 7 | 
 8 | /**
 9 |  * Iterates the rows of a {@link BaseDataFrame} one partition after another,
10 |  * in partition-index order, stopping once the frame's context is closed.
11 |  */
12 | public class SequantialIterator implements Iterator<DataRow> {
13 |     private final BaseDataFrame dataFrame;
14 |     private int top;
15 |     private int partitionCount;
16 |     private Iterator<DataRow> iterator;
17 | 
18 |     /**
19 |      * @param dataFrame frame to iterate; its partition count is read once here
20 |      */
21 |     public SequantialIterator(BaseDataFrame dataFrame) {
22 |         this.dataFrame = dataFrame;
23 |         this.partitionCount = dataFrame.getPartitionCount();
24 |     }
25 | 
26 |     /**
27 |      * Opens partitions in order until one has a row available.
28 |      *
29 |      * @return {@code false} if the context is closed or no partition has rows left
30 |      */
31 |     @Override
32 |     public boolean hasNext() {
33 |         while (true) {
34 |             if (dataFrame.getContext().isClosed()) {
35 |                 return false;
36 |             }
37 | 
38 |             if (iterator != null && iterator.hasNext()) {
39 |                 return true;
40 |             }
41 | 
42 |             if (top >= partitionCount) {
43 |                 return false;
44 |             }
45 |             iterator = dataFrame.getPartition(top++);
46 |         }
47 |     }
48 | 
49 |     /**
50 |      * Returns the next row, moving on to the following partition when the
51 |      * current one is used up, even if the caller did not call
52 |      * {@link #hasNext()} first.
53 |      *
54 |      * @return the next row, or {@code null} when no partition had been opened
55 |      *         yet and {@link #hasNext()} reports nothing to read (kept for
56 |      *         backward compatibility)
57 |      */
58 |     @Override
59 |     public DataRow next() {
60 |         if (iterator == null) {
61 |             if (!hasNext()) {
62 |                 return null;
63 |             }
64 |         } else if (!iterator.hasNext()) {
65 |             // Previously the exhausted partition iterator was used as-is, so rows of
66 |             // later partitions were unreachable without an explicit hasNext() call.
67 |             hasNext();
68 |         }
69 |         return iterator.next();
70 |     }
71 | }
72 | 


--------------------------------------------------------------------------------
/src/test/java/io/tenmax/poppy/GradeRoom.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | /**
 4 |  * Mutable bean identifying a class room by grade and room number; equality
 5 |  * and hash code are based on both values.
 6 |  *
 7 |  * Created by popcorny on 4/28/16.
 8 |  */
 9 | public class GradeRoom {
10 |     int grade;
11 |     int room;
12 | 
13 |     public GradeRoom() {
14 |     }
15 | 
16 |     public GradeRoom(int grade, int room) {
17 |         this.room = room;
18 |         this.grade = grade;
19 |     }
20 | 
21 |     public int getGrade() {
22 |         return this.grade;
23 |     }
24 | 
25 |     public void setGrade(int grade) {
26 |         this.grade = grade;
27 |     }
28 | 
29 |     public int getRoom() {
30 |         return this.room;
31 |     }
32 | 
33 |     public void setRoom(int room) {
34 |         this.room = room;
35 |     }
36 | 
37 |     /**
38 |      * @return {@code true} only for an object of exactly this class with the
39 |      *         same grade and room
40 |      */
41 |     @Override
42 |     public boolean equals(Object o) {
43 |         if (o == this) {
44 |             return true;
45 |         }
46 |         if (o == null || o.getClass() != getClass()) {
47 |             return false;
48 |         }
49 |         GradeRoom other = (GradeRoom) o;
50 |         return grade == other.grade && room == other.room;
51 |     }
52 | 
53 |     /**
54 |      * @return {@code 31 * grade + room}
55 |      */
56 |     @Override
57 |     public int hashCode() {
58 |         return 31 * grade + room;
59 |     }
60 | 
61 |     @Override
62 |     public String toString() {
63 |         StringBuilder text = new StringBuilder("GradeRoom{");
64 |         text.append("grade=").append(grade);
65 |         text.append(", room=").append(room);
66 |         return text.append('}').toString();
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/PeekDataFrame.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.dataframes;
 2 | 
 3 | import io.tenmax.poppy.DataRow;
 4 | 
 5 | import java.util.Iterator;
 6 | import java.util.function.Consumer;
 7 | 
 8 | /**
 9 |  * Data frame that passes its parent's rows through unchanged while handing
10 |  * each row to a {@link Consumer} as it is read. Context, columns, grouped
11 |  * columns and partitioning are taken from the parent.
12 |  */
13 | public class PeekDataFrame extends BaseDataFrame {
14 |     private final BaseDataFrame parent;
15 |     private final Consumer<DataRow> consumer;
16 | 
17 |     /**
18 |      * @param parent   frame whose rows are observed
19 |      * @param consumer callback invoked with every row returned by a partition iterator
20 |      */
21 |     public PeekDataFrame(BaseDataFrame parent, Consumer<DataRow> consumer) {
22 |         super(parent.context, parent.getColumns());
23 |         this.consumer = consumer;
24 |         this.parent = parent;
25 |         this.groupedColumns = parent.groupedColumns;
26 |     }
27 | 
28 |     /**
29 |      * @return the parent's partition count
30 |      */
31 |     @Override
32 |     public int getPartitionCount() {
33 |         return parent.getPartitionCount();
34 |     }
35 | 
36 |     /**
37 |      * @param index partition to open on the parent
38 |      * @return the parent's partition iterator wrapped so that rows are reported to the consumer
39 |      */
40 |     @Override
41 |     public Iterator<DataRow> getPartition(int index) {
42 |         Iterator<DataRow> upstream = parent.getPartition(index);
43 |         return new PeekIterator(upstream);
44 |     }
45 | 
46 |     /**
47 |      * Delegating iterator that calls the consumer from {@link #next()}.
48 |      */
49 |     class PeekIterator implements Iterator<DataRow> {
50 |         private Iterator<DataRow> wrapped;
51 | 
52 |         PeekIterator(Iterator<DataRow> wrapped) {
53 |             this.wrapped = wrapped;
54 |         }
55 | 
56 |         @Override
57 |         public boolean hasNext() {
58 |             return this.wrapped.hasNext();
59 |         }
60 | 
61 |         @Override
62 |         public DataRow next() {
63 |             // Fetch first, then notify, then hand the same row to the caller.
64 |             final DataRow fetched = this.wrapped.next();
65 |             consumer.accept(fetched);
66 |             return fetched;
67 |         }
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/datasources/ReflectionDataSource.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.datasources;
 2 | 
 3 | import io.tenmax.poppy.DataColumn;
 4 | import io.tenmax.poppy.DataSource;
 5 | import io.tenmax.poppy.exceptions.ColumnNotFoundException;
 6 | import org.apache.commons.beanutils.PropertyUtils;
 7 | 
 8 | import java.beans.PropertyDescriptor;
 9 | import java.util.ArrayList;
10 | 
11 | public abstract class ReflectionDataSource<T> implements DataSource<T>{
12 |     private final DataColumn[] columns;
13 | 
14 |     public ReflectionDataSource(Class<T> clazz) {
15 |         this.columns = schemaFromClass(clazz);
16 |     }
17 | 
18 |     private static DataColumn[] schemaFromClass(Class clazz) {
19 |         PropertyDescriptor[] props = PropertyUtils.getPropertyDescriptors(clazz);
20 |         ArrayList<DataColumn> columns = new ArrayList<>();
21 | 
22 |         for (PropertyDescriptor prop : props) {
23 |             if(prop.getName().equals("class")) {
24 |                 continue;
25 |             }
26 |             columns.add(new DataColumn(prop.getName(), prop.getPropertyType()));
27 |         }
28 | 
29 |         return columns.toArray(new DataColumn[0]);
30 |     }
31 | 
32 |     @Override
33 |     public DataColumn[] getColumns() {
34 |         return columns;
35 |     }
36 | 
37 |     @Override
38 |     public Object get(T data, String columnName) {
39 |         try {
40 |             return PropertyUtils.getProperty(data, columnName);
41 |         } catch (Exception e) {
42 |             throw new ColumnNotFoundException(columnName);
43 |         }
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/CacheDataFrame.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.dataframes;
 2 | 
 3 | import io.tenmax.poppy.DataColumn;
 4 | import io.tenmax.poppy.DataRow;
 5 | import io.tenmax.poppy.RandomAccessDataFrame;
 6 | 
 7 | import java.util.ArrayList;
 8 | import java.util.Iterator;
 9 | 
10 | public class CacheDataFrame extends BaseDataFrame implements RandomAccessDataFrame {
11 | 
12 |     private final ArrayList<DataRow> rows = new ArrayList<>();
13 |     private final BaseDataFrame parent;
14 | 
15 |     public CacheDataFrame(BaseDataFrame parent) {
16 |         super(new ExecutionContext(), parent.columns);
17 |         this.parent = parent;
18 |         this.groupedColumns = parent.groupedColumns;
19 |         for (DataRow row : parent) {
20 |             rows.add(new CacheDataRow(row));
21 |         }
22 |     }
23 | 
24 |     @Override
25 |     public int getPartitionCount() {
26 |         return 1;
27 |     }
28 | 
29 |     @Override
30 |     public Iterator<DataRow> getPartition(int index) {
31 |         return rows.iterator();
32 |     }
33 | 
34 |     @Override
35 |     public int size() {
36 |         return rows.size();
37 |     }
38 | 
39 |     @Override
40 |     public DataRow getRow(int row) {
41 |         return rows.get(row);
42 |     }
43 | 
44 |     class CacheDataRow extends BaseDataRow {
45 |         ArrayList value = new ArrayList();
46 | 
47 |         CacheDataRow(DataRow row) {
48 |             for (int i=0; i<row.getColumns().length; i++) {
49 |                 value.add(row.get(i));
50 |             }
51 |         }
52 | 
53 |         @Override
54 |         public Object get(int index) {
55 |             return value.get(index);
56 |         }
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/src/test/java/io/tenmax/poppy/Student.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | /**
 4 |  * Created by popcorny on 4/17/16.
 5 |  */
 6 | public class Student {
 7 |     private int studentId;
 8 |     private String name;
 9 |     private int grade;
10 |     private int room;
11 |     private int height;
12 |     private int weight;
13 | 
14 |     public Student() {
15 |     }
16 | 
17 | 
18 | 
19 |     public Student(int studentId, String name, int grade, int room,  int height, int weight) {
20 |         this.studentId = studentId;
21 |         this.grade = grade;
22 |         this.room = room;
23 |         this.name = name;
24 |         this.height = height;
25 |         this.weight = weight;
26 |     }
27 | 
28 | 
29 | 
30 |     public int getStudentId() {
31 |         return studentId;
32 |     }
33 | 
34 |     public String getName() {
35 |         return name;
36 |     }
37 | 
38 |     public void setGrade(int grade) {
39 |         this.grade = grade;
40 |     }
41 | 
42 |     public int getGrade() {
43 |         return grade;
44 |     }
45 | 
46 |     public void setRoom(int room) {
47 |         this.room = room;
48 |     }
49 | 
50 |     public int getRoom() {
51 |         return room;
52 |     }
53 | 
54 |     public int getHeight() {
55 |         return height;
56 |     }
57 | 
58 |     public int getWeight() {
59 |         return weight;
60 |     }
61 | 
62 |     @Override
63 |     public String toString() {
64 |         return "Student{" +
65 |                 "studentId=" + studentId +
66 |                 ", name='" + name + '\'' +
67 |                 ", grade=" + grade +
68 |                 ", room=" + room +
69 |                 ", height=" + height +
70 |                 ", weight=" + weight +
71 |                 '}';
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/SourceDataFrame.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.dataframes;
 2 | 
 3 | import io.tenmax.poppy.DataColumn;
 4 | import io.tenmax.poppy.DataRow;
 5 | import io.tenmax.poppy.DataSource;
 6 | 
 7 | import java.util.Iterator;
 8 | 
 9 | public class SourceDataFrame extends BaseDataFrame{
10 |     private final DataSource dataSource;
11 | 
12 |     public <T> SourceDataFrame(
13 |             DataSource<T> dataSource)
14 |     {
15 |         super(new ExecutionContext(), dataSource.getColumns());
16 |         this.dataSource = dataSource;
17 |     }
18 | 
19 |     @Override
20 |     public int getPartitionCount() {
21 |         return dataSource.getPartitionCount();
22 |     }
23 | 
24 |     @Override
25 |     public Iterator<DataRow> getPartition(int index) {
26 |         return new SourceIterator(dataSource.getPartition(index));
27 |     }
28 | 
29 |     class SourceIterator implements Iterator<DataRow> {
30 |         private Iterator source;
31 | 
32 |         SourceIterator(Iterator source) {
33 |             this.source = source;
34 |         }
35 | 
36 |         @Override
37 |         public boolean hasNext() {
38 |             return source.hasNext();
39 |         }
40 | 
41 |         @Override
42 |         public DataRow next() {
43 |             return new SourceDataRow<>(source.next());
44 |         }
45 |     }
46 | 
47 |     class SourceDataRow<T> extends BaseDataRow {
48 |         private final T data;
49 | 
50 |         SourceDataRow(T data) {
51 |             this.data = data;
52 |         }
53 | 
54 |         @Override
55 |         public Object get(int index) {
56 |             return get(columns[index].getName());
57 |         }
58 | 
59 |         @Override
60 |         public Object get(String name) {
61 |             return dataSource.get(data, name);
62 |         }
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/FilterDataFrame.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.dataframes;
 2 | 
 3 | import io.tenmax.poppy.DataColumn;
 4 | import io.tenmax.poppy.DataRow;
 5 | import io.tenmax.poppy.ProjectColumnSpec;
 6 | 
 7 | import java.util.Iterator;
 8 | import java.util.function.Predicate;
 9 | 
10 | /**
11 |  * A data frame that passes through only those rows of its parent which satisfy
12 |  * a predicate. The execution context, columns, grouped columns and partition
13 |  * count are all taken over from the parent.
14 |  */
15 | public class FilterDataFrame extends BaseDataFrame {
16 |     private final BaseDataFrame parent;
17 |     private final Predicate<DataRow> predicate;
18 | 
19 |     /**
20 |      * @param parent    frame whose rows are filtered
21 |      * @param predicate test a row must pass to be kept
22 |      */
23 |     public FilterDataFrame(BaseDataFrame parent, Predicate<DataRow> predicate) {
24 |         super(parent.context, parent.getColumns());
25 |         this.predicate = predicate;
26 |         this.groupedColumns = parent.groupedColumns;
27 |         this.parent = parent;
28 |     }
29 | 
30 |     /** Partitions are filtered one by one, so the parent's partition count applies. */
31 |     @Override
32 |     public int getPartitionCount() {
33 |         return parent.getPartitionCount();
34 |     }
35 | 
36 |     /** Returns the parent's partition {@code index} wrapped in a filtering iterator. */
37 |     @Override
38 |     public Iterator<DataRow> getPartition(int index) {
39 |         Iterator<DataRow> upstream = parent.getPartition(index);
40 |         return new FilterIterator(upstream);
41 |     }
42 | 
43 |     /**
44 |      * Look-ahead iterator over the rows of one partition that pass the predicate.
45 |      * Calling {@link #next()} after the rows are exhausted returns {@code null}.
46 |      */
47 |     class FilterIterator implements Iterator<DataRow> {
48 |         private final Iterator<DataRow> wrapped;
49 |         /** Most recent match, or {@code null} once the upstream ran out of rows. */
50 |         private DataRow row;
51 |         /** True while {@link #row} holds a match that has not been handed out yet. */
52 |         private boolean ready;
53 | 
54 |         FilterIterator(Iterator<DataRow> wrapped) {
55 |             this.wrapped = wrapped;
56 |         }
57 | 
58 |         @Override
59 |         public boolean hasNext() {
60 |             advanceIfNeeded();
61 |             return row != null;
62 |         }
63 | 
64 |         @Override
65 |         public DataRow next() {
66 |             advanceIfNeeded();
67 |             ready = false;
68 |             return row;
69 |         }
70 | 
71 |         /** Scans the upstream for the next accepted row unless one is already buffered. */
72 |         private void advanceIfNeeded() {
73 |             if (ready) {
74 |                 return;
75 |             }
76 |             DataRow candidate = null;
77 |             boolean accepted = false;
78 |             while (!accepted && wrapped.hasNext()) {
79 |                 candidate = wrapped.next();
80 |                 accepted = predicate.test(candidate);
81 |             }
82 |             row = accepted ? candidate : null;
83 |             ready = accepted;
84 |         }
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/DataRow.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | import java.util.Date;
 4 | import java.util.Iterator;
 5 | 
 6 | public interface DataRow extends Iterable {
 7 | 
 8 |     DataColumn[] getColumns();
 9 | 
10 |     Object get(int index);
11 | 
12 |     Object get(String name);
13 | 
14 |     default boolean getBoolean(int index) {
15 |         return ((Boolean) get(index)).booleanValue();
16 |     }
17 | 
18 |     default boolean getBoolean(String name) {
19 |         return ((Boolean) get(name)).booleanValue();
20 |     }
21 | 
22 |     default int getInteger(int index) {
23 |         return ((Number) get(index)).intValue();
24 |     }
25 | 
26 |     default int getInteger(String name) {
27 |         return ((Number) get(name)).intValue();
28 |     }
29 | 
30 |     default long getLong(int index) {
31 |         return ((Number) get(index)).longValue();
32 |     }
33 | 
34 |     default long getLong(String name) {
35 |         return ((Number) get(name)).longValue();
36 |     }
37 | 
38 |     default float getFloat(int index) {
39 |         return ((Number) get(index)).floatValue();
40 |     }
41 | 
42 |     default float getFloat(String name) {
43 |         return ((Number) get(name)).floatValue();
44 |     }
45 | 
46 |     default double getDouble(int index) {
47 |         return ((Number) get(index)).doubleValue();
48 |     }
49 | 
50 |     default double getDouble(String name) {
51 |         return ((Number) get(name)).doubleValue();
52 |     }
53 | 
54 |     default String getString(String name) {
55 |         return ((String) get(name));
56 |     }
57 | 
58 |     default String getString(int index) {
59 |         return ((String) get(index));
60 |     }
61 | 
62 |     default Date getDate(int index) {
63 |         return ((Date) get(index));
64 |     }
65 | 
66 |     default Date getDate(String name) {
67 |         return ((Date) get(name));
68 |     }
69 | 
70 |     default
71 |     Iterator iterator() {
72 |         return new Iterator() {
73 |             private int i = 0;
74 |             private int n = getColumns().length;
75 | 
76 |             @Override
77 |             public boolean hasNext() {
78 |                 return i < n;
79 |             }
80 | 
81 |             @Override
82 |             public Object next() {
83 |                 return get(i++);
84 |             }
85 |         };
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/SortDataFrame.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy.dataframes;
 2 | 
 3 | import io.tenmax.poppy.DataColumn;
 4 | import io.tenmax.poppy.DataRow;
 5 | import io.tenmax.poppy.SortSpec;
 6 | import io.tenmax.poppy.exceptions.ColumnNotSortableException;
 7 | 
 8 | import java.util.*;
 9 | 
10 | public class SortDataFrame extends BaseDataFrame{
11 | 
12 |     private final SortSpec[] specs;
13 |     private final BaseDataFrame parent;
14 | 
15 |     public SortDataFrame(BaseDataFrame parent, SortSpec[] specs) {
16 |         super(new ExecutionContext(), parent.columns);
17 | 
18 |         for (SortSpec spec: specs) {
19 |             DataColumn column = parent.getColumn(spec.getColumn());
20 |             if (column.getType().isAssignableFrom(Comparable.class)) {
21 |                 throw new ColumnNotSortableException(spec.getColumn());
22 |             }
23 |         }
24 | 
25 |         this.parent = parent;
26 |         this.groupedColumns = parent.groupedColumns;
27 |         this.specs = specs;
28 | 
29 |     }
30 | 
31 |     @Override
32 |     public int getPartitionCount() {
33 |         return 1;
34 |     }
35 | 
36 |     @Override
37 |     public Iterator<DataRow> getPartition(int index) {
38 |         ArrayList<DataRow> rows = new ArrayList<>();
39 | 
40 |         Iterator<DataRow> iterator = parent.iterator();
41 |         while (iterator.hasNext()) {
42 |             rows.add(iterator.next());
43 |         }
44 | 
45 |         Collections.sort(rows, new DataRowComparator());
46 |         return rows.iterator();
47 |     }
48 | 
49 |     class DataRowComparator implements Comparator<DataRow> {
50 |         @Override
51 |         public int compare(DataRow row1, DataRow row2) {
52 |             for (SortSpec spec: specs) {
53 |                 Comparable v1 = (Comparable)row1.get(spec.getColumn());
54 |                 Comparable v2 = (Comparable)row2.get(spec.getColumn());
55 | 
56 |                 if (v1 == null || v2 == null) {
57 |                     if (v1 == null) {
58 |                         if (v2 != null) {
59 |                             return -1;
60 |                         } else {
61 |                             continue;
62 |                         }
63 |                     } else {
64 |                         return 1;
65 |                     }
66 |                 }
67 | 
68 |                 if (v1.compareTo(v2) == 0) {
69 |                     continue;
70 |                 }
71 | 
72 |                 return spec.getOrder() == SortSpec.Order.ASC ?
73 |                        v1.compareTo(v2) :
74 |                        -v1.compareTo(v2);
75 |             }
76 | 
77 |             return 0;
78 |         }
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @if "%DEBUG%" == "" @echo off
 2 | @rem ##########################################################################
 3 | @rem
 4 | @rem  Gradle startup script for Windows
 5 | @rem
 6 | @rem ##########################################################################
 7 | 
 8 | @rem Set local scope for the variables with windows NT shell
 9 | if "%OS%"=="Windows_NT" setlocal
10 | 
11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12 | set DEFAULT_JVM_OPTS=
13 | 
14 | set DIRNAME=%~dp0
15 | if "%DIRNAME%" == "" set DIRNAME=.
16 | set APP_BASE_NAME=%~n0
17 | set APP_HOME=%DIRNAME%
18 | 
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 | 
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 | 
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 | 
32 | goto fail
33 | 
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 | 
38 | if exist "%JAVA_EXE%" goto init
39 | 
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 | 
46 | goto fail
47 | 
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 | 
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | if "%@eval[2+2]" == "4" goto 4NT_args
53 | 
54 | :win9xME_args
55 | @rem Slurp the command line arguments.
56 | set CMD_LINE_ARGS=
57 | set _SKIP=2
58 | 
59 | :win9xME_args_slurp
60 | if "x%~1" == "x" goto execute
61 | 
62 | set CMD_LINE_ARGS=%*
63 | goto execute
64 | 
65 | :4NT_args
66 | @rem Get arguments from the 4NT Shell from JP Software
67 | set CMD_LINE_ARGS=%$
68 | 
69 | :execute
70 | @rem Setup the command line
71 | 
72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73 | 
74 | @rem Execute Gradle
75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76 | 
77 | :end
78 | @rem End local scope for the variables with windows NT shell
79 | if "%ERRORLEVEL%"=="0" goto mainEnd
80 | 
81 | :fail
82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83 | rem the _cmd.exe /c_ return code!
84 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85 | exit /b 1
86 | 
87 | :mainEnd
88 | if "%OS%"=="Windows_NT" endlocal
89 | 
90 | :omega
91 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/DataFrame.java:
--------------------------------------------------------------------------------
 1 | package io.tenmax.poppy;
 2 | 
 3 | import io.tenmax.poppy.dataframes.BaseDataFrame;
 4 | 
 5 | import java.util.List;
 6 | import java.util.Map;
 7 | import java.util.function.Predicate;
 8 | import java.util.function.Consumer;
 9 | 
10 | /**
11 |  * DataFrame is a sequence of schema-defined rows. The following
12 |  * example illustrates how to use the {@link DataFrame}:
13 |  *
14 |  * <pre>{@code
15 |  * List<Student> students = ...;
16 |  *
17 |  * DataFrame.from(students, Student.class)
18 |  *          .groupby("grade", "room")
19 |  *          .aggregate(
20 |  *              avgLong("weight").as("weight"),
21 |  *              avgLong("height").as("height"))
22 |  *          .sort("grade", "room")
23 |  *          .print();
24 |  * }</pre>
25 |  *
26 |  * Just like {@link java.util.stream.Stream}, it iterates through a
27 |  * data source with as little memory as possible. This allows you to process
28 |  * billions of records with only constant memory.
29 |  *
30 |  * DataFrame provides the operations SQL provides, for example {@link #project(ProjectColumnSpec[]) projection},
31 |  * {@link #filter(java.util.function.Predicate) filtering}, {@link #groupby(String...) grouping},
32 |  * {@link #aggregate(AggregateColumnSpec[]) aggregation} and {@link #sort(SortSpec...) sorting}. These operations make it possible to
33 |  * write your own SQL-like statements in your application. The above example is equivalent to
34 |  *
35 |  * <pre>{@code
36 |  *     select
37 |  *         grade,
38 |  *         room,
39 |  *         avg(weight) as weight,
40 |  *         avg(height) as height
41 |  *     from Student
42 |  *     group by grade, room
43 |  *     order by grade, room
44 |  * }</pre>
45 |  *
46 |  */
47 | public interface DataFrame extends Iterable<DataRow>{
48 |     /** Returns the columns of this frame. */
49 |     DataColumn[] getColumns();
50 |     /** Returns the column with the given name. */
51 |     DataColumn getColumn(String name);
52 |     /** Returns the column at the given position. */
53 |     DataColumn getColumn(int index);
54 |     /** Creates a frame over {@code source} for elements of type {@code clazz}; delegates to {@link BaseDataFrame}. */
55 |     static <T> DataFrame from(Iterable<T> source, Class<T> clazz) {
56 |         return BaseDataFrame.from(source,clazz);
57 |     }
58 |     /** Creates a frame over a custom {@link DataSource}; delegates to {@link BaseDataFrame}. */
59 |     static <T> DataFrame from(DataSource<T> source) {
60 |         return BaseDataFrame.from(source);
61 |     }
62 | 
63 |     /** Projection onto the named columns. */
64 |     DataFrame project(String... columns);
65 |     /** Projection described by explicit column specs. */
66 |     DataFrame project(ProjectColumnSpec... columns);
67 |     /** Grouping by the named columns, the counterpart of SQL {@code group by} in the example above. */
68 |     DataFrame groupby(String... columns);
69 |     /** Aggregation described by the given specs, the counterpart of {@code avg(...)} in the example above. */
70 |     DataFrame aggregate(AggregateColumnSpec... specs);
71 |     /** Sorting by the named columns, the counterpart of SQL {@code order by} in the example above. */
72 |     DataFrame sort(String... columns);
73 |     /** Sorting described by explicit sort specs. */
74 |     DataFrame sort(SortSpec... columns);
75 |     /** NOTE(review): presumably keeps one row per distinct combination of the named columns - confirm against the implementation. */
76 |     DataFrame distinct(String... columns);
77 |     /** NOTE(review): presumably hands each row to {@code consumer} as it passes through, like Stream.peek - confirm. */
78 |     DataFrame peek(Consumer<DataRow> consumer);
79 |     /** Filtering: the predicate decides per row. */
80 |     DataFrame filter(Predicate<DataRow> predicate);
81 |     /** NOTE(review): presumably sets the number of threads used to evaluate the frame - confirm against the implementation. */
82 |     DataFrame parallel(int numThreads);
83 |     /** NOTE(review): presumably materialises the rows for random access - confirm against the implementation. */
84 |     RandomAccessDataFrame cache();
85 |     /** NOTE(review): presumably writes the rows of this frame to {@code sink} - confirm against the implementation. */
86 |     void to(DataSink sink);
87 |     /** NOTE(review): presumably one inner list of values per row - confirm against the implementation. */
88 |     List<List> toList();
89 |     /** NOTE(review): presumably one instance of {@code clazz} per row - confirm how rows are mapped. */
90 |     <T> List<T> toList(Class<T> clazz);
91 |     /** NOTE(review): key/value split is not visible here; presumably grouped columns form the key - confirm. */
92 |     Map<List, List> toMap();
93 |     /** NOTE(review): typed variant of {@link #toMap()}; confirm how keys and values are mapped. */
94 |     <K, V> Map<K, V> toMap(Class<K> keyClazz, Class<V> valueClazz);
95 |     /** Prints this frame, as the last step of the example above; the output destination is not visible here. */
96 |     void print();
97 | }
98 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/DistinctDataFrame.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy.dataframes;
  2 | 
  3 | import io.tenmax.poppy.DataColumn;
  4 | import io.tenmax.poppy.DataRow;
  5 | import io.tenmax.poppy.iterators.SequantialIterator;
  6 | 
  7 | import java.util.*;
  8 | 
  9 | /**
 10 |  * Data frame that keeps the first occurrence of every distinct combination of
 11 |  * values in the chosen columns; its rows contain only those columns. The
 12 |  * parent is consumed through its {@code iterator()}, so the result is exposed
 13 |  * as a single partition.
 14 |  */
 15 | public class DistinctDataFrame extends BaseDataFrame {
 16 |     private final BaseDataFrame parent;
 17 | 
 18 |     public DistinctDataFrame(BaseDataFrame parent, String[] distinctColumns) {
 19 |         super(new ExecutionContext(), columnsFromNames(parent, distinctColumns));
 20 | 
 21 |         this.parent = parent;
 22 |     }
 23 | 
 24 |     /** Looks up the parent's column for each requested name, keeping the requested order. */
 25 |     private static DataColumn[] columnsFromNames(BaseDataFrame parent, String[] distinctColumns) {
 26 |         DataColumn[] dataColumns = new DataColumn[distinctColumns.length];
 27 |         int i = 0;
 28 | 
 29 |         for (String columnName : distinctColumns) {
 30 |             dataColumns[i++] = parent.getColumn(columnName);
 31 |         }
 32 | 
 33 |         return  dataColumns;
 34 |     }
 35 | 
 36 |     @Override
 37 |     public int getPartitionCount() {
 38 |         return 1;
 39 |     }
 40 | 
 41 |     /** Returns a de-duplicating iterator over the rows of the parent; {@code index} is not used. */
 42 |     @Override
 43 |     public Iterator<DataRow> getPartition(int index) {
 44 |         return new DistinctIterator(parent.iterator());
 45 |     }
 46 | 
 47 |     /**
 48 |      * Look-ahead iterator that hands out one row per distinct key. Every key seen
 49 |      * so far is remembered, so memory use grows with the number of distinct keys.
 50 |      * Calling {@link #next()} after the rows are exhausted returns {@code null}.
 51 |      */
 52 |     class DistinctIterator implements Iterator<DataRow> {
 53 |         private final Iterator<DataRow> wrapped;
 54 |         private boolean ready;
 55 |         private DataRow row;
 56 | 
 57 |         private final Set<List<Object>> seen = new HashSet<>();
 58 | 
 59 |         DistinctIterator(Iterator<DataRow> wrapped) {
 60 |             this.wrapped = wrapped;
 61 |         }
 62 | 
 63 |         @Override
 64 |         public boolean hasNext() {
 65 |             if (!ready) {
 66 |                 findNext();
 67 |             }
 68 | 
 69 |             return row != null;
 70 |         }
 71 | 
 72 |         @Override
 73 |         public DataRow next() {
 74 |             if (!ready) {
 75 |                 findNext();
 76 |             }
 77 | 
 78 |             ready = false;
 79 |             return row;
 80 |         }
 81 | 
 82 |         /** Advances to the next source row whose key has not been seen before. */
 83 |         private void findNext() {
 84 |             while (wrapped.hasNext()) {
 85 |                 DataRow source = wrapped.next();
 86 | 
 87 |                 // Build a fresh key for every source row. Sharing one list across
 88 |                 // iterations made the values of skipped duplicates pile up in front
 89 |                 // of the next row's values, corrupting both the key and the row.
 90 |                 List<Object> key = new ArrayList<>();
 91 |                 for (DataColumn column : columns) {
 92 |                     key.add(source.get(column.getName()));
 93 |                 }
 94 | 
 95 |                 // add() reports whether the key is new, which saves a separate contains().
 96 |                 if (seen.add(key)) {
 97 |                     row = new DistinctDataRow(key);
 98 |                     ready = true;
 99 |                     return;
100 |                 }
101 |             }
102 |             row = null;
103 |             ready = false;
104 |         }
105 |     }
106 | 
107 |     /** Row made of the key values, addressed by position in the distinct column list. */
108 |     class DistinctDataRow extends BaseDataRow {
109 | 
110 |         private final List<Object> value;
111 | 
112 |         DistinctDataRow(List<Object> value) {
113 |             this.value = value;
114 |         }
115 | 
116 | 
117 |         @Override
118 |         public Object get(int index) {
119 |             return value.get(index);
120 |         }
121 | 
122 |     }
123 | }
124 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Poppy
 2 | *poppy* is a dataframe library for Java, which provides common SQL operations (e.g. select, from, where, group by, order by, distinct) to process data in Java.
 3 | 
 4 | Unlike other dataframe libraries, which keep all the data in memory, *poppy* processes data in a streaming manner. That is, it is more similar to the [Java 8 Stream library](https://docs.oracle.com/javase/8/docs/api/java/util/stream/package-summary.html), but in a relational version.
 5 | 
 6 | Here is a simple example. We have a `Student` class
 7 | 
 8 | ```java
 9 | public class Student {
10 |     private int studentId;
11 |     private String name;
12 |     private int grade;
13 |     private int room;
14 |     private int height;
15 |     private int weight;
16 |     ...
17 | }
18 | ```
19 | 
20 | In SQL, we have a query like this
21 | 
22 | ```sql
23 | select 
24 |     grade, 
25 |     room, 
26 |     avg(weight) as weight, 
27 |     avg(height) as height
28 | from Student
29 | group by grade, room
30 | order by grade, room
31 | ```
32 | 
33 | Here is the *Poppy*'s version 
34 | 
35 | ```java
36 | List<Student> students = ...;
37 | 
38 | DataFrame
39 | .from(students, Student.class)
40 | .groupby("grade", "room")
41 | .aggregate(
42 |     avgLong("weight").as("weight"),
43 |     avgLong("height").as("height"))
44 | .sort("grade", "room")
45 | .print();
46 | ```
47 | 
48 | 
49 | 
50 | # Getting Started
51 | 
52 | ## Requirement
53 | Java 8 or higher
54 | 
55 | ## Dependency
56 | 
57 | Poppy's package is managed by [JCenter](https://bintray.com/bintray/jcenter) repository.
58 | 
59 | Maven
60 | 
61 | ```
62 | <dependency>
63 |   <groupId>io.tenmax</groupId>
64 |   <artifactId>poppy</artifactId>
65 |   <version>0.1.8</version>
66 |   <type>pom</type>
67 | </dependency>
68 | ```
69 | 
70 | Gradle
71 | 
72 | ```
73 | compile 'io.tenmax:poppy:0.1.8'
74 | ```
75 | ## Features
76 | 
77 | 1. Support the most common operations in SQL. e.g. select, from, where, group by, order by, distinct
78 | 2. Support the most common aggregation functions in SQL. e.g. *avg()*, *sum()*, *count()*, *min()*, *max()*
79 | 3. **Custom aggregation functions.** by  [java.util.stream.Collector](https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collector.html)
80 | 4. **Partition support.** A partition is the unit of parallelism. Multiple partitions allow you to process data concurrently.
81 | 5. **Multi-threaded support**. For CPU-bound jobs, it leverages all your CPU resources for better performance; for IO-bound jobs, it reduces the waiting time and takes advantage of better concurrency.
82 | 6. Suitable for both **batch** and **streaming** scenario.
83 | 7. **Lightweight**. Compared to the [Spark DataFrame API](https://spark.apache.org/docs/latest/sql-programming-guide.html), it is much more lightweight to embed in your application.
84 | 8. **Stream-based design**. Compared to [joinery](https://github.com/cardillo/joinery), which keeps the whole data set in memory, *Poppy*'s streaming behaviour allows a huge volume of data to be processed with limited memory.
85 | 
86 | ## Documentation
87 | 
88 | - [JavaDoc](http://tenmax.github.io/poppy/docs/javadoc/index.html)
89 | - [User Manual](http://tenmax.github.io/poppy/)
90 | 
91 | # Contribution
92 | 
93 | Please fork this project and send me a pull request; any comments would be appreciated!
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/iterators/ParallelIterator.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy.iterators;
  2 | 
  3 | import io.tenmax.poppy.DataFrame;
  4 | import io.tenmax.poppy.DataRow;
  5 | import io.tenmax.poppy.dataframes.BaseDataFrame;
  6 | import io.tenmax.poppy.dataframes.ExecutionContext;
  7 | import org.slf4j.Logger;
  8 | import org.slf4j.LoggerFactory;
  9 | 
 10 | import java.util.Iterator;
 11 | import java.util.concurrent.BlockingQueue;
 12 | import java.util.concurrent.ExecutorService;
 13 | import java.util.concurrent.Executors;
 14 | import java.util.concurrent.LinkedBlockingQueue;
 15 | 
 16 | public class ParallelIterator  implements Iterator<DataRow> {
 17 |     private static Logger logger = LoggerFactory.getLogger(ParallelIterator.class);
 18 | 
 19 |     private final BaseDataFrame dataFrame;
 20 | 
 21 |     private BlockingQueue<Message> queue = new LinkedBlockingQueue<>();
 22 |     private int countDown;
 23 |     private boolean hasNext;
 24 |     private DataRow row;
 25 | 
 26 |     public ParallelIterator(BaseDataFrame dataFrame) {
 27 |         this.dataFrame = dataFrame;
 28 |         this.countDown = dataFrame.getPartitionCount();
 29 | 
 30 |         start();
 31 |     }
 32 | 
 33 |     public void start() {
 34 |         ExecutorService executor = Executors.newFixedThreadPool(dataFrame.getContext().getNumThreads());
 35 | 
 36 |         for (int i=0; i<dataFrame.getPartitionCount(); i++) {
 37 |             final int fi = i;
 38 | 
 39 |             executor.execute(()-> {
 40 |                 try {
 41 |                     Iterator<DataRow> iter = dataFrame.getPartition(fi);
 42 |                     while (iter.hasNext()) {
 43 |                         queue.put(new Message(iter.next()));
 44 | 
 45 |                         if(dataFrame.getContext().isClosed()) {
 46 |                             break;
 47 |                         }
 48 |                     }
 49 |                 } catch (Exception e) {
 50 |                     logger.error("Error occured", e);
 51 |                 } finally {
 52 |                     queue.add(Message.END_OF_MESSAGE);
 53 |                 }
 54 |             });
 55 |         }
 56 | 
 57 |         // Shutdown while all task handled
 58 |         executor.shutdown();
 59 |     }
 60 | 
 61 |     @Override
 62 |     public boolean hasNext() {
 63 |         if (!hasNext) {
 64 |             findNext();
 65 |         }
 66 | 
 67 |         return hasNext;
 68 |     }
 69 | 
 70 |     @Override
 71 |     public DataRow next() {
 72 |         if (!hasNext) {
 73 |             findNext();
 74 |         }
 75 | 
 76 |         if (hasNext) {
 77 |             hasNext = false;
 78 |             return row;
 79 |         } else {
 80 |             return null;
 81 |         }
 82 |     }
 83 | 
 84 |     public void findNext() {
 85 |         hasNext = false;
 86 |         while (countDown > 0) {
 87 | 
 88 |             Message message = null;
 89 |             try {
 90 |                 message = queue.take();
 91 | 
 92 |                 if (message == Message.END_OF_MESSAGE) {
 93 |                     countDown--;
 94 |                 } else {
 95 |                     row = message.row;
 96 |                     hasNext = true;
 97 |                     break;
 98 |                 }
 99 |             } catch (InterruptedException e) {
100 | 
101 |             }
102 |         }
103 |     }
104 | 
105 | 
106 |     static class Message {
107 |         static Message END_OF_MESSAGE = new Message(null);
108 | 
109 |         DataRow row;
110 | 
111 |         Message(DataRow row) {
112 |             this.row = row;
113 |         }
114 |     }
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/ProjectDataFrame.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy.dataframes;
  2 | 
  3 | import io.tenmax.poppy.DataColumn;
  4 | import io.tenmax.poppy.DataRow;
  5 | import io.tenmax.poppy.ProjectColumnSpec;
  6 | 
  7 | import java.util.HashMap;
  8 | import java.util.Iterator;
  9 | import java.util.function.Function;
 10 | 
 11 | /**
 12 |  * Data frame whose columns are computed from the rows of a parent frame, either
 13 |  * by copying a parent column or by applying a mapper function. Partitions map
 14 |  * one-to-one onto the parent's partitions.
 15 |  */
 16 | public class ProjectDataFrame extends BaseDataFrame {
 17 |     private final ProjectColumnSpec[] specs;
 18 |     private final BaseDataFrame parent;
 19 |     private final HashMap<String, ProjectColumnSpec> specsMap= new HashMap<>();
 20 | 
 21 |     /** Projects {@code parent} onto the named columns, keeping their types. */
 22 |     public ProjectDataFrame(BaseDataFrame parent, String[] columnNames) {
 23 |         this(parent, specsFromColumnNames(parent, columnNames));
 24 |     }
 25 | 
 26 |     /**
 27 |      * Projects {@code parent} according to {@code specs}.
 28 |      *
 29 |      * @throws IllegalArgumentException if a spec has a mapper but no type
 30 |      */
 31 |     public ProjectDataFrame(BaseDataFrame parent, ProjectColumnSpec[] specs) {
 32 |         super(parent.context, columnsFromSpec(fixSpecs(parent, specs)));
 33 |         this.parent = parent;
 34 |         this.specs = fixSpecs(parent, specs);
 35 | 
 36 |         // Index the normalised specs, i.e. the same objects the rows work with.
 37 |         for (ProjectColumnSpec spec: this.specs) {
 38 |             specsMap.put(spec.getColumn(), spec);
 39 |         }
 40 |     }
 41 | 
 42 |     /**
 43 |      * Returns a copy of {@code specs} in which every spec has both a type and a
 44 |      * mapper. A spec without mapper copies the parent column of the same name
 45 |      * and, if it has no type either, takes over that column's type.
 46 |      *
 47 |      * @throws IllegalArgumentException if a spec has a mapper but no type, since
 48 |      *         the result type of an arbitrary mapper cannot be derived
 49 |      */
 50 |     private static ProjectColumnSpec[] fixSpecs(BaseDataFrame parent, ProjectColumnSpec[] specs) {
 51 |         ProjectColumnSpec[] newSpecs = new ProjectColumnSpec[specs.length];
 52 |         int i = 0;
 53 | 
 54 |         for (ProjectColumnSpec spec : specs) {
 55 |             String column = spec.getColumn();
 56 |             Class type = spec.getType();
 57 |             Function<DataRow, ?> mapper = spec.getMapper();
 58 |             if (type == null) {
 59 |                 if (mapper != null) {
 60 |                     // The exception used to be constructed without "throw", so such
 61 |                     // specs slipped through and produced a column with a null type.
 62 |                     throw new IllegalArgumentException("not type defined for " + column);
 63 |                 }
 64 |                 type = parent.getColumn(column).getType();
 65 |             }
 66 |             if (mapper == null) {
 67 |                 mapper = (DataRow row) -> row.get(column);
 68 |             }
 69 | 
 70 |             newSpecs[i++] = new ProjectColumnSpec(column, type, mapper);
 71 |         }
 72 | 
 73 |         return  newSpecs;
 74 |     }
 75 | 
 76 |     /** Builds one mapper-less spec per name, typed like the parent column of that name. */
 77 |     private static ProjectColumnSpec[] specsFromColumnNames(BaseDataFrame parent, String[] columnNames) {
 78 |         ProjectColumnSpec[] specs = new ProjectColumnSpec[columnNames.length];
 79 |         int i = 0;
 80 | 
 81 |         for (String columnName : columnNames) {
 82 |             specs[i++] = new ProjectColumnSpec(
 83 |                     columnName,
 84 |                     columnName,
 85 |                     parent.getColumn(columnName).getType(),
 86 |                     null);
 87 |         }
 88 | 
 89 |         return specs;
 90 |     }
 91 | 
 92 |     /** Derives the column definitions (name and type) of the projected frame from its specs. */
 93 |     private static DataColumn[] columnsFromSpec(ProjectColumnSpec[] specs) {
 94 |         DataColumn[] dataColumns = new DataColumn[specs.length];
 95 |         int i = 0;
 96 |         for (ProjectColumnSpec spec : specs) {
 97 |             dataColumns[i++] = new DataColumn(spec.getColumn(), spec.getType());
 98 |         }
 99 |         return  dataColumns;
100 |     }
101 | 
102 |     @Override
103 |     public int getPartitionCount() {
104 |         return parent.getPartitionCount();
105 |     }
106 | 
107 |     /** Returns the parent's partition {@code index} with every row wrapped in a projected view. */
108 |     @Override
109 |     public Iterator<DataRow> getPartition(int index) {
110 |         return new ProjectIterator(parent.getPartition(index));
111 |     }
112 | 
113 |     /** Wraps each row of the underlying iterator in a {@link ProjectDataRow}. */
114 |     class ProjectIterator implements Iterator<DataRow> {
115 |         private final Iterator<DataRow> wrapped;
116 | 
117 |         ProjectIterator(Iterator<DataRow> wrapped) {
118 |             this.wrapped = wrapped;
119 |         }
120 | 
121 |         @Override
122 |         public boolean hasNext() {
123 |             return wrapped.hasNext();
124 |         }
125 | 
126 |         @Override
127 |         public DataRow next() {
128 |             return new ProjectDataRow(wrapped.next());
129 |         }
130 |     }
131 | 
132 |     /** Projected view of one parent row; values are computed on every access, not cached. */
133 |     class ProjectDataRow extends BaseDataRow {
134 |         private final DataRow row;
135 | 
136 |         ProjectDataRow(DataRow row) {
137 |             this.row = row;
138 |         }
139 | 
140 |         @Override
141 |         public Object get(int index) {
142 |             ProjectColumnSpec spec = specs[index];
143 |             if (spec.getMapper() != null) {
144 |                 return spec.getMapper().apply(row);
145 |             } else {
146 |                 return row.get(spec.getColumn());
147 |             }
148 |         }
149 |     }
150 | }
151 | 


--------------------------------------------------------------------------------
/src/test/java/io/tenmax/poppy/DataFrameExceptionalTest.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy;
  2 | 
  3 | import io.tenmax.poppy.datasinks.DebugDataSink;
  4 | import io.tenmax.poppy.datasources.SimpleDataSource;
  5 | import junit.framework.TestCase;
  6 | import org.junit.Before;
  7 | import org.junit.Test;
  8 | 
  9 | import java.util.ArrayList;
 10 | import java.util.Arrays;
 11 | import java.util.Collections;
 12 | import java.util.Iterator;
 13 | import java.util.concurrent.atomic.AtomicInteger;
 14 | import java.util.stream.Collectors;
 15 | 
 16 | import static io.tenmax.poppy.SpecUtils.*;
 17 | import static io.tenmax.poppy.SpecUtils.desc;
 18 | 
 19 | /**
 20 |  * Checks that failures raised inside a data source reach the caller as a
 21 |  * {@link RuntimeException} when the frame is aggregated with {@code parallel(4)}.
 22 |  */
 23 | public class DataFrameExceptionalTest {
 24 | 
 25 |     /** The source throws while reporting its partition count. */
 26 |     @Test(expected = RuntimeException.class)
 27 |     public void testAggre1() throws Exception {
 28 |         aggregateAndPrint(ExceptionalDataSource.ErrorType.GetPartitionCount);
 29 |     }
 30 | 
 31 |     /** The source throws while handing out a partition. */
 32 |     @Test(expected = RuntimeException.class)
 33 |     public void testAggre2() throws Exception {
 34 |         aggregateAndPrint(ExceptionalDataSource.ErrorType.GetPartition);
 35 |     }
 36 | 
 37 |     /** The source hands out a partition iterator whose {@code next()} throws. */
 38 |     @Test(expected = RuntimeException.class)
 39 |     public void testAggre3() throws Exception {
 40 |         aggregateAndPrint(ExceptionalDataSource.ErrorType.Iterator);
 41 |     }
 42 | 
 43 |     /**
 44 |      * Runs the aggregation shared by all tests with {@code parallel(4)} over a
 45 |      * source that fails in the requested way, then prints the result.
 46 |      *
 47 |      * @param errorType the stage at which the source throws
 48 |      */
 49 |     private static void aggregateAndPrint(ExceptionalDataSource.ErrorType errorType) {
 50 |         DataSource<Student> source = new ExceptionalDataSource(errorType);
 51 | 
 52 |         DataFrame.from(source)
 53 |                 .parallel(4)
 54 |                 .aggregate(
 55 |                         avgLong("weight").as("weight"),
 56 |                         avgLong("height").as("height"),
 57 |                         count().as("count"),
 58 |                         aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi"))
 59 |                 .print();
 60 |     }
 61 | }
 62 | 
 63 | 
 64 | class ExceptionalDataSource implements DataSource<Student> {
 65 |     enum ErrorType {
 66 |         GetPartitionCount,
 67 |         GetPartition,
 68 |         Iterator
 69 |     }
 70 | 
 71 |     private ErrorType errorType;
 72 | 
 73 |     ExceptionalDataSource(ErrorType errorType) {
 74 |         this.errorType = errorType;
 75 |     }
 76 | 
 77 |     @Override
 78 |     public int getPartitionCount() {
 79 |         if(errorType == ErrorType.GetPartitionCount) {
 80 |             throw new RuntimeException("hello exception");
 81 |         }
 82 | 
 83 |         return 3;
 84 |     }
 85 | 
 86 |     @Override
 87 |     public Iterator<Student> getPartition(int index) {
 88 | 
 89 |         if (index > 0) {
 90 | 
 91 |             if (errorType == ErrorType.GetPartition) {
 92 |                 throw new RuntimeException("hello exception");
 93 |             } else if(errorType == ErrorType.Iterator) {
 94 |                 return new ExceptionalIterator();
 95 |             }
 96 |         }
 97 | 
 98 |         return Arrays.asList(new Student(1, "pop", 5, 2, 176, 68)).iterator();
 99 |     }
100 | 
101 |     @Override
102 |     public DataColumn[] getColumns() {
103 |         return new DataColumn[] {
104 |             new DataColumn("name", String.class),
105 |             new DataColumn("weight", Integer.class),
106 |             new DataColumn("height", Integer.class)
107 |         };
108 |     }
109 | 
110 |     @Override
111 |     public Object get(Student student, String columnName) {
112 |         switch (columnName) {
113 |             case "name":
114 |                 return student.getName();
115 |             case "weight":
116 |                 return student.getWeight();
117 |             case "height":
118 |                 return student.getHeight();
119 |         }
120 |         return null;
121 |     }
122 | }
123 | 
124 | class ExceptionalIterator implements Iterator<Student> {
125 |     @Override
126 |     public boolean hasNext() {
127 |         return true;
128 |     }
129 | 
130 |     @Override
131 |     public Student next() {
132 |         throw new RuntimeException("hello exception");
133 |     }
134 | }


--------------------------------------------------------------------------------
/src/test/java/io/tenmax/poppy/DataFrameParallelTest.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy;
  2 | 
  3 | import io.tenmax.poppy.datasinks.DebugDataSink;
  4 | import io.tenmax.poppy.datasources.SimpleDataSource;
  5 | import junit.framework.TestCase;
  6 | import org.junit.Before;
  7 | import org.junit.Test;
  8 | 
  9 | import java.util.ArrayList;
 10 | import java.util.concurrent.atomic.AtomicInteger;
 11 | import java.util.stream.Collectors;
 12 | 
 13 | import static io.tenmax.poppy.SpecUtils.*;
 14 | import static io.tenmax.poppy.SpecUtils.desc;
 15 | import static org.junit.Assert.assertEquals;
 16 | 
 17 | 
 18 | public class DataFrameParallelTest {
 19 | 
 20 |     private DataFrame df;
 21 | 
 22 | 
 23 |     @Before
 24 |     public void setUp() throws Exception {
 25 | 
 26 |         ArrayList<Student> list1 = new ArrayList<>();
 27 |         ArrayList<Student> list2 = new ArrayList<>();
 28 |         ArrayList<Student> list3 = new ArrayList<>();
 29 | 
 30 |         list1.add(new Student(1, "pop",     5,2,170,60));
 31 |         list1.add(new Student(2, "foo",     5,3,175,70));
 32 |         list1.add(new Student(3, "bar",     5,4,168,80));
 33 |         list1.add(new Student(4, "john",    5,4,160,60));
 34 | 
 35 | 
 36 |         list2.add(new Student(5, "richard", 4,1,170,68));
 37 |         list2.add(new Student(6, "howard",  4,2,178,90));
 38 |         list2.add(new Student(7, "michael", 4,3,169,80));
 39 |         list2.add(new Student(8, "coco",    4,4,158,65));
 40 | 
 41 | 
 42 |         list3.add(new Student(9, "tina",    3,2,155,44));
 43 |         list3.add(new Student(10, "chloe",  3,2,158,45));
 44 |         list3.add(new Student(11, "george", 3,5,163,90));
 45 |         list3.add(new Student(12, "mary",   3,1,170,60));
 46 | 
 47 |         df= DataFrame.from(
 48 |             new SimpleDataSource<>(Student.class,list1, list2, list3))
 49 |             .parallel(4);
 50 |     }
 51 | 
 52 |     @Test
 53 |     public void testBasic() throws Exception {
 54 |         df
 55 |         .print();
 56 |     }
 57 | 
 58 |     @Test
 59 |     public void testProject() throws Exception {
 60 |         df
 61 |         .project("name", "weight", "height")
 62 |         .print();
 63 |     }
 64 | 
 65 |     @Test
 66 |     public void testProject2() throws Exception {
 67 |         df
 68 |         .project(
 69 |                 col("name"),
 70 |                 colMap("weight").as("w"),
 71 |                 colMap("height", Float.class, (Integer height) -> (height / 10f)).as("h"))
 72 |         .print();
 73 |     }
 74 | 
 75 |     @Test
 76 |     public void testFilter() throws Exception {
 77 |         df
 78 |         .filter(row -> row.getInteger("height") >= 170)
 79 |         .project("name", "weight", "height")
 80 |         .print();
 81 |     }
 82 | 
 83 |     @Test
 84 |     public void testAggre() throws Exception {
 85 |         df
 86 |         .aggregate(
 87 |                 avgLong("weight").as("weight"),
 88 |                 avgLong("height").as("height"),
 89 |                 count().as("count"),
 90 |                 aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi"))
 91 |         .print();
 92 |     }
 93 | 
 94 |     @Test
 95 |     public void testGroupBy() throws Exception {
 96 |         df
 97 |         .groupby("grade", "room")
 98 |         .aggregate(
 99 |                 avgLong("weight").as("weight"),
100 |                 avgLong("height").as("height"))
101 |         .sort("grade", "room")
102 |         .print();
103 |     }
104 | 
105 |     @Test
106 |     public void testSort() throws Exception {
107 |         df
108 |         .sort("weight", "height")
109 |         .print();
110 |     }
111 | 
112 |     @Test
113 |     public void testSort2() throws Exception {
114 |         df
115 |         .sort(asc("weight"), desc("height"))
116 |         .print();
117 |     }
118 | 
119 |     @Test
120 |     public void testDistinct() throws Exception {
121 |         df
122 |         .distinct("grade", "room")
123 |         .print();
124 |     }
125 | 
126 |     @Test
127 |     public void testTo() throws Exception {
128 |         TestDataSink sink = new TestDataSink();
129 | //        df.to(new DebugDataSink());
130 |         df.to(sink);
131 |         assertEquals(1, sink.sinkStart.get());
132 |         assertEquals(1, sink.sinkComplete.get());
133 |         assertEquals(3, sink.partitionStart.get());
134 |         assertEquals(12, sink.partitionRow.get());
135 |         assertEquals(3, sink.partitionComplete.get());
136 |     }
137 | 
138 |     class TestDataSink implements DataSink {
139 |         AtomicInteger sinkStart = new AtomicInteger();
140 |         AtomicInteger sinkComplete = new AtomicInteger();
141 |         AtomicInteger partitionStart = new AtomicInteger();
142 |         AtomicInteger partitionRow = new AtomicInteger();
143 |         AtomicInteger partitionComplete = new AtomicInteger();
144 | 
145 |         @Override
146 |         public void sinkStart(int partitionCount, DataColumn[] columns) {
147 |             sinkStart.incrementAndGet();
148 |         }
149 | 
150 |         @Override
151 |         public void sinkComplete() {
152 |             sinkComplete.incrementAndGet();
153 | 
154 |         }
155 | 
156 |         @Override
157 |         public void partitionStart(int partition) {
158 |             partitionStart.incrementAndGet();
159 |         }
160 | 
161 |         @Override
162 |         public void partitionRow(int partition, DataRow row) {
163 |             partitionRow.incrementAndGet();
164 |         }
165 | 
166 |         @Override
167 |         public void partitionComplete(int partition) {
168 |             partitionComplete.incrementAndGet();
169 |         }
170 |     }
171 | }
172 | 


--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | ##############################################################################
  4 | ##
  5 | ##  Gradle start up script for UN*X
  6 | ##
  7 | ##############################################################################
  8 | 
  9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
 10 | DEFAULT_JVM_OPTS=""
 11 | 
 12 | APP_NAME="Gradle"
 13 | APP_BASE_NAME=`basename "$0"`
 14 | 
 15 | # Use the maximum available, or set MAX_FD != -1 to use that value.
 16 | MAX_FD="maximum"
 17 | 
  18 | warn ( ) {  # print all arguments as one line on stdout
  19 |     echo "$*"
  20 | }
 21 | 
  22 | die ( ) {  # print the message framed by blank lines, then exit with status 1
  23 |     echo
  24 |     echo "$*"
  25 |     echo
  26 |     exit 1
  27 | }
 28 | 
  29 | # OS detection flags. Each must hold the literal 'true' or 'false': they are later string-compared or run as a command (e.g. 'if $darwin').
  30 | cygwin=false
  31 | msys=false  # set below but not read again in this script
  32 | darwin=false
  33 | case "`uname`" in
  34 |   CYGWIN* )
  35 |     cygwin=true
  36 |     ;;
  37 |   Darwin* )
  38 |     darwin=true
  39 |     ;;
  40 |   MINGW* )
  41 |     msys=true
  42 |     ;;
  43 | esac
 44 | 
  45 | # Set APP_HOME to the physical directory that contains this script.
  46 | # $0 may be a symlink (or a chain of them), so follow links until PRG is no longer one.
  47 | PRG="$0"
  48 | # A relative link target is resolved against the directory of the link itself.
  49 | while [ -h "$PRG" ] ; do
  50 |     ls=`ls -ld "$PRG"`
  51 |     link=`expr "$ls" : '.*-> \(.*\)$'`  # text after '-> ' in the ls output, i.e. the link target
  52 |     if expr "$link" : '/.*' > /dev/null; then  # absolute target
  53 |         PRG="$link"
  54 |     else
  55 |         PRG=`dirname "$PRG"`"/$link"
  56 |     fi
  57 | done
  58 | SAVED="`pwd`"
  59 | cd "`dirname \"$PRG\"`/" >/dev/null
  60 | APP_HOME="`pwd -P`"  # -P: physical path with symlinks resolved
  61 | cd "$SAVED" >/dev/null
 62 | 
  63 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar  # the wrapper jar below the script's directory
  64 | 
  65 | # Determine the Java command to use to start the JVM: prefer JAVA_HOME, otherwise rely on 'java' from the PATH.
  66 | if [ -n "$JAVA_HOME" ] ; then
  67 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
  68 |         # IBM's JDK on AIX uses strange locations for the executables
  69 |         JAVACMD="$JAVA_HOME/jre/sh/java"
  70 |     else
  71 |         JAVACMD="$JAVA_HOME/bin/java"
  72 |     fi
  73 |     if [ ! -x "$JAVACMD" ] ; then  # JAVA_HOME is set but contains no executable java
  74 |         die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
  75 | 
  76 | Please set the JAVA_HOME variable in your environment to match the
  77 | location of your Java installation."
  78 |     fi
  79 | else  # JAVA_HOME is unset or empty
  80 |     JAVACMD="java"
  81 |     which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
  82 | 
  83 | Please set the JAVA_HOME variable in your environment to match the
  84 | location of your Java installation."
  85 | fi
 86 | 
  87 | # Raise the file-descriptor limit if we can (skipped on Cygwin and Darwin).
  88 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
  89 |     MAX_FD_LIMIT=`ulimit -H -n`  # hard limit
  90 |     if [ $? -eq 0 ] ; then
  91 |         if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
  92 |             MAX_FD="$MAX_FD_LIMIT"
  93 |         fi
  94 |         ulimit -n $MAX_FD
  95 |         if [ $? -ne 0 ] ; then
  96 |             warn "Could not set maximum file descriptor limit: $MAX_FD"
  97 |         fi
  98 |     else
  99 |         warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
 100 |     fi
 101 | fi
 102 | 
 103 | # For Darwin, add options to specify how the application appears in the dock.
 104 | if $darwin; then  # runs the literal command 'true' or 'false'
 105 |     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""  # escaped quotes are resolved by the later 'eval splitJvmOpts'
 106 | fi
107 | 
 108 | # For Cygwin, switch paths to Windows format before running java
 109 | if $cygwin ; then
 110 |     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
 111 |     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
 112 |     JAVACMD=`cygpath --unix "$JAVACMD"`
 113 | 
 114 |     # Build a regex matching arguments that start with one of the directories directly below /; those are converted via cygpath
 115 |     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
 116 |     SEP=""
 117 |     for dir in $ROOTDIRSRAW ; do
 118 |         ROOTDIRS="$ROOTDIRS$SEP$dir"
 119 |         SEP="|"
 120 |     done
 121 |     OURCYGPATTERN="(^($ROOTDIRS))"
 122 |     # Add a user-defined pattern (GRADLE_CYGPATTERN) as an alternative to the generated one
 123 |     if [ "$GRADLE_CYGPATTERN" != "" ] ; then
 124 |         OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
 125 |     fi
 126 |     # Now convert the arguments - kludge to limit ourselves to /bin/sh
 127 |     i=0
 128 |     for arg in "$@" ; do
 129 |         CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
 130 |         CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Non-zero when the argument starts with '-', i.e. is an option
 131 | 
 132 |         if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Convert only non-option arguments that match the pattern
 133 |             eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
 134 |         else
 135 |             eval `echo args$i`="\"$arg\""
 136 |         fi
 137 |         i=$((i+1))
 138 |     done
 139 |     case $i in  # re-install the converted values as "$@"; with more than nine arguments no branch matches and "$@" stays as it was
 140 |         (0) set -- ;;
 141 |         (1) set -- "$args0" ;;
 142 |         (2) set -- "$args0" "$args1" ;;
 143 |         (3) set -- "$args0" "$args1" "$args2" ;;
 144 |         (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
 145 |         (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
 146 |         (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
 147 |         (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
 148 |         (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
 149 |         (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
 150 |     esac
 151 | fi
152 | 
 153 | # Store the arguments in the global array JVM_OPTS. Called through eval (below) so that DEFAULT_JVM_OPTS, JAVA_OPTS and GRADLE_OPTS are split following the shell quoting and substitution rules.
 154 | function splitJvmOpts() {
 155 |     JVM_OPTS=("$@")
 156 | }
157 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
158 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
159 | 
160 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
161 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/SpecUtils.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy;
  2 | 
  3 | import java.util.Comparator;
  4 | import java.util.Objects;
  5 | import java.util.Optional;
  6 | import java.util.Set;
  7 | import java.util.function.BiConsumer;
  8 | import java.util.function.BinaryOperator;
  9 | import java.util.function.Function;
 10 | import java.util.function.Supplier;
 11 | import java.util.stream.Collector;
 12 | import java.util.stream.Collectors;
 13 | 
 14 | public class SpecUtils {
 15 |     // ProjectColumnSpec
 16 | 
 17 |     public static ProjectColumnSpec col(String columnName) {
 18 |         return new ProjectColumnSpecBuilder().as(columnName);
 19 |     }
 20 | 
 21 |     public static ProjectColumnSpecBuilder colMap(String columnRef) {
 22 |         return new ProjectColumnSpecBuilder<Object>(null, (DataRow row) -> row.get(columnRef));
 23 |     }
 24 | 
 25 |     public static <T,R> ProjectColumnSpecBuilder colMap(String columnRef, Class<T> type, Function<R, T> mapper) {
 26 |         return new ProjectColumnSpecBuilder<T>(type, (DataRow row) -> mapper.apply((R)row.get(columnRef)));
 27 |     }
 28 | 
 29 |     public static class ProjectColumnSpecBuilder<T> {
 30 |         private final Class<T> type;
 31 |         private final Function<DataRow, T> mapper;
 32 | 
 33 |         public ProjectColumnSpecBuilder() {
 34 |             this(null, null);
 35 |         }
 36 | 
 37 |         public ProjectColumnSpecBuilder(Class<T> type, Function<DataRow, T> mapper) {
 38 |             this.type = type;
 39 |             this.mapper = mapper;
 40 |         }
 41 | 
 42 |         public ProjectColumnSpec<T> as(String column) {
 43 |             return new ProjectColumnSpec<>(column, type, mapper);
 44 |         }
 45 |     }
 46 | 
 47 |     // AggregateColumnSpec
 48 |     public static AggregateColumnSpecBuilder sumLong(String columnRef) {
 49 |         return new AggregateColumnSpecBuilder(
 50 |                 Long.class,
 51 |                 Collectors.summingLong((DataRow row) -> row.getLong(columnRef)));
 52 |     }
 53 | 
 54 |     public static AggregateColumnSpecBuilder sumDouble(String columnRef) {
 55 |         return new AggregateColumnSpecBuilder(
 56 |                 Double.class,
 57 |                 Collectors.summingDouble((DataRow row) -> row.getDouble(columnRef)));
 58 |     }
 59 | 
 60 |     public static AggregateColumnSpecBuilder avgLong(String columnRef) {
 61 |         return new AggregateColumnSpecBuilder(
 62 |                 Double.class,
 63 |                 Collectors.averagingLong((DataRow row) -> row.getLong(columnRef)));
 64 |     }
 65 | 
 66 |     public static AggregateColumnSpecBuilder avgDouble(String columnRef) {
 67 |         return new AggregateColumnSpecBuilder(
 68 |                 Double.class,
 69 |                 Collectors.averagingDouble((DataRow row) -> row.getDouble(columnRef)));
 70 |     }
 71 | 
 72 |     public static AggregateColumnSpecBuilder count() {
 73 |         return new AggregateColumnSpecBuilder(
 74 |                 Long.class,
 75 |                 Collectors.counting());
 76 |     }
 77 | 
 78 |     public static AggregateColumnSpecBuilder count(String columnRef) {
 79 |         return new AggregateColumnSpecBuilder(
 80 |                 Long.class,
 81 |                 Collectors.summingLong((DataRow row) -> row.get(columnRef) != null ? 1 : 0));
 82 |     }
 83 | 
 84 |     public static AggregateColumnSpecBuilder min(String columnRef) {
 85 |         Function<DataRow, ?> mapper = row -> row.get(columnRef);
 86 |         Comparator comparator = Comparator.naturalOrder();
 87 |         Collector<?,?,Optional<?>> collector = Collectors
 88 |                 .<DataRow,Object,Object,Object>mapping(mapper, Collectors.minBy(comparator));
 89 |         Collector collector2 = Collectors.collectingAndThen(collector, (opt) -> opt.orElse(null));
 90 | 
 91 |         return new AggregateColumnSpecBuilder(columnRef,collector2);
 92 |     }
 93 | 
 94 |     public static AggregateColumnSpecBuilder max(String columnRef) {
 95 |         Function<DataRow, ?> mapper = row -> row.get(columnRef);
 96 |         Comparator comparator = Comparator.naturalOrder();
 97 |         Collector<?,?,Optional<?>> collector = Collectors
 98 |                 .<DataRow,Object,Object,Object>mapping(mapper, Collectors.maxBy(comparator));
 99 |         Collector collector2 = Collectors.collectingAndThen(collector, (opt) -> opt.orElse(null));
100 | 
101 |         return new AggregateColumnSpecBuilder(columnRef,collector2);
102 | 
103 |     }
104 | 
105 |     public static <T, A, R> AggregateColumnSpecBuilder aggreMap(String columnRef, Class<R> type, Collector<T,A,R> collector) {
106 |         Function<DataRow, T> mapper = row -> (T)row.get(columnRef);
107 |         Collector<DataRow, ?, R> newCollector = Collectors.mapping(mapper, collector);
108 |         return new AggregateColumnSpecBuilder(type,newCollector);
109 |     }
110 | 
111 | 
112 |     public static class AggregateColumnSpecBuilder<T> {
113 |         private Class<T> type;
114 |         private String typeFromColumn;
115 |         private Collector<DataRow, ?, T> collector;
116 | 
117 |         public AggregateColumnSpecBuilder(Class<T> type, Collector<DataRow, ?, T> collector) {
118 |             this.type = type;
119 |             this.collector = collector;
120 |         }
121 | 
122 |         public AggregateColumnSpecBuilder(String typeFromColumn, Collector<DataRow, ?, T> collector) {
123 |             this.typeFromColumn = typeFromColumn;
124 |             this.collector = collector;
125 |         }
126 | 
127 |         public AggregateColumnSpec<T> as(String column) {
128 |             if (type != null) {
129 |                 return new AggregateColumnSpec<>(column, type, collector);
130 |             }
131 | 
132 |             if (typeFromColumn != null) {
133 |                 return new AggregateColumnSpec<T>(column, typeFromColumn, collector);
134 |             }
135 | 
136 |             throw new IllegalStateException("type and typeFromColumn not defined");
137 |         }
138 |     }
139 | 
140 |     // SortSpec
141 |     public static SortSpec asc(String name) {
142 |         return new SortSpec(name, SortSpec.Order.ASC);
143 |     }
144 | 
145 |     public static SortSpec desc(String name) {
146 |         return new SortSpec(name, SortSpec.Order.DESC);
147 |     }
148 | 
149 | 
150 | }
151 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/AggregateDataFrame.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy.dataframes;
  2 | 
  3 | import io.tenmax.poppy.AggregateColumnSpec;
  4 | import io.tenmax.poppy.DataColumn;
  5 | import io.tenmax.poppy.DataRow;
  6 | 
  7 | import java.util.*;
  8 | import java.util.concurrent.CompletableFuture;
  9 | import java.util.concurrent.ExecutorService;
 10 | import java.util.concurrent.Executors;
 11 | import java.util.function.Function;
 12 | 
 13 | public class AggregateDataFrame extends  BaseDataFrame{
 14 |     private final AggregateColumnSpec[] specs;
 15 |     private final BaseDataFrame parent;
 16 |     private final HashMap<String, AggregateColumnSpec> specsMap= new HashMap<>();
 17 |     private final int dimSize;
 18 | 
 19 |     public AggregateDataFrame(BaseDataFrame parent, AggregateColumnSpec[] specs) {
 20 |         super(new ExecutionContext(), columnsFromSpec(parent, specs));
 21 |         this.parent = parent;
 22 |         this.specs = specs;
 23 |         this.dimSize = parent.groupedColumns.length;
 24 |         this.groupedColumns = parent.groupedColumns;
 25 | 
 26 |         for (AggregateColumnSpec spec: specs) {
 27 |             specsMap.put(spec.getColumn(), spec);
 28 |         }
 29 |     }
 30 | 
 31 |     private static DataColumn[] columnsFromSpec(BaseDataFrame parent, AggregateColumnSpec[] specs) {
 32 |         DataColumn[] dataColumns = new DataColumn[parent.groupedColumns.length + specs.length];
 33 |         int i = 0;
 34 | 
 35 |         for (DataColumn dataColumn : parent.groupedColumns) {
 36 |             dataColumns[i++] = new DataColumn(dataColumn.getName(), dataColumn.getType());
 37 |         }
 38 | 
 39 |         for (AggregateColumnSpec spec : specs) {
 40 |             if (spec.getType() != null) {
 41 |                 dataColumns[i++] = new DataColumn(spec.getColumn(), spec.getType());
 42 |             } else {
 43 |                 dataColumns[i++] = new DataColumn(spec.getColumn(), parent.getColumn(spec.getTypeFromColumn()).getType());
 44 |             }
 45 |         }
 46 | 
 47 |         return  dataColumns;
 48 |     }
 49 | 
 50 |     @Override
 51 |     public int getPartitionCount() {
 52 |         return 1;
 53 |     }
 54 | 
 55 |     @Override
 56 |     public Iterator<DataRow> getPartition(int index) {
 57 |         int count = parent.getPartitionCount();
 58 |         HashMap<List, List> result = new HashMap<>();
 59 | 
 60 | 
 61 |         if (parent.context.getNumThreads() == 1) {
 62 |             // sequatial
 63 |             for (int i = 0; i < count; i++) {
 64 |                 HashMap<List, List> resultPartial = accumulate(parent.getPartition(i));
 65 |                 combine(result, resultPartial);
 66 |             }
 67 |         } else {
 68 |             // parallel
 69 |             ExecutorService executorService = Executors.newFixedThreadPool(parent.context.getNumThreads());
 70 |             ArrayList<CompletableFuture<Void>> futures = new ArrayList<>();
 71 | 
 72 |             for (int i = 0; i < count; i++) {
 73 | 
 74 |                 final int fi = i;
 75 | 
 76 |                 CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
 77 |                     try {
 78 |                         HashMap<List, List> resultPartial = accumulate(parent.getPartition(fi));
 79 |                         synchronized (result) {
 80 |                             combine(result, resultPartial);
 81 |                         }
 82 |                     } catch (Exception e) {
 83 |                         throw e;
 84 |                     }
 85 |                 }, executorService);
 86 | 
 87 |                 futures.add(future);
 88 |             }
 89 | 
 90 |             try {
 91 |                 CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
 92 |             } finally {
 93 |                 // shutdown the executor while all tasks complete
 94 |                 executorService.shutdown();
 95 |             }
 96 | 
 97 |         }
 98 | 
 99 |         HashMap<List, List> finalResult = finish(result);
100 |         return new AggregateIterator(finalResult);
101 |     }
102 | 
103 | 
104 |     HashMap<List, List> accumulate(Iterator<DataRow> iterator) {
105 |         HashMap<List, List> result = new HashMap<>();
106 | 
107 |         while(iterator.hasNext()) {
108 |             DataRow row = iterator.next();
109 | 
110 |             List dims  = new ArrayList<>();
111 |             for (DataColumn gc : parent.groupedColumns) {
112 |                 dims.add(row.get(gc.getName()));
113 |             }
114 | 
115 |             List accus;          // accumulators
116 | 
117 |             if (result.containsKey(dims)) {
118 |                 accus = result.get(dims);
119 |             } else {
120 |                 accus = new ArrayList(specs.length);
121 |                 for (int i = 0; i < specs.length; i++) {
122 |                     accus.add(specs[i].getCollector().supplier().get());
123 |                 }
124 |                 result.put(dims, accus);
125 |             }
126 | 
127 |             // aggregate
128 |             for (int i=0; i<specs.length; i++) {
129 |                 specs[i].getCollector().accumulator().accept(accus.get(i), row);
130 |             }
131 |         }
132 | 
133 |         return result;
134 |     }
135 | 
136 |     void combine(HashMap<List, List> result1, HashMap<List, List> result2) {
137 | 
138 |         result2.forEach((dims, accus2) -> {
139 |             if (result1.containsKey(dims)) {
140 |                 List accus1 = result1.get(dims);
141 |                 for (int i=0; i<specs.length; i++) {
142 |                     Object accu = specs[i].getCollector().combiner().apply(accus1.get(i), accus2.get(i));
143 |                     accus1.set(i, accu);
144 |                 }
145 |             } else {
146 |                 result1.put(dims,accus2);
147 |             }
148 |         });
149 |     }
150 | 
151 |     HashMap<List, List> finish(HashMap<List, List> result) {
152 |         final HashMap<List, List> finalResult = new HashMap<>();
153 | 
154 |         result.forEach((dims, accus) -> {
155 | 
156 |             List values = new ArrayList(specs.length);
157 |             for (int i=0; i<specs.length; i++) {
158 |                 Function finisher = specs[i].getCollector().finisher();
159 |                 if (finisher != null) {
160 |                     values.add(finisher.apply(accus.get(i)));
161 |                 } else {
162 |                     values.add(accus.get(i));
163 |                 }
164 |             }
165 |             finalResult.put(dims,values);
166 |         });
167 | 
168 |         return finalResult;
169 |     }
170 | 
171 |     class AggregateIterator implements Iterator<DataRow> {
172 |         private Iterator<Map.Entry<List, List>> iterator;
173 | 
174 |         AggregateIterator(HashMap<List, List> result) {
175 |             this.iterator = result.entrySet().iterator();
176 |         }
177 | 
178 |         @Override
179 |         public boolean hasNext() {
180 |             return iterator.hasNext();
181 |         }
182 | 
183 |         @Override
184 |         public DataRow next() {
185 |             return new AggregateDataRow(iterator.next());
186 |         }
187 |     }
188 | 
189 |     class AggregateDataRow extends BaseDataRow{
190 |         private Map.Entry<List, List> entry;
191 | 
192 |         AggregateDataRow(Map.Entry<List, List> entry) {
193 |             this.entry = entry;
194 |         }
195 | 
196 |         @Override
197 |         public Object get(int index) {
198 |             if (index < dimSize) {
199 |                 return entry.getKey().get(index);
200 |             } else {
201 |                 return entry.getValue().get(index - dimSize);
202 |             }
203 |         }
204 |     }
205 | }
206 | 


--------------------------------------------------------------------------------
/src/test/java/io/tenmax/poppy/DataFrameTest.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy;
  2 | 
  3 | import junit.framework.TestCase;
  4 | import org.junit.Before;
  5 | import org.junit.Test;
  6 | 
  7 | import java.util.ArrayList;
  8 | import java.util.Iterator;
  9 | import java.util.List;
 10 | import java.util.Map;
 11 | import java.util.stream.Collector;
 12 | import java.util.stream.Collectors;
 13 | 
 14 | import static io.tenmax.poppy.SpecUtils.*;
 15 | import static org.junit.Assert.assertEquals;
 16 | 
 17 | public class DataFrameTest {
 18 | 
 19 |     private ArrayList<Student> list;
 20 | 
 21 |     @Before
 22 |     public void setUp() throws Exception {
 23 | 
 24 |         list = new ArrayList<>();
 25 |         list.add(new Student(1, "pop", 5,2,170,60));
 26 |         list.add(new Student(2, "foo", 5,3,175,70));
 27 |         list.add(new Student(3, "bar", 5,4,168,80));
 28 |         list.add(new Student(4, null, 5,4,160,60));
 29 |     }
 30 | 
 31 |     @Test
 32 |     public void testBasic() throws Exception {
 33 |         Iterator<DataRow> it = DataFrame
 34 |         .from(list, Student.class)
 35 |         .iterator();
 36 | 
 37 |         assertEquals("pop", it.next().getString("name"));
 38 |         assertEquals(5, it.next().getInteger("grade"));
 39 |         assertEquals(168, it.next().getInteger("height"));
 40 |         assertEquals(60, it.next().getInteger("weight"));
 41 |     }
 42 | 
 43 |     @Test
 44 |     public void testProject() throws Exception {
 45 |         Iterator<DataRow> it = DataFrame
 46 |         .from(list, Student.class)
 47 |         .project("name", "weight", "height")
 48 |         .iterator();
 49 | 
 50 |         assertEquals("pop", it.next().getString(0));
 51 |         assertEquals(70, it.next().getInteger(1));
 52 |         assertEquals(168, it.next().getInteger(2));
 53 |         assertEquals(160, it.next().getInteger("height"));
 54 |     }
 55 | 
 56 |     @Test
 57 |     public void testProject2() throws Exception {
 58 |         Iterator<DataRow> it = DataFrame
 59 |         .from(list, Student.class)
 60 |         .project(
 61 |             col("name"),
 62 |             colMap("weight").as("w"),
 63 |             colMap("height", Float.class, (Integer height) -> (height / 10f)).as("h"))
 64 |         .iterator();
 65 | 
 66 |         assertEquals("pop", it.next().getString(0));
 67 |         assertEquals(70, it.next().getInteger("w"));
 68 |         assertEquals(16.8f, it.next().getFloat("h"), 0.1);
 69 |         assertEquals(16.0f, it.next().getFloat(2), 0.1);
 70 |     }
 71 | 
 72 |     @Test
 73 |     public void testFilter() throws Exception {
 74 |         Iterator<DataRow> it = DataFrame
 75 |         .from(list, Student.class)
 76 |         .filter(row -> row.getInteger("height") >= 170)
 77 |         .project("name", "weight", "height")
 78 |         .iterator();
 79 | 
 80 |         assertEquals("pop", it.next().getString(0));
 81 |         assertEquals(175, it.next().getInteger(2));
 82 |         assertEquals(false, it.hasNext());
 83 |     }
 84 | 
 85 |     @Test
 86 |     public void testAggre() throws Exception {
 87 |         // Aggregation without a groupby; only the first result row is inspected.
 88 |         Iterator<DataRow> it = DataFrame.from(list, Student.class)
 89 |         .aggregate(
 90 |                 sumLong("height").as("sum"),
 91 |                 avgLong("height").as("avg"),
 92 |                 min("height").as("min"),
 93 |                 max("height").as("max"),
 94 |                 count().as("count"),
 95 |                 aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi"))
 96 |         .iterator();
 97 | 
 98 |         DataRow row = it.next();
 99 |         // assertEquals takes (expected, actual[, delta]); literal first keeps failure messages right.
100 |         assertEquals(673, row.getLong("sum"));
101 |         assertEquals(168.25, row.getDouble("avg"), 0.1);
102 |         assertEquals(160, row.getInteger("min"));
103 |         assertEquals(175, row.getInteger("max"));
104 |         assertEquals(4, row.getLong("count"));
105 |         assertEquals(270, row.getInteger("wi"));
106 |     }
107 | 
108 |     @Test
109 |     public void testCountWithNull() throws Exception {
110 |         // count() versus count("name"): the expected totals below differ by one (4 vs 3).
111 |         Iterator<DataRow> it = DataFrame.from(list, Student.class)
112 |         .aggregate(
113 |                 count().as("count"),
114 |                 count("name").as("countName"))
115 |         .iterator();
116 | 
117 |         DataRow row = it.next();
118 |         // assertEquals takes (expected, actual); literal first keeps failure messages right.
119 |         assertEquals(4, row.getLong("count"));
120 |         assertEquals(3, row.getLong("countName"));
121 |     }
122 | 
123 |     @Test
124 |     public void testGroupBy() throws Exception {
125 |         // Average weight and height per (grade, room), sorted by the grouping columns.
126 |         Iterator<DataRow> groups = DataFrame.from(list, Student.class)
127 |                 .groupby("grade", "room")
128 |                 .aggregate(
129 |                         avgLong("weight").as("weight"),
130 |                         avgLong("height").as("height"))
131 |                 .sort("grade", "room")
132 |                 .iterator();
133 | 
134 |         assertEquals(2, groups.next().getInteger("room"));
135 |         assertEquals(70.0, groups.next().getDouble("weight"), 0.1);
136 |         assertEquals(164.0, groups.next().getDouble("height"), 0.1);
137 |     }
138 | 
139 |     @Test
140 |     public void testSort() throws Exception {
141 |         // Sort by weight, then height, using the String... overload.
142 |         Iterator<DataRow> rows = DataFrame.from(list, Student.class)
143 |                 .sort("weight", "height")
144 |                 .iterator();
145 | 
146 |         // Student ids in the order the sorted frame is expected to yield them.
147 |         for (int expectedId : new int[] {4, 1, 2, 3}) {
148 |             assertEquals(expectedId, rows.next().getInteger("studentId"));
149 |         }
150 |     }
151 | 
152 |     @Test
153 |     public void testSort2() throws Exception {
154 |         // Sort with explicit specs: asc("weight") followed by desc("height").
155 |         Iterator<DataRow> rows = DataFrame.from(list, Student.class)
156 |                 .sort(asc("weight"), desc("height"))
157 |                 .iterator();
158 | 
159 |         // Student ids in the order the sorted frame is expected to yield them.
160 |         for (int expectedId : new int[] {1, 4, 2, 3}) {
161 |             assertEquals(expectedId, rows.next().getInteger("studentId"));
162 |         }
163 |     }
164 | 
165 |     @Test
166 |     public void testSortWithNull() throws Exception {
167 |         // Sort by asc("name"); presumably one fixture name is null (see the test name).
168 |         Iterator<DataRow> rows = DataFrame.from(list, Student.class)
169 |                 .sort(asc("name"))
170 |                 .iterator();
171 | 
172 |         // Student ids in the order the sorted frame is expected to yield them.
173 |         for (int expectedId : new int[] {4, 3, 2, 1}) {
174 |             assertEquals(expectedId, rows.next().getInteger("studentId"));
175 |         }
176 |     }
177 | 
178 |     @Test
179 |     public void testDistinct() throws Exception {
180 |         // Distinct over (grade, room): three rows are expected, then the iterator ends.
181 |         Iterator<DataRow> rows = DataFrame.from(list, Student.class)
182 |                 .distinct("grade", "room")
183 |                 .iterator();
184 | 
185 |         for (int expectedRoom : new int[] {2, 3, 4}) {
186 |             assertEquals(expectedRoom, rows.next().getInteger("room"));
187 |         }
188 |         assertEquals(false, rows.hasNext());
189 |     }
190 | 
191 |     @Test
192 |     public void testCache() throws Exception {
193 |         // cache() returns a frame that is queried here through size() and getRow(index).
194 |         RandomAccessDataFrame cached = DataFrame.from(list, Student.class).cache();
195 | 
196 |         assertEquals(4, cached.size());
197 |         assertEquals(2, cached.getRow(1).getInteger("studentId"));
198 |         assertEquals(80, cached.getRow(2).getInteger("weight"));
199 |         assertEquals(null, cached.getRow(3).getString("name"));
200 |     }
201 | 
202 |     @Test
203 |     public void testToList() throws Exception {
204 |         // Grouped averages converted to StudentReport beans via toList(Class); the
205 |         // assertions read the grade, room, weight and height properties of the bean.
206 |         List<StudentReport> reports = DataFrame.from(list, Student.class)
207 |                 .groupby("grade", "room")
208 |                 .aggregate(
209 |                         avgLong("weight").as("weight"),
210 |                         avgLong("height").as("height"))
211 |                 .sort("grade", "room")
212 |                 .toList(StudentReport.class);
213 | 
214 |         // Only the first report in sort order is checked.
215 |         StudentReport first = reports.get(0);
216 |         assertEquals(5, first.getGrade());
217 |         assertEquals(2, first.getRoom());
218 |         assertEquals(60.0, first.getWeight(),0.1);
219 |         assertEquals(170.0, first.getHeight(),0.1);
220 |     }
221 | 
222 |     @Test
223 |     public void testToMap() throws Exception {
224 |         // Grouped aggregates converted to a GradeRoom -> StudentReport map via toMap(Class, Class).
225 |         Map<GradeRoom, StudentReport> reportMap =
226 |         DataFrame
227 |         .from(list, Student.class)
228 |         .groupby("grade", "room")
229 |         .aggregate(
230 |                 avgLong("weight").as("weight"),
231 |                 sumLong("weight").as("weightTotal"),
232 |                 avgLong("height").as("height"),
233 |                 sumLong("height").as("heightTotal"))
234 |         .sort("grade", "room")
235 |         .toMap(GradeRoom.class, StudentReport.class);
236 | 
237 |         reportMap.forEach((key, value) -> {
238 |             System.out.println(key);
239 |             System.out.println(value);
240 |         });
241 |         // assertEquals takes (expected, actual, delta); literal first keeps failure messages right.
242 |         assertEquals(60.0, reportMap.get(new GradeRoom(5,2)).getWeight(), 0.1);
243 |         assertEquals(175.0, reportMap.get(new GradeRoom(5,3)).getHeight(), 0.1);
244 |         assertEquals(164.0, reportMap.get(new GradeRoom(5,4)).getHeight(), 0.1);
245 |     }
246 | 
247 | 
248 | }
249 | 


--------------------------------------------------------------------------------
/src/main/java/io/tenmax/poppy/dataframes/BaseDataFrame.java:
--------------------------------------------------------------------------------
  1 | package io.tenmax.poppy.dataframes;
  2 | 
  3 | import io.tenmax.poppy.*;
  4 | import io.tenmax.poppy.datasources.SimpleDataSource;
  5 | import io.tenmax.poppy.exceptions.ColumnNotFoundException;
  6 | import io.tenmax.poppy.exceptions.ReflectionException;
  7 | import io.tenmax.poppy.iterators.ParallelIterator;
  8 | import io.tenmax.poppy.iterators.SequantialIterator;
  9 | import org.apache.commons.beanutils.PropertyUtils;
 10 | 
 11 | import java.beans.PropertyDescriptor;
 12 | import java.lang.reflect.InvocationTargetException;
 13 | import java.util.*;
 14 | import java.util.concurrent.CompletableFuture;
 15 | import java.util.concurrent.ExecutorService;
 16 | import java.util.concurrent.Executors;
 17 | import java.util.concurrent.atomic.AtomicInteger;
 18 | import java.util.function.BiConsumer;
 19 | import java.util.function.Consumer;
 20 | import java.util.function.Predicate;
 21 | 
 22 | /**
 23 |  * Base implementation of {@link DataFrame}: holds the column schema and implements every
 24 |  * operation that can be expressed in terms of partitions.
 25 |  *
 26 |  * <p>Chaining operators wrap this frame in the matching frame class, while terminal
 27 |  * operations consume the partitions exposed by {@link #getPartitionCount()} and
 28 |  * {@link #getPartition(int)}, which subclasses must provide.
 29 |  */
 30 | abstract public class BaseDataFrame implements DataFrame{
 31 |     protected final ExecutionContext context;
 32 |     protected final DataColumn[] columns;
 33 |     /** Maps a column name to its index in {@link #columns}. */
 34 |     protected final HashMap<String, Integer> columnsMap;
 35 |     /** Columns chosen by the latest {@link #groupby(String...)} call; empty until then. */
 36 |     protected DataColumn[] groupedColumns = new DataColumn[0];
 37 | 
 38 |     /**
 39 |      * Stores the context and schema and indexes the columns by name.
 40 |      *
 41 |      * @param context execution settings consulted by this frame (thread count)
 42 |      * @param columns schema of this frame, in positional order
 43 |      */
 44 |     public BaseDataFrame(ExecutionContext context, DataColumn[] columns) {
 45 |         this.context = context;
 46 |         this.columns = columns;
 47 |         this.columnsMap = new HashMap<>();
 48 | 
 49 |         for (int i = 0; i < columns.length; i++) {
 50 |             columnsMap.put(columns[i].getName(), i);
 51 |         }
 52 |     }
 53 | 
 54 |     /** Returns the schema array held by this frame (not a copy). */
 55 |     @Override
 56 |     public DataColumn[] getColumns() {
 57 |         return columns;
 58 |     }
 59 | 
 60 |     /**
 61 |      * Looks up a column by name.
 62 |      *
 63 |      * @throws ColumnNotFoundException if this frame has no column with that name
 64 |      */
 65 |     @Override
 66 |     public DataColumn getColumn(String name) {
 67 |         Integer index = columnsMap.get(name);
 68 |         if (index == null) {
 69 |             throw new ColumnNotFoundException(name);
 70 |         }
 71 | 
 72 |         return columns[index];
 73 |     }
 74 | 
 75 |     /** Returns the column at the given zero-based position. */
 76 |     @Override
 77 |     public DataColumn getColumn(int index) {
 78 |         return columns[index];
 79 |     }
 80 | 
 81 |     /** Wraps this frame in a {@link ProjectDataFrame} over the named columns. */
 82 |     @Override
 83 |     public DataFrame project(String... columns) {
 84 |         return new ProjectDataFrame(this, columns);
 85 |     }
 86 | 
 87 |     /** Wraps this frame in a {@link ProjectDataFrame} built from column specs. */
 88 |     @Override
 89 |     public DataFrame project(ProjectColumnSpec... columns) {
 90 |         return new ProjectDataFrame(this, columns);
 91 |     }
 92 | 
 93 |     /**
 94 |      * Records the grouping columns on this frame and returns this same instance.
 95 |      *
 96 |      * @throws ColumnNotFoundException if a name is not a column of this frame
 97 |      */
 98 |     @Override
 99 |     public DataFrame groupby(String... groupedColumns) {
100 |         DataColumn[] gc = new DataColumn[groupedColumns.length];
101 | 
102 |         for (int i = 0; i < groupedColumns.length; i++) {
103 |             gc[i] = getColumn(groupedColumns[i]);
104 |         }
105 |         this.groupedColumns = gc;
106 | 
107 |         return this;
108 |     }
109 | 
110 |     /** Wraps this frame in an {@link AggregateDataFrame} built from the given specs. */
111 |     @Override
112 |     public DataFrame aggregate(AggregateColumnSpec... specs) {
113 |         return new AggregateDataFrame(this, specs);
114 |     }
115 | 
116 |     /** Sorts by the named columns, each with {@code SortSpec.Order.ASC}. */
117 |     @Override
118 |     public DataFrame sort(String... columns) {
119 |         SortSpec[] specs = new SortSpec[columns.length];
120 |         for (int i = 0; i < columns.length; i++) {
121 |             specs[i] = new SortSpec(columns[i], SortSpec.Order.ASC);
122 |         }
123 |         return sort(specs);
124 |     }
125 | 
126 |     /** Wraps this frame in a {@link SortDataFrame} using the given sort specs. */
127 |     @Override
128 |     public DataFrame sort(SortSpec... specs) {
129 |         return new SortDataFrame(this, specs);
130 |     }
131 | 
132 |     /** Wraps this frame in a {@link DistinctDataFrame} over the named columns. */
133 |     @Override
134 |     public DataFrame distinct(String... columns) {
135 |         return new DistinctDataFrame(this, columns);
136 |     }
137 | 
138 |     /** Wraps this frame in a {@link FilterDataFrame} using the given predicate. */
139 |     @Override
140 |     public DataFrame filter(Predicate<DataRow> predicate) {
141 |         return new FilterDataFrame(this, predicate);
142 |     }
143 | 
144 |     /** Wraps this frame in a {@link PeekDataFrame} using the given consumer. */
145 |     @Override
146 |     public DataFrame peek(Consumer<DataRow> consumer) {
147 |         return new PeekDataFrame(this, consumer);
148 |     }
149 | 
150 |     /** Sets the thread count on this frame's context and returns this same instance. */
151 |     @Override
152 |     public DataFrame parallel(int numThreads) {
153 |         context.setNumThreads(numThreads);
154 |         return this;
155 |     }
156 | 
157 |     /** Wraps this frame in a {@link CacheDataFrame}. */
158 |     @Override
159 |     public RandomAccessDataFrame cache() {
160 |         return new CacheDataFrame(this);
161 |     }
162 | 
163 |     /** Prints a tab-separated header line, then one tab-separated line per row, to stdout. */
164 |     @Override
165 |     public void print() {
166 |         for (DataColumn column : columns) {
167 |             System.out.printf("%s\t", column.getName());
168 |         }
169 |         System.out.println();
170 | 
171 |         forEach((row) -> {
172 |             for (Object o : row) {
173 |                 System.out.printf("%s\t", o);
174 |             }
175 | 
176 |             System.out.println();
177 |         });
178 |     }
179 | 
180 |     /**
181 |      * Returns a {@link ParallelIterator} when the context asks for more than one thread,
182 |      * otherwise a {@link SequantialIterator}.
183 |      */
184 |     @Override
185 |     public Iterator<DataRow> iterator() {
186 |         if (context.getNumThreads() > 1) {
187 |             return new ParallelIterator(this);
188 |         } else {
189 |             return new SequantialIterator(this);
190 |         }
191 |     }
192 | 
193 |     /**
194 |      * Feeds every row of every partition to {@code consumer} together with its partition
195 |      * index, returning once all rows have been consumed.
196 |      *
197 |      * <p>A single partition is consumed on the calling thread; otherwise the work is handed
198 |      * to {@link #forEachPartitionAsync(BiConsumer)} and joined.
199 |      */
200 |     public void forEachPartition(BiConsumer<Integer, DataRow> consumer) {
201 |         if (getPartitionCount() == 1) {
202 |             Iterator<DataRow> partition = getPartition(0);
203 |             while (partition.hasNext()) {
204 |                 consumer.accept(0, partition.next());
205 |             }
206 |         } else {
207 |             forEachPartitionAsync(consumer).join();
208 |         }
209 |     }
210 | 
211 |     /**
212 |      * Submits one task per partition to a fresh fixed-size pool; each task feeds the rows
213 |      * of its partition to {@code consumer}, which may therefore run on several pool threads.
214 |      *
215 |      * @return a future that completes when every partition task has finished
216 |      */
217 |     public CompletableFuture<Void> forEachPartitionAsync(BiConsumer<Integer, DataRow> consumer) {
218 |         int partitionCount = getPartitionCount();
219 |         CompletableFuture<?>[] futures = new CompletableFuture<?>[partitionCount];
220 |         ExecutorService executorService = newPartitionExecutor();
221 | 
222 |         try {
223 |             for (int i = 0; i < partitionCount; i++) {
224 |                 final int fi = i;
225 |                 futures[i] = CompletableFuture.runAsync(() -> {
226 |                     Iterator<DataRow> partition = getPartition(fi);
227 |                     while (partition.hasNext()) {
228 |                         consumer.accept(fi, partition.next());
229 |                     }
230 |                 }, executorService);
231 |             }
232 | 
233 |             return CompletableFuture.allOf(futures);
234 |         } finally {
235 |             // Submitted tasks still run after shutdown(); doing it in finally means the
236 |             // pool threads can exit even when submitting a task fails.
237 |             executorService.shutdown();
238 |         }
239 |     }
240 | 
241 |     /**
242 |      * Streams every partition into {@code sink}: {@code sinkStart} first, then per partition
243 |      * {@code partitionStart}, one {@code partitionRow} per row and {@code partitionComplete},
244 |      * and finally {@code sinkComplete} once all partitions are done.
245 |      *
246 |      * <p>Partitions are written by pool threads; this call blocks until they have finished.
247 |      */
248 |     @Override
249 |     public void to(DataSink sink) {
250 |         int partitionCount = getPartitionCount();
251 |         sink.sinkStart(partitionCount, columns);
252 | 
253 |         CompletableFuture<?>[] futures = new CompletableFuture<?>[partitionCount];
254 |         ExecutorService executorService = newPartitionExecutor();
255 |         CompletableFuture<Void> allPartitions;
256 | 
257 |         try {
258 |             for (int i = 0; i < partitionCount; i++) {
259 |                 final int fi = i;
260 |                 futures[i] = CompletableFuture.runAsync(() -> {
261 |                     sink.partitionStart(fi);
262 | 
263 |                     Iterator<DataRow> partition = getPartition(fi);
264 |                     while (partition.hasNext()) {
265 |                         sink.partitionRow(fi, partition.next());
266 |                     }
267 | 
268 |                     sink.partitionComplete(fi);
269 |                 }, executorService);
270 |             }
271 |             allPartitions = CompletableFuture.allOf(futures);
272 |         } finally {
273 |             // Same reasoning as forEachPartitionAsync: never leave the pool running.
274 |             executorService.shutdown();
275 |         }
276 | 
277 |         allPartitions.join();
278 |         sink.sinkComplete();
279 |     }
280 | 
281 |     /** Creates the pool used for partition work; never requests fewer than one thread. */
282 |     private ExecutorService newPartitionExecutor() {
283 |         // iterator() treats any thread count below two as sequential, so clamp here as well
284 |         // rather than let newFixedThreadPool reject a non-positive size.
285 |         return Executors.newFixedThreadPool(Math.max(1, context.getNumThreads()));
286 |     }
287 | 
288 |     /** Copies every row into a list of cell values ordered like {@link #getColumns()}. */
289 |     @Override
290 |     public List<List> toList() {
291 |         ArrayList<List> list = new ArrayList<>();
292 | 
293 |         for (DataRow row : this) {
294 |             List<Object> data = new ArrayList<>(columns.length);
295 |             for (int i = 0; i < columns.length; i++) {
296 |                 data.add(row.get(i));
297 |             }
298 |             list.add(data);
299 |         }
300 | 
301 |         return list;
302 |     }
303 | 
304 |     /**
305 |      * Converts every row into a new {@code clazz} instance, copying each column into the
306 |      * bean property of the same name when that property is writable.
307 |      *
308 |      * @throws ReflectionException if a bean cannot be instantiated or populated
309 |      */
310 |     @Override
311 |     public <T> List<T> toList(Class<T> clazz) {
312 |         ArrayList<T> list = new ArrayList<>();
313 | 
314 |         try {
315 |             for (DataRow row : this) {
316 |                 T t = clazz.newInstance();
317 | 
318 |                 for (DataColumn column : columns) {
319 |                     String columnName = column.getName();
320 |                     if (PropertyUtils.isWriteable(t, columnName)) {
321 |                         PropertyUtils.setProperty(t, columnName, row.get(columnName));
322 |                     }
323 |                 }
324 |                 list.add(t);
325 |             }
326 |         } catch (InstantiationException |
327 |                 IllegalAccessException |
328 |                 NoSuchMethodException  |
329 |                 InvocationTargetException e)
330 |         {
331 |             throw new ReflectionException(e);
332 |         }
333 | 
334 |         return list;
335 |     }
336 | 
337 |     /**
338 |      * Builds a map whose keys hold each row's grouped-column values and whose values hold
339 |      * the remaining cells, both in column order. A later row replaces an earlier equal key.
340 |      */
341 |     @Override
342 |     public Map<List, List> toMap() {
343 |         HashMap<List, List> map = new HashMap<>();
344 | 
345 |         Set<String> groupedNames = groupedColumnNames();
346 |         List<Integer> keyColumns = new ArrayList<>();
347 |         List<Integer> valueColumns = new ArrayList<>();
348 | 
349 |         for (int i = 0; i < columns.length; i++) {
350 |             if (groupedNames.contains(columns[i].getName())) {
351 |                 keyColumns.add(i);
352 |             } else {
353 |                 valueColumns.add(i);
354 |             }
355 |         }
356 | 
357 |         for (DataRow row : this) {
358 |             List<Object> key = new ArrayList<>(keyColumns.size());
359 |             List<Object> value = new ArrayList<>(valueColumns.size());
360 | 
361 |             for (int i : keyColumns) {
362 |                 key.add(row.get(i));
363 |             }
364 | 
365 |             for (int i : valueColumns) {
366 |                 value.add(row.get(i));
367 |             }
368 | 
369 |             map.put(key, value);
370 |         }
371 | 
372 |         return map;
373 |     }
374 | 
375 |     /**
376 |      * Builds a map of beans: each row yields a {@code keyClazz} instance populated from the
377 |      * grouped columns and a {@code valueClazz} instance populated from all columns, in both
378 |      * cases only for writable properties that share a column's name.
379 |      *
380 |      * @throws ReflectionException if a bean cannot be instantiated or populated
381 |      */
382 |     @Override
383 |     public <K, V> Map<K, V> toMap(Class<K> keyClazz, Class<V> valueClazz) {
384 |         HashMap<K, V> map = new HashMap<>();
385 | 
386 |         // Positions of the grouped columns within this frame's schema.
387 |         Set<String> groupedNames = groupedColumnNames();
388 |         List<Integer> keyColumns = new ArrayList<>();
389 |         for (int i = 0; i < columns.length; i++) {
390 |             if (groupedNames.contains(columns[i].getName())) {
391 |                 keyColumns.add(i);
392 |             }
393 |         }
394 | 
395 |         Set<String> keyProps = writablePropertyNames(keyClazz);
396 |         Set<String> valueProps = writablePropertyNames(valueClazz);
397 | 
398 |         try {
399 |             for (DataRow row : this) {
400 |                 K key = keyClazz.newInstance();
401 |                 V value = valueClazz.newInstance();
402 | 
403 |                 for (int i : keyColumns) {
404 |                     String columnName = columns[i].getName();
405 |                     if (keyProps.contains(columnName)) {
406 |                         PropertyUtils.setProperty(key, columnName, row.get(columnName));
407 |                     }
408 |                 }
409 | 
410 |                 for (DataColumn column : columns) {
411 |                     String columnName = column.getName();
412 |                     if (valueProps.contains(columnName)) {
413 |                         PropertyUtils.setProperty(value, columnName, row.get(columnName));
414 |                     }
415 |                 }
416 | 
417 |                 map.put(key, value);
418 |             }
419 |         } catch (InstantiationException |
420 |                 IllegalAccessException |
421 |                 NoSuchMethodException  |
422 |                 InvocationTargetException e)
423 |         {
424 |             throw new ReflectionException(e);
425 |         }
426 | 
427 |         return map;
428 |     }
429 | 
430 |     /** Collects the names of the columns recorded by {@link #groupby(String...)}. */
431 |     private Set<String> groupedColumnNames() {
432 |         Set<String> names = new HashSet<>();
433 |         for (DataColumn groupColumn : groupedColumns) {
434 |             names.add(groupColumn.getName());
435 |         }
436 |         return names;
437 |     }
438 | 
439 |     /** Collects the names of the bean properties of {@code clazz} that have a write method. */
440 |     private static Set<String> writablePropertyNames(Class<?> clazz) {
441 |         Set<String> names = new HashSet<>();
442 |         for (PropertyDescriptor prop : PropertyUtils.getPropertyDescriptors(clazz)) {
443 |             if (prop.getWriteMethod() != null) {
444 |                 names.add(prop.getName());
445 |             }
446 |         }
447 |         return names;
448 |     }
449 | 
450 |     /** Returns how many partitions this frame exposes through {@link #getPartition(int)}. */
451 |     abstract public int getPartitionCount();
452 | 
453 |     /** Returns the execution context this frame was constructed with. */
454 |     public ExecutionContext getContext() {
455 |         return context;
456 |     }
457 | 
458 |     /** Returns an iterator over the rows of the partition with the given index. */
459 |     abstract public Iterator<DataRow> getPartition(int index);
460 | 
461 |     /** Creates a frame over {@code source} by wrapping it in a {@link SimpleDataSource}. */
462 |     public static <T> DataFrame from(Iterable<T> source, Class<T> clazz) {
463 |         return new SourceDataFrame(new SimpleDataSource(clazz, source));
464 |     }
465 | 
466 |     /** Creates a frame that reads from the given data source. */
467 |     public static <T> DataFrame from(DataSource<T> source) {
468 |         return new SourceDataFrame(source);
469 |     }
470 | 
471 | 
472 |     /**
473 |      * Row base class bound to the enclosing frame's schema: access by name and
474 |      * {@link #toString()} are derived from the positional {@code get(int)} of the subclass.
475 |      */
476 |     abstract class BaseDataRow implements DataRow {
477 |         @Override
478 |         public DataColumn[] getColumns() {
479 |             return columns;
480 |         }
481 | 
482 |         /**
483 |          * Returns the cell of the named column.
484 |          *
485 |          * @throws ColumnNotFoundException if the frame has no column with that name
486 |          */
487 |         @Override
488 |         public Object get(String name) {
489 |             Integer index = columnsMap.get(name);
490 |             if (index == null) {
491 |                 // Unboxing a missing index would fail with a bare NullPointerException;
492 |                 // report the unknown column the same way getColumn(String) does.
493 |                 throw new ColumnNotFoundException(name);
494 |             }
495 |             return get(index.intValue());
496 |         }
497 | 
498 |         /** Renders the row as {@code name=value,} pairs in column order. */
499 |         @Override
500 |         public String toString() {
501 |             StringBuilder sb = new StringBuilder();
502 | 
503 |             for (int i = 0; i < columns.length; i++) {
504 |                 sb.append(columns[i].getName()).append('=').append(get(i)).append(',');
505 |             }
506 | 
507 |             return sb.toString();
508 |         }
509 |     }
510 | }
423 | 


--------------------------------------------------------------------------------