├── .gitignore ├── settings.gradle ├── release.sh ├── assets ├── executionContext.png ├── executionContext2.png ├── executionContext3.png └── executionContext4.png ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── src ├── main │ └── java │ │ └── io │ │ └── tenmax │ │ └── poppy │ │ ├── RandomAccessDataFrame.java │ │ ├── exceptions │ │ ├── ReflectionException.java │ │ ├── ColumnNotFoundException.java │ │ ├── ColumnNotSortableException.java │ │ └── DuplicatedColumnException.java │ │ ├── DataSource.java │ │ ├── DataSink.java │ │ ├── DataColumn.java │ │ ├── SortSpec.java │ │ ├── dataframes │ │ ├── ExecutionContext.java │ │ ├── PeekDataFrame.java │ │ ├── CacheDataFrame.java │ │ ├── SourceDataFrame.java │ │ ├── FilterDataFrame.java │ │ ├── SortDataFrame.java │ │ ├── DistinctDataFrame.java │ │ ├── ProjectDataFrame.java │ │ ├── AggregateDataFrame.java │ │ └── BaseDataFrame.java │ │ ├── datasources │ │ ├── SimpleDataSource.java │ │ └── ReflectionDataSource.java │ │ ├── ProjectColumnSpec.java │ │ ├── AggregateColumnSpec.java │ │ ├── datasinks │ │ └── DebugDataSink.java │ │ ├── iterators │ │ ├── SequantialIterator.java │ │ └── ParallelIterator.java │ │ ├── DataRow.java │ │ ├── DataFrame.java │ │ └── SpecUtils.java └── test │ └── java │ └── io │ └── tenmax │ └── poppy │ ├── StudentReport.java │ ├── GradeRoom.java │ ├── Student.java │ ├── DataFrameExceptionalTest.java │ ├── DataFrameParallelTest.java │ └── DataFrameTest.java ├── gradle.properties ├── gradlew.bat ├── README.md └── gradlew /.gitignore: -------------------------------------------------------------------------------- 1 | sftp-config.json 2 | slides.key 3 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'poppy' 2 | -------------------------------------------------------------------------------- /release.sh: 
-------------------------------------------------------------------------------- 1 | gradle clean build bintrayUpload -PdryRun=false 2 | -------------------------------------------------------------------------------- /assets/executionContext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext.png -------------------------------------------------------------------------------- /assets/executionContext2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext2.png -------------------------------------------------------------------------------- /assets/executionContext3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext3.png -------------------------------------------------------------------------------- /assets/executionContext4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tenmax/poppy/HEAD/assets/executionContext4.png -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tenmax/poppy/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/RandomAccessDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | public interface RandomAccessDataFrame extends DataFrame { 4 | int size(); 5 | 6 | DataRow getRow(int row); 7 | } 8 | -------------------------------------------------------------------------------- /gradle.properties: 
-------------------------------------------------------------------------------- 1 | LIBRARY_VERSION=0.1.8 2 | USER_ORG=tenmax 3 | MAVEN_GROUP=io.tenmax 4 | MAVEN_ARTIFACT=poppy 5 | DESCRIPTION=A dataframe library for java 6 | WEBSITE=https://github.com/tenmax/poppy 7 | BINTRAY_REPO=io.tenmax 8 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/exceptions/ReflectionException.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.exceptions; 2 | 3 | public class ReflectionException extends RuntimeException{ 4 | public ReflectionException(Throwable cause) { 5 | super(cause); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Thu Apr 28 08:36:07 CST 2016 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.12-bin.zip 7 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/exceptions/ColumnNotFoundException.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.exceptions; 2 | 3 | public class ColumnNotFoundException extends RuntimeException{ 4 | 5 | public ColumnNotFoundException(String column) { 6 | super("Column not found: " + column); 7 | } 8 | 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/DataSource.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import java.util.Iterator; 4 | 5 | public interface DataSource { 6 | 7 | int getPartitionCount(); 8 
| 9 | Iterator getPartition(int index); 10 | 11 | DataColumn[] getColumns(); 12 | 13 | Object get(T data, String columnName); 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/exceptions/ColumnNotSortableException.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.exceptions; 2 | 3 | /** 4 | * Created by popcorny on 4/20/16. 5 | */ 6 | public class ColumnNotSortableException extends RuntimeException{ 7 | public ColumnNotSortableException(String column) { 8 | super("Column not sortable: " + column); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/exceptions/DuplicatedColumnException.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.exceptions; 2 | 3 | /** 4 | * Created by popcorny on 4/20/16. 5 | */ 6 | public class DuplicatedColumnException extends RuntimeException{ 7 | 8 | public DuplicatedColumnException(String column) { 9 | super("Column Duplicated : " + column); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/DataSink.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | public interface DataSink { 4 | 5 | default void sinkStart(int partitionCount, DataColumn[] columns){} 6 | 7 | default void sinkComplete(){} 8 | 9 | default void partitionStart(int partition){} 10 | 11 | default void partitionRow(int partition, DataRow row){} 12 | 13 | default void partitionComplete(int partition){} 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/DataColumn.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 
| 3 | public class DataColumn { 4 | private final String name; 5 | private final Class type; 6 | 7 | public DataColumn(String name, Class type) { 8 | this.name = name; 9 | this.type = type; 10 | } 11 | 12 | public String getName() { 13 | return name; 14 | } 15 | 16 | public Class getType() { 17 | return type; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/SortSpec.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | public class SortSpec { 4 | public enum Order { 5 | ASC, DESC 6 | }; 7 | 8 | private final String column; 9 | private final Order order; 10 | 11 | public SortSpec(String column, Order order) { 12 | this.column = column; 13 | this.order = order; 14 | } 15 | 16 | public String getColumn() { 17 | return column; 18 | } 19 | 20 | public Order getOrder() { 21 | return order; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/ExecutionContext.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | public class ExecutionContext { 4 | private boolean closed; 5 | private int numThreads = 1; 6 | 7 | public int getNumThreads() { 8 | return numThreads; 9 | } 10 | 11 | public void setNumThreads(int numThreads) { 12 | if(numThreads <= 0) { 13 | throw new IllegalArgumentException("numThreads should be greater than 0"); 14 | } 15 | 16 | this.numThreads = numThreads; 17 | } 18 | 19 | public void close() { 20 | this.closed = true; 21 | } 22 | 23 | public boolean isClosed() { 24 | return closed; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/datasources/SimpleDataSource.java: -------------------------------------------------------------------------------- 1 | package 
io.tenmax.poppy.datasources; 2 | 3 | import java.util.Iterator; 4 | 5 | /** 6 | * SimpleDataSource use the java reflection to define the columns. And using 7 | * the Java Bean conversion to get the value of a column. 8 | * 9 | * @param The source data type. 10 | */ 11 | public class SimpleDataSource extends ReflectionDataSource { 12 | private final Iterable[] iterables; 13 | 14 | public SimpleDataSource(Class clazz, Iterable... iterables) { 15 | super(clazz); 16 | this.iterables = iterables; 17 | } 18 | 19 | @Override 20 | public int getPartitionCount() { 21 | return iterables.length; 22 | } 23 | 24 | @Override 25 | public Iterator getPartition(int index) { 26 | return iterables[index].iterator(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/ProjectColumnSpec.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import java.util.function.Function; 4 | 5 | public class ProjectColumnSpec { 6 | private final String column; 7 | private final Class type; 8 | private final Function mapper; 9 | 10 | public ProjectColumnSpec(String column, Class type, Function mapper) { 11 | this.column = column; 12 | this.type = type; 13 | this.mapper = mapper; 14 | } 15 | 16 | public ProjectColumnSpec(String column, String from, Class type, Function mapper) { 17 | this.column = column; 18 | this.type = type; 19 | 20 | if (mapper == null) { 21 | this.mapper = (row) -> (T)row.get(from); 22 | } else { 23 | this.mapper = (row) -> mapper.apply(row.get(from)); 24 | } 25 | } 26 | 27 | public String getColumn() { 28 | return column; 29 | } 30 | 31 | public Class getType() { 32 | return type; 33 | } 34 | 35 | public Function getMapper() { 36 | return mapper; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/java/io/tenmax/poppy/StudentReport.java: 
-------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | /** 4 | * Created by popcorny on 4/24/16. 5 | */ 6 | public class StudentReport { 7 | int grade; 8 | int room; 9 | double weight; 10 | double height; 11 | 12 | public int getGrade() { 13 | return grade; 14 | } 15 | 16 | public void setGrade(int grade) { 17 | this.grade = grade; 18 | } 19 | 20 | public int getRoom() { 21 | return room; 22 | } 23 | 24 | public void setRoom(int room) { 25 | this.room = room; 26 | } 27 | 28 | public double getWeight() { 29 | return weight; 30 | } 31 | 32 | public void setWeight(double weight) { 33 | this.weight = weight; 34 | } 35 | 36 | public double getHeight() { 37 | return height; 38 | } 39 | 40 | public void setHeight(double height) { 41 | this.height = height; 42 | } 43 | 44 | @Override 45 | public String toString() { 46 | return "StudentReport{" + 47 | "grade=" + grade + 48 | ", room=" + room + 49 | ", weight=" + weight + 50 | ", height=" + height + 51 | '}'; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/AggregateColumnSpec.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import java.util.stream.Collector; 4 | 5 | public class AggregateColumnSpec { 6 | private final String column; 7 | private final Class type; 8 | private final String typeFromColumn; 9 | private final Collector collector; 10 | 11 | public AggregateColumnSpec(String column, Class type, Collector collector) { 12 | this.column = column; 13 | this.type = type; 14 | this.typeFromColumn = null; 15 | this.collector = collector; 16 | } 17 | 18 | public AggregateColumnSpec(String column, String typeFromColumn, Collector collector) { 19 | this.column = column; 20 | this.type = null; 21 | this.typeFromColumn = typeFromColumn; 22 | this.collector = collector; 23 | } 24 | 25 | public String getColumn() 
{ 26 | return column; 27 | } 28 | 29 | public Class getType() { 30 | return type; 31 | } 32 | 33 | public String getTypeFromColumn() { 34 | return typeFromColumn; 35 | } 36 | 37 | public Collector getCollector() { 38 | return collector; 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/datasinks/DebugDataSink.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.datasinks; 2 | 3 | import io.tenmax.poppy.DataColumn; 4 | import io.tenmax.poppy.DataRow; 5 | import io.tenmax.poppy.DataSink; 6 | 7 | public class DebugDataSink implements DataSink{ 8 | @Override 9 | public void sinkStart(int partitionCount, DataColumn[] columns) { 10 | System.out.printf("[sinkStart] partitionCount=%d\n", partitionCount); 11 | for (DataColumn column : columns) { 12 | System.out.printf("\t%s\t%s\n", column.getType().getName(),column.getName()); 13 | } 14 | } 15 | 16 | @Override 17 | public void sinkComplete() { 18 | System.out.printf("[sinkComplete]\n"); 19 | } 20 | 21 | @Override 22 | public void partitionStart(int partiton) { 23 | System.out.printf("[partitionStart] partition=%d\n", partiton); 24 | } 25 | 26 | @Override 27 | public void partitionRow(int partition, DataRow row) { 28 | System.out.printf("[partitionRow] partition=%d\n", partition); 29 | System.out.printf(" %s\n", row); 30 | } 31 | 32 | @Override 33 | public void partitionComplete(int partiton) { 34 | System.out.printf("[partitionComplete] partition=%d\n", partiton); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/iterators/SequantialIterator.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.iterators; 2 | 3 | import io.tenmax.poppy.DataRow; 4 | import io.tenmax.poppy.dataframes.BaseDataFrame; 5 | 6 | import java.util.Iterator; 7 | 8 | 
public class SequantialIterator implements Iterator { 9 | private final BaseDataFrame dataFrame; 10 | private int top; 11 | private int partitionCount; 12 | private Iterator iterator; 13 | 14 | public SequantialIterator(BaseDataFrame dataFrame) { 15 | this.dataFrame = dataFrame; 16 | this.partitionCount = dataFrame.getPartitionCount(); 17 | } 18 | 19 | @Override 20 | public boolean hasNext() { 21 | while (true) { 22 | if(dataFrame.getContext().isClosed()) { 23 | return false; 24 | } 25 | 26 | if(iterator != null && iterator.hasNext()) { 27 | return true; 28 | } 29 | 30 | if (top >= partitionCount) { 31 | return false; 32 | } 33 | iterator = dataFrame.getPartition(top++); 34 | } 35 | } 36 | 37 | @Override 38 | public DataRow next() { 39 | if (iterator == null) { 40 | if (!hasNext()) { 41 | return null; 42 | } 43 | } 44 | return iterator.next(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/java/io/tenmax/poppy/GradeRoom.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | /** 4 | * Created by popcorny on 4/28/16. 
5 | */ 6 | public class GradeRoom { 7 | int grade; 8 | int room; 9 | 10 | public GradeRoom() { 11 | } 12 | 13 | public GradeRoom(int grade, int room) { 14 | this.grade = grade; 15 | this.room = room; 16 | } 17 | 18 | public int getGrade() { 19 | return grade; 20 | } 21 | 22 | public void setGrade(int grade) { 23 | this.grade = grade; 24 | } 25 | 26 | public int getRoom() { 27 | return room; 28 | } 29 | 30 | public void setRoom(int room) { 31 | this.room = room; 32 | } 33 | 34 | @Override 35 | public boolean equals(Object o) { 36 | if (this == o) return true; 37 | if (o == null || getClass() != o.getClass()) return false; 38 | 39 | GradeRoom gradeRoom = (GradeRoom) o; 40 | 41 | if (grade != gradeRoom.grade) return false; 42 | if (room != gradeRoom.room) return false; 43 | 44 | return true; 45 | } 46 | 47 | @Override 48 | public int hashCode() { 49 | int result = grade; 50 | result = 31 * result + room; 51 | return result; 52 | } 53 | 54 | @Override 55 | public String toString() { 56 | return "GradeRoom{" + 57 | "grade=" + grade + 58 | ", room=" + room + 59 | '}'; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/PeekDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | import io.tenmax.poppy.DataRow; 4 | 5 | import java.util.Iterator; 6 | import java.util.function.Consumer; 7 | 8 | public class PeekDataFrame extends BaseDataFrame { 9 | private final BaseDataFrame parent; 10 | private final Consumer consumer; 11 | 12 | public PeekDataFrame(BaseDataFrame parent, Consumer consumer) { 13 | super(parent.context, parent.getColumns()); 14 | this.parent = parent; 15 | this.groupedColumns = parent.groupedColumns; 16 | this.consumer = consumer; 17 | } 18 | 19 | @Override 20 | public int getPartitionCount() { 21 | return parent.getPartitionCount(); 22 | } 23 | 24 | @Override 25 | public Iterator 
getPartition(int index) { 26 | return new PeekIterator(parent.getPartition(index)); 27 | } 28 | 29 | class PeekIterator implements Iterator { 30 | private Iterator wrapped; 31 | 32 | PeekIterator(Iterator wrapped) { 33 | this.wrapped = wrapped; 34 | } 35 | 36 | @Override 37 | public boolean hasNext() { 38 | return wrapped.hasNext(); 39 | } 40 | 41 | @Override 42 | public DataRow next() { 43 | DataRow row = wrapped.next(); 44 | consumer.accept(row); 45 | return row; 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/datasources/ReflectionDataSource.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.datasources; 2 | 3 | import io.tenmax.poppy.DataColumn; 4 | import io.tenmax.poppy.DataSource; 5 | import io.tenmax.poppy.exceptions.ColumnNotFoundException; 6 | import org.apache.commons.beanutils.PropertyUtils; 7 | 8 | import java.beans.PropertyDescriptor; 9 | import java.util.ArrayList; 10 | 11 | public abstract class ReflectionDataSource implements DataSource{ 12 | private final DataColumn[] columns; 13 | 14 | public ReflectionDataSource(Class clazz) { 15 | this.columns = schemaFromClass(clazz); 16 | } 17 | 18 | private static DataColumn[] schemaFromClass(Class clazz) { 19 | PropertyDescriptor[] props = PropertyUtils.getPropertyDescriptors(clazz); 20 | ArrayList columns = new ArrayList<>(); 21 | 22 | for (PropertyDescriptor prop : props) { 23 | if(prop.getName().equals("class")) { 24 | continue; 25 | } 26 | columns.add(new DataColumn(prop.getName(), prop.getPropertyType())); 27 | } 28 | 29 | return columns.toArray(new DataColumn[0]); 30 | } 31 | 32 | @Override 33 | public DataColumn[] getColumns() { 34 | return columns; 35 | } 36 | 37 | @Override 38 | public Object get(T data, String columnName) { 39 | try { 40 | return PropertyUtils.getProperty(data, columnName); 41 | } catch (Exception e) { 42 | throw new 
ColumnNotFoundException(columnName); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/CacheDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | import io.tenmax.poppy.DataColumn; 4 | import io.tenmax.poppy.DataRow; 5 | import io.tenmax.poppy.RandomAccessDataFrame; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Iterator; 9 | 10 | public class CacheDataFrame extends BaseDataFrame implements RandomAccessDataFrame { 11 | 12 | private final ArrayList rows = new ArrayList<>(); 13 | private final BaseDataFrame parent; 14 | 15 | public CacheDataFrame(BaseDataFrame parent) { 16 | super(new ExecutionContext(), parent.columns); 17 | this.parent = parent; 18 | this.groupedColumns = parent.groupedColumns; 19 | for (DataRow row : parent) { 20 | rows.add(new CacheDataRow(row)); 21 | } 22 | } 23 | 24 | @Override 25 | public int getPartitionCount() { 26 | return 1; 27 | } 28 | 29 | @Override 30 | public Iterator getPartition(int index) { 31 | return rows.iterator(); 32 | } 33 | 34 | @Override 35 | public int size() { 36 | return rows.size(); 37 | } 38 | 39 | @Override 40 | public DataRow getRow(int row) { 41 | return rows.get(row); 42 | } 43 | 44 | class CacheDataRow extends BaseDataRow { 45 | ArrayList value = new ArrayList(); 46 | 47 | CacheDataRow(DataRow row) { 48 | for (int i=0; iNUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/DataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import io.tenmax.poppy.dataframes.BaseDataFrame; 4 | 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.function.Predicate; 8 | import java.util.function.Consumer; 9 | 10 | /** 11 | * DataFrame is a sequence of schema-defined rows. 
The following 12 | * example illustrates how to use the {@link DataFrame}: 13 | * 14 | *
{@code
15 |  * List students = ...;
16 |  *
17 |  * DataFrame.from(students, Student.class)
18 |  *          .groupby("grade", "room")
19 |  *          .aggregate(
20 |  *              avgLong("weight").as("weight"),
21 |  *              avgLong("height").as("height"))
22 |  *          .sort("grade", "room")
23 |  *          .print();
24 |  * }
25 | * 26 | * Just like {@link java.util.stream.Stream}, it iterates through a 27 | * data source with as little memory as possible. This allows you to process 28 | * billions of rows with only constant memory. 29 | * 30 | * DataFrame provides the operations that SQL provides, for example {@link #project(ProjectColumnSpec[]) projection}, 31 | * {@link #filter(java.util.function.Predicate) filtering}, {@link #groupby(String...) grouping}, 32 | * {@link #aggregate(AggregateColumnSpec[]) aggregation}, and {@link #sort(SortSpec...) sorting}. These operations make it possible to 33 | * write your own SQL-like statements in your application. The above example is equivalent to 34 | * 35 | *
{@code
36 |  *     select
37 |  *         grade,
38 |  *         room,
39 |  *         avg(weight) as weight,
40 |  *         avg(height) as height
41 |  *     from Student
42 |  *     group by grade, room
43 |  *     order by grade, room
44 |  * }
45 | * 46 | */ 47 | public interface DataFrame extends Iterable{ 48 | 49 | DataColumn[] getColumns(); 50 | 51 | DataColumn getColumn(String name); 52 | 53 | DataColumn getColumn(int index); 54 | 55 | static DataFrame from(Iterable source, Class clazz) { 56 | return BaseDataFrame.from(source,clazz); 57 | } 58 | 59 | static DataFrame from(DataSource source) { 60 | return BaseDataFrame.from(source); 61 | } 62 | 63 | 64 | DataFrame project(String... columns); 65 | 66 | DataFrame project(ProjectColumnSpec... columns); 67 | 68 | DataFrame groupby(String... columns); 69 | 70 | DataFrame aggregate(AggregateColumnSpec... specs); 71 | 72 | DataFrame sort(String... columns); 73 | 74 | DataFrame sort(SortSpec... columns); 75 | 76 | DataFrame distinct(String... columns); 77 | 78 | DataFrame peek(Consumer consumer); 79 | 80 | DataFrame filter(Predicate predicate); 81 | 82 | DataFrame parallel(int numThreads); 83 | 84 | RandomAccessDataFrame cache(); 85 | 86 | void to(DataSink sink); 87 | 88 | List toList(); 89 | 90 | List toList(Class clazz); 91 | 92 | Map toMap(); 93 | 94 | Map toMap(Class keyClazz, Class valueClazz); 95 | 96 | void print(); 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/DistinctDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | import io.tenmax.poppy.DataColumn; 4 | import io.tenmax.poppy.DataRow; 5 | import io.tenmax.poppy.iterators.SequantialIterator; 6 | 7 | import java.util.*; 8 | 9 | public class DistinctDataFrame extends BaseDataFrame { 10 | private final BaseDataFrame parent; 11 | 12 | public DistinctDataFrame(BaseDataFrame parent, String[] distinctColumns) { 13 | super(new ExecutionContext(), columnsFromNames(parent, distinctColumns)); 14 | 15 | this.parent = parent; 16 | } 17 | 18 | private static DataColumn[] columnsFromNames(BaseDataFrame parent, String[] 
distinctColumns) { 19 | DataColumn[] dataColumns = new DataColumn[distinctColumns.length]; 20 | int i = 0; 21 | 22 | for (String columnName : distinctColumns) { 23 | dataColumns[i++] = parent.getColumn(columnName); 24 | } 25 | 26 | return dataColumns; 27 | } 28 | 29 | @Override 30 | public int getPartitionCount() { 31 | return 1; 32 | } 33 | 34 | @Override 35 | public Iterator getPartition(int index) { 36 | return new DistinctIterator(parent.iterator()); 37 | } 38 | 39 | class DistinctIterator implements Iterator { 40 | private Iterator wrapped; 41 | private boolean ready; 42 | private DataRow row; 43 | 44 | private HashSet set = new HashSet<>(); 45 | 46 | DistinctIterator(Iterator wrapped) { 47 | this.wrapped = wrapped; 48 | } 49 | 50 | @Override 51 | public boolean hasNext() { 52 | if (!ready) { 53 | findNext(); 54 | } 55 | 56 | return row != null; 57 | } 58 | 59 | @Override 60 | public DataRow next() { 61 | if (!ready) { 62 | findNext(); 63 | } 64 | 65 | ready = false; 66 | return row; 67 | } 68 | 69 | private void findNext() { 70 | DataRow row; 71 | List value = new ArrayList(); 72 | 73 | while(wrapped.hasNext()) { 74 | row = wrapped.next(); 75 | for (DataColumn column: columns) { 76 | value.add(row.get(column.getName())); 77 | } 78 | 79 | if (!set.contains(value)) { 80 | this.row = new DistinctDataRow(value); 81 | this.ready = true; 82 | set.add(value); 83 | return; 84 | } 85 | } 86 | this.row = null; 87 | this.ready = false; 88 | } 89 | } 90 | 91 | class DistinctDataRow extends BaseDataRow { 92 | 93 | private List value; 94 | 95 | DistinctDataRow(List value) { 96 | this.value = value; 97 | } 98 | 99 | 100 | @Override 101 | public Object get(int index) { 102 | return value.get(index); 103 | } 104 | 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Poppy 2 | *poppy* is dataframe library for java, which provides 
common SQL operations (e.g. select, from, where, group by, order by, distinct) to process data in Java. 3 | 4 | Unlike other dataframe libraries, which keep all the data in memory, *poppy* processes data in a streaming manner. That is, it is more similar to the [Java 8 Stream library](https://docs.oracle.com/javase/8/docs/api/java/util/stream/package-summary.html), but in a relational version. 5 | 6 | Here is a simple example. We have a `Student` class 7 | 8 | ```java 9 | public class Student { 10 | private int studentId; 11 | private String name; 12 | private int grade; 13 | private int room; 14 | private int height; 15 | private int weight; 16 | ... 17 | } 18 | ``` 19 | 20 | In SQL, we have a query like this 21 | 22 | ```sql 23 | select 24 | grade, 25 | room, 26 | avg(weight) as weight, 27 | avg(height) as height 28 | from Student 29 | group by grade, room 30 | order by grade, room 31 | ``` 32 | 33 | Here is *Poppy*'s version 34 | 35 | ```java 36 | List students = ...; 37 | 38 | DataFrame 39 | .from(students, Student.class) 40 | .groupby("grade", "room") 41 | .aggregate( 42 | avgLong("weight").as("weight"), 43 | avgLong("height").as("height")) 44 | .sort("grade", "room") 45 | .print(); 46 | ``` 47 | 48 | 49 | 50 | # Getting Started 51 | 52 | ## Requirement 53 | Java 8 or higher 54 | 55 | ## Dependency 56 | 57 | Poppy's package is hosted in the [JCenter](https://bintray.com/bintray/jcenter) repository. 58 | 59 | Maven 60 | 61 | ``` 62 | 63 | io.tenmax 64 | poppy 65 | 0.1.8 66 | pom 67 | 68 | ``` 69 | 70 | Gradle 71 | 72 | ``` 73 | compile 'io.tenmax:poppy:0.1.8' 74 | ``` 75 | ## Features 76 | 77 | 1. Supports the most common operations in SQL, e.g. select, from, where, group by, order by, distinct 78 | 2. Supports the most common aggregation functions in SQL, e.g. *avg()*, *sum()*, *count()*, *min()*, *max()* 79 | 3. **Custom aggregation functions** via [java.util.stream.Collector](https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collector.html) 80 | 4.
**Partition support.** A partition is the unit of parallelism. Multiple partitions allow you to process data concurrently. 81 | 5. **Multi-threaded support**. For CPU-bound jobs, it leverages all your CPU resources for better performance; for IO-bound jobs, it reduces the waiting time and takes advantage of better concurrency. 82 | 6. Suitable for both **batch** and **streaming** scenarios. 83 | 7. **Lightweight**. Compared to the [Spark DataFrame API](https://spark.apache.org/docs/latest/sql-programming-guide.html), it is much more lightweight to embed in your application. 84 | 8. **Stream-based design**. Compared to [joinery](https://github.com/cardillo/joinery), which keeps the whole data in memory, *Poppy*'s streaming behaviour allows limited memory to process a huge volume of data. 85 | 86 | ## Documentation 87 | 88 | - [JavaDoc](http://tenmax.github.io/poppy/docs/javadoc/index.html) 89 | - [User Manual](http://tenmax.github.io/poppy/) 90 | 91 | # Contribution 92 | 93 | Please fork this project and send me a pull request; any comments would be appreciated! 
94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/iterators/ParallelIterator.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.iterators; 2 | 3 | import io.tenmax.poppy.DataFrame; 4 | import io.tenmax.poppy.DataRow; 5 | import io.tenmax.poppy.dataframes.BaseDataFrame; 6 | import io.tenmax.poppy.dataframes.ExecutionContext; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import java.util.Iterator; 11 | import java.util.concurrent.BlockingQueue; 12 | import java.util.concurrent.ExecutorService; 13 | import java.util.concurrent.Executors; 14 | import java.util.concurrent.LinkedBlockingQueue; 15 | 16 | public class ParallelIterator implements Iterator { 17 | private static Logger logger = LoggerFactory.getLogger(ParallelIterator.class); 18 | 19 | private final BaseDataFrame dataFrame; 20 | 21 | private BlockingQueue queue = new LinkedBlockingQueue<>(); 22 | private int countDown; 23 | private boolean hasNext; 24 | private DataRow row; 25 | 26 | public ParallelIterator(BaseDataFrame dataFrame) { 27 | this.dataFrame = dataFrame; 28 | this.countDown = dataFrame.getPartitionCount(); 29 | 30 | start(); 31 | } 32 | 33 | public void start() { 34 | ExecutorService executor = Executors.newFixedThreadPool(dataFrame.getContext().getNumThreads()); 35 | 36 | for (int i=0; i { 40 | try { 41 | Iterator iter = dataFrame.getPartition(fi); 42 | while (iter.hasNext()) { 43 | queue.put(new Message(iter.next())); 44 | 45 | if(dataFrame.getContext().isClosed()) { 46 | break; 47 | } 48 | } 49 | } catch (Exception e) { 50 | logger.error("Error occured", e); 51 | } finally { 52 | queue.add(Message.END_OF_MESSAGE); 53 | } 54 | }); 55 | } 56 | 57 | // Shutdown while all task handled 58 | executor.shutdown(); 59 | } 60 | 61 | @Override 62 | public boolean hasNext() { 63 | if (!hasNext) { 64 | findNext(); 65 | } 66 | 
67 | return hasNext; 68 | } 69 | 70 | @Override 71 | public DataRow next() { 72 | if (!hasNext) { 73 | findNext(); 74 | } 75 | 76 | if (hasNext) { 77 | hasNext = false; 78 | return row; 79 | } else { 80 | return null; 81 | } 82 | } 83 | /* Blocks on the queue until a row arrives (hasNext becomes true) or every partition has posted END_OF_MESSAGE (countDown reaches 0). If the waiting thread is interrupted, the interrupt status is restored and the wait is abandoned with hasNext left false. */ 84 | public void findNext() { 85 | hasNext = false; 86 | while (countDown > 0) { 87 | 88 | Message message = null; 89 | try { 90 | message = queue.take(); 91 | 92 | if (message == Message.END_OF_MESSAGE) { 93 | countDown--; 94 | } else { 95 | row = message.row; 96 | hasNext = true; 97 | break; 98 | } 99 | } catch (InterruptedException e) { 100 | Thread.currentThread().interrupt(); break; /* restore the interrupt status and stop waiting; retrying take() with the flag set would throw again immediately */ 101 | } 102 | } 103 | } 104 | 105 | 106 | static class Message { 107 | static Message END_OF_MESSAGE = new Message(null); 108 | 109 | DataRow row; 110 | 111 | Message(DataRow row) { 112 | this.row = row; 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/ProjectDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | import io.tenmax.poppy.DataColumn; 4 | import io.tenmax.poppy.DataRow; 5 | import io.tenmax.poppy.ProjectColumnSpec; 6 | 7 | import java.util.HashMap; 8 | import java.util.Iterator; 9 | import java.util.function.Function; 10 | 11 | public class ProjectDataFrame extends BaseDataFrame { 12 | private final ProjectColumnSpec[] specs; 13 | private final BaseDataFrame parent; 14 | private final HashMap specsMap= new HashMap<>(); 15 | 16 | public ProjectDataFrame(BaseDataFrame parent, String[] columnNames) { 17 | this(parent, specsFromColumnNames(parent, columnNames)); 18 | } 19 | 20 | public ProjectDataFrame(BaseDataFrame parent, ProjectColumnSpec[] specs) { 21 | super(parent.context, columnsFromSpec(fixSpecs(parent, specs))); 22 | this.parent = parent; 23 | this.specs = fixSpecs(parent, specs); 24 | 25 | for (ProjectColumnSpec spec: specs) { 26 | specsMap.put(spec.getColumn(), spec); 27 | } 28 | } 
29 | /* Returns a normalized copy of specs: a spec with neither type nor mapper takes its type from the same-named parent column, and a missing mapper defaults to reading the same-named column of the input row. */ 30 | private static ProjectColumnSpec[] fixSpecs(BaseDataFrame parent, ProjectColumnSpec[] specs) { 31 | ProjectColumnSpec[] newSpecs = new ProjectColumnSpec[specs.length]; 32 | int i = 0; 33 | 34 | for (ProjectColumnSpec spec : specs) { 35 | String column = spec.getColumn(); 36 | Class type = spec.getType(); 37 | Function mapper = spec.getMapper(); 38 | if (type == null) { 39 | if (mapper == null) { 40 | type = parent.getColumn(column).getType(); 41 | } else { 42 | /* Mapper-only spec without a declared type: the type stays null. An IllegalArgumentException used to be constructed here but was never thrown; actually throwing it would reject the untyped specs built by SpecUtils.colMap(String), so the dead statement was removed. */ 43 | } 44 | } 45 | if (mapper == null) { 46 | mapper = (DataRow row) -> row.get(column); 47 | } 48 | 49 | newSpecs[i++] = new ProjectColumnSpec(column, type, mapper); 50 | } 51 | 52 | return newSpecs; 53 | } 54 | 55 | private static ProjectColumnSpec[] specsFromColumnNames(BaseDataFrame parent, String[] columnNames) { 56 | ProjectColumnSpec[] specs = new ProjectColumnSpec[columnNames.length]; 57 | 58 | 59 | int i = 0; 60 | 61 | for (String columnName : columnNames) { 62 | 63 | specs[i++] = new ProjectColumnSpec( 64 | columnName, 65 | columnName, 66 | parent.getColumn(columnName).getType(), 67 | null); 68 | } 69 | 70 | return specs; 71 | } 72 | 73 | 74 | 75 | private static DataColumn[] columnsFromSpec(ProjectColumnSpec[] specs) { 76 | DataColumn[] dataColumns = new DataColumn[specs.length]; 77 | int i = 0; 78 | for (ProjectColumnSpec spec : specs) { 79 | dataColumns[i++] = new DataColumn(spec.getColumn(), spec.getType()); 80 | } 81 | return dataColumns; 82 | } 83 | 84 | @Override 85 | public int getPartitionCount() { 86 | return parent.getPartitionCount(); 87 | } 88 | 89 | @Override 90 | public Iterator getPartition(int index) { 91 | return new ProjectIterator(parent.getPartition(index)); 92 | } 93 | 94 | class ProjectIterator implements Iterator { 95 | private Iterator wrapped; 96 | 97 | ProjectIterator(Iterator wrapped) { 98 | this.wrapped = wrapped; 99 | } 100 | 101 | @Override 102 | public boolean hasNext() { 103 | return
wrapped.hasNext(); 104 | } 105 | 106 | @Override 107 | public DataRow next() { 108 | return new ProjectDataRow(wrapped.next()); 109 | } 110 | } 111 | 112 | class ProjectDataRow extends BaseDataRow { 113 | private DataRow row; 114 | 115 | ProjectDataRow(DataRow row) { 116 | this.row = row; 117 | } 118 | 119 | @Override 120 | public Object get(int index) { 121 | ProjectColumnSpec spec = specs[index]; 122 | if (spec.getMapper() != null) { 123 | return spec.getMapper().apply(row); 124 | } else { 125 | return row.get(spec.getColumn()); 126 | } 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/java/io/tenmax/poppy/DataFrameExceptionalTest.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import io.tenmax.poppy.datasinks.DebugDataSink; 4 | import io.tenmax.poppy.datasources.SimpleDataSource; 5 | import junit.framework.TestCase; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.Collections; 12 | import java.util.Iterator; 13 | import java.util.concurrent.atomic.AtomicInteger; 14 | import java.util.stream.Collectors; 15 | 16 | import static io.tenmax.poppy.SpecUtils.*; 17 | import static io.tenmax.poppy.SpecUtils.desc; 18 | 19 | public class DataFrameExceptionalTest { 20 | 21 | 22 | @Test 23 | (expected = RuntimeException.class) 24 | public void testAggre1() throws Exception { 25 | DataFrame.from(new ExceptionalDataSource(ExceptionalDataSource.ErrorType.GetPartitionCount)) 26 | .parallel(4) 27 | .aggregate( 28 | avgLong("weight").as("weight"), 29 | avgLong("height").as("height"), 30 | count().as("count"), 31 | aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi")) 32 | .print(); 33 | } 34 | 35 | @Test 36 | (expected = RuntimeException.class) 37 | public void testAggre2() throws Exception { 38 | 
DataFrame.from(new ExceptionalDataSource(ExceptionalDataSource.ErrorType.GetPartition)) 39 | .parallel(4) 40 | .aggregate( 41 | avgLong("weight").as("weight"), 42 | avgLong("height").as("height"), 43 | count().as("count"), 44 | aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi")) 45 | .print(); 46 | } 47 | 48 | @Test 49 | (expected = RuntimeException.class) 50 | public void testAggre3() throws Exception { 51 | DataFrame.from(new ExceptionalDataSource(ExceptionalDataSource.ErrorType.Iterator)) 52 | .parallel(4) 53 | .aggregate( 54 | avgLong("weight").as("weight"), 55 | avgLong("height").as("height"), 56 | count().as("count"), 57 | aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi")) 58 | .print(); 59 | } 60 | 61 | } 62 | 63 | 64 | class ExceptionalDataSource implements DataSource { 65 | enum ErrorType { 66 | GetPartitionCount, 67 | GetPartition, 68 | Iterator 69 | } 70 | 71 | private ErrorType errorType; 72 | 73 | ExceptionalDataSource(ErrorType errorType) { 74 | this.errorType = errorType; 75 | } 76 | 77 | @Override 78 | public int getPartitionCount() { 79 | if(errorType == ErrorType.GetPartitionCount) { 80 | throw new RuntimeException("hello exception"); 81 | } 82 | 83 | return 3; 84 | } 85 | 86 | @Override 87 | public Iterator getPartition(int index) { 88 | 89 | if (index > 0) { 90 | 91 | if (errorType == ErrorType.GetPartition) { 92 | throw new RuntimeException("hello exception"); 93 | } else if(errorType == ErrorType.Iterator) { 94 | return new ExceptionalIterator(); 95 | } 96 | } 97 | 98 | return Arrays.asList(new Student(1, "pop", 5, 2, 176, 68)).iterator(); 99 | } 100 | 101 | @Override 102 | public DataColumn[] getColumns() { 103 | return new DataColumn[] { 104 | new DataColumn("name", String.class), 105 | new DataColumn("weight", Integer.class), 106 | new DataColumn("height", Integer.class) 107 | }; 108 | } 109 | 110 | @Override 111 | public Object get(Student student, String columnName) { 112 
| switch (columnName) { 113 | case "name": 114 | return student.getName(); 115 | case "weight": 116 | return student.getWeight(); 117 | case "height": 118 | return student.getHeight(); 119 | } 120 | return null; 121 | } 122 | } 123 | 124 | class ExceptionalIterator implements Iterator { 125 | @Override 126 | public boolean hasNext() { 127 | return true; 128 | } 129 | 130 | @Override 131 | public Student next() { 132 | throw new RuntimeException("hello exception"); 133 | } 134 | } -------------------------------------------------------------------------------- /src/test/java/io/tenmax/poppy/DataFrameParallelTest.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import io.tenmax.poppy.datasinks.DebugDataSink; 4 | import io.tenmax.poppy.datasources.SimpleDataSource; 5 | import junit.framework.TestCase; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import java.util.ArrayList; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | import java.util.stream.Collectors; 12 | 13 | import static io.tenmax.poppy.SpecUtils.*; 14 | import static io.tenmax.poppy.SpecUtils.desc; 15 | import static org.junit.Assert.assertEquals; 16 | 17 | 18 | public class DataFrameParallelTest { 19 | 20 | private DataFrame df; 21 | 22 | 23 | @Before 24 | public void setUp() throws Exception { 25 | 26 | ArrayList list1 = new ArrayList<>(); 27 | ArrayList list2 = new ArrayList<>(); 28 | ArrayList list3 = new ArrayList<>(); 29 | 30 | list1.add(new Student(1, "pop", 5,2,170,60)); 31 | list1.add(new Student(2, "foo", 5,3,175,70)); 32 | list1.add(new Student(3, "bar", 5,4,168,80)); 33 | list1.add(new Student(4, "john", 5,4,160,60)); 34 | 35 | 36 | list2.add(new Student(5, "richard", 4,1,170,68)); 37 | list2.add(new Student(6, "howard", 4,2,178,90)); 38 | list2.add(new Student(7, "michael", 4,3,169,80)); 39 | list2.add(new Student(8, "coco", 4,4,158,65)); 40 | 41 | 42 | list3.add(new Student(9, "tina", 
3,2,155,44)); 43 | list3.add(new Student(10, "chloe", 3,2,158,45)); 44 | list3.add(new Student(11, "george", 3,5,163,90)); 45 | list3.add(new Student(12, "mary", 3,1,170,60)); 46 | 47 | df= DataFrame.from( 48 | new SimpleDataSource<>(Student.class,list1, list2, list3)) 49 | .parallel(4); 50 | } 51 | 52 | @Test 53 | public void testBasic() throws Exception { 54 | df 55 | .print(); 56 | } 57 | 58 | @Test 59 | public void testProject() throws Exception { 60 | df 61 | .project("name", "weight", "height") 62 | .print(); 63 | } 64 | 65 | @Test 66 | public void testProject2() throws Exception { 67 | df 68 | .project( 69 | col("name"), 70 | colMap("weight").as("w"), 71 | colMap("height", Float.class, (Integer height) -> (height / 10f)).as("h")) 72 | .print(); 73 | } 74 | 75 | @Test 76 | public void testFilter() throws Exception { 77 | df 78 | .filter(row -> row.getInteger("height") >= 170) 79 | .project("name", "weight", "height") 80 | .print(); 81 | } 82 | 83 | @Test 84 | public void testAggre() throws Exception { 85 | df 86 | .aggregate( 87 | avgLong("weight").as("weight"), 88 | avgLong("height").as("height"), 89 | count().as("count"), 90 | aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi")) 91 | .print(); 92 | } 93 | 94 | @Test 95 | public void testGroupBy() throws Exception { 96 | df 97 | .groupby("grade", "room") 98 | .aggregate( 99 | avgLong("weight").as("weight"), 100 | avgLong("height").as("height")) 101 | .sort("grade", "room") 102 | .print(); 103 | } 104 | 105 | @Test 106 | public void testSort() throws Exception { 107 | df 108 | .sort("weight", "height") 109 | .print(); 110 | } 111 | 112 | @Test 113 | public void testSort2() throws Exception { 114 | df 115 | .sort(asc("weight"), desc("height")) 116 | .print(); 117 | } 118 | 119 | @Test 120 | public void testDistinct() throws Exception { 121 | df 122 | .distinct("grade", "room") 123 | .print(); 124 | } 125 | 126 | @Test 127 | public void testTo() throws Exception { 128 | 
TestDataSink sink = new TestDataSink(); 129 | // df.to(new DebugDataSink()); 130 | df.to(sink); 131 | assertEquals(1, sink.sinkStart.get()); 132 | assertEquals(1, sink.sinkComplete.get()); 133 | assertEquals(3, sink.partitionStart.get()); 134 | assertEquals(12, sink.partitionRow.get()); 135 | assertEquals(3, sink.partitionComplete.get()); 136 | } 137 | 138 | class TestDataSink implements DataSink { 139 | AtomicInteger sinkStart = new AtomicInteger(); 140 | AtomicInteger sinkComplete = new AtomicInteger(); 141 | AtomicInteger partitionStart = new AtomicInteger(); 142 | AtomicInteger partitionRow = new AtomicInteger(); 143 | AtomicInteger partitionComplete = new AtomicInteger(); 144 | 145 | @Override 146 | public void sinkStart(int partitionCount, DataColumn[] columns) { 147 | sinkStart.incrementAndGet(); 148 | } 149 | 150 | @Override 151 | public void sinkComplete() { 152 | sinkComplete.incrementAndGet(); 153 | 154 | } 155 | 156 | @Override 157 | public void partitionStart(int partition) { 158 | partitionStart.incrementAndGet(); 159 | } 160 | 161 | @Override 162 | public void partitionRow(int partition, DataRow row) { 163 | partitionRow.incrementAndGet(); 164 | } 165 | 166 | @Override 167 | public void partitionComplete(int partition) { 168 | partitionComplete.incrementAndGet(); 169 | } 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # Attempt to set APP_HOME 46 | # Resolve links: $0 may be a link 47 | PRG="$0" 48 | # Need this for relative symlinks. 49 | while [ -h "$PRG" ] ; do 50 | ls=`ls -ld "$PRG"` 51 | link=`expr "$ls" : '.*-> \(.*\)$'` 52 | if expr "$link" : '/.*' > /dev/null; then 53 | PRG="$link" 54 | else 55 | PRG=`dirname "$PRG"`"/$link" 56 | fi 57 | done 58 | SAVED="`pwd`" 59 | cd "`dirname \"$PRG\"`/" >/dev/null 60 | APP_HOME="`pwd -P`" 61 | cd "$SAVED" >/dev/null 62 | 63 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 64 | 65 | # Determine the Java command to use to start the JVM. 66 | if [ -n "$JAVA_HOME" ] ; then 67 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 68 | # IBM's JDK on AIX uses strange locations for the executables 69 | JAVACMD="$JAVA_HOME/jre/sh/java" 70 | else 71 | JAVACMD="$JAVA_HOME/bin/java" 72 | fi 73 | if [ ! -x "$JAVACMD" ] ; then 74 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 75 | 76 | Please set the JAVA_HOME variable in your environment to match the 77 | location of your Java installation." 78 | fi 79 | else 80 | JAVACMD="java" 81 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 82 | 83 | Please set the JAVA_HOME variable in your environment to match the 84 | location of your Java installation." 85 | fi 86 | 87 | # Increase the maximum file descriptors if we can. 
88 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 89 | MAX_FD_LIMIT=`ulimit -H -n` 90 | if [ $? -eq 0 ] ; then 91 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 92 | MAX_FD="$MAX_FD_LIMIT" 93 | fi 94 | ulimit -n $MAX_FD 95 | if [ $? -ne 0 ] ; then 96 | warn "Could not set maximum file descriptor limit: $MAX_FD" 97 | fi 98 | else 99 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 100 | fi 101 | fi 102 | 103 | # For Darwin, add options to specify how the application appears in the dock 104 | if $darwin; then 105 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 106 | fi 107 | 108 | # For Cygwin, switch paths to Windows format before running java 109 | if $cygwin ; then 110 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 111 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 112 | JAVACMD=`cygpath --unix "$JAVACMD"` 113 | 114 | # We build the pattern for arguments to be converted via cygpath 115 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 116 | SEP="" 117 | for dir in $ROOTDIRSRAW ; do 118 | ROOTDIRS="$ROOTDIRS$SEP$dir" 119 | SEP="|" 120 | done 121 | OURCYGPATTERN="(^($ROOTDIRS))" 122 | # Add a user-defined pattern to the cygpath arguments 123 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 124 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 125 | fi 126 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 127 | i=0 128 | for arg in "$@" ; do 129 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 130 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 131 | 132 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 133 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 134 | else 135 | eval `echo args$i`="\"$arg\"" 136 | fi 137 | i=$((i+1)) 138 | done 139 | case $i in 140 | (0) set -- ;; 141 | (1) set -- "$args0" ;; 142 | (2) set -- "$args0" "$args1" ;; 143 | (3) set -- "$args0" "$args1" 
"$args2" ;; 144 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 145 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 146 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 147 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 148 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 149 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 150 | esac 151 | fi 152 | 153 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 154 | function splitJvmOpts() { 155 | JVM_OPTS=("$@") 156 | } 157 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 158 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 159 | 160 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 161 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/SpecUtils.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import java.util.Comparator; 4 | import java.util.Objects; 5 | import java.util.Optional; 6 | import java.util.Set; 7 | import java.util.function.BiConsumer; 8 | import java.util.function.BinaryOperator; 9 | import java.util.function.Function; 10 | import java.util.function.Supplier; 11 | import java.util.stream.Collector; 12 | import java.util.stream.Collectors; 13 | 14 | public class SpecUtils { 15 | // ProjectColumnSpec 16 | 17 | public static ProjectColumnSpec col(String columnName) { 18 | return new ProjectColumnSpecBuilder().as(columnName); 19 | } 20 | 21 | public static ProjectColumnSpecBuilder colMap(String columnRef) { 22 | return new ProjectColumnSpecBuilder(null, (DataRow row) -> row.get(columnRef)); 23 | } 24 | 25 | public static ProjectColumnSpecBuilder colMap(String columnRef, Class 
type, Function mapper) { 26 | return new ProjectColumnSpecBuilder(type, (DataRow row) -> mapper.apply((R)row.get(columnRef))); 27 | } 28 | 29 | public static class ProjectColumnSpecBuilder { 30 | private final Class type; 31 | private final Function mapper; 32 | 33 | public ProjectColumnSpecBuilder() { 34 | this(null, null); 35 | } 36 | 37 | public ProjectColumnSpecBuilder(Class type, Function mapper) { 38 | this.type = type; 39 | this.mapper = mapper; 40 | } 41 | 42 | public ProjectColumnSpec as(String column) { 43 | return new ProjectColumnSpec<>(column, type, mapper); 44 | } 45 | } 46 | 47 | // AggregateColumnSpec 48 | public static AggregateColumnSpecBuilder sumLong(String columnRef) { 49 | return new AggregateColumnSpecBuilder( 50 | Long.class, 51 | Collectors.summingLong((DataRow row) -> row.getLong(columnRef))); 52 | } 53 | 54 | public static AggregateColumnSpecBuilder sumDouble(String columnRef) { 55 | return new AggregateColumnSpecBuilder( 56 | Double.class, 57 | Collectors.summingDouble((DataRow row) -> row.getDouble(columnRef))); 58 | } 59 | 60 | public static AggregateColumnSpecBuilder avgLong(String columnRef) { 61 | return new AggregateColumnSpecBuilder( 62 | Double.class, 63 | Collectors.averagingLong((DataRow row) -> row.getLong(columnRef))); 64 | } 65 | 66 | public static AggregateColumnSpecBuilder avgDouble(String columnRef) { 67 | return new AggregateColumnSpecBuilder( 68 | Double.class, 69 | Collectors.averagingDouble((DataRow row) -> row.getDouble(columnRef))); 70 | } 71 | 72 | public static AggregateColumnSpecBuilder count() { 73 | return new AggregateColumnSpecBuilder( 74 | Long.class, 75 | Collectors.counting()); 76 | } 77 | 78 | public static AggregateColumnSpecBuilder count(String columnRef) { 79 | return new AggregateColumnSpecBuilder( 80 | Long.class, 81 | Collectors.summingLong((DataRow row) -> row.get(columnRef) != null ? 
1 : 0)); 82 | } 83 | 84 | public static AggregateColumnSpecBuilder min(String columnRef) { 85 | Function mapper = row -> row.get(columnRef); 86 | Comparator comparator = Comparator.naturalOrder(); 87 | Collector> collector = Collectors 88 | .mapping(mapper, Collectors.minBy(comparator)); 89 | Collector collector2 = Collectors.collectingAndThen(collector, (opt) -> opt.orElse(null)); 90 | 91 | return new AggregateColumnSpecBuilder(columnRef,collector2); 92 | } 93 | 94 | public static AggregateColumnSpecBuilder max(String columnRef) { 95 | Function mapper = row -> row.get(columnRef); 96 | Comparator comparator = Comparator.naturalOrder(); 97 | Collector> collector = Collectors 98 | .mapping(mapper, Collectors.maxBy(comparator)); 99 | Collector collector2 = Collectors.collectingAndThen(collector, (opt) -> opt.orElse(null)); 100 | 101 | return new AggregateColumnSpecBuilder(columnRef,collector2); 102 | 103 | } 104 | 105 | public static AggregateColumnSpecBuilder aggreMap(String columnRef, Class type, Collector collector) { 106 | Function mapper = row -> (T)row.get(columnRef); 107 | Collector newCollector = Collectors.mapping(mapper, collector); 108 | return new AggregateColumnSpecBuilder(type,newCollector); 109 | } 110 | 111 | 112 | public static class AggregateColumnSpecBuilder { 113 | private Class type; 114 | private String typeFromColumn; 115 | private Collector collector; 116 | 117 | public AggregateColumnSpecBuilder(Class type, Collector collector) { 118 | this.type = type; 119 | this.collector = collector; 120 | } 121 | 122 | public AggregateColumnSpecBuilder(String typeFromColumn, Collector collector) { 123 | this.typeFromColumn = typeFromColumn; 124 | this.collector = collector; 125 | } 126 | 127 | public AggregateColumnSpec as(String column) { 128 | if (type != null) { 129 | return new AggregateColumnSpec<>(column, type, collector); 130 | } 131 | 132 | if (typeFromColumn != null) { 133 | return new AggregateColumnSpec(column, typeFromColumn, collector); 134 | 
} 135 | 136 | throw new IllegalStateException("type and typeFromColumn not defined"); 137 | } 138 | } 139 | 140 | // SortSpec 141 | public static SortSpec asc(String name) { 142 | return new SortSpec(name, SortSpec.Order.ASC); 143 | } 144 | 145 | public static SortSpec desc(String name) { 146 | return new SortSpec(name, SortSpec.Order.DESC); 147 | } 148 | 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/AggregateDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | import io.tenmax.poppy.AggregateColumnSpec; 4 | import io.tenmax.poppy.DataColumn; 5 | import io.tenmax.poppy.DataRow; 6 | 7 | import java.util.*; 8 | import java.util.concurrent.CompletableFuture; 9 | import java.util.concurrent.ExecutorService; 10 | import java.util.concurrent.Executors; 11 | import java.util.function.Function; 12 | 13 | public class AggregateDataFrame extends BaseDataFrame{ 14 | private final AggregateColumnSpec[] specs; 15 | private final BaseDataFrame parent; 16 | private final HashMap specsMap= new HashMap<>(); 17 | private final int dimSize; 18 | 19 | public AggregateDataFrame(BaseDataFrame parent, AggregateColumnSpec[] specs) { 20 | super(new ExecutionContext(), columnsFromSpec(parent, specs)); 21 | this.parent = parent; 22 | this.specs = specs; 23 | this.dimSize = parent.groupedColumns.length; 24 | this.groupedColumns = parent.groupedColumns; 25 | 26 | for (AggregateColumnSpec spec: specs) { 27 | specsMap.put(spec.getColumn(), spec); 28 | } 29 | } 30 | 31 | private static DataColumn[] columnsFromSpec(BaseDataFrame parent, AggregateColumnSpec[] specs) { 32 | DataColumn[] dataColumns = new DataColumn[parent.groupedColumns.length + specs.length]; 33 | int i = 0; 34 | 35 | for (DataColumn dataColumn : parent.groupedColumns) { 36 | dataColumns[i++] = new DataColumn(dataColumn.getName(), 
dataColumn.getType()); 37 | } 38 | 39 | for (AggregateColumnSpec spec : specs) { 40 | if (spec.getType() != null) { 41 | dataColumns[i++] = new DataColumn(spec.getColumn(), spec.getType()); 42 | } else { 43 | dataColumns[i++] = new DataColumn(spec.getColumn(), parent.getColumn(spec.getTypeFromColumn()).getType()); 44 | } 45 | } 46 | 47 | return dataColumns; 48 | } 49 | 50 | @Override 51 | public int getPartitionCount() { 52 | return 1; 53 | } 54 | 55 | @Override 56 | public Iterator getPartition(int index) { 57 | int count = parent.getPartitionCount(); 58 | HashMap result = new HashMap<>(); 59 | 60 | 61 | if (parent.context.getNumThreads() == 1) { 62 | // sequatial 63 | for (int i = 0; i < count; i++) { 64 | HashMap resultPartial = accumulate(parent.getPartition(i)); 65 | combine(result, resultPartial); 66 | } 67 | } else { 68 | // parallel 69 | ExecutorService executorService = Executors.newFixedThreadPool(parent.context.getNumThreads()); 70 | ArrayList> futures = new ArrayList<>(); 71 | 72 | for (int i = 0; i < count; i++) { 73 | 74 | final int fi = i; 75 | 76 | CompletableFuture future = CompletableFuture.runAsync(() -> { 77 | try { 78 | HashMap resultPartial = accumulate(parent.getPartition(fi)); 79 | synchronized (result) { 80 | combine(result, resultPartial); 81 | } 82 | } catch (Exception e) { 83 | throw e; 84 | } 85 | }, executorService); 86 | 87 | futures.add(future); 88 | } 89 | 90 | try { 91 | CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); 92 | } finally { 93 | // shutdown the executor while all tasks complete 94 | executorService.shutdown(); 95 | } 96 | 97 | } 98 | 99 | HashMap finalResult = finish(result); 100 | return new AggregateIterator(finalResult); 101 | } 102 | 103 | 104 | HashMap accumulate(Iterator iterator) { 105 | HashMap result = new HashMap<>(); 106 | 107 | while(iterator.hasNext()) { 108 | DataRow row = iterator.next(); 109 | 110 | List dims = new ArrayList<>(); 111 | for (DataColumn gc : parent.groupedColumns) 
{ 112 | dims.add(row.get(gc.getName())); 113 | } 114 | 115 | List accus; // accumulators 116 | 117 | if (result.containsKey(dims)) { 118 | accus = result.get(dims); 119 | } else { 120 | accus = new ArrayList(specs.length); 121 | for (int i = 0; i < specs.length; i++) { 122 | accus.add(specs[i].getCollector().supplier().get()); 123 | } 124 | result.put(dims, accus); 125 | } 126 | 127 | // aggregate 128 | for (int i=0; i result1, HashMap result2) { 137 | 138 | result2.forEach((dims, accus2) -> { 139 | if (result1.containsKey(dims)) { 140 | List accus1 = result1.get(dims); 141 | for (int i=0; i finish(HashMap result) { 152 | final HashMap finalResult = new HashMap<>(); 153 | 154 | result.forEach((dims, accus) -> { 155 | 156 | List values = new ArrayList(specs.length); 157 | for (int i=0; i { 172 | private Iterator> iterator; 173 | 174 | AggregateIterator(HashMap result) { 175 | this.iterator = result.entrySet().iterator(); 176 | } 177 | 178 | @Override 179 | public boolean hasNext() { 180 | return iterator.hasNext(); 181 | } 182 | 183 | @Override 184 | public DataRow next() { 185 | return new AggregateDataRow(iterator.next()); 186 | } 187 | } 188 | 189 | class AggregateDataRow extends BaseDataRow{ 190 | private Map.Entry entry; 191 | 192 | AggregateDataRow(Map.Entry entry) { 193 | this.entry = entry; 194 | } 195 | 196 | @Override 197 | public Object get(int index) { 198 | if (index < dimSize) { 199 | return entry.getKey().get(index); 200 | } else { 201 | return entry.getValue().get(index - dimSize); 202 | } 203 | } 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /src/test/java/io/tenmax/poppy/DataFrameTest.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy; 2 | 3 | import junit.framework.TestCase; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Iterator; 9 | import java.util.List; 10 | 
import java.util.Map; 11 | import java.util.stream.Collector; 12 | import java.util.stream.Collectors; 13 | 14 | import static io.tenmax.poppy.SpecUtils.*; 15 | import static org.junit.Assert.assertEquals; 16 | 17 | public class DataFrameTest { 18 | 19 | private ArrayList list; 20 | 21 | @Before 22 | public void setUp() throws Exception { 23 | 24 | list = new ArrayList<>(); 25 | list.add(new Student(1, "pop", 5,2,170,60)); 26 | list.add(new Student(2, "foo", 5,3,175,70)); 27 | list.add(new Student(3, "bar", 5,4,168,80)); 28 | list.add(new Student(4, null, 5,4,160,60)); 29 | } 30 | 31 | @Test 32 | public void testBasic() throws Exception { 33 | Iterator it = DataFrame 34 | .from(list, Student.class) 35 | .iterator(); 36 | 37 | assertEquals("pop", it.next().getString("name")); 38 | assertEquals(5, it.next().getInteger("grade")); 39 | assertEquals(168, it.next().getInteger("height")); 40 | assertEquals(60, it.next().getInteger("weight")); 41 | } 42 | 43 | @Test 44 | public void testProject() throws Exception { 45 | Iterator it = DataFrame 46 | .from(list, Student.class) 47 | .project("name", "weight", "height") 48 | .iterator(); 49 | 50 | assertEquals("pop", it.next().getString(0)); 51 | assertEquals(70, it.next().getInteger(1)); 52 | assertEquals(168, it.next().getInteger(2)); 53 | assertEquals(160, it.next().getInteger("height")); 54 | } 55 | 56 | @Test 57 | public void testProject2() throws Exception { 58 | Iterator it = DataFrame 59 | .from(list, Student.class) 60 | .project( 61 | col("name"), 62 | colMap("weight").as("w"), 63 | colMap("height", Float.class, (Integer height) -> (height / 10f)).as("h")) 64 | .iterator(); 65 | 66 | assertEquals("pop", it.next().getString(0)); 67 | assertEquals(70, it.next().getInteger("w")); 68 | assertEquals(16.8f, it.next().getFloat("h"), 0.1); 69 | assertEquals(16.0f, it.next().getFloat(2), 0.1); 70 | } 71 | 72 | @Test 73 | public void testFilter() throws Exception { 74 | Iterator it = DataFrame 75 | .from(list, Student.class) 76 
| .filter(row -> row.getInteger("height") >= 170) 77 | .project("name", "weight", "height") 78 | .iterator(); 79 | 80 | assertEquals("pop", it.next().getString(0)); 81 | assertEquals(175, it.next().getInteger(2)); 82 | assertEquals(false, it.hasNext()); 83 | } 84 | 85 | @Test 86 | public void testAggre() throws Exception { 87 | Iterator it = 88 | DataFrame 89 | .from(list, Student.class) 90 | .aggregate( 91 | sumLong("height").as("sum"), 92 | avgLong("height").as("avg"), 93 | min("height").as("min"), 94 | max("height").as("max"), 95 | count().as("count"), 96 | aggreMap("weight", Integer.class, Collectors.summingInt((Integer i) -> i)).as("wi")) 97 | .iterator(); 98 | 99 | DataRow row = it.next(); 100 | assertEquals(row.getLong("sum"), 673); 101 | assertEquals(row.getDouble("avg"), 168.25, 0.1); 102 | assertEquals(row.getInteger("min"), 160); 103 | assertEquals(row.getInteger("max"), 175); 104 | assertEquals(row.getLong("count"), 4); 105 | assertEquals(row.getInteger("wi"), 270); 106 | } 107 | 108 | @Test 109 | public void testCountWithNull() throws Exception { 110 | Iterator it = 111 | DataFrame 112 | .from(list, Student.class) 113 | .aggregate( 114 | count().as("count"), 115 | count("name").as("countName")) 116 | .iterator(); 117 | 118 | DataRow row = it.next(); 119 | assertEquals(row.getLong("count"), 4); 120 | assertEquals(row.getLong("countName"), 3); 121 | } 122 | 123 | @Test 124 | public void testGroupBy() throws Exception { 125 | Iterator it = DataFrame 126 | .from(list, Student.class) 127 | .groupby("grade", "room") 128 | .aggregate( 129 | avgLong("weight").as("weight"), 130 | avgLong("height").as("height")) 131 | .sort("grade", "room") 132 | .iterator(); 133 | 134 | assertEquals(2, it.next().getInteger("room")); 135 | assertEquals(70.0, it.next().getDouble("weight"), 0.1); 136 | assertEquals(164.0, it.next().getDouble("height"), 0.1); 137 | } 138 | 139 | @Test 140 | public void testSort() throws Exception { 141 | Iterator it = DataFrame 142 | .from(list, 
Student.class) 143 | .sort("weight", "height") 144 | .iterator(); 145 | 146 | assertEquals(4, it.next().getInteger("studentId")); 147 | assertEquals(1, it.next().getInteger("studentId")); 148 | assertEquals(2, it.next().getInteger("studentId")); 149 | assertEquals(3, it.next().getInteger("studentId")); 150 | } 151 | 152 | @Test 153 | public void testSort2() throws Exception { 154 | Iterator it = DataFrame 155 | .from(list, Student.class) 156 | .sort(asc("weight"), desc("height")) 157 | .iterator(); 158 | 159 | assertEquals(1, it.next().getInteger("studentId")); 160 | assertEquals(4, it.next().getInteger("studentId")); 161 | assertEquals(2, it.next().getInteger("studentId")); 162 | assertEquals(3, it.next().getInteger("studentId")); 163 | } 164 | 165 | @Test 166 | public void testSortWithNull() throws Exception { 167 | Iterator it = DataFrame 168 | .from(list, Student.class) 169 | .sort(asc("name")) 170 | .iterator(); 171 | 172 | assertEquals(4, it.next().getInteger("studentId")); 173 | assertEquals(3, it.next().getInteger("studentId")); 174 | assertEquals(2, it.next().getInteger("studentId")); 175 | assertEquals(1, it.next().getInteger("studentId")); 176 | } 177 | 178 | @Test 179 | public void testDistinct() throws Exception { 180 | Iterator it = DataFrame 181 | .from(list, Student.class) 182 | .distinct("grade", "room") 183 | .iterator(); 184 | 185 | assertEquals(2, it.next().getInteger("room")); 186 | assertEquals(3, it.next().getInteger("room")); 187 | assertEquals(4, it.next().getInteger("room")); 188 | assertEquals(false, it.hasNext()); 189 | } 190 | 191 | @Test 192 | public void testCache() throws Exception { 193 | RandomAccessDataFrame cache = DataFrame.from(list, Student.class) 194 | .cache(); 195 | 196 | assertEquals(4, cache.size()); 197 | assertEquals(2, cache.getRow(1).getInteger("studentId")); 198 | assertEquals(80, cache.getRow(2).getInteger("weight")); 199 | assertEquals(null, cache.getRow(3).getString("name")); 200 | } 201 | 202 | @Test 203 | public 
void testToList() throws Exception { 204 | List studentReports = 205 | DataFrame 206 | .from(list, Student.class) 207 | .groupby("grade", "room") 208 | .aggregate( 209 | avgLong("weight").as("weight"), 210 | avgLong("height").as("height")) 211 | .sort("grade", "room") 212 | .toList(StudentReport.class); 213 | 214 | StudentReport report = studentReports.get(0); 215 | assertEquals(5, report.getGrade()); 216 | assertEquals(2, report.getRoom()); 217 | assertEquals(60.0, report.getWeight(),0.1); 218 | assertEquals(170.0, report.getHeight(),0.1); 219 | 220 | } 221 | 222 | @Test 223 | public void testToMap() throws Exception { 224 | 225 | Map reportMap = 226 | DataFrame 227 | .from(list, Student.class) 228 | .groupby("grade", "room") 229 | .aggregate( 230 | avgLong("weight").as("weight"), 231 | sumLong("weight").as("weightTotal"), 232 | avgLong("height").as("height"), 233 | sumLong("height").as("heightTotal")) 234 | .sort("grade", "room") 235 | .toMap(GradeRoom.class, StudentReport.class); 236 | 237 | reportMap.forEach((key, value) -> { 238 | System.out.println(key); 239 | System.out.println(value); 240 | }); 241 | 242 | assertEquals(reportMap.get(new GradeRoom(5,2)).getWeight(), 60.0, 0.1); 243 | assertEquals(reportMap.get(new GradeRoom(5,3)).getHeight(), 175.0, 0.1); 244 | assertEquals(reportMap.get(new GradeRoom(5,4)).getHeight(), 164.0, 0.1); 245 | } 246 | 247 | 248 | } 249 | -------------------------------------------------------------------------------- /src/main/java/io/tenmax/poppy/dataframes/BaseDataFrame.java: -------------------------------------------------------------------------------- 1 | package io.tenmax.poppy.dataframes; 2 | 3 | import io.tenmax.poppy.*; 4 | import io.tenmax.poppy.datasources.SimpleDataSource; 5 | import io.tenmax.poppy.exceptions.ColumnNotFoundException; 6 | import io.tenmax.poppy.exceptions.ReflectionException; 7 | import io.tenmax.poppy.iterators.ParallelIterator; 8 | import io.tenmax.poppy.iterators.SequantialIterator; 9 | import 
/**
 * Creates a frame over the given columns and indexes them by name.
 *
 * <p>Each column name is mapped to its position in {@code columns}. If two
 * columns share a name, the later position replaces the earlier one in the
 * lookup map.
 *
 * @param context execution context shared by this frame
 * @param columns column descriptors, in positional order
 */
public BaseDataFrame(ExecutionContext context, DataColumn[] columns) {
    this.context = context;
    this.columns = columns;
    this.columnsMap = new HashMap<>();

    for (int position = 0; position < columns.length; position++) {
        this.columnsMap.put(columns[position].getName(), position);
    }
}
groupedColumns) { 71 | DataColumn[] gc = new DataColumn[groupedColumns.length]; 72 | 73 | for (int i=0; i predicate) { 107 | return new FilterDataFrame(this, predicate); 108 | } 109 | 110 | @Override 111 | public DataFrame peek(Consumer consumer) { 112 | return new PeekDataFrame(this, consumer); 113 | } 114 | 115 | @Override 116 | public DataFrame parallel(int numThreads) { 117 | context.setNumThreads(numThreads); 118 | return this; 119 | } 120 | 121 | @Override 122 | public RandomAccessDataFrame cache() { 123 | return new CacheDataFrame(this); 124 | } 125 | 126 | @Override 127 | public void print() { 128 | Arrays.stream(columns).forEach(column ->{ 129 | System.out.printf("%s\t", column.getName()); 130 | }); 131 | System.out.println(); 132 | 133 | forEach((row) -> { 134 | for (Object o : row) { 135 | System.out.printf("%s\t", o); 136 | } 137 | 138 | System.out.println(); 139 | }); 140 | } 141 | 142 | @Override 143 | public Iterator iterator() { 144 | 145 | 146 | if (context.getNumThreads() > 1) { 147 | return new ParallelIterator(this); 148 | } else { 149 | return new SequantialIterator(this); 150 | } 151 | } 152 | 153 | public void forEachPartition(BiConsumer consumer) { 154 | int partitionCount = getPartitionCount(); 155 | 156 | if (partitionCount == 1) { 157 | for (int i = 0; i < partitionCount; i++) { 158 | Iterator partition = getPartition(i); 159 | while (partition.hasNext()) { 160 | consumer.accept(i, partition.next()); 161 | } 162 | } 163 | } else { 164 | forEachPartitionAsync(consumer).join(); 165 | } 166 | } 167 | 168 | public CompletableFuture forEachPartitionAsync(BiConsumer consumer) { 169 | ExecutorService executorService = Executors.newFixedThreadPool(context.getNumThreads()); 170 | int partitionCount = getPartitionCount(); 171 | CompletableFuture[] futures = new CompletableFuture[partitionCount]; 172 | 173 | for (int i=0; i { 177 | Iterator partition = getPartition(fi); 178 | while (partition.hasNext()) { 179 | consumer.accept(fi, partition.next()); 
180 | } 181 | }, executorService); 182 | } 183 | 184 | CompletableFuture future = CompletableFuture.allOf(futures); 185 | executorService.shutdown(); 186 | 187 | return future; 188 | } 189 | 190 | @Override 191 | public void to(DataSink sink) { 192 | sink.sinkStart(getPartitionCount(), columns); 193 | 194 | ExecutorService executorService = Executors.newFixedThreadPool(context.getNumThreads()); 195 | int partitionCount = getPartitionCount(); 196 | CompletableFuture[] futures = new CompletableFuture[partitionCount]; 197 | 198 | AtomicInteger counter = new AtomicInteger(); 199 | for (int i=0; i { 203 | sink.partitionStart(fi); 204 | 205 | Iterator partition = getPartition(fi); 206 | while (partition.hasNext()) { 207 | sink.partitionRow(fi, partition.next()); 208 | } 209 | 210 | sink.partitionComplete(fi); 211 | 212 | }, executorService); 213 | } 214 | 215 | CompletableFuture future = CompletableFuture.allOf(futures); 216 | executorService.shutdown(); 217 | future.join(); 218 | 219 | sink.sinkComplete(); 220 | } 221 | 222 | @Override 223 | public List toList() { 224 | ArrayList list = new ArrayList<>(); 225 | 226 | for (DataRow row : this) { 227 | List data = new ArrayList(); 228 | for (int i=0; i List toList(Class clazz) { 239 | ArrayList list = new ArrayList<>(); 240 | 241 | try { 242 | for (DataRow row : this) { 243 | T t = clazz.newInstance(); 244 | 245 | for (DataColumn column : columns) { 246 | if (PropertyUtils.isWriteable(t, column.getName())) { 247 | PropertyUtils.setProperty(t, column.getName(), row.get(column.getName())); 248 | } 249 | } 250 | list.add(t); 251 | } 252 | } catch (InstantiationException | 253 | IllegalAccessException | 254 | NoSuchMethodException | 255 | InvocationTargetException e) 256 | { 257 | throw new ReflectionException(e); 258 | } 259 | 260 | return list; 261 | } 262 | 263 | @Override 264 | public Map toMap() { 265 | HashMap map = new HashMap<>(); 266 | 267 | List keyColumns = new ArrayList<>(); 268 | List valueColumns = new 
ArrayList<>(); 269 | 270 | Set groupedColumnsSet = new HashSet(); 271 | 272 | for (DataColumn groupColumn : groupedColumns) { 273 | groupedColumnsSet.add(groupColumn.getName()); 274 | } 275 | 276 | for (int i=0; i Map toMap(Class keyClazz, Class valueClazz) { 305 | HashMap map = new HashMap<>(); 306 | 307 | List keyColumns = new ArrayList<>(); 308 | 309 | Set groupedColumnsSet = new HashSet(); 310 | 311 | for (DataColumn groupColumn : groupedColumns) { 312 | groupedColumnsSet.add(groupColumn.getName()); 313 | } 314 | 315 | for (int i=0; i keyProps = new HashSet<>(); 324 | props = PropertyUtils.getPropertyDescriptors(keyClazz); 325 | for (PropertyDescriptor prop : props) { 326 | if (prop.getWriteMethod() != null) { 327 | keyProps.add(prop.getName()); 328 | } 329 | } 330 | 331 | HashSet valueProps = new HashSet<>(); 332 | props = PropertyUtils.getPropertyDescriptors(valueClazz); 333 | for (PropertyDescriptor prop : props) { 334 | if (prop.getWriteMethod() != null) { 335 | valueProps.add(prop.getName()); 336 | } 337 | } 338 | 339 | 340 | try { 341 | for (DataRow row : this) { 342 | K key = keyClazz.newInstance(); 343 | V value = valueClazz.newInstance(); 344 | 345 | 346 | for (int i: keyColumns) { 347 | String columnName = columns[i].getName(); 348 | 349 | if (keyProps.contains(columnName)) { 350 | PropertyUtils.setProperty( 351 | key, 352 | columnName, 353 | row.get(columnName) 354 | ); 355 | } 356 | } 357 | 358 | for (DataColumn column: columns) { 359 | String columnName = column.getName(); 360 | 361 | if (valueProps.contains(columnName)) { 362 | PropertyUtils.setProperty( 363 | value, 364 | columnName, 365 | row.get(columnName) 366 | ); 367 | } 368 | } 369 | 370 | map.put(key, value); 371 | } 372 | } catch (InstantiationException | 373 | IllegalAccessException | 374 | NoSuchMethodException | 375 | InvocationTargetException e) 376 | { 377 | throw new ReflectionException(e); 378 | } 379 | 380 | return map; 381 | } 382 | 383 | abstract public int getPartitionCount(); 
384 | 385 | public ExecutionContext getContext() { 386 | return context; 387 | } 388 | 389 | abstract public Iterator getPartition(int index); 390 | 391 | public static DataFrame from(Iterable source, Class clazz) { 392 | return new SourceDataFrame(new SimpleDataSource(clazz, source)); 393 | } 394 | 395 | public static DataFrame from(DataSource source) { 396 | return new SourceDataFrame(source); 397 | } 398 | 399 | 400 | abstract class BaseDataRow implements DataRow { 401 | @Override 402 | public DataColumn[] getColumns() { 403 | return columns; 404 | } 405 | 406 | @Override 407 | public Object get(String name) { 408 | return get(columnsMap.get(name)); 409 | } 410 | 411 | @Override 412 | public String toString() { 413 | StringBuilder sb = new StringBuilder(); 414 | 415 | for (int i=0; i