├── .gitignore
└── src
    ├── constraints.proto
    ├── messages.proto
    └── summaries.proto


/.gitignore:
--------------------------------------------------------------------------------
1 | generated
2 | generated/


--------------------------------------------------------------------------------
/src/constraints.proto:
--------------------------------------------------------------------------------
 1 | syntax = "proto3";
 2 |
 3 | import "messages.proto";
 4 |
 5 | option java_package = "com.whylogs.core.constraint";
 6 | option java_outer_classname = "Constraints";
 7 | option java_multiple_files = true;
 8 |
 9 | // NOTE(review): the map<K, V> type parameters in DatasetConstraintMsg were
10 | // missing from the copy under review (a bare `map name = N;` is not valid
11 | // proto3). They are reconstructed below; confirm against upstream.
12 |
13 | /* Constraints specify one of the following binary boolean relationships. */
14 | enum Op {
15 |   // proto3 requires the first enum value to be zero.
16 |   unused = 0;
17 |   LT = 1;
18 |   LE = 2;
19 |   EQ = 3;
20 |   NE = 4;
21 |   GE = 5;
22 |   GT = 6;
23 | }
24 |
25 | /* Summary constraints specify a relationship between a summary field and a literal value,
26 |    or between two summary fields.
27 |    e.g.  'min' < 6
28 |          'std_dev' < 2.17
29 |          'min' > 'avg'
30 | */
31 | message SummaryConstraintMsg {
32 |   string name = 1;
33 |   // Left-hand side: name of a summary field.
34 |   string first_field = 2;
35 |   // Right-hand side: either another summary field or a literal value.
36 |   oneof second {
37 |     string second_field = 3;
38 |     double value = 4;
39 |   }
40 |   Op op = 5;
41 |   bool verbose = 6;
42 | }
43 |
44 | /* ValueConstraints express a binary boolean relationship between an implied numeric value and a literal.
45 |    These are applied to every incoming value that is processed by whylogs.
46 | */
47 | message ValueConstraintMsg {
48 |   string name = 1;
49 |   // Literal that the implied incoming value is compared against using `op`.
50 |   double value = 2;
51 |   Op op = 3;
52 |   bool verbose = 4;
53 | }
54 |
55 | // A list of value constraints.
56 | message ValueConstraintMsgs {
57 |   repeated ValueConstraintMsg constraints = 1;
58 | }
59 |
60 | // A list of summary constraints.
61 | message SummaryConstraintMsgs {
62 |   repeated SummaryConstraintMsg constraints = 1;
63 | }
64 |
65 | // Value and summary constraints for a dataset, together with its properties.
66 | message DatasetConstraintMsg {
67 |   DatasetProperties properties = 1;
68 |   // NOTE(review): presumably keyed by column/feature name — confirm.
69 |   map<string, ValueConstraintMsgs> value_constraints = 2;
70 |   map<string, SummaryConstraintMsgs> summary_constraints = 3;
71 | }
72 |
73 |


--------------------------------------------------------------------------------
/src/messages.proto:
--------------------------------------------------------------------------------
  1 | syntax = "proto3";
  2 |
  3 | import "google/protobuf/wrappers.proto";
  4 |
  5 | option java_package = "com.whylogs.core.message";
  6 | option java_outer_classname = "Messages";
  7 | option java_multiple_files = true;
  8 |
  9 | // NOTE(review): every map<K, V> field in this file had lost its type
 10 | // parameters in the copy under review (a bare `map name = N;` is not valid
 11 | // proto3). The parameters below are reconstructed; confirm against upstream.
 12 |
 13 | message Counters {
 14 |   int64 count = 1;
 15 |
 16 |   // Wrapper messages, so "not set" is distinguishable from an explicit 0.
 17 |   google.protobuf.Int64Value true_count = 2;
 18 |   google.protobuf.Int64Value null_count = 3;
 19 | }
 20 |
 21 | message InferredType {
 22 |   enum Type {
 23 |     UNKNOWN = 0;
 24 |     NULL = 1;
 25 |     FRACTIONAL = 2;
 26 |     INTEGRAL = 3;
 27 |     BOOLEAN = 4;
 28 |     STRING = 5;
 29 |   }
 30 |
 31 |   Type type = 1;
 32 |   double ratio = 2;
 33 | }
 34 |
 35 | message DoublesMessage {
 36 |   int64 count = 1;
 37 |   double min = 2;
 38 |   double max = 3;
 39 |   double sum = 4;
 40 | }
 41 |
 42 | message LongsMessage {
 43 |   int64 count = 1;
 44 |   int64 min = 2;
 45 |   int64 max = 3;
 46 |   int64 sum = 4;
 47 | }
 48 |
 49 | message VarianceMessage {
 50 |   int64 count = 1;
 51 |   double sum = 2; // sample variance * (n-1)
 52 |   double mean = 3;
 53 | }
 54 |
 55 | message FrequentNumbersSketchMessage {
 56 |   bytes sketch = 1;
 57 |   int32 lg_max_k = 2;
 58 | }
 59 |
 60 | message FrequentItemsSketchMessage {
 61 |   bytes sketch = 1;
 62 |   int32 lg_max_k = 2;
 63 | }
 64 |
 65 | message NumbersMessage {
 66 |   VarianceMessage variance = 1;
 67 |   // At most one of the two representations is set.
 68 |   oneof numbers {
 69 |     DoublesMessage doubles = 2;
 70 |     LongsMessage longs = 3;
 71 |   }
 72 |
 73 |   // sketches
 74 |   bytes histogram = 4;
 75 |   bytes theta = 5;
 76 |   bytes compact_theta = 6;
 77 |   FrequentNumbersSketchMessage frequent_numbers = 7;
 78 | }
 79 |
 80 | message CharPosMessage {
 81 |   string char_list = 1;
 82 |   // NOTE(review): <string, NumbersMessage> is reconstructed — confirm.
 83 |   map<string, NumbersMessage> char_pos_map = 2;
 84 | }
 85 |
 86 | message StringsMessage {
 87 |   int64 count = 1;
 88 |
 89 |   // sketches
 90 |   bytes theta = 2;
 91 |   bytes items = 3;
 92 |   bytes compact_theta = 4;
 93 |   NumbersMessage length = 5;
 94 |   NumbersMessage token_length = 6;
 95 |   CharPosMessage char_pos_tracker = 7;
 96 |
 97 | }
 98 |
 99 |
100 | message SchemaMessage {
101 |   // NOTE(review): <int32, int64> is reconstructed (presumably
102 |   // InferredType.Type number -> count) — confirm.
103 |   map<int32, int64> typeCounts = 1;
104 |   InferredType inferred_type = 2;
105 | }
106 |
107 | message ColumnMessage {
108 |   string name = 1;
109 |   Counters counters = 2;
110 |   SchemaMessage schema = 3;
111 |   NumbersMessage numbers = 4;
112 |   StringsMessage strings = 5;
113 |   InferredType inferred_type = 6;
114 |   FrequentItemsSketchMessage frequent_items = 7;
115 |   HllSketchMessage cardinality_tracker = 8;
116 | }
117 |
118 | message DatasetProperties {
119 |   uint32 schema_major_version = 1;
120 |   uint32 schema_minor_version = 2;
121 |
122 |   string session_id = 3;
123 |   int64 session_timestamp = 4;
124 |   int64 data_timestamp = 5;
125 |   map<string, string> tags = 6;
126 |   map<string, string> metadata = 7;
127 |   // TODO: store other configuration here
128 | }
129 |
130 | message ScoreMatrixMessage {
131 |   repeated string labels = 1;
132 |   string prediction_field = 2;
133 |   string target_field = 3;
134 |   string score_field = 4;
135 |
136 |   // a flattened NxN matrix (N = len(labels))
137 |   repeated NumbersMessage scores = 10;
138 | }
139 |
140 | message RegressionMetricsMessage {
141 |   string prediction_field = 1;
142 |   string target_field = 2;
143 |   uint64 count = 3;
144 |   double sum_abs_diff = 4;
145 |   double sum_diff = 5;
146 |   double sum2_diff = 6;
147 | }
148 |
149 | enum ModelType {
150 |   UNKNOWN = 0;
151 |   CLASSIFICATION = 1;
152 |   REGRESSION = 2;
153 |   EMBEDDINGS = 3;
154 | }
155 |
156 | message ModelMetricsMessage {
157 |   ScoreMatrixMessage scoreMatrix = 1;
158 |   ModelType modelType = 2;
159 |   RegressionMetricsMessage regressionMetrics = 3;
160 | }
161 |
162 | message ModelProfileMessage {
163 |   repeated string output_fields = 1;
164 |   // Reserving fields for ModelMessage
165 |
166 |   ModelMetricsMessage metrics = 10;
167 | }
168 |
169 | message DatasetProfileMessage {
170 |   DatasetProperties properties = 1;
171 |   map<string, ColumnMessage> columns = 2;
172 |   // reserve other fields for dataset level data
173 |   // NOTE(review): "modeProfile" looks like a typo for "modelProfile". Left
174 |   // as is: renaming changes generated accessors and the JSON field name.
175 |   ModelProfileMessage modeProfile = 10;
176 | }
177 |
178 | /**
179 |  * The following section is for transmission and reconstruction of the dataset
180 |  * in WhyLogs backend
181 |  */
182 | message ColumnsChunkSegment {
183 |   // UUID is required to aggregate to the original message
184 |   // This should map back to the original dataset
185 |   string marker = 1;
186 |   repeated ColumnMessage columns = 2;
187 | }
188 |
189 | message DatasetMetadataSegment {
190 |   string marker = 1;
191 |   DatasetProperties properties = 2;
192 | }
193 |
194 | // A segment of a dataset profile. This can be used to compose the
195 | // original object back
196 | message MessageSegment {
197 |   string marker = 1;
198 |   oneof item {
199 |     DatasetMetadataSegment metadata = 2;
200 |     ColumnsChunkSegment columns = 3;
201 |   }
202 | }
203 |
204 | message HllSketchMessage {
205 |   bytes sketch = 1;
206 |   int32 lg_k = 2;
207 | }
208 |


--------------------------------------------------------------------------------
/src/summaries.proto:
--------------------------------------------------------------------------------
  1 | syntax = "proto3";
  2 |
  3 | import "messages.proto";
  4 |
  5 | // NOTE(review): nothing in this file references google.protobuf.Any, so this
  6 | // import appears to be unused.
  7 | import "google/protobuf/any.proto";
  8 | import "google/protobuf/struct.proto";
  9 |
 10 | option java_package = "com.whylogs.core.message";
 11 | option java_outer_classname = "Summaries";
 12 | option java_multiple_files = true;
 13 |
 14 | // NOTE(review): every map<K, V> field in this file had lost its type
 15 | // parameters in the copy under review (a bare `map name = N;` is not valid
 16 | // proto3). The parameters below are reconstructed; confirm against upstream.
 17 |
 18 | message UniqueCountSummary {
 19 |   double estimate = 1;
 20 |   double upper = 2;
 21 |   double lower = 3;
 22 | }
 23 |
 24 | message FrequentStringsSummary {
 25 |   message FrequentItem {
 26 |     string value = 1;
 27 |     double estimate = 2;
 28 |   }
 29 |   repeated FrequentItem items = 1;
 30 | }
 31 |
 32 | message FrequentNumbersSummary {
 33 |   message FrequentDoubleItem {
 34 |     int64 estimate = 1;
 35 |     double value = 2;
 36 |     int32 rank = 3;
 37 |   }
 38 |   message FrequentLongItem {
 39 |     int64 estimate = 1;
 40 |     int64 value = 2;
 41 |     int32 rank = 3;
 42 |   }
 43 |   repeated FrequentDoubleItem doubles = 1;
 44 |   repeated FrequentLongItem longs = 2;
 45 | }
 46 |
 47 | message FrequentItemsSummary {
 48 |   message FrequentItem {
 49 |     int64 estimate = 1;
 50 |     string json_value = 2;
 51 |   }
 52 |   repeated FrequentItem items = 1;
 53 | }
 54 |
 55 |
 56 | message CharPosSummary {
 57 |   string character_list = 1;
 58 |   // NOTE(review): <string, NumberSummary> is reconstructed — confirm.
 59 |   map<string, NumberSummary> char_pos_map = 2;
 60 | }
 61 |
 62 | message StringsSummary {
 63 |   UniqueCountSummary unique_count = 1;
 64 |   FrequentStringsSummary frequent = 2;
 65 |   NumberSummary length = 3;
 66 |   NumberSummary token_length = 4;
 67 |   CharPosSummary char_pos_tracker = 5;
 68 | }
 69 |
 70 | message SchemaSummary {
 71 |   InferredType inferred_type = 1;
 72 |   // NOTE(review): <string, int64> is reconstructed — confirm.
 73 |   map<string, int64> type_counts = 2;
 74 | }
 75 |
 76 | message HistogramSummary {
 77 |   double start = 1;
 78 |   double end = 2;
 79 |   double width = 3;
 80 |   repeated int64 counts = 4;
 81 |
 82 |   double max = 5;
 83 |   double min = 6;
 84 |   repeated double bins = 7;
 85 |   int64 n = 8;
 86 |
 87 | }
 88 |
 89 | message QuantileSummary {
 90 |   repeated double quantiles = 1;
 91 |   repeated double quantile_values = 2;
 92 | }
 93 |
 94 | message NumberSummary {
 95 |   uint64 count = 1;
 96 |   double min = 2;
 97 |   double max = 3;
 98 |   double mean = 4;
 99 |   double stddev = 5;
100 |
101 |   HistogramSummary histogram = 6;
102 |   UniqueCountSummary unique_count = 7;
103 |   QuantileSummary quantiles = 8;
104 |   FrequentNumbersSummary frequent_numbers = 9;
105 |
106 |   bool is_discrete = 10;
107 | }
108 |
109 | message ColumnSummary {
110 |   Counters counters = 1;
111 |   SchemaSummary schema = 2;
112 |   NumberSummary number_summary = 3;
113 |   StringsSummary string_summary = 4;
114 |   FrequentItemsSummary frequent_items = 5;
115 |   UniqueCountSummary unique_count = 6;
116 | }
117 |
118 | message DatasetSummary {
119 |   DatasetProperties properties = 1;
120 |   map<string, ColumnSummary> columns = 2;
121 |   ModelSummary model = 3;
122 | }
123 |
124 | message ModelSummary {
125 |   MetricsSummary metrics = 1;
126 | }
127 |
128 | message MetricsSummary {
129 |   ModelType model_type = 1;
130 |   ROCCurve roc_fpr_tpr = 2;
131 |   RecallCurve recall_prec = 3;
132 |   ConfusionMatrix confusion_matrix = 4;
133 | }
134 |
135 | message ConfusionMatrix {
136 |   repeated string labels = 1;
137 |   string target_field = 2;
138 |   string predictions_field = 3;
139 |   string score_field = 4;
140 |   repeated google.protobuf.ListValue counts = 5; // e.g. [[33, 6], [11, 27]]
141 | }
142 |
143 | message ROCCurve {
144 |   // e.g. "values": [ [1, 0.42857], [1, 0.42857], ... ]
145 |   repeated google.protobuf.ListValue values = 1;
146 | }
147 |
148 |
149 | message RecallCurve {
150 |   // e.g. "values": [ [1, 1], [1, 1], ... ]
151 |   repeated google.protobuf.ListValue values = 1;
152 | }
153 |
154 | message DatasetSummaries {
155 |   map<string, DatasetSummary> profiles = 1;
156 | }
157 |


--------------------------------------------------------------------------------