├── README.md
├── 商业组第一名
│   └── 易观OLAP-PingCAP.pptx
├── 开源组第一名
│   ├── 0001-Add-AggregateFunctionPath.patch
│   ├── README.md
│   ├── from_24s_to_0_5s.vectorlinex.pptx
│   ├── readme.txt
│   ├── tools
│   │   ├── README.md
│   │   ├── a.sql
│   │   ├── col.model
│   │   ├── createtable.go
│   │   ├── import.sh
│   │   ├── index.go
│   │   ├── process_data.sh
│   │   ├── q.sh
│   │   ├── query_sql.sh
│   │   ├── queryx.sh
│   │   └── to_csv.sh
│   └── 易观OLAP比赛源码-向量线科技.zip
└── 易观
    ├── AggregationLDCount.java
    └── AggregationLDSum.java

/README.md:
--------------------------------------------------------------------------------
1 | # olap
2 | Analysys (易观) OLAP contest
3 | 
4 | 
5 | Analysys currently runs the funnel on Presto: the ordered funnel conversion is implemented as custom UDAFs (see AggregationLDCount.java and AggregationLDSum.java). The contest test cases are implemented as follows:
6 | 
7 | 1. January 2017, 7-day time window, event sequence 10001, 10004, 10008; result [3999974, 3995900, 3608934], 21 s:
8 | SELECT ld_sum(xwho_state, 3)
9 | FROM (SELECT ld_count(xwhen, 7 * 86400000, xwhat_id, '10001,10004,10008') AS xwho_state
10 | FROM t_funnel_devicelog
11 | WHERE day >= '20170101'
12 | AND day <= '20170131'
13 | AND xwhat_id IN (10004, 10001, 10008)
14 | GROUP BY xwho
15 | ) a;
16 | 
17 | 2. January 2017, 3-day time window, event sequence 10004, 10008, 10010; result [3999422, 3573367, 697506], 11 s:
18 | SELECT ld_sum(xwho_state, 3)
19 | FROM (SELECT ld_count(xwhen, 3 * 86400000, xwhat_id, '10004,10008,10010') AS xwho_state
20 | FROM t_funnel_devicelog
21 | WHERE day >= '20170101'
22 | AND day <= '20170131'
23 | AND xwhat_id IN (10004, 10010, 10008)
24 | GROUP BY xwho
25 | ) a;
26 | 
27 | 3. January 2017, 3-day time window, event sequence 10004, 10007, 10009, 10010, where the brand property of event 10004 is 'Apple'; result [3639301, 2449480, 559517, 35795], 14 s:
28 | SELECT ld_sum(xwho_state, 4)
29 | FROM (SELECT ld_count(xwhen, 3 * 86400000, xwhat_id, '10004,10007,10009,10010') AS xwho_state
30 | FROM t_funnel_devicelog
31 | WHERE day >= '20170101'
32 | AND day <= '20170131'
33 | AND (xwhat_id IN (10007, 10009, 10010)
34 | OR xwhat_id = 10004
35 | AND view_brand = 'Apple')
36 | GROUP BY xwho
37 | ) a;
38 | 
--------------------------------------------------------------------------------
/商业组第一名/易观OLAP-PingCAP.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/analysys/olap/c4a0078fbd363f2e5217d76aacecec26568e04d4/商业组第一名/易观OLAP-PingCAP.pptx
--------------------------------------------------------------------------------
/开源组第一名/0001-Add-AggregateFunctionPath.patch:
--------------------------------------------------------------------------------
1 | From 02075e1e1cbc023787d6f01f524d472f552ca051 Mon Sep 17 00:00:00 2001
2 | From: flow
3 | Date: Sun, 17 Sep 2017 15:49:46 +0800
4 | Subject: [PATCH] Add AggregateFunctionPath
5 | 
6 | ---
7 |  dbms/CMakeLists.txt | 2 +
8 |  .../AggregateFunctions/AggregateFunctionPath.cpp | 32 +++
9 |  .../src/AggregateFunctions/AggregateFunctionPath.h | 249 +++++++++++++++++++++
10 |  .../registerAggregateFunctions.cpp | 2 +
11 |  .../Storages/MergeTree/MergedBlockOutputStream.cpp | 8 +-
12 |  .../Storages/MergeTree/MergedBlockOutputStream.h | 8 +-
13 |  6 files changed, 296 insertions(+), 5 deletions(-)
14 |  create mode 100644 dbms/src/AggregateFunctions/AggregateFunctionPath.cpp
15 |  create mode 100644 dbms/src/AggregateFunctions/AggregateFunctionPath.h
16 | 
17 | diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
18 | index 0dcf288..e3cbbe9 100644
19 | --- a/dbms/CMakeLists.txt
20 | +++ b/dbms/CMakeLists.txt
21 | @@ -177,6 +177,7 @@ target_link_libraries (dbms
22 |      ${Boost_SYSTEM_LIBRARY}
23 |      ${Poco_Data_LIBRARY}
24 |      btrie
25 | +    daemon
26 |  )
27 | 
28 |  if (Poco_DataODBC_FOUND)
29 | @@ -212,6 +213,7 @@ target_include_directories
(dbms PUBLIC ${MYSQLXX_INCLUDE_DIR}) 30 | target_include_directories (dbms PRIVATE ${POCOEXT_INCLUDE_DIR}) 31 | target_include_directories (dbms PRIVATE ${COMMON_INCLUDE_DIR}) 32 | target_include_directories (dbms PUBLIC ${DBMS_INCLUDE_DIR}) 33 | +target_include_directories (dbms PUBLIC ${ClickHouse_SOURCE_DIR}/libs/libdaemon/include) 34 | 35 | if (ENABLE_TESTS) 36 | add_subdirectory (tests) 37 | diff --git a/dbms/src/AggregateFunctions/AggregateFunctionPath.cpp b/dbms/src/AggregateFunctions/AggregateFunctionPath.cpp 38 | new file mode 100644 39 | index 0000000..c3387f8 40 | --- /dev/null 41 | +++ b/dbms/src/AggregateFunctions/AggregateFunctionPath.cpp 42 | @@ -0,0 +1,32 @@ 43 | +#include 44 | +#include 45 | +#include 46 | + 47 | +namespace DB 48 | +{ 49 | + 50 | +namespace 51 | +{ 52 | + 53 | +AggregateFunctionPtr createAggregateFunctionPath(const std::string & name, const DataTypes & argument_types, const Array & params) 54 | +{ 55 | + 56 | + if (params.size() <= 0 || params.size() > 32) 57 | + throw Exception("Aggregate function " + name + " requires (1, 32] event ids.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); 58 | + 59 | + AggregateFunctionPtr res(createWithNumericType(*argument_types[1])); 60 | + 61 | + if (!res) 62 | + throw Exception("Illegal type " + argument_types[1]->getName() + " of argument 2 for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); 63 | + 64 | + return res; 65 | +} 66 | + 67 | +} 68 | + 69 | +void registerAggregateFunctionPath(AggregateFunctionFactory & factory) 70 | +{ 71 | + factory.registerFunction("path", createAggregateFunctionPath, AggregateFunctionFactory::CaseInsensitive); 72 | +} 73 | + 74 | +} 75 | diff --git a/dbms/src/AggregateFunctions/AggregateFunctionPath.h b/dbms/src/AggregateFunctions/AggregateFunctionPath.h 76 | new file mode 100644 77 | index 0000000..14d9265 78 | --- /dev/null 79 | +++ b/dbms/src/AggregateFunctions/AggregateFunctionPath.h 80 | @@ -0,0 +1,249 @@ 81 | +#pragma once 82 | + 83 | +#include 84 | +#include 85 | +#include 86 | +#include 87 | + 88 | +#include 89 | + 90 | +#include 91 | +#include 92 | + 93 | +#include 94 | + 95 | + 96 | +namespace DB 97 | +{ 98 | +struct ComparePairFirst final 99 | +{ 100 | + template 101 | + bool operator()(const std::pair & lhs, const std::pair & rhs) const 102 | + { 103 | + return lhs.first < rhs.first; 104 | + } 105 | +}; 106 | + 107 | +struct AggregateFunctionPathData final 108 | +{ 109 | + using Allocator = MixedArenaAllocator<4096>; 110 | + using TimestampEvent = std::pair; 111 | + using TimestampEvents = PODArray; 112 | + using Comparator = ComparePairFirst; 113 | + 114 | + bool done = false; 115 | + UInt32 level = 0; 116 | + 117 | + bool sorted = true; 118 | + TimestampEvents timestamp_events; 119 | + 120 | + void add(UInt64 timestamp, UInt16 event, Arena * arena) 121 | + { 122 | + if (done) 123 | + throw Exception("This group is done! Looks like you forgot to correctly manage your data among nodes"); 124 | + 125 | + // Since most events should have already been sorted by timestamp. 
126 | + if (sorted && timestamp_events.size() > 0 && timestamp_events.back().first > timestamp) 127 | + sorted = false; 128 | + timestamp_events.push_back(std::make_pair(timestamp, event), arena); 129 | + } 130 | + 131 | + void merge(const AggregateFunctionPathData & other, Arena * arena) 132 | + { 133 | + done = done || other.done; 134 | + level = std::max(level, other.level); 135 | + if (done) 136 | + return; 137 | + 138 | + const auto size = timestamp_events.size(); 139 | + 140 | + timestamp_events.insert(std::begin(other.timestamp_events), std::end(other.timestamp_events), arena); 141 | + 142 | + /// either sort whole container or do so partially merging ranges afterwards 143 | + if (!sorted && !other.sorted) 144 | + std::sort(std::begin(timestamp_events), std::end(timestamp_events), Comparator{}); 145 | + else 146 | + { 147 | + const auto begin = std::begin(timestamp_events); 148 | + const auto middle = std::next(begin, size); 149 | + const auto end = std::end(timestamp_events); 150 | + 151 | + if (!sorted) 152 | + std::sort(begin, middle, Comparator{}); 153 | + 154 | + if (!other.sorted) 155 | + std::sort(middle, end, Comparator{}); 156 | + 157 | + std::inplace_merge(begin, middle, end, Comparator{}); 158 | + } 159 | + 160 | + sorted = true; 161 | + } 162 | + 163 | + void sort() 164 | + { 165 | + if (!sorted) 166 | + { 167 | + std::sort(std::begin(timestamp_events), std::end(timestamp_events), Comparator{}); 168 | + sorted = true; 169 | + } 170 | + } 171 | +}; 172 | + 173 | + 174 | +template 175 | +class AggregateFunctionPath final : public IBinaryAggregateFunction> 176 | +{ 177 | +private: 178 | + using Events = UInt16[32]; 179 | + 180 | + UInt64 window; 181 | + Events check_events; 182 | + size_t check_events_size; 183 | + 184 | + // return the index + 1 of event 185 | + inline size_t findEventLevel(UInt16 event) const 186 | + { 187 | + for (size_t i = 0; i < check_events_size; i++) 188 | + { 189 | + if (event == check_events[i]) 190 | + { 191 | + return i + 1; 192 | + } 193 | + } 194 | + return 0xFFFF; 195 | + } 196 | + 197 | + UInt32 match(const AggregateFunctionPathData & data) const 198 | + { 199 | + if (data.done) 200 | + return data.level; 201 | + 202 | + if (check_events_size == 1) 203 | + return 1; 204 | + 205 | + const_cast(data).sort(); 206 | + 207 | + auto total_len = data.timestamp_events.size(); 208 | + size_t max_level = 0; 209 | + for (size_t i = total_len; i > 0; i--) 210 | + { 211 | + auto event = (data.timestamp_events)[i - 1].second; 212 | + auto event_level = findEventLevel(event); 213 | + if (event_level <= max_level) 214 | + continue; 215 | + 216 | + if (search(data, i, event_level)) 217 | + { 218 | + max_level = event_level; 219 | + if (max_level == check_events_size) 220 | + break; 221 | + } 222 | + } 223 | + 224 | + return max_level; 225 | + } 226 | + 227 | + 228 | + inline bool search(const AggregateFunctionPathData & data, size_t end_event_pos, size_t end_event_level) const 229 | + { 230 | + if (end_event_level == 1) 231 | + { 232 | + return true; 233 | + } 234 | + auto edge_time = (data.timestamp_events)[end_event_pos - 1].first - window; 235 | + auto event_level = end_event_level; 236 | + for (size_t i = end_event_pos; i > 0; i--) 237 | + { 238 | + auto time_event = (data.timestamp_events)[i - 1]; 239 | + if (time_event.first < edge_time) 240 | + return false; 241 | + if (check_events[event_level - 1] == time_event.second) 242 | + { 243 | + event_level--; 244 | + if (event_level == 0) 245 | + return true; 246 | + } 247 | + } 248 | + return false; 249 | 
+ } 250 | + 251 | +public: 252 | + String getName() const override 253 | + { 254 | + return "path"; 255 | + } 256 | + 257 | + DataTypePtr getReturnType() const override 258 | + { 259 | + return std::make_shared(); 260 | + } 261 | + 262 | + void setParameters(const Array & params) override 263 | + { 264 | + if (params.size() <= 1 || params.size() > 33) 265 | + throw Exception("Aggregate function " + getName() + " requires (windows_in_seconds, 1_to_32_event_ids).", 266 | + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); 267 | + 268 | + // Only support up to seconds in param, and timestamp is in milliseconds 269 | + window = params[0].safeGet() * 1000; 270 | + 271 | + check_events_size = params.size() - 1; 272 | + for (size_t i = 1; i < params.size(); i++) 273 | + { 274 | + UInt64 p = params[i].safeGet(); 275 | + check_events[i - 1] = (UInt16)p; 276 | + } 277 | + } 278 | + 279 | + void setArgumentsImpl(const DataTypes & arguments) 280 | + { 281 | + DataTypePtr timestampType = arguments[0]; 282 | + DataTypePtr eventType = arguments[1]; 283 | + 284 | + if (!(timestampType->getName() == "UInt64")) 285 | + throw Exception("Illegal type " + timestampType->getName() + " of argument for aggregate function " + getName() 286 | + + " (1 arg, timestamp: UInt64)", 287 | + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); 288 | + if (!(eventType->isNumeric())) 289 | + throw Exception( 290 | + "Illegal type " + eventType->getName() + " of argument for aggregate function " + getName() + " (2 arg, event id: numeric)", 291 | + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); 292 | + } 293 | + 294 | + void addImpl( 295 | + AggregateDataPtr place, const IColumn & column_timestamp, const IColumn & column_event, size_t row_num, Arena * arena) const 296 | + { 297 | + this->data(place).add( // 298 | + static_cast &>(column_timestamp).getData()[row_num], 299 | + static_cast &>(column_event).getData()[row_num], 300 | + arena); 301 | + } 302 | + 303 | + void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override 304 | + { 305 | + this->data(place).merge(this->data(rhs), arena); 306 | + } 307 | + 308 | + void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override 309 | + { 310 | + writeVarUInt(match(this->data(place)), buf); 311 | + } 312 | + 313 | + void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override 314 | + { 315 | + readVarUInt(this->data(place).level, buf); 316 | + this->data(place).done = true; 317 | + } 318 | + 319 | + void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override 320 | + { 321 | + static_cast(to).getData().push_back(match(this->data(place))); 322 | + } 323 | + 324 | + bool allocatesMemoryInArena() const override 325 | + { 326 | + return true; 327 | + } 328 | +}; 329 | +} 330 | diff --git a/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp b/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp 331 | index 5c8646f..f055e18 100644 332 | --- a/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp 333 | +++ b/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp 334 | @@ -5,6 +5,7 @@ 335 | namespace DB 336 | { 337 | 338 | +void registerAggregateFunctionPath(AggregateFunctionFactory & factory); 339 | void registerAggregateFunctionAvg(AggregateFunctionFactory & factory); 340 | void registerAggregateFunctionCount(AggregateFunctionFactory & factory); 341 | void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory); 342 | @@ -30,6 +31,7 @@ void registerAggregateFunctions() 343 | { 344 | 
auto & factory = AggregateFunctionFactory::instance();
345 | 
346 | +    registerAggregateFunctionPath(factory);
347 |      registerAggregateFunctionAvg(factory);
348 |      registerAggregateFunctionCount(factory);
349 |      registerAggregateFunctionGroupArray(factory);
350 | 
diff --git a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.cpp
351 | index 4936cd8..6e0b05f 100644
352 | --- a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.cpp
353 | +++ b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.cpp
354 | @@ -37,7 +37,7 @@ IMergedBlockOutputStream::IMergedBlockOutputStream(
355 |      min_compress_block_size(min_compress_block_size_),
356 |      max_compress_block_size(max_compress_block_size_),
357 |      aio_threshold(aio_threshold_),
358 | -    compression_method(compression_method_)
359 | +    _compression_method(compression_method_)
360 |  {
361 |  }
362 | 
363 | @@ -69,7 +69,7 @@ void IMergedBlockOutputStream::addStream(
364 |              path + escaped_column_name, NULL_MAP_EXTENSION,
365 |              path + escaped_column_name, NULL_MARKS_FILE_EXTENSION,
366 |              max_compress_block_size,
367 | -            compression_method,
368 | +            compression_method(name),
369 |              estimated_size,
370 |              aio_threshold);
371 | 
372 | @@ -91,7 +91,7 @@ void IMergedBlockOutputStream::addStream(
373 |              path + escaped_size_name, DATA_FILE_EXTENSION,
374 |              path + escaped_size_name, MARKS_FILE_EXTENSION,
375 |              max_compress_block_size,
376 | -            compression_method,
377 | +            compression_method(name),
378 |              estimated_size,
379 |              aio_threshold);
380 |      }
381 | @@ -105,7 +105,7 @@ void IMergedBlockOutputStream::addStream(
382 |          path + escaped_column_name, DATA_FILE_EXTENSION,
383 |          path + escaped_column_name, MARKS_FILE_EXTENSION,
384 |          max_compress_block_size,
385 | -        compression_method,
386 | +        compression_method(name),
387 |          estimated_size,
388 |          aio_threshold);
389 |  }
390 | diff --git a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h
391 | index 92a0dda..2749980 100644
392 | --- a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h
393 | +++ b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h
394 | @@ -81,9 +81,15 @@ protected:
395 | 
396 |      size_t aio_threshold;
397 | 
398 | -    CompressionMethod compression_method;
399 | +    CompressionMethod compression_method(const String & name){
400 | +        if(endsWith(name, "_nc")){
401 | +            return CompressionMethod::NONE;
402 | +        }
403 | +        return _compression_method;
404 | +    }
405 | 
406 |  private:
407 | +    CompressionMethod _compression_method;
408 |      /// Internal version of writeData.
409 |      void writeDataImpl(const String & name, const IDataType & type, const IColumn & column,
410 |          OffsetColumns & offset_columns, size_t level, bool write_array_data, bool skip_offsets);
411 | -- 
412 | 2.10.1 (Apple Git-78)
413 | 
414 | 
--------------------------------------------------------------------------------
/开源组第一名/README.md:
--------------------------------------------------------------------------------
1 | ## Usage
2 | 
3 | Pull the ClickHouse source from GitHub, then apply the patch:
4 | 
5 | ```
6 | git clone git@github.com:yandex/ClickHouse.git
7 | cd ClickHouse
8 | git checkout ab7672f329f7736756542268178e6f9f7e32325a
9 | git checkout -b path
10 | git apply 0001-Add-AggregateFunctionPath.patch
11 | ```
12 | 
13 | 
14 | ## Build
15 | Install all dependencies following the docs: https://clickhouse.yandex/docs/en/development/build.html
16 | 
17 | Then build the clickhouse binary with:
18 | 
19 | ```
20 | mkdir build
21 | cd build
22 | cmake ..
23 | make -j 8 clickhouse
24 | ls dbms/src/Server/clickhouse
25 | ```
26 | 
27 | The resulting executable:
28 | dbms/src/Server/clickhouse
29 | 
30 | 
31 | 
32 | 
33 | 
34 | ## Deployment
35 | 
36 | 
37 | 
38 | Install the dependencies on every target node:
39 | 
40 | ```
41 | sudo yum -y install rpm-build redhat-rpm-config gcc-c++ readline-devel\
42 | unixODBC-devel subversion python-devel git wget openssl-devel m4 createrepo\
43 | libicu-devel zlib-devel libtool-ltdl-devel
44 | ```
45 | 
46 | Then put the clickhouse binary into the /data/ccc/ directory, named ccc:
47 | 
48 | ```
49 | mkdir -p /data/ccc/
50 | cp clickhouse /data/ccc/ccc
51 | ```
52 | 
53 | Copy dbms/src/Server/config.xml from the source tree into /data/ccc/ and adjust the relevant settings:
54 | 
55 | ```
56 | /var/lib/clickhouse/        (data directory)
57 | /var/lib/clickhouse/tmp/    (temporary directory)
58 | The listen port and addresses:
59 | 9000
60 | 
61 | 
62 | ::1
63 | 127.0.0.1
64 | ```
65 | 
66 | Start the server:
67 | `./ccc --server --config-file=/data/ccc/config.xml`
68 | 
69 | Start the client:
70 | `./ccc --client --host 127.0.0.1 --port 9000`
71 | 
72 | 
73 | 
74 | ## Data import
75 | 
76 | Create the local table on every ClickHouse node:
77 | 
78 | ```
79 | CREATE TABLE event (
80 | user_id UInt32,
81 | timestamp_nc UInt64,
82 | event_id_nc UInt32,
83 | event_name String,
84 | event_tag_brand String,
85 | event_tag_content String,
86 | event_tag_how Int32,
87 | event_tag_page_num Int32,
88 | event_tag_price Int32,
89 | event_tag_price_all Int32,
90 | event_date_nc Date)
91 | ENGINE = MergeTree(event_date_nc, (user_id, timestamp_nc, event_date_nc), 8192);
92 | ```
93 | 
94 | Then create the distributed table on every ClickHouse node:
95 | 
96 | ```
97 | CREATE TABLE dist_event (
98 | user_id UInt32,
99 | timestamp_nc UInt64,
100 | event_id_nc UInt32,
101 | event_name String,
102 | event_tag_brand String,
103 | event_tag_content String,
104 | event_tag_how Int32,
105 | event_tag_page_num Int32,
106 | event_tag_price Int32,
107 | event_tag_price_all Int32,
108 | event_date_nc Date)
109 | ENGINE = Distributed(default, default, event, user_id);
110 | ```
111 | 
112 | The Distributed table needs a cluster definition added to the config file; see https://clickhouse.yandex/docs/en/table_engines/distributed.html (a minimal sketch follows this README).
113 | 
114 | Then use the tools in the tools folder to preprocess the data files and load them into the dist_event table; see `tools/README.md`.
115 | 
116 | 
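The `default` cluster named in the Distributed engine above is not defined anywhere in this repo. A minimal sketch of the config.xml fragment, assuming the four hosts listed in tools/q.sh each carry one shard (hosts and ports are taken from this repo's scripts; the single-replica layout is an assumption, not part of the original submission):

```
<remote_servers>
    <default>
        <shard><replica><host>10.9.161.77</host><port>9000</port></replica></shard>
        <shard><replica><host>10.9.113.205</host><port>9000</port></replica></shard>
        <shard><replica><host>10.9.83.235</host><port>9000</port></replica></shard>
        <shard><replica><host>10.9.169.253</host><port>9000</port></replica></shard>
    </default>
</remote_servers>
```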
--------------------------------------------------------------------------------
/开源组第一名/from_24s_to_0_5s.vectorlinex.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/analysys/olap/c4a0078fbd363f2e5217d76aacecec26568e04d4/开源组第一名/from_24s_to_0_5s.vectorlinex.pptx
--------------------------------------------------------------------------------
/开源组第一名/readme.txt:
--------------------------------------------------------------------------------
1 | Please download the zip archive for the full source.
--------------------------------------------------------------------------------
/开源组第一名/tools/README.md:
--------------------------------------------------------------------------------
1 | Data import and testing guide
2 | ---
3 | 
4 | #### Prerequisites
5 | - Build and deploy the ClickHouse service as described in the README one directory up
6 | - Install a Go environment for the data-processing tools
7 | 
8 | #### Data processing
9 | 
10 | - Install the third-party JSON package to speed up parsing
11 | ```
12 | ## third-party JSON package for faster JSON parsing
13 | go get -u github.com/json-iterator/go
14 | 
15 | ```
16 | 
17 | - Scan the data files once to detect the dynamic schema and generate the SQL and the model file
18 | ```
19 | go run createtable.go -files=`ls 2017*` | tee create_table.sql
20 | ```
21 | 
22 | - Process the data with the model file and write the result to the output directory
23 | ```
24 | ## If one node cannot hold all the data, spread it across machines by month and adjust `ls 2017XXX` accordingly
25 | output="`pwd`/output"
26 | mkdir -p $output
27 | for f in `ls 2017*`;do
28 |     go run index.go -file="$f" -out="`pwd`/output"
29 | done
30 | ```
31 | 
32 | - Generate the CSV files
33 | ```
34 | ## For the real data the "id" prefix must be stripped from user_id to speed up GROUP BY
35 | ## The prefix argument (2017) selects data files by name prefix; adjust it when several machines share the work
36 | sh to_csv.sh 2017
37 | ```
38 | 
39 | 
40 | - Import the data
41 | ```
42 | ## The prefix argument (2017) selects data files by name prefix; adjust it when several machines share the work
43 | sh import.sh 2017
44 | ```
45 | 
46 | - Merge data parts by month to speed up queries
47 | ```
48 | # https://clickhouse.yandex/docs/en/query_language/queries.html#optimize
49 | # run the following in each node's SQL shell to merge the parts month by month
50 | OPTIMIZE TABLE event PARTITION 201706 FINAL;
51 | OPTIMIZE TABLE event PARTITION 201707 FINAL;
52 | OPTIMIZE TABLE event PARTITION 201708 FINAL;
53 | ```
54 | 
55 | - Queries
56 |   - Test SQL: funnel for July and August, conversion path 10004, 10008, 10009, 10010, where the brand tag of event 10004 is Apple or LianX, with a 30-day window
57 |   - Set the clients variable in q.sh to the hosts of your nodes
58 |   - Using the a.sql example, run: `sh query_sql.sh a.sql`
59 |   - Result: `4000000 4000000 3999994 3999936`
60 | 
61 | 
62 | 
63 | 
64 | 
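a.sql (the next file) is the full contest query; at its core is the path() aggregate added by the patch, called as path(window_in_seconds, event_ids...)(timestamp, event_id). A minimal sketch of the call shape against the tables from the README above (the 7-day window of 604800 s and the two event ids are illustrative values, not contest parameters):

```
-- max funnel level reached per user for 10004 -> 10008 within a 7-day window
SELECT user_id,
       path(604800, 10004, 10008)(timestamp_nc, event_id_nc) AS level
FROM event
GROUP BY user_id
LIMIT 10;
```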
--------------------------------------------------------------------------------
/开源组第一名/tools/a.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 |     sumIf(c, level >= 1) AS _1,
3 |     sumIf(c, level >= 2) AS _2,
4 |     sumIf(c, level >= 3) AS _3,
5 |     sumIf(c, level >= 4) AS _4
6 | FROM
7 | (
8 |     SELECT
9 |         level,
10 |         count(*) AS c
11 |     FROM
12 |     (
13 |         SELECT
14 |             user_id,
15 |             path(2592000, 10004, 10008, 10009, 10010)(timestamp_nc, event_id_nc) AS level
16 |         FROM event
17 |         WHERE ( (event_date_nc >= toDate('2017-07-01')) AND (event_date_nc <= toDate('2017-08-31')) AND ( (event_id_nc IN (10008, 10009, 10010)) OR
18 |             (event_id_nc = 10004 AND (event_tag_brand = 'Apple' or event_tag_brand = 'LianX') ) ) )
19 |         GROUP BY user_id
20 |     )
21 |     GROUP BY level
22 |     ORDER BY level ASC
23 | );
--------------------------------------------------------------------------------
/开源组第一名/tools/col.model:
--------------------------------------------------------------------------------
1 | {"price":"Int32","content":"String","page_num":"Int32","price_all":"Int32","how":"Int32","brand":"String"}
--------------------------------------------------------------------------------
/开源组第一名/tools/createtable.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "bufio"
5 |     "bytes"
6 |     "flag"
7 |     "fmt"
8 |     "log"
9 |     "os"
10 |     "sort"
11 |     "strings"
12 |     "sync"
13 |     "unsafe"
14 | 
15 |     json "github.com/json-iterator/go"
16 | )
17 | 
18 | // This script scans the data of all input files and generates the modelFile in the current directory.
19 | var (
20 |     files  string
21 |     tagMap = make(map[string]string)
22 |     lock   sync.Mutex
23 |     index  = 4
24 | 
25 |     emptyjs   = []byte("{}")
26 |     tabBs     = []byte("\t")
27 |     modelFile = "col.model"
28 |     keys      = []string{}
29 | )
30 | 
31 | func init() {
32 |     flag.StringVar(&files, "files", "", "file to load")
33 | }
34 | 
35 | func main() {
36 |     flag.Parse()
37 |     var wg sync.WaitGroup
38 |     for _, f := range strings.Split(files, " ") {
39 |         wg.Add(1)
40 |         go func(f string) { // pass f explicitly: the range variable is reused across iterations
41 |             process(f)
42 |             wg.Done()
43 |         }(f)
44 |     }
45 |     wg.Wait()
46 | 
47 |     saveModel()
48 |     sql := getSql()
49 |     fmt.Println(sql)
50 | }
51 | 
52 | func process(f string) {
53 |     r, err := os.Open(f)
54 |     if err != nil {
55 |         panic(err)
56 |     }
57 |     sc := bufio.NewScanner(r)
58 |     for sc.Scan() {
59 |         js := bytes.Split(sc.Bytes(), tabBs)[index]
60 |         if bytes.Equal(js, emptyjs) {
61 |             continue
62 |         }
63 |         it := make(map[string]interface{})
64 |         err := json.Unmarshal(js, &it)
65 |         if err != nil {
66 |             log.Print(err.Error())
67 |         } else {
68 |             setTag(it)
69 |         }
70 |     }
71 | }
72 | 
73 | func setTag(it map[string]interface{}) {
74 |     for k, v := range it {
75 |         if _, ok := tagMap[k]; !ok {
76 |             lock.Lock()
77 |             str := "Int32"
78 |             switch v.(type) {
79 |             case string:
80 |                 str = "String"
81 |             }
82 |             tagMap[k] = str
83 |             lock.Unlock()
84 |         }
85 |     }
86 | }
87 | 
88 | func saveModel() {
89 |     for k := range tagMap {
90 |         keys = append(keys, k)
91 |     }
92 |     sort.Strings(keys)
93 | 
94 |     f, err := os.OpenFile(modelFile, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0660)
95 |     if err != nil {
96 |         panic(err.Error())
97 |     }
98 |     bs, _ := json.Marshal(tagMap)
99 |     f.Write(bs)
100 |     f.Close()
101 | }
102 | 
103 | func getSql() string {
104 |     sqlTemplate := `CREATE TABLE trend_event
105 | (
106 |     user_id UInt32,
107 |     timestamp UInt64,
108 |     event_id UInt32,
109 |     event_name String,
110 | 
111 | %s
112 |     event_date Date
113 | ) engine = MergeTree(event_date, (user_id, timestamp, event_date), 8192);
114 | `
115 |     bs := bytes.NewBuffer([]byte{})
116 |     for _, k := range keys {
117 |         v := tagMap[k]
118 |         bs.WriteString("    ")
119 |         bs.WriteString("event_tag_" + k)
120 |         bs.WriteString(" ")
121 |         bs.WriteString(v)
122 |         bs.WriteString(",\n")
123 |     }
124 |     return fmt.Sprintf(sqlTemplate, bs.String())
125 | }
126 | 
127 | func String2Bytes(s string) []byte {
128 |     x := (*[2]uintptr)(unsafe.Pointer(&s))
129 |     h := [3]uintptr{x[0], x[1], x[1]}
130 |     return *(*[]byte)(unsafe.Pointer(&h))
131 | }
132 | 
133 | func Bytes2String(b []byte) string {
134 |     return *(*string)(unsafe.Pointer(&b))
135 | }
136 | 
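For reference, running createtable.go over files that yield the col.model shown above should print roughly the following statement (reconstructed from the template in getSql() and the sorted model keys; exact whitespace may differ):

```
CREATE TABLE trend_event
(
    user_id UInt32,
    timestamp UInt64,
    event_id UInt32,
    event_name String,

    event_tag_brand String,
    event_tag_content String,
    event_tag_how Int32,
    event_tag_page_num Int32,
    event_tag_price Int32,
    event_tag_price_all Int32,
    event_date Date
) engine = MergeTree(event_date, (user_id, timestamp, event_date), 8192);
```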
--------------------------------------------------------------------------------
/开源组第一名/tools/import.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | prefix=$1
4 | echo ${prefix}
5 | for f in `ls csv/${prefix}*`;do
6 |     # stream each processed file into the distributed table created in the README
7 |     cat ${f} | /data/ccc/ccc --client --query="INSERT INTO dist_event FORMAT TabSeparated"
8 | done
--------------------------------------------------------------------------------
/开源组第一名/tools/index.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 |     "bufio"
5 |     "bytes"
6 |     "flag"
7 |     "fmt"
8 |     "os"
9 |     "path/filepath"
10 |     "sort"
11 |     "sync"
12 |     "unsafe"
13 | 
14 |     json "github.com/json-iterator/go"
15 | )
16 | 
17 | // Flattens the tag JSON of each row into fixed columns. The "id" prefix of user_id is stripped later, e.g.:
18 | // head output/20170501 | awk -F"\t" 'OFS="\t"{gsub("id", "",$1);$1=$1;print $0}'
19 | var (
20 |     file       string
21 |     outDir     string
22 |     tagMap     = make(map[string]string)
23 |     lock       sync.Mutex
24 |     index      = 4
25 |     emptyjs    = []byte("{}")
26 |     modelFile  = "col.model"
27 |     tabBs      = []byte("\t")
28 |     keys       []string
29 |     valueTypes []int
30 | )
31 | 
32 | const (
33 |     stringType = 1
34 |     intType    = 2
35 | )
36 | 
37 | func init() {
38 |     flag.StringVar(&file, "file", "", "file to load")
39 |     flag.StringVar(&outDir, "out", "/data/yiguan/output", "outdir path")
40 | }
41 | 
42 | func main() {
43 |     flag.Parse()
44 |     readModel()
45 |     process(file)
46 | }
47 | 
48 | func process(f string) {
49 |     r, err := os.Open(f)
50 |     if err != nil {
51 |         panic(err)
52 |     }
53 |     info, _ := r.Stat()
54 |     sc := bufio.NewScanner(r)
55 |     of, _ := os.OpenFile(filepath.Join(outDir, info.Name()), os.O_TRUNC|os.O_CREATE|os.O_RDWR, 0660)
56 |     output := bufio.NewWriter(of)
57 | 
58 |     var i = 0
59 |     for sc.Scan() {
60 |         i++
61 |         bs := bytes.Split(sc.Bytes(), tabBs)
62 |         // write fields 0-3 through unchanged
63 | 
64 |         output.Write(bs[0])
65 |         output.WriteRune('\t')
66 | 
67 |         output.Write(bs[1])
68 |         output.WriteRune('\t')
69 | 
70 |         output.Write(bs[2])
71 |         output.WriteRune('\t')
72 | 
73 |         output.Write(bs[3])
74 |         output.WriteRune('\t')
75 | 
76 |         it := make(map[string]interface{})
77 |         if !bytes.Equal(bs[4], emptyjs) {
78 |             err := json.Unmarshal(bs[4], &it)
79 |             if err != nil {
80 |                 panic(err)
81 |             }
82 |         }
83 |         for i, k := range keys {
84 |             v := valueTypes[i]
85 |             if v == stringType {
86 |                 if _, ok := it[k]; !ok {
87 |                     output.WriteRune(' ')
88 |                     output.WriteRune('\t')
89 |                     continue
90 |                 }
91 |                 output.WriteString(it[k].(string))
92 |             } else {
93 |                 if _, ok := it[k]; !ok {
94 |                     output.WriteRune('0')
95 |                     output.WriteRune('\t')
96 |                     continue
97 |                 }
98 |                 output.WriteString(fmt.Sprintf("%v", it[k]))
99 |             }
100 |             output.WriteRune('\t')
101 |         }
102 | 
103 |         // field 5: rewrite the yyyymmdd date, e.g. 20160707 -> 2016-07-07
104 |         dd := bs[5]
105 | 
106 |         output.Write(dd[:4])
107 |         output.WriteRune('-')
108 |         output.Write(dd[4:6])
109 |         output.WriteRune('-')
110 |         output.Write(dd[6:8])
111 |         output.WriteRune('\n')
112 |         if i%1000000 == 0 {
113 |             output.Flush()
114 |         }
115 |     }
116 |     output.Flush()
117 | 
118 |     fmt.Println("done => ", f)
119 | }
120 | 
121 | func readModel() {
122 |     f, _ := os.Open(modelFile)
123 |     err := json.NewDecoder(f).Decode(&tagMap)
124 |     if err != nil {
125 |         panic(err)
126 |     }
127 |     for k := range tagMap {
128 |         keys = append(keys, k)
129 |     }
130 |     sort.Strings(keys)
131 |     for _, k := range keys {
132 |         if tagMap[k] == "String" {
133 |             valueTypes = append(valueTypes, stringType)
134 |         } else {
135 |             valueTypes = append(valueTypes, intType)
136 |         }
137 |     }
138 | }
139 | 
140 | func String2Bytes(s string) []byte {
141 |     x := (*[2]uintptr)(unsafe.Pointer(&s))
142 |     h := [3]uintptr{x[0], x[1], x[1]}
143 |     return *(*[]byte)(unsafe.Pointer(&h))
144 | }
145 | 
146 | func Bytes2String(b []byte) string {
147 |     return *(*string)(unsafe.Pointer(&b))
148 | }
149 | 
150 | func cout(b []byte) {
151 |     fmt.Print(Bytes2String(b))
152 | }
153 | 
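A hypothetical illustration of the transformation index.go (above) performs; every field value below is invented, only the layout is taken from the code (tab-separated: user id, timestamp in ms, event id, event name, tag JSON, yyyymmdd date). Missing String tags become a single space (shown as <sp>), missing Int32 tags become 0, in col.model key order (brand, content, how, page_num, price, price_all), and the date is rewritten with dashes; the "id" prefix on user_id is stripped later by to_csv.sh:

```
input:   id10086  1499068800000  10004  pay_order  {"brand":"Apple","price":49}  20170703
output:  id10086  1499068800000  10004  pay_order  Apple  <sp>  0  0  49  0  2017-07-03
```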
--------------------------------------------------------------------------------
/开源组第一名/tools/process_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # 2631201798 rows in total
3 | # 668726242 668904531 637996737 655574288
4 | 
5 | prefix=$1
6 | output="`pwd`/output"
7 | mkdir -p $output
8 | for f in `ls /data/zhaoshu/${prefix}*`;do
9 |     ./index -file="$f" -out="`pwd`/output"
10 | done
11 | 
--------------------------------------------------------------------------------
/开源组第一名/tools/q.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | sql=$1
3 | clients="10.9.161.77 10.9.113.205 10.9.83.235 10.9.169.253"
4 | 
5 | function fetch(){
6 |     echo "$sql" | /data/ccc/ccc --client -m --host $1
7 | }
8 | for c in $clients;do
9 |     fetch "$c" &
10 | done
11 | 
12 | wait
--------------------------------------------------------------------------------
/开源组第一名/tools/query_sql.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | sql=$1
3 | sh q.sh "`cat $sql`" | awk '{for(i=1;i<=NF;i++){a[i] += $i}}END{ for(j=1;j<=length(a);j++) {printf a[j]" "
4 | };printf "\n" }'
--------------------------------------------------------------------------------
/开源组第一名/tools/queryx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | sql=$1
3 | sh q.sh "$sql" | awk '{for(i=1;i<=NF;i++){a[i] += $i}}END{ for(j=1;j<=length(a);j++) {printf a[j]" "
4 | };printf "\n" }'
--------------------------------------------------------------------------------
/开源组第一名/tools/to_csv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ## strip the "id" prefix from user_id
3 | 
4 | prefix=$1
5 | echo ${prefix}
6 | 
7 | cd output
8 | for f in `ls ${prefix}*`;do
9 |     awk -F"\t" 'OFS="\t"{gsub("id", "",$1);$1=$1;print $0}' ${f} > ../csv/${f}
10 | done
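query_sql.sh above fans a query out to every node via q.sh and then sums the per-node result columns with awk. A quick standalone check of that summing step (the same awk program; note that length() on an array relies on gawk):

```
printf '2 1\n3 2\n' | awk '{for(i=1;i<=NF;i++){a[i] += $i}}END{ for(j=1;j<=length(a);j++) {printf a[j]" "};printf "\n" }'
# prints: 5 3
```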
--------------------------------------------------------------------------------
/开源组第一名/易观OLAP比赛源码-向量线科技.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/analysys/olap/c4a0078fbd363f2e5217d76aacecec26568e04d4/开源组第一名/易观OLAP比赛源码-向量线科技.zip
--------------------------------------------------------------------------------
/易观/AggregationLDCount.java:
--------------------------------------------------------------------------------
1 | package aggregation;
2 | 
3 | import com.facebook.presto.spi.block.BlockBuilder;
4 | import com.facebook.presto.spi.function.*;
5 | import com.facebook.presto.spi.type.StandardTypes;
6 | 
7 | import io.airlift.slice.Slice;
8 | import io.airlift.slice.Slices;
9 | import state.SliceState;
10 | 
11 | import java.util.*;
12 | 
13 | /*
14 | Funnel aggregate function, step one; it can also retain the matched users.
15 | 
16 | Goal: funnel over the 20 days from Dec 1 to Dec 20, 7-day window, 3 events:
17 | select xwho, ld_count(xwhen, 7*86400000, xwhat, 'A,B,C') as xwho_state
18 | from tablename
19 | where ds >= '2016-12-01' and ds < '2016-12-21' and xwhat in ('A', 'B', 'C')
20 | group by xwho;
21 | 
22 | Output:
23 | 0001 2
24 | 0002 1
25 | 0003 2
26 | */
27 | @AggregationFunction("ld_count")
28 | public class AggregationLDCount extends AggregationBase {
29 | 
30 |     private static final int COUNT_FLAG_LENGTH = 10; // the state starts with 3 header fields (1 + 1 + 8 bytes)
31 |     private static final int COUNT_ONE_LENGTH = 8; // bytes per (event, timestamp) entry in the state (one long)
32 | 
33 |     @InputFunction
34 |     public static void input(SliceState state, // per-user state
35 |                              @SqlType(StandardTypes.BIGINT) long xwhen, // timestamp of the current event
36 |                              @SqlType(StandardTypes.BIGINT) long windows, // time window of the current query
37 |                              @SqlType(StandardTypes.VARCHAR) Slice xwhat, // name of the current event: A, B, or C
38 |                              @SqlType(StandardTypes.VARCHAR) Slice events) { // all events of the query, comma-separated
39 |         // fetch the state
40 |         Slice slice = state.getSlice();
41 | 
42 |         // initialize the event dictionary if necessary
43 |         if (!event_pos_dict.containsKey(events)) {
44 |             init_events(events, 0);
45 |         }
46 | 
47 |         // initialize the slice
48 |         if (null == slice) {
49 |             slice = Slices.allocate(COUNT_FLAG_LENGTH);
50 | 
51 |             // header: {saw event A (byte), number of events (byte), window size (long)}
52 |             slice.setByte(0, 0);
53 |             slice.setByte(1, event_pos_dict.get(events).size());
54 |             slice.setLong(2, windows);
55 | 
56 |         }
57 | 
58 |         // allocate a new slice and copy the old state into it
59 |         int slice_length = slice.length();
60 |         Slice new_slice = Slices.allocate(slice_length + COUNT_ONE_LENGTH);
61 |         new_slice.setBytes(0, slice.getBytes());
62 | 
63 |         // update the state
64 |         byte xwhat_index = event_pos_dict.get(events).get(xwhat);
65 |         if (xwhat_index == 0) {
66 |             new_slice.setByte(0, 1);
67 |         }
68 |         new_slice.setLong(slice_length, xwhen * 10 + xwhat_index); // pack: timestamp * 10 + event index
69 | 
70 |         // store the result
71 |         state.setSlice(new_slice);
72 |     }
73 | 
74 |     @CombineFunction
75 |     public static void combine(SliceState state, SliceState otherState) {
76 |         // fetch the states
77 |         Slice slice = state.getSlice();
78 |         Slice otherslice = otherState.getSlice();
79 | 
80 |         // merge the two states
81 |         if (null == slice) {
82 |             state.setSlice(otherslice);
83 |         } else {
84 |             int length1 = slice.length();
85 |             int length2 = otherslice.length();
86 | 
87 |             // allocate
88 |             Slice slice_new = Slices.allocate(length1 + length2 - COUNT_FLAG_LENGTH);
89 | 
90 |             // copy
91 |             slice_new.setBytes(0, slice.getBytes());
92 |             slice_new.setBytes(length1, otherslice.getBytes(), COUNT_FLAG_LENGTH, length2 - COUNT_FLAG_LENGTH);
93 |             if (otherslice.getByte(0) == 1) {
94 |                 slice_new.setByte(0, 1);
95 |             }
96 | 
97 |             // store the result
98 |             state.setSlice(slice_new);
99 |         }
100 |     }
101 | 
102 |     @OutputFunction(StandardTypes.INTEGER)
103 |     public static void output(SliceState state, BlockBuilder out) {
104 |         // fetch the state
105 |         Slice slice = state.getSlice();
106 | 
107 |         // empty state, or event A never occurred: return 0
108 |         if ((null == slice) || (slice.getByte(0) == 0)) {
109 |             out.writeInt(0);
110 |             out.closeEntry();
111 |             return;
112 |         }
113 | 
114 |         // build the list of packed entries in preparation for sorting
115 |         List<Long> time_array = new ArrayList<>();
116 |         for (int index = COUNT_FLAG_LENGTH; index < slice.length(); index += COUNT_ONE_LENGTH) {
117 |             time_array.add(slice.getLong(index));
118 |         }
119 | 
120 |         // sort the array (this can be expensive)
121 |         Collections.sort(time_array);
122 | 
123 |         // read the header fields
124 |         byte events_count = slice.getByte(1);
125 |         long windows = slice.getLong(2);
126 | 
127 |         // walk the events in timestamp order and compute the funnel depth
128 |         int max_xwhat_index = 0;
129 |         List<long[]> temp = new ArrayList<>();
130 |         for (long xwhen_xwhat: time_array) {
131 |             // unpack the ordered entry
132 |             long timestamp = xwhen_xwhat / 10;
133 |             byte xwhat = (byte) (xwhen_xwhat % 10);
134 | 
135 |             if (xwhat == 0) {
136 |                 // start a candidate: (timestamp of event A, index of the latest matched event)
137 |                 long[] flag = {timestamp, xwhat};
138 |                 temp.add(flag);
139 |             } else {
140 |                 // update the candidates from newest to oldest, breaking early where possible
141 |                 for (int i = temp.size() - 1; i >= 0; --i) {
142 |                     long[] flag = temp.get(i);
143 |                     if ((timestamp - flag[0]) >= windows) {
144 |                         // this candidate (and every older one) is outside the time window: stop
145 |                         break;
146 |                     } else if (xwhat == (flag[1] + 1)) {
147 |                         // the current event is the next step of this candidate: advance it and stop
148 |                         flag[1] = xwhat;
149 |                         if (max_xwhat_index < xwhat) {
150 |                             max_xwhat_index = xwhat;
151 |                         }
152 |                         break;
153 |                     }
154 |                 }
155 | 
156 |                 // the whole funnel is already complete: exit early
157 |                 if ((max_xwhat_index + 1) == events_count) {
158 |                     break;
159 |                 }
160 |             }
161 |         }
162 | 
163 |         // emit the result
164 |         out.writeInt(max_xwhat_index + 1);
165 |         out.closeEntry();
166 |     }
167 | }
168 | 
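AggregationLDCount packs each row into one long as xwhen * 10 + xwhat_index, so entries stay sortable by timestamp while the event index survives in the last decimal digit; the modulo-10 decode also means a single query can address at most 10 distinct events. A tiny standalone illustration (the timestamp is an invented sample value):

```
public class PackingDemo {
    public static void main(String[] args) {
        long xwhen = 1483228800000L;     // 2017-01-01T00:00:00Z in ms (sample value)
        byte xwhatIndex = 2;             // third event of the funnel
        long packed = xwhen * 10 + xwhatIndex;
        System.out.println(packed / 10); // 1483228800000 -> timestamp
        System.out.println(packed % 10); // 2             -> event index
    }
}
```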
--------------------------------------------------------------------------------
/易观/AggregationLDSum.java:
--------------------------------------------------------------------------------
1 | package aggregation;
2 | 
3 | // import com.facebook.presto.operator.aggregation.state.SliceState;
4 | 
5 | import com.facebook.presto.spi.block.BlockBuilder;
6 | import com.facebook.presto.spi.block.BlockBuilderStatus;
7 | import com.facebook.presto.spi.function.*;
8 | import com.facebook.presto.spi.type.BigintType;
9 | import com.facebook.presto.spi.type.StandardTypes;
10 | import io.airlift.slice.Slice;
11 | import io.airlift.slice.Slices;
12 | import state.SliceState;
13 | 
14 | /*
15 | Funnel aggregate function, step two.
16 | */
17 | @AggregationFunction("ld_sum")
18 | public class AggregationLDSum extends AggregationBase {
19 | 
20 |     @InputFunction
21 |     public static void input(SliceState state,
22 |                              @SqlType(StandardTypes.INTEGER) long xwho_state, // funnel depth of this user (ld_count output)
23 |                              @SqlType(StandardTypes.INTEGER) long events_count) { // number of events in the query
24 |         // fetch the state
25 |         Slice slice = state.getSlice();
26 | 
27 |         // initialize the state: events_count ints
28 |         if (null == slice) {
29 |             slice = Slices.allocate((int) events_count * 4);
30 |         }
31 | 
32 |         // count this user towards every step it reached
33 |         for (int status = 0; status < xwho_state; ++status) {
34 |             int index = status * 4;
35 |             slice.setInt(index, slice.getInt(index) + 1);
36 |         }
37 | 
38 |         // store the state
39 |         state.setSlice(slice);
40 |     }
41 | 
42 |     @CombineFunction
43 |     public static void combine(SliceState state, SliceState otherState) {
44 |         // fetch the states
45 |         Slice slice = state.getSlice();
46 |         Slice otherslice = otherState.getSlice();
47 | 
48 |         // merge the two states
49 |         if (null == slice) {
50 |             state.setSlice(otherslice);
51 |         } else {
52 |             for (int index = 0; index < slice.length(); index += 4) {
53 |                 slice.setInt(index, slice.getInt(index) + otherslice.getInt(index));
54 |             }
55 |             state.setSlice(slice);
56 |         }
57 |     }
58 | 
59 |     @OutputFunction("array(" + StandardTypes.BIGINT + ")")
60 |     public static void output(SliceState state, BlockBuilder out) {
61 |         // fetch the state
62 |         Slice slice = state.getSlice();
63 | 
64 |         // empty state: return an empty array
65 |         if (null == slice) {
66 |             BlockBuilder blockBuilder = BigintType.BIGINT.createBlockBuilder(new BlockBuilderStatus(), 0);
67 |             out.writeObject(blockBuilder.build());
68 |             out.closeEntry();
69 |             return;
70 |         }
71 | 
72 |         // build the result: [A:100, B:50, C:10, ...]
73 |         BlockBuilder blockBuilder = BigintType.BIGINT.createBlockBuilder(new BlockBuilderStatus(), slice.length() / 4);
74 |         for (int index = 0; index < slice.length(); index += 4) {
75 |             BigintType.BIGINT.writeLong(blockBuilder, slice.getInt(index));
76 |         }
77 | 
78 |         // emit the result
79 |         out.writeObject(blockBuilder.build());
80 |         out.closeEntry();
81 |     }
82 | 
83 | }
84 | 
--------------------------------------------------------------------------------
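A hypothetical follow-up, not part of the contest queries: ld_sum returns cumulative per-step user counts, so step conversion rates are one division away. A sketch in Presto SQL, reusing the first query from the top-level README (Presto arrays are 1-based):

```
SELECT
    funnel,
    CAST(funnel[2] AS double) / funnel[1] AS step2_rate,
    CAST(funnel[3] AS double) / funnel[2] AS step3_rate
FROM (
    SELECT ld_sum(xwho_state, 3) AS funnel
    FROM (SELECT ld_count(xwhen, 7 * 86400000, xwhat_id, '10001,10004,10008') AS xwho_state
          FROM t_funnel_devicelog
          WHERE day >= '20170101' AND day <= '20170131'
            AND xwhat_id IN (10001, 10004, 10008)
          GROUP BY xwho) a
) b;
```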