├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── base ├── CMakeLists.txt ├── actor.hpp ├── color.hpp ├── get_ip.hpp ├── magic.hpp ├── message.cpp ├── message.hpp ├── node.cpp ├── node.hpp ├── sarray_binstream.cpp ├── sarray_binstream.hpp ├── sarray_binstream_test.cpp ├── third_party │ ├── network_utils.cpp │ ├── network_utils.h │ ├── network_utils_test.cpp │ ├── range.h │ └── sarray.h └── threadsafe_queue.hpp ├── cmake ├── dep.cmake └── modules │ ├── gflags.cmake │ ├── glog.cmake │ ├── gtest.cmake │ └── zeromq.cmake ├── comm ├── CMakeLists.txt ├── abstract_mailbox.hpp ├── abstract_sender.hpp ├── basic_mailbox.cpp ├── basic_mailbox.hpp ├── resender.hpp ├── scheduler_mailbox.cpp ├── scheduler_mailbox.hpp ├── scheduler_mailbox_test.cpp ├── sender.cpp ├── sender.hpp ├── simple_sender.hpp ├── worker_mailbox.cpp ├── worker_mailbox.hpp └── worker_mailbox_test.cpp ├── core ├── CMakeLists.txt ├── abstract_collection_map.hpp ├── cache │ ├── abstract_fetcher.hpp │ ├── bin_to_part_mappers.hpp │ ├── fetcher.cpp │ ├── fetcher.hpp │ ├── fetcher_test.cpp │ └── typed_cache.hpp ├── collection_map.hpp ├── engine.cpp ├── engine.hpp ├── engine_elem.hpp ├── executor │ ├── abstract_executor.hpp │ ├── executor.cpp │ ├── executor.hpp │ ├── executor_test.cpp │ ├── thread_pool.cpp │ ├── thread_pool.hpp │ └── thread_pool_test.cpp ├── index │ ├── abstract_key_to_part_mapper.hpp │ ├── abstract_part_to_node_mapper.hpp │ ├── hash_key_to_part_mapper.hpp │ ├── hash_key_to_part_mapper_test.cpp │ ├── hash_part_to_node_mapper.hpp │ ├── hash_part_to_node_mapper_test.cpp │ ├── key_to_part_mappers.hpp │ ├── range_key_to_part_mapper.hpp │ └── simple_part_to_node_mapper.hpp ├── intermediate │ ├── abstract_intermediate_store.hpp │ ├── intermediate_store.hpp │ └── simple_intermediate_store.hpp ├── map_output │ ├── abstract_map_output.hpp │ ├── map_output_storage.cpp │ ├── map_output_storage.hpp │ ├── map_output_stream.hpp │ ├── map_output_stream_store.hpp │ ├── partitioned_map_output.hpp │ └── partitioned_map_output_test.cpp ├── partition │ ├── abstract_fetcher.hpp │ ├── abstract_partition.hpp │ ├── block_partition.hpp │ ├── file_partition.hpp │ ├── indexed_seq_partition.hpp │ ├── indexed_seq_partition_test.cpp │ ├── partition_manager.cpp │ ├── partition_manager.hpp │ ├── partition_manager_test.cpp │ ├── range_indexed_seq_partition.hpp │ ├── seq_partition.hpp │ ├── seq_partition_test.cpp │ └── task_timer.hpp ├── plan │ ├── abstract_function_store.hpp │ ├── checkpoint.hpp │ ├── collection.hpp │ ├── collection_spec.hpp │ ├── context.cpp │ ├── context.hpp │ ├── dag.cpp │ ├── dag.hpp │ ├── dag_test.cpp │ ├── distribute.hpp │ ├── function_store.cpp │ ├── function_store.hpp │ ├── load.hpp │ ├── mappartupdate.hpp │ ├── mappartwithupdate.hpp │ ├── mapupdate.hpp │ ├── mapupdate_test.cpp │ ├── mapwithupdate.hpp │ ├── mapwithupdate_test.hpp │ ├── plan_base.hpp │ ├── plan_spec.hpp │ ├── runner.cpp │ ├── runner.hpp │ ├── spec_wrapper.cpp │ ├── spec_wrapper.hpp │ ├── update_helper.hpp │ └── write.hpp ├── program_context.hpp ├── queue_node_map.hpp ├── scheduler │ ├── block_manager.cpp │ ├── block_manager.hpp │ ├── checkpoint_loader.cpp │ ├── checkpoint_loader.hpp │ ├── checkpoint_manager.cpp │ ├── checkpoint_manager.hpp │ ├── collection_manager.cpp │ ├── collection_manager.hpp │ ├── collection_status.cpp │ ├── collection_status.hpp │ ├── collection_view.hpp │ ├── control.cpp │ ├── control.hpp │ ├── control_manager.cpp │ ├── control_manager.hpp │ ├── dag_runner.cpp │ ├── dag_runner.hpp │ ├── distribute_manager.cpp │ 
├── distribute_manager.hpp │ ├── recover_manager.cpp │ ├── recover_manager.hpp │ ├── scheduler.cpp │ ├── scheduler.hpp │ ├── scheduler_elem.cpp │ ├── scheduler_elem.hpp │ ├── scheduler_test.cpp │ ├── worker.cpp │ ├── worker.hpp │ ├── worker_test.cpp │ ├── write_manager.cpp │ └── write_manager.hpp ├── shuffle_meta.hpp ├── tmp.cpp ├── tmp.hpp └── worker │ ├── abstract_plan_controller.hpp │ ├── controller.cpp │ ├── controller.hpp │ ├── delayed_combiner.cpp │ ├── delayed_combiner.hpp │ ├── plan_controller.cpp │ ├── plan_controller.hpp │ └── plan_controller_test.cpp ├── examples ├── CMakeLists.txt ├── a.cpp ├── crawler.cpp ├── crawler_util.py ├── graph_matching.cpp ├── kmeans │ ├── CMakeLists.txt │ ├── kmeans.cpp │ ├── kmeans_helper.hpp │ └── kmeans_row.cpp ├── load_example.cpp ├── lr │ ├── CMakeLists.txt │ ├── basic_lr.hpp │ ├── dense_lr.cpp │ ├── dense_lr_2.cpp │ ├── dense_lr_row.cpp │ └── sparse_lr.cpp ├── nomad.cpp ├── nomad2.cpp ├── pagerank │ ├── CMakeLists.txt │ ├── compare_pr.cpp │ ├── compare_pr.py │ ├── pagerank-converge-bsp.cpp │ ├── pagerank-converge-bsp.py │ ├── pagerank-converge.cpp │ ├── pagerank-converge.py │ ├── pagerank.cpp │ ├── pagerank_with.cpp │ ├── sum.cpp │ └── sum.py ├── scheduler_example.cpp ├── scheduler_main.cpp ├── sssp.cpp ├── test_cp.cpp ├── test_fetch.cpp ├── test_lb.cpp ├── tfidf │ ├── CMakeLists.txt │ ├── tfidf.cpp │ ├── tfidf.py │ ├── tfidf2.cpp │ ├── tfidf3.cpp │ ├── wordcount.cpp │ └── wordcount.py ├── tfidf_lr.cpp └── worker_example.cpp ├── io ├── CMakeLists.txt ├── abstract_block_reader.hpp ├── abstract_browser.hpp ├── abstract_reader.hpp ├── abstract_writer.hpp ├── assigner.cpp ├── assigner.hpp ├── assigner_test.cpp ├── fake_block_reader.hpp ├── fake_reader.hpp ├── fake_writer.hpp ├── hdfs_assigner_main.cpp ├── hdfs_block_reader.cpp ├── hdfs_block_reader.hpp ├── hdfs_block_reader_main.cpp ├── hdfs_browser.cpp ├── hdfs_browser.hpp ├── hdfs_helper.hpp ├── hdfs_reader.cpp ├── hdfs_reader.hpp ├── hdfs_reader_main.cpp ├── hdfs_writer.cpp ├── hdfs_writer.hpp ├── hdfs_writer_main.cpp ├── io_wrapper.hpp └── meta.hpp ├── machinefiles ├── 20nodes └── 5nodes ├── scripts ├── a.py ├── clang-format.py ├── cpplint.py ├── crawler.py ├── graph_matching.py ├── kill.py ├── kmeans.py ├── launch_utils.py ├── launcher.py ├── lint.py ├── load_example.py ├── lr.py ├── lr99.py ├── mailbox_example.py ├── nomad.py ├── nomad2.py ├── nomad99.py ├── pagerank.py ├── pagerank99.py ├── pagerank_with.py ├── pagerank_with99.py ├── sssp99.py ├── tfidf.py └── tfidf_lr99.py ├── test ├── CMakeLists.txt └── test_main.cpp └── utils ├── busy.cpp └── compile.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Temp files 2 | *.swp 3 | *.swo 4 | *.bak 5 | 6 | # Python 7 | *.pyc 8 | 9 | # C++ 10 | *.out 11 | *.o 12 | 13 | # Configure Directory 14 | conf/* 15 | 16 | # Build Directory 17 | build/* 18 | debug/* 19 | release/* 20 | 21 | # ThirdParty Temp Directory 22 | third_party/src/* 23 | third_party/tmp/* 24 | 25 | # Generated by doxygen 26 | html/* 27 | latex/* 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | group: edge 4 | 5 | language: python # workaround for not setting CC/CXX with default value 6 | 7 | python: 2.7 8 | 9 | addons: 10 | apt: 11 | sources: 12 | - ubuntu-toolchain-r-test 13 | - llvm-toolchain-precise-3.9 14 | packages: 15 | - build-essential 16 | - cmake 17 | - 
gcc-5 18 | - g++-5 19 | - clang-format-3.9 20 | 21 | before_install: 22 | - sudo add-apt-repository -y ppa:kojoley/boost 23 | - sudo apt-get -q update 24 | - sudo apt-get install -qq software-properties-common 25 | - sudo apt-get install -qq libgoogle-perftools-dev libzmq3-dev mercurial git 26 | - sudo apt-get -y install libboost-{chrono,program-options,date-time,thread,system,filesystem,regex,serialization}1.58{-dev,.0} 27 | 28 | install: 29 | - mkdir tmp-zmq 30 | - cd tmp-zmq 31 | - git clone https://github.com/zeromq/cppzmq 32 | - cd cppzmq 33 | - git reset --hard 4648ebc9643119cff2a433dff4609f1a5cb640ec # Pin an old cppzmq commit: only libzmq3-dev is available here, and newer zmq.hpp requires libzmq version 4 or above. 34 | - sudo cp zmq.hpp /usr/local/include 35 | - cd ../.. 36 | - rm -rf tmp-zmq 37 | 38 | before_script: 39 | - export CLANG_FORMAT=clang-format-3.9 40 | - ./scripts/lint.py 41 | - ./scripts/clang-format.py -o check 42 | 43 | matrix: 44 | include: 45 | - compiler: "gcc-5" 46 | env: CC=gcc-5 CXX=g++-5 47 | 48 | script: 49 | - mkdir release 50 | - cd release 51 | - cmake -DCMAKE_BUILD_TYPE=release -DBUILD_SHARED_LIBRARY=on .. 52 | - make -j4 HuskyUnitTest 53 | - ./HuskyUnitTest --gtest_shuffle # Shuffle the test order to check whether ordering causes time-out failures. 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017-2019 Husky Team 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /base/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | file(GLOB base-src-files 4 | node.cpp 5 | message.cpp 6 | sarray_binstream.cpp 7 | third_party/network_utils.cpp) 8 | 9 | add_library(base-objs OBJECT ${base-src-files}) 10 | set_property(TARGET base-objs PROPERTY CXX_STANDARD 11) 11 | add_dependencies(base-objs ${external_project_dependencies}) 12 | 13 | -------------------------------------------------------------------------------- /base/actor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <thread> 4 | 5 | #include "base/threadsafe_queue.hpp" 6 | #include "base/message.hpp" 7 | #include "base/sarray_binstream.hpp" 8 | 9 | namespace xyz { 10 | 11 | class Actor { 12 | public: 13 | Actor(int qid): queue_id_(qid) {} 14 | virtual ~Actor() = default; 15 | 16 | ThreadsafeQueue<Message>* GetWorkQueue() { return &work_queue_; } 17 | int Qid() const { return queue_id_; } 18 | 19 | virtual void Process(Message msg) = 0; 20 | 21 | void Start() { 22 | work_thread_ = std::thread([this]() { 23 | Main(); 24 | }); 25 | } 26 | 27 | void Stop() { 28 | Message msg; 29 | msg.meta.flag = Flag::kActorExit; 30 | work_queue_.Push(msg); 31 | work_thread_.join(); 32 | } 33 | 34 | private: 35 | void Main() { 36 | while (true) { 37 | Message msg; 38 | work_queue_.WaitAndPop(&msg); 39 | if (msg.meta.flag == Flag::kActorExit) { 40 | break; 41 | } 42 | Process(std::move(msg)); 43 | } 44 | } 45 | 46 | private: 47 | int queue_id_; 48 | ThreadsafeQueue<Message> work_queue_; 49 | std::thread work_thread_; 50 | }; 51 | 52 | } // namespace xyz 53 | 54 | -------------------------------------------------------------------------------- /base/color.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define BLUE(str) std::string("\033[1;34m") + str + std::string("\033[0m") 4 | #define GREEN(str) std::string("\033[1;32m") + str + std::string("\033[0m") 5 | #define RED(str) std::string("\033[1;31m") + str + std::string("\033[0m") 6 | #define YELLOW(str) std::string("\033[1;33m") + str + std::string("\033[0m") 7 | #define CLAY(str) std::string("\033[1;36m") + str + std::string("\033[0m") 8 | #define PURPLE(str) std::string("\033[1;35m") + str + std::string("\033[0m") 9 | #define WHITE(str) std::string("\033[1;37m") + str + std::string("\033[0m") 10 | #define BLACK(str) std::string("\033[1;30m") + str + std::string("\033[0m") 11 | -------------------------------------------------------------------------------- /base/get_ip.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <string> 4 | #include <cstring> 5 | #include <netdb.h> 6 | #include <sys/socket.h> 7 | #include <netinet/in.h> 8 | #include <arpa/inet.h> 9 | 10 | namespace xyz { 11 | namespace { 12 | 13 | std::string GetIP(std::string host) { 14 | struct hostent* phe = gethostbyname(host.c_str()); 15 | if (phe == nullptr) return ""; // name lookup failed 16 | for (int i = 0; phe->h_addr_list[i] != 0; i++) { 17 | struct in_addr addr; 18 | memcpy(&addr, phe->h_addr_list[i], sizeof(struct in_addr)); 19 | std::string ret = inet_ntoa(addr); 20 | return ret; // return the first address 21 | } 22 | return ""; // host resolved but has no addresses 23 | } 24 | 25 | } 26 | } // namespace xyz 27 | 28 | -------------------------------------------------------------------------------- /base/magic.hpp: -------------------------------------------------------------------------------- 1 | #include <string> 2 | #include 
"glog/logging.h" 3 | 4 | namespace xyz { 5 | namespace { 6 | 7 | // combine timeout 8 | // <0 : send without combine 9 | // 0: directly combine and send 10 | // 0-kMaxCombineTimeout: timeout in ms 11 | // >kMaxCombineTimeout: shuffle combine 12 | // used in DelayedCombiner (worker/delayed_combiner). 13 | const int kMaxCombineTimeout = 10000; 14 | 15 | const int kShuffleCombine = kMaxCombineTimeout + 1; 16 | const int kDirectCombine = 0; 17 | const int kNoCombine = -1; 18 | 19 | int ParseCombineTimeout(std::string s) { 20 | if (s == "kShuffleCombine") { 21 | return kShuffleCombine; 22 | } else if (s == "kDirectCombine") { 23 | return kDirectCombine; 24 | } else if (s == "kNoCombine") { 25 | return kNoCombine; 26 | } else { 27 | int timeout; 28 | try { 29 | timeout = std::stoi(s); 30 | if (timeout > kMaxCombineTimeout || timeout <= kDirectCombine) { 31 | CHECK(false) << "invalid combine_timeout: " << s; 32 | } 33 | } catch (...) { 34 | CHECK(false) << "invalid combine_timeout: " << s; 35 | } 36 | return timeout; 37 | } 38 | } 39 | 40 | } // namespace 41 | } // namespace xyz 42 | 43 | -------------------------------------------------------------------------------- /base/message.cpp: -------------------------------------------------------------------------------- 1 | #include "base/message.hpp" 2 | #include "base/sarray_binstream.hpp" 3 | 4 | namespace xyz { 5 | } // namespace xyz 6 | -------------------------------------------------------------------------------- /base/message.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "base/third_party/sarray.h" 8 | #include "base/node.hpp" 9 | 10 | namespace xyz { 11 | class SArrayBinStream; 12 | 13 | enum class Flag : char { 14 | kMailboxControl, 15 | kActorExit, 16 | kOthers 17 | }; 18 | static const char* FlagName[] = { 19 | "kMailboxControl", 20 | "kActorExit", 21 | "kOthers" 22 | }; 23 | 24 | struct Meta { 25 | int sender; 26 | int recver; 27 | Flag flag; 28 | 29 | std::string DebugString() const { 30 | std::stringstream ss; 31 | ss << "Meta: { "; 32 | ss << "sender: " << sender; 33 | ss << ", recver: " << recver; 34 | ss << ", flag: " << FlagName[static_cast(flag)]; 35 | ss << "}"; 36 | return ss.str(); 37 | } 38 | }; 39 | 40 | struct Message { 41 | Meta meta; 42 | std::vector> data; 43 | 44 | template 45 | void AddData(const third_party::SArray& val) { 46 | data.push_back(third_party::SArray(val)); 47 | } 48 | 49 | std::string DebugString() const { 50 | std::stringstream ss; 51 | ss << meta.DebugString(); 52 | if (data.size()) { 53 | ss << " Body:"; 54 | for (const auto& d : data) 55 | ss << " data_size=" << d.size(); 56 | } 57 | return ss.str(); 58 | } 59 | }; 60 | 61 | } // namespace xyz 62 | -------------------------------------------------------------------------------- /base/node.cpp: -------------------------------------------------------------------------------- 1 | #include "base/node.hpp" 2 | #include "base/sarray_binstream.hpp" 3 | 4 | #include 5 | 6 | namespace xyz { 7 | 8 | const int Node::kEmpty = std::numeric_limits::max(); 9 | 10 | 11 | SArrayBinStream& operator<<(xyz::SArrayBinStream& stream, const Node& node) { 12 | stream << node.id << node.hostname << node.port << node.is_recovery; 13 | return stream; 14 | } 15 | 16 | SArrayBinStream& operator>>(xyz::SArrayBinStream& stream, Node& node) { 17 | stream >> node.id >> node.hostname >> node.port >> node.is_recovery; 18 | return stream; 19 | } 20 | 21 | } // namespace 
xyz 22 | -------------------------------------------------------------------------------- /base/node.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace xyz { 6 | 7 | class SArrayBinStream; 8 | struct Node { 9 | static const int kEmpty; 10 | 11 | //Role role; 12 | int id; 13 | std::string hostname; 14 | int port; 15 | bool is_recovery; 16 | 17 | 18 | std::string DebugString() const { 19 | std::stringstream ss; 20 | ss << " { id=" << id << " hostname=" << hostname << " port=" << port << " is_recovery=" << is_recovery << " }"; 21 | return ss.str(); 22 | } 23 | 24 | bool operator==(const Node& other) const { 25 | return id == other.id && hostname == other.hostname && port == other.port && is_recovery == other.is_recovery; 26 | } 27 | 28 | friend SArrayBinStream& operator<<(xyz::SArrayBinStream& stream, const Node& node); 29 | friend SArrayBinStream& operator>>(xyz::SArrayBinStream& stream, Node& node); 30 | }; 31 | 32 | } // namespace xyz 33 | -------------------------------------------------------------------------------- /base/sarray_binstream.cpp: -------------------------------------------------------------------------------- 1 | #include "base/sarray_binstream.hpp" 2 | 3 | namespace xyz { 4 | 5 | size_t SArrayBinStream::Size() const { return buffer_.size() - front_; } 6 | 7 | void SArrayBinStream::AddBin(const char* bin, size_t sz) { 8 | buffer_.append_bytes_double(bin, sz); 9 | } 10 | 11 | void* SArrayBinStream::PopBin(size_t sz) { 12 | CHECK_LE(front_ + sz, buffer_.size()); 13 | void* ret = &buffer_[front_]; 14 | front_ += sz; 15 | return ret; 16 | } 17 | 18 | Message SArrayBinStream::ToMsg() const { 19 | Message msg; 20 | msg.AddData(buffer_); 21 | return msg; 22 | } 23 | 24 | void SArrayBinStream::FromMsg(const Message& msg) { 25 | //CHECK_EQ(msg.data.size(), 1); 26 | FromSArray(msg.data[0]); 27 | } 28 | 29 | third_party::SArray SArrayBinStream::ToSArray() const { 30 | return buffer_; 31 | } 32 | 33 | SArrayBinStream& operator<<(SArrayBinStream& stream, const SArrayBinStream& bin) { 34 | stream << bin.Size(); 35 | stream.AddBin(bin.GetPtr(), bin.Size()); 36 | return stream; 37 | } 38 | SArrayBinStream& operator>>(SArrayBinStream& stream, SArrayBinStream& bin) { 39 | size_t len; 40 | stream >> len; 41 | CHECK(bin.Size() == 0); 42 | bin.CopyFrom(stream.GetPtr(), len); 43 | stream.PopBin(len); 44 | return stream; 45 | } 46 | 47 | } // namespace xyz 48 | -------------------------------------------------------------------------------- /base/third_party/network_utils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * From ps-lite 3 | */ 4 | #pragma once 5 | 6 | #include 7 | #ifdef _MSC_VER 8 | #include 9 | #include 10 | #include 11 | #include 12 | #undef interface 13 | #else 14 | #include 15 | #include 16 | #include 17 | #include 18 | #endif 19 | #include 20 | #include 21 | 22 | namespace xyz { 23 | namespace third_party { 24 | 25 | /** 26 | * \brief return the IP address for given interface eth0, eth1, ... 
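 *
 * Illustrative call (example added here, not in the original header; names match the declaration below):
 *   std::string ip;
 *   GetIP("eth0", &ip);  // fills ip with the IPv4 address bound to eth0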
27 | */ 28 | void GetIP(const std::string& interface, std::string* ip); 29 | 30 | 31 | /** 32 | * \brief return the IP address and interface name of the first interface which is not 33 | * loopback 34 | * 35 | * only support IPv4 36 | */ 37 | void GetAvailableInterfaceAndIP(std::string* interface, std::string* ip); 38 | 39 | /** 40 | * \brief return an available port on local machine 41 | * 42 | * only support IPv4 43 | * \return 0 on failure 44 | */ 45 | int GetAvailablePort(); 46 | 47 | } // namespace third_party 48 | } // namespace xyz -------------------------------------------------------------------------------- /base/third_party/network_utils_test.cpp: -------------------------------------------------------------------------------- 1 | #include "glog/logging.h" 2 | #include "gtest/gtest.h" 3 | 4 | #include "base/third_party/network_utils.cpp" 5 | 6 | namespace xyz { 7 | namespace third_party { 8 | namespace { 9 | 10 | class TestNetworkUtils : public testing::Test {}; 11 | 12 | TEST_F(TestNetworkUtils, GetAvailableInterfaceAndIP) { 13 | std::string interface; 14 | std::string ip; 15 | GetAvailableInterfaceAndIP(&interface, &ip); 16 | EXPECT_NE(interface, ""); 17 | EXPECT_NE(ip, ""); 18 | VLOG(1) << "interface: " << interface; 19 | VLOG(1) << "ip: " << ip; 20 | } 21 | 22 | TEST_F(TestNetworkUtils, GetAvailablePort) { 23 | int port = 0; 24 | port = GetAvailablePort(); 25 | EXPECT_NE(port, 0); 26 | VLOG(1) << "port: " << port; 27 | } 28 | 29 | } // namespace 30 | } // namespace third_party 31 | } // namespace xyz -------------------------------------------------------------------------------- /base/third_party/range.h: -------------------------------------------------------------------------------- 1 | /** 2 | * From ps-lite 3 | */ 4 | #pragma once 5 | 6 | #include <cstdint> 7 | 8 | namespace xyz { 9 | namespace third_party { 10 | 11 | /** 12 | * \brief a range [begin, end) 13 | */ 14 | class Range { 15 | public: 16 | Range() : Range(0, 0) {} 17 | Range(uint64_t begin, uint64_t end) : begin_(begin), end_(end) { } 18 | 19 | uint64_t begin() const { return begin_; } 20 | uint64_t end() const { return end_; } 21 | uint64_t size() const { return end_ - begin_; } 22 | /* 23 | friend SArrayBinStream& operator<<(SArrayBinStream& stream, const third_party::Range& range) { 24 | stream << range.begin() << range.end(); 25 | return stream; 26 | } 27 | friend SArrayBinStream& operator>>(SArrayBinStream& stream, third_party::Range& range) { 28 | stream >> range.begin() >> range.end(); 29 | return stream; 30 | } 31 | */ 32 | private: 33 | uint64_t begin_; 34 | uint64_t end_; 35 | }; 36 | 37 | } // namespace third_party 38 | } // namespace xyz 39 | -------------------------------------------------------------------------------- /base/threadsafe_queue.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <condition_variable> 4 | #include <mutex> 5 | #include <queue> 6 | #include <utility> 7 | 8 | namespace xyz { 9 | 10 | template <typename T> 11 | class ThreadsafeQueue { 12 | public: 13 | ThreadsafeQueue() = default; 14 | ~ThreadsafeQueue() = default; 15 | ThreadsafeQueue(const ThreadsafeQueue&) = delete; 16 | ThreadsafeQueue& operator=(const ThreadsafeQueue&) = delete; 17 | ThreadsafeQueue(ThreadsafeQueue&&) = delete; 18 | ThreadsafeQueue& operator=(ThreadsafeQueue&&) = delete; 19 | 20 | void Push(T elem) { 21 | mu_.lock(); 22 | queue_.push(std::move(elem)); 23 | mu_.unlock(); 24 | cond_.notify_all(); 25 | } 26 | 27 | void WaitAndPop(T* elem) { 28 | std::unique_lock<std::mutex> lk(mu_); 29 | cond_.wait(lk, [this] { return 
!queue_.empty(); }); 30 | *elem = std::move(queue_.front()); 31 | queue_.pop(); 32 | } 33 | 34 | int Size() { 35 | std::lock_guard lk(mu_); 36 | return queue_.size(); 37 | } 38 | 39 | private: 40 | std::mutex mu_; 41 | std::queue queue_; 42 | std::condition_variable cond_; 43 | }; 44 | 45 | } // namespace xyz 46 | -------------------------------------------------------------------------------- /cmake/dep.cmake: -------------------------------------------------------------------------------- 1 | ### LibHDFS3 ### 2 | 3 | find_path(LIBHDFS3_INCLUDE_DIR NAMES hdfs/hdfs.h) 4 | find_library(LIBHDFS3_LIBRARY NAMES hdfs3) 5 | if(LIBHDFS3_INCLUDE_DIR AND LIBHDFS3_LIBRARY) 6 | set(LIBHDFS3_FOUND true) 7 | endif(LIBHDFS3_INCLUDE_DIR AND LIBHDFS3_LIBRARY) 8 | if(LIBHDFS3_FOUND) 9 | set(LIBHDFS3_DEFINITION "-DWITH_HDFS") 10 | if(NOT LIBHDFS3_FIND_QUIETLY) 11 | message (STATUS "Found libhdfs3:") 12 | message (STATUS " (Headers) ${LIBHDFS3_INCLUDE_DIR}") 13 | message (STATUS " (Library) ${LIBHDFS3_LIBRARY}") 14 | message (STATUS " (Definition) ${LIBHDFS3_DEFINITION}") 15 | endif(NOT LIBHDFS3_FIND_QUIETLY) 16 | else(LIBHDFS3_FOUND) 17 | message(STATUS "Could NOT find libhdfs3") 18 | endif(LIBHDFS3_FOUND) 19 | if(WITHOUT_HDFS) 20 | unset(LIBHDFS3_FOUND) 21 | message(STATUS "Not using libhdfs3 due to WITHOUT_HDFS option") 22 | endif(WITHOUT_HDFS) 23 | -------------------------------------------------------------------------------- /cmake/modules/gflags.cmake: -------------------------------------------------------------------------------- 1 | 2 | include (GNUInstallDirs) 3 | 4 | if(GFLAGS_SEARCH_PATH) 5 | # Note: if using GFLAGS_SEARCH_PATH, the customized format is not activated. 6 | find_path(GFLAGS_INCLUDE_DIR NAMES gflags/gflags.h PATHS ${GFLAGS_SEARCH_PATH} NO_SYSTEM_ENVIRONMENT_PATH) 7 | find_library(GFLAGS_LIBRARY NAMES gflags PATHS ${GFLAGS_SEARCH_PATH} NO_SYSTEM_ENVIRONMENT_PATH) 8 | message(STATUS "Found GFlags in search path ${GFLAGS_SEARCH_PATH}") 9 | message(STATUS " (Headers) ${GFLAGS_INCLUDE_DIR}") 10 | message(STATUS " (Library) ${GFLAGS_LIBRARY}") 11 | else(GFLAGS_SEARCH_PATH) 12 | include(ExternalProject) 13 | set(THIRDPARTY_DIR ${PROJECT_BINARY_DIR}/third_party) 14 | ExternalProject_Add( 15 | gflags 16 | GIT_REPOSITORY "https://github.com/gflags/gflags" 17 | GIT_TAG v2.2.1 18 | PREFIX ${THIRDPARTY_DIR} 19 | CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROJECT_BINARY_DIR} 20 | CMAKE_ARGS -DWITH_GFLAGS=OFF 21 | CMAKE_ARGS -DBUILD_TESTING=OFF 22 | CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF 23 | UPDATE_COMMAND "" 24 | ) 25 | list(APPEND external_project_dependencies gflags) 26 | set(GFLAGS_INCLUDE_DIR "${PROJECT_BINARY_DIR}/include") 27 | if(WIN32) 28 | set(GFLAGS_LIBRARY "${PROJECT_BINARY_DIR}/lib/libgflags.lib") 29 | else(WIN32) 30 | set(GFLAGS_LIBRARY "${PROJECT_BINARY_DIR}/lib/libgflags.a") 31 | endif(WIN32) 32 | message(STATUS "GFlags will be built as a third party") 33 | message(STATUS " (Headers should be) ${GFLAGS_INCLUDE_DIR}") 34 | message(STATUS " (Library should be) ${GFLAGS_LIBRARY}") 35 | endif(GFLAGS_SEARCH_PATH) 36 | -------------------------------------------------------------------------------- /cmake/modules/glog.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Husky Team 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | ### GLOG ### 17 | 18 | include (GNUInstallDirs) 19 | 20 | if(GLOG_SEARCH_PATH) 21 | # Note: if using GLOG_SEARCH_PATH, the customized format is not activated. 22 | find_path(GLOG_INCLUDE_DIR NAMES glog/logging.h PATHS ${GLOG_SEARCH_PATH} NO_SYSTEM_ENVIRONMENT_PATH) 23 | find_library(GLOG_LIBRARY NAMES glog PATHS ${GLOG_SEARCH_PATH} NO_SYSTEM_ENVIRONMENT_PATH) 24 | message(STATUS "Found GLog in search path ${GLOG_SEARCH_PATH}") 25 | message(STATUS " (Headers) ${GLOG_INCLUDE_DIR}") 26 | message(STATUS " (Library) ${GLOG_LIBRARY}") 27 | else(GLOG_SEARCH_PATH) 28 | include(ExternalProject) 29 | set(THIRDPARTY_DIR ${PROJECT_BINARY_DIR}/third_party) 30 | ExternalProject_Add( 31 | glog 32 | GIT_REPOSITORY "https://github.com/google/glog" 33 | GIT_TAG v0.3.5 34 | PREFIX ${THIRDPARTY_DIR} 35 | CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROJECT_BINARY_DIR} 36 | CMAKE_ARGS -DWITH_GFLAGS=OFF 37 | CMAKE_ARGS -DBUILD_TESTING=OFF 38 | CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF 39 | UPDATE_COMMAND "" 40 | ) 41 | list(APPEND external_project_dependencies glog) 42 | set(GLOG_INCLUDE_DIR "${PROJECT_BINARY_DIR}/include") 43 | if(WIN32) 44 | set(GLOG_LIBRARY "${PROJECT_BINARY_DIR}/lib/libglog.lib") 45 | else(WIN32) 46 | set(GLOG_LIBRARY "${PROJECT_BINARY_DIR}/lib/libglog.a") 47 | endif(WIN32) 48 | message(STATUS "GLog will be built as a third party") 49 | message(STATUS " (Headers should be) ${GLOG_INCLUDE_DIR}") 50 | message(STATUS " (Library should be) ${GLOG_LIBRARY}") 51 | endif(GLOG_SEARCH_PATH) 52 | -------------------------------------------------------------------------------- /comm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | file(GLOB comm-src-files 4 | basic_mailbox.cpp 5 | worker_mailbox.cpp 6 | scheduler_mailbox.cpp 7 | sender.cpp) 8 | 9 | add_library(comm-objs OBJECT ${comm-src-files}) 10 | set_property(TARGET comm-objs PROPERTY CXX_STANDARD 11) 11 | add_dependencies(comm-objs ${external_project_dependencies}) 12 | -------------------------------------------------------------------------------- /comm/abstract_mailbox.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/message.hpp" 4 | 5 | namespace xyz { 6 | 7 | class AbstractMailbox { 8 | public: 9 | virtual ~AbstractMailbox() = default; 10 | virtual int Send(const Message &msg) = 0; 11 | }; 12 | 13 | } // namespace xyz 14 | -------------------------------------------------------------------------------- /comm/abstract_sender.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/message.hpp" 4 | 5 | namespace xyz { 6 | 7 | class AbstractSender { 8 | public: 9 | virtual ~AbstractSender() {} 10 | virtual void Send(Message msg) = 0; 11 | }; 12 | 13 | } // namespace xyz 14 | -------------------------------------------------------------------------------- /comm/resender.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef RESENDER_H_ 2 | #define RESENDER_H_ 3 | #include <atomic> 4 | #include <mutex> 5 | #include <thread> 6 | #include <unordered_set> 7 | 8 | #include "base/message.hpp" 9 | namespace xyz { 10 | 11 | // resend a message if no ack is received within a given time 12 | class Resender { 13 | public: 14 | // timeout in milliseconds 15 | Resender(int timeout, int max_retry, Mailbox *mailbox) { 16 | timeout_ = timeout; 17 | max_retry_ = max_retry; 18 | mailbox_ = mailbox; 19 | monitor_ = new std::thread(&Resender::Monitoring, this); 20 | } 21 | ~Resender() { 22 | exit_ = true; 23 | monitor_->join(); 24 | delete monitor_; 25 | } 26 | 27 | // add an incoming message, return true if msg has been added before or is an ACK 28 | // message 29 | bool AddIncomming(const Message &msg) { return false; /* TODO: not implemented yet */ } 30 | 31 | private: 32 | void Monitoring() { 33 | // TODO 34 | } 35 | 36 | std::thread *monitor_; 37 | std::unordered_set acked_; 38 | std::atomic exit_{false}; 39 | std::mutex mu_; 40 | int timeout_; 41 | int max_retry_; 42 | Mailbox *mailbox_; 43 | }; 44 | } // namespace xyz 45 | #endif // RESENDER_H_ 46 | -------------------------------------------------------------------------------- /comm/scheduler_mailbox.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "comm/basic_mailbox.hpp" 4 | #include "core/scheduler/control.hpp" 5 | 6 | namespace xyz { 7 | 8 | class SchedulerMailbox : public BasicMailbox { 9 | public: 10 | SchedulerMailbox(Node scheduler_node, int num_workers); 11 | ~SchedulerMailbox(); 12 | virtual void Start() override; 13 | 14 | private: 15 | std::mutex heartbeat_mu_; 16 | std::unordered_map heartbeats_; // heartbeats from workers 17 | // in seconds 18 | const int kHeartbeatTimeout = 3; 19 | const int kHeartbeatCheckInterval = 1; 20 | std::set GetDeadNodes(int timeout = 60); 21 | void CheckHeartbeat(int time_out); 22 | void UpdateHeartbeat(int node_id); 23 | 24 | virtual void HandleBarrierMsg() override; 25 | virtual void HandleRegisterMsg(Message *msg, Node &recovery_node) override; 26 | virtual void Receiving() override; 27 | const std::vector GetNodeIDs(); 28 | void UpdateID(Message *msg, std::set deadnodes_set, 29 | Node &recovery_node); 30 | 31 | int num_workers_; 32 | }; 33 | } // namespace xyz 34 | -------------------------------------------------------------------------------- /comm/scheduler_mailbox_test.cpp: -------------------------------------------------------------------------------- 1 | #include "glog/logging.h" 2 | #include "gtest/gtest.h" 3 | 4 | #include "comm/scheduler_mailbox.hpp" 5 | #include "comm/worker_mailbox.hpp" 6 | 7 | namespace xyz { 8 | namespace { 9 | 10 | class TestSchedulerMailbox : public testing::Test {}; 11 | 12 | TEST_F(TestSchedulerMailbox, Construct) { 13 | Node node{0, "localhost", 32145, false}; 14 | SchedulerMailbox mailbox(node, 5); 15 | } 16 | 17 | TEST_F(TestSchedulerMailbox, BindAndConnect) { 18 | Node node{0, "localhost", 32145, false}; 19 | SchedulerMailbox mailbox(node, 5); 20 | mailbox.BindAndConnect(); 21 | mailbox.CloseSockets(); 22 | } 23 | 24 | TEST_F(TestSchedulerMailbox, SendAndRecv) { 25 | Node node{0, "localhost", 32145, false}; 26 | SchedulerMailbox mailbox(node, 5); 27 | mailbox.BindAndConnect(); 28 | 29 | Message msg; 30 | msg.meta.sender = Node::kEmpty; 31 | msg.meta.recver = 0; 32 | msg.meta.flag = Flag::kOthers; 33 | third_party::SArray keys{1}; 34 | third_party::SArray vals{0.4}; 35 | msg.AddData(keys); 36 | msg.AddData(vals); 37 | 38 | 
mailbox.Send(msg); 39 | VLOG(2) << "Finished sending"; 40 | Message recv_msg; 41 | mailbox.Recv(&recv_msg); 42 | VLOG(2) << "Finished receiving"; 43 | EXPECT_EQ(recv_msg.meta.sender, msg.meta.sender); 44 | EXPECT_EQ(recv_msg.meta.recver, msg.meta.recver); 45 | EXPECT_EQ(recv_msg.meta.flag, msg.meta.flag); 46 | EXPECT_EQ(recv_msg.data.size(), 2); 47 | third_party::SArray recv_keys; 48 | recv_keys = recv_msg.data[0]; 49 | third_party::SArray recv_vals; 50 | recv_vals = recv_msg.data[1]; 51 | EXPECT_EQ(recv_keys[0], keys[0]); 52 | EXPECT_EQ(recv_vals[0], vals[0]); 53 | 54 | mailbox.CloseSockets(); 55 | } 56 | 57 | TEST_F(TestSchedulerMailbox, StartStop) { 58 | Node node{0, "localhost", 32145, false}; 59 | std::thread th1([=]() { 60 | // Scheduler 61 | SchedulerMailbox mailbox(node, 1); 62 | mailbox.Start(); 63 | // std::this_thread::sleep_for(std::chrono::seconds(2)); 64 | mailbox.Stop(); 65 | }); 66 | std::thread th2([=]() { 67 | // Worker 68 | WorkerMailbox mailbox(node); 69 | mailbox.Start(); 70 | mailbox.Stop(); 71 | }); 72 | th2.join(); 73 | th1.join(); 74 | } 75 | 76 | } // namespace 77 | } // namespace xyz 78 | -------------------------------------------------------------------------------- /comm/sender.cpp: -------------------------------------------------------------------------------- 1 | #include "comm/sender.hpp" 2 | 3 | namespace xyz { 4 | 5 | void Sender::Send(Message msg) { GetWorkQueue()->Push(std::move(msg)); } 6 | 7 | void Sender::Process(Message msg) { mailbox_->Send(msg); } 8 | 9 | } // namespace xyz 10 | -------------------------------------------------------------------------------- /comm/sender.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/actor.hpp" 4 | #include "comm/abstract_mailbox.hpp" 5 | #include "comm/abstract_sender.hpp" 6 | 7 | namespace xyz { 8 | 9 | class Sender : public AbstractSender, public Actor { 10 | public: 11 | Sender(int qid, AbstractMailbox *mailbox) : Actor(qid), mailbox_(mailbox) { 12 | Start(); 13 | } 14 | ~Sender() { Stop(); } 15 | 16 | virtual void Send(Message msg) override; 17 | virtual void Process(Message msg) override; 18 | 19 | private: 20 | AbstractMailbox *mailbox_; 21 | }; 22 | 23 | } // namespace xyz 24 | -------------------------------------------------------------------------------- /comm/simple_sender.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/threadsafe_queue.hpp" 4 | #include "comm/abstract_sender.hpp" 5 | 6 | namespace xyz { 7 | 8 | class SimpleSender : public AbstractSender { 9 | public: 10 | virtual void Send(Message msg) override { msgs.Push(std::move(msg)); } 11 | Message Get() { 12 | Message msg; 13 | msgs.WaitAndPop(&msg); 14 | return msg; 15 | } 16 | ThreadsafeQueue<Message> msgs; 17 | }; 18 | 19 | } // namespace xyz 20 | -------------------------------------------------------------------------------- /comm/worker_mailbox.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "comm/basic_mailbox.hpp" 4 | 5 | namespace xyz { 6 | 7 | class WorkerMailbox : public BasicMailbox { 8 | public: 9 | WorkerMailbox(Node scheduler_node); 10 | ~WorkerMailbox(); 11 | 12 | virtual void Start() override; 13 | // Just for test 14 | virtual void StopHeartbeat(); 15 | 16 | private: 17 | const int kHeartbeatReportInterval = 1; 18 | virtual void Heartbeat(); 19 | virtual void HandleBarrierMsg() override; 20 | virtual void HandleRegisterMsg(Message 
*msg, Node &recovery_node) override; 21 | void UpdateID(Message *msg, Node &recovery_node); 22 | virtual void Receiving() override; 23 | }; 24 | } // namespace xyz 25 | -------------------------------------------------------------------------------- /comm/worker_mailbox_test.cpp: -------------------------------------------------------------------------------- 1 | #include "glog/logging.h" 2 | #include "gtest/gtest.h" 3 | 4 | #include "comm/scheduler_mailbox.hpp" 5 | #include "comm/worker_mailbox.hpp" 6 | 7 | namespace xyz { 8 | namespace { 9 | 10 | class TestWorkerMailbox : public testing::Test {}; 11 | 12 | TEST_F(TestWorkerMailbox, Construct) { 13 | Node node{0, "localhost", 32145, false}; 14 | WorkerMailbox mailbox(node); 15 | } 16 | 17 | TEST_F(TestWorkerMailbox, BindAndConnect) { 18 | Node node{0, "localhost", 32145, false}; 19 | WorkerMailbox mailbox(node); 20 | mailbox.BindAndConnect(); 21 | mailbox.CloseSockets(); 22 | } 23 | 24 | } // namespace 25 | } // namespace xyz 26 | -------------------------------------------------------------------------------- /core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | file(GLOB core-src-files 4 | partition/partition_manager.cpp 5 | cache/fetcher.cpp 6 | partition/partition_tracker.cpp 7 | executor/thread_pool.cpp 8 | executor/executor.cpp 9 | plan/function_store.cpp 10 | map_output/map_output_storage.cpp 11 | scheduler/control.cpp 12 | scheduler/scheduler_elem.cpp 13 | scheduler/worker.cpp 14 | scheduler/dag_runner.cpp 15 | scheduler/scheduler.cpp 16 | scheduler/block_manager.cpp 17 | scheduler/control_manager.cpp 18 | scheduler/write_manager.cpp 19 | scheduler/distribute_manager.cpp 20 | scheduler/collection_manager.cpp 21 | scheduler/checkpoint_manager.cpp 22 | scheduler/recover_manager.cpp 23 | scheduler/checkpoint_loader.cpp 24 | scheduler/collection_status.cpp 25 | plan/context.cpp 26 | plan/spec_wrapper.cpp 27 | plan/dag.cpp 28 | worker/controller.cpp 29 | worker/plan_controller.cpp 30 | worker/delayed_combiner.cpp 31 | ) 32 | 33 | # TODO now we let engine and worker depend on HDFS 34 | # so that they won't be built for unit test. 
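# Illustrative note (based on cmake/dep.cmake above; not part of the original file):
# HDFS support can also be switched off explicitly even when libhdfs3 is installed,
# which keeps engine.cpp and plan/runner.cpp out of the build:
#   cmake -DWITHOUT_HDFS=ON ..   # dep.cmake then unsets LIBHDFS3_FOUND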
35 | if(LIBHDFS3_FOUND) 36 | file(GLOB core-src-hdfs-files 37 | engine.cpp 38 | plan/runner.cpp 39 | ) 40 | list(APPEND core-src-files ${core-src-hdfs-files}) 41 | endif(LIBHDFS3_FOUND) 42 | 43 | add_library(core-objs OBJECT ${core-src-files}) 44 | set_property(TARGET core-objs PROPERTY CXX_STANDARD 14) 45 | add_dependencies(core-objs ${external_project_dependencies}) 46 | -------------------------------------------------------------------------------- /core/abstract_collection_map.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace xyz { 4 | 5 | struct AbstractCollectionMap { 6 | virtual ~AbstractCollectionMap() {} 7 | virtual int Lookup(int collection_id, int part_id) = 0; 8 | }; 9 | 10 | } // namespace xyz 11 | 12 | -------------------------------------------------------------------------------- /core/cache/abstract_fetcher.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "core/partition/abstract_partition.hpp" 8 | #include "base/sarray_binstream.hpp" 9 | #include "core/scheduler/control.hpp" 10 | 11 | namespace xyz { 12 | 13 | struct AbstractFetcher { 14 | virtual ~AbstractFetcher() = default; 15 | virtual void FetchObjs(int plan_id, int upstream_part_id, int collection_id, 16 | const std::map& part_to_keys, 17 | std::vector* const rets) = 0; 18 | virtual std::shared_ptr FetchPart(FetchMeta meta) = 0; 19 | // call FinishPart after accessing the part 20 | virtual void FinishPart(FetchMeta meta) = 0; 21 | }; 22 | 23 | } // namespace xyz 24 | 25 | -------------------------------------------------------------------------------- /core/cache/bin_to_part_mappers.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "base/sarray_binstream.hpp" 7 | #include "core/partition/abstract_partition.hpp" 8 | 9 | namespace xyz { 10 | 11 | class BinToPartMappers { 12 | public: 13 | using BinToPartFuncT = std::function(SArrayBinStream bin)>; 14 | void Add(int id, BinToPartFuncT func) { 15 | CHECK(mappers_.find(id) == mappers_.end()); 16 | mappers_.insert({id, func}); 17 | } 18 | std::shared_ptr Call(int id, SArrayBinStream bin) { 19 | CHECK(mappers_.find(id) != mappers_.end()); 20 | return mappers_[id](bin); 21 | } 22 | private: 23 | std::map mappers_; 24 | }; 25 | 26 | } // namespace xyz 27 | 28 | -------------------------------------------------------------------------------- /core/cache/fetcher_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/cache/fetcher.hpp" 5 | 6 | #include "comm/simple_sender.hpp" 7 | #include "core/index/key_to_part_mappers.hpp" 8 | 9 | namespace xyz { 10 | namespace { 11 | 12 | class TestFetcher : public testing::Test {}; 13 | 14 | TEST_F(TestFetcher, Construct) { 15 | const int qid = 0; 16 | auto partition_manager = std::make_shared(); 17 | auto function_store = std::make_shared(); 18 | auto collection_map = std::make_shared(); 19 | auto sender = std::make_shared(); 20 | Fetcher fetcher(qid, function_store, partition_manager, collection_map, sender); 21 | } 22 | 23 | } // namespace 24 | } // namespace xyz 25 | 26 | -------------------------------------------------------------------------------- /core/collection_map.hpp: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "core/scheduler/collection_view.hpp" 6 | #include "core/abstract_collection_map.hpp" 7 | 8 | #include "glog/logging.h" 9 | 10 | namespace xyz { 11 | 12 | class CollectionMap : public AbstractCollectionMap { 13 | public: 14 | void Init(std::unordered_map collection_map){ 15 | std::lock_guard lk(mu_); 16 | collection_map_ = collection_map; 17 | } 18 | void Insert(CollectionView cv) { 19 | std::lock_guard lk(mu_); 20 | collection_map_[cv.collection_id] = cv; 21 | } 22 | CollectionView& Get(int cid) { 23 | std::lock_guard lk(mu_); 24 | CHECK(collection_map_.find(cid) != collection_map_.end()); 25 | return collection_map_[cid]; 26 | } 27 | int GetNumParts(int cid) { 28 | std::lock_guard lk(mu_); 29 | CHECK(collection_map_.find(cid) != collection_map_.end()); 30 | return collection_map_[cid].num_partition; 31 | } 32 | virtual int Lookup(int collection_id, int part_id) override { 33 | std::lock_guard lk(mu_); 34 | CHECK(collection_map_.find(collection_id) != collection_map_.end()); 35 | auto c = collection_map_[collection_id]; 36 | int ret = c.mapper.Get(part_id); 37 | return ret; 38 | } 39 | friend SArrayBinStream& operator<<(xyz::SArrayBinStream& stream, const CollectionMap& m) { 40 | stream << m.collection_map_; 41 | return stream; 42 | } 43 | friend SArrayBinStream& operator>>(xyz::SArrayBinStream& stream, CollectionMap& m) { 44 | stream >> m.collection_map_; 45 | return stream; 46 | } 47 | std::string DebugString() const { 48 | std::stringstream ss; 49 | for (auto& kv: collection_map_) { 50 | ss << kv.second.DebugString() << "\n"; 51 | } 52 | return ss.str(); 53 | } 54 | private: 55 | std::unordered_map collection_map_; 56 | std::mutex mu_; 57 | }; 58 | 59 | } // namespace xyz 60 | 61 | -------------------------------------------------------------------------------- /core/engine.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "core/executor/executor.hpp" 9 | #include "core/partition/partition_manager.hpp" 10 | #include "core/map_output/abstract_map_output.hpp" 11 | #include "core/plan/function_store.hpp" 12 | #include "core/intermediate/intermediate_store.hpp" 13 | #include "core/plan/plan_spec.hpp" 14 | 15 | #include "core/scheduler/worker.hpp" 16 | #include "core/engine_elem.hpp" 17 | #include "core/cache/fetcher.hpp" 18 | #include "comm/worker_mailbox.hpp" 19 | #include "comm/sender.hpp" 20 | 21 | #include "core/worker/controller.hpp" 22 | 23 | namespace xyz { 24 | 25 | class Engine { 26 | public: 27 | struct Config { 28 | std::string scheduler; 29 | int scheduler_port; 30 | int num_local_threads; 31 | int num_update_threads; 32 | int num_combine_threads; 33 | std::string namenode; 34 | int port; 35 | std::string DebugString() const { 36 | std::stringstream ss; 37 | ss << " { "; 38 | ss << ", scheduler: " << scheduler; 39 | ss << ", scheduler_port: " << scheduler_port; 40 | ss << ", num_local_threads: " << num_local_threads; 41 | ss << ", num_update_threads: " << num_update_threads; 42 | ss << ", num_combine_threads: " << num_combine_threads; 43 | ss << ", namenode: " << namenode; 44 | ss << ", port: " << port; 45 | ss << " } "; 46 | return ss.str(); 47 | } 48 | }; 49 | 50 | Engine() = default; 51 | ~Engine() = default; 52 | 53 | void RegisterProgram(ProgramContext program) { 54 | program_ = program; 55 | } 56 | void 
Init(Engine::Config config); 57 | void Start(); 58 | void Run(); 59 | void Stop(); 60 | 61 | template <typename Plan> 62 | void AddFunc(Plan plan) { 63 | plan.Register(engine_elem_.function_store); 64 | } 65 | 66 | template <typename Plan> 67 | void AddFunc(Plan* plan) { 68 | plan->Register(engine_elem_.function_store); 69 | } 70 | 71 | private: 72 | ProgramContext program_; 73 | EngineElem engine_elem_; 74 | Config config_; 75 | 76 | std::shared_ptr<WorkerMailbox> mailbox_; 77 | std::shared_ptr<Worker> worker_; 78 | std::shared_ptr<Fetcher> fetcher_; 79 | std::shared_ptr<Controller> controller_; 80 | }; 81 | 82 | } // namespace xyz 83 | 84 | -------------------------------------------------------------------------------- /core/engine_elem.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <memory> 4 | 5 | #include "base/node.hpp" 6 | #include "core/executor/executor.hpp" 7 | #include "core/partition/partition_manager.hpp" 8 | #include "core/intermediate/simple_intermediate_store.hpp" 9 | #include "core/plan/function_store.hpp" 10 | #include "comm/abstract_sender.hpp" 11 | #include "core/collection_map.hpp" 12 | #include "core/cache/fetcher.hpp" 13 | 14 | namespace xyz { 15 | 16 | struct EngineElem { 17 | Node node; 18 | std::shared_ptr<Executor> executor; 19 | std::shared_ptr<PartitionManager> partition_manager; 20 | std::shared_ptr<FunctionStore> function_store; 21 | std::shared_ptr<SimpleIntermediateStore> intermediate_store; 22 | std::shared_ptr<AbstractSender> sender; 23 | std::shared_ptr<CollectionMap> collection_map; 24 | std::shared_ptr<Fetcher> fetcher; 25 | 26 | std::string namenode; 27 | int port; 28 | 29 | int num_local_threads; 30 | int num_update_threads; 31 | int num_combine_threads; 32 | }; 33 | 34 | } // namespace xyz 35 | 36 | -------------------------------------------------------------------------------- /core/executor/abstract_executor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | #include <future> 5 | 6 | namespace xyz { 7 | 8 | class AbstractExecutor { 9 | public: 10 | virtual std::future<void> Add(const std::function<void()>& func) = 0; 11 | virtual ~AbstractExecutor() {} 12 | }; 13 | 14 | } // namespace xyz 15 | 16 | -------------------------------------------------------------------------------- /core/executor/executor.cpp: -------------------------------------------------------------------------------- 1 | #include "core/executor/executor.hpp" 2 | 3 | namespace xyz { 4 | 5 | std::future<void> Executor::Add(const std::function<void()>& func) { 6 | { 7 | std::lock_guard<std::mutex> lk(mu_); 8 | num_added_ += 1; 9 | } 10 | return thread_pool_.enqueue([this, func]() { 11 | func(); 12 | std::lock_guard<std::mutex> lk(mu_); 13 | num_finished_ += 1; 14 | }); 15 | } 16 | 17 | } // namespace xyz 18 | 19 | -------------------------------------------------------------------------------- /core/executor/executor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <mutex> 4 | 5 | #include "core/executor/abstract_executor.hpp" 6 | #include "core/executor/thread_pool.hpp" 7 | 8 | namespace xyz { 9 | 10 | /* 11 | * A wrapper around ThreadPool. 12 | * Only accepts void -> void functions. 13 | * 14 | * TODO: Not sure whether the GetNumPendingTask() and HasFreeThreads() functions are accurate. 15 | * Another way is to let the Engine decide whether the Executor can accept more tasks. 
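 *
 * Illustrative usage, mirroring executor_test.cpp (example not in the original comment):
 *   Executor executor(4);                       // pool with 4 worker threads
 *   std::future<void> fut = executor.Add([]{}); // enqueue a void() task
 *   fut.get();                                  // block until the task has run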
16 | */ 17 | class Executor: public AbstractExecutor { 18 | public: 19 | Executor(size_t threads): thread_pool_(threads), num_threads_(threads) {} 20 | virtual std::future Add(const std::function& func) override; 21 | // Return the number of tasks that are either running or waiting in the queue. 22 | int GetNumPendingTask() { 23 | std::lock_guard lk(mu_); 24 | return num_added_ - num_finished_; 25 | } 26 | bool HasFreeThreads() { 27 | return GetNumPendingTask() < num_threads_; 28 | } 29 | int GetNumAdded() { 30 | std::lock_guard lk(mu_); 31 | return num_added_; 32 | } 33 | int GetNumFinished() { 34 | std::lock_guard lk(mu_); 35 | return num_finished_; 36 | } 37 | private: 38 | ThreadPool thread_pool_; 39 | int num_threads_; 40 | int num_added_ = 0; 41 | int num_finished_ = 0; 42 | std::mutex mu_; 43 | }; 44 | 45 | } // namespace xyz 46 | 47 | -------------------------------------------------------------------------------- /core/executor/executor_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/executor/executor.hpp" 5 | 6 | #include 7 | 8 | namespace xyz { 9 | namespace { 10 | 11 | class TestExecutor : public testing::Test {}; 12 | 13 | TEST_F(TestExecutor, Construct) { 14 | Executor executor(4); 15 | } 16 | 17 | TEST_F(TestExecutor, Add) { 18 | Executor executor(4); 19 | std::atomic a(0); 20 | int size = 10; 21 | std::vector> futures; 22 | for (int i = 0; i < size; ++ i) { 23 | futures.push_back(executor.Add([&a](){ a.fetch_add(1); })); 24 | } 25 | for (auto& f : futures) { 26 | f.get(); 27 | } 28 | EXPECT_EQ(a, size); 29 | EXPECT_EQ(executor.GetNumPendingTask(), 0); 30 | EXPECT_EQ(executor.HasFreeThreads(), true); 31 | EXPECT_EQ(executor.GetNumAdded(), size); 32 | EXPECT_EQ(executor.GetNumFinished(), size); 33 | } 34 | 35 | } 36 | } // namespace xyz 37 | 38 | -------------------------------------------------------------------------------- /core/executor/thread_pool.cpp: -------------------------------------------------------------------------------- 1 | #include "core/executor/thread_pool.hpp" 2 | 3 | namespace xyz { 4 | 5 | ThreadPool::ThreadPool(size_t threads) : stop_(false) { 6 | for (size_t i = 0; i < threads; ++i) { 7 | workers_.emplace_back([this] { 8 | for (;;) { 9 | std::function task; 10 | { 11 | std::unique_lock lock(this->queue_mutex_); 12 | this->cond_.wait( 13 | lock, [this] { return this->stop_ || !this->tasks_.empty(); }); 14 | if (this->stop_ && this->tasks_.empty()) 15 | return; 16 | task = std::move(this->tasks_.front()); 17 | this->tasks_.pop(); 18 | } 19 | 20 | task(); 21 | } 22 | }); 23 | } 24 | } 25 | 26 | // the destructor joins all threads 27 | ThreadPool::~ThreadPool() { 28 | { 29 | std::unique_lock lock(queue_mutex_); 30 | stop_ = true; 31 | } 32 | cond_.notify_all(); 33 | for (std::thread &worker : workers_) 34 | worker.join(); 35 | } 36 | 37 | } // namespace xyz 38 | -------------------------------------------------------------------------------- /core/executor/thread_pool.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | namespace xyz { 15 | 16 | /* 17 | * Copy from: https://github.com/progschj/ThreadPool/blob/master/ThreadPool.h. 18 | */ 19 | class ThreadPool { 20 | public: 21 | ThreadPool(size_t); 22 | template 23 | auto enqueue(F&& f, Args&&... 
args) 24 | -> std::future::type>; 25 | ~ThreadPool(); 26 | size_t size() { 27 | std::lock_guard lock(queue_mutex_); 28 | return tasks_.size(); 29 | } 30 | private: 31 | std::vector workers_; 32 | std::queue> tasks_; 33 | 34 | std::mutex queue_mutex_; 35 | std::condition_variable cond_; 36 | bool stop_; 37 | }; 38 | 39 | // add new work item to the pool 40 | template 41 | auto ThreadPool::enqueue(F &&f, Args &&... args) 42 | -> std::future::type> { 43 | using return_type = typename std::result_of::type; 44 | 45 | auto task = std::make_shared>( 46 | std::bind(std::forward(f), std::forward(args)...)); 47 | 48 | std::future res = task->get_future(); 49 | { 50 | std::unique_lock lock(queue_mutex_); 51 | 52 | // don't allow enqueueing after stopping the pool 53 | assert(!stop_); 54 | 55 | tasks_.emplace([task]() { (*task)(); }); 56 | } 57 | cond_.notify_one(); 58 | return res; 59 | } 60 | 61 | 62 | } // namespace xyz 63 | 64 | -------------------------------------------------------------------------------- /core/executor/thread_pool_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/executor/thread_pool.hpp" 5 | 6 | #include 7 | 8 | namespace xyz { 9 | namespace { 10 | 11 | class TestThreadPool : public testing::Test {}; 12 | 13 | TEST_F(TestThreadPool, Construct) { 14 | ThreadPool pool(4); 15 | } 16 | 17 | TEST_F(TestThreadPool, EnqueueOne) { 18 | ThreadPool pool(4); 19 | pool.enqueue([]{ 20 | return 10; 21 | }); 22 | } 23 | 24 | TEST_F(TestThreadPool, EnqueueMultiple) { 25 | ThreadPool pool(4); 26 | for (int i = 0; i < 10; ++ i) { 27 | pool.enqueue([i]{ 28 | VLOG(1) << "hello " << i; 29 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 30 | VLOG(1) << "world " << i; 31 | }); 32 | } 33 | } 34 | 35 | TEST_F(TestThreadPool, EnqueueMultipleReturn) { 36 | ThreadPool pool(4); 37 | std::vector> results; 38 | for (int i = 0; i < 10; ++ i) { 39 | results.emplace_back( 40 | pool.enqueue([i]{ 41 | VLOG(1) << "hello " << i; 42 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 43 | VLOG(1) << "world " << i; 44 | return i*i; 45 | }) 46 | ); 47 | } 48 | std::vector res(results.size()); 49 | std::transform(results.begin(), results.end(), res.begin(), [](std::future& f){ return f.get(); }); 50 | std::sort(res.begin(), res.end()); 51 | for (int i = 0; i < res.size(); ++ i) { 52 | VLOG(1) << res[i]; 53 | EXPECT_EQ(res[i], i*i); 54 | } 55 | } 56 | 57 | } 58 | } // namespace xyz 59 | 60 | -------------------------------------------------------------------------------- /core/index/abstract_key_to_part_mapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace xyz { 6 | 7 | class AbstractKeyToPartMapper { 8 | public: 9 | AbstractKeyToPartMapper(size_t num_partition): num_partition_(num_partition) {} 10 | ~AbstractKeyToPartMapper() {} 11 | size_t GetNumPart() const { return num_partition_; } 12 | private: 13 | size_t num_partition_; 14 | }; 15 | 16 | template 17 | class TypedKeyToPartMapper : public AbstractKeyToPartMapper { 18 | public: 19 | TypedKeyToPartMapper(size_t num_partition): AbstractKeyToPartMapper(num_partition) {} 20 | virtual size_t Get(const KeyT& key) const = 0; 21 | }; 22 | 23 | } // namespace xyz 24 | 25 | -------------------------------------------------------------------------------- /core/index/abstract_part_to_node_mapper.hpp: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/sarray_binstream.hpp" 4 | 5 | namespace xyz { 6 | 7 | class AbstractPartToNodeMapper { 8 | public: 9 | AbstractPartToNodeMapper() = default; 10 | ~AbstractPartToNodeMapper() = default; 11 | 12 | virtual int Get(int part_id) const = 0; 13 | virtual void FromBin(SArrayBinStream& bin) = 0; 14 | virtual void ToBin(SArrayBinStream& bin) const = 0; 15 | }; 16 | 17 | } // namespace xyz 18 | 19 | -------------------------------------------------------------------------------- /core/index/hash_key_to_part_mapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/index/abstract_key_to_part_mapper.hpp" 4 | 5 | #include 6 | 7 | namespace xyz { 8 | 9 | template 10 | class HashKeyToPartMapper : public TypedKeyToPartMapper { 11 | public: 12 | HashKeyToPartMapper(size_t num_partition):TypedKeyToPartMapper(num_partition) {} 13 | 14 | virtual size_t Get(const KeyT& key) const override { 15 | return std::hash()(key) % this->GetNumPart(); 16 | } 17 | }; 18 | 19 | 20 | template 21 | class RoundRobinKeyToPartMapper: public TypedKeyToPartMapper { 22 | public: 23 | RoundRobinKeyToPartMapper(size_t num_partition):TypedKeyToPartMapper(num_partition) {} 24 | 25 | virtual size_t Get(const KeyT& key) const override { 26 | return key % this->GetNumPart(); 27 | } 28 | }; 29 | 30 | } // namespace xyz 31 | -------------------------------------------------------------------------------- /core/index/hash_key_to_part_mapper_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/index/hash_key_to_part_mapper.hpp" 5 | 6 | namespace xyz { 7 | namespace { 8 | 9 | class TestHashKeyToPartMapper : public testing::Test {}; 10 | 11 | TEST_F(TestHashKeyToPartMapper, Construct) { 12 | HashKeyToPartMapper m1(4); 13 | HashKeyToPartMapper m2(4); 14 | } 15 | 16 | TEST_F(TestHashKeyToPartMapper, Get) { 17 | HashKeyToPartMapper m(4); 18 | auto a = m.Get("Hello"); 19 | VLOG(1) << a; 20 | EXPECT_EQ(a, 1); 21 | EXPECT_EQ(m.GetNumPart(), 4); 22 | } 23 | 24 | } // namespace 25 | } // namespace xyz 26 | 27 | -------------------------------------------------------------------------------- /core/index/hash_part_to_node_mapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/index/abstract_part_to_node_mapper.hpp" 4 | 5 | #include 6 | 7 | namespace xyz { 8 | 9 | class HashPartToNodeMapper : public AbstractPartToNodeMapper { 10 | public: 11 | HashPartToNodeMapper(int num_nodes): num_nodes_(num_nodes) {} 12 | virtual int Get(int part_id) const { 13 | return std::hash()(part_id) % num_nodes_; 14 | } 15 | 16 | int GetNumNodes() const { return num_nodes_; } 17 | void SetNumNodes(int num_nodes) { num_nodes_ = num_nodes; } 18 | 19 | virtual void FromBin(SArrayBinStream& bin) override { 20 | CHECK(false) << "Not implemented"; 21 | } 22 | virtual void ToBin(SArrayBinStream& bin) const override { 23 | CHECK(false) << "Not implemented"; 24 | } 25 | private: 26 | int num_nodes_; 27 | }; 28 | 29 | } // namespace xyz 30 | 31 | -------------------------------------------------------------------------------- /core/index/hash_part_to_node_mapper_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include 
"glog/logging.h" 3 | 4 | #include "core/index/hash_part_to_node_mapper.hpp" 5 | 6 | namespace xyz { 7 | namespace { 8 | 9 | class TestHashPartToNodeMapper : public testing::Test {}; 10 | 11 | TEST_F(TestHashPartToNodeMapper, Construct) { 12 | HashPartToNodeMapper mapper(4); 13 | } 14 | 15 | TEST_F(TestHashPartToNodeMapper, Get) { 16 | HashPartToNodeMapper mapper(4); 17 | VLOG(1) << mapper.Get(2); 18 | } 19 | 20 | } // namespace 21 | } // namespace xyz 22 | 23 | -------------------------------------------------------------------------------- /core/index/key_to_part_mappers.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "core/index/abstract_key_to_part_mapper.hpp" 6 | 7 | #include "glog/logging.h" 8 | 9 | namespace xyz { 10 | 11 | class KeyToPartMappers { 12 | public: 13 | KeyToPartMappers() = default; 14 | bool Has(int collection_id) { 15 | return mappers_.find(collection_id) != mappers_.end(); 16 | } 17 | 18 | void Add(int collection_id, std::shared_ptr mapper) { 19 | CHECK(mappers_.find(collection_id) == mappers_.end()); 20 | mappers_.insert({collection_id, mapper}); 21 | } 22 | std::shared_ptr Get(int collection_id) { 23 | CHECK(mappers_.find(collection_id) != mappers_.end()); 24 | return mappers_[collection_id]; 25 | } 26 | private: 27 | std::map> mappers_; 28 | }; 29 | 30 | } // namespace xyz 31 | 32 | -------------------------------------------------------------------------------- /core/index/range_key_to_part_mapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/index/abstract_key_to_part_mapper.hpp" 4 | #include "base/third_party/range.h" 5 | 6 | #include 7 | #include 8 | 9 | #include "glog/logging.h" 10 | 11 | namespace xyz { 12 | 13 | template 14 | class RangeKeyToPartMapper : public TypedKeyToPartMapper { 15 | public: 16 | RangeKeyToPartMapper(const std::vector& ranges) 17 | : TypedKeyToPartMapper(ranges.size()), ranges_(ranges) { 18 | CHECK_GT(ranges_.size(), 0); 19 | interval_ = ranges_[0].size(); 20 | for (int i = 0; i < ranges_.size() - 1; ++ i) { 21 | CHECK_EQ(ranges_[i].size(), interval_) << "ranges should have equal size"; 22 | } 23 | } 24 | 25 | virtual size_t Get(const KeyT& key) const override { 26 | size_t ret = key/interval_; 27 | CHECK_LT(ret, ranges_[ranges_.size()-1].end()); 28 | return ret; 29 | } 30 | 31 | int GetNumRanges() const { return ranges_.size(); } 32 | // void ResetRanges(const std::vector& ranges) { ranges_ = ranges; } 33 | third_party::Range GetRange(int part_id) { 34 | CHECK_LT(part_id, ranges_.size()); 35 | return ranges_[part_id]; 36 | } 37 | 38 | private: 39 | int num_nodes_; 40 | std::vector ranges_; 41 | int interval_; 42 | }; 43 | 44 | } // namespace xyz 45 | 46 | -------------------------------------------------------------------------------- /core/intermediate/abstract_intermediate_store.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/message.hpp" 4 | 5 | namespace xyz { 6 | 7 | class AbstractIntermediateStore { 8 | public: 9 | virtual ~AbstractIntermediateStore() {} 10 | virtual void Add(Message msg) = 0; 11 | }; 12 | 13 | } // namespace xyz 14 | -------------------------------------------------------------------------------- /core/intermediate/intermediate_store.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/message.hpp" 
4 | #include "core/intermediate/abstract_intermediate_store.hpp" 5 | #include "comm/abstract_sender.hpp" 6 | 7 | namespace xyz { 8 | 9 | class IntermediateStore : public AbstractIntermediateStore { 10 | public: 11 | IntermediateStore(std::shared_ptr sender) 12 | :sender_(sender) {} 13 | 14 | virtual void Add(Message msg) override { 15 | sender_->Send(std::move(msg)); 16 | } 17 | private: 18 | std::shared_ptr sender_; 19 | }; 20 | 21 | } // namespace xyz 22 | 23 | -------------------------------------------------------------------------------- /core/intermediate/simple_intermediate_store.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "base/message.hpp" 4 | #include "core/intermediate/abstract_intermediate_store.hpp" 5 | 6 | namespace xyz { 7 | 8 | class SimpleIntermediateStore : public AbstractIntermediateStore { 9 | public: 10 | virtual void Add(Message msg) override { 11 | msgs.push_back(std::move(msg)); 12 | } 13 | 14 | std::vector Get() const { 15 | return msgs; 16 | } 17 | private: 18 | std::vector msgs; 19 | }; 20 | 21 | } // namespace xyz 22 | -------------------------------------------------------------------------------- /core/map_output/abstract_map_output.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "base/sarray_binstream.hpp" 7 | #include "core/map_output/map_output_stream.hpp" 8 | 9 | namespace xyz { 10 | 11 | class AbstractMapOutput { 12 | public: 13 | virtual ~AbstractMapOutput() {} 14 | 15 | virtual std::vector Serialize() = 0; 16 | virtual void Combine() = 0; 17 | 18 | virtual int GetBufferSize() = 0; 19 | virtual std::shared_ptr Get(int i) = 0; 20 | }; 21 | 22 | } // namespace xyz 23 | -------------------------------------------------------------------------------- /core/map_output/map_output_storage.cpp: -------------------------------------------------------------------------------- 1 | #include "core/map_output/map_output_storage.hpp" 2 | 3 | namespace xyz { 4 | 5 | void MapOutputManager::Add(int plan_id, std::shared_ptr map_output) { 6 | map_outputs_[plan_id].push_back(std::move(map_output)); 7 | } 8 | 9 | const std::vector>& MapOutputManager::Get(int plan_id) { 10 | return map_outputs_[plan_id]; 11 | } 12 | 13 | } // namespace 14 | 15 | -------------------------------------------------------------------------------- /core/map_output/map_output_storage.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "core/map_output/abstract_map_output.hpp" 6 | 7 | namespace xyz { 8 | 9 | class MapOutputManager { 10 | public: 11 | MapOutputManager() = default; 12 | void Add(int plan_id, std::shared_ptr map_output); 13 | const std::vector>& Get(int plan_id); 14 | private: 15 | std::unordered_map>> map_outputs_; 16 | }; 17 | 18 | } // namespace 19 | 20 | -------------------------------------------------------------------------------- /core/map_output/map_output_stream_store.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/map_output/map_output_stream.hpp" 4 | 5 | #include "glog/logging.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace xyz { 13 | 14 | class MapOutputStreamStore { 15 | public: 16 | MapOutputStreamStore() = default; 17 | ~MapOutputStreamStore() = default; 18 | 19 | void Insert(std::tuple,int> k, 
std::shared_ptr<AbstractMapOutputStream> v) { 20 | std::lock_guard<std::mutex> lk(mu_); 21 | CHECK(stream_store_.find(k) == stream_store_.end()); 22 | stream_store_[k] = std::move(v); 23 | } 24 | 25 | void Remove(std::tuple<int, std::vector<int>, int> k) { 26 | std::lock_guard<std::mutex> lk(mu_); 27 | CHECK(stream_store_.find(k) != stream_store_.end()); 28 | stream_store_.erase(k); 29 | } 30 | 31 | std::shared_ptr<AbstractMapOutputStream> Get(std::tuple<int, std::vector<int>, int> k) { 32 | std::lock_guard<std::mutex> lk(mu_); 33 | CHECK(stream_store_.find(k) != stream_store_.end()); 34 | return stream_store_[k]; 35 | } 36 | 37 | private: 38 | // part_id, {upstream_part_ids}, version 39 | std::map<std::tuple<int, std::vector<int>, int>, std::shared_ptr<AbstractMapOutputStream>> stream_store_; 40 | std::mutex mu_; 41 | }; 42 | 43 | } // namespace 44 | 45 | -------------------------------------------------------------------------------- /core/partition/abstract_fetcher.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace xyz { 4 | 5 | class AbstractFetcher { 6 | public: 7 | virtual ~AbstractFetcher() = default; 8 | 9 | virtual void FetchRemote(int collection_id, int partition_id, int version) = 0; 10 | }; 11 | 12 | } // namespace xyz 13 | 14 | -------------------------------------------------------------------------------- /core/partition/abstract_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cstddef> 4 | 5 | #include "base/sarray_binstream.hpp" 6 | 7 | namespace xyz { 8 | 9 | class AbstractPartition { 10 | public: 11 | virtual ~AbstractPartition() {} 12 | virtual void FromBin(SArrayBinStream& bin) = 0; 13 | virtual void ToBin(SArrayBinStream& bin) = 0; 14 | 15 | virtual size_t GetSize() const = 0; 16 | int id; 17 | }; 18 | 19 | template <typename ObjT> 20 | class Indexable { 21 | public: 22 | virtual ~Indexable() = default; 23 | virtual ObjT Get(typename ObjT::KeyT) { 24 | CHECK(false); 25 | } 26 | /* 27 | * The returned pointer becomes invalid when the storage size changes. 28 | * You should assume the pointer is invalid once the partition changes. 29 | */ 30 | virtual ObjT* FindOrCreate(typename ObjT::KeyT) { 31 | CHECK(false); 32 | } 33 | 34 | virtual void Sort() { 35 | CHECK(false); 36 | } 37 | }; 38 | 39 | template <typename ObjT> 40 | class TypedPartition : public AbstractPartition { 41 | public: 42 | // Add obj into partition 43 | void Add(ObjT obj) { 44 | TypedAdd(std::move(obj)); 45 | } 46 | virtual void TypedAdd(ObjT obj) = 0; 47 | 48 | /* 49 | * Subclasses need to implement Iterator and the CreateIterator() function 50 | * to support range-based for loops.
51 | */ 52 | struct Iterator { 53 | virtual ObjT& Deref() = 0; 54 | virtual ObjT* Ref() = 0; 55 | virtual void SubAdvance() = 0; 56 | virtual bool SubUnequal(const std::unique_ptr<Iterator>& other) = 0; 57 | virtual ~Iterator() {} 58 | }; 59 | struct IterWrapper { 60 | std::unique_ptr<Iterator> iter; 61 | ObjT& operator*() { 62 | return iter->Deref(); 63 | } 64 | ObjT* operator->() { 65 | return iter->Ref(); 66 | } 67 | IterWrapper& operator++() { 68 | iter->SubAdvance(); 69 | return *this; 70 | } 71 | bool operator!=(const IterWrapper& iw) const { 72 | return iter->SubUnequal(iw.iter); 73 | } 74 | bool operator==(const IterWrapper& iw) const { 75 | return !iter->SubUnequal(iw.iter); 76 | } 77 | }; 78 | IterWrapper begin() { 79 | return CreateIterator(true); 80 | } 81 | IterWrapper end() { 82 | return CreateIterator(false); 83 | } 84 | 85 | virtual IterWrapper CreateIterator(bool) { 86 | CHECK(false); 87 | } 88 | 89 | }; 90 | 91 | } // namespace 92 | -------------------------------------------------------------------------------- /core/partition/block_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | 5 | #include "core/partition/abstract_partition.hpp" 6 | #include "io/meta.hpp" 7 | #include "io/abstract_block_reader.hpp" 8 | 9 | namespace xyz { 10 | 11 | class BlockPartition : public TypedPartition<std::string> { 12 | public: 13 | BlockPartition(AssignedBlock block, 14 | std::function<std::shared_ptr<AbstractBlockReader>()> block_reader_getter) 15 | : block_(block), block_reader_getter_(block_reader_getter) { 16 | } 17 | virtual void TypedAdd(std::string s) override { 18 | CHECK(false) << "not implemented"; 19 | } 20 | 21 | virtual size_t GetSize() const override { 22 | CHECK(false) << "not implemented"; 23 | } 24 | 25 | virtual void FromBin(SArrayBinStream& bin) override { 26 | CHECK(false) << "not implemented"; 27 | } 28 | virtual void ToBin(SArrayBinStream& bin) override { 29 | CHECK(false) << "not implemented"; 30 | // TODO: serialize the reader? or get the function from other place 31 | } 32 | virtual typename TypedPartition<std::string>::IterWrapper CreateIterator(bool is_begin) override { 33 | CHECK(false) << "not implemented"; 34 | } 35 | 36 | std::shared_ptr<AbstractBlockReader> GetReader() { 37 | CHECK(block_reader_getter_); 38 | auto block_reader = block_reader_getter_(); 39 | block_reader->Init(block_.url, block_.offset); 40 | return block_reader; 41 | } 42 | private: 43 | AssignedBlock block_; 44 | std::function<std::shared_ptr<AbstractBlockReader>()> block_reader_getter_; 45 | }; 46 | 47 | } // namespace xyz 48 | 49 | -------------------------------------------------------------------------------- /core/partition/file_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | 5 | #include "core/partition/abstract_partition.hpp" 6 | #include "io/meta.hpp" 7 | #include "io/abstract_reader.hpp" 8 | 9 | namespace xyz { 10 | 11 | class FilePartition : public TypedPartition<std::string> { 12 | public: 13 | FilePartition(AssignedBlock block, 14 | std::function<std::shared_ptr<AbstractReader>()> reader_getter) 15 | : block_(block), reader_getter_(reader_getter) { 16 | } 17 | virtual void TypedAdd(std::string s) override { 18 | CHECK(false) << "not implemented"; 19 | } 20 | 21 | virtual size_t GetSize() const override { 22 | CHECK(false) << "not implemented"; 23 | } 24 | 25 | virtual void FromBin(SArrayBinStream& bin) override { 26 | CHECK(false) << "not implemented"; 27 | } 28 | virtual void ToBin(SArrayBinStream& bin) override { 29 | CHECK(false) << "not implemented"; 30 | // TODO: serialize the reader?
or get the function from other place 31 | } 32 | virtual typename TypedPartition<std::string>::IterWrapper CreateIterator(bool is_begin) override { 33 | CHECK(false) << "not implemented"; 34 | } 35 | 36 | std::string GetFileString() { 37 | CHECK_EQ(block_.offset, 0); // the block should be a file 38 | auto reader = reader_getter_(); 39 | reader->Init(block_.url); 40 | size_t file_size = reader->GetFileSize(); 41 | CHECK_GT(file_size, 0); 42 | 43 | std::string file_str; 44 | file_str.resize(file_size); 45 | reader->Read(&file_str[0], file_size); 46 | return file_str; 47 | } 48 | private: 49 | AssignedBlock block_; 50 | std::function<std::shared_ptr<AbstractReader>()> reader_getter_; 51 | }; 52 | 53 | } // namespace xyz 54 | 55 | 56 | -------------------------------------------------------------------------------- /core/partition/partition_manager.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/partition/abstract_partition.hpp" 4 | 5 | #include "glog/logging.h" 6 | 7 | #include <map> 8 | #include <memory> 9 | #include <mutex> 10 | 11 | namespace xyz { 12 | 13 | class PartitionManager { 14 | public: 15 | PartitionManager() = default; 16 | ~PartitionManager(); 17 | 18 | bool Has(int collection_id, int partition_id); 19 | std::shared_ptr<AbstractPartition> Get(int collection_id, int partition_id); 20 | 21 | std::vector<std::shared_ptr<AbstractPartition>> Get(int collection_id); 22 | int GetNumLocalParts(int collection_id); 23 | 24 | void Insert(int collection_id, int partition_id, std::shared_ptr<AbstractPartition>&&); 25 | 26 | void Remove(int collection_id, int partition_id); 27 | private: 28 | // <collection_id, <partition_id, partition>> 29 | // Let PartitionManager own the partition. 30 | std::map<int, std::map<int, std::shared_ptr<AbstractPartition>>> partitions_; 31 | // Make it thread-safe 32 | std::mutex mu_; 33 | }; 34 | 35 | } // namespace 36 | 37 | -------------------------------------------------------------------------------- /core/partition/partition_manager_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/partition/partition_manager.hpp" 5 | 6 | namespace xyz { 7 | namespace { 8 | 9 | class TestPartitionManager : public testing::Test {}; 10 | 11 | template <typename ObjT> 12 | class FakePartition : public AbstractPartition { 13 | virtual void FromBin(SArrayBinStream& bin) override {} 14 | virtual void ToBin(SArrayBinStream& bin) override {} 15 | virtual size_t GetSize() const override { return 0; } 16 | }; 17 | 18 | TEST_F(TestPartitionManager, Construct) { 19 | PartitionManager manager; 20 | } 21 | 22 | TEST_F(TestPartitionManager, Insert) { 23 | auto p1 = std::make_shared<FakePartition<int>>(); 24 | auto p2 = std::make_shared<FakePartition<int>>(); 25 | PartitionManager manager; 26 | manager.Insert(0, 0, std::move(p1)); 27 | manager.Insert(0, 1, std::move(p2)); 28 | } 29 | 30 | TEST_F(TestPartitionManager, InsertRemove) { 31 | auto p1 = std::make_shared<FakePartition<int>>(); 32 | auto p2 = std::make_shared<FakePartition<int>>(); 33 | PartitionManager manager; 34 | manager.Insert(0, 0, std::move(p1)); 35 | manager.Insert(0, 1, std::move(p2)); 36 | manager.Remove(0, 1); 37 | } 38 | 39 | TEST_F(TestPartitionManager, InsertGet) { 40 | auto p1 = std::make_shared<FakePartition<int>>(); 41 | auto p2 = std::make_shared<FakePartition<int>>(); 42 | PartitionManager manager; 43 | manager.Insert(0, 0, std::move(p1)); 44 | manager.Insert(0, 1, std::move(p2)); 45 | auto get_p1 = manager.Get(0, 1); 46 | EXPECT_EQ(get_p1.use_count(), 2); 47 | auto get_p2 = manager.Get(0, 1); 48 | EXPECT_EQ(get_p2.use_count(), 3); 49 | } 50 | 51 | } // namespace 52 | } // namespace xyz 53 | 54 |
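A minimal usage sketch of PartitionManager (illustrative only, not a file in the repository; ObjT and the ids are placeholders reusing the test fixtures in this directory):

// Sketch: insert a concrete partition, look it up, and release it.
auto part = std::make_shared<SeqPartition<ObjT>>();
part->Add(ObjT{1, 2});
PartitionManager manager;
manager.Insert(/*collection_id=*/0, /*partition_id=*/0, std::move(part));
if (manager.Has(0, 0)) {
  auto p = manager.Get(0, 0);  // ownership is shared with the manager
  LOG(INFO) << "partition size: " << p->GetSize();
}
manager.Remove(0, 0);  // the manager drops its reference; p keeps the partition alive

As the InsertGet test above shows, Get returns a shared_ptr whose use_count reflects the manager's own reference plus each outstanding caller.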
-------------------------------------------------------------------------------- /core/partition/range_indexed_seq_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/partition/seq_partition.hpp" 4 | #include "base/third_party/range.h" 5 | 6 | #include <algorithm> 7 | #include <vector> 8 | 9 | namespace xyz { 10 | 11 | template <typename ObjT> 12 | class RangeIndexedSeqPartition : public SeqPartition<ObjT>, public Indexable<ObjT> { 13 | public: 14 | RangeIndexedSeqPartition() = default; 15 | RangeIndexedSeqPartition(const third_party::Range& range): range_(range) { 16 | CHECK_GE(range_.size(), 0); 17 | for (int i = range_.begin(); i < range_.end(); ++ i) { 18 | this->storage_.push_back(ObjT(i)); 19 | } 20 | } 21 | 22 | virtual void TypedAdd(ObjT obj) override { 23 | CHECK(false); 24 | // this->storage_[obj.Key()] = std::move(obj); 25 | } 26 | 27 | virtual ObjT Get(typename ObjT::KeyT key) override { 28 | ObjT* obj = Find(key); 29 | CHECK_NOTNULL(obj); 30 | return *obj; 31 | } 32 | 33 | virtual ObjT* FindOrCreate(typename ObjT::KeyT key) override { 34 | // LOG(INFO) << "FindOrCreate: " << key; 35 | return Find(key); 36 | /* 37 | ObjT* obj = Find(key); 38 | if (obj) { 39 | return obj; 40 | } 41 | // If cannot find, add it. 42 | ObjT new_obj(key); // Assume the constructor is low cost. 43 | TypedAdd(std::move(new_obj)); 44 | return &this->storage_[key]; 45 | */ 46 | } 47 | 48 | virtual ObjT* Find(typename ObjT::KeyT key) { 49 | CHECK_GE(key, range_.begin()); 50 | CHECK_LT(key, range_.end()); 51 | return &this->storage_[key - range_.begin()]; 52 | } 53 | 54 | virtual void FromBin(SArrayBinStream& bin) override { 55 | bin >> this->storage_; 56 | bin >> range_; 57 | } 58 | 59 | virtual void ToBin(SArrayBinStream& bin) override { 60 | bin << this->storage_; 61 | bin << range_; 62 | } 63 | 64 | virtual void Sort() override { 65 | LOG(INFO) << "Do Nothing"; 66 | } 67 | 68 | private: 69 | third_party::Range range_; 70 | }; 71 | 72 | } // namespace 73 | 74 | -------------------------------------------------------------------------------- /core/partition/seq_partition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/partition/abstract_partition.hpp" 4 | 5 | namespace xyz { 6 | 7 | /* 8 | * Basic sequential partition implementation. 9 | * Supports range-based for loops.
10 | */ 11 | template <typename ObjT> 12 | class SeqPartition : public TypedPartition<ObjT> { 13 | public: 14 | virtual void TypedAdd(ObjT obj) override { 15 | storage_.push_back(std::move(obj)); 16 | } 17 | 18 | virtual size_t GetSize() const override { return storage_.size(); } 19 | 20 | virtual void FromBin(SArrayBinStream& bin) override { 21 | bin >> storage_; 22 | } 23 | virtual void ToBin(SArrayBinStream& bin) override { 24 | bin << storage_; 25 | } 26 | 27 | /* 28 | * Implement the Iterator to support range-based for loop 29 | */ 30 | struct Iterator : public TypedPartition<ObjT>::Iterator { 31 | Iterator(ObjT* ptr, size_t pos): ptr_(ptr), pos_(pos) {} 32 | virtual ObjT& Deref() { 33 | return ptr_[pos_]; 34 | } 35 | virtual ObjT* Ref() { 36 | return &ptr_[pos_]; 37 | } 38 | virtual void SubAdvance() { 39 | ++ pos_; 40 | } 41 | virtual bool SubUnequal(const std::unique_ptr<typename TypedPartition<ObjT>::Iterator>& other) { 42 | return pos_ != static_cast<Iterator*>(other.get())->pos_; 43 | } 44 | ObjT* ptr_; 45 | size_t pos_; 46 | }; 47 | 48 | virtual typename TypedPartition<ObjT>::IterWrapper CreateIterator(bool is_begin) override { 49 | typename TypedPartition<ObjT>::IterWrapper iw; 50 | if (storage_.empty()) { 51 | iw.iter.reset(new typename SeqPartition<ObjT>::Iterator(nullptr, 0)); 52 | return iw; 53 | } 54 | if (is_begin) { 55 | iw.iter.reset(new typename SeqPartition<ObjT>::Iterator(&storage_[0], 0)); 56 | } else { 57 | iw.iter.reset(new typename SeqPartition<ObjT>::Iterator(&storage_[0], storage_.size())); 58 | } 59 | return iw; 60 | } 61 | 62 | std::vector<ObjT> GetStorage() { 63 | return storage_; 64 | } 65 | protected: 66 | std::vector<ObjT> storage_; 67 | }; 68 | 69 | } // namespace 70 | 71 | -------------------------------------------------------------------------------- /core/partition/seq_partition_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/partition/seq_partition.hpp" 5 | 6 | namespace xyz { 7 | namespace { 8 | 9 | class TestSeqPartition : public testing::Test {}; 10 | 11 | struct ObjT { 12 | using KeyT = int; 13 | using ValT = int; 14 | int key; 15 | int val; 16 | KeyT Key() const { return key; } 17 | }; 18 | 19 | TEST_F(TestSeqPartition, Create) { 20 | SeqPartition<ObjT> part; 21 | } 22 | TEST_F(TestSeqPartition, Add) { 23 | SeqPartition<ObjT> part; 24 | part.Add(ObjT{2, 3}); 25 | part.Add(ObjT{1, 2}); 26 | EXPECT_EQ(part.GetSize(), 2); 27 | } 28 | 29 | TEST_F(TestSeqPartition, EmptyIterate) { 30 | SeqPartition<ObjT> part; 31 | for (auto& elem : part) { 32 | } 33 | } 34 | 35 | TEST_F(TestSeqPartition, Iterate) { 36 | SeqPartition<ObjT> part; 37 | std::vector<ObjT> v{ObjT{1, 2}, ObjT{2, 3}}; 38 | part.Add(v[0]); 39 | part.Add(v[1]); 40 | ASSERT_EQ(part.GetSize(), 2); 41 | int i = 0; 42 | for (auto& elem : part) { 43 | EXPECT_EQ(elem.Key(), v[i].Key()); 44 | i ++; 45 | } 46 | } 47 | 48 | TEST_F(TestSeqPartition, Bin) { 49 | SeqPartition<ObjT> part; 50 | part.Add(ObjT{2, 3}); 51 | part.Add(ObjT{1, 2}); 52 | SArrayBinStream bin; 53 | part.ToBin(bin); 54 | SeqPartition<ObjT> new_part; 55 | new_part.FromBin(bin); 56 | ASSERT_EQ(new_part.GetSize(), 2); 57 | } 58 | 59 | } // namespace 60 | } // namespace xyz 61 | 62 | -------------------------------------------------------------------------------- /core/partition/task_timer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <chrono> 4 | 5 | namespace xyz { 6 | 7 | class TaskTimer { 8 | public: 9 | TaskTimer() { 10 | Add(); 11 | } 12 | void Add() { 13 | add_timepoint =
std::chrono::steady_clock::now(); 14 | } 15 | void Run() { 16 | run_timepoint = std::chrono::steady_clock::now(); 17 | } 18 | void Finish() { 19 | end_timepoint = std::chrono::steady_clock::now(); 20 | } 21 | 22 | int GetTimeFromAdd() { 23 | return GetTimeFrom(add_timepoint); 24 | } 25 | int GetTimeFromStart() { 26 | return GetTimeFrom(run_timepoint); 27 | } 28 | 29 | private: 30 | int GetTimeFrom(std::chrono::time_point<std::chrono::steady_clock> t) { 31 | return std::chrono::duration_cast<std::chrono::milliseconds>( 32 | std::chrono::steady_clock::now() - t).count(); 33 | } 34 | 35 | private: 36 | std::chrono::time_point<std::chrono::steady_clock> add_timepoint; 37 | std::chrono::time_point<std::chrono::steady_clock> run_timepoint; 38 | std::chrono::time_point<std::chrono::steady_clock> end_timepoint; 39 | }; 40 | 41 | } // namespace xyz 42 | 43 | -------------------------------------------------------------------------------- /core/plan/checkpoint.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/plan_base.hpp" 4 | #include "glog/logging.h" 5 | 6 | namespace xyz { 7 | 8 | struct Checkpoint : public PlanBase { 9 | enum class Type : char { 10 | checkpoint, loadcheckpoint 11 | }; 12 | Checkpoint(int _plan_id, int _cid, std::string _url, 13 | Type _type) 14 | : PlanBase(_plan_id), cid(_cid), url(_url), type(_type) { 15 | } 16 | 17 | virtual SpecWrapper GetSpec() override { 18 | SpecWrapper w; 19 | SpecWrapper::Type t; 20 | if (type == Type::checkpoint) { 21 | t = SpecWrapper::Type::kCheckpoint; 22 | } else { 23 | t = SpecWrapper::Type::kLoadCheckpoint; 24 | } 25 | w.SetSpec(plan_id, t, cid, url); 26 | w.name = name; 27 | return w; 28 | } 29 | 30 | virtual void Register(std::shared_ptr<AbstractFunctionStore> function_store) override { 31 | 32 | } 33 | 34 | std::string url; 35 | int cid; // collection id 36 | Type type; 37 | }; 38 | 39 | } // namespace xyz 40 | 41 | -------------------------------------------------------------------------------- /core/plan/collection_spec.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <sstream> 4 | 5 | #include "base/sarray_binstream.hpp" 6 | 7 | namespace xyz { 8 | 9 | enum class CollectionSource : char { 10 | kDistribute, 11 | kLoad, 12 | kOthers 13 | }; 14 | 15 | static const char* CollectionSourceName[] = { 16 | "kDistribute", 17 | "kLoad", 18 | "kOthers" 19 | }; 20 | 21 | struct CollectionSpec { 22 | int collection_id; 23 | int num_partition; 24 | CollectionSource source; 25 | SArrayBinStream data; 26 | std::string load_url; 27 | 28 | std::string DebugString() const { 29 | std::stringstream ss; 30 | ss << "{ collection_id: " << collection_id; 31 | ss << ", num_partition: " << num_partition; 32 | ss << ", source: " << CollectionSourceName[static_cast<int>(source)]; 33 | ss << ", data size in char: " << data.Size(); 34 | ss << ", load_url: " << load_url; 35 | ss << "}"; 36 | return ss.str(); 37 | } 38 | 39 | friend SArrayBinStream& operator<<(xyz::SArrayBinStream& stream, const CollectionSpec& s) { 40 | stream << s.collection_id << s.num_partition << s.source << s.data << s.load_url; 41 | return stream; 42 | } 43 | 44 | friend SArrayBinStream& operator>>(xyz::SArrayBinStream& stream, CollectionSpec& s) { 45 | stream >> s.collection_id >> s.num_partition >> s.source >> s.data >> s.load_url; 46 | return stream; 47 | } 48 | }; 49 | 50 | } // namespace xyz 51 | 52 | -------------------------------------------------------------------------------- /core/plan/context.cpp: -------------------------------------------------------------------------------- 1 | #include
"core/plan/context.hpp" 2 | 3 | namespace xyz { 4 | 5 | Store Context::collections_; 6 | Store Context::plans_; 7 | Dag Context::dag_; 8 | 9 | } // namespace xyz 10 | 11 | -------------------------------------------------------------------------------- /core/plan/dag_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/plan/dag.hpp" 5 | 6 | namespace xyz { 7 | namespace { 8 | 9 | class TestDag : public testing::Test {}; 10 | 11 | TEST_F(TestDag, construct) { 12 | Dag d; 13 | } 14 | 15 | TEST_F(TestDag, AddDagNode) { 16 | Dag d; 17 | d.AddDagNode(0, {}, {0}); 18 | d.AddDagNode(1, {}, {1}); 19 | d.AddDagNode(2, {0}, {1}); 20 | LOG(INFO) << d.DebugString(); 21 | } 22 | 23 | TEST_F(TestDag, Vistor) { 24 | Dag d; 25 | d.AddDagNode(0, {}, {0}); 26 | d.AddDagNode(1, {}, {1}); 27 | d.AddDagNode(2, {0}, {1}); 28 | 29 | DagVistor v(d); 30 | auto f = v.GetFront(); 31 | while (!f.empty()) { 32 | int node = f.front(); 33 | LOG(INFO) << "visiting: " << node; 34 | v.Finish(node); 35 | f = v.GetFront(); 36 | } 37 | } 38 | 39 | } // namespace 40 | } // namespace xyz 41 | 42 | -------------------------------------------------------------------------------- /core/plan/mapupdate.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/collection.hpp" 4 | #include "core/map_output/abstract_map_output.hpp" 5 | 6 | #include "core/partition/abstract_partition.hpp" 7 | 8 | #include "core/map_output/partitioned_map_output.hpp" 9 | 10 | #include "core/index/hash_key_to_part_mapper.hpp" 11 | #include "core/index/range_key_to_part_mapper.hpp" 12 | 13 | #include "core/plan/abstract_function_store.hpp" 14 | #include "core/plan/plan_spec.hpp" 15 | 16 | #include "core/plan/mappartupdate.hpp" 17 | 18 | namespace xyz { 19 | 20 | template 21 | struct MapJoin; 22 | 23 | template 24 | MapJoin GetMapJoin(int plan_id, C1* c1, C2* c2) { 25 | MapJoin plan(plan_id, c1, c2); 26 | return plan; 27 | } 28 | 29 | /* 30 | * Requires T2 to be in the form {T2::KeyT, T2::ValT} 31 | */ 32 | template 33 | struct MapJoin : public MapPartJoin{ 34 | using MapFuncT = std::function*)>; 35 | 36 | MapJoin(int plan_id, C1* map_collection, C2* update_collection) 37 | : MapPartJoin(plan_id, map_collection, update_collection) { 38 | } 39 | 40 | void SetMapPart() { 41 | CHECK(map != nullptr); 42 | // construct the mappart 43 | this->mappart = [this](TypedPartition* p, 44 | Output* o) { 45 | CHECK_NOTNULL(p); 46 | int i = 0; 47 | for (auto& elem : *p) { 48 | map(elem, o); 49 | i += 1; 50 | } 51 | }; 52 | } 53 | void Register(std::shared_ptr function_store) { 54 | SetMapPart(); 55 | MapPartJoin::Register(function_store); 56 | } 57 | 58 | MapFuncT map; // a -> b 59 | }; 60 | 61 | } // namespace xyz 62 | -------------------------------------------------------------------------------- /core/plan/mapupdate_test.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/plan/mapupdate.hpp" 5 | #include "core/partition/seq_partition.hpp" 6 | #include "core/map_output/partitioned_map_output.hpp" 7 | 8 | namespace xyz { 9 | namespace { 10 | 11 | /* 12 | * This test depends on SeqPartition and MapOutput. 
13 | */ 14 | class TestMapJoin: public testing::Test {}; 15 | 16 | struct ObjT { 17 | using KeyT = int; 18 | using ValT = int; 19 | ObjT() = default; 20 | ObjT(KeyT key) : a(key), b(0) {} 21 | KeyT Key() const { return a; } 22 | int a; 23 | int b; 24 | }; 25 | 26 | TEST_F(TestMapJoin, Create) { 27 | int plan_id = 0; 28 | Collection<ObjT> c1{1}; 29 | Collection<ObjT> c2{2}; 30 | auto plan = GetMapJoin(plan_id, &c1, &c2); 31 | 32 | plan.map = [](ObjT a, Output<ObjT, int>* o) { 33 | o->Add(a.Key(), 1); 34 | }; 35 | plan.update = [](ObjT* obj, int m) { 36 | obj->b += m; 37 | }; 38 | } 39 | 40 | TEST_F(TestMapJoin, GetMapPartFunc) { 41 | int plan_id = 0; 42 | int num_part = 1; 43 | Collection<ObjT> c1{1}; 44 | Collection<ObjT> c2{2, num_part}; 45 | c2.SetMapper(std::make_shared<HashKeyToPartMapper<int>>(num_part)); 46 | auto plan = GetMapJoin(plan_id, &c1, &c2); 47 | 48 | plan.map = [](ObjT a, Output<ObjT, int>* o) { 49 | o->Add(a.Key(), 1); 50 | }; 51 | plan.SetMapPart(); 52 | 53 | auto f = plan.GetMapPartFunc(); 54 | auto partition = std::make_shared<SeqPartition<ObjT>>(); 55 | partition->Add(ObjT{10}); 56 | partition->Add(ObjT{20}); 57 | auto map_output = f(partition); 58 | auto vec = static_cast<PartitionedMapOutput<int, int>*>(map_output.get())->GetBuffer(); 59 | ASSERT_EQ(vec.size(), 1); 60 | ASSERT_EQ(vec[0].size(), 2); 61 | EXPECT_EQ(vec[0][0].first, 10); 62 | EXPECT_EQ(vec[0][0].second, 1); 63 | EXPECT_EQ(vec[0][1].first, 20); 64 | EXPECT_EQ(vec[0][1].second, 1); 65 | } 66 | 67 | } // namespace 68 | } // namespace xyz 69 | 70 | -------------------------------------------------------------------------------- /core/plan/mapwithupdate.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/mappartwithupdate.hpp" 4 | 5 | namespace xyz { 6 | 7 | template <typename C1, typename C2, typename C3, typename MsgT> 8 | struct MapWithJoin; 9 | 10 | template <typename C1, typename C2, typename C3, typename MsgT = typename C3::ObjT::ValT> 11 | MapWithJoin<C1, C2, C3, MsgT> GetMapWithJoin(int plan_id, C1* c1, C2* c2, C3* c3) { 12 | MapWithJoin<C1, C2, C3, MsgT> plan(plan_id, c1, c2, c3); 13 | return plan; 14 | } 15 | 16 | template <typename C1, typename C2, typename C3, typename MsgT> 17 | struct MapWithJoin : public MapPartWithJoin<C1, C2, C3, MsgT> { 18 | using MapWithFuncT = std::function<void(const typename C1::ObjT&, TypedCache<typename C2::ObjT>*, Output<typename C3::ObjT, MsgT>*)>; 19 | 20 | MapWithJoin(int plan_id, C1* map_collection, 21 | C2* with_collection, 22 | C3* update_collection) 23 | : MapPartWithJoin<C1, C2, C3, MsgT>(plan_id, map_collection, with_collection, update_collection) { 24 | } 25 | 26 | void SetMapPartWith() { 27 | CHECK(mapwith != nullptr); 28 | // construct the mappartwith 29 | this->mappartwith = [this](TypedPartition<typename C1::ObjT>* p, 30 | TypedCache<typename C2::ObjT>* typed_cache, 31 | Output<typename C3::ObjT, MsgT>* o) { 32 | int i = 0; 33 | for (auto& elem : *p) { 34 | mapwith(elem, typed_cache, o); 35 | i += 1; 36 | } 37 | }; 38 | } 39 | 40 | void Register(std::shared_ptr<AbstractFunctionStore> function_store) { 41 | SetMapPartWith(); 42 | MapPartWithJoin<C1, C2, C3, MsgT>::Register(function_store); 43 | } 44 | 45 | MapWithFuncT mapwith; // a (with c) -> b 46 | }; 47 | 48 | } // namespace xyz 49 | 50 | -------------------------------------------------------------------------------- /core/plan/mapwithupdate_test.hpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "glog/logging.h" 3 | 4 | #include "core/partition/seq_partition.hpp" 5 | #include "core/plan/mapwithupdate.hpp" 6 | 7 | namespace xyz { 8 | namespace { 9 | 10 | struct ObjT { 11 | using KeyT = int; 12 | using ValT = int; 13 | ObjT() = default; 14 | ObjT(KeyT _a):a(_a) {} 15 | KeyT Key() const { return a; } 16 | int a; 17 | }; 18 | 19 | class TestMapWithJoin: public testing::Test {}; 20 | 21 | TEST_F(TestMapWithJoin, Create) { 22 | int plan_id = 0; 23 | Collection<ObjT> c1{1}; 24 | Collection<ObjT> c2{2}; 25 | Collection<ObjT> c3{4};
26 | auto plan = GetMapWithJoin(plan_id, &c1, &c2, &c3); 27 | plan.mapwith = [](const ObjT& obj, TypedCache<ObjT>* cache, Output<ObjT, int>* o) { 28 | ObjT cache_obj = cache->Get(2); 29 | int ret = obj.Key() + cache_obj.a; 30 | o->Add(ret, 1); 31 | }; 32 | } 33 | 34 | } // namespace 35 | } // namespace xyz 36 | 37 | -------------------------------------------------------------------------------- /core/plan/plan_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/spec_wrapper.hpp" 4 | #include "core/plan/abstract_function_store.hpp" 5 | 6 | namespace xyz { 7 | 8 | struct PlanBase { 9 | PlanBase(int _plan_id) : plan_id(_plan_id) {} 10 | virtual ~PlanBase() = default; 11 | virtual SpecWrapper GetSpec() = 0; 12 | virtual void Register(std::shared_ptr<AbstractFunctionStore> function_store) = 0; 13 | 14 | int plan_id; 15 | std::string name = ""; 16 | }; 17 | 18 | } // namespace xyz 19 | 20 | -------------------------------------------------------------------------------- /core/plan/plan_spec.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <sstream> 4 | 5 | #include "base/sarray_binstream.hpp" 6 | 7 | namespace xyz { 8 | 9 | struct PlanSpec { 10 | int plan_id; 11 | int map_collection_id; 12 | int update_collection_id; 13 | int cur_iter = 0; 14 | int num_iter = 1; 15 | 16 | int with_collection_id = -1; 17 | 18 | PlanSpec() = default; 19 | PlanSpec(int pid, int mid, int jid) 20 | : plan_id(pid), map_collection_id(mid), update_collection_id(jid) {} 21 | 22 | std::string DebugString() const { 23 | std::stringstream ss; 24 | ss << "{ plan_id: " << plan_id; 25 | ss << ", map_collection_id: " << map_collection_id; 26 | ss << ", update_collection_id: " << update_collection_id; 27 | ss << ", num_iter: " << num_iter; 28 | ss << ", with_collection_id: " << with_collection_id; 29 | ss << "}"; 30 | return ss.str(); 31 | } 32 | 33 | /* 34 | friend SArrayBinStream& operator<<(xyz::SArrayBinStream& stream, const PlanSpec& p) { 35 | // TODO 36 | return stream; 37 | } 38 | 39 | friend SArrayBinStream& operator>>(xyz::SArrayBinStream& stream, PlanSpec& p) { 40 | // TODO 41 | return stream; 42 | } 43 | */ 44 | }; 45 | 46 | } // namespace xyz 47 | -------------------------------------------------------------------------------- /core/plan/runner.cpp: -------------------------------------------------------------------------------- 1 | #include "core/plan/runner.hpp" 2 | 3 | DEFINE_string(scheduler, "", "The host of scheduler"); 4 | DEFINE_int32(scheduler_port, -1, "The port of scheduler"); 5 | DEFINE_string(hdfs_namenode, "", "The namenode of hdfs"); 6 | DEFINE_int32(hdfs_port, -1, "The port of hdfs"); 7 | DEFINE_int32(node_id, -1, "node id"); 8 | DEFINE_int32(num_local_threads, 20, "# local_threads"); 9 | DEFINE_int32(num_update_threads, 20, "# update_threads"); 10 | DEFINE_int32(num_combine_threads, 20, "# combine_threads"); 11 | 12 | namespace xyz { 13 | 14 | void Runner::Init(int argc, char** argv) { 15 | google::InitGoogleLogging(argv[0]); 16 | gflags::ParseCommandLineFlags(&argc, &argv, true); 17 | } 18 | 19 | void Runner::PrintDag() { 20 | LOG(INFO) << Context::get_dag().DebugString(); 21 | } 22 | 23 | void Runner::Run() { 24 | CHECK(!FLAGS_scheduler.empty()); 25 | 26 | auto plans = Context::get_allplans(); 27 | auto collections = Context::get_allcollections(); 28 | // TODO: replace ProgramContext with a DAG structure.
29 | ProgramContext program; 30 | // for (auto* c : collections) { 31 | // program.collections.push_back(c->GetSpec()); 32 | // } 33 | for (auto* p : plans) { 34 | program.specs.push_back(p->GetSpec()); 35 | } 36 | program.dag = Context::get_dag(); 37 | 38 | Engine::Config config; 39 | config.scheduler = FLAGS_scheduler; 40 | config.scheduler_port = FLAGS_scheduler_port; 41 | config.num_local_threads = FLAGS_num_local_threads; 42 | config.num_update_threads = FLAGS_num_update_threads; 43 | config.num_combine_threads = FLAGS_num_combine_threads; 44 | config.namenode = FLAGS_hdfs_namenode; 45 | config.port = FLAGS_hdfs_port; 46 | 47 | Engine engine; 48 | // initialize the components and actors, 49 | // especially the function_store, to be registered by the plan 50 | engine.Init(config); 51 | // register program containing plan and collection info 52 | engine.RegisterProgram(program); 53 | // add related functions 54 | for (auto* c : collections) { 55 | engine.AddFunc(c); 56 | } 57 | for (auto* p : plans) { 58 | engine.AddFunc(p); 59 | } 60 | 61 | // start the mailbox and start to receive messages 62 | engine.Start(); 63 | // stop the mailbox and actors 64 | engine.Stop(); 65 | } 66 | 67 | 68 | } // namespace xyz 69 | 70 | -------------------------------------------------------------------------------- /core/plan/runner.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/context.hpp" 4 | #include "core/engine.hpp" 5 | #include "core/program_context.hpp" 6 | 7 | #include "base/color.hpp" 8 | 9 | #include "gflags/gflags.h" 10 | #include "glog/logging.h" 11 | 12 | // define these variables in runner.cpp so users do not need to 13 | // repeat them. 14 | DECLARE_string(scheduler); 15 | DECLARE_int32(scheduler_port); 16 | DECLARE_string(hdfs_namenode); 17 | DECLARE_int32(hdfs_port); 18 | DECLARE_int32(num_local_threads); 19 | DECLARE_int32(node_id); 20 | 21 | namespace xyz { 22 | 23 | class Runner { 24 | public: 25 | static void Init(int argc, char** argv); 26 | static void Run(); 27 | static void PrintDag(); 28 | }; 29 | 30 | } // namespace xyz 31 | 32 | -------------------------------------------------------------------------------- /core/plan/spec_wrapper.cpp: -------------------------------------------------------------------------------- 1 | #include "core/plan/spec_wrapper.hpp" 2 | 3 | namespace xyz { 4 | 5 | constexpr const char* SpecWrapper::TypeName[]; 6 | 7 | MapJoinSpec* SpecWrapper::GetMapJoinSpec() { 8 | CHECK(type == Type::kMapJoin || type == Type::kMapWithJoin); 9 | return static_cast<MapJoinSpec*>(spec.get()); 10 | } 11 | 12 | MapWithJoinSpec* SpecWrapper::GetMapWithJoinSpec() { 13 | CHECK(type == Type::kMapWithJoin); 14 | return static_cast<MapWithJoinSpec*>(spec.get()); 15 | } 16 | 17 | DistributeSpec* SpecWrapper::GetDistributeSpec() { 18 | CHECK(type == Type::kDistribute); 19 | return static_cast<DistributeSpec*>(spec.get()); 20 | } 21 | 22 | LoadSpec* SpecWrapper::GetLoadSpec() { 23 | CHECK(type == Type::kLoad); 24 | return static_cast<LoadSpec*>(spec.get()); 25 | } 26 | 27 | CheckpointSpec* SpecWrapper::GetCheckpointSpec() { 28 | CHECK(type == Type::kCheckpoint); 29 | return static_cast<CheckpointSpec*>(spec.get()); 30 | } 31 | 32 | LoadCheckpointSpec* SpecWrapper::GetLoadCheckpointSpec() { 33 | CHECK(type == Type::kLoadCheckpoint); 34 | return static_cast<LoadCheckpointSpec*>(spec.get()); 35 | } 36 | 37 | WriteSpec* SpecWrapper::GetWriteSpec() { 38 | CHECK(type == Type::kWrite); 39 | return static_cast<WriteSpec*>(spec.get()); 40 | } 41 | 42 | } // namespace xyz 43 | 44 |
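A sketch of the driver a user program typically writes around Runner (illustrative only; building collections and plans through Context is elided, and flag values come from the command line):

// Sketch: minimal main() for an xyz application using the Runner API above.
int main(int argc, char** argv) {
  xyz::Runner::Init(argc, argv);  // sets up glog and parses gflags
  // ... construct collections and plans via Context here ...
  xyz::Runner::PrintDag();        // optional: log the plan DAG
  xyz::Runner::Run();             // registers the program, starts and stops the engine
  return 0;
}

Run() requires --scheduler to be set; it packs every plan spec and the DAG into a ProgramContext, registers it with the Engine, and registers each collection's and plan's functions before starting the mailbox.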
-------------------------------------------------------------------------------- /core/plan/update_helper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/abstract_function_store.hpp" 4 | #include "core/partition/abstract_partition.hpp" 5 | 6 | namespace xyz { 7 | 8 | template <typename T, typename MsgT> 9 | AbstractFunctionStore::JoinFuncT GetJoinPartFunc(std::function<void(T*, MsgT)> update) { 10 | return [update] (std::shared_ptr<AbstractPartition> partition, SArrayBinStream bin) { 11 | auto* p = dynamic_cast<Indexable<T>*>(partition.get()); 12 | CHECK_NOTNULL(p); 13 | typename T::KeyT key; 14 | MsgT msg; 15 | while (bin.Size()) { 16 | bin >> key >> msg; 17 | auto* obj = p->FindOrCreate(key); 18 | update(obj, std::move(msg)); 19 | } 20 | }; 21 | } 22 | 23 | template <typename T, typename MsgT> 24 | AbstractFunctionStore::JoinFunc2T GetJoinPartFunc2(std::function<void(T*, MsgT)> update) { 25 | return [update] (std::shared_ptr<AbstractPartition> partition, std::shared_ptr<AbstractMapOutputStream> stream) { 26 | auto* p = dynamic_cast<Indexable<T>*>(partition.get()); 27 | auto* s = static_cast<MapOutputStream<typename T::KeyT, MsgT>*>(stream.get()); 28 | CHECK_NOTNULL(p); 29 | CHECK_NOTNULL(s); 30 | const auto& buffer = s->GetBuffer(); 31 | for (auto& kv : buffer) { 32 | auto* obj = p->FindOrCreate(kv.first); 33 | update(obj, std::move(kv.second)); 34 | } 35 | }; 36 | } 37 | 38 | } // namespace xyz 39 | 40 | -------------------------------------------------------------------------------- /core/plan/write.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/plan/plan_base.hpp" 4 | #include "core/partition/abstract_partition.hpp" 5 | 6 | namespace xyz { 7 | 8 | template <typename T> 9 | struct Write : public PlanBase { 10 | Write(int _plan_id, int _collection_id, std::string _url, std::function<void(const T&, std::stringstream&)> f) 11 | : PlanBase(_plan_id), collection_id(_collection_id), url(_url), write_obj(f) {} 12 | 13 | virtual SpecWrapper GetSpec() override { 14 | SpecWrapper w; 15 | w.SetSpec(plan_id, SpecWrapper::Type::kWrite, 16 | collection_id, url); 17 | w.name = name; 18 | return w; 19 | } 20 | 21 | virtual void Register(std::shared_ptr<AbstractFunctionStore> function_store) override { 22 | function_store->AddWritePart(collection_id, [this](std::shared_ptr<AbstractPartition> part, 23 | std::shared_ptr<AbstractWriter> writer, std::string url) { 24 | std::stringstream ss; 25 | auto* p = static_cast<TypedPartition<T>*>(part.get()); 26 | CHECK_NOTNULL(p); 27 | for (auto& elem : *p) { 28 | write_obj(elem, ss); 29 | } 30 | std::string s = ss.str(); 31 | writer->Write(url, s.c_str(), s.size()); 32 | }); 33 | } 34 | 35 | std::function<void(const T&, std::stringstream&)> write_obj; 36 | std::string url; 37 | int collection_id; 38 | }; 39 | 40 | } // namespace xyz 41 | 42 | -------------------------------------------------------------------------------- /core/program_context.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <sstream> 4 | 5 | #include "base/sarray_binstream.hpp" 6 | 7 | #include "core/plan/spec_wrapper.hpp" 8 | #include "core/plan/dag.hpp" 9 | 10 | namespace xyz { 11 | 12 | struct ProgramContext { 13 | std::vector<SpecWrapper> specs; 14 | Dag dag; 15 | 16 | std::string DebugString() const { 17 | std::stringstream ss; 18 | ss << "{ # of specs: " << specs.size() 19 | << " }\n"; 20 | ss << "specs:\n"; 21 | for (auto spec: specs) { 22 | ss << spec.DebugString() << "\n"; 23 | } 24 | ss << dag.DebugString(); 25 | return ss.str(); 26 | } 27 | 28 | friend SArrayBinStream& operator<<(xyz::SArrayBinStream& stream, const ProgramContext& c) { 29 | stream << c.specs << c.dag; 30 | return stream; 31 | } 32 | 33 | friend SArrayBinStream&
operator>>(xyz::SArrayBinStream& stream, ProgramContext& c) { 34 | stream >> c.specs >> c.dag; 35 | return stream; 36 | } 37 | }; 38 | 39 | } // namespace xyz 40 | 41 | -------------------------------------------------------------------------------- /core/queue_node_map.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace xyz { 4 | namespace { 5 | 6 | // the number specifies the maximum number of queues in each node. 7 | const int kMagic = 8; 8 | 9 | int GetNodeId(int qid) { 10 | return qid / kMagic; 11 | } 12 | 13 | int GetWorkerQid(int nid) { 14 | return nid * kMagic; 15 | } 16 | 17 | int GetFetcherQid(int nid) { 18 | return nid * kMagic + 2; 19 | } 20 | int GetControllerActorQid(int nid) { 21 | return nid * kMagic + 3; 22 | } 23 | 24 | } 25 | } // namespace xyz 26 | 27 | -------------------------------------------------------------------------------- /core/scheduler/block_manager.cpp: -------------------------------------------------------------------------------- 1 | #include "core/scheduler/block_manager.hpp" 2 | 3 | namespace xyz { 4 | 5 | BlockManager::BlockManager(std::shared_ptr<SchedulerElem> elem, 6 | std::shared_ptr<CollectionManager> collection_manager, 7 | std::function<std::shared_ptr<Assigner>()> builder) 8 | : elem_(elem), collection_manager_(collection_manager), 9 | builder_(builder) {} 10 | 11 | void BlockManager::Load(SpecWrapper spec_wrapper) { 12 | CHECK(spec_wrapper.type == SpecWrapper::Type::kLoad); 13 | int plan_id = spec_wrapper.id; 14 | auto* spec = static_cast<LoadSpec*>(spec_wrapper.spec.get()); 15 | std::vector<std::pair<std::string, int>> assigned_nodes; 16 | std::vector<int> num_local_threads; 17 | for (auto& kv: elem_->nodes) { 18 | assigned_nodes.push_back({kv.second.node.hostname, kv.second.node.id}); 19 | num_local_threads.push_back(kv.second.num_local_threads); 20 | } 21 | CHECK(builder_); 22 | assigners_[spec->collection_id] = builder_(); 23 | auto& assigner = assigners_[spec->collection_id]; 24 | CHECK(assigner); 25 | int num_blocks = 26 | assigner->Load(spec->collection_id, spec->url, assigned_nodes, num_local_threads, 27 | spec->is_load_meta, spec->is_whole_file); 28 | cid_pid_[spec->collection_id] = plan_id; 29 | } 30 | 31 | void BlockManager::FinishBlock(SArrayBinStream bin) { 32 | FinishedBlock block; 33 | bin >> block; 34 | LOG(INFO) << "[Scheduler] FinishBlock: " << block.DebugString(); 35 | auto& assigner = assigners_[block.collection_id]; 36 | bool done = assigner->FinishBlock(block); 37 | if (done) { 38 | auto blocks = assigner->GetFinishedBlocks(); 39 | stored_blocks_[block.collection_id] = blocks; 40 | // construct the collection view 41 | std::vector<int> part_to_node(blocks.size()); 42 | for (int i = 0; i < part_to_node.size(); ++i) { 43 | CHECK(blocks.find(i) != blocks.end()) << "unknown block id " << i; 44 | part_to_node[i] = blocks[i].node_id; 45 | } 46 | CollectionView cv; 47 | cv.collection_id = block.collection_id; 48 | cv.mapper = SimplePartToNodeMapper(part_to_node); 49 | cv.num_partition = cv.mapper.GetNumParts(); 50 | // LOG(INFO) << "!!!!\n" << cv.DebugString(); 51 | elem_->collection_map->Insert(cv); 52 | 53 | // trigger update collection 54 | int collection_id = block.collection_id; 55 | int plan_id = cid_pid_[block.collection_id]; 56 | collection_manager_->Update(collection_id, [this, plan_id]() { 57 | SArrayBinStream reply_bin; 58 | reply_bin << plan_id; 59 | ToScheduler(elem_, ScheduleFlag::kFinishPlan, reply_bin); 60 | }); 61 | } 62 | } 63 | 64 | 65 | } // namespace xyz 66 | --------------------------------------------------------------------------------
/core/scheduler/block_manager.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/scheduler/scheduler_elem.hpp" 4 | #include "core/scheduler/collection_manager.hpp" 5 | #include "core/scheduler/control.hpp" 6 | 7 | #include "io/assigner.hpp" 8 | #include "io/meta.hpp" 9 | 10 | #include "core/plan/spec_wrapper.hpp" 11 | 12 | #include "glog/logging.h" 13 | 14 | namespace xyz { 15 | 16 | class BlockManager { 17 | public: 18 | BlockManager(std::shared_ptr<SchedulerElem> elem, 19 | std::shared_ptr<CollectionManager> collection_manager, 20 | std::function<std::shared_ptr<Assigner>()> builder); 21 | void Load(SpecWrapper spec_wrapper); 22 | void FinishBlock(SArrayBinStream bin); 23 | 24 | private: 25 | std::map<int, std::shared_ptr<Assigner>> assigners_; // collection_id -> assigner 26 | std::function<std::shared_ptr<Assigner>()> builder_; 27 | // collection_id -> <part_id, FinishedBlock> 28 | std::map<int, std::map<int, FinishedBlock>> stored_blocks_; 29 | 30 | std::map<int, int> cid_pid_; // collection_id -> plan_id 31 | 32 | std::shared_ptr<SchedulerElem> elem_; 33 | 34 | std::shared_ptr<CollectionManager> collection_manager_; 35 | }; 36 | 37 | } // namespace xyz 38 | -------------------------------------------------------------------------------- /core/scheduler/checkpoint_loader.cpp: -------------------------------------------------------------------------------- 1 | #include "core/scheduler/checkpoint_loader.hpp" 2 | 3 | namespace xyz { 4 | 5 | void CheckpointLoader::LoadCheckpoint(int cid, std::string url, 6 | std::function<void()> f) { 7 | LOG(INFO) << "[CheckpointLoader] loading checkpoint for collection: " << cid; 8 | auto& collection_view = elem_->collection_map->Get(cid); 9 | loadcheckpoint_reply_count_map_[cid] = 0; 10 | expected_loadcheckpoint_reply_count_map_[cid] = collection_view.mapper.GetNumParts(); 11 | callbacks_.insert({cid, f}); 12 | 13 | for (int i = 0; i < collection_view.mapper.GetNumParts(); ++ i) { 14 | int node_id = collection_view.mapper.Get(i); 15 | SendLoadCommand(cid, i, node_id, url); 16 | } 17 | } 18 | 19 | void CheckpointLoader::LoadCheckpointPartial(int cid, std::string url, 20 | std::vector<int> parts, 21 | std::function<void()> f) { 22 | LOG(INFO) << "[CheckpointLoader] loading checkpoint (partial) for collection: " << cid << ", parts size: " << parts.size(); 23 | loadcheckpoint_reply_count_map_[cid] = 0; 24 | callbacks_.insert({cid, f}); 25 | expected_loadcheckpoint_reply_count_map_[cid] = parts.size(); 26 | 27 | auto& collection_view = elem_->collection_map->Get(cid); 28 | auto& part_to_node_map = collection_view.mapper.Get(); 29 | for (auto part_id: parts) { 30 | CHECK_LT(part_id, part_to_node_map.size()); 31 | int node_id = part_to_node_map[part_id]; 32 | SendLoadCommand(cid, part_id, node_id, url); 33 | } 34 | } 35 | 36 | void CheckpointLoader::SendLoadCommand(int cid, int part_id, int node_id, std::string url) { 37 | SArrayBinStream bin; 38 | std::string dest_url = GetCheckpointUrl(url, cid, part_id); 39 | bin << cid << part_id << dest_url; // collection_id, partition_id, url 40 | SendTo(elem_, node_id, ScheduleFlag::kLoadCheckpoint, bin); 41 | } 42 | 43 | void CheckpointLoader::FinishLoadCheckpoint(SArrayBinStream bin) { 44 | int qid, collection_id; 45 | bin >> qid >> collection_id; 46 | loadcheckpoint_reply_count_map_[collection_id] += 1; 47 | if (loadcheckpoint_reply_count_map_[collection_id] == expected_loadcheckpoint_reply_count_map_[collection_id]){ 48 | CHECK(callbacks_.find(collection_id) != callbacks_.end()); 49 | callbacks_[collection_id](); // invoke the callback 50 | callbacks_.erase(collection_id); 51 | } 52 | } 53 | 54 | } // namespace xyz 55 | 56 |
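An illustrative sketch of driving CheckpointLoader from the scheduler side (the url and ids are placeholders; elem is the shared SchedulerElem set up elsewhere):

// Sketch: request a full reload of collection 0 and react when it completes.
CheckpointLoader loader(elem);
loader.LoadCheckpoint(/*cid=*/0, "hdfs:///cp", []() {
  LOG(INFO) << "collection 0 reloaded from its checkpoint";
});
// Each worker replies with (qid, collection_id); FinishLoadCheckpoint counts
// the replies and fires the callback once every partition has been loaded.

The partial variant, LoadCheckpointPartial, follows the same counting scheme but only expects replies for the partition ids it was given.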
-------------------------------------------------------------------------------- /core/scheduler/checkpoint_loader.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | 5 | #include "core/scheduler/scheduler_elem.hpp" 6 | 7 | namespace xyz { 8 | 9 | 10 | class CheckpointLoader { 11 | public: 12 | CheckpointLoader(std::shared_ptr<SchedulerElem> elem) 13 | : elem_(elem) {} 14 | 15 | // non thread-safe 16 | void LoadCheckpoint(int cid, std::string url, 17 | std::function<void()> f); 18 | void FinishLoadCheckpoint(SArrayBinStream bin); 19 | 20 | // parts: the partition ids that need to load 21 | void LoadCheckpointPartial(int cid, std::string url, 22 | std::vector<int> parts, 23 | std::function<void()> f); 24 | 25 | void SendLoadCommand(int cid, int part_id, int node_id, std::string url); 26 | private: 27 | std::shared_ptr<SchedulerElem> elem_; 28 | 29 | std::map<int, int> loadcheckpoint_reply_count_map_; 30 | std::map<int, int> expected_loadcheckpoint_reply_count_map_; 31 | std::map<int, std::function<void()>> callbacks_; 32 | }; 33 | 34 | } // namespace xyz 35 | -------------------------------------------------------------------------------- /core/scheduler/checkpoint_manager.cpp: -------------------------------------------------------------------------------- 1 | #include "core/scheduler/checkpoint_manager.hpp" 2 | 3 | 4 | namespace xyz { 5 | 6 | void CheckpointManager::Checkpoint(SpecWrapper s) { 7 | CHECK(s.type == SpecWrapper::Type::kCheckpoint); 8 | auto* checkpoint_spec = static_cast<CheckpointSpec*>(s.spec.get()); 9 | int cid = checkpoint_spec->cid; 10 | std::string url = checkpoint_spec->url; 11 | auto& collection_view = elem_->collection_map->Get(cid); 12 | cid_pid_[cid] = s.id; 13 | checkpoint_reply_count_map[cid] = 0; 14 | expected_checkpoint_reply_count_map[cid] = collection_view.mapper.GetNumParts(); 15 | for (int i = 0; i < collection_view.mapper.GetNumParts(); ++ i) { 16 | int node_id = collection_view.mapper.Get(i); 17 | SArrayBinStream bin; 18 | std::string dest_url = GetCheckpointUrl(url, cid, i); 19 | bin << cid << i << dest_url; // collection_id, partition_id, url 20 | SendTo(elem_, node_id, ScheduleFlag::kCheckpoint, bin); 21 | } 22 | collection_status_->AddCP(cid, url); // add checkpoint here 23 | } 24 | 25 | void CheckpointManager::LoadCheckpoint(SpecWrapper s) { 26 | CHECK(s.type == SpecWrapper::Type::kLoadCheckpoint); 27 | auto* load_checkpoint_spec = static_cast<LoadCheckpointSpec*>(s.spec.get()); 28 | int cid = load_checkpoint_spec->cid; 29 | cid_pid_[cid] = s.id; 30 | std::string url = collection_status_->GetLastCP(cid); 31 | checkpoint_loader_->LoadCheckpoint(cid, url, [this, cid]() { 32 | SArrayBinStream reply_bin; 33 | reply_bin << cid_pid_[cid]; 34 | ToScheduler(elem_, ScheduleFlag::kFinishPlan, reply_bin); 35 | cid_pid_.erase(cid); 36 | }); 37 | } 38 | 39 | void CheckpointManager::FinishCheckpoint(SArrayBinStream bin) { 40 | int qid, collection_id; 41 | bin >> qid >> collection_id; 42 | checkpoint_reply_count_map[collection_id] += 1; 43 | if (checkpoint_reply_count_map[collection_id] == expected_checkpoint_reply_count_map[collection_id]){ 44 | SArrayBinStream reply_bin; 45 | reply_bin << cid_pid_[collection_id]; 46 | ToScheduler(elem_, ScheduleFlag::kFinishPlan, reply_bin); 47 | cid_pid_.erase(collection_id); 48 | } 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /core/scheduler/checkpoint_manager.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "core/scheduler/scheduler_elem.hpp" 4 | #include
"core/scheduler/checkpoint_loader.hpp" 5 | #include "core/scheduler/collection_status.hpp" 6 | 7 | #include "core/plan/spec_wrapper.hpp" 8 | 9 | namespace xyz { 10 | 11 | class CheckpointManager { 12 | public: 13 | CheckpointManager(std::shared_ptr elem, 14 | std::shared_ptr cp_loader, 15 | std::shared_ptr collection_status) 16 | : elem_(elem), checkpoint_loader_(cp_loader), 17 | collection_status_(collection_status) {} 18 | void Checkpoint(SpecWrapper s); 19 | void LoadCheckpoint(SpecWrapper s); 20 | void FinishCheckpoint(SArrayBinStream bin); 21 | private: 22 | std::map checkpoint_reply_count_map; 23 | std::map expected_checkpoint_reply_count_map; 24 | 25 | std::shared_ptr elem_; 26 | std::shared_ptr checkpoint_loader_; 27 | std::shared_ptr collection_status_; 28 | 29 | // collection_id -> plan_id 30 | std::map cid_pid_; 31 | }; 32 | 33 | } // namespace xyz 34 | -------------------------------------------------------------------------------- /core/scheduler/collection_manager.cpp: -------------------------------------------------------------------------------- 1 | #include "core/scheduler/collection_manager.hpp" 2 | 3 | namespace xyz { 4 | 5 | void CollectionManager::Update(int collection_id, std::function f) { 6 | LOG(INFO) << "[Scheduler] Update Collection for collection_id: " 7 | << collection_id; 8 | callbacks_.insert({collection_id, f}); 9 | received_replies_[collection_id].clear(); 10 | 11 | SArrayBinStream reply_bin; 12 | auto& collection_view = elem_->collection_map->Get(collection_id); 13 | reply_bin << collection_id << collection_view; 14 | SendToAllWorkers(elem_, ScheduleFlag::kUpdateCollection, reply_bin); 15 | } 16 | 17 | void CollectionManager::FinishUpdate(SArrayBinStream bin) { 18 | int collection_id; 19 | int node_id; 20 | bin >> collection_id >> node_id; 21 | received_replies_[collection_id].insert(node_id); 22 | if (received_replies_[collection_id].size() == elem_->nodes.size()) { 23 | received_replies_[collection_id].clear(); 24 | LOG(INFO) << "[Scheduler] Update Collection for collection_id: " 25 | << collection_id << " done"; 26 | // finish_plan 27 | // invoke the callback 28 | CHECK(callbacks_.find(collection_id) != callbacks_.end()); 29 | callbacks_[collection_id](); // invoke the callback 30 | callbacks_.erase(collection_id); 31 | } 32 | } 33 | 34 | } // namespace xyz 35 | 36 | -------------------------------------------------------------------------------- /core/scheduler/collection_manager.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "core/scheduler/scheduler_elem.hpp" 6 | 7 | #include "core/plan/spec_wrapper.hpp" 8 | 9 | namespace xyz { 10 | 11 | class CollectionManager { 12 | public: 13 | CollectionManager(std::shared_ptr elem) 14 | : elem_(elem) {} 15 | 16 | // non thread-safe 17 | // update the collection map at each worker 18 | void Update(int collection_id, std::function f); 19 | 20 | void FinishUpdate(SArrayBinStream bin); 21 | 22 | private: 23 | std::shared_ptr elem_; 24 | 25 | // collection_id -> replied_node 26 | std::map> received_replies_; 27 | std::map> callbacks_; 28 | }; 29 | 30 | } // namespace xyz 31 | 32 | -------------------------------------------------------------------------------- /core/scheduler/collection_status.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "glog/logging.h" 10 | 11 | namespace xyz { 12 | 
-------------------------------------------------------------------------------- /core/scheduler/collection_status.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <chrono>
4 | #include <map>
5 | #include <string>
6 | #include <utility>
7 | #include <vector>
8 | 
9 | #include "glog/logging.h"
10 | 
11 | namespace xyz {
12 | 
13 | class CollectionStatus {
14 | public:
15 |   using ReadWriteVector = std::pair<std::vector<int>, std::vector<int>>;
16 | 
17 |   std::string GetLastCP(int collection_id) const;
18 | 
19 |   void AddCP(int collection_id, std::string url);
20 |   void AddPlan(int id, const ReadWriteVector& p);
21 |   void FinishPlan(int plan_id);
22 |   std::vector<int> GetCurrentPlans();
23 |   std::string DebugString() const;
24 |   std::vector<int> GetReads() const;
25 |   std::vector<int> GetWrites() const;
26 | 
27 |   std::vector<std::pair<int, std::string>> GetReadsAndCP() const;
28 |   std::vector<std::pair<int, std::string>> GetWritesAndCP() const;
29 | private:
30 |   std::map<int, ReadWriteVector> cur_plans_;
31 |   std::map<int, std::chrono::system_clock::time_point> plan_time_;
32 |   std::map<int, int> read_ids_;
33 |   std::map<int, int> write_ids_;
34 | 
35 |   std::map<int, std::string> last_cp_;
36 | };
37 | 
38 | } // namespace xyz
39 | 
40 | 
-------------------------------------------------------------------------------- /core/scheduler/collection_view.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <sstream>
4 | 
5 | #include "core/index/simple_part_to_node_mapper.hpp"
6 | 
7 | namespace xyz {
8 | 
9 | struct CollectionView {
10 |   int collection_id;
11 |   int num_partition;
12 |   SimplePartToNodeMapper mapper;
13 | 
14 |   std::string DebugString() const {
15 |     std::stringstream ss;
16 |     ss << "{ collection_id: " << collection_id;
17 |     ss << ", num_partition: " << num_partition;
18 |     ss << ", setup?: "
19 |        << (mapper.GetNumParts() == num_partition ? "True" : "False");
20 |     ss << ", mapper: " << mapper.DebugString();
21 |     ss << "}";
22 |     return ss.str();
23 |   }
24 | 
25 |   friend SArrayBinStream &operator<<(xyz::SArrayBinStream &stream,
26 |                                      const CollectionView &c) {
27 |     stream << c.collection_id << c.num_partition << c.mapper;
28 |     return stream;
29 |   }
30 | 
31 |   friend SArrayBinStream &operator>>(xyz::SArrayBinStream &stream,
32 |                                      CollectionView &c) {
33 |     stream >> c.collection_id >> c.num_partition >> c.mapper;
34 |     return stream;
35 |   }
36 | };
37 | 
38 | } // namespace xyz
39 | 
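CollectionView is shipped between scheduler and workers through the two stream operators above; a round-trip sketch (the ids and the omitted mapper setup are illustrative):

    CollectionView cv;
    cv.collection_id = 1;
    cv.num_partition = 4;
    // cv.mapper would normally be filled in by DistributeManager.

    SArrayBinStream bin;
    bin << cv;              // serializes id, num_partition, mapper

    CollectionView received;
    bin >> received;        // worker-side deserialization
    LOG(INFO) << received.DebugString();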
-------------------------------------------------------------------------------- /core/scheduler/control.cpp: --------------------------------------------------------------------------------
1 | #include "core/scheduler/control.hpp"
2 | 
3 | namespace xyz {
4 | 
5 | constexpr const char* ControllerMsg::FlagName[];
6 | constexpr const char* MigrateMeta::FlagName[];
7 | 
8 | } // namespace xyz
9 | 
10 | 
-------------------------------------------------------------------------------- /core/scheduler/dag_runner.cpp: --------------------------------------------------------------------------------
1 | #include "core/scheduler/dag_runner.hpp"
2 | 
3 | #include <algorithm>
4 | 
5 | #include "glog/logging.h"
6 | 
7 | namespace xyz {
8 | 
9 | std::vector<int> SequentialDagRunner::GetRunnablePlans() {
10 |   auto f = dag_visitor_.GetFront();
11 |   CHECK(std::find(f.begin(), f.end(), plan_count_) != f.end());
12 |   return {plan_count_};
13 | }
14 | 
15 | void SequentialDagRunner::Finish(int plan_id) {
16 |   CHECK_EQ(plan_id, plan_count_);
17 |   dag_visitor_.Finish(plan_id);
18 |   plan_count_ += 1;
19 | }
20 | 
21 | int SequentialDagRunner::GetNumRemainingPlans() {
22 |   return dag_visitor_.GetNumDagNodes();
23 | }
24 | 
25 | // wide dag runner
26 | std::vector<int> WideDagRunner::GetRunnablePlans() {
27 |   auto f = dag_visitor_.GetFront();
28 |   std::vector<int> ret;
29 |   for (auto plan : f) {
30 |     if (running_.find(plan) == running_.end()) {
31 |       ret.push_back(plan);
32 |       running_.insert(plan);
33 |     }
34 |   }
35 |   return ret;
36 | }
37 | 
38 | void WideDagRunner::Finish(int plan_id) {
39 |   CHECK(running_.find(plan_id) != running_.end());
40 |   running_.erase(plan_id);
41 |   dag_visitor_.Finish(plan_id);
42 | }
43 | 
44 | int WideDagRunner::GetNumRemainingPlans() {
45 |   return dag_visitor_.GetNumDagNodes();
46 | }
47 | 
48 | } // namespace xyz
49 | 
50 | 
-------------------------------------------------------------------------------- /core/scheduler/dag_runner.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "core/plan/dag.hpp"
4 | 
5 | namespace xyz {
6 | 
7 | struct AbstractDagRunner {
8 |   virtual std::vector<int> GetRunnablePlans() = 0;
9 |   virtual void Finish(int) = 0;
10 |   virtual int GetNumRemainingPlans() = 0;
11 | };
12 | 
13 | /*
14 |  * this dag runner runs plans one by one according to
15 |  * the definition order.
16 |  */
17 | class SequentialDagRunner : public AbstractDagRunner {
18 | public:
19 |   SequentialDagRunner(const Dag& dag):
20 |     dag_visitor_(dag) {
21 |     num_plans_ = dag_visitor_.GetNumDagNodes();
22 |     plan_count_ = 0;
23 |   }
24 |   virtual std::vector<int> GetRunnablePlans() override;
25 |   virtual void Finish(int) override;
26 |   virtual int GetNumRemainingPlans() override;
27 | private:
28 |   DagVistor dag_visitor_;
29 |   int plan_count_ = 0;
30 |   int num_plans_ = 0;
31 | };
32 | 
33 | /*
34 |  * run as many plans as possible
35 |  */
36 | class WideDagRunner : public AbstractDagRunner {
37 | public:
38 |   WideDagRunner(const Dag& dag):
39 |     dag_visitor_(dag) {
40 |     num_plans_ = dag_visitor_.GetNumDagNodes();
41 |     plan_count_ = 0;
42 |   }
43 |   virtual std::vector<int> GetRunnablePlans() override;
44 |   virtual void Finish(int) override;
45 |   virtual int GetNumRemainingPlans() override;
46 | private:
47 |   DagVistor dag_visitor_;
48 |   int plan_count_ = 0;
49 |   int num_plans_ = 0;
50 | 
51 |   std::set<int> running_;
52 | };
53 | 
54 | } // namespace xyz
55 | 
56 | 
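The two runners differ only in how much of the DAG front they release at once: SequentialDagRunner hands out exactly the next plan in definition order, while WideDagRunner hands out every front plan not already running. A scheduler-side sketch (RunPlan and WaitForAnyFinish are hypothetical stand-ins for the real dispatch and completion handling):

    WideDagRunner runner(dag);
    while (runner.GetNumRemainingPlans() > 0) {
      for (int plan_id : runner.GetRunnablePlans()) {
        RunPlan(plan_id);              // hypothetical: start the plan
      }
      int done = WaitForAnyFinish();   // hypothetical: block for a completion
      runner.Finish(done);             // unlocks successors in the DAG
    }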
-------------------------------------------------------------------------------- /core/scheduler/distribute_manager.cpp: --------------------------------------------------------------------------------
1 | #include "core/scheduler/distribute_manager.hpp"
2 | 
3 | namespace xyz {
4 | 
5 | void DistributeManager::Distribute(SpecWrapper spec_wrapper) {
6 |   auto spec = static_cast<DistributeSpec*>(spec_wrapper.spec.get());
7 |   LOG(INFO) << "[Scheduler] Distribute {plan_id, collection_id}: {"
8 |             << spec_wrapper.id << "," << spec->collection_id << "}";
9 |   part_expected_map_[spec_wrapper.id] = spec->num_partition;
10 |   // round-robin
11 |   auto node_iter = elem_->nodes.begin();
12 |   for (int i = 0; i < spec->num_partition; ++i) {
13 |     CHECK(node_iter != elem_->nodes.end());
14 |     Message msg;
15 |     msg.meta.sender = 0;
16 |     msg.meta.recver = GetWorkerQid(node_iter->second.node.id);
17 |     msg.meta.flag = Flag::kOthers;
18 |     SArrayBinStream ctrl_bin, bin;
19 |     ctrl_bin << ScheduleFlag::kDistribute;
20 |     bin << i << spec_wrapper.id;
21 |     spec->ToBin(bin);
22 |     msg.AddData(ctrl_bin.ToSArray());
23 |     msg.AddData(bin.ToSArray());
24 |     elem_->sender->Send(std::move(msg));
25 | 
26 |     node_iter++;
27 |     if (node_iter == elem_->nodes.end()) {
28 |       node_iter = elem_->nodes.begin();
29 |     }
30 |   }
31 | }
32 | 
33 | void DistributeManager::FinishDistribute(SArrayBinStream bin) {
34 |   int collection_id, part_id, node_id, plan_id;
35 |   bin >> collection_id >> part_id >> node_id >> plan_id;
36 |   distribute_map_[collection_id][part_id] = node_id;
37 |   if (distribute_map_[collection_id].size() == part_expected_map_[plan_id]) {
38 |     LOG(INFO) << "[Scheduler] Distribute {plan_id, collection_id}: {"
39 |               << plan_id << "," << collection_id << "} done";
40 |     // construct the collection view
41 |     std::vector<int> part_to_node(part_expected_map_[plan_id]);
42 |     for (int i = 0; i < part_to_node.size(); ++i) {
43 |       CHECK(distribute_map_[collection_id].find(i) !=
44 |             distribute_map_[collection_id].end());
45 |       part_to_node[i] = distribute_map_[collection_id][i];
46 |     }
47 |     CollectionView cv;
48 |     cv.collection_id = collection_id;
49 |     cv.mapper = SimplePartToNodeMapper(part_to_node);
50 |     cv.num_partition = cv.mapper.GetNumParts();
51 |     elem_->collection_map->Insert(cv);
52 |     LOG(INFO) << cv.DebugString();
53 | 
54 |     // trigger InitWorkers
55 |     collection_manager_->Update(collection_id, [this, plan_id]() {
56 |       SArrayBinStream reply_bin;
57 |       reply_bin << plan_id;
58 |       ToScheduler(elem_, ScheduleFlag::kFinishPlan, reply_bin);
59 |     });
60 |   }
61 | }
62 | 
63 | } // namespace xyz
-------------------------------------------------------------------------------- /core/scheduler/distribute_manager.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "core/scheduler/scheduler_elem.hpp"
4 | #include "core/scheduler/collection_manager.hpp"
5 | 
6 | #include "core/plan/spec_wrapper.hpp"
7 | 
8 | namespace xyz {
9 | 
10 | class DistributeManager {
11 | public:
12 |   DistributeManager(std::shared_ptr<SchedulerElem> elem,
13 |                     std::shared_ptr<CollectionManager> collection_manager)
14 |       : elem_(elem), collection_manager_(collection_manager) {}
15 |   void Distribute(SpecWrapper spec);
16 |   void FinishDistribute(SArrayBinStream bin);
17 | 
18 | private:
19 |   std::map<int, int> part_expected_map_;
20 |   std::shared_ptr<SchedulerElem> elem_;
21 |   // collection_id, part_id, node_id
22 |   std::map<int, std::map<int, int>> distribute_map_;
23 | 
24 |   std::shared_ptr<CollectionManager> collection_manager_;
25 | };
26 | 
27 | } // namespace xyz
-------------------------------------------------------------------------------- /core/scheduler/recover_manager.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "core/scheduler/scheduler_elem.hpp"
4 | #include "core/scheduler/checkpoint_loader.hpp"
5 | #include "core/scheduler/collection_manager.hpp"
6 | #include "core/scheduler/collection_status.hpp"
7 | #include "core/plan/spec_wrapper.hpp"
8 | 
9 | #include <chrono>
10 | 
11 | namespace xyz {
12 | 
13 | class RecoverManager {
14 | public:
15 |   RecoverManager(std::shared_ptr<SchedulerElem> elem, std::shared_ptr<CollectionManager> collection_manager,
16 |                  std::shared_ptr<CheckpointLoader> checkpoint_loader)
17 |       : elem_(elem), collection_manager_(collection_manager), checkpoint_loader_(checkpoint_loader) {}
18 | 
19 |   // <collection_id, checkpoint_url>:
20 |   void Recover(std::set<int> dead_nodes,
21 |                std::vector<std::pair<int, std::string>> writes,
22 |                std::vector<std::pair<int, std::string>> reads,
23 |                std::function<void()> callback);
24 | 
25 |   std::vector<int> ReplaceDeadnodesAndReturnUpdated(
26 |       int cid, std::set<int> dead_nodes);
27 | 
28 |   enum class Type {
29 |     LoadCheckpoint, UpdateCollectionMap
30 |   };
31 |   void RecoverDoneForACollection(int cid, RecoverManager::Type type);
32 | private:
33 |   std::shared_ptr<SchedulerElem> elem_;
34 |   std::shared_ptr<CollectionManager> collection_manager_;
35 |   std::shared_ptr<CheckpointLoader> checkpoint_loader_;
36 | 
37 |   std::set<int> recovering_collections_;
38 |   std::set<int> updating_collections_;
39 | 
40 |   std::chrono::system_clock::time_point start_time_;
41 |   bool started_ = false;
42 |   std::function<void()> callback_;
43 | };
44 | 
45 | } // namespace xyz
46 | 
47 | 
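A sketch of how the scheduler might compose the pieces above on a node failure (the dead-node id and member handles are illustrative; the read/write split comes from CollectionStatus):

    // Collections written by the interrupted plan must be reloaded from
    // their last checkpoint; collections that were only read may just need
    // their partition-to-node map rebuilt and re-broadcast.
    std::set<int> dead_nodes = {5};  // hypothetical failed node id
    recover_manager_->Recover(dead_nodes,
                              collection_status_->GetWritesAndCP(),
                              collection_status_->GetReadsAndCP(),
                              []() { LOG(INFO) << "recovery done"; });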
-------------------------------------------------------------------------------- /core/scheduler/scheduler_elem.cpp: --------------------------------------------------------------------------------
1 | #include "core/scheduler/scheduler_elem.hpp"
2 | 
3 | 
4 | namespace xyz {
5 | 
6 | void SendToAllWorkers(std::shared_ptr<SchedulerElem> elem, ScheduleFlag flag, SArrayBinStream bin) {
7 |   SArrayBinStream ctrl_bin;
8 |   ctrl_bin << flag;
9 |   for (auto& node : elem->nodes) {
10 |     Message msg;
11 |     msg.meta.sender = 0;
12 |     msg.meta.recver = GetWorkerQid(node.second.node.id);
13 |     msg.meta.flag = Flag::kOthers;
14 |     msg.AddData(ctrl_bin.ToSArray());
15 |     msg.AddData(bin.ToSArray());
16 |     elem->sender->Send(std::move(msg));
17 |   }
18 | }
19 | 
20 | void SendTo(std::shared_ptr<SchedulerElem> elem, int node_id, ScheduleFlag flag, SArrayBinStream bin) {
21 |   SArrayBinStream ctrl_bin;
22 |   ctrl_bin << flag;
23 |   Message msg;
24 |   msg.meta.sender = 0;
25 |   msg.meta.recver = GetWorkerQid(node_id);
26 |   msg.meta.flag = Flag::kOthers;
27 |   msg.AddData(ctrl_bin.ToSArray());
28 |   msg.AddData(bin.ToSArray());
29 |   elem->sender->Send(std::move(msg));
30 | }
31 | 
32 | void ToScheduler(std::shared_ptr<SchedulerElem> elem, ScheduleFlag flag, SArrayBinStream bin) {
33 |   SArrayBinStream ctrl_bin;
34 |   ctrl_bin << flag;
35 |   Message msg;
36 |   msg.meta.sender = -1;
37 |   msg.meta.recver = 0;
38 |   msg.meta.flag = Flag::kOthers;
39 |   msg.AddData(ctrl_bin.ToSArray());
40 |   msg.AddData(bin.ToSArray());
41 |   elem->sender->Send(std::move(msg));
42 | }
43 | 
44 | void SendToAllControllers(std::shared_ptr<SchedulerElem> elem, ControllerFlag flag, int plan_id, SArrayBinStream bin) {
45 |   SArrayBinStream ctrl_bin, plan_bin;
46 |   ctrl_bin << flag;
47 |   plan_bin << plan_id;
48 |   for (auto& node : elem->nodes) {
49 |     Message msg;
50 |     msg.meta.sender = 0;
51 |     msg.meta.recver = GetControllerActorQid(node.second.node.id);
52 |     msg.meta.flag = Flag::kOthers;
53 |     msg.AddData(ctrl_bin.ToSArray());
54 |     msg.AddData(plan_bin.ToSArray());
55 |     msg.AddData(bin.ToSArray());
56 |     elem->sender->Send(std::move(msg));
57 |   }
58 | }
59 | 
60 | void SendToController(std::shared_ptr<SchedulerElem> elem, int node_id, ControllerFlag flag, int plan_id, SArrayBinStream bin) {
61 |   SArrayBinStream ctrl_bin, plan_bin;
62 |   ctrl_bin << flag;
63 |   plan_bin << plan_id;
64 |   Message msg;
65 |   msg.meta.sender = 0;
66 |   msg.meta.recver = GetControllerActorQid(node_id);
67 |   msg.meta.flag = Flag::kOthers;
68 |   msg.AddData(ctrl_bin.ToSArray());
69 |   msg.AddData(plan_bin.ToSArray());
70 |   msg.AddData(bin.ToSArray());
71 |   elem->sender->Send(std::move(msg));
72 | }
73 | 
74 | 
75 | }
76 | 
-------------------------------------------------------------------------------- /core/scheduler/scheduler_elem.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <map>
4 | #include <memory>
5 | 
6 | #include "base/node.hpp"
7 | #include "base/sarray_binstream.hpp"
8 | #include "comm/abstract_sender.hpp"
9 | #include "core/collection_map.hpp"
10 | #include "core/scheduler/control.hpp"
11 | #include "core/queue_node_map.hpp"
12 | 
13 | namespace xyz {
14 | 
15 | struct NodeInfo {
16 |   Node node;
17 |   int num_local_threads;
18 | };
19 | 
20 | struct SchedulerElem {
21 |   std::shared_ptr<AbstractSender> sender;
22 |   std::shared_ptr<CollectionMap> collection_map;
23 |   std::map<int, NodeInfo> nodes;
24 | };
25 | 
26 | void SendToAllControllers(std::shared_ptr<SchedulerElem> elem, ControllerFlag flag, int plan_id, SArrayBinStream bin);
27 | void SendToController(std::shared_ptr<SchedulerElem> elem, int node_id, ControllerFlag flag, int plan_id, SArrayBinStream bin);
28 | void SendToAllWorkers(std::shared_ptr<SchedulerElem> elem, ScheduleFlag flag, SArrayBinStream bin);
29 | void SendTo(std::shared_ptr<SchedulerElem> elem, int node_id, ScheduleFlag flag, SArrayBinStream bin);
30 | void ToScheduler(std::shared_ptr<SchedulerElem> elem, ScheduleFlag flag, SArrayBinStream bin);
31 | 
32 | 
33 | } // namespace xyz
34 | 
35 | 
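Every scheduler-side command in this directory funnels through these helpers: pack a payload into an SArrayBinStream, pick a flag, and choose unicast or broadcast. A sketch with an arbitrary command (the payload layout is per-command; the variable names are illustrative):

    SArrayBinStream bin;
    bin << collection_id << part_id << dest_url;           // per-command layout
    SendTo(elem_, node_id, ScheduleFlag::kCheckpoint, bin);         // one worker
    SendToAllWorkers(elem_, ScheduleFlag::kUpdateCollection, bin);  // broadcast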
-------------------------------------------------------------------------------- /core/scheduler/worker.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <functional>
4 | #include <future>
5 | #include <memory>
6 | #include <sstream>
7 | 
8 | #include "base/actor.hpp"
9 | #include "base/sarray_binstream.hpp"
10 | #include "comm/abstract_sender.hpp"
11 | #include "core/engine_elem.hpp"
12 | #include "core/index/simple_part_to_node_mapper.hpp"
13 | #include "core/plan/function_store.hpp"
14 | #include "core/plan/plan_spec.hpp"
15 | 
16 | #include "core/program_context.hpp"
17 | 
18 | #include "glog/logging.h"
19 | #include "io/io_wrapper.hpp"
20 | 
21 | namespace xyz {
22 | 
23 | class Worker : public Actor {
24 | public:
25 |   Worker(int qid, EngineElem engine_elem,
26 |          std::shared_ptr<IOWrapper> io_wrapper,
27 |          std::function<std::shared_ptr<AbstractBlockReader>()> block_reader_getter)
28 |       : Actor(qid), engine_elem_(engine_elem),
29 |         io_wrapper_(io_wrapper),
30 |         block_reader_getter_(block_reader_getter) {
31 |     Start();
32 |   }
33 |   virtual ~Worker() override { Stop(); }
34 | 
35 |   // public api:
36 |   // SetProgram should be called before kStart is received.
37 |   void SetProgram(ProgramContext program) {
38 |     program_ = program;
39 |     is_program_set_ = true;
40 |   }
41 | 
42 |   void RegisterProgram();
43 | 
44 |   // Wait until the end signal.
45 |   void Wait();
46 | 
47 |   virtual void Process(Message msg) override;
48 | 
49 |   // The scheduler requests program from workers.
50 |   void StartCluster();
51 | 
52 |   void UpdateCollection(SArrayBinStream bin);
53 | 
54 |   void RunDummy();
55 | 
56 |   void LoadBlock(SArrayBinStream bin);
57 |   void Distribute(SArrayBinStream bin);
58 |   void CheckPoint(SArrayBinStream bin);
59 |   void LoadCheckPoint(SArrayBinStream bin);
60 |   void WritePartition(SArrayBinStream bin);
61 | 
62 |   void SendMsgToScheduler(ScheduleFlag flag, SArrayBinStream bin);
63 | 
64 |   void Exit();
65 | private:
66 |   int Id() { return engine_elem_.node.id; }
67 |   std::string WorkerId() {
68 |     std::stringstream ss;
69 |     ss << "[Worker " << Id() << "]: ";
70 |     return ss.str();
71 |   }
72 |   EngineElem engine_elem_;
73 |   std::shared_ptr<IOWrapper> io_wrapper_;
74 |   std::function<std::shared_ptr<AbstractBlockReader>()> block_reader_getter_;
75 | 
76 |   std::promise<void> exit_promise_;
77 | 
78 |   ProgramContext program_;
79 |   bool is_program_set_ = false;
80 | 
81 |   bool ready_ = false;
82 | };
83 | 
84 | } // namespace xyz
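Pieced together from the comments above, the intended lifecycle of a Worker looks like this (mailbox setup and queue registration are elided; `program` is assumed to be built by the user program):

    Worker worker(qid, engine_elem, io_wrapper, block_reader_getter);
    worker.SetProgram(program);   // must happen before kStart arrives
    worker.RegisterProgram();     // announce the program to the scheduler
    worker.Wait();                // blocks until Exit() fulfils exit_promise_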
-------------------------------------------------------------------------------- /core/scheduler/write_manager.cpp: --------------------------------------------------------------------------------
1 | #include "core/scheduler/write_manager.hpp"
2 | 
3 | 
4 | namespace xyz {
5 | 
6 | void WriteManager::Write(SpecWrapper s) {
7 |   CHECK(s.type == SpecWrapper::Type::kWrite);
8 |   auto* write_spec = static_cast<WriteSpec*>(s.spec.get());
9 |   int id = write_spec->collection_id;
10 |   int plan_id = s.id;
11 |   cid_pid_[id] = plan_id;
12 |   std::string url = write_spec->url;
13 |   auto& collection_view = elem_->collection_map->Get(id);
14 |   reply_count_map[id] = 0;
15 |   expected_reply_count_map[id] = collection_view.mapper.GetNumParts();
16 |   LOG(INFO) << "[Scheduler] writing to " << expected_reply_count_map[id] << " partitions";
17 |   for (int i = 0; i < collection_view.mapper.GetNumParts(); ++ i) {
18 |     int node_id = collection_view.mapper.Get(i);
19 |     SArrayBinStream bin;
20 |     std::string dest_url = url + "/part-" + std::to_string(i);
21 |     bin << id << i << dest_url; // collection_id, partition_id, url
22 |     SendTo(elem_, node_id, ScheduleFlag::kWritePartition, bin);
23 |   }
24 | }
25 | 
26 | void WriteManager::FinishWritePartition(SArrayBinStream bin) {
27 |   int qid, collection_id;
28 |   bin >> qid >> collection_id;
29 |   reply_count_map[collection_id] += 1;
30 |   if (reply_count_map[collection_id] == expected_reply_count_map[collection_id]) {
31 |     SArrayBinStream reply_bin;
32 |     reply_bin << cid_pid_[collection_id];
33 |     ToScheduler(elem_, ScheduleFlag::kFinishPlan, reply_bin);
34 |   }
35 | }
36 | 
37 | }
-------------------------------------------------------------------------------- /core/scheduler/write_manager.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "core/scheduler/scheduler_elem.hpp"
4 | 
5 | #include "core/plan/spec_wrapper.hpp"
6 | 
7 | namespace xyz {
8 | 
9 | class WriteManager {
10 | public:
11 |   WriteManager(std::shared_ptr<SchedulerElem> elem)
12 |       : elem_(elem) {}
13 |   void Write(SpecWrapper spec);
14 |   void FinishWritePartition(SArrayBinStream bin);
15 | 
16 | private:
17 |   std::map<int, int> reply_count_map;
18 |   std::map<int, int> expected_reply_count_map;
19 |   std::shared_ptr<SchedulerElem> elem_;
20 | 
21 |   // collection_id -> plan_id
22 |   std::map<int, int> cid_pid_;
23 | };
24 | 
25 | } // namespace xyz
-------------------------------------------------------------------------------- /core/shuffle_meta.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <sstream>
4 | 
5 | namespace xyz {
6 | 
7 | struct ShuffleMeta {
8 |   int plan_id;
9 |   int collection_id;
10 |   int part_id;
11 |   int upstream_part_id;
12 | 
13 |   std::string DebugString() const {
14 |     std::stringstream ss;
15 |     ss << "{";
16 |     ss << " plan_id: " << plan_id;
17 |     ss << ", collection_id: " << collection_id;
18 |     ss << ", part_id: " << part_id;
19 |     ss << ", upstream_part_id: " << upstream_part_id;
20 |     ss << " }";
21 |     return ss.str();
22 |   }
23 | };
24 | 
25 | } // namespace xyz
26 | 
27 | 
-------------------------------------------------------------------------------- /core/tmp.cpp: --------------------------------------------------------------------------------
1 | #include "core/tmp.hpp"
2 | 
3 | namespace xyz {
4 | 
5 | void C::f() {}
6 | 
7 | } // namespace xyz
-------------------------------------------------------------------------------- /core/tmp.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | namespace xyz {
4 | 
5 | class C {
6 | public:
7 |   void f();
8 | };
9 | 
10 | } // namespace xyz
-------------------------------------------------------------------------------- /core/worker/abstract_plan_controller.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "core/plan/spec_wrapper.hpp"
4 | #include "base/message.hpp"
5 | 
6 | namespace xyz {
7 | 
8 | struct AbstractPlanController {
9 |   virtual ~AbstractPlanController() = default;
10 |   virtual void Setup(SpecWrapper spec) = 0;
11 |   virtual void StartPlan() = 0;
12 |   virtual void FinishMap(SArrayBinStream bin) = 0;
13 |   virtual void FinishJoin(SArrayBinStream bin) = 0;
14 |   virtual void UpdateVersion(SArrayBinStream bin) = 0;
15 |   virtual void ReceiveJoin(Message msg) = 0;
16 |   virtual void ReceiveFetchRequest(Message msg) = 0;
17 |   virtual void FinishFetch(SArrayBinStream bin) = 0;
18 |   virtual void FinishCheckpoint(SArrayBinStream bin) = 0;
19 | 
20 |   virtual void MigratePartition(Message msg) = 0;
21 |   virtual void FinishLoadWith(SArrayBinStream bin) = 0;
22 | 
23 |   virtual void ReassignMap(SArrayBinStream bin) = 0;
24 | 
25 |   virtual void DisplayTime() = 0;
26 | };
27 | 
28 | } // namespace xyz
29 | 
30 | 
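The interface above is purely message-driven; a do-nothing stub (hypothetical, e.g. for tests) makes the full contract explicit:

    struct NoopPlanController : public AbstractPlanController {
      void Setup(SpecWrapper) override {}
      void StartPlan() override {}
      void FinishMap(SArrayBinStream) override {}
      void FinishJoin(SArrayBinStream) override {}
      void UpdateVersion(SArrayBinStream) override {}
      void ReceiveJoin(Message) override {}
      void ReceiveFetchRequest(Message) override {}
      void FinishFetch(SArrayBinStream) override {}
      void FinishCheckpoint(SArrayBinStream) override {}
      void MigratePartition(Message) override {}
      void FinishLoadWith(SArrayBinStream) override {}
      void ReassignMap(SArrayBinStream) override {}
      void DisplayTime() override {}
    };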
-------------------------------------------------------------------------------- /core/worker/controller.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <memory>
4 | 
5 | #include "base/actor.hpp"
6 | #include "core/worker/abstract_plan_controller.hpp"
7 | #include "core/engine_elem.hpp"
8 | #include "io/io_wrapper.hpp"
9 | 
10 | #include <chrono>
11 | #include <boost/thread/shared_mutex.hpp>
12 | 
13 | namespace xyz {
14 | 
15 | class Controller : public Actor {
16 | public:
17 |   Controller(int qid, EngineElem engine_elem, std::shared_ptr<IOWrapper> io_wrapper)
18 |       : Actor(qid), engine_elem_(engine_elem),
19 |         io_wrapper_(io_wrapper) {
20 |     Start();
21 |   }
22 | 
23 |   virtual ~Controller() override {
24 |     Stop();
25 |   }
26 | 
27 |   virtual void Process(Message msg) override;
28 | 
29 |   void Setup(SArrayBinStream bin);
30 |   void TerminatePlan(int plan_id);
31 |   void SendMsgToScheduler(SArrayBinStream bin);
32 | 
33 |   std::shared_ptr<IOWrapper> io_wrapper_;
34 |   EngineElem engine_elem_;
35 |   std::map<int, std::shared_ptr<AbstractPlanController>> plan_controllers_;
36 |   std::map<int, bool> erased;
37 |   boost::shared_mutex erase_mu_;
38 | 
39 |   struct Timer {
40 |     std::chrono::microseconds plan_time{0};
41 |     std::chrono::microseconds control_time{0};
42 |     std::chrono::time_point<std::chrono::steady_clock> start_time;
43 |   };
44 |   std::map<int, Timer> plan_timer_;
45 | };
46 | 
47 | } // namespace xyz
48 | 
49 | 
-------------------------------------------------------------------------------- /core/worker/delayed_combiner.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <atomic>
4 | #include <mutex>
5 | #include <thread>
6 | #include <vector>
7 | 
8 | #include "core/worker/plan_controller.hpp"
9 | 
10 | #include "core/map_output/map_output_stream.hpp"
11 | #include "core/map_output/partitioned_map_output.hpp"
12 | 
13 | namespace xyz {
14 | 
15 | class PlanController;
16 | class DelayedCombiner {
17 | public:
18 |   using StreamPair = std::pair<int, std::shared_ptr<MapOutputStream>>;
19 | 
20 |   DelayedCombiner(PlanController* plan_controller, int combine_timeout);
21 | 
22 |   ~DelayedCombiner() {
23 |     finished_.store(true);
24 |     if (detect_thread_.joinable()) {
25 |       detect_thread_.join();
26 |     }
27 |   }
28 | 
29 |   void AddMapOutput(int upstream_part_id, int version,
30 |                     std::shared_ptr<AbstractMapOutput> map_output);
31 |   void AddStream(int upstream_part_id, int version, int part_id,
32 |                  std::shared_ptr<MapOutputStream> stream);
33 |   void PeriodicCombine();
34 |   void Submit(int part_id, int version, std::vector<StreamPair> v);
35 |   void CombineSerializeSend(int part_id, int version, std::vector<StreamPair> v);
36 |   void PrepareMsgAndSend(int part_id, int version,
37 |                          std::vector<int> upstream_part_ids, std::shared_ptr<MapOutputStream> stream);
38 | 
39 |   void Detect();
40 | private:
41 |   std::thread detect_thread_;
42 |   std::mutex mu_;
43 |   // part_id -> version -> vector of <upstream_part_id, stream>
44 |   std::vector<std::map<int, std::vector<StreamPair>>> store_;
45 |   PlanController* plan_controller_;
46 | 
47 |   std::shared_ptr<Executor> executor_;
48 |   // combine_timeout_:
49 |   // <0 : send without combine
50 |   // 0: directly combine and send
51 |   // 0-kMaxCombineTimeout: timeout in ms
52 |   // >kMaxCombineTimeout: shuffle combine
53 |   const int combine_timeout_ = 0;
54 |   std::atomic<bool> finished_{false};
55 | };
56 | 
57 | } // namespace xyz
58 | 
59 | 
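The combine_timeout_ comment above encodes four policies in a single integer; a sketch of the dispatch it implies (kMaxCombineTimeout is named in the comment; the branch bodies are illustrative placeholders, not this class's methods):

    if (combine_timeout_ < 0) {
      SendWithoutCombine();                // ship map outputs as they come
    } else if (combine_timeout_ == 0) {
      CombineAndSendNow();                 // combine eagerly, no batching
    } else if (combine_timeout_ <= kMaxCombineTimeout) {
      WaitMilliseconds(combine_timeout_);  // let Detect() batch outputs
      CombineAndSendNow();
    } else {
      ShuffleCombine();                    // combine on the shuffle path
    }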
"core/worker/plan_controller.hpp" 5 | #include "core/worker/controller.hpp" 6 | #include "io/fake_reader.hpp" 7 | #include "io/fake_writer.hpp" 8 | 9 | namespace xyz { 10 | namespace { 11 | 12 | class TestPlanController : public testing::Test {}; 13 | 14 | TEST_F(TestPlanController, Create) { 15 | int qid = 0; 16 | EngineElem elem; 17 | elem.num_local_threads = 1; 18 | elem.num_update_threads = 1; 19 | auto io_wrapper = std::make_shared( 20 | []() { return std::make_shared(); }, 21 | []() { return std::make_shared(); }); 22 | Controller controller(qid, elem, io_wrapper); 23 | PlanController plan_controller(&controller); 24 | } 25 | 26 | } // namespace 27 | } // namespace xyz 28 | -------------------------------------------------------------------------------- /examples/crawler_util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from bs4 import BeautifulSoup 3 | from urlparse import urljoin 4 | from urllib2 import urlopen 5 | import ssl 6 | 7 | def get_page(url): 8 | """Get the text of the web page at the given URL 9 | return a string containing the content""" 10 | 11 | context = ssl._create_unverified_context() 12 | fd = urlopen(url, context=context) 13 | content = fd.read() 14 | fd.close() 15 | 16 | return content.decode('utf8',"ignore") 17 | 18 | def get_links(url): 19 | """Scan the text for http URLs and return a set 20 | of URLs found, without duplicates""" 21 | 22 | # look for any http URL in the page 23 | links = set() 24 | 25 | text = get_page(url) 26 | soup = BeautifulSoup(text, "lxml") 27 | 28 | for link in soup.find_all('a'): 29 | if 'href' in link.attrs: 30 | newurl = link.attrs['href'] 31 | # resolve relative URLs 32 | if newurl.startswith('/'): 33 | newurl = urljoin(url, newurl) 34 | # ignore any URL that doesn't now start with http 35 | if newurl.startswith('http'): 36 | links.add(newurl) 37 | 38 | return links 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser(description='Process some integers.') 42 | parser.add_argument('url', help='url to crawl') 43 | 44 | args = parser.parse_args() 45 | links = get_links(args.url) 46 | print u' '.join(links).encode('utf8') 47 | 48 | 49 | -------------------------------------------------------------------------------- /examples/kmeans/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | # KmeansExample 4 | add_executable(KmeansExample kmeans.cpp) 5 | target_link_libraries(KmeansExample xyz) 6 | target_link_libraries(KmeansExample ${HUSKY_EXTERNAL_LIB}) 7 | set_property(TARGET KmeansExample PROPERTY CXX_STANDARD 14) 8 | add_dependencies(KmeansExample ${external_project_dependencies}) 9 | add_dependencies(KmeansExample ${external_project_dependencies}) 10 | 11 | # KmeansRowExample 12 | add_executable(KmeansRowExample kmeans_row.cpp) 13 | target_link_libraries(KmeansRowExample xyz) 14 | target_link_libraries(KmeansRowExample ${HUSKY_EXTERNAL_LIB}) 15 | set_property(TARGET KmeansRowExample PROPERTY CXX_STANDARD 14) 16 | add_dependencies(KmeansRowExample ${external_project_dependencies}) 17 | add_dependencies(KmeansRowExample ${external_project_dependencies}) 18 | -------------------------------------------------------------------------------- /examples/load_example.cpp: -------------------------------------------------------------------------------- 1 | #include "core/plan/runner.hpp" 2 | 3 | DEFINE_string(url, "", "The url for hdfs 
file"); 4 | DEFINE_string(output_url, "", ""); 5 | 6 | using namespace xyz; 7 | 8 | struct ObjT { 9 | using KeyT = std::string; 10 | using ValT = int; 11 | ObjT() = default; 12 | ObjT(KeyT key) : a(key), b(0) {} 13 | KeyT Key() const { return a; } 14 | KeyT a; 15 | int b; 16 | friend SArrayBinStream &operator<<(xyz::SArrayBinStream &stream, 17 | const ObjT &obj) { 18 | stream << obj.a << obj.b; 19 | return stream; 20 | } 21 | friend SArrayBinStream &operator>>(xyz::SArrayBinStream &stream, ObjT &obj) { 22 | stream >> obj.a >> obj.b; 23 | return stream; 24 | } 25 | }; 26 | 27 | int main(int argc, char **argv) { 28 | Runner::Init(argc, argv); 29 | 30 | auto c1 = Context::load(FLAGS_url, [](std::string s) { return s; }); 31 | auto c2 = Context::placeholder(1); 32 | 33 | auto p = Context::mapupdate( 34 | c1, c2, 35 | [](std::string word, Output *o) { o->Add(word, 1); }, 36 | [](ObjT *obj, int m) { 37 | obj->b += m; 38 | LOG(INFO) << "update result: " << obj->a << " " << obj->b; 39 | }); 40 | 41 | Context::write(c2, FLAGS_output_url, 42 | [](const ObjT &obj, std::stringstream &ss) { 43 | ss << obj.a << " " << obj.b << "\n"; 44 | }); 45 | 46 | Runner::Run(); 47 | } 48 | -------------------------------------------------------------------------------- /examples/lr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | # DenseLRExample 4 | add_executable(DenseLRExample dense_lr.cpp) 5 | target_link_libraries(DenseLRExample xyz) 6 | target_link_libraries(DenseLRExample ${HUSKY_EXTERNAL_LIB}) 7 | set_property(TARGET DenseLRExample PROPERTY CXX_STANDARD 14) 8 | add_dependencies(DenseLRExample ${external_project_dependencies}) 9 | add_dependencies(DenseLRExample ${external_project_dependencies}) 10 | 11 | # DenseLRExample2 12 | add_executable(DenseLRExample2 dense_lr_2.cpp) 13 | target_link_libraries(DenseLRExample2 xyz) 14 | target_link_libraries(DenseLRExample2 ${HUSKY_EXTERNAL_LIB}) 15 | set_property(TARGET DenseLRExample2 PROPERTY CXX_STANDARD 14) 16 | add_dependencies(DenseLRExample2 ${external_project_dependencies}) 17 | add_dependencies(DenseLRExample2 ${external_project_dependencies}) 18 | 19 | # SparseLRExample 20 | add_executable(SparseLRExample sparse_lr.cpp) 21 | target_link_libraries(SparseLRExample xyz) 22 | target_link_libraries(SparseLRExample ${HUSKY_EXTERNAL_LIB}) 23 | set_property(TARGET SparseLRExample PROPERTY CXX_STANDARD 14) 24 | add_dependencies(SparseLRExample ${external_project_dependencies}) 25 | add_dependencies(SparseLRExample ${external_project_dependencies}) 26 | 27 | # DenseLRRowExample 28 | add_executable(DenseLRRowExample dense_lr_row.cpp) 29 | target_link_libraries(DenseLRRowExample xyz) 30 | target_link_libraries(DenseLRRowExample ${HUSKY_EXTERNAL_LIB}) 31 | set_property(TARGET DenseLRRowExample PROPERTY CXX_STANDARD 14) 32 | add_dependencies(DenseLRRowExample ${external_project_dependencies}) 33 | add_dependencies(DenseLRRowExample ${external_project_dependencies}) 34 | -------------------------------------------------------------------------------- /examples/pagerank/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | # PageRank 4 | add_executable(PageRank pagerank.cpp) 5 | target_link_libraries(PageRank xyz) 6 | target_link_libraries(PageRank ${HUSKY_EXTERNAL_LIB}) 7 | set_property(TARGET PageRank PROPERTY CXX_STANDARD 
14) 8 | add_dependencies(PageRank ${external_project_dependencies}) 9 | 10 | # PageRankWith 11 | add_executable(PageRankWith pagerank_with.cpp) 12 | target_link_libraries(PageRankWith xyz) 13 | target_link_libraries(PageRankWith ${HUSKY_EXTERNAL_LIB}) 14 | set_property(TARGET PageRankWith PROPERTY CXX_STANDARD 14) 15 | add_dependencies(PageRankWith ${external_project_dependencies}) 16 | 17 | # PageRankConverge 18 | add_executable(PageRankConverge pagerank-converge.cpp) 19 | target_link_libraries(PageRankConverge xyz) 20 | target_link_libraries(PageRankConverge ${HUSKY_EXTERNAL_LIB}) 21 | set_property(TARGET PageRankConverge PROPERTY CXX_STANDARD 14) 22 | add_dependencies(PageRankConverge ${external_project_dependencies}) 23 | 24 | # ComparePR 25 | add_executable(ComparePR compare_pr.cpp) 26 | target_link_libraries(ComparePR xyz) 27 | target_link_libraries(ComparePR ${HUSKY_EXTERNAL_LIB}) 28 | set_property(TARGET ComparePR PROPERTY CXX_STANDARD 14) 29 | add_dependencies(ComparePR ${external_project_dependencies}) 30 | 31 | # Sum 32 | add_executable(Sum sum.cpp) 33 | target_link_libraries(Sum xyz) 34 | target_link_libraries(Sum ${HUSKY_EXTERNAL_LIB}) 35 | set_property(TARGET Sum PROPERTY CXX_STANDARD 14) 36 | add_dependencies(Sum ${external_project_dependencies}) 37 | 38 | # PageRankBsp 39 | add_executable(PageRankBsp pagerank-converge-bsp.cpp) 40 | target_link_libraries(PageRankBsp xyz) 41 | target_link_libraries(PageRankBsp ${HUSKY_EXTERNAL_LIB}) 42 | set_property(TARGET PageRankBsp PROPERTY CXX_STANDARD 14) 43 | add_dependencies(PageRankBsp ${external_project_dependencies}) 44 | -------------------------------------------------------------------------------- /examples/pagerank/compare_pr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os.path 5 | from os.path import dirname, realpath 6 | 7 | proj_dir = dirname(dirname(dirname(realpath(__file__)))) 8 | print proj_dir 9 | sys.path.append(proj_dir+"/scripts/") 10 | 11 | from launcher import Launcher 12 | 13 | hostfile = "machinefiles/20nodes" 14 | progfile = "release/ComparePR" 15 | schedulerfile = "release/SchedulerMain" 16 | 17 | common_params = { 18 | "scheduler" : "proj99", 19 | "scheduler_port" : "33227", 20 | "hdfs_namenode" : "proj99", 21 | "hdfs_port" : 9000, 22 | } 23 | 24 | program_params = { 25 | "url1": "/tmp/tmp/yz/tmp/0408/pr/9", 26 | "url2": "/tmp/tmp/yz/tmp/0417/0/", 27 | } 28 | 29 | scheduler_params = { 30 | "dag_runner_type" : "sequential", 31 | } 32 | 33 | env_params = ( 34 | "GLOG_logtostderr=true " 35 | "GLOG_v=-1 " 36 | "GLOG_minloglevel=0 " 37 | # this is to enable hdfs short-circuit read (disable the warning info) 38 | # change this path accordingly when we use other cluster 39 | # the current setting is for proj5-10 40 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 41 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 42 | ) 43 | 44 | dump_core = False 45 | l = Launcher(schedulerfile, progfile, hostfile, 46 | common_params, scheduler_params, program_params, env_params, 47 | dump_core) 48 | 49 | l.Launch(sys.argv) 50 | 51 | # for i in xrange(5): 52 | # program_params["url2"] = "/tmp/tmp/yz/tmp/0417/"+str(i) 53 | # l = Launcher(schedulerfile, progfile, hostfile, 54 | # common_params, scheduler_params, program_params, env_params, 55 | # dump_core) 56 | # 57 | # l.Launch(sys.argv) 58 | 59 | -------------------------------------------------------------------------------- 
/examples/pagerank/pagerank-converge-bsp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | from os.path import dirname, realpath 6 | proj_dir = dirname(dirname(dirname(realpath(__file__)))) 7 | sys.path.append(proj_dir+"/scripts/") 8 | 9 | from launcher import Launcher 10 | 11 | hostfile = "machinefiles/20nodes" 12 | progfile = "release/PageRankBsp" 13 | schedulerfile = "release/SchedulerMain" 14 | 15 | common_params = { 16 | "scheduler" : "proj99", 17 | "scheduler_port" : "33227", 18 | "hdfs_namenode" : "proj99", 19 | "hdfs_port" : 9000, 20 | } 21 | 22 | program_params = { 23 | "url" : "/datasets/graph/webuk-adj", 24 | "num_vertices" : 133633040, 25 | # "url" : "/datasets/graph/google-adj", 26 | # "num_vertices" : 427554, 27 | 28 | "num_local_threads" : 20, 29 | 30 | "num_parts" : 1000, 31 | "combine_type" : "kShuffleCombine", 32 | "num_iters" : 10, # write every num_iters 33 | "staleness" : 0, 34 | "pr_url" : "/tmp/tmp/yz/tmp/0408/pr/", 35 | "topk_url" : "/tmp/tmp/yz/tmp/0408/tmp2/topk-10", 36 | } 37 | 38 | scheduler_params = { 39 | "dag_runner_type" : "sequential", 40 | } 41 | 42 | env_params = ( 43 | "GLOG_logtostderr=true " 44 | "GLOG_v=-1 " 45 | "GLOG_minloglevel=0 " 46 | # this is to enable hdfs short-circuit read (disable the warning info) 47 | # change this path accordingly when we use other cluster 48 | # the current setting is for proj5-10 49 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 50 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 51 | ) 52 | 53 | dump_core = False 54 | l = Launcher(schedulerfile, progfile, hostfile, 55 | common_params, scheduler_params, program_params, env_params, 56 | dump_core) 57 | 58 | l.Launch(sys.argv) 59 | -------------------------------------------------------------------------------- /examples/pagerank/pagerank-converge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | from os.path import dirname, realpath 6 | proj_dir = dirname(dirname(dirname(realpath(__file__)))) 7 | sys.path.append(proj_dir+"/scripts/") 8 | 9 | from launcher import Launcher 10 | 11 | hostfile = "machinefiles/20nodes" 12 | progfile = "release/PageRankConverge" 13 | schedulerfile = "release/SchedulerMain" 14 | 15 | common_params = { 16 | "scheduler" : "proj99", 17 | "scheduler_port" : "33227", 18 | "hdfs_namenode" : "proj99", 19 | "hdfs_port" : 9000, 20 | } 21 | 22 | program_params = { 23 | "url" : "/datasets/graph/webuk-adj", 24 | "num_vertices" : 133633040, 25 | # "url" : "/datasets/graph/google-adj", 26 | # "num_vertices" : 427554, 27 | 28 | "num_local_threads" : 5, 29 | 30 | "num_parts" : 400, 31 | "combine_type" : "kDirectCombine", 32 | "num_iters" : 1, 33 | "staleness" : 0, 34 | "pr_url" : "/tmp/tmp/yz/tmp/0417/0/", 35 | "topk_url" : "/tmp/tmp/yz/tmp/tmp/10-1/topk", 36 | } 37 | 38 | scheduler_params = { 39 | "dag_runner_type" : "sequential", 40 | } 41 | 42 | env_params = ( 43 | "GLOG_logtostderr=true " 44 | "GLOG_v=-1 " 45 | "GLOG_minloglevel=0 " 46 | # this is to enable hdfs short-circuit read (disable the warning info) 47 | # change this path accordingly when we use other cluster 48 | # the current setting is for proj5-10 49 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 50 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 51 | ) 52 | 53 | dump_core = False 54 | l = Launcher(schedulerfile, progfile, hostfile, 55 | 
             common_params, scheduler_params, program_params, env_params,
56 |              dump_core)
57 | 
58 | l.Launch(sys.argv)
59 | 
60 | # for i in reversed(xrange(5)):
61 | #     program_params["pr_url"] = "/tmp/tmp/yz/tmp/0417/"+str(i)
62 | #     # program_params["topk_url"] = "/tmp/tmp/yz/tmp/0410/topk3/"+str(i)
63 | #     program_params["num_iters"] = 10+i*10
64 | #     l = Launcher(schedulerfile, progfile, hostfile,
65 | #                  common_params, scheduler_params, program_params, env_params,
66 | #                  dump_core)
67 | #
68 | #     l.Launch(sys.argv)
69 | 
-------------------------------------------------------------------------------- /examples/pagerank/sum.cpp: --------------------------------------------------------------------------------
1 | #include "core/plan/runner.hpp"
2 | 
3 | #include "core/partition/block_partition.hpp"
4 | 
5 | #include "boost/tokenizer.hpp"
6 | 
7 | DEFINE_string(url, "", "");
8 | 
9 | using namespace xyz;
10 | 
11 | struct Sum {
12 |   int id;
13 |   double sum;
14 | 
15 |   using KeyT = int;
16 |   Sum() = default;
17 |   Sum(int _id) : id(_id) {}
18 |   KeyT Key() const { return id; }
19 | };
20 | 
21 | int main(int argc, char **argv) {
22 |   Runner::Init(argc, argv);
23 |   CHECK(FLAGS_url.size());
24 | 
25 |   auto lines = Context::load_block_meta(FLAGS_url);
26 |   auto sum = Context::placeholder<Sum>(1);
27 |   Context::mappartupdate(
28 |       lines, sum,
29 |       [](TypedPartition<std::string> *p, Output<int, double> *o) {
30 |         auto *bp = dynamic_cast<BlockPartition*>(p);
31 |         CHECK_NOTNULL(bp);
32 |         auto reader = bp->GetReader();
33 |         double sum = 0;
34 |         while (reader->HasLine()) {
35 |           auto line = reader->GetLine();
36 |           // LOG(INFO) << "line: " << line;
37 |           boost::char_separator<char> sep(" \t\n");
38 |           boost::tokenizer<boost::char_separator<char>> tok(line, sep);
39 |           boost::tokenizer<boost::char_separator<char>>::iterator it =
40 |               tok.begin();
41 |           std::stoi(*it++);
42 |           double d = std::stof(*it);
43 |           sum += d;
44 |         }
45 |         o->Add(0, sum);
46 |       },
47 |       [](Sum *s, double c) { s->sum += c; })
48 |       ->SetCombine([](double *a, double b) { return *a += b; });
49 | 
50 |   Context::foreach (sum, [](Sum s) {
51 |     LOG(INFO) << RED("The sum is: " + std::to_string(s.sum));
52 |   });
53 | 
54 |   Runner::Run();
55 | }
56 | 
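SetCombine merges messages destined for the same key before they leave the sender, so the combiner must agree with the update function; here both are addition. What the lambda is required to do, in isolation (the names are illustrative):

    // Fold b into *a; must be associative so partial combines are safe.
    auto combine = [](double *a, double b) { return *a += b; };
    double acc = 1.5;
    combine(&acc, 2.5);   // acc == 4.0; fewer, larger messages on the wire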
-------------------------------------------------------------------------------- /examples/pagerank/sum.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import sys
4 | import os.path
5 | from os.path import dirname, realpath
6 | 
7 | proj_dir = dirname(dirname(dirname(realpath(__file__))))
8 | print proj_dir
9 | sys.path.append(proj_dir+"/scripts/")
10 | 
11 | from launcher import Launcher
12 | 
13 | hostfile = "machinefiles/20nodes"
14 | progfile = "release/Sum"
15 | schedulerfile = "release/SchedulerMain"
16 | 
17 | common_params = {
18 |     "scheduler" : "proj99",
19 |     "scheduler_port" : "33227",
20 |     "hdfs_namenode" : "proj99",
21 |     "hdfs_port" : 9000,
22 | }
23 | 
24 | program_params = {
25 |     "url": "/tmp/tmp/yz/tmp/0409/50-10/pr",
26 | }
27 | 
28 | scheduler_params = {
29 |     "dag_runner_type" : "sequential",
30 | }
31 | 
32 | env_params = (
33 |     "GLOG_logtostderr=true "
34 |     "GLOG_v=-1 "
35 |     "GLOG_minloglevel=0 "
36 |     # this is to enable hdfs short-circuit read (disable the warning info)
37 |     # change this path accordingly when we use other cluster
38 |     # the current setting is for proj5-10
39 |     # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml"
40 |     "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml"
41 | )
42 | 
43 | dump_core = False
44 | l = Launcher(schedulerfile, progfile, hostfile,
45 |              common_params, scheduler_params, program_params, env_params,
46 |              dump_core)
47 | 
48 | l.Launch(sys.argv)
49 | 
50 | # for i in xrange(10):
51 | #     program_params["url"] = "/tmp/tmp/yz/tmp/0408/pr3/"+str(i)
52 | #     l = Launcher(schedulerfile, progfile, hostfile,
53 | #                  common_params, scheduler_params, program_params, env_params,
54 | #                  dump_core)
55 | #
56 | #     l.Launch(sys.argv)
57 | #
58 | 
-------------------------------------------------------------------------------- /examples/scheduler_example.cpp: --------------------------------------------------------------------------------
1 | #include "gflags/gflags.h"
2 | #include "glog/logging.h"
3 | 
4 | //#include "base/node_util.hpp"
5 | #include "comm/scheduler_mailbox.hpp"
6 | 
7 | DEFINE_int32(num_worker, -1, "The number of workers");
8 | DEFINE_string(scheduler, "", "The host of scheduler");
9 | DEFINE_string(scheduler_port, "", "The port of scheduler");
10 | 
11 | namespace xyz {
12 | 
13 | void Run() {
14 |   /* 0. Basic checks */
15 |   CHECK_NE(FLAGS_num_worker, -1);
16 |   CHECK(!FLAGS_scheduler.empty());
17 |   CHECK(!FLAGS_scheduler_port.empty());
18 | 
19 |   /* 1. Parse config_file */
20 |   Node scheduler_node{0, FLAGS_scheduler, std::stoi(FLAGS_scheduler_port),
21 |                       false};
22 |   LOG(INFO) << "scheduler_node: " << scheduler_node.DebugString();
23 | 
24 |   /* 2. Scheduler program */
25 |   SchedulerMailbox scheduler_mailbox(scheduler_node, FLAGS_num_worker);
26 |   scheduler_mailbox.Start();
27 |   // scheduler_mailbox.Stop();
28 |   std::this_thread::sleep_for(std::chrono::seconds(10));
29 |   scheduler_mailbox.Stop();
30 | }
31 | 
32 | } // namespace xyz
33 | 
34 | int main(int argc, char **argv) {
35 |   google::InitGoogleLogging(argv[0]);
36 |   gflags::ParseCommandLineFlags(&argc, &argv, true);
37 |   xyz::Run();
38 | }
-------------------------------------------------------------------------------- /examples/scheduler_main.cpp: --------------------------------------------------------------------------------
1 | #include "gflags/gflags.h"
2 | #include "glog/logging.h"
3 | 
4 | #include "comm/scheduler_mailbox.hpp"
5 | #include "comm/sender.hpp"
6 | #include "core/scheduler/scheduler.hpp"
7 | #include "io/assigner.hpp"
8 | #include "io/hdfs_browser.hpp"
9 | 
10 | DEFINE_int32(num_worker, -1, "The number of workers");
11 | DEFINE_string(scheduler, "proj10", "The host of scheduler");
12 | DEFINE_int32(scheduler_port, -1, "The port of scheduler");
13 | DEFINE_string(hdfs_namenode, "proj10", "The namenode of hdfs");
14 | DEFINE_int32(hdfs_port, 9000, "The port of hdfs");
15 | 
16 | DEFINE_string(dag_runner_type, "sequential", "");
17 | 
18 | namespace xyz {
19 | 
20 | void RunScheduler() {
21 |   Node scheduler_node{0, FLAGS_scheduler, FLAGS_scheduler_port, false};
22 | 
23 |   // create mailbox and sender
24 |   auto scheduler_mailbox =
25 |       std::make_shared<SchedulerMailbox>(scheduler_node, FLAGS_num_worker);
26 |   auto sender = std::make_shared<Sender>(-1, scheduler_mailbox.get());
27 | 
28 |   // create scheduler and register queue
29 |   const int id = 0;
30 |   const std::string namenode = FLAGS_hdfs_namenode;
31 |   const int port = FLAGS_hdfs_port;
32 |   auto assigner_builder = [sender, namenode, port]() {
33 |     auto browser = std::make_shared<HDFSBrowser>(namenode, port);
34 |     auto assigner = std::make_shared<Assigner>(sender, browser);
35 |     return assigner;
36 |   };
37 |   Scheduler scheduler(id, sender, assigner_builder, FLAGS_dag_runner_type);
38 |   scheduler_mailbox->RegisterQueue(id, scheduler.GetWorkQueue());
39 | 
40 |   // start mailbox
41 |   scheduler_mailbox->Start();
42 | 
43 |   // make scheduler ready
44 |   auto nodes = scheduler_mailbox->GetNodes();
45 |
CHECK_GT(nodes.size(), 0); 46 | scheduler.Ready(nodes); 47 | 48 | scheduler.Wait(); 49 | scheduler_mailbox->Stop(); 50 | } 51 | 52 | } // namespace xyz 53 | 54 | int main(int argc, char **argv) { 55 | google::InitGoogleLogging(argv[0]); 56 | gflags::ParseCommandLineFlags(&argc, &argv, true); 57 | 58 | CHECK_NE(FLAGS_num_worker, -1); 59 | CHECK(!FLAGS_scheduler.empty()); 60 | 61 | xyz::RunScheduler(); 62 | } 63 | -------------------------------------------------------------------------------- /examples/tfidf/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | # TFIDF 4 | add_executable(TFIDF tfidf.cpp) 5 | target_link_libraries(TFIDF xyz) 6 | target_link_libraries(TFIDF ${HUSKY_EXTERNAL_LIB}) 7 | set_property(TARGET TFIDF PROPERTY CXX_STANDARD 14) 8 | add_dependencies(TFIDF ${external_project_dependencies}) 9 | add_dependencies(TFIDF ${external_project_dependencies}) 10 | 11 | # TFIDF2 12 | add_executable(TFIDF2 tfidf2.cpp) 13 | target_link_libraries(TFIDF2 xyz) 14 | target_link_libraries(TFIDF2 ${HUSKY_EXTERNAL_LIB}) 15 | set_property(TARGET TFIDF2 PROPERTY CXX_STANDARD 14) 16 | add_dependencies(TFIDF2 ${external_project_dependencies}) 17 | 18 | # TFIDF3 19 | add_executable(TFIDF3 tfidf3.cpp) 20 | target_link_libraries(TFIDF3 xyz) 21 | target_link_libraries(TFIDF3 ${HUSKY_EXTERNAL_LIB}) 22 | set_property(TARGET TFIDF3 PROPERTY CXX_STANDARD 14) 23 | add_dependencies(TFIDF3 ${external_project_dependencies}) 24 | 25 | # WordCount 26 | add_executable(WordCount wordcount.cpp) 27 | target_link_libraries(WordCount xyz) 28 | target_link_libraries(WordCount ${HUSKY_EXTERNAL_LIB}) 29 | set_property(TARGET WordCount PROPERTY CXX_STANDARD 14) 30 | add_dependencies(WordCount ${external_project_dependencies}) 31 | add_dependencies(WordCount ${external_project_dependencies}) 32 | add_dependencies(WordCount ${external_project_dependencies}) 33 | -------------------------------------------------------------------------------- /examples/tfidf/tfidf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from os.path import dirname, realpath 5 | proj_dir = dirname(dirname(dirname(realpath(__file__)))) 6 | sys.path.append(proj_dir+"/scripts/") 7 | from launcher import Launcher 8 | 9 | hostfile = "machinefiles/20nodes" 10 | progfile = "release/TFIDF2" 11 | schedulerfile = "release/SchedulerMain" 12 | 13 | common_params = { 14 | "scheduler" : "proj99", 15 | "scheduler_port" : "33254", 16 | "hdfs_namenode" : "proj99", 17 | "hdfs_port" : 9000, 18 | } 19 | 20 | program_params = { 21 | # "url" : "/datasets/corpus/enwiki-21g/wiki_0", 22 | # "url" : "/datasets/corpus/enwiki", 23 | # "url" : "/datasets/corpus/enwiki-21g", 24 | # "url" : "/datasets/corpus/enwiki100g", 25 | "url" : "/datasets/corpus/enwiki-200g-oneline", 26 | "num_local_threads" : 20, 27 | "num_of_docs" : 10000, 28 | "num_doc_partition" : 1000, 29 | "num_term_partition" : 100, 30 | } 31 | 32 | scheduler_params = { 33 | "dag_runner_type" : "sequential", 34 | } 35 | 36 | env_params = ( 37 | "GLOG_logtostderr=true " 38 | "GLOG_v=-1 " 39 | "GLOG_minloglevel=0 " 40 | # this is to enable hdfs short-circuit read (disable the warning info) 41 | # change this path accordingly when we use other cluster 42 | # the current setting is for proj5-10 43 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 44 | 
"LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 45 | ) 46 | 47 | dump_core = False 48 | for i in xrange(3): 49 | l = Launcher(schedulerfile, progfile, hostfile, 50 | common_params, scheduler_params, program_params, env_params, 51 | dump_core) 52 | 53 | l.Launch(sys.argv) 54 | exit(0) 55 | 56 | # for url in ["/datasets/corpus/enwiki50g", "/datasets/corpus/enwiki100g", "/datasets/corpus/enwiki200g"]: 57 | # for url in ["/datasets/corpus/enwiki-200g-oneline"]: 58 | # for url in ["/datasets/corpus/enwiki200g"]: 59 | for url in ["/datasets/corpus/enwiki-50g-oneline", "/datasets/corpus/enwiki-100g-oneline", "/datasets/corpus/enwiki-200g-oneline"]: 60 | program_params["url"] = url 61 | for i in xrange(3): 62 | l = Launcher(schedulerfile, progfile, hostfile, 63 | common_params, scheduler_params, program_params, env_params, 64 | dump_core) 65 | 66 | l.Launch(sys.argv) 67 | -------------------------------------------------------------------------------- /examples/tfidf/wordcount.cpp: -------------------------------------------------------------------------------- 1 | #include "core/plan/runner.hpp" 2 | 3 | #include "core/partition/block_partition.hpp" 4 | 5 | #include "boost/tokenizer.hpp" 6 | 7 | DEFINE_string(url, "", "The url for hdfs file"); 8 | DEFINE_int32(num_parts, 100, "# word partitions"); 9 | DEFINE_string(combine_type, "kDirectCombine", 10 | "kShuffleCombine, kDirectCombine, kNoCombine, timeout"); 11 | 12 | using namespace xyz; 13 | 14 | struct WC { 15 | using KeyT = std::string; 16 | KeyT word; 17 | int count = 0; 18 | 19 | WC() = default; 20 | WC(KeyT key) : word(key) {} 21 | KeyT Key() const { return word; } 22 | 23 | // TODO: we dont need the serialization func. 24 | friend SArrayBinStream &operator<<(xyz::SArrayBinStream &stream, 25 | const WC &wc) { 26 | stream << wc.word << wc.count; 27 | return stream; 28 | } 29 | friend SArrayBinStream &operator>>(xyz::SArrayBinStream &stream, WC &wc) { 30 | stream >> wc.word >> wc.count; 31 | return stream; 32 | } 33 | }; 34 | 35 | int main(int argc, char **argv) { 36 | Runner::Init(argc, argv); 37 | const int combine_timeout = ParseCombineTimeout(FLAGS_combine_type); 38 | if (FLAGS_node_id == 0) { 39 | LOG(INFO) << "combine_type: " << FLAGS_combine_type 40 | << ", timeout: " << combine_timeout; 41 | } 42 | 43 | // use load_block_meta, read the block in mappartupdate 44 | auto lines = Context::load_block_meta(FLAGS_url); 45 | auto wordcount = Context::placeholder(FLAGS_num_parts); 46 | Context::mappartupdate( 47 | lines, wordcount, 48 | [](TypedPartition *p, Output *o) { 49 | auto *bp = dynamic_cast(p); 50 | CHECK_NOTNULL(bp); 51 | auto reader = bp->GetReader(); 52 | while (reader->HasLine()) { 53 | auto line = reader->GetLine(); 54 | // LOG(INFO) << "line: " << line; 55 | boost::char_separator sep(" \t\n"); 56 | boost::tokenizer> tok(line, sep); 57 | for (auto &w : tok) { 58 | o->Add(w, 1); 59 | } 60 | } 61 | LOG(INFO) << p->id << " map done"; 62 | }, 63 | [](WC *wc, int c) { wc->count += c; }) 64 | ->SetCombine([](int *a, int b) { return *a += b; }, combine_timeout); 65 | 66 | Context::count(wordcount); 67 | 68 | Runner::Run(); 69 | } 70 | -------------------------------------------------------------------------------- /examples/tfidf/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | from os.path import dirname, realpath 6 | 7 | proj_dir = dirname(dirname(dirname(realpath(__file__)))) 8 | sys.path.append(proj_dir+"/scripts/") 9 | 
10 | from launcher import Launcher 11 | 12 | hostfile = "machinefiles/20nodes" 13 | progfile = "release/WordCount" 14 | schedulerfile = "release/SchedulerMain" 15 | 16 | common_params = { 17 | "scheduler" : "proj99", 18 | "scheduler_port" : "33254", 19 | "hdfs_namenode" : "proj99", 20 | "hdfs_port" : 9000, 21 | } 22 | 23 | program_params = { 24 | # "url" : "/datasets/corpus/enwiki-21g/wiki_0", 25 | # "url" : "/datasets/corpus/enwiki", 26 | # "url" : "/datasets/corpus/enwiki-21g", 27 | # "url" : "/datasets/corpus/enwiki10/1", 28 | "url" : "/datasets/corpus/enwiki-200g-oneline", 29 | "num_local_threads" : 20, 30 | 31 | "num_parts" : 20*5, 32 | "combine_type": "kDirectCombine", 33 | # "combine_type": "kNoCombine", 34 | # "combine_type": "100", 35 | # "combine_type": "kShuffleCombine", 36 | } 37 | 38 | scheduler_params = { 39 | "dag_runner_type" : "sequential", 40 | } 41 | 42 | env_params = ( 43 | "GLOG_logtostderr=true " 44 | "GLOG_v=-1 " 45 | "GLOG_minloglevel=0 " 46 | # this is to enable hdfs short-circuit read (disable the warning info) 47 | # change this path accordingly when we use other cluster 48 | # the current setting is for proj5-10 49 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 50 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 51 | ) 52 | 53 | dump_core = False 54 | # l = Launcher(schedulerfile, progfile, hostfile, 55 | # common_params, scheduler_params, program_params, env_params, 56 | # dump_core) 57 | # 58 | # l.Launch(sys.argv) 59 | # exit(0) 60 | 61 | # for url in ["/datasets/corpus/enwiki50g", "/datasets/corpus/enwiki100g", "/datasets/corpus/enwiki200g"]: 62 | for url in ["/datasets/corpus/enwiki-200g-oneline"]: 63 | # for url in ["/datasets/corpus/enwiki-50g-oneline", "/datasets/corpus/enwiki-100g-oneline", "/datasets/corpus/enwiki-200g-oneline"]: 64 | program_params["url"] = url 65 | for i in xrange(3): 66 | l = Launcher(schedulerfile, progfile, hostfile, 67 | common_params, scheduler_params, program_params, env_params, 68 | dump_core) 69 | 70 | l.Launch(sys.argv) 71 | -------------------------------------------------------------------------------- /examples/worker_example.cpp: -------------------------------------------------------------------------------- 1 | #include "gflags/gflags.h" 2 | #include "glog/logging.h" 3 | 4 | //#include "base/node_util.hpp" 5 | #include "comm/worker_mailbox.hpp" 6 | 7 | DEFINE_string(scheduler, "", "The host of scheduler"); 8 | DEFINE_string(scheduler_port, "", "The port of scheduler"); 9 | 10 | namespace xyz { 11 | 12 | void Run() { 13 | /* 0. Basic checks */ 14 | CHECK(!FLAGS_scheduler.empty()); 15 | CHECK(!FLAGS_scheduler_port.empty()); 16 | 17 | Node scheduler_node{0, FLAGS_scheduler, std::stoi(FLAGS_scheduler_port), 18 | false}; 19 | LOG(INFO) << "scheduler_node: " << scheduler_node.DebugString(); 20 | 21 | /* 2. 
The user program */
22 |   WorkerMailbox worker_mailbox(scheduler_node);
23 |   worker_mailbox.Start();
24 |   worker_mailbox.Barrier();
25 |   worker_mailbox.Stop();
26 | }
27 | 
28 | } // namespace xyz
29 | 
30 | int main(int argc, char **argv) {
31 |   google::InitGoogleLogging(argv[0]);
32 |   gflags::ParseCommandLineFlags(&argc, &argv, true);
33 |   xyz::Run();
34 | }
-------------------------------------------------------------------------------- /io/CMakeLists.txt: --------------------------------------------------------------------------------
1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE})
2 | 
3 | file(GLOB io-src-files
4 |   assigner.cpp
5 |   block_reader_wrapper.cpp
6 |   reader_wrapper.cpp
7 |   writer_wrapper.cpp
8 |   io_wrapper.cpp
9 | )
10 | 
11 | if(LIBHDFS3_FOUND)
12 |   file(GLOB io-src-hdfs-files
13 |     hdfs_browser.cpp
14 |     hdfs_block_reader.cpp
15 |     hdfs_reader.cpp
16 |     hdfs_writer.cpp)
17 |   list(APPEND io-src-files ${io-src-hdfs-files})
18 | endif(LIBHDFS3_FOUND)
19 | 
20 | add_library(io-objs OBJECT ${io-src-files})
21 | set_property(TARGET io-objs PROPERTY CXX_STANDARD 11)
22 | add_dependencies(io-objs ${external_project_dependencies})
23 | 
24 | if(LIBHDFS3_FOUND)
25 |   add_executable(HDFSAssignerMain hdfs_assigner_main.cpp)
26 |   target_link_libraries(HDFSAssignerMain xyz)
27 |   target_link_libraries(HDFSAssignerMain ${HUSKY_EXTERNAL_LIB})
28 |   set_property(TARGET HDFSAssignerMain PROPERTY CXX_STANDARD 11)
29 |   add_dependencies(HDFSAssignerMain ${external_project_dependencies})
30 | 
31 |   add_executable(HDFSBlockReaderMain hdfs_block_reader_main.cpp)
32 |   target_link_libraries(HDFSBlockReaderMain xyz)
33 |   target_link_libraries(HDFSBlockReaderMain ${HUSKY_EXTERNAL_LIB})
34 |   set_property(TARGET HDFSBlockReaderMain PROPERTY CXX_STANDARD 11)
35 |   add_dependencies(HDFSBlockReaderMain ${external_project_dependencies})
36 | 
37 |   add_executable(HDFSReaderMain hdfs_reader_main.cpp)
38 |   target_link_libraries(HDFSReaderMain xyz)
39 |   target_link_libraries(HDFSReaderMain ${HUSKY_EXTERNAL_LIB})
40 |   set_property(TARGET HDFSReaderMain PROPERTY CXX_STANDARD 11)
41 |   add_dependencies(HDFSReaderMain ${external_project_dependencies})
42 | 
43 |   add_executable(HDFSWriterMain hdfs_writer_main.cpp)
44 |   target_link_libraries(HDFSWriterMain xyz)
45 |   target_link_libraries(HDFSWriterMain ${HUSKY_EXTERNAL_LIB})
46 |   set_property(TARGET HDFSWriterMain PROPERTY CXX_STANDARD 11)
47 |   add_dependencies(HDFSWriterMain ${external_project_dependencies})
48 | endif(LIBHDFS3_FOUND)
-------------------------------------------------------------------------------- /io/abstract_block_reader.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <string>
4 | #include <vector>
5 | 
6 | namespace xyz {
7 | 
8 | class AbstractBlockReader {
9 | public:
10 |   virtual ~AbstractBlockReader() {}
11 | 
12 |   // call Init before reading.
13 |   virtual void Init(std::string url, size_t offset) = 0;
14 | 
15 |   // read block api.
16 |   virtual std::vector<std::string> ReadBlock() = 0;
17 | 
18 |   /*
19 |    * Usage:
20 |    *   HdfsBlockReader block_reader(namenode, port);
21 |    *   block_reader.Init(url, offset);
22 |    *   while (block_reader.HasLine()) {
23 |    *     auto s = block_reader.GetLine();
24 |    *   }
25 |    *   c += block_reader.GetNumLineRead();
26 |    */
27 |   // Iterator based api.
-------------------------------------------------------------------------------- /io/abstract_browser.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <sstream> 4 | #include <string> 5 | #include <vector> 6 | 7 | namespace xyz { 8 | 9 | struct BlockInfo { 10 | std::string filename; 11 | size_t offset; 12 | std::string hostname; 13 | 14 | std::string DebugString() const { 15 | std::stringstream ss; 16 | ss << "filename: " << filename; 17 | ss << ", offset: " << offset; 18 | ss << ", hostname: " << hostname; 19 | return ss.str(); 20 | } 21 | }; 22 | 23 | class AbstractBrowser { 24 | public: 25 | virtual ~AbstractBrowser() {} 26 | 27 | virtual std::vector<BlockInfo> Browse(std::string url) = 0; 28 | }; 29 | 30 | } // namespace xyz 31 | -------------------------------------------------------------------------------- /io/abstract_reader.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <string> 4 | 5 | namespace xyz { 6 | 7 | struct AbstractReader { 8 | virtual ~AbstractReader() {} 9 | virtual void Init(std::string url) = 0; 10 | virtual size_t GetFileSize() = 0; 11 | virtual int Read(void *buffer, size_t len) = 0; 12 | }; 13 | 14 | } // namespace xyz 15 | -------------------------------------------------------------------------------- /io/abstract_writer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <string> 4 | 5 | namespace xyz { 6 | 7 | struct AbstractWriter { 8 | virtual ~AbstractWriter() {} 9 | virtual int Write(std::string dest_url, const void *buffer, size_t len) = 0; 10 | }; 11 | 12 | } // namespace xyz 13 |
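AbstractReader and AbstractWriter are the narrow seams through which storage backends plug in: Init/GetFileSize/Read on one side, a single Write on the other. Only HDFS and fake implementations ship in this repo; as an illustration of how small a new backend is, here is a hypothetical local-filesystem reader (LocalReader is not part of the repo, just a sketch using POSIX stdio):

    #include <cstdio>
    #include <string>

    // Illustration only: a local-disk AbstractReader.
    struct LocalReader : public xyz::AbstractReader {
      virtual void Init(std::string url) override { url_ = url; }
      virtual size_t GetFileSize() override {
        std::FILE *f = std::fopen(url_.c_str(), "rb");
        if (!f) return 0;
        std::fseek(f, 0, SEEK_END);
        long size = std::ftell(f);
        std::fclose(f);
        return size < 0 ? 0 : static_cast<size_t>(size);
      }
      virtual int Read(void *buffer, size_t len) override {
        std::FILE *f = std::fopen(url_.c_str(), "rb");
        if (!f) return -1;
        size_t n = std::fread(buffer, 1, len, f);
        std::fclose(f);
        return n == len ? 0 : -1;  // 0 on success, mirroring HdfsReader
      }

      std::string url_;
    };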
-------------------------------------------------------------------------------- /io/assigner.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <map> 4 | #include <memory> 5 | #include <sstream> 6 | #include <string> 7 | #include <utility> 8 | #include <vector> 9 | 10 | #include "glog/logging.h" 11 | 12 | #include "base/message.hpp" 13 | #include "comm/abstract_sender.hpp" 14 | #include "io/abstract_browser.hpp" 15 | #include "io/meta.hpp" 16 | 17 | namespace xyz { 18 | 19 | class Assigner { 20 | public: 21 | Assigner(std::shared_ptr<AbstractSender> sender, 22 | std::shared_ptr<AbstractBrowser> browser) 23 | : sender_(sender), browser_(browser) {} 24 | ~Assigner() = default; 25 | 26 | // public api: 27 | // non threadsafe 28 | int Load(int collection_id, std::string url, 29 | std::vector<std::pair<std::string, int>> slaves, 30 | std::vector<int> num_local_threads, 31 | bool is_load_meta = false, bool is_whole_file = false); 32 | 33 | // return true if all blocks finish 34 | bool FinishBlock(FinishedBlock block); 35 | bool Done(); 36 | std::map<int, FinishedBlock> GetFinishedBlocks() const { 37 | return finished_blocks_; 38 | } 39 | 40 | void InitBlocks(std::string url); 41 | bool Assign(int collection_id, std::pair<std::string, int> slave); 42 | std::string DebugStringLocalityMap(); 43 | std::string DebugStringBlocks(); 44 | std::string DebugStringFinishedBlocks(); 45 | int GetNumBlocks(); 46 | 47 | private: 48 | std::shared_ptr<AbstractBrowser> browser_; 49 | std::shared_ptr<AbstractSender> sender_; 50 | 51 | bool init_ = false; 52 | // host -> { local blocks} 53 | std::map<std::string, std::vector<std::pair<std::string, size_t>>> locality_map_; 54 | // blocks locality information 55 | std::map<std::pair<std::string, size_t>, std::vector<std::string>> blocks_; 56 | 57 | // assigned blocks 58 | std::map<int, AssignedBlock> assigned_blocks_; 59 | 60 | // finished blocks 61 | // part_id/block_id: 62 | std::map<int, FinishedBlock> finished_blocks_; 63 | 64 | int block_id_ = 0; 65 | 66 | int num_finished_ = 0; 67 | int num_assigned_ = 0; 68 | int expected_num_finished_ = 0; 69 | bool is_load_meta_ = false; 70 | bool is_whole_file_ = false; 71 | 72 | // 73 | std::pair<int, int> locality_count_{0, 0}; 74 | }; 75 | 76 | } // namespace xyz 77 | -------------------------------------------------------------------------------- /io/assigner_test.cpp: -------------------------------------------------------------------------------- 1 | #include "glog/logging.h" 2 | #include "gtest/gtest.h" 3 | 4 | #include "comm/simple_sender.hpp" 5 | #include "io/abstract_browser.hpp" 6 | #include "io/assigner.hpp" 7 | #include "io/meta.hpp" 8 | 9 | namespace xyz { 10 | namespace { 11 | 12 | class TestAssigner : public testing::Test {}; 13 | 14 | struct FakeBrowser : public AbstractBrowser { 15 | virtual std::vector<BlockInfo> Browse(std::string url) override { 16 | std::vector<BlockInfo> v{{"file0", 0, "node0"}, {"file0", 0, "node1"}, 17 | {"file0", 0, "node2"}, {"file0", 100, "node2"}, 18 | {"file0", 100, "node3"}, {"file0", 100, "node0"}, 19 | {"file1", 0, "node3"}, {"file1", 0, "node0"}, 20 | {"file1", 0, "node1"}}; 21 | return v; 22 | } 23 | }; 24 | 25 | TEST_F(TestAssigner, Create) { 26 | auto sender = std::make_shared<SimpleSender>(); 27 | auto browser = std::make_shared<FakeBrowser>(); 28 | Assigner assigner(sender, browser); 29 | } 30 | 31 | /* 32 | TEST_F(TestAssigner, InitBlocks) { 33 | auto sender = std::make_shared<SimpleSender>(); 34 | auto browser = std::make_shared<FakeBrowser>(); 35 | Assigner assigner(sender, browser); 36 | assigner.InitBlocks("dummy"); 37 | VLOG(1) << "locality_map_: \n" << assigner.DebugStringLocalityMap(); 38 | VLOG(1) << "blocks_: \n" << assigner.DebugStringBlocks(); 39 | } 40 | 41 | TEST_F(TestAssigner, Load) { 42 | auto sender = std::make_shared<SimpleSender>(); 43 | auto browser = std::make_shared<FakeBrowser>(); 44 | Assigner assigner(sender, browser); 45 | int collection_id = 0; 46 | assigner.Load(collection_id, "dummy", {{"node0", 0}, {"node1", 1}}, {1,1}); 47 | EXPECT_EQ(assigner.Done(), false); 48 | FinishedBlock b0{0, 0, 0, "node0", collection_id}; 49 | EXPECT_EQ(assigner.FinishBlock(b0), false); 50 | FinishedBlock b1{1, 1, 0, "node1", collection_id}; 51 | EXPECT_EQ(assigner.FinishBlock(b1), false); 52 | FinishedBlock b2{2, 0, 0, "node0", collection_id}; 53 | EXPECT_EQ(assigner.FinishBlock(b2), true); 54 | } 55 | */ 56 | 57 | } // namespace 58 | } // namespace xyz 59 |
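The Assigner drives a simple request/acknowledge protocol: Load() browses the url, builds the locality map, and pushes one AssignedBlock per slave thread through the sender; workers answer with FinishedBlock messages, and FinishBlock()/Done() track completion. A compressed sketch of that round-trip, patterned after the commented-out tests above and io/hdfs_assigner_main.cpp below (the node names and ids are illustrative):

    auto sender = std::make_shared<SimpleSender>();
    auto browser = std::make_shared<FakeBrowser>();  // FakeBrowser as in the test above
    Assigner assigner(sender, browser);
    int collection_id = 0;
    // Two slaves (node0, node1), one loading thread each.
    assigner.Load(collection_id, "dummy", {{"node0", 0}, {"node1", 1}}, {1, 1});
    // Each worker acknowledges its block; Done() flips once all acks are in.
    FinishedBlock b0{/*block_id=*/0, /*node_id=*/0, /*qid=*/0, "node0", collection_id};
    assigner.FinishBlock(b0);
    // ... more FinishedBlock acks ...
    bool all_done = assigner.Done();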
-------------------------------------------------------------------------------- /io/fake_block_reader.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_block_reader.hpp" 4 | #include <string> 5 | #include <vector> 6 | 7 | namespace xyz { 8 | 9 | struct FakeBlockReader : public AbstractBlockReader { 10 | public: 11 | virtual std::vector<std::string> ReadBlock() override { 12 | return {"a", "b", "c"}; 13 | } 14 | virtual void Init(std::string url, size_t offset) override {} 15 | virtual bool HasLine() override { return false; } 16 | virtual std::string GetLine() override { return ""; } 17 | virtual int GetNumLineRead() override { return 0; } 18 | }; 19 | 20 | } // namespace xyz 21 | -------------------------------------------------------------------------------- /io/fake_reader.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_reader.hpp" 4 | 5 | #include "glog/logging.h" 6 | 7 | namespace xyz { 8 | 9 | struct FakeReader : public AbstractReader { 10 | virtual int Read(void *buffer, size_t len) override { 11 | return 0; 12 | } 13 | 14 | virtual void Init(std::string url) override { 15 | LOG(INFO) << "Reading from: " << url; 16 | } 17 | 18 | virtual size_t GetFileSize() override { 19 | return 0; 20 | } 21 | }; 22 | 23 | } // namespace xyz -------------------------------------------------------------------------------- /io/fake_writer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_writer.hpp" 4 | 5 | #include "glog/logging.h" 6 | 7 | namespace xyz { 8 | 9 | struct FakeWriter : public AbstractWriter { 10 | virtual int Write(std::string dest_url, const void *buffer, size_t len) override { 11 | VLOG(1) << "writing: " << len << " bytes"; 12 | return 0; 13 | } 14 | }; 15 | 16 | } // namespace xyz 17 | -------------------------------------------------------------------------------- /io/hdfs_assigner_main.cpp: -------------------------------------------------------------------------------- 1 | #include "io/assigner.hpp" 2 | #include "io/hdfs_browser.hpp" 3 | #include "io/meta.hpp" 4 | 5 | #include "comm/simple_sender.hpp" 6 | 7 | using namespace xyz; 8 | 9 | int main(int argc, char **argv) { 10 | google::InitGoogleLogging(argv[0]); 11 | 12 | const int qid = 0; 13 | const std::string namenode = "proj10"; 14 | const int port = 9000; 15 | // std::string url = "/datasets/classification/kdd12-5blocks"; 16 | std::string url = "/datasets/graph/webbase-adj"; 17 | auto sender = std::make_shared<SimpleSender>(); 18 | auto browser = std::make_shared<HDFSBrowser>(namenode, port); 19 | 20 | Assigner assigner(sender, browser); 21 | int collection_id = 0; 22 | int num_blocks = assigner.Load(collection_id, url, {{"proj5", 0}}, {1}); 23 | LOG(INFO) << "blocks number: " << num_blocks; 24 | 25 | for (int i = 0; i < num_blocks; ++i) { 26 | // recv 27 | auto recv_msg = sender->Get(); 28 | SArrayBinStream recv_bin; 29 | CHECK_EQ(recv_msg.data.size(), 2); 30 | recv_bin.FromSArray(recv_msg.data[1]); 31 | AssignedBlock block; 32 | recv_bin >> block; 33 | LOG(INFO) << "block: " << block.DebugString(); 34 | 35 | // send finish 36 | FinishedBlock b{block.id, 0, 0, "node5", collection_id}; 37 | CHECK_EQ(assigner.Done(), false); 38 | assigner.FinishBlock(b); 39 | } 40 | CHECK_EQ(assigner.Done(), true); 41 | LOG(INFO) << assigner.DebugStringFinishedBlocks(); 42 | } 43 |
-------------------------------------------------------------------------------- /io/hdfs_block_reader.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_block_reader.hpp" 4 | 5 | #include "boost/utility/string_ref.hpp" 6 | #include "hdfs/hdfs.h" 7 | 8 | #include "glog/logging.h" 9 | 10 | namespace xyz { 11 | 12 | class HdfsBlockReader : public AbstractBlockReader { 13 | public: 14 | HdfsBlockReader(std::string namenode, int port) 15 | : namenode_(namenode), port_(port) {} 16 | ~HdfsBlockReader() { 17 | delete[] data_; 18 | if (file_ != NULL) { 19 | int rc = hdfsCloseFile(fs_, file_); 20 | CHECK(rc == 0) << "close file fails"; 21 | } 22 | } 23 | virtual std::vector<std::string> ReadBlock() override; 24 | 25 | virtual void Init(std::string url, size_t offset) override; 26 | virtual bool HasLine() override; 27 | virtual std::string GetLine() override; 28 | virtual int GetNumLineRead() override; 29 | 30 | private: 31 | void InitHdfs(std::string hdfs_namenode, int hdfs_namenode_port, 32 | std::string url); 33 | 34 | void InitBlocksize(hdfsFS fs, std::string url); 35 | 36 | bool next(boost::string_ref &ref); 37 | size_t find_next(boost::string_ref sref, size_t l, char c); 38 | void handle_next_block(); 39 | bool fetch_new_block(); 40 | int read_block(const std::string &fn); 41 | 42 | boost::string_ref fetch_next(); 43 | 44 | private: 45 | char *data_ = nullptr; 46 | hdfsFS fs_; 47 | size_t hdfs_block_size_; 48 | 49 | size_t offset_ = 0; 50 | int l = 0; 51 | int r = 0; 52 | std::string last_part_; 53 | boost::string_ref buffer_; 54 | std::string fn_; 55 | hdfsFile file_ = NULL; 56 | 57 | boost::string_ref tmp_line_; 58 | int tmp_line_count_ = 0; 59 | 60 | std::string namenode_; 61 | int port_; 62 | }; 63 | 64 | } // namespace xyz 65 | -------------------------------------------------------------------------------- /io/hdfs_block_reader_main.cpp: -------------------------------------------------------------------------------- 1 | #include "io/hdfs_block_reader.hpp" 2 | 3 | #include "glog/logging.h" 4 | 5 | using namespace xyz; 6 | 7 | int main(int argc, char **argv) { 8 | google::InitGoogleLogging(argv[0]); 9 | 10 | const std::string namenode = "proj10"; 11 | const int port = 9000; 12 | 13 | const std::string url = "/datasets/classification/kdd12-5blocks"; 14 | std::vector<size_t> offsets{0, 1048576, 2097152, 3145728, 4194304}; 15 | // block api 16 | int c = 0; 17 | for (auto offset : offsets) { 18 | HdfsBlockReader block_reader(namenode, port); 19 | block_reader.Init(url, offset); 20 | auto a = block_reader.ReadBlock(); 21 | c += a.size(); 22 | } 23 | LOG(INFO) << c << " lines in total."; 24 | 25 | // iterator api 26 | c = 0; 27 | for (auto offset : offsets) { 28 | HdfsBlockReader block_reader(namenode, port); 29 | block_reader.Init(url, offset); 30 | while (block_reader.HasLine()) { 31 | auto s = block_reader.GetLine(); 32 | } 33 | c += block_reader.GetNumLineRead(); 34 | } 35 | LOG(INFO) << c << " lines in total."; 36 | } 37 |
-------------------------------------------------------------------------------- /io/hdfs_browser.cpp: -------------------------------------------------------------------------------- 1 | #include "io/hdfs_browser.hpp" 2 | 3 | #include "glog/logging.h" 4 | 5 | namespace xyz { 6 | 7 | HDFSBrowser::HDFSBrowser(std::string hdfs_namenode, int port) 8 | : hdfs_namenode_(hdfs_namenode), hdfs_namenode_port_(port) { 9 | bool suc = InitHDFS(hdfs_namenode_, hdfs_namenode_port_); 10 | CHECK(suc) << "Failed to connect to HDFS " << hdfs_namenode_ << ":" 11 | << hdfs_namenode_port_; 12 | LOG(INFO) << "Connect to HDFS, namenode:" << hdfs_namenode_ 13 | << " port:" << hdfs_namenode_port_; 14 | } 15 | 16 | std::vector<BlockInfo> HDFSBrowser::Browse(std::string url) { 17 | CHECK(fs_); 18 | CHECK_EQ(hdfsExists(fs_, url.c_str()), 0) << "url: " << url << " does not exist in hdfs" 19 | << " :<" << hdfs_namenode_ << "," << hdfs_namenode_port_ << ">."; 20 | std::vector<BlockInfo> rets; 21 | int num_files; 22 | int dummy; 23 | hdfsFileInfo *file_info = hdfsListDirectory(fs_, url.c_str(), &num_files); 24 | for (int i = 0; i < num_files; ++i) { 25 | // for every file in a directory 26 | if (file_info[i].mKind != kObjectKindFile) 27 | continue; 28 | size_t k = 0; 29 | while (k < file_info[i].mSize) { 30 | // for every block in a file 31 | auto blk_loc = 32 | hdfsGetFileBlockLocations(fs_, file_info[i].mName, k, 1, &dummy); 33 | for (int j = 0; j < blk_loc->numOfNodes; ++j) { 34 | // for every replication in a block 35 | std::string hostname = blk_loc->hosts[j]; 36 | BlockInfo b{std::string(file_info[i].mName) + '\0', k, hostname}; 37 | rets.push_back(b); 38 | } 39 | k += file_info[i].mBlockSize; 40 | } 41 | } 42 | hdfsFreeFileInfo(file_info, num_files); 43 | return rets; 44 | } 45 | 46 | bool HDFSBrowser::InitHDFS(std::string hdfs_namenode, int port) { 47 | int num_retries = 3; 48 | while (num_retries--) { 49 | struct hdfsBuilder *builder = hdfsNewBuilder(); 50 | hdfsBuilderSetNameNode(builder, hdfs_namenode.c_str()); 51 | hdfsBuilderSetNameNodePort(builder, port); 52 | fs_ = hdfsBuilderConnect(builder); 53 | hdfsFreeBuilder(builder); 54 | if (fs_) 55 | break; 56 | } 57 | if (fs_) { 58 | return true; 59 | } 60 | return false; 61 | } 62 | 63 | } // namespace xyz 64 | -------------------------------------------------------------------------------- /io/hdfs_browser.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_browser.hpp" 4 | 5 | #include "hdfs/hdfs.h" 6 | 7 | namespace xyz { 8 | 9 | class HDFSBrowser : public AbstractBrowser { 10 | public: 11 | HDFSBrowser(std::string hdfs_namenode, int port); 12 | virtual ~HDFSBrowser() {} 13 | virtual std::vector<BlockInfo> Browse(std::string url) override; 14 | 15 | bool InitHDFS(std::string hdfs_namenode, int port); 16 | 17 | private: 18 | std::string hdfs_namenode_; 19 | int hdfs_namenode_port_; 20 | std::string url_; 21 | 22 | hdfsFS fs_ = NULL; 23 | }; 24 | 25 | } // namespace xyz 26 | -------------------------------------------------------------------------------- /io/hdfs_helper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "glog/logging.h" 4 | #include "hdfs/hdfs.h" 5 | 6 | namespace xyz { 7 | namespace { 8 | 9 | hdfsFS GetFS(std::string hdfs_namenode, int hdfs_namenode_port) { 10 | hdfsFS fs; 11 | struct hdfsBuilder *builder = hdfsNewBuilder(); 12 | hdfsBuilderSetNameNode(builder, hdfs_namenode.c_str()); 13 | hdfsBuilderSetNameNodePort(builder, hdfs_namenode_port); 14 | fs = hdfsBuilderConnect(builder); 15 | CHECK(fs); 16 | hdfsFreeBuilder(builder); 17 | return fs; 18 | } 19 | 20 | } // namespace 21 | } // namespace xyz 22 |
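HDFSBrowser::Browse walks a path three levels deep (every file in the directory, every block in the file, every replica of the block), so the returned vector holds one BlockInfo per (block, replica) pair and the same offset can appear once for each hosting node. A short sketch of driving it directly (the namenode host, port, and path are placeholders):

    auto browser = std::make_shared<xyz::HDFSBrowser>("proj10", 9000);
    std::vector<xyz::BlockInfo> blocks = browser->Browse("/datasets/graph/webbase-adj");
    for (const auto &b : blocks) {
      LOG(INFO) << b.DebugString();  // filename, offset, hostname
    }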
-------------------------------------------------------------------------------- /io/hdfs_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "io/hdfs_reader.hpp" 2 | 3 | #include "glog/logging.h" 4 | #include "io/hdfs_helper.hpp" 5 | 6 | namespace xyz { 7 | 8 | void HdfsReader::Init(std::string url) { 9 | url_ = url; 10 | fs_ = GetFS(hdfs_namenode_, hdfs_namenode_port_); 11 | CHECK_EQ(hdfsExists(fs_, url.c_str()), 0); 12 | HdfsReader::InitFilesize(fs_, url); 13 | } 14 | 15 | void HdfsReader::InitFilesize(hdfsFS fs, std::string url) { 16 | hdfsFileInfo *file_info = hdfsGetPathInfo(fs, url.c_str()); 17 | CHECK_EQ(file_info[0].mKind, kObjectKindFile); 18 | hdfs_file_size_ = file_info[0].mSize; 19 | // LOG(INFO) << "File size: " << std::to_string(hdfs_file_size_); 20 | hdfsFreeFileInfo(file_info, 1); 21 | } 22 | 23 | size_t HdfsReader::GetFileSize() { 24 | return hdfs_file_size_; 25 | } 26 | 27 | int HdfsReader::Read(void *buffer, size_t len) { 28 | hdfsFile file = hdfsOpenFile(fs_, url_.c_str(), O_RDONLY, 0, 0, 0); 29 | CHECK(hdfsFileIsOpenForRead(file)); 30 | size_t start = 0; 31 | size_t nbytes = 0; 32 | while (start < hdfs_file_size_) { 33 | // only 128KB per hdfsRead 34 | nbytes = hdfsRead(fs_, file, static_cast<char *>(buffer) + start, hdfs_file_size_ - start); 35 | start += nbytes; 36 | if (nbytes == 0) 37 | break; 38 | } 39 | CHECK_EQ(start, hdfs_file_size_); 40 | int rc = hdfsCloseFile(fs_, file); 41 | CHECK_EQ(rc, 0); 42 | return 0; 43 | } 44 | 45 | } // namespace xyz 46 | -------------------------------------------------------------------------------- /io/hdfs_reader.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_reader.hpp" 4 | #include "hdfs/hdfs.h" 5 | 6 | namespace xyz { 7 | 8 | class HdfsReader : public AbstractReader { 9 | public: 10 | HdfsReader(std::string hdfs_namenode, int hdfs_namenode_port) 11 | : hdfs_namenode_(hdfs_namenode), hdfs_namenode_port_(hdfs_namenode_port) { 12 | } 13 | virtual void Init(std::string url) override; 14 | void InitFilesize(hdfsFS fs, std::string url); 15 | virtual size_t GetFileSize() override; 16 | virtual int Read(void *buffer, size_t len) override; 17 | 18 | private: 19 | std::string hdfs_namenode_; 20 | int hdfs_namenode_port_; 21 | std::string url_; 22 | size_t offset_ = 0; 23 | size_t hdfs_file_size_; 24 | hdfsFS fs_; 25 | }; 26 | 27 | } // namespace xyz -------------------------------------------------------------------------------- /io/hdfs_reader_main.cpp: -------------------------------------------------------------------------------- 1 | #include "io/hdfs_reader.hpp" 2 | 3 | #include <string> 4 | 5 | #include "glog/logging.h" 6 | 7 | using namespace xyz; 8 | 9 | int main(int argc, char **argv) { 10 | google::InitGoogleLogging(argv[0]); 11 | 12 | const std::string namenode = "proj10"; 13 | const int port = 9000; 14 | HdfsReader reader(namenode, port); 15 | const std::string url = "/tmp/read/a.txt"; 16 | reader.Init(url); 17 | size_t len = reader.GetFileSize(); 18 | char * data = new char[len + 1](); 19 | int rc = reader.Read(data, len); 20 | CHECK_EQ(rc, 0); 21 | LOG(INFO) << "File content: " << data; 22 | 23 | delete [] data; 24 | } 25 | -------------------------------------------------------------------------------- /io/hdfs_writer.cpp: -------------------------------------------------------------------------------- 1 | #include "io/hdfs_writer.hpp" 2 | 3 | #include "glog/logging.h" 4 | #include "hdfs/hdfs.h" 5 | #include "io/hdfs_helper.hpp" 6 | 7 | namespace xyz { 8 | 9 | int HdfsWriter::Write(std::string dest_url, const void *buffer, size_t len) { 10 | hdfsFS fs = GetFS(hdfs_namenode_, hdfs_namenode_port_); 11 | std::string dir = dest_url.substr(0, dest_url.find_last_of("/")); 12 | // LOG(INFO) << "url: " << dest_url; 13 | // LOG(INFO) << "dir: " << dir; 14 | int rc = hdfsCreateDirectory(fs, dir.c_str()); 15 | CHECK_EQ(rc, 0) << "cannot create directory: " << dir; 16 | hdfsFile file = hdfsOpenFile(fs, dest_url.c_str(), O_WRONLY, 0, 0, 0); 17 | CHECK(hdfsFileIsOpenForWrite(file)) << "cannot open file: " << dest_url; 18 | 19 | if (len > 0) { 20 | int num_written = hdfsWrite(fs, file, buffer, len); 21 | CHECK_EQ(num_written, len); 22 | } 23 | rc = hdfsFlush(fs, file); 24 | CHECK_EQ(rc, 0); 25 | rc = hdfsCloseFile(fs, file); 26 | CHECK_EQ(rc, 0); 27 | return 0; 28 | } 29 | 30 | } // namespace xyz 31 | -------------------------------------------------------------------------------- /io/hdfs_writer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "io/abstract_writer.hpp" 4 | 5 | namespace xyz { 6 | 7 | class HdfsWriter : public AbstractWriter { 8 | public: 9 | HdfsWriter(std::string hdfs_namenode, int hdfs_namenode_port) 10 | : hdfs_namenode_(hdfs_namenode), hdfs_namenode_port_(hdfs_namenode_port) { 11 | } 12 | virtual int Write(std::string dest_url, const void *buffer, 13 | size_t len) override; 14 | 15 | private: 16 | std::string hdfs_namenode_; 17 | int hdfs_namenode_port_; 18 | }; 19 | 20 | } // namespace xyz -------------------------------------------------------------------------------- /io/hdfs_writer_main.cpp: -------------------------------------------------------------------------------- 1 | #include "io/hdfs_writer.hpp" 2 | 3 | #include <string> 4 | 5 | #include "glog/logging.h" 6 | 7 | using namespace xyz; 8 | 9 | int main(int argc, char **argv) { 10 | google::InitGoogleLogging(argv[0]); 11 | 12 | const std::string namenode = "proj10"; 13 | const int port = 9000; 14 | HdfsWriter writer(namenode, port); 15 | std::string content = "hello world"; 16 | const std::string dest_url = "/tmp/tmp/a.txt"; 17 | int rc = writer.Write(dest_url, content.c_str(), content.size()); 18 | CHECK_EQ(rc, 0); 19 | } 20 |
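io/io_wrapper.hpp, which follows, bundles a reader factory and a writer factory behind one object so engine code can stay storage-agnostic. A usage sketch wiring in the HDFS implementations above (the host and port are placeholders):

    std::string namenode = "proj10";
    int port = 9000;
    xyz::IOWrapper io(
        [=]() { return std::make_shared<xyz::HdfsReader>(namenode, port); },
        [=]() { return std::make_shared<xyz::HdfsWriter>(namenode, port); });
    auto reader = io.GetReader();  // std::shared_ptr<AbstractReader>
    auto writer = io.GetWriter();  // std::shared_ptr<AbstractWriter>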
-------------------------------------------------------------------------------- /io/io_wrapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | #include <memory> 5 | 6 | #include "glog/logging.h" 7 | 8 | #include "io/abstract_reader.hpp" 9 | #include "io/abstract_writer.hpp" 10 | 11 | namespace xyz { 12 | 13 | class IOWrapper { 14 | public: 15 | IOWrapper(std::function<std::shared_ptr<AbstractReader>()> reader_getter, 16 | std::function<std::shared_ptr<AbstractWriter>()> writer_getter) 17 | : reader_getter_(reader_getter), writer_getter_(writer_getter) {} 18 | 19 | std::shared_ptr<AbstractReader> GetReader() { 20 | CHECK(reader_getter_); 21 | return reader_getter_(); 22 | } 23 | std::shared_ptr<AbstractWriter> GetWriter() { 24 | CHECK(writer_getter_); 25 | return writer_getter_(); 26 | } 27 | 28 | std::function<std::shared_ptr<AbstractReader>()> GetReaderGetter() { 29 | return reader_getter_; 30 | } 31 | std::function<std::shared_ptr<AbstractWriter>()> GetWriterGetter() { 32 | return writer_getter_; 33 | } 34 | private: 35 | std::function<std::shared_ptr<AbstractReader>()> reader_getter_; 36 | std::function<std::shared_ptr<AbstractWriter>()> writer_getter_; 37 | }; 38 | 39 | } // namespace xyz 40 | -------------------------------------------------------------------------------- /io/meta.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <sstream> 4 | 5 | #include "base/sarray_binstream.hpp" 6 | 7 | namespace xyz { 8 | 9 | struct StoredBlock { 10 | std::string url; 11 | size_t offset; 12 | int node_id; 13 | std::string DebugString() const { 14 | std::stringstream ss; 15 | ss << "url: " << url; 16 | ss << ", offset: " << offset; 17 | ss << ", node_id: " << node_id; 18 | return ss.str(); 19 | } 20 | }; 21 | 22 | struct AssignedBlock { 23 | std::string url; 24 | size_t offset; 25 | int id; 26 | int collection_id; 27 | bool is_load_meta; 28 | bool is_whole_file; 29 | 30 | std::string DebugString() const { 31 | std::stringstream ss; 32 | ss << "url: " << url; 33 | ss << ", offset: " << offset; 34 | ss << ", id: " << id; 35 | ss << ", collection_id: " << collection_id; 36 | ss << ", is_load_meta: " << is_load_meta; 37 | ss << ", is_whole_file: " << is_whole_file; 38 | return ss.str(); 39 | } 40 | 41 | friend SArrayBinStream &operator<<(xyz::SArrayBinStream &stream, 42 | const AssignedBlock &b) { 43 | stream << b.url << b.offset << b.id << b.collection_id << b.is_load_meta << b.is_whole_file; 44 | return stream; 45 | } 46 | friend SArrayBinStream &operator>>(xyz::SArrayBinStream &stream, 47 | AssignedBlock &b) { 48 | stream >> b.url >> b.offset >> b.id >> b.collection_id >> b.is_load_meta >> b.is_whole_file; 49 | return stream; 50 | } 51 | }; 52 | 53 | struct FinishedBlock { 54 | int block_id; 55 | int node_id; 56 | int qid; 57 | std::string hostname; 58 | int collection_id; 59 | 60 | std::string DebugString() const { 61 | std::stringstream ss; 62 | ss << "block_id: " << block_id; 63 | ss << ", node_id: " << node_id; 64 | ss << ", qid: " << qid; 65 | ss << ", hostname: " << hostname; 66 | ss << ", collection_id: " << collection_id; 67 | return ss.str(); 68 | } 69 | 70 | friend SArrayBinStream &operator<<(xyz::SArrayBinStream &stream, 71 | const FinishedBlock &b) { 72 | stream << 
b.block_id << b.node_id << b.qid << b.hostname << b.collection_id; 73 | return stream; 74 | } 75 | friend SArrayBinStream &operator>>(xyz::SArrayBinStream &stream, 76 | FinishedBlock &b) { 77 | stream >> b.block_id >> b.node_id >> b.qid >> b.hostname >> b.collection_id; 78 | return stream; 79 | } 80 | }; 81 | 82 | } // namespace xyz 83 | -------------------------------------------------------------------------------- /machinefiles/20nodes: -------------------------------------------------------------------------------- 1 | w1 2 | w2 3 | w3 4 | w4 5 | w5 6 | w6 7 | w7 8 | w8 9 | w9 10 | w10 11 | w11 12 | w12 13 | w13 14 | w14 15 | w15 16 | w16 17 | w17 18 | w18 19 | w19 20 | w20 21 | -------------------------------------------------------------------------------- /machinefiles/5nodes: -------------------------------------------------------------------------------- 1 | proj5 2 | proj6 3 | proj7 4 | proj8 5 | proj9 6 | -------------------------------------------------------------------------------- /scripts/a.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | progfile = "release/A" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33214", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | } 19 | 20 | scheduler_params = { 21 | } 22 | 23 | env_params = ( 24 | "GLOG_logtostderr=true " 25 | "GLOG_v=-1 " 26 | "GLOG_minloglevel=0 " 27 | # this is to enable hdfs short-circuit read (disable the warning info) 28 | # change this path accordingly when we use other cluster 29 | # the current setting is for proj5-10 30 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 31 | ) 32 | 33 | dump_core = False 34 | l = Launcher(schedulerfile, progfile, hostfile, 35 | common_params, scheduler_params, program_params, env_params, 36 | dump_core) 37 | 38 | l.Launch(sys.argv) 39 | -------------------------------------------------------------------------------- /scripts/crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | progfile = "release/Crawler" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33226", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | "url": "https://en.wikipedia.org/wiki/Main_Page,http://www.sina.com.cn,http://course.cse.cuhk.edu.hk/~csci4140", 19 | "num_local_threads" : 20, 20 | "python_script_path" : "/data/opt/tmp/xuan/xyz/examples/crawler_util.py", 21 | } 22 | 23 | scheduler_params = { 24 | "dag_runner_type" : "sequential", 25 | } 26 | 27 | env_params = ( 28 | "GLOG_logtostderr=true " 29 | "GLOG_v=-1 " 30 | "GLOG_minloglevel=0 " 31 | # this is to enable hdfs short-circuit read (disable the warning info) 32 | # change this path accordingly when we use other cluster 33 | # the current setting is for proj5-10 34 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 35 | ) 36 | 37 | dump_core = False 38 | l = Launcher(schedulerfile, progfile, hostfile, 39 | common_params, scheduler_params, program_params, env_params, 40 | dump_core) 41 | 42 | l.Launch(sys.argv) 43 | 
-------------------------------------------------------------------------------- /scripts/graph_matching.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | progfile = "release/GraphMatching" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33225", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | #"url" : "/datasets/graph/label_skitter_8m.adj", 19 | #"url" : "/datasets/graph/label_skitter.adj", 20 | "url" : "/datasets/graph/label_orkut.adj", 21 | #"url" : "/tmp/xuan/toy2.graph", 22 | #"url" : "/tmp/xuan/pattern.graph", 23 | "num_local_threads" : 20, 24 | "num_matcher_parts" : 800, 25 | "num_graph_parts" : 400, 26 | "num_matchers" : 800, 27 | #"num_vertices" : 1696415,  # skitter 28 | "num_vertices" : 3072441,  # orkut 29 | } 30 | 31 | scheduler_params = { 32 | "dag_runner_type" : "sequential", 33 | } 34 | 35 | env_params = ( 36 | "GLOG_logtostderr=true " 37 | "GLOG_v=-1 " 38 | "GLOG_minloglevel=0 " 39 | # this is to enable hdfs short-circuit read (disable the warning info) 40 | # change this path accordingly when we use other cluster 41 | # the current setting is for proj5-10 42 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 43 | ) 44 | 45 | dump_core = False 46 | l = Launcher(schedulerfile, progfile, hostfile, 47 | common_params, scheduler_params, program_params, env_params, 48 | dump_core) 49 | 50 | l.Launch(sys.argv) 51 | -------------------------------------------------------------------------------- /scripts/kill.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys 4 | from launch_utils import kill_nodes 5 | 6 | if __name__ == "__main__": 7 | if len(sys.argv) != 3: 8 | print "usage: %s <host_file> <prog_name>" % sys.argv[0] 9 | sys.exit(1) 10 | 11 | host_file = sys.argv[1] 12 | prog_name = sys.argv[2] 13 | kill_nodes(prog_name, host_file) 14 | 15 | 16 | -------------------------------------------------------------------------------- /scripts/kmeans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | # progfile = "release/KmeansExample" 8 | progfile = "release/KmeansRowExample" 9 | schedulerfile = "release/SchedulerMain" 10 | 11 | common_params = { 12 | "scheduler" : "proj99", 13 | "scheduler_port" : "33424", 14 | "hdfs_namenode" : "proj99", 15 | "hdfs_port" : 9000, 16 | } 17 | 18 | # for SVHN 19 | svhn_params = { 20 | "url" : "/jasper/SVHN", 21 | "num_data" : 73257, 22 | "num_dims" : 3072, 23 | "num_param_per_part" : 3072, 24 | "K" : 10, 25 | } 26 | 27 | # for mnist8m 28 | mnist8m_params = { 29 | "url" : "/jasper/mnist8m", 30 | "num_data" : 8100000, 31 | "num_dims" : 784, 32 | "num_param_per_part" : 784*101, 33 | "K" : 100, 34 | } 35 | 36 | # for a9 37 | a9_params = { 38 | "url" : "/jasper/a9", 39 | "num_data" : 32561, 40 | "num_dims" : 123, 41 | "num_param_per_part" : 123*3, 42 | "K" : 2, 43 | } 44 | 45 | # for avazu 46 | avazu_params = { 47 | "url" : "/jasper/avazu-app", 48 | "num_data" : 40428967, 49 | "num_dims" : 1000000, 50 | "num_param_per_part" : 1000000, 51 | "K" : 2, 52 | } 53 | 54 | program_params = { 55 | "num_local_threads" : 20, 56 | "num_data_parts" : 1200, 57 | 
"batch_size" : 1000, 58 | "alpha" : 0.1, 59 | "num_iter" : 10, 60 | "staleness" : 0, 61 | "is_sgd" : False, 62 | # to make FT work, do not use kShuffleCombine, 63 | # to make it fast, use kShuffleCombine 64 | # "combine_type" : "kShuffleCombine", 65 | "combine_type" : "kDirectCombine", 66 | "max_lines_per_part" : -1, 67 | "replicate_factor" : 10, 68 | } 69 | 70 | # choose one of them 71 | program_params.update(mnist8m_params) 72 | # program_params.update(a9_params) 73 | # program_params.update(svhn_params) 74 | 75 | scheduler_params = { 76 | "dag_runner_type" : "sequential", 77 | } 78 | 79 | env_params = ( 80 | "GLOG_logtostderr=true " 81 | "GLOG_v=-1 " 82 | "GLOG_minloglevel=0 " 83 | # this is to enable hdfs short-circuit read (disable the warning info) 84 | # change this path accordingly when we use other cluster 85 | # the current setting is for proj5-10 86 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 87 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 88 | ) 89 | 90 | dump_core = False 91 | l = Launcher(schedulerfile, progfile, hostfile, 92 | common_params, scheduler_params, program_params, env_params, 93 | dump_core) 94 | 95 | l.Launch(sys.argv) 96 | -------------------------------------------------------------------------------- /scripts/load_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/5nodes" 7 | progfile = "debug/LoadExample" 8 | schedulerfile = "debug/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj10", 12 | "scheduler_port" : "33254", 13 | "hdfs_namenode" : "proj10", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | # "url" : "/datasets/classification/kdd12-5blocks", 19 | "url" : "/tmp/tmp/banana.txt", 20 | "output_url" : "/tmp/tmp/res/", 21 | "num_local_threads" : 2, 22 | } 23 | 24 | scheduler_params = { 25 | } 26 | 27 | env_params = ( 28 | "GLOG_logtostderr=true " 29 | "GLOG_v=-1 " 30 | "GLOG_minloglevel=0 " 31 | # this is to enable hdfs short-circuit read (disable the warning info) 32 | # change this path accordingly when we use other cluster 33 | # the current setting is for proj5-10 34 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 35 | ) 36 | 37 | l = Launcher(schedulerfile, progfile, hostfile, 38 | common_params, scheduler_params, program_params, env_params, 39 | dump_core=False) 40 | 41 | l.Launch(sys.argv) 42 | # l.DebugString() 43 | 44 | -------------------------------------------------------------------------------- /scripts/lr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/5nodes" 7 | progfile = "release/DenseLRExample" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj10", 12 | "scheduler_port" : "33424", 13 | "hdfs_namenode" : "proj10", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | # for webspam 18 | webspam_params = { 19 | "url" : "/ml/webspam", 20 | "num_data" : 350000, 21 | "num_params" : 16609143, 22 | "num_param_per_part" : 16609, 23 | } 24 | 25 | # for a9 26 | a9_params = { 27 | "url" : "/jasper/a9", 28 | "num_data" : 32561, 29 | "num_params" : 123, 30 | "num_param_per_part" : 10, 31 | } 32 | 33 | # for avazu 34 | avazu_params = { 35 | "url" : "/jasper/avazu-app", 36 | "num_data" : 40428967, 37 | "num_params" : 1000000, 
38 | "num_param_per_part" : 1000000, 39 | } 40 | 41 | program_params = { 42 | "num_local_threads" : 20, 43 | "num_data_parts" : 400, 44 | "batch_size" : 800, 45 | "alpha" : 0.001, 46 | "num_iter" : 10, 47 | "staleness" : 0, 48 | "is_sparse" : False, 49 | "is_sgd" : False, 50 | # to make FT work, do not use kShuffleCombine, 51 | # to make it fast, use kShuffleCombine 52 | "combine_type" : "kDirectCombine", 53 | "max_lines_per_part" : -1, 54 | } 55 | 56 | # choose one of them 57 | # program_params.update(webspam_params) 58 | # program_params.update(a9_params) 59 | program_params.update(avazu_params) 60 | 61 | if program_params["is_sparse"]: 62 | progfile = "release/SparseLRExample" 63 | 64 | scheduler_params = { 65 | "dag_runner_type" : "sequential", 66 | } 67 | 68 | env_params = ( 69 | "GLOG_logtostderr=true " 70 | "GLOG_v=-1 " 71 | "GLOG_minloglevel=0 " 72 | # this is to enable hdfs short-circuit read (disable the warning info) 73 | # change this path accordingly when we use other cluster 74 | # the current setting is for proj5-10 75 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 76 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 77 | ) 78 | 79 | dump_core = False 80 | l = Launcher(schedulerfile, progfile, hostfile, 81 | common_params, scheduler_params, program_params, env_params, 82 | dump_core) 83 | 84 | l.Launch(sys.argv) 85 | -------------------------------------------------------------------------------- /scripts/mailbox_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launch_utils import launch_util 5 | 6 | hostfile = "machinefiles/5nodes" 7 | progfile = "debug/WorkerExample" 8 | schedulerfile = "debug/SchedulerExample" 9 | 10 | common_params = { 11 | #"hdfs_namenode" : "proj10", 12 | #"hdfs_namenode_port" : 9000, 13 | #"input" : "hdfs:///jasper/kdd12", 14 | "scheduler" : "proj10", 15 | "scheduler_port" : "33254", 16 | } 17 | 18 | program_params = { 19 | } 20 | 21 | scheduler_params = { 22 | } 23 | 24 | env_params = ( 25 | "GLOG_logtostderr=true " 26 | "GLOG_v=-1 " 27 | "GLOG_minloglevel=0 " 28 | # this is to enable hdfs short-circuit read (disable the warning info) 29 | # change this path accordingly when we use other cluster 30 | # the current setting is for proj5-10 31 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 32 | ) 33 | 34 | dump_core = False 35 | launch_util(schedulerfile, progfile, hostfile, env_params, 36 | common_params, scheduler_params, program_params, sys.argv, dump_core) 37 | -------------------------------------------------------------------------------- /scripts/pagerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/5nodes" 7 | progfile = "debug/PageRank" 8 | schedulerfile = "debug/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj10", 12 | "scheduler_port" : "33224", 13 | "hdfs_namenode" : "proj10", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | "url" : "/datasets/graph/google-adj", 19 | "num_local_threads" : 20, 20 | } 21 | 22 | scheduler_params = { 23 | "dag_runner_type" : "sequential", 24 | } 25 | 26 | env_params = ( 27 | "GLOG_logtostderr=true " 28 | "GLOG_v=-1 " 29 | "GLOG_minloglevel=0 " 30 | # this is to enable hdfs short-circuit read (disable the warning info) 31 | # change this path accordingly when we use other 
cluster 32 | # the current setting is for proj5-10 33 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 34 | ) 35 | 36 | dump_core = False 37 | l = Launcher(schedulerfile, progfile, hostfile, 38 | common_params, scheduler_params, program_params, env_params, 39 | dump_core) 40 | 41 | l.Launch(sys.argv) 42 | -------------------------------------------------------------------------------- /scripts/pagerank99.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | progfile = "release/PageRank" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33224", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | # "url" : "/datasets/graph/webbase-adj", 19 | # "url" : "/datasets/graph/google-adj", 20 | "url" : "/datasets/graph/webuk-adj", 21 | "num_local_threads" : 20, 22 | "num_parts" : 100, 23 | # "combine_type": "kShuffleCombine", 24 | "combine_type": "kDirectCombine", 25 | } 26 | 27 | scheduler_params = { 28 | "dag_runner_type" : "sequential", 29 | } 30 | 31 | env_params = ( 32 | "GLOG_logtostderr=true " 33 | "GLOG_v=-1 " 34 | "GLOG_minloglevel=0 " 35 | # this is to enable hdfs short-circuit read (disable the warning info) 36 | # change this path accordingly when we use other cluster 37 | # the current setting is for proj5-10 38 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 39 | ) 40 | 41 | dump_core = False 42 | l = Launcher(schedulerfile, progfile, hostfile, 43 | common_params, scheduler_params, program_params, env_params, 44 | dump_core) 45 | 46 | l.Launch(sys.argv) 47 | -------------------------------------------------------------------------------- /scripts/pagerank_with.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/5nodes" 7 | progfile = "debug/PageRankWith" 8 | schedulerfile = "debug/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj10", 12 | "scheduler_port" : "33224", 13 | "hdfs_namenode" : "proj10", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | "url" : "/datasets/graph/google-adj", 19 | "num_local_threads" : 20, 20 | "num_parts" : 100, 21 | "combine_type": "kDirectCombine", 22 | } 23 | 24 | scheduler_params = { 25 | "dag_runner_type" : "sequential", 26 | } 27 | 28 | env_params = ( 29 | "GLOG_logtostderr=true " 30 | "GLOG_v=-1 " 31 | "GLOG_minloglevel=0 " 32 | # this is to enable hdfs short-circuit read (disable the warning info) 33 | # change this path accordingly when we use other cluster 34 | # the current setting is for proj5-10 35 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 36 | 37 | # turn on this to run in w1-20 38 | # "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 39 | ) 40 | 41 | dump_core = False 42 | l = Launcher(schedulerfile, progfile, hostfile, 43 | common_params, scheduler_params, program_params, env_params, 44 | dump_core) 45 | 46 | l.Launch(sys.argv) 47 | -------------------------------------------------------------------------------- /scripts/pagerank_with99.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = 
"machinefiles/20nodes" 7 | progfile = "release/PageRankWith" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33324", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | # "url" : "/datasets/graph/webbase-adj", 19 | # "url" : "/datasets/graph/google-adj", 20 | "url" : "/datasets/graph/webuk-adj", 21 | "num_local_threads" : 20, 22 | "num_parts" : 100, 23 | # "combine_type": "kShuffleCombine", 24 | "combine_type": "kDirectCombine", 25 | } 26 | 27 | scheduler_params = { 28 | "dag_runner_type" : "sequential", 29 | } 30 | 31 | env_params = ( 32 | "GLOG_logtostderr=true " 33 | "GLOG_v=-1 " 34 | "GLOG_minloglevel=0 " 35 | # this is to enable hdfs short-circuit read (disable the warning info) 36 | # change this path accordingly when we use other cluster 37 | # the current setting is for proj5-10 38 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 39 | ) 40 | 41 | dump_core = False 42 | l = Launcher(schedulerfile, progfile, hostfile, 43 | common_params, scheduler_params, program_params, env_params, 44 | dump_core) 45 | 46 | l.Launch(sys.argv) 47 | -------------------------------------------------------------------------------- /scripts/sssp99.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | progfile = "release/SSSP" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33227", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | # "url" : "/datasets/graph/webbase-adj", 19 | "url" : "/datasets/graph/google-adj", 20 | #"url" : "/datasets/graph/webuk-adj", 21 | "num_local_threads" : 20, 22 | "num_parts" : 400, 23 | "sourceID" : 42, 24 | "iteration" : 100, 25 | "display" : False, 26 | # "combine_type": "kShuffleCombine", 27 | "combine_type": "kDirectCombine", 28 | } 29 | 30 | scheduler_params = { 31 | "dag_runner_type" : "sequential", 32 | } 33 | 34 | env_params = ( 35 | "GLOG_logtostderr=true " 36 | "GLOG_v=-1 " 37 | "GLOG_minloglevel=0 " 38 | # this is to enable hdfs short-circuit read (disable the warning info) 39 | # change this path accordingly when we use other cluster 40 | # the current setting is for proj5-10 41 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 42 | ) 43 | 44 | dump_core = False 45 | l = Launcher(schedulerfile, progfile, hostfile, 46 | common_params, scheduler_params, program_params, env_params, 47 | dump_core) 48 | 49 | l.Launch(sys.argv) 50 | -------------------------------------------------------------------------------- /scripts/tfidf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/5nodes" 7 | progfile = "release/TFIDF" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj10", 12 | "scheduler_port" : "33254", 13 | "hdfs_namenode" : "proj10", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | # "url" : "/datasets/corpus/enwiki/wiki_0", 19 | "url" : "/datasets/corpus/enwiki", 20 | "num_local_threads" : 20, 21 | "num_of_docs" : 10000, 22 | "num_doc_partition" : 10, 23 | "num_term_partition" : 10, 24 | } 25 | 26 | scheduler_params = { 27 | 
"dag_runner_type" : "sequential", 28 | } 29 | 30 | env_params = ( 31 | "GLOG_logtostderr=true " 32 | "GLOG_v=-1 " 33 | "GLOG_minloglevel=0 " 34 | # this is to enable hdfs short-circuit read (disable the warning info) 35 | # change this path accordingly when we use other cluster 36 | # the current setting is for proj5-10 37 | "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 38 | ) 39 | 40 | dump_core = False 41 | l = Launcher(schedulerfile, progfile, hostfile, 42 | common_params, scheduler_params, program_params, env_params, 43 | dump_core) 44 | 45 | l.Launch(sys.argv) 46 | -------------------------------------------------------------------------------- /scripts/tfidf_lr99.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from launcher import Launcher 5 | 6 | hostfile = "machinefiles/20nodes" 7 | progfile = "release/TFIDF_LR" 8 | schedulerfile = "release/SchedulerMain" 9 | 10 | common_params = { 11 | "scheduler" : "proj99", 12 | "scheduler_port" : "33254", 13 | "hdfs_namenode" : "proj99", 14 | "hdfs_port" : 9000, 15 | } 16 | 17 | program_params = { 18 | # "url" : "/datasets/corpus/enwiki-21g/wiki_0", 19 | #"url" : "/datasets/corpus/enwiki", 20 | # "url" : "/datasets/corpus/enwiki-21g", 21 | "url" : "/datasets/corpus/enwiki-50g-oneline", 22 | # "url" : "/datasets/corpus/enwiki200g", 23 | "num_local_threads" : 20, 24 | "num_of_docs" : 10000, 25 | "num_doc_partition" : 1000, 26 | "num_term_partition" : 100, 27 | # "num_params" : 23695351, 28 | # "num_params" : 1000, 29 | "num_params" : 262144, 30 | "num_iter" : 20, 31 | "is_sgd" : False, 32 | "staleness" : 0, 33 | "combine_type" : "kShuffleCombine", 34 | "num_param_per_part" : 2369, 35 | } 36 | 37 | scheduler_params = { 38 | "dag_runner_type" : "sequential", 39 | } 40 | 41 | env_params = ( 42 | "GLOG_logtostderr=true " 43 | "GLOG_v=-1 " 44 | "GLOG_minloglevel=0 " 45 | # this is to enable hdfs short-circuit read (disable the warning info) 46 | # change this path accordingly when we use other cluster 47 | # the current setting is for proj5-10 48 | # "LIBHDFS3_CONF=/data/opt/course/hadoop/etc/hadoop/hdfs-site.xml" 49 | "LIBHDFS3_CONF=/data/opt/hadoop-2.6.0/etc/hadoop/hdfs-site.xml" 50 | ) 51 | 52 | dump_core = False 53 | l = Launcher(schedulerfile, progfile, hostfile, 54 | common_params, scheduler_params, program_params, env_params, 55 | dump_core) 56 | 57 | l.Launch(sys.argv) 58 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${PROJECT_SOURCE_DIR} ${HUSKY_EXTERNAL_INCLUDE}) 2 | 3 | # Unit Test 4 | file(GLOB_RECURSE UNITTEST_FILES ${PROJECT_SOURCE_DIR}/*_test.cpp) 5 | include_directories(${GTEST_INCLUDE}) 6 | add_executable(HuskyUnitTest ${UNITTEST_FILES} test_main.cpp) 7 | add_dependencies(HuskyUnitTest gtest) 8 | target_link_libraries(HuskyUnitTest xyz) 9 | target_link_libraries(HuskyUnitTest ${HUSKY_EXTERNAL_LIB}) 10 | target_link_libraries(HuskyUnitTest ${GTEST_LIBRARIES}) 11 | target_link_libraries(HuskyUnitTest ${GMOCK_LIBRARIES}) 12 | set_property(TARGET HuskyUnitTest PROPERTY CXX_STANDARD 11) 13 | add_dependencies(HuskyUnitTest ${external_project_dependencies}) 14 | -------------------------------------------------------------------------------- /test/test_main.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Husky Team 2 | 
// 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "glog/logging.h" 16 | #include "gtest/gtest.h" 17 | 18 | GTEST_API_ int main(int argc, char** argv) { 19 | testing::InitGoogleTest(&argc, argv); 20 | google::InitGoogleLogging(argv[0]); 21 | return RUN_ALL_TESTS(); 22 | } 23 | -------------------------------------------------------------------------------- /utils/busy.cpp: -------------------------------------------------------------------------------- 1 | #include <cassert> 2 | #include <cstdlib> 3 | #include <iostream> 4 | #include <thread> 5 | #include <vector> 6 | 7 | int main(int argc, char* argv[]) { 8 | assert(argc == 2); 9 | const int num_threads = atoi(argv[1]); 10 | assert(num_threads > 0); 11 | assert(num_threads <= 100); 12 | std::cout << "num threads: " << num_threads << std::endl; 13 | std::vector<std::thread> v; 14 | for (int i = 0; i < num_threads; ++ i) { 15 | v.push_back(std::thread([]() { 16 | while (1) { ; } 17 | })); 18 | } 19 | for (auto& th : v) { 20 | th.join(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /utils/compile.sh: -------------------------------------------------------------------------------- 1 | g++ -std=c++11 busy.cpp -pthread -o busy 2 | --------------------------------------------------------------------------------