├── .circleci └── config.yml ├── .gcloudignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── cloudbuild.yaml ├── cluster ├── sys.config └── vm.args ├── config ├── prod_sys.config ├── prod_vm.args ├── proper.config ├── shared.config ├── sys.config ├── test.config └── vm.args ├── docs ├── design.md └── implementation.md ├── helm ├── README.md ├── storage-class.yaml └── vonnegut │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── config_map.yaml │ ├── service.yaml │ └── stateful-set.yaml │ └── values.yaml ├── include └── vg.hrl ├── rebar.config ├── rebar.lock ├── src ├── vg.erl ├── vg_active_segment.erl ├── vg_chain_state.erl ├── vg_cleaner.erl ├── vg_client.erl ├── vg_client_pool.erl ├── vg_cluster_mgr.erl ├── vg_config.erl ├── vg_conn.erl ├── vg_elli_handler.erl ├── vg_index.erl ├── vg_log_segments.erl ├── vg_peer_service.erl ├── vg_pool.erl ├── vg_pool_sup.erl ├── vg_protocol.erl ├── vg_socket.erl ├── vg_topic_mgr.erl ├── vg_topic_sup.erl ├── vg_topics.erl ├── vg_topics_sup.erl ├── vg_utils.erl ├── vonnegut.app.src ├── vonnegut_app.erl └── vonnegut_sup.erl └── test ├── cleanup_SUITE.erl ├── kafka_client_SUITE.erl ├── log_roll_SUITE.erl ├── prop_vg.erl ├── protocol_SUITE.erl ├── test_utils.hrl ├── topic_SUITE.erl ├── vg_consumer_SUITE.erl ├── vg_statem.erl ├── vg_test_utils.erl └── z_cluster_SUITE.erl /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | defaults: &defaults 2 | shell: /bin/bash 3 | working_directory: /home/circleci/vonnegut 4 | docker: 5 | - image: tsloughter/erlang-alpine:21.0.5 6 | 7 | version: 2 8 | jobs: 9 | build: 10 | <<: *defaults 11 | steps: 12 | - checkout 13 | 14 | - restore_cache: 15 | keys: 16 | - vonnegut-{{ checksum "rebar.lock" }} 17 | - vonnegut-hex-packages 18 | 19 | - run: 20 | command: rebar3 compile 21 | 22 | - store_artifacts: 23 | path: /home/circleci/vonnegut/rebar3.crashdump 24 | destination: rebar3_crashdump.txt 25 | 
when: on_fail 26 | 27 | - save_cache: 28 | key: vonnegut-{{ checksum "rebar.lock" }} 29 | paths: 30 | - /home/circleci/vonnegut/_build/default/lib 31 | - /home/circleci/vonnegut/_build/default/plugins 32 | 33 | - save_cache: 34 | key: vonnegut-hex-packages 35 | paths: 36 | - /root/.cache/rebar3/hex/default/packages 37 | 38 | dialyzer: 39 | <<: *defaults 40 | steps: 41 | - checkout 42 | 43 | - attach_workspace: 44 | at: /home/circleci/vonnegut 45 | 46 | - restore_cache: 47 | keys: 48 | - erlang-plt-21.0.5 49 | 50 | - restore_cache: 51 | keys: 52 | - vonnegut-{{ checksum "rebar.lock" }} 53 | - vonnegut-hex-packages 54 | 55 | - run: 56 | command: rebar3 dialyzer 57 | 58 | - save_cache: 59 | key: erlang-plt-21.0.5 60 | paths: 61 | - /root/.cache/rebar3/rebar3_21.0.5_plt 62 | 63 | xref: 64 | <<: *defaults 65 | steps: 66 | - checkout 67 | 68 | - attach_workspace: 69 | at: /home/circleci/vonnegut 70 | 71 | - restore_cache: 72 | keys: 73 | - vonnegut-{{ checksum "rebar.lock" }} 74 | - vonnegut-hex-packages 75 | 76 | - run: 77 | command: rebar3 xref 78 | 79 | tests: 80 | <<: *defaults 81 | steps: 82 | - checkout 83 | 84 | - attach_workspace: 85 | at: /home/circleci/vonnegut 86 | 87 | - restore_cache: 88 | keys: 89 | - vonnegut-{{ checksum "rebar.lock" }} 90 | - vonnegut-hex-packages 91 | 92 | - run: 93 | command: | 94 | set -eux 95 | epmd -daemon 96 | rebar3 do ct --name=testrunner@127.0.0.1, cover 97 | rebar3 covertool generate 98 | apk add --update python python-dev py-pip 99 | pip install codecov && codecov -f _build/test/covertool/vonnegut.covertool.xml 100 | 101 | - store_test_results: 102 | path: /home/circleci/vonnegut/_build/test/logs/ 103 | 104 | - store_artifacts: 105 | path: /home/circleci/vonnegut/_build/test/logs 106 | destination: common_test 107 | 108 | - store_artifacts: 109 | path: /home/circleci/vonnegut/rebar3.crashdump 110 | destination: rebar3_crashdump.txt 111 | when: on_fail 112 | 113 | workflows: 114 | version: 2 115 | build_and_test: 116 | jobs: 117 
| - build 118 | - dialyzer: 119 | requires: 120 | - build 121 | - xref: 122 | requires: 123 | - build 124 | - tests: 125 | requires: 126 | - build 127 | -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | _build 11 | ebin 12 | _checkouts 13 | # If you would like to upload your .git directory, .gitignore file or files 14 | # from your .gitignore file, remove the corresponding line 15 | # below: 16 | #.git 17 | #.gitignore 18 | !include:.gitignore 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .rebar3 2 | data 3 | _* 4 | .eunit 5 | *.o 6 | *.beam 7 | *.plt 8 | *.swp 9 | *.swo 10 | .erlang.cookie 11 | ebin 12 | log 13 | erl_crash.dump 14 | .rebar 15 | _rel 16 | _deps 17 | _plugins 18 | _tdeps 19 | logs 20 | _build 21 | rebar3.crashdump 22 | vonnegut.tar.gz 23 | properdata/ 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM erlang:21.0.5-alpine as builder 2 | 3 | RUN apk add --no-cache --update tar curl git bash make libc-dev gcc g++ vim 4 | 5 | RUN set -xe \ 6 | && curl -fSL -o rebar3 "https://s3.amazonaws.com/rebar3-nightly/rebar3" \ 7 | && chmod +x ./rebar3 \ 8 | && ./rebar3 local install \ 9 | && rm ./rebar3 10 | 11 | ENV PATH "$PATH:/root/.cache/rebar3/bin" 12 | 13 | WORKDIR /usr/src/app 14 | COPY . 
/usr/src/app 15 | 16 | RUN rebar3 as prod tar 17 | 18 | RUN mkdir -p /opt/rel 19 | RUN tar -zxvf /usr/src/app/_build/prod/rel/*/*.tar.gz -C /opt/rel 20 | 21 | FROM alpine:3.8 22 | 23 | RUN apk add --no-cache openssl-dev ncurses 24 | 25 | WORKDIR /opt/vonnegut 26 | 27 | ENV RELX_REPLACE_OS_VARS true 28 | ENV NODE 127.0.0.1 29 | ENV COOKIE vonnegut 30 | ENV CHAIN_NAME chain1 31 | ENV REPLICAS 1 32 | ENV PEER_IP 127.0.0.1 33 | ENV DISCOVERY_DOMAIN local 34 | 35 | COPY --from=builder /opt/rel /opt/vonnegut 36 | 37 | EXPOSE 5555 5555 38 | 39 | ENTRYPOINT ["/opt/vonnegut/bin/vonnegut"] 40 | 41 | CMD ["foreground"] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | vonnegut 2 | ===== 3 | 4 | [![CircleCI](https://circleci.com/gh/SpaceTime-IoT/vonnegut.svg?style=svg)](https://circleci.com/gh/SpaceTime-IoT/vonnegut) 5 | 6 | [![codecov](https://codecov.io/gh/SpaceTime-IoT/vonnegut/branch/master/graph/badge.svg)](https://codecov.io/gh/SpaceTime-IoT/vonnegut) 7 | 8 | Vonnegut is an append-only log that follows the file format and API of Kafka 1.0. The server can be run standalone, with 1 or more chains each with 1 or more replicas, or as part of another Erlang release which can talk to it directly. 9 | 10 | Each chain is responsible for a range of the topic space. A read or write to a topic requires finding what chain the topic belongs to and then making a request to the head, in the case of a write, or the tail, in the case of a read. 11 | 12 | Configuration 13 | ----- 14 | 15 | ### Server 16 | 17 | A node in a chain can discover other nodes within the chain through DNS SRV record queries. The `replicas` configuration tells the vonnegut node how many other nodes it needs to connect to in order to form the required chain length to ack writes. 18 | 19 | ``` 20 | {vonnegut, [{chain, [{name, "chain-1"}, 21 | {discovery, {srv, "chain-1.service.cluster.local"}}, 22 | {replicas, "2"}, 23 | {port, 5555}]} 24 | ]} 25 | ``` 26 | 27 | ### Client 28 | 29 | Clients start a pool of connections to the head and tail of each chain. Chains are found through DNS queries against endpoints: 30 | 31 | ``` 32 | {vonnegut, [{client, [{endpoints, [{"chain-1.service.cluster.local", 5555}]}]}]} 33 | ``` 34 | 35 | Erlang Interface 36 | --- 37 | 38 | A local interface can be used to create, read and write topics. 39 | 40 | ```shell 41 | $ rebar3 shell 42 | 1> vg:create_topic(<<"test_topic">>). 
43 | 2> vg:write(<<"test_topic">>, [<<"some log message">>, <<"more log message">>]). 44 | 3> vg:fetch(<<"test_topic">>). 45 | {ok,#{high_water_mark => 1,partition => 0, 46 | record_batches => 47 | [#{headers => [],key => <<>>,offset => 1,sequence_number => 1, 48 | timestamp => 1517613646458,value => <<"more log message">>}, 49 | #{headers => [],key => <<>>,offset => 0,sequence_number => 0, 50 | timestamp => 1517613646458, 51 | value => <<"some log message">>}]}} 52 | ``` 53 | 54 | By default index and log files will be written to `./data`: 55 | 56 | ```shell 57 | $ ls data/test_topic-0/ 58 | 00000000000000000000.index 00000000000000000000.log 59 | ``` 60 | 61 | Kafkaesque Client 62 | --- 63 | 64 | ```erlang 65 | $ rebar3 shell 66 | 1> vg_client_pool:start(). 67 | ok 68 | 2> vg_client:produce(<<"my-topic-2">>, [<<"message 1">>, <<"message 2">>]). 69 | {ok,1} 70 | 3> vg_client:fetch(<<"my-topic-2">>). 71 | {ok,#{<<"my-topic-2">> => 72 | #{0 => 73 | #{error_code => 0,high_water_mark => 1, 74 | record_batches => 75 | [#{headers => [],key => <<>>,offset => 1, 76 | sequence_number => 1,timestamp => 1517616861441, 77 | value => <<"message 2">>}, 78 | #{headers => [],key => <<>>,offset => 0, 79 | sequence_number => 0,timestamp => 1517616861441, 80 | value => <<"message 1">>}], 81 | record_batches_size => 95}}}} 82 | ``` 83 | 84 | Running Tests 85 | ----- 86 | 87 | The tests require opening thousands of files and so may require increasing the limit per process on your system with: 88 | 89 | ```shell 90 | $ ulimit -n 63536 91 | ``` 92 | 93 | Tests also require a nodename: 94 | 95 | ```shell 96 | $ rebar3 ct 97 | ``` 98 | -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/docker' 3 | entrypoint: 'sh' 4 | args: ['-c', 'docker build -t us.gcr.io/$PROJECT_ID/vonnegut:$$(git describe --abbrev=4 HEAD 
--tags) .'] 5 | images: ['us.gcr.io/nucleus-sti/vonnegut'] 6 | -------------------------------------------------------------------------------- /cluster/sys.config: -------------------------------------------------------------------------------- 1 | %% -*- erlang -*- 2 | [{vonnegut, [{chain, [{name, chain1}, 3 | {discovery, {direct, [{'chain1-0', "127.0.0.1", 15555, 5588}, 4 | {'chain1-1', "127.0.0.1", 15556, 5589}, 5 | {'chain1-2', "127.0.0.1", 15557, 5590}]}}, 6 | {replicas, 3}]}, 7 | 8 | {client, [{endpoints, [{"127.0.0.1", 5588}]}]} 9 | %% {log_dirs, ["${LOG_DIR}"]} 10 | ]}, 11 | 12 | {partisan, [{peer_ip, {127,0,0,1}}, 13 | {partisan_peer_service_manager, 14 | partisan_default_peer_service_manager}]}, 15 | 16 | {kernel, [{start_time, true}]}, 17 | 18 | {opencensus, [{reporter, {oc_reporter_noop, #{project => <<"nucleus-sti">>, 19 | service_account => <<"default">>}}}]}, 20 | 21 | {lager, 22 | [{error_logger_hwm, 100}, 23 | {crash_log_count, 5}, 24 | {crash_log_date, "$D0"}, 25 | {crash_log_size, 10485760}, 26 | {crash_log_msg_size, 65536}, 27 | {crash_log, "./log/crash.log"}, 28 | {handlers, 29 | [{lager_console_backend, [{level, info}, 30 | {formatter, lager_default_formatter}, 31 | {formatter_config, 32 | [time, color, " [",severity,"] ", 33 | pid, " ", 34 | "mod=", module, 35 | " fun=", function, " ", message, "\e[0m\r\n"]}]}, 36 | {lager_file_backend, 37 | [{file, "./log/error.log"}, 38 | {level, error}, 39 | {formatter, lager_default_formatter}, 40 | {formatter_config, 41 | [time, color, " [",severity,"] ", 42 | pid, " ", 43 | "mod=", module, 44 | " fun=", function, " ", message, "\e[0m\r\n"]}, 45 | {size, 10485760}, 46 | {date, "$D0"}, 47 | {count, 5}]}, 48 | {lager_file_backend, 49 | [{file, "./log/debug.log"}, 50 | {level, debug}, 51 | {formatter, lager_default_formatter}, 52 | {formatter_config, 53 | [time, color, " [",severity,"] ", 54 | pid, " ", 55 | "mod=", module, 56 | " fun=", function, " ", message, "\e[0m\r\n"]}, 57 | {size, 10485760}, 58 | 
{date, "$D0"}, 59 | {count, 5}]}]}, 60 | {error_logger_redirect, true}]}, 61 | 62 | "../../../../config/shared" 63 | ]. 64 | -------------------------------------------------------------------------------- /cluster/vm.args: -------------------------------------------------------------------------------- 1 | -name ${NODE} 2 | 3 | -setcookie vonnegut 4 | 5 | +A 100 6 | +K true 7 | 8 | -partisan peer_port ${PEER_PORT} 9 | -------------------------------------------------------------------------------- /config/prod_sys.config: -------------------------------------------------------------------------------- 1 | [{vonnegut, [{chain, [{name, "${CHAIN_NAME}"}, 2 | 3 | {discovery, {srv, "${DISCOVERY_DOMAIN}"}}, 4 | 5 | %% with direct we do not need to set the # replicas expected 6 | {replicas, "${REPLICAS}"}, 7 | {port, 5588}]} 8 | ]}, 9 | 10 | {partisan, [{peer_ip, "${PEER_IP}"}, 11 | {peer_port, 10200}, 12 | {partisan_peer_service_manager, 13 | partisan_default_peer_service_manager}]}, 14 | 15 | {kernel, [{start_time, true}]}, 16 | 17 | {opencensus, [{reporter, {oc_reporter_noop, #{project => <<"nucleus-sti">>, 18 | service_account => <<"default">>}}}]}, 19 | 20 | {lager, [{error_logger_redirect, true}, 21 | {handlers, 22 | [{lager_console_backend, 23 | [{level, info}, 24 | {formatter, lager_default_formatter}, 25 | {formatter_config, 26 | [time, " [",severity,"] ", 27 | pid, " ", 28 | {module, ["mod=", module, {function, [" fun=", function], ""}, " "], ""}, message, "\n"]}]}]}]}, 29 | 30 | "config/shared" 31 | ]. 
32 | -------------------------------------------------------------------------------- /config/prod_vm.args: -------------------------------------------------------------------------------- 1 | -name vonnegut@${NODE} 2 | 3 | -setcookie ${COOKIE} 4 | 5 | +A 100 6 | +K true 7 | -------------------------------------------------------------------------------- /config/proper.config: -------------------------------------------------------------------------------- 1 | %% -*- erlang -*- 2 | [{vonnegut, [{chain, [{name, chain1}, 3 | %% {nodename, host, data-port, partisan-port} 4 | {discovery, local}, %% {direct, [{'chain1-0', "127.0.0.1", 10200, 5555}, 5 | %% {'chain1-1', "127.0.0.1", 10201, 5556}, 6 | %% {'chain1-2', "127.0.0.1", 10202, 5557}]}}, 7 | 8 | %% with direct we do not need to set the # replicas expected 9 | %% {replicas, "2"} 10 | 11 | {port, 5588}]}, 12 | {segment_bytes, 1024}, 13 | {index_max_bytes, 128}, 14 | {index_interval_bytes, 256}, 15 | 16 | %% client config for if we want to use only the vonnegut client 17 | {client, [{endpoint, [{"127.0.0.1", 5588}]}]} 18 | ]}, 19 | 20 | {partisan, [{peer_ip, {127,0,0,1}}, 21 | {peer_port, 10200}, 22 | {partisan_peer_service_manager, 23 | partisan_default_peer_service_manager}]}, 24 | 25 | {kernel, [{start_time, true}]}, 26 | 27 | {opencensus, [{reporter, {oc_noop_reporter, #{project => <<"nucleus-sti">>, 28 | service_account => <<"default">>}}}]}, 29 | 30 | {sasl, [{sasl_error_logger, false}]}, 31 | 32 | {lager, [{handlers, 33 | [{lager_console_backend, 34 | [{level, info}, 35 | {formatter, lager_default_formatter}, 36 | {formatter_config, 37 | [time, color, " [",severity,"] ", 38 | pid, " ", 39 | "mod=", module, 40 | " fun=", function, " ", message, "\e[0m\r\n"]}]}]} 41 | ]}, 42 | 43 | "config/shared" 44 | ]. 
45 | -------------------------------------------------------------------------------- /config/shared.config: -------------------------------------------------------------------------------- 1 | [ 2 | {sasl, [{sasl_error_logger, false}]}, 3 | 4 | {prometheus, [{collectors, [default]}, 5 | {default_metrics, 6 | [{gauge, [{name, active_topics}, 7 | {labels, []}, 8 | {help, "number of active topic processes"}, 9 | {registry, default}]}, 10 | {gauge, [{name, log_segments}, 11 | {labels, [topic]}, 12 | {help, "number of log segments for topics"}, 13 | {registry, default}]}, 14 | {gauge, [{name, open_connections}, 15 | {labels, []}, 16 | {help, "number of open connections through the client"}, 17 | {registry, default}]}, 18 | {gauge, [{name, replicas}, 19 | {labels, []}, 20 | {help, "number of replicas in this chain"}, 21 | {registry, default}]}, 22 | {gauge, [{name, chains}, 23 | {labels, []}, 24 | {help, "number of chains in the cluster"}, 25 | {registry, default}]}, 26 | %% this is good to know but doesn't make sense right now 27 | %% {gauge, [{name, pending_write_repairs}, 28 | %% {labels, []}, 29 | %% {help, "number of records that have been repaired on this node"}, 30 | %% {registry, default}]}, 31 | 32 | %% same as active_topics right now 33 | %% {gauge, [{name, topics}, 34 | %% {labels, [chain]}, 35 | %% {help, "number of topics in a chain"}, 36 | %% {registry, default}]}, 37 | 38 | {boolean, [{name, is_active}, 39 | {labels, []}, 40 | {help, "is this brick active in the chain"}, 41 | {registry, default}]}, 42 | {boolean, [{name, is_solo}, 43 | {labels, []}, 44 | {help, "is this chain a single node"}, 45 | {registry, default}]}, 46 | {boolean, [{name, is_head}, 47 | {labels, []}, 48 | {help, "is this brick the head of a chain"}, 49 | {registry, default}]}, 50 | {boolean, [{name, is_middle}, 51 | {labels, []}, 52 | {help, "is this brick in the middle of a chain"}, 53 | {registry, default}]}, 54 | {boolean, [{name, is_tail}, 55 | {labels, []}, 56 | {help, "is this 
brick the tail of a chain"}, 57 | {registry, default}]}, 58 | 59 | {counter, [{name, write_repairs}, 60 | {labels, []}, 61 | {help, "number of write repairs"}, 62 | {registry, default}]}, 63 | {counter, [{name, client_requests}, 64 | {labels, []}, 65 | {help, "requests count"}, 66 | {registry, default}]}]}]} 67 | ]. 68 | -------------------------------------------------------------------------------- /config/sys.config: -------------------------------------------------------------------------------- 1 | %% -*- erlang -*- 2 | [{vonnegut, [{chain, [{name, chain1}, 3 | %% {nodename, host, data-port, partisan-port} 4 | {discovery, local}, %% {direct, [{'chain1-0', "127.0.0.1", 10200, 5555}, 5 | %% {'chain1-1', "127.0.0.1", 10201, 5556}, 6 | %% {'chain1-2', "127.0.0.1", 10202, 5557}]}}, 7 | 8 | %% with direct we do not need to set the # replicas expected 9 | %% {replicas, "2"} 10 | 11 | {port, 5588}]}, 12 | 13 | %% client config for if we want to use only the vonnegut client 14 | {client, [{endpoints, [{"127.0.0.1", 5588}]}]} 15 | ]}, 16 | 17 | {partisan, [{peer_ip, {127,0,0,1}}, 18 | {peer_port, 10200}, 19 | {partisan_peer_service_manager, 20 | partisan_default_peer_service_manager}]}, 21 | 22 | {kernel, [{start_time, true}]}, 23 | 24 | {opencensus, [{reporter, {oc_reporter_noop, #{project => <<"nucleus-sti">>, 25 | service_account => <<"default">>}}}]}, 26 | 27 | {lager, [{error_logger_redirect, true}, 28 | %% {suppress_application_start_stop, true}, 29 | %% {suppress_supervisor_start_stop, true}, 30 | {handlers, 31 | [{lager_console_backend, 32 | [{level, info}, 33 | {formatter, lager_default_formatter}, 34 | {formatter_config, 35 | [time, " [",severity,"] ", 36 | pid, " ", 37 | {module, ["mod=", module, {function, [" fun=", function], ""}, " "], ""}, message, "\n"]}]}]}]}, 38 | 39 | "config/shared" 40 | ]. 
41 | -------------------------------------------------------------------------------- /config/test.config: -------------------------------------------------------------------------------- 1 | %% -*- erlang -*- 2 | [{lager, 3 | [{error_logger_hwm, 100}, 4 | {crash_log_count, 5}, 5 | {crash_log_date, "$D0"}, 6 | {crash_log_size, 10485760}, 7 | {crash_log_msg_size, 65536}, 8 | {crash_log, "./log/crash.log"}, 9 | {handlers, 10 | [{lager_console_backend, 11 | [{level, info}, 12 | {formatter, lager_default_formatter}, 13 | {formatter_config, 14 | [time, " [",severity,"] ", 15 | pid, " ", 16 | {module, ["mod=", module, {function, [" fun=", function], ""}, " "], ""}, message, "\n"]}]}, 17 | {lager_file_backend, 18 | [{file, "./log/error.log"}, 19 | {level, error}, 20 | {formatter, lager_default_formatter}, 21 | {formatter_config, 22 | [time, " [",severity,"] ", 23 | pid, " ", 24 | {module, ["mod=", module, {function, [" fun=", function], ""}, " "], ""}, message, "\n"]}]}, 25 | {lager_file_backend, 26 | [{file, "./log/debug.log"}, 27 | {level, debug}, 28 | {formatter, lager_default_formatter}, 29 | {formatter_config, 30 | [time, " [",severity,"] ", 31 | pid, " ", 32 | {module, ["mod=", module, {function, [" fun=", function], ""}, " "], ""}, message, "\n"]}]}]}, 33 | {error_logger_redirect, true}]}, 34 | 35 | 36 | "config/shared" 37 | ]. 
38 | -------------------------------------------------------------------------------- /config/vm.args: -------------------------------------------------------------------------------- 1 | -name chain1-0@127.0.0.1 2 | 3 | -setcookie vonnegut 4 | 5 | +A 100 6 | +K true 7 | -------------------------------------------------------------------------------- /docs/design.md: -------------------------------------------------------------------------------- 1 | Vonnegut Design Doc (1st Iteration) 2 | ----------------------------------------- 3 | 4 | Vonnegut is an append only replicated log utilizing Kubernetes Stateful Sets for consistency and resource utilization. 5 | 6 | ## Log 7 | 8 | Append only ordered sequence of records made up of multiple log segment files stored on disk. 9 | 10 | ## Log Segment 11 | 12 | A log segment is a file with the name of its first contained log record id. The `active` log segment is the newest and the only one that has writes appended to it. When it or the corresponding index becomes too large a new active log segment is created. The index allows a reader to quickly find the start position of a record by id within a log segment. 13 | 14 | ## Chains 15 | 16 | Chains consist of `N` vonnegut nodes, the first node in the chain is the `head` and the last is the `tail`. If `N=1` then these are the same and no replication occurs. 17 | 18 | All writes are sent to the `head` of a chain, all reads occur on the `tail`. Dirty reads or historical reads (reads on data only in inactive log segments) can occur on any node in the chain. 19 | 20 | ## (not really) Virtual Nodes 21 | 22 | Initial work will be on vonnegut nodes and virtual nodes having a 1 to 1 mapping. Meaning vonnegut nodes only take part in a single chain. Increasing the number of chains requires adding vonnegut nodes and overlapping chains across physical machines or virtual machines requires overlapping the separate vonnegut nodes. 
We'll be utilizing Kubernetes for handling the scheduling and resource utilization optimization for overlapping vonnegut nodes within a cluster of virtual machines. 23 | 24 | ## Cluster Membership 25 | 26 | The vonnegut nodes form a cluster through finding nodes in DNS and [partisan](https://github.com/lasp-lang/partisan) for connecting and failure detection. When a failure is detected by partisan a call back is triggered on each node and the nodes wait to continue replication until the entire chain is healthy again. 27 | 28 | Reads can continue as usual during failure. Unless, of course, the tail is unreachable by the clients, in which case the client requests will simply fail. 29 | 30 | ## Chain Membership 31 | 32 | Chains are manually created and all nodes within a chain are added together to the cluster. No rebalancing is done within the cluster when a new chain is created. Instead, the weighted chain selection will return the new chain for new topics until the chains become balanced. 33 | 34 | The order of a chain is a lexicographical sort of the node names guaranteeing each node sees the same chain structure. Nodes are named `-{0..N-1}` where `N` is the number of nodes in the chain. 35 | 36 | ## Adding Nodes to Existing Chains 37 | 38 | New nodes are added to the end of the chain, Stateful Sets ordered node names ensures this. It is the responsibility of the current tail to promote the new node to the new tail after it has synced all topic log segments from the tail, at which point client requests for reads are redirected to the new tail and the rest of the members of the chain are notified that it is now `active`, making them capable of answering a client request for who is the current `tail`. 39 | 40 | ## Mapping Topics to Chains 41 | 42 | New topics select a chain randomly from the chains with the lowest weight. Weight can take into account numerous metrics of load but for starters will simply be the # topics on the chain. 
43 | 44 | ## Replication 45 | 46 | Chain replication is used for durability. Each write to the `head` is replicated to the next node in the chain, and so on until the `tail` is reached. Writes are acked from the `tail` to the client and on an interval the latest id written to disk is acked to the preceding node in a chain by each node except the `head`. 47 | 48 | ## Handling Failure 49 | 50 | Each chain is a Kubernetes Stateful Set. A Stateful Set provides the ordering of the nodes and replacing a failed node with one of the same name and persistent storage. Thus in the case of partisan detecting a failure the chain will stop attempting to replicate and stop accepting new writes until healthy again. 51 | 52 | Each node acks writes from its predecessor, `N-`, after receiving an ack from their successor, `N+`. Until receiving the `N+` ack all writes are kept in a history. In the event of a failure `N` sends the writes from the history to the new `N+`. 53 | 54 | After an update is written (though possibly not flushed to disk depending on configuration) and sent to the next node in the chain it can continue to receive more updates without having received an `ack` from the next node in the chain. Nodes will periodically send acks with the latest record id they have written. Until an `ack` of an id larger or equal to a record it is kept in the history for possibly resending if a link has failed and it needs to send the writes to the new `N+`. 55 | 56 | **To deal with potentially dropped messages by Erlang's messaging layer we need to rely on the fact record id is always increasing by 1 to force a failure. This failure notifies the predecessor to resend starting at the last in-order record received.** 57 | 58 | The client is responsible for whether it wants to wait for an ack from the tail. It can keep sending writes before an ack arrives if it is ok with potentially losing writes. 
59 | 60 | **What happens if our process restarts and loses this in memory history of records not yet acked? We wouldn't want to have to consider the entire node dead when potentially other topics are fine. Maybe an ets table under the top level supervisor is required for storing this information?** 61 | 62 | **How do we handle a case of all nodes going down and having to resync with each other from what they have on disk?** 63 | 64 | ## Clients 65 | 66 | Clients discover chains through DNS records much the same as chains discover each other. After discovering chains (each individual chain has a unique DNS record named for the chain) the client can query a vonnegut node for information on the location of topics. 67 | 68 | The client caches the information about where the head and tail of chains are and has no need to update this information unless a request fails. In the event a client attempts to read from an old `tail` the node it is requesting from (assuming it is alive and the request doesn't just fail) returns the `tail` of the chain if it knows it, otherwise the client must query the `head` for an updated view of the chain. 69 | 70 | ## DNS 71 | 72 | SRV query `vonnegut` resolves to all chain node records `-{0..N}.vonnegut`. The records `-{0..N}.vonnegut` resolve to individual nodes in the chain named ``. 
73 | 74 | ## References 75 | 76 | * [Chain Replication for Supporting High Throughput and Availability](http://www.cs.cornell.edu/home/rvr/papers/OSDI04.pdf) 77 | * [Chain replication in theory and in practice](http://www.snookles.com/scott/publications/erlang2010-slf.pdf) 78 | * [Kubernetes](http://kubernetes.io/) 79 | * [Kubernetes StatefulSets](http://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/) 80 | * [Kafka Protocol](https://kafka.apache.org/protocol) 81 | -------------------------------------------------------------------------------- /docs/implementation.md: -------------------------------------------------------------------------------- 1 | ## Iteration 0 Implementation 2 | 3 | ### Cluster Manager 4 | 5 | A single global process in the cluster, `vg_cluster_mgr`, is responsible for topic creation and mapping between topics and chains. This manager is to be an abstraction on top of multiple implementations for management, such as riak ensemble based consensus. 6 | 7 | ### ETS Tables 8 | 9 | * `logs_segments_table`: This table is an in memory and index representation of the topic segments found on disk. This allows for a quick ets query to find the segment a specific offset is to be found. The segments index file is then searched to find the exact file position to read from. 10 | 11 | * `high_watermarks_table`: Fetch responses must include the high watermark (highest message offset) for the topics included in the response. This value is tracked by a global ets table mapping topics to high watermarks. This value is updated after a messageset is written, so it does not get updated per message but per set written. 12 | 13 | * `chains_table` 14 | 15 | * `topic_map`: Requests bound for a specific topic must lookup the head or tail node in the chain that is responsible for that specific topic. 
This ets table is responsible for storing this mapping which is updated by querying the `cluster manager` directly or a `vg_client:metadata` or Kafka client metadata request to any node in the vonnegut cluster of chains. 16 | 17 | ### Client Refresh 18 | -------------------------------------------------------------------------------- /helm/README.md: -------------------------------------------------------------------------------- 1 | Running in Minikube 2 | --- 3 | 4 | Requirements: 5 | 6 | * [minikube](https://github.com/kubernetes/minikube) 7 | * [helm](http://helm.sh/) 8 | 9 | After installing (see mac instructions below) and starting [minikube](https://github.com/kubernetes/minikube) set your docker environment to use the docker daemon in the minikube VM: 10 | 11 | ```shell 12 | $ eval $(minikube docker-env) 13 | ``` 14 | 15 | Now when you run the script `bin/docker_build.sh` it will create an image accessible by kubernetes in minikube: 16 | 17 | ```shell 18 | $ bin/docker_build.sh 19 | + rebar3 as prod tar 20 | .... 21 | + mv _build/prod/rel/vonnegut/vonnegut-0.0.1.tar.gz ./ 22 | ++ sed -n 's/vonnegut-\(.*\).tar.gz/\1/p' 23 | ++ ls vonnegut-0.0.1.tar.gz 24 | + VERSION=0.0.1 25 | + mv vonnegut-0.0.1.tar.gz vonnegut.tar.gz 26 | + docker build --rm=false -t us.gcr.io/nucleus-sti/vonnegut:0.0.1 . 27 | Sending build context to Docker daemon 120.2 MB 28 | Step 1 : FROM ubuntu:16.04 29 | .... 30 | Step 7 : ENTRYPOINT /opt/vonnegut/bin/vonnegut 31 | ---> Running in 6471152fa506 32 | ---> 2dc5ff8e5568 33 | Successfully built 2dc5ff8e5568 34 | + docker push us.gcr.io/nucleus-sti/vonnegut:0.0.1 35 | The push refers to a repository [us.gcr.io/nucleus-sti/vonnegut] 36 | 5f1584b0d108: Pushed 37 | .... 
38 | 32d75bc97c41: Layer already exists 39 | 0.0.1: digest: sha256:c354e26c97d74db6f5a22c8f593f28b05c3233f9ab6b3bdefaae1e83f679625e size: 1987 40 | + rm vonnegut.tar.gz 41 | ``` 42 | 43 | Next, use the helm package provided in the repo to create a vonnegut cluster: 44 | 45 | ```shell 46 | $ cd helm 47 | $ helm init 48 | $ helm install vonnegut 49 | NAME: solemn-otter 50 | LAST DEPLOYED: Fri Jan 6 13:21:46 2017 51 | NAMESPACE: default 52 | STATUS: DEPLOYED 53 | 54 | RESOURCES: 55 | ==> v1/ConfigMap 56 | NAME DATA AGE 57 | solemn-otter-vonnegut-config 2 0s 58 | 59 | ==> v1/Service 60 | NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE 61 | vonnegut None 5555/TCP 0s 62 | 63 | ==> apps/StatefulSet 64 | NAME DESIRED CURRENT AGE 65 | chain1 2 1 0s 66 | ``` 67 | 68 | After a few seconds both pods should come up and form a cluster: 69 | 70 | ```shell 71 | $ kubectl get statefulset 72 | NAME DESIRED CURRENT AGE 73 | chain1 2 2 25m 74 | $ kubectl get pods 75 | NAME READY STATUS RESTARTS AGE 76 | chain1-0 1/1 Running 0 26m 77 | chain1-1 1/1 Running 0 25m 78 | ``` 79 | 80 | #### Mac installation notes 81 | 82 | Install minikube via the commands in the instructions and helm via `brew install kubernetes-helm`, and go (via `brew install go`) if you don't have it installed. If you have to install go, remember to set up your `GOPATH`. 
83 | 84 | Then the following incantation will build the xhyve driver to work with Docker for Mac: 85 | 86 | ```shell 87 | brew install xhyve 88 | export GO15VENDOREXPERIMENT=1 89 | go get -u -d github.com/zchee/docker-machine-driver-xhyve 90 | cd $GOPATH/src/github.com/zchee/docker-machine-driver-xhyve 91 | make install # this will prompt for your password 92 | sudo chown root:wheel /usr/local/bin/docker-machine-driver-xhyve 93 | sudo chmod u+s /usr/local/bin/docker-machine-driver-xhyve 94 | minikube start --vm-driver=xhyve --container-runtime=docker --show-libmachine-logs --v=10 --alsologtostderr 95 | eval $(minikube docker-env) 96 | bin/docker_build.sh 97 | ``` 98 | 99 | Note that this does not currently work because of a version mismatch between the version of docker bundled with minikube and the one in Docker for Mac, but I'm preserving these instructions in the hopes that it works one day. 100 | -------------------------------------------------------------------------------- /helm/storage-class.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1beta1 3 | metadata: 4 | name: slow 5 | provisioner: kubernetes.io/gce-pd 6 | parameters: 7 | type: pd-standard 8 | -------------------------------------------------------------------------------- /helm/vonnegut/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | -------------------------------------------------------------------------------- /helm/vonnegut/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | description: A Helm chart for Kubernetes 3 | name: vonnegut 4 | version: 0.1.0 5 | -------------------------------------------------------------------------------- /helm/vonnegut/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "fullname" . }}" -o jsonpath="{.items[0].metadata.name}") 3 | echo "Visit http://127.0.0.1:8080 to use your application" 4 | kubectl port-forward $POD_NAME 8080:{{ .Values.service.externalPort }} 5 | -------------------------------------------------------------------------------- /helm/vonnegut/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Create a default fully qualified app name. 3 | We truncate at 24 chars because some Kubernetes name fields are limited to this 4 | (by the DNS naming spec). 
5 | */}} 6 | {{define "fullname"}} 7 | {{- $name := default "vonnegut" .Values.nameOverride -}} 8 | {{printf "%s-%s" .Release.Name $name | trunc 24 -}} 9 | {{end}} 10 | -------------------------------------------------------------------------------- /helm/vonnegut/templates/config_map.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: "{{ .Release.Name }}-vonnegut-config" 5 | data: 6 | vonnegut.discovery_domain: "_partisan._tcp.{{ .Values.service.name }}.default.svc.cluster.local" 7 | vonnegut.replicas: "{{ .Values.replicaCount }}" 8 | -------------------------------------------------------------------------------- /helm/vonnegut/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: "{{ .Values.service.name }}" 5 | labels: 6 | chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" 7 | app: {{ template "fullname" . }} 8 | type: service 9 | spec: 10 | clusterIP: None 11 | ports: 12 | - port: {{ .Values.service.externalPort }} 13 | targetPort: {{ .Values.service.internalPort }} 14 | protocol: TCP 15 | name: data 16 | - port: {{ .Values.service.partisanPort }} 17 | targetPort: {{ .Values.service.partisanPort }} 18 | protocol: TCP 19 | name: partisan 20 | selector: 21 | app: {{ template "fullname" . }} 22 | type: node 23 | -------------------------------------------------------------------------------- /helm/vonnegut/templates/stateful-set.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: StatefulSet 3 | metadata: 4 | name: {{ .Values.chain.name }} 5 | labels: 6 | chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" 7 | app: {{ template "fullname" . 
}} 8 | type: statefulset 9 | spec: 10 | serviceName: "{{ .Values.service.name }}" 11 | replicas: {{ .Values.replicaCount }} 12 | template: 13 | metadata: 14 | labels: 15 | app: {{ template "fullname" . }} 16 | type: node 17 | spec: 18 | terminationGracePeriodSeconds: 10 19 | containers: 20 | - name: {{ .Chart.Name }} 21 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 22 | imagePullPolicy: {{ .Values.image.pullPolicy }} 23 | 24 | command: ["/opt/vonnegut/bin/vonnegut", "foreground"] 25 | 26 | ports: 27 | - containerPort: {{ .Values.service.internalPort }} 28 | name: data 29 | protocol: TCP 30 | - containerPort: {{ .Values.service.partisanPort }} 31 | name: partisan 32 | protocol: TCP 33 | volumeMounts: 34 | - name: data 35 | mountPath: /opt/vonnegut/data 36 | env: 37 | - name: DISCOVERY_DOMAIN 38 | valueFrom: 39 | configMapKeyRef: 40 | name: "{{ .Release.Name }}-vonnegut-config" 41 | key: vonnegut.discovery_domain 42 | - name: REPLICAS 43 | valueFrom: 44 | configMapKeyRef: 45 | name: "{{ .Release.Name }}-vonnegut-config" 46 | key: vonnegut.replicas 47 | - name: CHAIN_NAME 48 | value: {{ .Values.chain.name }} 49 | 50 | volumeClaimTemplates: 51 | - metadata: 52 | name: data 53 | annotations: 54 | volume.alpha.kubernetes.io/storage-class: hostpath 55 | spec: 56 | accessModes: [ "ReadWriteOnce" ] 57 | resources: 58 | requests: 59 | storage: 256Mi 60 | -------------------------------------------------------------------------------- /helm/vonnegut/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for vonnegut. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | replicaCount: 2 5 | image: 6 | repository: us.gcr.io/nucleus-sti/vonnegut 7 | tag: 0.0.1 8 | pullPolicy: IfNotPresent 9 | service: 10 | name: vonnegut 11 | externalPort: 5555 12 | internalPort: 5555 13 | partisanPort: 10200 14 | chain: 15 | name: chain1 16 | resources: 17 | limits: 18 | cpu: 100m 19 | memory: 128Mi 20 | requests: 21 | cpu: 100m 22 | memory: 128Mi 23 | -------------------------------------------------------------------------------- /include/vg.hrl: -------------------------------------------------------------------------------- 1 | -define(CLIENT_ID, "vg_client"). 2 | -define(DEFAULT_PORT, 5588). 3 | -define(MAX_REQUEST_ID, 2147483647). 4 | 5 | -define(MAGIC_TWO, 2). 6 | -define(API_VERSION, 2). 7 | 8 | %% a recordbatch starts with FirstOffset:64, Length:32 9 | %% so 12 is an often used constant when reading batches 10 | -define(OFFSET_AND_LENGTH_BYTES, 12). 11 | 12 | -define(INDEX_ENTRY_SIZE, 8). % bytes 13 | -define(INDEX_OFFSET_BITS, 32). 14 | -define(INDEX_POS_BITS, 32). 15 | 16 | -define(PRODUCE_REQUEST, 0). 17 | -define(FETCH_REQUEST, 1). 18 | -define(METADATA_REQUEST, 3). 19 | 20 | -define(COMPRESS_NONE, 0). 21 | -define(COMPRESS_GZIP, 1). 22 | -define(COMPRESS_SNAPPY, 2). 23 | -define(COMPRESS_LZ4, 3). 24 | 25 | -define(COMPRESSION_MASK, 7). 26 | -define(COMPRESSION(Attr), ?COMPRESSION_MASK band Attr). 27 | 28 | %% non-kafka extension 29 | -define(TOPICS_REQUEST, 1000). 30 | -define(FETCH2_REQUEST, 1001). 31 | -define(ENSURE_REQUEST, 1002). 32 | -define(REPLICATE_REQUEST, 1003). 33 | -define(DELETE_TOPIC_REQUEST, 1004). 34 | -define(REPLICATE_DELETE_TOPIC_REQUEST, 1005). 35 | 36 | -define(UNKNOWN_ERROR, -1). 37 | -define(NO_ERROR, 0). 38 | -define(UNKNOWN_TOPIC_OR_PARTITION, 3). 39 | -define(NOT_LEADER_ERROR, 6). % reusing this to mean topic map has chaned 40 | -define(TIMEOUT_ERROR, 7). 41 | 42 | %% non-kafka extensions 43 | -define(FETCH_DISALLOWED_ERROR, 129). 44 | -define(PRODUCE_DISALLOWED_ERROR, 131). 
45 | -define(WRITE_REPAIR, 133). 46 | -define(REPLICATE_DISALLOWED_ERROR, 135). 47 | 48 | -define(SEGMENTS_TABLE, logs_segments_table). 49 | -define(WATERMARK_TABLE, high_watermarks_table). 50 | -define(CHAINS_TABLE, chains_table). 51 | 52 | -define(topic_map, topic_map). 53 | 54 | -record(chain, { 55 | name :: binary() | atom(), 56 | nodes :: [atom()] | undefined, 57 | topics_start :: binary() | start_space | undefined, % undef required because there's no way 58 | topics_end :: binary() | end_space | undefined, % to encode these in metadata :\ 59 | head :: {inet:ip_address() | inet:hostname(), inet:port_number()}, 60 | tail :: {inet:ip_address() | inet:hostname(), inet:port_number()} 61 | }). 62 | -type chain() :: #chain{}. 63 | 64 | -ifdef('OTP_RELEASE'). 65 | -define(WITH_STACKTRACE(T, R, S), T:R:S ->). 66 | -else. 67 | -define(WITH_STACKTRACE(T, R, S), T:R -> S = erlang:get_stacktrace(),). 68 | -endif. 69 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | %% -*- erlang -*- 2 | {erl_opts, 3 | [debug_info, 4 | warn_untyped_records, 5 | warnings_as_errors, 6 | nowarn_export_all, 7 | {parse_transform, lager_transform}]}. 8 | 9 | {xref_checks,[undefined_function_calls,undefined_functions,locals_not_used, 10 | deprecated_function_calls, 11 | deprecated_functions]}. 12 | %% ignore these warnings because lz4 and snappyer are optional dependencies 13 | {xref_ignores, [{lz4, unpack, 1}, 14 | {snappyer, decompress, 1}]}. 
15 | 16 | {deps, [erlware_commons, 17 | acceptor_pool, 18 | {shackle, {git, "https://github.com/lpgauth/shackle.git", {branch, "master"}}}, 19 | {partisan, {git, "https://github.com/lasp-lang/partisan.git", {branch, "master"}}}, 20 | lager, 21 | recon, 22 | gproc, 23 | backoff, 24 | oc_google_reporter, 25 | {opencensus, {git, "https://github.com/census-instrumentation/opencensus-erlang.git", {branch, "master"}}}, 26 | 27 | %% metrics and health check deps 28 | elli, 29 | prometheus, 30 | elli_prometheus]}. 31 | 32 | {relx, [{release, {vonnegut, "semver"}, 33 | [vonnegut]}, 34 | 35 | {dev_mode, true}, 36 | {include_erts, false}, 37 | 38 | {sys_config, "config/sys.config"}, 39 | {vm_args, "config/vm.args"}, 40 | 41 | {extended_start_script, true}, 42 | 43 | {overlay, [{copy, "config/shared.config", "config/shared.config"}]}]}. 44 | 45 | {project_plugins, [{rebar3_proper, {git, "https://github.com/ferd/rebar3_proper.git", {branch, "master"}}}, 46 | covertool]}. 47 | 48 | {cover_enabled, true}. 49 | {cover_opts, [verbose]}. 50 | {cover_export_enabled, true}. 51 | 52 | {covertool, [{coverdata_files, ["ct.coverdata"]}]}. 53 | 54 | {profiles, 55 | [{test, [ 56 | {deps, [{brod, "3.0.0"}, meck, {proper, "1.2.0"}]} 57 | ]}, 58 | {prod, [{relx, [{sys_config, "config/prod_sys.config"}, 59 | {vm_args, "config/prod_vm.args"}, 60 | {dev_mode, false}, 61 | {include_erts, true}, 62 | {include_src, false}, 63 | {debug_info, strip} 64 | ]}]} 65 | ]}. 66 | 67 | {proper_opts, [{sys_config, "config/proper.config"}]}. 68 | {ct_opts, [{sys_config, "config/test.config"}, 69 | {ct_hooks, [cth_surefire]}]}. 70 | {dist_node, [{name, 'testrunner@127.0.0.1'}]}. 
71 | -------------------------------------------------------------------------------- /rebar.lock: -------------------------------------------------------------------------------- 1 | {"1.1.0", 2 | [{<<"accept">>,{pkg,<<"accept">>,<<"0.3.0">>},1}, 3 | {<<"acceptor_pool">>,{pkg,<<"acceptor_pool">>,<<"1.0.0-rc.0">>},0}, 4 | {<<"augle">>,{pkg,<<"augle">>,<<"0.3.0">>},1}, 5 | {<<"backoff">>, 6 | {git,"https://github.com/evanmcc/backoff.git", 7 | {ref,"13f23b9ebb3604a4322e3a96d0633a75015c792d"}}, 8 | 0}, 9 | {<<"certifi">>,{pkg,<<"certifi">>,<<"2.0.0">>},2}, 10 | {<<"cf">>,{pkg,<<"cf">>,<<"0.3.1">>},1}, 11 | {<<"counters">>,{pkg,<<"counters">>,<<"0.2.0">>},1}, 12 | {<<"ctx">>,{pkg,<<"ctx">>,<<"0.4.1">>},1}, 13 | {<<"elli">>,{pkg,<<"elli">>,<<"3.0.0">>},0}, 14 | {<<"elli_prometheus">>,{pkg,<<"elli_prometheus">>,<<"0.1.1">>},0}, 15 | {<<"erlware_commons">>,{pkg,<<"erlware_commons">>,<<"1.2.0">>},0}, 16 | {<<"foil">>,{pkg,<<"foil">>,<<"0.1.1">>},1}, 17 | {<<"goldrush">>,{pkg,<<"goldrush">>,<<"0.1.9">>},1}, 18 | {<<"gproc">>,{pkg,<<"gproc">>,<<"0.6.1">>},0}, 19 | {<<"granderl">>,{pkg,<<"granderl">>,<<"0.1.5">>},1}, 20 | {<<"hackney">>,{pkg,<<"hackney">>,<<"1.9.0">>},1}, 21 | {<<"idna">>,{pkg,<<"idna">>,<<"5.1.0">>},2}, 22 | {<<"jsx">>,{pkg,<<"jsx">>,<<"2.8.2">>},1}, 23 | {<<"lager">>,{pkg,<<"lager">>,<<"3.6.1">>},0}, 24 | {<<"metal">>,{pkg,<<"metal">>,<<"0.1.1">>},1}, 25 | {<<"metrics">>,{pkg,<<"metrics">>,<<"1.0.1">>},2}, 26 | {<<"mimerl">>,{pkg,<<"mimerl">>,<<"1.0.2">>},2}, 27 | {<<"oc_google_reporter">>, 28 | {git,"https://github.com/tsloughter/oc_google_reporter.git", 29 | {ref,"4ebd9918fecca28cd3629f0524dc811b2b3653aa"}}, 30 | 0}, 31 | {<<"opencensus">>, 32 | {git,"https://github.com/census-instrumentation/opencensus-erlang.git", 33 | {ref,"2096818702a242102dcfb8dd18010a4bb0cc17df"}}, 34 | 0}, 35 | {<<"partisan">>, 36 | {git,"https://github.com/lasp-lang/partisan.git", 37 | {ref,"8e2a6508ed958fd1dd0b4926a35e6e96b441b17d"}}, 38 | 0}, 39 | 
{<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.1.0">>},0}, 40 | {<<"quickrand">>,{pkg,<<"quickrand">>,<<"1.7.3">>},2}, 41 | {<<"rand_compat">>,{pkg,<<"rand_compat">>,<<"0.0.3">>},1}, 42 | {<<"recon">>,{pkg,<<"recon">>,<<"2.3.4">>},0}, 43 | {<<"rfc3339">>,{pkg,<<"rfc3339">>,<<"0.9.0">>},2}, 44 | {<<"shackle">>, 45 | {git,"https://github.com/lpgauth/shackle.git", 46 | {ref,"58f4adb067e677512b3a2af43c445734a4aeec64"}}, 47 | 0}, 48 | {<<"ssl_verify_fun">>,{pkg,<<"ssl_verify_fun">>,<<"1.1.1">>},2}, 49 | {<<"time_compat">>,{pkg,<<"time_compat">>,<<"0.0.1">>},1}, 50 | {<<"types">>,{pkg,<<"types">>,<<"0.1.6">>},1}, 51 | {<<"unicode_util_compat">>,{pkg,<<"unicode_util_compat">>,<<"0.3.1">>},3}, 52 | {<<"uuid">>,{pkg,<<"uuid_erl">>,<<"1.7.3">>},1}, 53 | {<<"wts">>,{pkg,<<"wts">>,<<"0.3.0">>},1}]}. 54 | [ 55 | {pkg_hash,[ 56 | {<<"accept">>, <<"2505B60BCB992CA79BD03AB7B8FEC8A520A47D9730F286DF1A479CC98B03F94B">>}, 57 | {<<"acceptor_pool">>, <<"679D741DF87FC13599B1AEF2DF8F78F1F880449A6BEFAB7C44FB6FAE0E92A2DE">>}, 58 | {<<"augle">>, <<"25633E47BB163ECB74EB34628DB49B622FEC659137C2A3250FA5430965DF272B">>}, 59 | {<<"certifi">>, <<"A0C0E475107135F76B8C1D5BC7EFB33CD3815CB3CF3DEA7AEFDD174DABEAD064">>}, 60 | {<<"cf">>, <<"5CB902239476E141EA70A740340233782D363A31EEA8AD37049561542E6CD641">>}, 61 | {<<"counters">>, <<"EF00F33404FDD9BD233F9B7966233222469E4560DBE1C712EA2E1AB63BB8FEFD">>}, 62 | {<<"ctx">>, <<"E4297DD25CCDE992BC7DE298F514BEACD0A44FAA9126A1F2567306D94C519A13">>}, 63 | {<<"elli">>, <<"D7CC24CFA886AC6A51D369B2C974392BEC9CD1E1CAA3931194D2BF52B763D82F">>}, 64 | {<<"elli_prometheus">>, <<"FF41EA8D88D1EBD1CD7A6D43FCC02B33B47FF20272C097B9D3A3CCCD79980C05">>}, 65 | {<<"erlware_commons">>, <<"2BAB99CF88941145767A502F1209886F1F0D31695EEF21978A30F15E645721E0">>}, 66 | {<<"foil">>, <<"4D07B62C114636BBC3EEBD5CEE04B23A7AAB1262B0F68AA79005A6FBC3790472">>}, 67 | {<<"goldrush">>, <<"F06E5D5F1277DA5C413E84D5A2924174182FB108DABB39D5EC548B27424CD106">>}, 68 | {<<"gproc">>, 
<<"4579663E5677970758A05D8F65D13C3E9814EC707AD51D8DCEF7294EDA1A730C">>}, 69 | {<<"granderl">>, <<"F20077A68BD80B8D8783BD15A052813C6483771DEC1A5B837D307CBE92F14122">>}, 70 | {<<"hackney">>, <<"51C506AFC0A365868469DCFC79A9D0B94D896EC741CFD5BD338F49A5EC515BFE">>}, 71 | {<<"idna">>, <<"D72B4EFFEB324AD5DA3CAB1767CB16B17939004E789D8C0AD5B70F3CEA20C89A">>}, 72 | {<<"jsx">>, <<"7ACC7D785B5ABE8A6E9ADBDE926A24E481F29956DD8B4DF49E3E4E7BCC92A018">>}, 73 | {<<"lager">>, <<"9D29C5FF7F926D25ECD9899990867C9152DCF34EEE65BAC8EC0DFC0D16A26E0C">>}, 74 | {<<"metal">>, <<"5D3D1322DA7BCD34B94FED5486F577973685298883954F7A3E517EF5EF6953F5">>}, 75 | {<<"metrics">>, <<"25F094DEA2CDA98213CECC3AEFF09E940299D950904393B2A29D191C346A8486">>}, 76 | {<<"mimerl">>, <<"993F9B0E084083405ED8252B99460C4F0563E41729AB42D9074FD5E52439BE88">>}, 77 | {<<"prometheus">>, <<"3BB851DF031C204D1C94BF55FFF2ECC9AB834F0236E64C080C9D5945B48D428D">>}, 78 | {<<"quickrand">>, <<"0E4FB48FAC904FE0C6E21D7E8C31A288A0700E1E81A35B38B649FC119079755D">>}, 79 | {<<"rand_compat">>, <<"011646BC1F0B0C432FE101B816F25B9BBB74A085713CEE1DAFD2D62E9415EAD3">>}, 80 | {<<"recon">>, <<"B406C2FCCDEAA0D94E23B5E30AE3D635A2D461E363A5C9C6316897037CF050D2">>}, 81 | {<<"rfc3339">>, <<"2075653DC9407541C84B1E15F8BDA2ABE95FB17C9694025E079583F2D19C1060">>}, 82 | {<<"ssl_verify_fun">>, <<"28A4D65B7F59893BC2C7DE786DEC1E1555BD742D336043FE644AE956C3497FBE">>}, 83 | {<<"time_compat">>, <<"23FE0AD1FDF3B5B88821B2D04B4B5E865BF587AE66056D671FE0F53514ED8139">>}, 84 | {<<"types">>, <<"03BB7140016C896D3441A77CB0B7D6ACAA583D6D6E9C4A3E1FD3C25123710290">>}, 85 | {<<"unicode_util_compat">>, <<"A1F612A7B512638634A603C8F401892AFBF99B8CE93A45041F8AACA99CADB85E">>}, 86 | {<<"uuid">>, <<"C5DF97D1A3D626235C2415E74053C47B2138BB863C5CD802AB5CAECB8ECC019F">>}, 87 | {<<"wts">>, <<"5CDF22C775CB1EBAE24C326A5DB6074D753C42F4BD12A9AA47CC62D3E2C71AD1">>}]} 88 | ]. 
89 | -------------------------------------------------------------------------------- /src/vg.erl: -------------------------------------------------------------------------------- 1 | -module(vg). 2 | 3 | %% client interface 4 | -export([ensure_topic/1, 5 | write_record_batch/3, 6 | write/3, write/4, 7 | fetch/1, fetch/2, fetch/4, 8 | fetch/5]). 9 | 10 | %% ops interface. 11 | -export([ 12 | create_topic/1, 13 | delete_topic/1, 14 | describe_topic/1, 15 | deactivate_topic/1, 16 | regenerate_topic_index/1, 17 | tail_topic/1, tail_topic/2, 18 | running_topics/0 19 | ]). 20 | 21 | -include("vg.hrl"). 22 | 23 | -type topic() :: binary(). 24 | 25 | -type record() :: #{offset => integer(), 26 | timestamp => integer(), 27 | key => binary(), 28 | value := binary(), 29 | headers => [{unicode:characters_binary(), binary()}]}. 30 | 31 | -type record_batch() :: #{crc := integer(), 32 | producer_id => integer(), 33 | producer_epoch => integer(), 34 | sequence_number => integer(), 35 | records := [record()]}. 36 | 37 | -export_types([topic/0, 38 | record/0, 39 | record_batch/0]). 40 | 41 | -spec create_topic(Topic :: topic()) -> ok. 42 | create_topic(Topic) -> 43 | case validate_topic(Topic) of 44 | ok -> 45 | {ok, _Chain} = vg_cluster_mgr:create_topic(Topic), 46 | ok; 47 | {error, Reason} -> 48 | {error, Reason} 49 | end. 50 | 51 | -spec ensure_topic(Topic :: topic()) -> ok. 52 | ensure_topic(Topic) -> 53 | case validate_topic(Topic) of 54 | ok -> 55 | {ok, _Chain} = vg_cluster_mgr:ensure_topic(Topic), 56 | ok; 57 | {error, Reason} -> 58 | {error, Reason} 59 | end. 
60 | 61 | validate_topic(B) when is_binary(B) -> 62 | Disallowed = 63 | [ 64 | <<0>>, 65 | <<"/">>, % path separators 66 | <<"\\">>, 67 | <<"*">>, 68 | <<".">>, <<"..">>, 69 | <<"[">>, <<"]">>, 70 | <<"(">>, <<")">>, 71 | <<"{">>, <<"}">> 72 | ], 73 | case binary:match(B, Disallowed) of 74 | nomatch -> 75 | ok; 76 | _ -> 77 | {error, invalid_characters} 78 | end; 79 | validate_topic(_) -> 80 | {error, non_binary_topic}. 81 | 82 | -spec write_record_batch(Topic, Partition, RecordBatch) -> {ok, integer()} | {error, any()} when 83 | Topic :: topic(), 84 | Partition :: non_neg_integer(), 85 | RecordBatch :: vg:record_batch(). 86 | write_record_batch(Topic, Partition, RecordBatch) -> 87 | vg_active_segment:write(Topic, Partition, RecordBatch). 88 | 89 | -spec write(Topic, Partition, Records) -> ok | {error, any()} when 90 | Topic :: topic(), 91 | Partition :: non_neg_integer(), 92 | Records :: binary() | [binary()]. 93 | write(Topic, Partition, Records) -> 94 | RecordBatch = vg_protocol:encode_record_batch(Records), 95 | vg_active_segment:write(Topic, Partition, RecordBatch). 96 | 97 | write(Topic, Partition, ExpectedId, RecordBatch) -> 98 | vg_active_segment:write(Topic, Partition, ExpectedId, RecordBatch). 99 | 100 | fetch(Topic) -> 101 | fetch(Topic, 0). 102 | 103 | -spec fetch(Topic, Offset) -> {ok, RecordBatches} when 104 | Topic :: topic(), 105 | Offset :: integer(), 106 | RecordBatches :: #{high_water_mark := integer(), 107 | partition := 0, 108 | record_batches := [vg:record_batch()]}. 109 | fetch(Topic, Offset) -> 110 | fetch(Topic, 0, Offset, -1). 
%% Fetch up to Count records from Offset: resolve the byte range via
%% fetch/5, then read and decode that slice of the segment file.
fetch(Topic, Partition, Offset, Count) ->
    {_, _, {File, Position, Bytes}} = fetch(Topic, Partition, Offset, 0, Count),
    {ok, Fd} = file:open(File, [read, binary, raw]),
    try
        {ok, [Data]} = file:pread(Fd, [{Position, Bytes}]),
        {ok, #{high_water_mark => vg_topics:lookup_hwm(Topic, Partition),
               partition => Partition,
               record_batches => vg_protocol:decode_record_batches(Data)}}
    after
        file:close(Fd)
    end.

%% fetch/5 is a special form that only returns sizes and positions for
%% later framing and sending

%% A fetch of offset -1 returns Limit number of the records up to the
%% high watermark
fetch(Topic, Partition, -1, MaxBytes, Limit) ->
    HWM = vg_topics:lookup_hwm(Topic, Partition),
    fetch(Topic, Partition, erlang:max(0, HWM - Limit + 1), MaxBytes, Limit);
fetch(Topic, Partition, Offset, MaxBytes, Limit) ->
    {SegmentId, {Position, _}} = vg_log_segments:find_segment_offset(Topic, Partition, Offset),
    File = vg_utils:log_file(Topic, Partition, SegmentId),
    SendBytes =
        case Limit of
            -1 ->
                %% no limit: everything from Position to end of segment
                filelib:file_size(File) - Position;
            _ ->
                LastOffset = Offset + Limit,
                case vg_log_segments:find_log_segment(Topic, Partition, LastOffset) of
                    %% lastoffset is on the same segment, so limit fetch to lastoffset position
                    SegmentId ->
                        {EndPosition, EndSize} =
                            vg_log_segments:find_record_offset(Topic, Partition, SegmentId, LastOffset),
                        case EndPosition of
                            Position ->
                                %% in the same RecordBatch
                                EndSize;
                            _ ->
                                (EndPosition + EndSize) - Position
                        end;
                    %% some higher segment, so send this whole segment
                    _ ->
                        filelib:file_size(File) - Position
                end
        end,

    lager:info("at=fetch_request topic=~s partition=~p offset=~p segment_id=~p position=~p",
               [Topic, Partition, Offset, SegmentId, Position]),

    %% MaxBytes of 0 means "no cap"
    Bytes =
        case MaxBytes of
            0 -> SendBytes;
            _ -> min(SendBytes, MaxBytes)
        end,
    ErrorCode = 0,
    HWM = vg_topics:lookup_hwm(Topic, Partition),
    Response = vg_protocol:encode_fetch_topic_response(Partition, ErrorCode, HWM, Bytes),
    lager:debug("sending hwm=~p bytes=~p", [HWM, Bytes]),
    %% {total framed size, response header iolist, file slice to send}
    {erlang:iolist_size(Response)+Bytes, Response, {File, Position, Bytes}}.

%% these are here mostly for ergonomics. right now they just forward
%% the work to the cluster manager, but we might need to change that
%% later and this allows us to keep a easy to type interface that
%% doesn't have to change.
delete_topic(Topic) ->
    vg_cluster_mgr:delete_topic(Topic).

describe_topic(Topic) ->
    vg_cluster_mgr:describe_topic(Topic).

deactivate_topic(Topic) ->
    vg_cluster_mgr:deactivate_topic(Topic).

%% there's a debate here to be had about doing this all at once vs. a
%% per segment approach. wrt to format changes (which should be
%% ultra-rare), this is the right thing, but wrt index corruption
%% (which should also be super rare?), we might want the fine control
%% of regenerating a particular segment's index alone.
regenerate_topic_index(Topic) ->
    vg_topic_mgr:regenerate_index(Topic, 0).

%% Tail with default options (10 records / 30 seconds).
tail_topic(Topic) ->
    tail_topic(Topic, #{}).

-spec tail_topic(binary(), Opts) -> ok when
      Opts :: #{records => pos_integer(), % default 10 records
                time => pos_integer()}.   % default 30 seconds
%% Spawn a bounded-heap printer process and register it with the active
%% segment so new writes are echoed to it.
tail_topic(Topic, Opts) ->
    Printer = erlang:spawn_opt(fun() -> tail_printer(Topic, Opts) end,
                               [{max_heap_size, 1024 * 1024}]),
    vg_active_segment:tail(Topic, 0, Printer).

%% this is shaping up to be quite expensive and could block lazy
%% starts of deactivated topics. use in production with caution.
running_topics() ->
    vg_cluster_mgr:running_topics().
%% Loop run by the process spawned in tail_topic/2: prints up to
%% `records` terms received as {'$print', Term}, giving up once
%% `time` milliseconds have elapsed.
tail_printer(Topic, Opts) ->
    Records = maps:get(records, Opts, 10),
    Time = maps:get(time, Opts, timer:seconds(30)),
    EndTime = erlang:monotonic_time(milli_seconds) + Time,
    F = fun Loop(0, _End) ->
                io:format("printed ~p records, terminating~n", [Records]);
            Loop(R, End) ->
                %% remaining budget; also used as the receive timeout
                Left = End - erlang:monotonic_time(milli_seconds),
                case Left > 0 of
                    true ->
                        receive
                            {'$print', Term} ->
                                io:format("~p: ~p~n", [Topic, Term]),
                                Loop(R - 1, End)
                        after Left ->
                                io:format("tail session timed out~n")
                        end;
                    false ->
                        io:format("tail session timed out~n")
                end
        end,
    F(Records, EndTime).

%% -------------------------------------------------------------------------
%% /src/vg_active_segment.erl
%% -------------------------------------------------------------------------

%%
-module(vg_active_segment).

-behaviour(gen_statem).

-export([start_link/3,
         write/3,
         write/4,
         halt/2,
         tail/3,
         where/2,
         stop_indexing/2,
         resume_indexing/2]).

-export([init/1,
         callback_mode/0,
         active/3,
         halted/3,
         handle_event/3,
         terminate/3]).

-include("vg.hrl").

%% static settings read from the application environment (setup_config/0)
-record(config, {log_dir :: file:filename(),
                 segment_bytes :: integer(),
                 index_max_bytes :: integer(),
                 index_interval_bytes :: integer()}).

%% gen_statem data for the active (writable) segment of a topic/partition
-record(data, {topic_dir :: file:filename(),      % directory holding segment + index files
               next_id :: integer(),              % next record offset to assign
               next_brick :: atom(),              % next node in the chain, or solo/tail
               byte_count :: integer(),           % bytes written since last index entry
               pos :: integer(),                  % current write position in the log file
               index_pos :: integer(),            % current write position in the index file
               log_fd :: file:fd(),
               segment_id :: integer(),           % base offset of the current segment
               index_fd :: file:fd() | undefined, % undefined while indexing is stopped
               topic :: binary(),
               partition :: integer(),
               config :: #config{},
               halted = false :: boolean(),
               index = true :: boolean(),         % indexing enabled?
               tailer :: pid() | undefined,       % optional tail-printer process
               terminate_after :: integer(),      % idle ms before the statem stops itself
               timer_ref :: reference()           % idle-termination timer
              }).
%% need this until an Erlang release with `hibernate_after` spec added to gen option type
-dialyzer({nowarn_function, start_link/3}).

%% gproc via-tuple naming the active segment process for Topic/Partition
-define(ACTIVE_SEG(Topic, Partition), {via, gproc, {n, l, {active, Topic, Partition}}}).

%% Start (or return the already-running) active segment statem for
%% Topic/Partition. NextBrick is the next node in the replication chain.
start_link(Topic, Partition, NextBrick) ->
    HibernateAfter = application:get_env(vonnegut, hibernate_after, timer:minutes(1)),
    case gen_statem:start_link(?ACTIVE_SEG(Topic, Partition), ?MODULE, [Topic, Partition, NextBrick],
                               [{hibernate_after, HibernateAfter}]) of % hibernate after the configured idle time (default 1 minute)
        {ok, Pid} ->
            {ok, Pid};
        {error, {already_started, Pid}} ->
            {ok, Pid};
        {error, Reason} ->
            {error, Reason}
    end.

-spec write(Topic, Partition, RecordBatch) -> {ok, Offset} | {error, any()} when
      Topic :: binary(),
      Partition :: integer(),
      RecordBatch :: vg:record_batch() | [vg:record_batch()],
      Offset :: integer().
%% Head write: the segment assigns the offset itself.
write(Topic, Partition, RecordBatch) ->
    write(Topic, Partition, head, RecordBatch).

%% NOTE(review): only a singleton list is unwrapped here; a list with
%% more than one element falls through to the second clause unchanged --
%% confirm callers never pass multi-element batch lists.
write(Topic, Partition, ExpectedId, [RecordBatch]) ->
    write_(Topic, Partition, ExpectedId, RecordBatch);
write(Topic, Partition, ExpectedId, RecordBatch) ->
    write_(Topic, Partition, ExpectedId, RecordBatch).

%% Call the active segment; create the topic and retry on noproc/badarg
%% (process or gproc entry missing), retry on `retry`, and surface
%% call timeouts as {error, timeout}.
write_(Topic, Partition, ExpectedId, RecordBatch) ->
    try
        case gen_statem:call(?ACTIVE_SEG(Topic, Partition), {write, ExpectedId, RecordBatch}) of
            retry ->
                write_(Topic, Partition, ExpectedId, RecordBatch);
            R -> R
        end
    catch _:{noproc, _} ->
            create_retry(Topic, Partition, ExpectedId, RecordBatch);
          error:badarg -> %% is this too broad? how to restrict?
            create_retry(Topic, Partition, ExpectedId, RecordBatch);
          exit:{timeout, _} ->
            {error, timeout}
    end.
%% Topic does not exist yet: ask the cluster manager to create it, then
%% retry the write (crashes if creation fails -- let it crash).
create_retry(Topic, Partition, ExpectedId, RecordBatch)->
    lager:warning("write to nonexistent topic '~s', creating", [Topic]),
    {ok, _} = vg_cluster_mgr:ensure_topic(Topic),
    write_(Topic, Partition, ExpectedId, RecordBatch).

%% Stop accepting writes (used while deleting the topic).
halt(Topic, Partition) ->
    gen_statem:call(?ACTIVE_SEG(Topic, Partition), halt).

%% Register Printer to receive {'$print', ...} messages for new writes.
tail(Topic, Partition, Printer) ->
    gen_statem:call(?ACTIVE_SEG(Topic, Partition), {tail, Printer}).

%% Pid of the active segment process, or undefined if not running.
where(Topic, Partition) ->
    {_, _, Where} = ?ACTIVE_SEG(Topic, Partition),
    gproc:where(Where).

stop_indexing(Topic, Partition) ->
    gen_statem:call(?ACTIVE_SEG(Topic, Partition), stop_indexing).

resume_indexing(Topic, Partition) ->
    gen_statem:call(?ACTIVE_SEG(Topic, Partition), resume_indexing).

%%%%%%%%%%%%

init([Topic, Partition, NextNode]) ->
    lager:info("at=init topic=~p next_server=~p", [Topic, NextNode]),
    Config = setup_config(),
    %% assertion: only partition 0 is currently supported
    Partition = 0,
    LogDir = Config#config.log_dir,
    TerminateAfter = application:get_env(vonnegut, terminate_after, timer:minutes(5)),
    TopicDir = filename:join(LogDir, [binary_to_list(Topic), "-", integer_to_list(Partition)]),
    filelib:ensure_dir(filename:join(TopicDir, "ensure")),

    vg_log_segments:load_all(Topic, Partition),

    {Id, LatestIndex, LatestLog} = vg_log_segments:find_latest_id(TopicDir, Topic, Partition),
    LastLogId = filename:basename(LatestLog, ".log"),
    {ok, LogFD} = vg_utils:open_append(LatestLog),
    {ok, IndexFD} = vg_utils:open_append(LatestIndex),

    %% resume appending where the files left off
    {ok, Position} = file:position(LogFD, eof),
    {ok, IndexPosition} = file:position(IndexFD, eof),

    vg_topics:insert_hwm(Topic, Partition, Id),

    {ok, active, #data{next_id = Id + 1,
                       next_brick = NextNode,
                       topic_dir = TopicDir,
                       byte_count = 0,
                       pos = Position,
                       index_pos = IndexPosition,
                       log_fd = LogFD,
                       segment_id = list_to_integer(LastLogId),
                       index_fd = IndexFD,
                       topic = Topic,
                       partition = Partition,
                       config = Config,
                       terminate_after = TerminateAfter,
                       timer_ref = erlang:start_timer(TerminateAfter, self(), terminate)
                      }}.

callback_mode() ->
    state_functions.

%% keep any new writes from coming in while we delete the topic
halted({call, From}, _, _) ->
    {keep_state_and_data, [{reply, From, halted}]}.

active({call, From}, halt, Data) ->
    {next_state, halted, Data, [{reply, From, halted}]};
active({call, From}, {tail, Printer}, Data) ->
    %% monitor so the tailer is forgotten when it dies (see handle_event DOWN)
    monitor(process, Printer),
    {keep_state, Data#data{tailer = Printer}, [{reply, From, ok}]};
active({call, From}, stop_indexing, Data=#data{index_fd=undefined}) ->
    {keep_state, Data#data{index = false}, [{reply, From, ok}]};
active({call, From}, stop_indexing, Data=#data{index_fd=FD}) ->
    %% no need to sync here, we're about to unlink
    file:close(FD),
    {keep_state, Data#data{index = false, index_fd = undefined}, [{reply, From, ok}]};
active({call, From}, resume_indexing, Data) ->
    %% index_fd stays undefined until the next segment roll reopens it
    {keep_state, Data#data{index = true}, [{reply, From, ok}]};
active({call, From}, {write, ExpectedID0, Record=#{last_offset_delta := LastOffsetDelta,
                                                   record_batch := RecordBatch}}, Data=#data{next_id=ID,
                                                                                            tailer=Tailer,
                                                                                            topic=Topic,
                                                                                            next_brick=NextBrick,
                                                                                            terminate_after=TerminateAfter,
                                                                                            timer_ref=TRef}) ->
    %% every write resets the idle-termination timer
    erlang:cancel_timer(TRef),
    TRef1 = erlang:start_timer(TerminateAfter, self(), terminate),
    Data1 = Data#data{timer_ref=TRef1},

    %% TODO: add pipelining of requests
    try
        ExpectedID =
            case ExpectedID0 of
                head ->
                    %% head of chain: assign the next offset ourselves
                    ID + LastOffsetDelta + 1;
                Supplied when is_integer(Supplied) ->
                    case (ID + LastOffsetDelta + 1) == Supplied of
                        true ->
                            ExpectedID0;
                        %% should we check > vs < here? one is repair
                        %% the other is bad corruption
                        _ ->
                            %% inferred current id of the writing segment
                            WriterID = ExpectedID0 - LastOffsetDelta,
                            %% this should probably be limited, if
                            %% we're going back too far, we need to be
                            %% in some sort of catch-up mode
                            lager:debug("starting write repair, ~p", [WriterID]),
                            WriteRepairSet = write_repair(WriterID, Data1),
                            throw({write_repair, WriteRepairSet, Data1})
                    end
            end,

        %% replicate downstream first (unless solo/tail); retry until the
        %% time budget (5x ack timeout) is exhausted
        Result =
            case NextBrick of
                Role when Role == solo; Role == tail -> proceed;
                _ ->
                    (fun Loop(_, Remaining) when Remaining =< 0 ->
                             {error, timeout};
                         Loop(Start, Remaining) ->
                             case vg_client:replicate(next_brick, Topic, ExpectedID, RecordBatch, Remaining) of
                                 retry ->
                                     Now = erlang:monotonic_time(milli_seconds),
                                     Elapsed = Now - Start,
                                     Loop(Now, Remaining - Elapsed);
                                 Result ->
                                     Result
                             end
                     end)(erlang:monotonic_time(milli_seconds), timeout() * 5)
            end,

        case Result of
            Go when Go =:= proceed orelse
                    element(1, Go) =:= ok ->
                %% replication (if any) acked: persist locally and notify tailer
                Data2 = write_record_batch(Record, Data1),
                case Tailer of
                    undefined ->
                        ok;
                    Pid ->
                        Pid ! {'$print', {Data2#data.next_id - 1, Record}}
                end,
                {keep_state, Data2, [{reply, From, {ok, Data2#data.next_id - 1}}]};
            {write_repair, RepairSet} ->
                %% downstream asked us to replay earlier batches first
                prometheus_counter:inc(write_repairs),
                %% add in the following when pipelining is added, if it makes sense
                %% prometheus_gauge:inc(pending_write_repairs, length(RepairSet)),
                Data2 = write_record_batch(RepairSet, Data1),
                case ExpectedID0 of
                    head ->
                        {keep_state, Data2, [{reply, From, retry}]};
                    _ ->
                        {keep_state, Data2, [{reply, From, {write_repair, RepairSet}}]}
                end;
            {error, Reason} ->
                {keep_state, Data1, [{reply, From, {error, Reason}}]}
        end
    catch throw:{write_repair, RS, D} ->
            {keep_state, D, [{reply, From, {write_repair, RS}}]};
          throw:{E, D} ->
            {keep_state, D, [{reply, From, {error, E}}]}
    end;
active(Type, Event, Data) ->
    handle_event(Type, Event, Data).


handle_event(info, {timeout, _TRef, terminate}, _Data) ->
    %% idle long enough: stop and release the file descriptors
    {stop, normal};
handle_event(info, {'DOWN', _MonitorRef, _Type, _Object, _Info}, Data) ->
    %% tail printer died; drop it
    {keep_state, Data#data{tailer = undefined}}.

terminate(_, _Reason, _Data=#data{log_fd=LogFile,
                                  index_fd=IndexFile}) ->
    file:close(LogFile),
    file:close(IndexFile),
    ok.
%

%% Append one or more record batches to the log.
%%
%% The list clause folds each batch through in order; the map clause
%% appends a single batch: roll the segment if needed, write the framed
%% batch, maybe write an index entry, and bump the high watermark.
write_record_batch(Batches, Data) when is_list(Batches) ->
    lists:foldl(fun(Batch, DataAcc) ->
                        write_record_batch(Batch, DataAcc)
                end, Data, Batches);
write_record_batch(#{last_offset_delta := LastOffsetDelta,
                     size := Size0,
                     record_batch := Bytes}, Data=#data{topic=Topic,
                                                        partition=Partition,
                                                        next_id=Id,
                                                        byte_count=ByteCount}) ->
    %% on-disk size includes the offset+length framing prefix
    Size = Size0 + ?OFFSET_AND_LENGTH_BYTES,
    NextId = Id + LastOffsetDelta + 1,
    Data1 = #data{pos=Position1,
                  log_fd=LogFile} = maybe_roll(Size, Data),

    %% write to log, framed as <first offset><batch length><batch bytes>.
    %% NOTE(review): this binary prefix was garbled in the source dump
    %% ("[<>, Bytes]", which is not valid Erlang); reconstructed as a
    %% 64-bit offset plus 32-bit length to match ?OFFSET_AND_LENGTH_BYTES
    %% -- verify against vg_protocol's log framing before merging.
    ok = file:write(LogFile, [<<Id:64/signed, Size0:32/signed>>, Bytes]),
    Data2 = Data1#data{byte_count=ByteCount+Size},

    %% maybe write index entry
    Data3 = update_index(Data2),

    %% update highwatermark in ets table
    vg_topics:update_hwm(Topic, Partition, NextId-1),

    Data3#data{next_id=NextId,
               pos=Position1+Size}.

%% Create new log segment and index file if current segment is too large
%% or if the index file is over its max and would be written to again.
%% Roll to a fresh segment when either the log segment would exceed
%% segment_bytes, or an index entry is due and the index file would
%% exceed index_max_bytes. Syncs and closes the old files, then opens
%% new ones whose base offset is the next record id.
maybe_roll(Size, Data=#data{next_id=Id,
                            topic_dir=TopicDir,
                            log_fd=LogFile,
                            index_fd=IndexFile,
                            pos=Position,
                            byte_count=ByteCount,
                            index_pos=IndexPosition,
                            index = Indexing,
                            topic=Topic,
                            partition=Partition,
                            config=#config{segment_bytes=SegmentBytes,
                                           index_max_bytes=IndexMaxBytes,
                                           index_interval_bytes=IndexIntervalBytes}})
  when Position+Size > SegmentBytes
       orelse (ByteCount+Size >= IndexIntervalBytes
               andalso IndexPosition+?INDEX_ENTRY_SIZE > IndexMaxBytes) ->
    lager:debug("seg size ~p max size ~p", [Position+Size, SegmentBytes]),
    lager:debug("index interval size ~p max size ~p", [ByteCount+Size, IndexIntervalBytes]),
    lager:debug("index pos ~p max size ~p", [IndexPosition+?INDEX_ENTRY_SIZE, IndexMaxBytes]),
    ok = file:sync(LogFile),
    ok = file:close(LogFile),

    %% index_fd is undefined while indexing is stopped; only close when real
    case Indexing of
        true ->
            ok = file:sync(IndexFile),
            ok = file:close(IndexFile);
        _ ->
            ok
    end,

    {NewIndexFile, NewLogFile} = vg_log_segments:new_index_log_files(TopicDir, Id),
    vg_log_segments:insert(Topic, Partition, Id),

    Data#data{log_fd=NewLogFile,
              index_fd=NewIndexFile,
              %% we assume here that new indexes are good, and
              %% re-enable writing, expecting the old indexes to
              %% catch up eventually. This might be racy
              index = true,
              segment_id = Id,
              byte_count=0,
              pos=0,
              index_pos=0};
maybe_roll(_, Data) ->
    Data.

%% skip writing indexes if they're disabled.
%% Indexing disabled: nothing to write.
update_index(Data=#data{index = false}) ->
    Data;
%% Add to index if the number of bytes written to the log since the last index record was written
update_index(Data=#data{next_id=Id,
                        pos=Position,
                        index_fd=IndexFile,
                        byte_count=ByteCount,
                        index_pos=IndexPosition,
                        segment_id=BaseOffset,
                        config=#config{index_interval_bytes=IndexIntervalBytes}})
  when ByteCount >= IndexIntervalBytes ->
    %% index entries are <segment-relative offset, log file position> pairs
    IndexEntry = <<(Id - BaseOffset):?INDEX_OFFSET_BITS/unsigned, Position:?INDEX_OFFSET_BITS/unsigned>>,
    ok = file:write(IndexFile, IndexEntry),
    Data#data{index_pos=IndexPosition+?INDEX_ENTRY_SIZE,
              byte_count=0};
update_index(Data) ->
    Data.

%% Re-read the raw bytes for offsets Start..next_id-1 so they can be
%% replayed down the chain. Returns [{SegmentId, Bytes}].
write_repair(Start, #data{next_id = ID, topic = Topic, partition = Partition}) ->
    %% two situations: replaying single-segment writes, and writes
    %% that span multiple segments
    {StartSegmentID, {StartPosition, _}} = vg_log_segments:find_segment_offset(Topic, Partition, Start),
    {EndSegmentID, {EndPosition, EndSize}} = vg_log_segments:find_segment_offset(Topic, Partition, ID),
    File = vg_utils:log_file(Topic, Partition, StartSegmentID),
    lager:debug("at=write_repair file=~p start=~p end=~p", [File, StartPosition, EndPosition]),
    case StartSegmentID == EndSegmentID of
        true ->
            {ok, FD} = file:open(File, [read, binary, raw]),
            try
                {ok, Data} = file:pread(FD, StartPosition, (EndPosition + EndSize) - StartPosition),
                [{StartSegmentID, Data}]
            after
                file:close(FD)
            end;
        _ ->
            %% cross-segment repair is not supported yet
            error(not_implemented)
    end.
%% Read the required storage settings from the application environment;
%% crashes (badmatch) if any are missing, which is intended.
setup_config() ->
    {ok, [LogDir]} = application:get_env(vonnegut, log_dirs),
    {ok, SegmentBytes} = application:get_env(vonnegut, segment_bytes),
    {ok, IndexMaxBytes} = application:get_env(vonnegut, index_max_bytes),
    {ok, IndexIntervalBytes} = application:get_env(vonnegut, index_interval_bytes),
    #config{log_dir=LogDir,
            segment_bytes=SegmentBytes,
            index_max_bytes=IndexMaxBytes,
            index_interval_bytes=IndexIntervalBytes}.

%% Ack timeout (ms) used for chain replication calls.
timeout() ->
    application:get_env(vonnegut, ack_timeout, 1000).

%% -------------------------------------------------------------------------
%% /src/vg_chain_state.erl
%% -------------------------------------------------------------------------

%%%-------------------------------------------------------------------
%% @doc Track the current state of the chain this node is a member of.
%%
%% @end
%%%-------------------------------------------------------------------
-module(vg_chain_state).

-behaviour(gen_statem).

-export([start_link/0,
         next/0,
         head/0]).

-export([init/1,
         active/3,
         inactive/3,
         callback_mode/0,
         terminate/3,
         code_change/4]).

-include_lib("kernel/include/inet.hrl").

-type chain_name() :: atom().
-type role() :: head | tail | middle | solo | undefined.
-type chain_node() :: {atom(), inet:ip_address() | inet:hostname(), inet:port_number(), inet:port_number()}.

%% fixed: was -export_types(...), which is not a recognized module
%% attribute and silently exported nothing.
-export_type([role/0,
              chain_node/0]).

%% chain-membership state held by the statem
-record(data, {
          name :: chain_name(),
          role :: role(),
          head :: node(),                          % head node of the chain
          cluster_type :: vg_utils:cluster_type(),
          members :: ordsets:ordset(),             % current chain membership
          all_nodes :: [chain_node()] | undefined,
          next_node :: atom() | tail,              % next link in the chain
          replicas :: integer()                    % requested chain size
         }).

-define(SERVER, ?MODULE).
-define(NODENAME, vonnegut).
start_link() ->
    gen_statem:start_link({local, ?SERVER}, ?MODULE, [], []).

%% Next link in the chain for this node (or `tail' / `undefined').
next() ->
    gen_statem:call(?SERVER, next_node).

%% Head node of the chain (or `undefined' while inactive).
head() ->
    gen_statem:call(?SERVER, head).

init([]) ->
    ChainName = vg_config:chain_name(),
    ClusterType = vg_config:cluster_type(),
    Replicas = vg_config:replicas(),
    %% start inactive and immediately attempt to connect via state_timeout
    {ok, inactive, #data{name=ChainName,
                         replicas=Replicas,
                         cluster_type=ClusterType}, [{state_timeout, 0, connect}]}.

inactive(enter, _, _Data) ->
    prometheus_boolean:set(is_active, false),
    keep_state_and_data;
inactive({call, From}, next_node, _Data) ->
    {keep_state_and_data, [{reply, From, undefined}]};
inactive({call, From}, head, _Data) ->
    {keep_state_and_data, [{reply, From, undefined}]};
%% Periodic connection attempt: discover/join peers and decide our role.
inactive(state_timeout, connect, Data=#data{name=Name,
                                            replicas=Replicas,
                                            cluster_type=ClusterType}) ->
    {Members, AllNodes} = join(ClusterType),
    lager:info("cluster_type=~p members=~p all_nodes=~p", [ClusterType, Members, AllNodes]),
    case {whereis(vg_topics_sup), role(node(), Members, Replicas, ClusterType)} of
        {undefined, _} ->
            %% topics supervisor not up yet; retry in a second
            {keep_state, Data#data{members=Members,
                                   role=undefined}, [{state_timeout, 1000, connect}]};
        {_P, solo} when is_pid(_P) ->
            %% single-node chain: this node is head and tail at once
            lager:info("at=chain_complete role=solo requested_size=1", []),
            lager:info("at=start_cluster_mgr role=solo"),
            vonnegut_sup:start_acceptor_pool(solo),
            ClientPort = vg_config:port(),
            PartisanPort = application:get_env(partisan, peer_port, 10200),
            [N, H] = string:split(atom_to_list(node()), "@"),
            vonnegut_sup:start_cluster_mgr(solo, [{list_to_atom(N), H, PartisanPort, ClientPort}]),
            {next_state, active, Data#data{members=Members,
                                           role=solo,
                                           head=node(),
                                           all_nodes=[],
                                           next_node=tail}};
        {_P, undefined} when is_pid(_P) ->
            %% role not determinable yet (e.g. srv discovery incomplete)
            {keep_state, Data#data{members=Members,
                                   role=undefined}, [{state_timeout, 1000, connect}]};
        {_P, Role} when is_pid(_P) ->
            lager:info("at=chain_join role=~s members=~p", [Role, Members]),
            case length(Members) of
                Size when Size >= Replicas ->
                    case Role of
                        head ->
                            lager:info("at=start_cluster_mgr role=head"),
                            %% if cluster mgr isn't running, start it
                            %% otherwise, add this chain to the cluster mgr
                            %% and all our topics
                            vonnegut_sup:start_cluster_mgr(Name, AllNodes);
                        _ ->
                            ok
                    end,

                    vonnegut_sup:start_acceptor_pool(Role),

                    %% monitor next link in the chain
                    NextNode = next_node(Role, node(), Members),
                    case string:split(atom_to_list(NextNode), "@") of
                        [N, H] ->
                            %% look up the client port of the next node
                            [Port] = [P || {N1, H1, _, P} <- AllNodes,
                                           N1 =:= list_to_atom(N),
                                           H1 =:= H],
                            vg_client_pool:start_pool(next_brick, #{ip => H,
                                                                    port => Port});
                        _ ->
                            ok
                    end,

                    Self = self(),
                    vg_peer_service:on_down(NextNode, fun() -> Self ! {next_node_down, NextNode} end),

                    lager:info("at=chain_complete requested_size=~p", [Replicas]),
                    {next_state, active, Data#data{members=Members,
                                                   head=hd(Members),
                                                   all_nodes=AllNodes,
                                                   role=Role,
                                                   next_node=NextNode}};
                Size ->
                    %% not enough members yet; keep retrying
                    lager:info("at=chain_incomplete requested_size=~p current_size=~p", [Replicas, Size]),
                    {keep_state, Data#data{members=Members,
                                           role=Role}, [{state_timeout, 1000, connect}]}
            end
    end;
inactive(info, {next_node_down, NextNode}, _Data) ->
    %% a down notification while already inactive: just reconnect now
    lager:info("state=inactive next_node_down=~p", [NextNode]),
    {keep_state_and_data, [{state_timeout, 0, connect}]}.
active(enter, _, #data{role=Role, replicas=Replicas}) ->
    set_metrics(Role, Replicas),
    keep_state_and_data;
active({call, From}, next_node, #data{next_node=NextNode}) ->
    {keep_state_and_data, [{reply, From, NextNode}]};
active({call, From}, head, #data{head=Head}) ->
    {keep_state_and_data, [{reply, From, Head}]};
active(info, {next_node_down, NextNode}, Data) ->
    lager:info("state=active next_node_down=~p", [NextNode]),
    %% NOTE(review): the trailing 0 is an event-timeout action, but
    %% inactive/3 only handles a `state_timeout' connect event -- confirm
    %% this reconnect path is actually exercised as intended.
    {next_state, inactive, Data, 0}.

callback_mode() ->
    [state_functions, state_enter].

terminate(_Reason, _State, _Data) ->
    ok.

code_change(_, _OldState, Data, _) ->
    {ok, Data}.

%% Internal functions

%% Decide this node's chain role from the member list, requested replica
%% count and cluster type.
%% assume we expect to find at least 1 node if using srv discovery
role(_Node, _, 1, _) ->
    solo;
role(_Node, [], _, {srv, _}) ->
    undefined;
role(Node, [Node], _, {srv, _}) ->
    undefined;
role(_Node, [], _, local) ->
    solo;
role(Node, [Node], _, local) ->
    solo;
role(Node, [Node | _], _, _) ->
    head;
role(Node, Nodes, _, _) ->
    case lists:reverse(Nodes) of
        [Node | _] ->
            tail;
        _ ->
            middle
    end.

%% Node this role should replicate to next; `tail' means end of chain.
next_node(tail, _, _) ->
    tail;
next_node(head, _, [_, Next | _]) ->
    Next;
next_node(_, Node, []) ->
    Node;
next_node(_, _, [N]) ->
    N;
next_node(_, Node, Nodes) ->
    find_next(Node, Nodes).

-spec find_next(Node :: atom(), Nodes :: ordsets:ordset()) -> atom().
find_next(Node, Nodes) ->
    try
        %% set the accumulator when the node we are looking
        %% for the next of is found and throw to return
        %% the first element encountered after the acc is set
        ordsets:fold(fun(N, none) when Node =:= N ->
                             N;
                        (_, none) ->
                             none;
                        (N, _) ->
                             throw(N)
                     end, none, Nodes)
    catch
        throw:N ->
            N
    end.

%% Join every discovered node via the peer service and return the
%% sorted member list together with the raw discovery results.
join(ClusterType) ->
    AllNodes = lookup(ClusterType),
    ordsets:fold(fun({Name, Host, PartisanPort, _ClientPort}, _) ->
                         Node = list_to_atom(atom_to_list(Name)++"@"++Host),
                         %% resolve Host unless it is already an IP literal
                         IP = case inet:parse_address(Host) of
                                  {error, einval} ->
                                      {ok, #hostent{h_addr_list=[IPAddress | _]}} = inet_res:getbyname(Host, a),
                                      IPAddress;
                                  {ok, IPAddress} ->
                                      IPAddress
                              end,
                         N = #{name => Node,
                               listen_addrs => [#{ip => IP, port => PartisanPort}],
                               parallelism => 1},
                         vg_peer_service:join(N)
                 end, ok, AllNodes),
    {ok, Members} = vg_peer_service:members(),
    {lists:usort(Members), AllNodes}.

%% leave() ->
%%     vg_peer_service:leave([]).

%%

%% Discover chain nodes for each cluster type.
lookup(local) ->
    ordsets:new();
lookup(none) ->
    ordsets:new();
lookup({direct, Nodes}) ->
    ordsets:from_list(Nodes);
lookup({srv, DiscoveryDomain}) ->
    %% SRV answers are {Priority, Weight, Port, Target}
    lists:foldl(fun({_, _, PartisanPort, H}, NodesAcc) ->
                        Node = list_to_atom(atom_to_list(?NODENAME)++"@"++H),
                        %% we could also do this by querying
                        %% the srv records of _data._tcp.vonnegut.default.svc.cluster.local
                        ClientPort = rpc:call(Node, vg_config, port, []),
                        ordsets:add_element({?NODENAME, H, PartisanPort, ClientPort}, NodesAcc)
                end, ordsets:new(), inet_res:lookup(DiscoveryDomain, in, srv)).
%% Mark this node active and publish role/replica metrics; exactly one
%% of the is_* role booleans ends up true.
set_metrics(Role, Replicas) ->
    prometheus_boolean:set(is_active, true),
    prometheus_gauge:set(replicas, Replicas),
    RoleMetric = role_metric(Role),
    [prometheus_boolean:set(B, false) || B <- [is_solo,
                                               is_head,
                                               is_tail,
                                               is_middle], B =/= RoleMetric],
    prometheus_boolean:set(RoleMetric, true).

%% Map a chain role onto its prometheus boolean name.
role_metric(solo) ->
    is_solo;
role_metric(head) ->
    is_head;
role_metric(tail) ->
    is_tail;
role_metric(middle) ->
    is_middle.

%% -------------------------------------------------------------------------
%% /src/vg_cleaner.erl
%% -------------------------------------------------------------------------

-module(vg_cleaner).

-behaviour(gen_server).

-export([start_link/2,
         run_cleaner/2]).

-export([init/1,
         handle_call/3,
         handle_cast/2,
         handle_info/2,
         terminate/2,
         code_change/3]).

%% per-topic/partition retention state
-record(state, {topic_dir :: file:filename_all(),
                topic :: binary(),
                partition :: integer(),
                retention_check_ms :: integer(), % how often to scan for old segments
                retention_seconds :: integer(),  % age at which segments are deleted
                t_ref :: timer:tref()}).

-define(SERVER(Topic, Partition), {via, gproc, {n, l, {vg_cleaner, Topic, Partition}}}).

start_link(Topic, Partition) ->
    gen_server:start_link(?SERVER(Topic, Partition),
                          ?MODULE, [Topic, Partition], []).

%% Run a retention pass immediately (also reschedules the timer).
run_cleaner(Topic, Partition) ->
    gen_server:call(?SERVER(Topic, Partition), run_cleaner).
init([Topic, Partition]) ->
    {ok, RetentionCheckMin} = application:get_env(vonnegut, log_retention_check_interval),
    {ok, RetentionMinutes} = application:get_env(vonnegut, log_retention_minutes),
    RetentionCheckMs = round(timer:minutes(RetentionCheckMin)),
    RetentionSeconds = RetentionMinutes * 60,

    TopicDir = vg_utils:topic_dir(Topic, Partition),
    %% schedule the first retention pass
    {ok, TRef} = timer:send_after(RetentionCheckMs, run_cleaner),
    {ok, #state{topic_dir=TopicDir,
                topic=Topic,
                partition=Partition,
                retention_check_ms=RetentionCheckMs,
                retention_seconds=RetentionSeconds,
                t_ref=TRef}}.

%% Manual pass: cancel the pending timer, clean now, reschedule.
handle_call(run_cleaner, _From, State=#state{topic_dir=TopicDir,
                                             topic=Topic,
                                             partition=Partition,
                                             retention_check_ms=RetentionCheckMs,
                                             retention_seconds=RetentionSeconds,
                                             t_ref=TRef}) ->
    timer:cancel(TRef),
    run_cleaner_(TopicDir, Topic, Partition, RetentionSeconds),
    {ok, TRef1} = timer:send_after(RetentionCheckMs, run_cleaner),
    {reply, ok, State#state{t_ref=TRef1}}.

handle_cast(_, State) ->
    {noreply, State}.

%% Scheduled pass fired by timer:send_after/2.
handle_info(run_cleaner, State=#state{topic_dir=TopicDir,
                                      topic=Topic,
                                      partition=Partition,
                                      retention_check_ms=RetentionCheckMs,
                                      retention_seconds=RetentionSeconds}) ->
    run_cleaner_(TopicDir, Topic, Partition, RetentionSeconds),
    {ok, TRef} = timer:send_after(RetentionCheckMs, run_cleaner),
    {noreply, State#state{t_ref=TRef}}.

terminate(_Reason, _State) ->
    ok.

code_change(_, State, _) ->
    {ok, State}.
%% Internal functions

%% Delete every *.log segment (and its matching .index file) whose last
%% modification time is older than RetentionSeconds.
run_cleaner_(TopicDir, Topic, Partition, RetentionSeconds) ->
    Segments = filelib:wildcard(filename:join(ec_cnv:to_list(TopicDir), "*.log")),
    Now = calendar:datetime_to_gregorian_seconds(calendar:universal_time()),
    lists:foreach(fun(Segment) ->
                          LastModified = filelib:last_modified(Segment),
                          %% mtime is local time; convert to UTC for comparison
                          [LastUniversal | _] = calendar:local_time_to_universal_time_dst(LastModified),
                          Diff = Now - calendar:datetime_to_gregorian_seconds(LastUniversal),
                          if
                              Diff >= RetentionSeconds ->
                                  SegmentId = filename:basename(Segment, ".log"),
                                  lager:info("at=delete topic=~s partition=~p segment=~s", [Topic, Partition, SegmentId]),
                                  RootName = filename:rootname(Segment),
                                  ok = file:delete(RootName++".index"),
                                  ok = file:delete(Segment);
                              true ->
                                  ok
                          end
                  end, Segments).

%% -------------------------------------------------------------------------
%% /src/vg_client.erl
%% -------------------------------------------------------------------------

-module(vg_client).

-behavior(shackle_client).

-export([metadata/0, metadata/1,
         ensure_topic/1,
         delete_topic/1,
         topics/0, topics/2,
         fetch/1, fetch/2, fetch/3,

         %% internal-only stuff
         replicate/5,
         delete_topic/2,
         %% end internal

         produce/2, produce/3,
         init/1,
         setup/2,
         handle_request/2,
         handle_data/2,
         terminate/1]).

-include("vg.hrl").

%% shackle connection state
-record(state, {
          request_counter = 0 :: non_neg_integer(), % monotonically increasing request id
          buffer = <<>> :: binary(),                % unparsed bytes received so far
          expected_size = 0 :: non_neg_integer()    % bytes the current response frame needs
         }).

-define(TIMEOUT, 5000).

-spec metadata() -> {ok, {Chains :: vg_cluster_mgr:chains_map(),
                          Topics :: vg_cluster_mgr:topics_map()}}.
metadata() ->
    %% this is maybe a silly default, considering that it could return
    %% millions of topics
    metadata([]).
%% Request metadata for the given topics ([] means all).
metadata(Topics) ->
    Request = vg_protocol:encode_metadata_request(Topics),
    scall(metadata, ?METADATA_REQUEST, Request, ?TIMEOUT).

-spec ensure_topic(Topic :: vg:topic()) ->
          {ok, {Chains :: vg_cluster_mgr:chains_map(),
                Topics :: vg_cluster_mgr:topics_map()}} |
          {error, Reason :: term()}.
ensure_topic(Topic) ->
    %% always use the metadata topic, creation happens inside via a global process.
    Request = vg_protocol:encode_metadata_request([Topic]),
    scall(metadata, ?ENSURE_REQUEST, Request, ?TIMEOUT).

-spec fetch(Topic)
           -> {ok, #{high_water_mark := integer(),
                     record_batches_size := integer(),
                     error_code := integer(),
                     record_batches := RecordBatches}}
               when Topic :: vg:topic() | [{vg:topic(), [{integer(), integer(), integer()}]}],
                    RecordBatches :: [vg:record_batch()].

%% if we don't want to expose the tuple in the second clauses of
%% fetch/1 and fetch/2, we could do something like fetch_partial,
%% which would return the tuple and options, which then could be fed
%% into an execute_multifetch function which would do the right thing.

%% Fetch a single topic from offset 0, or a list of prepared requests.
fetch(Topic) when is_binary(Topic) ->
    do_fetch([{Topic, 0, #{}}], ?TIMEOUT);
fetch(Requests) when is_list(Requests) ->
    do_fetch(Requests, ?TIMEOUT).

%% Fetch a topic from Position, or a request list with an explicit timeout.
fetch(Topic, Position) when is_binary(Topic) ->
    do_fetch([{Topic, Position, #{}}], ?TIMEOUT);
fetch(Requests, Timeout) when is_list(Requests) ->
    do_fetch(Requests, Timeout).

%% Fetch at most Limit records from Position.
fetch(Topic, Position, Limit) when is_binary(Topic) ->
    do_fetch([{Topic, Position, #{limit => Limit}}], ?TIMEOUT).
%% Group the fetch requests by the (read) pool serving each topic, issue one
%% encoded fetch per pool, and merge the per-pool result maps. Throws are used
%% for the two non-local exits: unknown topic and head/tail-swap restart.
do_fetch(Requests, Timeout) ->
    try
        PoolReqs =
            lists:foldl(
              fun({Topic, _Position, _Opts} = R, Acc) ->
                      case vg_client_pool:get_pool(Topic, read) of
                          {ok, Pool} ->
                              lager:debug("fetch request to pool: ~p ~p", [Topic, Pool]),
                              case Acc of
                                  #{Pool := PoolReqs} ->
                                      Acc#{Pool => [R | PoolReqs]};
                                  _ ->
                                      Acc#{Pool => [R]}
                              end;
                          {error, Reason} ->
                              throw({error, Reason})
                      end
              end, #{}, Requests),
        %% should we do these in parallel?
        Restart = application:get_env(vonnegut, swap_restart, true),
        Resps = maps:map(
                  fun(Pool, TPO0) ->
                          %% translate per-topic options into the wire tuple;
                          %% partition is fixed at 0, limit -1 means "no limit"
                          TPO = [begin
                                     MaxBytes = maps:get(max_bytes, Opts, 0),
                                     Limit = maps:get(limit, Opts, -1),
                                     {Topic, [{0, Position, MaxBytes, Limit}]}
                                 end
                                 || {Topic, Position, Opts} <- TPO0],
                          ReplicaId = -1,
                          MaxWaitTime = 5000,
                          MinBytes = 100,
                          Request = vg_protocol:encode_fetch(ReplicaId, MaxWaitTime, MinBytes, TPO),
                          case scall(Pool, ?FETCH2_REQUEST, Request, Timeout) of
                              %% sometimes because of cloud orchestration, and
                              %% restarts, head and tail nodes will switch or
                              %% move around in time for us to reconnect to them
                              %% in error, so if we get these codes, start over
                              {ok, Map} when is_map(Map) andalso Restart =:= true ->
                                  case maps:fold(
                                         fun(_, _, true) ->
                                                 true;
                                            (_, #{0 := #{error_code := ?FETCH_DISALLOWED_ERROR}}, _) ->
                                                 true;
                                            (T, #{0 := #{error_code := ?UNKNOWN_TOPIC_OR_PARTITION}}, _) ->
                                                 throw({error, {T, not_found}});
                                            (_, _, _) ->
                                                 false
                                         end, false, Map) of
                                      true ->
                                          throw(restart);
                                      _ ->
                                          {ok, Map}
                                  end;
                              {ok, Result} ->
                                  %% if there are any error codes in any
                                  %% of these, transform the whole thing
                                  %% into an error
                                  {ok, Result};
                              {error, Reason} ->
                                  {error, Reason}
                          end
                  end, PoolReqs),

        %% merge all per-pool {ok, Map} results; any pool error wins
        lists:foldl(
          fun(_, {error, Response}) ->
                  {error, Response};
             ({_Pool, {ok, Response}}, {ok, Acc}) ->
                  {ok, maps:merge(Acc, Response)};
             ({_Pool, {error, Response}}, _) ->
                  {error, Response}
          end,
          {ok, #{}}, maps:to_list(Resps))
    catch throw:{error, {Topic, not_found}} ->
            lager:error("tried to fetch from non-existent topic ~p", [Topic]),
            {error, {Topic, not_found}};
          throw:restart ->
            lager:info("disallowed request error, restarting pools"),
            vg_client_pool:restart(),
            do_fetch(Requests, Timeout)
    end.

-spec replicate(Pool, Topic, ExpectedId, RecordBatch, Timeout)
               -> {ok, integer()} | {error, term()} | {write_repair, maps:map()} | retry
                   when Pool :: atom(),
                        Topic :: vg:topic(),
                        ExpectedId :: integer(),
                        RecordBatch :: vg:record_batch() | [vg:record_batch()],
                        Timeout :: integer().
%% Chain-replication write: send RecordBatch expecting the next id to be
%% ExpectedId; the server may answer with a write-repair set or a retry hint.
replicate(Pool, Topic, ExpectedId, RecordBatch, Timeout) ->
    lager:debug("replicate pool=~p topic=~p", [Pool, Topic]),
    Request = vg_protocol:encode_replicate(0, 5000, Topic, 0, ExpectedId, RecordBatch),
    case scall(Pool, ?REPLICATE_REQUEST, Request, Timeout) of
        {ok, {0, #{error_code := 0,
                   offset := Offset}}} ->
            {ok, Offset};
        {ok, {0, #{error_code := ?WRITE_REPAIR, records := RecordBatches}}} ->
            {write_repair, RecordBatches};
        {ok, {0, #{error_code := ?TIMEOUT_ERROR}}} ->
            retry;
        {ok, {0, #{error_code := ErrorCode}}} ->
            {error, ErrorCode};
        {error, Reason} ->
            {error, Reason}
    end.

%% Delete Topic via its write (head) pool; generous 60s timeout because
%% deletion has to propagate down the chain.
delete_topic(Topic) ->
    case vg_client_pool:get_pool(Topic, write) of
        {ok, Pool} ->
            Request = vg_protocol:encode_delete_topic(Topic),
            case scall(Pool, ?DELETE_TOPIC_REQUEST, Request, timer:seconds(60)) of
                {ok, ok} -> ok;
                {error, Reason} -> {error, Reason}
            end;
        {error, Reason} ->
            {error, Reason}
    end.
%% Internal variant of delete used during replication: the caller supplies
%% the pool explicitly instead of resolving it from the topic.
delete_topic(Pool, Topic) ->
    lager:debug("delete_topic pool=~p topic=~p", [Pool, Topic]),
    Request = vg_protocol:encode_delete_topic(Topic),
    case scall(Pool, ?REPLICATE_DELETE_TOPIC_REQUEST, Request, timer:seconds(60)) of
        {ok, ok} -> ok;
        {error, Reason} -> {error, Reason}
    end.

-spec produce(Topic, RecordBatch)
             -> {ok, integer()} | {error, term()}
                 when Topic :: vg:topic(),
                      RecordBatch :: vg:record_batch() | [vg:record_batch()].
%% Produce with the default 5s timeout.
produce(Topic, RecordBatch) ->
    produce(Topic, RecordBatch, ?TIMEOUT).

-spec produce(Topic, RecordBatch, Timeout)
             -> {ok, integer()} | {error, term()}
                 when Topic :: vg:topic(),
                      RecordBatch :: vg:record_batch() | [vg:record_batch()],
                      Timeout :: pos_integer().
%% Encode once here so retries inside produce_/3 don't re-encode the batch.
produce(Topic, RecordBatch, Timeout) ->
    #{record_batch := EncodedRecordBatch} = vg_protocol:encode_record_batch(RecordBatch),
    produce_(Topic, EncodedRecordBatch, Timeout).
%% Send an already-encoded record batch to the write (head) pool for Topic.
%% Returns {ok, Offset} of the written batch or {error, Reason}.
produce_(Topic, EncodedRecordBatch, Timeout) ->
    case vg_client_pool:get_pool(Topic, write) of
        {ok, Pool} ->
            lager:debug("produce request to pool: ~p ~p", [Topic, Pool]),
            TopicRecords = [{Topic, [{0, EncodedRecordBatch}]}],
            Restart = application:get_env(vonnegut, swap_restart, true),
            Request = vg_protocol:encode_produce(0, 5000, TopicRecords),
            case scall(Pool, ?PRODUCE_REQUEST, Request, Timeout) of
                {ok, #{Topic := #{0 := #{error_code := 0,
                                         offset := Offset}}}} ->
                    {ok, Offset};
                {ok, #{Topic := #{0 := #{error_code := ?TIMEOUT_ERROR}}}} ->
                    {error, timeout};
                %% sometimes because of cloud orchestration, and
                %% restarts, head and tail nodes will switch or
                %% move around in time for us to reconnect to them
                %% in error, so if we get these codes, start over
                {ok, #{Topic := #{0 := #{error_code := ?PRODUCE_DISALLOWED_ERROR}}}}
                  when Restart =:= true ->
                    lager:info("disallowed request error, restarting pools"),
                    vg_client_pool:restart(),
                    produce_(Topic, EncodedRecordBatch, Timeout);
                {ok, #{Topic := #{0 := #{error_code := ErrorCode}}}} ->
                    {error, ErrorCode};
                {error, Reason} ->
                    {error, Reason}
            end;
        {error, Reason} ->
            {error, Reason}
    end.

%% List all topics known to the metadata pool.
topics() ->
    topics(metadata, []).

%% List topics (restricted to Topics when non-empty) via the given pool.
%% Each topic name is length-prefixed per the wire protocol.
topics(Pool, Topics) ->
    Request = vg_protocol:encode_array([<<(byte_size(T)):16/signed-integer, T/binary>> || T <- Topics]),
    case scall(Pool, ?TOPICS_REQUEST, Request, ?TIMEOUT) of
        {ok, {_, _}} = OK ->
            OK;
        {error, Reason} ->
            {error, Reason}
    end.

%% shackle_client callback: fresh per-connection state.
-spec init(term()) -> {ok, term()}.
init(_) ->
    {ok, #state{}}.

%% shackle_client callback: nothing to negotiate after connect.
-spec setup(inet:socket(), term()) -> {ok, term()} | {error, term(), term()}.
setup(_Socket, State) ->
    {ok, State}.
%% shackle_client callback: frame an outgoing request with a fresh
%% correlation id and a 32-bit big-endian size prefix.
-spec handle_request({integer(), iodata()}, #state{}) -> {ok, non_neg_integer(), iodata(), term()}.
handle_request({Type, Body}, State=#state{request_counter=RequestCounter}) ->
    Id = request_id(RequestCounter),
    Data = vg_protocol:encode_request(Type, Id, ?CLIENT_ID, Body),
    {ok, Id, [<<(iolist_size(Data)):32/signed-integer>>, Data],
     State#state{request_counter = RequestCounter + 1}}.

%% shackle_client callback: prepend any partial frame buffered from the
%% previous TCP packet, then decode as many complete responses as possible.
%% (FIX: the binary expression here had been destroyed by text extraction;
%% reconstructed as buffer-then-new-data, the only order consistent with
%% decode_data/3 buffering the undecoded tail below.)
-spec handle_data(binary(), term()) -> {ok, [{term(),term()}], term()}.
handle_data(Data, State=#state{buffer=Buffer}) ->
    Data2 = <<Buffer/binary, Data/binary>>,
    decode_data(Data2, [], State).

%% Accumulate {CorrelationId, {ok, Response}} replies until the input is
%% exhausted or an incomplete frame remains (which is saved in the buffer).
decode_data(<<>>, Replies, State) ->
    {ok, Replies, State};
decode_data(Data, Replies, State=#state{expected_size=Exp}) ->
    case Exp of
        %% only attempt a decode when we have no size hint yet (0) or at
        %% least the previously announced number of bytes has arrived
        N when N == 0 orelse byte_size(Data) >= N ->
            case vg_protocol:decode_response(Data) of
                more ->
                    {ok, Replies, State#state{buffer=Data}};
                {more, Size} ->
                    {ok, Replies, State#state{buffer=Data, expected_size=Size}};
                {CorrelationId, Response, Rest} ->
                    decode_data(Rest, [{CorrelationId, {ok, Response}} | Replies],
                                State#state{expected_size = 0,
                                            buffer = <<>>})
            end;
        _ ->
            {ok, Replies, State#state{buffer=Data}}
    end.

-spec terminate(term()) -> ok.
terminate(_State) ->
    ok.

%% private

%% Correlation ids wrap at ?MAX_REQUEST_ID.
request_id(RequestCounter) ->
    RequestCounter rem ?MAX_REQUEST_ID.

%% Call a pool with jittered-backoff retries (see scall/6).
scall(Pool, RequestType, RequestData, RequestTimeout) ->
    B = backoff:init(2, 200),
    B1 = backoff:type(B, jitter),
    %% at these settings, 25 retries is approximately 5s
    scall(Pool, RequestType, RequestData, RequestTimeout, B1, 25).
%% Retry wrapper around shackle:call/3: a hard timeout is returned as-is,
%% any other error backs off (jittered) and retries until Retries hits 0.
scall(_, _, _, _, _, 0) ->
    {error, pool_timeout};
scall(Pool, RequestType, RequestData, RequestTimeout, Backoff, Retries) ->
    case shackle:call(Pool, {RequestType, RequestData}, RequestTimeout) of
        {error, timeout} ->
            {error, timeout};
        {error, _} ->
            {Time, Backoff1} = backoff:fail(Backoff),
            timer:sleep(Time),
            scall(Pool, RequestType, RequestData, RequestTimeout, Backoff1, Retries - 1);
        {ok, Response} ->
            {ok, vg_protocol:decode_response(RequestType, Response)}
    end.

%% ---- file boundary: src/vg_client_pool.erl ----

%% Manages the shackle connection pools (one head/write and one tail/read
%% pool per chain) and the ETS table mapping topics to pools.
-module(vg_client_pool).

-export([start/0, start/1,
         stop/0,
         restart/0,
         get_pool/2,
         start_pool/2,
         make_pool_name/2,
         refresh_topic_map/0]).

-include("vg.hrl").

-define(OPTIONS, [set, public, named_table, {read_concurrency, true}]).

start() ->
    start(#{}).

start(Opts) ->
    %% so restarts won't lose settings
    application:set_env(vonnegut, global_pool_opts, Opts),
    start(Opts, 0).

%% Give up after 10 attempts to bootstrap the pools.
start(_Opts, 10) ->
    {error, could_not_start_pools};
start(Opts, N) ->
    %% maybe start this if it hasn't been
    application:ensure_all_started(shackle),
    case application:get_env(vonnegut, client) of
        {ok, ClientConfig} ->
            case proplists:get_value(endpoints, ClientConfig) of
                undefined ->
                    lager:error("No endpoints configured for client");
                [{Host, Port} | _] when is_integer(Port) ->
                    start_(Opts, N, Host, Port);
                [HostPort | _] when is_list(HostPort) ->
                    case parse_host_port(HostPort) of
                        {ok, Host, Port} -> start_(Opts, N, Host, Port);
                        %% FIX: format arguments must be wrapped in a list;
                        %% the bare term crashed the formatter at runtime
                        {error, _} -> lager:error("Invalid endpoint ~p", [HostPort])
                    end
            end;
        _ ->
            lager:info("No client configuration")
    end.
%% Bootstrap: start the metadata pool, fetch the chain list, then start one
%% write and one read pool per chain. Any failure sleeps briefly and retries
%% the whole start sequence (bounded by start/2's attempt counter).
start_(Opts, N, Host, Port) ->
    start_pool(metadata, Opts#{ip => Host,
                               port => Port}),
    try
        case vg_client:topics() of
            {ok, {_, Chains}} ->
                maybe_init_ets(),
                _ = start_pools(Chains),
                application:set_env(vonnegut, chains, Chains),
                refresh_topic_map(),
                ok
        end
    catch
        ?WITH_STACKTRACE(_, R, S)
            lager:warning("at=start_pools error=~p stacktrace=~p", [R, S]),
            timer:sleep(500),
            start(Opts, N + 1)
    end.

%% Parse "host" or "host:port" into {ok, Host, Port}.
parse_host_port(HostPortString) ->
    case string:split(HostPortString, ":", all) of
        [Host] ->
            {ok, Host, ?DEFAULT_PORT};
        [Host, Port] ->
            case string:to_integer(Port) of
                {IntegerPort, ""} -> {ok, Host, IntegerPort};
                {_, _} -> {error, invalid_host_port_string}
            end;
        [_|_] ->
            {error, invalid_host_port_string}
    end.

%% Start a head (write) and tail (read) shackle pool for every chain.
start_pools(Chains) ->
    [begin
         %% NOTE(review): the binary expression building the chain pool name
         %% was destroyed during text extraction; reconstructed as
         %% "<head-host>:<head-port>". Whatever form is used, it MUST match
         %% lookup_list/1 exactly so get_pool/2 resolves the same names.
         Name = <<HeadHost/binary, ":", (integer_to_binary(HeadPort))/binary>>,
         lager:info("starting chain: ~p ~p", [Name, C]),
         HeadName = make_pool_name(Name, write),
         start_pool(HeadName, #{ip => binary_to_list(HeadHost),
                                port => HeadPort}),
         %% a "solo" tail means the chain is a single node: read from the head
         TailHost =
             case TailHost0 of
                 <<"solo">> -> HeadHost;
                 _ -> TailHost0
             end,
         %% the name of the pool can be misleading as to what host and
         %% port it's on. Do we need to fix this?
         TailName = make_pool_name(Name, read),
         start_pool(TailName, #{ip => binary_to_list(TailHost),
                                port => TailPort})
     end
     || #{name := _Name,
          head := {HeadHost, HeadPort},
          tail := {TailHost0, TailPort}} = C <- Chains].

%% Re-fetch the chain list and rebuild the topic->pool lookup table.
refresh_topic_map() ->
    %% TODO live migrate pools when the chain list changes?
    %% or just restart the whole mess?
    {ok, {_, Chains}} = vg_client:topics(),
    maybe_init_ets(clean),
    ets:insert(?topic_map, {chains, Chains}),
    ets:insert(?topic_map, {lookup, lookup_list(Chains)}).
%% Build the {TopicsStart, TopicsEnd, HeadPool, TailPool} tuples consulted by
%% find_chain/3.
lookup_list(Chains) ->
    [begin
         %% NOTE(review): reconstructed after extraction loss — must build
         %% exactly the same binary as start_pools/1 so the pool atoms match.
         Name = <<HeadHost/binary, ":", (integer_to_binary(HeadPort))/binary>>,
         HeadName = make_pool_name(Name, write),
         TailName = make_pool_name(Name, read),
         {Start, End, HeadName, TailName}
     end
     || #{topics_start := Start,
          topics_end := End,
          head := {HeadHost, HeadPort}} <- Chains].

%% Resolve the pool responsible for Topic; RW selects head (write) or tail
%% (read). Populates the lookup table on first use.
get_pool(Topic, RW) ->
    %% at some point we should handle retries here for when the topic
    %% list is being refreshed.
    case ets:lookup(?topic_map, lookup) of
        [] ->
            refresh_topic_map(),
            get_pool(Topic, RW);
        [{_, Chains}] ->
            case find_chain(Chains, Topic, RW) of
                {ok, Pool} ->
                    lager:debug("found chain for topic=~p on pool=~p", [Topic, Pool]),
                    {ok, Pool};
                {error, Reason} ->
                    {error, Reason}
            end
    end.

%% Walk the chain ranges until Topic falls inside one. start_space/end_space
%% are literal atoms marking an unbounded edge of the topic keyspace.
%% TODO: work out how to replace this with a select, maybe
find_chain([], _Topic, _RW) ->
    {error, malformed_chain};
find_chain([{start_space, end_space, HeadName, TailName} | _Tail], _Topic, RW) ->
    {ok, pick_pool(HeadName, TailName, RW)};
find_chain([{start_space, E, HeadName, TailName} | _Tail], Topic, RW) when Topic =< E ->
    {ok, pick_pool(HeadName, TailName, RW)};
find_chain([{S, end_space, HeadName, TailName} | _Tail], Topic, RW) when Topic >= S ->
    {ok, pick_pool(HeadName, TailName, RW)};
find_chain([{S, E, HeadName, TailName} | _Tail], Topic, RW) when Topic >= S andalso Topic =< E ->
    {ok, pick_pool(HeadName, TailName, RW)};
find_chain([_|Tail], Topic, RW) ->
    find_chain(Tail, Topic, RW).

%% Writes go to the chain head, reads to the chain tail.
pick_pool(Head, _Tail, write) ->
    Head;
pick_pool(_Head, Tail, read) ->
    Tail.

%% NOTE(review): suffixes reconstructed after extraction loss; "_tail" for
%% read and "_head" for write mirror pick_pool/3's head/tail semantics.
make_pool_name(Chain, read) ->
    binary_to_atom(<<Chain/binary, "_tail">>, utf8);
make_pool_name(Chain, write) ->
    binary_to_atom(<<Chain/binary, "_head">>, utf8).

maybe_init_ets() ->
    maybe_init_ets(foo).
%% eventually handle the clean argument
%% Create the topic-map ETS table if it does not already exist.
maybe_init_ets(_) ->
    case ets:info(?topic_map, name) of
        undefined ->
            ets:new(?topic_map, ?OPTIONS);
        _ ->
            ok
    end.

%% Start a named shackle pool; Opts may carry ip/port/reconnect overrides.
%% The pool name is also recorded in app env so stop/0 can find it later.
start_pool(Name, Opts) ->
    ClientPoolSize = application:get_env(vonnegut, client_pool_size, 10),
    SocketOpts = [binary,
                  {buffer, 65535},
                  {nodelay, true},
                  {packet, raw},
                  {send_timeout, 5000},
                  {send_timeout_close, true}],
    Pools = application:get_env(vonnegut, client_pools, []),
    application:set_env(vonnegut, client_pools, [Name | Pools]),
    shackle_pool:start(Name, vg_client,
                       [{ip, maps:get(ip, Opts, "127.0.0.1")},
                        {port, maps:get(port, Opts, 5588)},
                        {reconnect, maps:get(reconnect, Opts, true)},
                        {reconnect_time_max, 120000},
                        {reconnect_time_min, 250},
                        {socket_options, SocketOpts}],
                       [{backlog_size, 1024},
                        {pool_size, ClientPoolSize},
                        {pool_strategy, round_robin}]).

%% Stop every recorded pool, then shackle itself.
%% NOTE(review): the client_pools env entry is not cleared here, so after a
%% restart/0 the list accumulates names of already-stopped pools — confirm
%% whether shackle_pool:stop/1 on a missing pool is harmless.
stop() ->
    [shackle_pool:stop(Pool)
     || Pool <- application:get_env(vonnegut, client_pools, [])],
    application:stop(shackle).

%% Full teardown + bootstrap using the options saved by start/1.
restart() ->
    stop(),
    Opts = application:get_env(vonnegut, global_pool_opts, #{}),
    start(Opts).
%%%%%%%%%%%%%%%%%%

%% ---- file boundary: src/vg_cluster_mgr.erl ----

%%%-------------------------------------------------------------------
%%% @author Tristan Sloughter <>
%%% @copyright (C) 2017, Tristan Sloughter
%%% @doc
%%%
%%% @end
%%% Created :  9 Feb 2017 by Tristan Sloughter <>
%%%-------------------------------------------------------------------
-module(vg_cluster_mgr).

-behaviour(gen_server).

%% API
-export([start_link/3,
         get_map/0,
         ensure_topic/1]).
-export([
         create_topic/1,
         delete_topic/1,
         describe_topic/1,
         deactivate_topic/1,
         running_topics/0
        ]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

-include("vg.hrl").

-type chain_id() :: binary().
-type topics_map() :: #{vg:topic() => chain_id()}.
-type chains_map() :: #{chain_id() => chain()}.

%% FIX: the attribute was misspelled as -export_types, which the compiler
%% silently ignores as an unknown attribute; also topic/0 is not defined in
%% this module (vg:topic() is used), so exporting it would not compile.
-export_type([chain_id/0,
              topics_map/0,
              chains_map/0]).

-define(SERVER, ?MODULE).

%% Cluster manager state: topic->chain assignments, the chain map, and a
%% monotonically increasing epoch bumped on every assignment change.
-record(state, {topics = #{} :: maps:map(),
                chains = #{} :: maps:map(),
                epoch :: integer()}).

-spec start_link(vg_chain_state:chain_name(), [vg_chain_state:chain_node()], file:filename_all()) -> {ok, pid()}.
start_link(ChainName, ChainNodes, DataDir) ->
    gen_server:start_link({local, ?SERVER}, ?MODULE, [ChainName, ChainNodes, DataDir], []).

%% add chain functionality needed

%% All API calls below address the manager registered on the chain head node.
-spec get_map() -> {Topics :: topics_map(), Chains :: chains_map(), Epoch :: integer()}.
get_map() ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, get_map).

-spec create_topic(Topic :: binary()) -> {ok, Chain :: binary()} | {error, exists}.
create_topic(Topic) ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, {create_topic, Topic}).

-spec ensure_topic(Topic :: binary()) -> {error, chain_not_found} |
                                         {error, topic_exists_other_chain} |
                                         {ok, chain_id()}.
ensure_topic(Topic) ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, {ensure_topic, Topic}).

%% infinity timeout: deletion touches every node in the chain.
delete_topic(Topic) ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, {delete_topic, Topic}, infinity).
describe_topic(Topic) ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, {describe_topic, Topic}).

deactivate_topic(Topic) ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, {deactivate_topic, Topic}).

running_topics() ->
    HeadNode = vg_chain_state:head(),
    gen_server:call({?SERVER, HeadNode}, running_topics).

%%%%%%%%%%%%%%%%%%%%%%%%

%% Build the single-chain state and schedule discovery of on-disk topics.
init([ChainName, ChainNodes, DataDir]) ->
    Chain = create_chain(ChainName, ChainNodes),
    State = load_state([Chain], DataDir),
    self() ! {ensure_topics, ChainName},
    {ok, State}.

handle_call(get_map, _From, State=#state{topics=Topics,
                                         chains=Chains,
                                         epoch=Epoch}) ->
    {reply, {Topics, Chains, Epoch}, State};
%% Create: fails with {error, exists} if the topic is already assigned.
handle_call({create_topic, Topic}, _From, State=#state{topics=Topics,
                                                       chains=Chains,
                                                       epoch=Epoch}) ->
    case maps:get(Topic, Topics, not_found) of
        not_found ->
            %% assign the topic to a random chain
            Keys = maps:keys(Chains),
            Random = rand:uniform(length(Keys)),
            Chain = lists:nth(Random, Keys),

            %% start topic process on all nodes in the chain
            #chain{nodes=Nodes} = maps:get(Chain, Chains),
            [{ok, _} = vg_topics_sup:start_child(Node, Topic, [0]) || Node <- Nodes],

            Topics1 = maps:put(Topic, Chain, Topics),
            {reply, {ok, Chain}, State#state{topics=Topics1,
                                             epoch=Epoch+1}};
        Chain ->
            lager:info("attempting to create topic that already exists on chain=~p", [Chain]),
            {reply, {error, exists}, State}
    end;
%% Ensure: idempotent create — an existing assignment just (re)starts the
%% topic processes and returns the chain.
handle_call({ensure_topic, Topic}, _From, State=#state{topics=Topics,
                                                       chains=Chains,
                                                       epoch=Epoch}) ->
    case maps:get(Topic, Topics, not_found) of
        not_found ->
            Keys = maps:keys(Chains),
            Random = rand:uniform(length(Keys)),
            Chain = lists:nth(Random, Keys),

            %% start topic process on all nodes in the chain
            start_on_all_nodes(Topic, Chain, Chains),
            Topics1 = maps:put(Topic, Chain, Topics),
            {reply, {ok, Chain}, State#state{topics=Topics1,
                                             epoch=Epoch+1}};
        Chain ->
            start_on_all_nodes(Topic, Chain, Chains),
            {reply, {ok, Chain}, State}
    end;
handle_call({delete_topic, Topic}, _From, State=#state{topics=Topics,
                                                       chains=Chains}) ->
    %% have topic mgr delete the topic segments and directory
    %% deactivate the topic so that it can be recreated if desired
    {Reply, Topics1} =
        case maps:get(Topic, Topics, not_found) of
            not_found -> {{error, not_found}, Topics};
            Chain ->
                Rep =
                    try
                        vg_topic_mgr:delete_topic(Topic, 0) % eventually iterate partitions?
                    catch _:{noproc, _} ->
                            %% topic processes not running: start them so the
                            %% manager exists, then retry the delete
                            start_on_all_nodes(Topic, Chain, Chains),
                            vg_topic_mgr:delete_topic(Topic, 0)
                    end,
                stop_on_all_nodes(Topic, Chain, Chains),
                {Rep, maps:remove(Topic, Topics)}
        end,
    {reply, Reply, State#state{topics=Topics1}};
%% handle_call({describe_topic, Topic}, _From, State=#state{topics=Topics,
%%                                                          chains=Chains}) ->
%%     %% get the hwm
%%     %% get number of segments
%%     %% size on disk
%%     %% check if it's running?
%%     {reply, ok, State};
handle_call({deactivate_topic, Topic}, _From, State=#state{topics=Topics,
                                                           chains=Chains}) ->
    Ret =
        case maps:get(Topic, Topics, not_found) of
            not_found -> {error, not_found};
            Chain -> stop_on_all_nodes(Topic, Chain, Chains)
        end,
    {reply, Ret, State};
handle_call(running_topics, _From, State=#state{chains=_Chains}) ->
    %% TODO: need to do this for all chains?
    Ret = vg_topics_sup:list_topics(node()),
    {reply, Ret, State};
%% NOTE(review): this catch-all returns {noreply, _} without ever replying,
%% so unknown calls leave the caller blocked until its call timeout —
%% confirm this is intentional.
handle_call(_, _, State) ->
    {noreply, State}.

handle_cast(_Msg, State) ->
    {noreply, State}.

%% Register every topic found on disk under this chain (sent from init/1).
handle_info({ensure_topics, ChainName}, State) ->
    State1 = ensure_all_topics(ChainName, State),
    {noreply, State1}.
terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%===================================================================
%%% Internal functions
%%%===================================================================

%% Start the topic's partition-0 process on every node of Chain; an
%% already-started child counts as success, anything else exits.
start_on_all_nodes(Topic, Chain, Chains) ->
    #chain{nodes=Nodes} = maps:get(Chain, Chains),
    [case vg_topics_sup:start_child(Node, Topic, [0]) of
         {ok, _} -> ok;
         {error,{already_started, _}} -> ok;
         {error, Reason} -> exit({error, Reason})
     end || Node <- Nodes].

%% Stop the topic on every node of Chain; returns [ok] when everything
%% stopped cleanly, otherwise includes annotated per-node errors.
stop_on_all_nodes(Topic, Chain, Chains) ->
    #chain{nodes=Nodes} = maps:get(Chain, Chains),
    %% usort here to remove useless oks
    lists:usort(
      [case vg_topics_sup:stop_child(Node, Topic, [0]) of
           [ok] -> ok;
           %% annotate and pass on the error for user analysis
           Other -> {Node, Topic, Other}
       end || Node <- Nodes]).

%% TODO: the topic space stuff MUST be fixed before multiple chains are supported
%% No nodes configured: a solo chain on the local node and default port.
create_chain(Name, []) ->
    #chain{name = Name,
           nodes = [node()],
           topics_start = start_space,
           topics_end = end_space,
           head = {"127.0.0.1", 5588},
           tail = {"127.0.0.1", 5588}};
create_chain(Name, Nodes) ->
    #chain{name = Name,
           nodes = [nodename(Node) || Node <- Nodes],
           topics_start = start_space, % only valid for one chain
           topics_end = end_space,     % only valid for one chain
           head = head(Nodes),
           tail = tail(Nodes)}.

%% Build the node atom "name@host" from a chain-node tuple.
nodename({Name, Host, _, _}) ->
    list_to_atom(atom_to_list(Name) ++ "@" ++ Host).

%% Initial state: no topic assignments, chains keyed by name, epoch 0.
load_state(Chains, _DataDir) ->
    ChainsMap = lists:foldl(fun(Chain=#chain{name=Name}, Acc) ->
                                    maps:put(Name, Chain, Acc)
                            end, #{}, Chains),
    #state{topics = #{},
           chains = ChainsMap,
           epoch = 0}.

%% The head endpoint is the first node's {Host, ClientPort}.
head([{_, Host, _, ClientPort} | _]) ->
    {Host, ClientPort}.
%% The tail endpoint is the last node's {Host, ClientPort}.
tail(Nodes) ->
    head(lists:reverse(Nodes)).

%% Assign every topic found on disk to ChainName, bumping the epoch per topic.
ensure_all_topics(ChainName, State) ->
    Topics = vg_utils:topics_on_disk(),
    lists:foldl(fun({Topic, _}, StateAcc=#state{topics=TopicsAcc,
                                                epoch=Epoch}) ->
                        TopicsAcc1 = maps:put(Topic, ChainName, TopicsAcc),
                        StateAcc#state{topics=TopicsAcc1,
                                       epoch=Epoch+1}
                end, State, Topics).

%% ---- file boundary: src/vg_config.erl ----

%% Accessors for the `chain` section of the vonnegut application environment.
-module(vg_config).

-export([chain_name/0,
         port/0,
         cluster_type/0,
         replicas/0]).

-define(DEFAULT_PORT, 5588).

-type cluster_type() :: local | {direct, [any()]} | {srv, string()} | none.

%% FIX: the attribute was misspelled as -export_types, which the compiler
%% silently ignores, leaving cluster_type/0 unexported.
-export_type([cluster_type/0]).

-spec chain_name() -> vg_chain_state:chain_name().
chain_name() ->
    vg_utils:to_atom(from_chain(name, solo)).

-spec port() -> integer().
port() ->
    vg_utils:to_integer(from_chain(port, ?DEFAULT_PORT)).

%% Discovery mode; accepts the string form "local" as well as the atom.
-spec cluster_type() -> cluster_type().
cluster_type() ->
    case from_chain(discovery, local) of
        "local" ->
            local;
        local ->
            local;
        {direct, Nodes} ->
            {direct, Nodes};
        {srv, Domain} ->
            {srv, Domain};
        Other ->
            lager:error("Unknown clustering option: ~p", [Other]),
            none
    end.

%% Expected replica count: the node list length for direct discovery, the
%% configured value for SRV discovery, 1 otherwise.
-spec replicas() -> integer().
replicas() ->
    case cluster_type() of
        {direct, List} ->
            length(List);
        {srv, _} ->
            vg_utils:to_integer(from_chain(replicas, 1));
        _ ->
            1
    end.

%% internal functions

%% Look up Key in the chain proplist, falling back to Default.
from_chain(Key, Default) ->
    case application:get_env(vonnegut, chain, []) of
        [] ->
            Default;
        Chain ->
            proplists:get_value(Key, Chain, Default)
    end.
%% ---- file boundary: src/vg_elli_handler.erl ----

%% Minimal elli HTTP handler exposing a health-check endpoint.
-module(vg_elli_handler).

-export([handle/2,
         handle_event/3]).

-include_lib("elli/include/elli.hrl").
-behaviour(elli_handler).

handle(Req, _Args) ->
    %% Delegate to our handler function
    handle(Req#req.method, elli_request:path(Req), Req).

%% GET /_health -> 200; everything else -> 404.
handle('GET', [<<"_health">>], _Req) ->
    {ok, [], <<"It's all good.">>};

handle(_, _, _Req) ->
    {404, [], <<"Not Found">>}.

handle_event(_Event, _Data, _Args) ->
    ok.

%% ---- file boundary: src/vg_index.erl ----

%% Index files are named [offset].index
%% Entries in the index are <<(Id-Offset):32/unsigned, Position:32/unsigned>>
%% Position is the offset in [offset].log to find the log Id
-module(vg_index).

-include("vg.hrl").

-export([find_in_index/3]).

-spec find_in_index(Fd, BaseOffset, Id) -> integer() | not_found when
      Fd :: file:fd(),
      BaseOffset :: integer(),
      Id :: integer().
%% Scan the index file two entries at a time for the log-file position of Id.
%% An unreadable/empty index falls back to position 0 (scan from log start).
find_in_index(Fd, BaseOffset, Id) ->
    case file:read(Fd, (2 * ?INDEX_ENTRY_SIZE)) of
        {ok, Bytes} ->
            find_in_index_(Fd, Id, BaseOffset, Bytes);
        _ ->
            0
    end.

%% Optimize later.
%% Could keep entire index in memory
%% and could (in memory or not) use a binary search
%%
%% FIX: the binary patterns in these clauses were destroyed during text
%% extraction ("<>"); reconstructed from the intact sibling clauses — every
%% entry is <<Offset:?INDEX_OFFSET_BITS/unsigned, Position:?INDEX_POS_BITS/unsigned>>.
find_in_index_(_, _, _, <<>>) ->
    0;
%% special case for when below the first offset in a single entry index
find_in_index_(_, Id, BaseOffset, <<Offset:?INDEX_OFFSET_BITS/unsigned,
                                    _Position:?INDEX_POS_BITS/unsigned>>)
  when BaseOffset + Offset > Id ->
    0;
find_in_index_(_, _, _, <<_Offset:?INDEX_OFFSET_BITS/unsigned,
                          Position:?INDEX_POS_BITS/unsigned>>) ->
    Position;
%% exact hit on the first of the remaining entries
find_in_index_(_, Id, BaseOffset, <<Offset:?INDEX_OFFSET_BITS/unsigned,
                                    Position:?INDEX_POS_BITS/unsigned,
                                    _/binary>>)
  when Id =:= BaseOffset + Offset ->
    Position;
%% special case for below the first offset in a multi-entry index, but
%% I worry that it might be overly broad.
find_in_index_(_, Id, BaseOffset, <<Offset:?INDEX_OFFSET_BITS/unsigned,
                                    _Position:?INDEX_POS_BITS/unsigned,
                                    _Offset2:?INDEX_OFFSET_BITS/unsigned,
                                    _Position2:?INDEX_POS_BITS/unsigned,
                                    _/binary>>)
  when BaseOffset + Offset > Id ->
    0;
%% Id falls between this entry and the next: this entry's position wins.
find_in_index_(_, Id, BaseOffset, <<_Offset:?INDEX_OFFSET_BITS/unsigned,
                                    Position:?INDEX_POS_BITS/unsigned,
                                    Offset2:?INDEX_OFFSET_BITS/unsigned,
                                    _Pos2:?INDEX_POS_BITS/unsigned, _/binary>>)
  when BaseOffset + Offset2 > Id ->
    Position;
%% keep scanning: drop one entry and top the window back up from the file
find_in_index_(Fd, Id, BaseOffset, <<_Offset:?INDEX_OFFSET_BITS/unsigned,
                                     _Pos:?INDEX_POS_BITS/unsigned, Rest/binary>>) ->
    case file:read(Fd, ?INDEX_ENTRY_SIZE) of
        {ok, Bytes} ->
            find_in_index_(Fd, Id, BaseOffset, <<Rest/binary, Bytes/binary>>);
        _ ->
            find_in_index_(Fd, Id, BaseOffset, Rest)
    end.

%% ---- file boundary: src/vg_log_segments.erl ----

%% Tracks log segment files per topic/partition in an ETS table and maps
%% record ids to segment files and byte offsets.
-module(vg_log_segments).

-export([init_table/0,
         load_existing/2,
         load_all/2,
         delete_segments/2,
         delete_indexes/2,
         regenerate_indexes/2,
         cleanup_segments_table/2,
         insert/3,
         local/2,
         find_log_segment/3,
         find_active_segment/2,
         find_segment_offset/3,
         find_record_offset/4,
         new_index_log_files/2,
         find_latest_id/3,

         %% for testing
         last_in_index/3]).

-include("vg.hrl").
%% ets:select building blocks for the {Topic, Partition, SegmentId} table.
-define(LOG_SEGMENT_MATCH_PATTERN(Topic, Partition), {Topic,Partition,'$1'}).
-define(LOG_SEGMENT_GUARD(RecordId), [{is_integer, '$1'}, {'=<', '$1', RecordId}]).
-define(LOG_SEGMENT_RETURN, ['$1']).

init_table() ->
    ets:new(?SEGMENTS_TABLE, [bag, public, named_table, {read_concurrency, true}]).

%% Register the on-disk segments for a topic/partition; throws
%% {topic_not_found, _, _} if the topic directory has no segments.
load_existing(Topic, Partition) ->
    TopicDir = vg_utils:topic_dir(Topic, Partition),
    case filelib:wildcard(filename:join(TopicDir, "*.log")) of
        [] ->
            throw({topic_not_found, Topic, Partition});
        LogSegments ->
            load_segments(Topic, Partition, LogSegments)
    end.

%% Like load_existing/2 but an empty directory initializes segment 0 and a
%% zero high-water mark instead of throwing.
load_all(Topic, Partition) ->
    TopicDir = vg_utils:topic_dir(Topic, Partition),
    case filelib:wildcard(filename:join(TopicDir, "*.log")) of
        [] ->
            insert(Topic, Partition, 0),
            vg_topics:insert_hwm(Topic, Partition, 0),
            [];
        LogSegments ->
            load_segments(Topic, Partition, LogSegments)
    end.

%% Insert each segment (named <base-offset>.log) and return the offsets.
load_segments(Topic, Partition, LogSegments) ->
    [begin
         SegmentId = list_to_integer(filename:basename(LogSegment, ".log")),
         insert(Topic, Partition, SegmentId),
         SegmentId
     end || LogSegment <- LogSegments].

%% Remove every file in the topic directory and the directory itself.
delete_segments(Topic, Partition) ->
    TopicDir = vg_utils:topic_dir(Topic, Partition),
    AllFiles = filelib:wildcard(filename:join(TopicDir, "*")),
    ok = lists:foreach(fun file:delete/1, AllFiles),
    file:del_dir(TopicDir),
    ok.

delete_indexes(Topic, Partition) ->
    TopicDir = vg_utils:topic_dir(Topic, Partition),
    AllFiles = filelib:wildcard(filename:join(TopicDir, "*.index")),
    ok = lists:foreach(fun file:delete/1, AllFiles).

%% Rebuild every index file from its log segment (see regenerate_index/1).
regenerate_indexes(Topic, Partition) ->
    TopicDir = vg_utils:topic_dir(Topic, Partition),
    AllFiles = filelib:wildcard(filename:join(TopicDir, "*.log")),
    ok = lists:foreach(fun regenerate_index/1, AllFiles).
%% Rebuild one segment's index by scanning its log file and writing an index
%% entry roughly every index_interval_bytes of log data.
regenerate_index(LogFilename) ->
    TopicDir = filename:dirname(LogFilename),
    StrID = filename:basename(LogFilename, ".log"),
    ID = list_to_integer(StrID),
    IndexFilename = vg_utils:index_file(TopicDir, ID),
    {ok, IndexFile} = vg_utils:open_append(IndexFilename),
    {ok, LogFile} = vg_utils:open_read(LogFilename),

    %% ignore index_max_bytes because it makes no sense without the
    %% ability to rewrite the segments
    {ok, IndexInterval} = application:get_env(vonnegut, index_interval_bytes),
    %% seed Bytes with a huge value so the very first record is always indexed
    regen(file:pread(LogFile, 0, ?OFFSET_AND_LENGTH_BYTES), 0, LogFile, ID, IndexFile, 99999999, IndexInterval).

%% Walk <<Offset:64, Size:32>> record headers; Bytes accumulates log bytes
%% seen since the last index entry. (A pread {error, _} intentionally
%% crashes with function_clause.)
regen(eof, _Location, Log, _ID, Index, _Bytes, _IndexInterval) ->
    file:close(Log),
    file:close(Index),
    ok;
%% FIX: the header pattern here was destroyed during text extraction;
%% reconstructed as the 64-bit offset + 32-bit size header implied by
%% ?OFFSET_AND_LENGTH_BYTES and the Offset/Size uses below.
regen({ok, <<Offset:64/signed-integer, Size:32/signed-integer>>}, Location, Log, BaseOffset, Index, Bytes,
      IndexInterval) ->
    TotalSize = Size + ?OFFSET_AND_LENGTH_BYTES,
    NextLocation = Location + TotalSize,
    NewBytes =
        case Bytes + TotalSize >= IndexInterval of
            true ->
                %% FIX: the position field was written with
                %% ?INDEX_OFFSET_BITS; readers (vg_index) decode it with
                %% ?INDEX_POS_BITS, so write it with the same macro.
                IndexEntry = <<(Offset - BaseOffset):?INDEX_OFFSET_BITS/unsigned,
                               Location:?INDEX_POS_BITS/unsigned>>,
                ok = file:write(Index, IndexEntry),
                0;
            _ ->
                Bytes + TotalSize
        end,
    regen(file:pread(Log, NextLocation, ?OFFSET_AND_LENGTH_BYTES), NextLocation, Log, BaseOffset, Index, NewBytes,
          IndexInterval).

%% Drop every row for Topic/Partition from the segments table and decrement
%% the gauge accordingly.
cleanup_segments_table(Topic, Partition) ->
    NumDeleted = ets:select_delete(?SEGMENTS_TABLE,
                                   [{?LOG_SEGMENT_MATCH_PATTERN(Topic, Partition),
                                     [],
                                     ?LOG_SEGMENT_RETURN}]),
    lager:info("deleted ~p segments from the table", [NumDeleted]),
    %% FIX: insert/3 labels this gauge with [Topic] and increments by 1;
    %% decrementing with [NumDeleted] as the label created a bogus label
    %% value and never reduced the topic's own gauge.
    prometheus_gauge:dec(log_segments, [Topic], NumDeleted).

insert(Topic, Partition, SegmentId) ->
    prometheus_gauge:inc(log_segments, [Topic]),
    ets:insert(?SEGMENTS_TABLE, {Topic, Partition, SegmentId}).
121 | 122 | local(Topic, Partition) -> 123 | case ets:select(?SEGMENTS_TABLE, [{?LOG_SEGMENT_MATCH_PATTERN(Topic, Partition), 124 | ?LOG_SEGMENT_GUARD(0), 125 | ?LOG_SEGMENT_RETURN}]) of 126 | [] -> false; 127 | _ -> true 128 | end. 129 | 130 | -spec find_log_segment(Topic, Partition, RecordId) -> integer() when 131 | Topic :: binary(), 132 | Partition :: integer(), 133 | RecordId :: integer(). 134 | find_log_segment(Topic, Partition, RecordId) -> 135 | %% Find all registered log segments for topic-partition < the recordid we are looking for 136 | case find_log_segment_(Topic, Partition, RecordId) of 137 | [] -> 138 | %% load from disk and try again 139 | load_existing(Topic, Partition), 140 | find_log_segment_(Topic, Partition, RecordId); 141 | Match -> 142 | %% Return largest, being the largest log segment 143 | %% offset that is still less than the record offset 144 | Match 145 | end. 146 | 147 | %% internal version that won't try again if no match found 148 | find_log_segment_(Topic, Partition, RecordId) -> 149 | %% Find all registered log segments for topic-partition < the recordid we are looking for 150 | case ets:select(?SEGMENTS_TABLE, [{?LOG_SEGMENT_MATCH_PATTERN(Topic, Partition), 151 | ?LOG_SEGMENT_GUARD(RecordId), 152 | ?LOG_SEGMENT_RETURN}]) of 153 | [] -> 154 | []; 155 | Matches -> 156 | %% Return largest, being the largest log segment 157 | %% offset that is still less than the record offset 158 | lists:max(Matches) 159 | end. 160 | 161 | -spec find_active_segment(Topic, Partition) -> integer() when 162 | Topic :: binary(), 163 | Partition :: integer(). 164 | find_active_segment(Topic, Partition) -> 165 | case ets:select(?SEGMENTS_TABLE, [{?LOG_SEGMENT_MATCH_PATTERN(Topic, Partition), 166 | [], 167 | ?LOG_SEGMENT_RETURN}]) of 168 | [] -> 169 | %% check disk 170 | case load_existing(Topic, Partition) of 171 | [] -> 172 | 0; 173 | Segments -> 174 | lists:max(Segments) 175 | end; 176 | Matches -> 177 | lists:max(Matches) 178 | end. 
179 | 180 | -spec find_segment_offset(Topic, Partition, RecordId) -> {integer(), {integer(), integer()}} when 181 | Topic :: binary(), 182 | Partition :: integer(), 183 | RecordId :: integer(). 184 | find_segment_offset(_Topic, _Partition, 0) -> 185 | {0, {0, 0}}; 186 | find_segment_offset(Topic, Partition, RecordId) when RecordId >= 0 -> 187 | SegmentId = find_log_segment(Topic, Partition, RecordId), 188 | {SegmentId, find_record_offset(Topic, Partition, SegmentId, RecordId)}. 189 | 190 | -spec find_record_offset(Topic, Partition, SegmentId, RecordId) -> {integer(), integer()} when 191 | Topic :: binary(), 192 | Partition :: integer(), 193 | SegmentId :: integer(), 194 | RecordId :: integer(). 195 | find_record_offset(Topic, Partition, SegmentId, RecordId) -> 196 | TopicDir = vg_utils:topic_dir(Topic, Partition), 197 | LogSegmentFilename = vg_utils:log_file(TopicDir, SegmentId), 198 | IndexSegmentFilename = vg_utils:index_file(TopicDir, SegmentId), 199 | 200 | %% Open log and index segment files, advise the OS we'll be reading randomly from them 201 | case vg_utils:open_read(LogSegmentFilename) of 202 | {ok, LogSegmentFD} -> 203 | file:advise(LogSegmentFD, 0, 0, random), 204 | {ok, IndexSegmentFD} = vg_utils:open_read(IndexSegmentFilename), 205 | file:advise(IndexSegmentFD, 0, 0, random), 206 | 207 | try 208 | InitialOffset = vg_index:find_in_index(IndexSegmentFD, SegmentId, RecordId), 209 | lager:info("InitialOffset topic=~p segment_id=~p initial_offset=~p", [Topic, SegmentId, InitialOffset]), 210 | find_in_log(LogSegmentFD, RecordId, InitialOffset) 211 | after 212 | file:close(LogSegmentFD), 213 | file:close(IndexSegmentFD) 214 | end; 215 | {error, enoent} -> 216 | throw({topic_not_found, Topic, Partition}) 217 | end. 218 | 219 | %% Find the position in Log file of the start of a log with id Id 220 | -spec find_in_log(Log, Id, Position) -> {integer(), integer()} when 221 | Log :: file:fd(), 222 | Id :: integer(), 223 | Position :: integer(). 
224 | find_in_log(Log, Id, Position) -> 225 | {ok, _} = file:position(Log, Position), 226 | find_in_log(Log, Id, Position, 0, file:read(Log, ?OFFSET_AND_LENGTH_BYTES)). 227 | 228 | find_in_log(_Log, Id, Position, LastSize, {ok, <>}) when FileId > Id -> 229 | {Position, LastSize}; 230 | find_in_log(_Log, Id, Position, LastSize, {ok, <>}) -> 231 | {Position+LastSize, 0}; 232 | find_in_log(Log, Id, Position, LastSize, {ok, <>}) -> 233 | case file:read(Log, Size + ?OFFSET_AND_LENGTH_BYTES) of 234 | {ok, <<_:Size/binary, Data:?OFFSET_AND_LENGTH_BYTES/binary>>} -> 235 | find_in_log(Log, Id, Position+LastSize, Size+?OFFSET_AND_LENGTH_BYTES, {ok, Data}); 236 | {ok, <>} -> 237 | case D of 238 | <<_LeaderEpoch:32/signed-integer, 239 | ?MAGIC_TWO:8/signed-integer, 240 | _CRC:32/signed-integer, 241 | _Attributes:16/signed-integer, 242 | LastOffsetDelta:32/signed-integer, _/binary>> when LastOffsetDelta + FileId >= Id -> 243 | {Position+LastSize, Size+?OFFSET_AND_LENGTH_BYTES}; 244 | _ -> 245 | {Position+LastSize+Size+?OFFSET_AND_LENGTH_BYTES, 0} 246 | end; 247 | eof -> 248 | {Position+LastSize, Size+?OFFSET_AND_LENGTH_BYTES} 249 | end; 250 | find_in_log(_Log, _Id, Position, LastSize, _) -> 251 | {Position+LastSize, 0}. 252 | 253 | find_latest_id(TopicDir, Topic, Partition) -> 254 | SegmentId = vg_log_segments:find_active_segment(Topic, Partition), 255 | IndexFilename = vg_utils:index_file(TopicDir, SegmentId), 256 | {Offset, Position} = last_in_index(TopicDir, IndexFilename, SegmentId), 257 | LogSegmentFilename = vg_utils:log_file(TopicDir, SegmentId), 258 | {ok, Log} = vg_utils:open_read(LogSegmentFilename), 259 | try 260 | file:position(Log, Position), 261 | NewId = find_last_log(Log, Offset, file:read(Log, ?OFFSET_AND_LENGTH_BYTES)), 262 | {NewId, IndexFilename, LogSegmentFilename} 263 | after 264 | file:close(Log) 265 | end. 
266 | 267 | %% Rolling log and index files, so open new empty ones for appending 268 | new_index_log_files(TopicDir, Id) -> 269 | IndexFilename = vg_utils:index_file(TopicDir, Id), 270 | LogFilename = vg_utils:log_file(TopicDir, Id), 271 | 272 | lager:debug("opening new log files: ~p ~p ~p", [Id, IndexFilename, LogFilename]), 273 | %% Make sure empty? 274 | {ok, IndexFile} = vg_utils:open_append(IndexFilename), 275 | {ok, LogFile} = vg_utils:open_append(LogFilename), 276 | {IndexFile, LogFile}. 277 | 278 | %% consider moving this to vg_index, but then we might need to figure 279 | %% out some other, cleaner way to do the create new case 280 | last_in_index(TopicDir, IndexFilename, SegmentId) -> 281 | case file:open(IndexFilename, [read, binary]) of 282 | {error, enoent} when SegmentId =:= 0 -> 283 | %% Index file doesn't exist, if this is the first segment (0) 284 | %% we can just create the files assuming this is a topic creation. 285 | %% Will fail if an empty topic-partition dir exists on boot since 286 | %% vg_topic_sup will not be started yet. 287 | {NewIndexFile, NewLogFile} = new_index_log_files(TopicDir, SegmentId), 288 | file:close(NewIndexFile), 289 | file:close(NewLogFile), 290 | {-1, 0}; 291 | {ok, Index} -> 292 | try 293 | case file:pread(Index, {eof, -?INDEX_ENTRY_SIZE}, ?INDEX_ENTRY_SIZE) of 294 | {ok, <>} -> 295 | %% index stores offsets as offset from SegmentId 296 | %% so add SegmentId here to get the real id 297 | {Offset+SegmentId, Position}; 298 | _ -> 299 | {-1, 0} 300 | end 301 | after 302 | file:close(Index) 303 | end 304 | end. 
305 | 306 | %% Find the Id for the last log in the log file Log 307 | find_last_log(Log, _, {ok, <>}) -> 308 | case file:read(Log, Size + ?OFFSET_AND_LENGTH_BYTES) of 309 | {ok, <>} -> 310 | LastOffsetDelta = vg_protocol:last_offset_delta(Batch), 311 | find_last_log(Log, NewId+LastOffsetDelta, {ok, Data}); 312 | {ok, <>} -> 313 | LastOffsetDelta = vg_protocol:last_offset_delta(Batch), 314 | NewId + LastOffsetDelta 315 | end; 316 | find_last_log(_Log, Id, _) -> 317 | Id. 318 | 319 | -------------------------------------------------------------------------------- /src/vg_peer_service.erl: -------------------------------------------------------------------------------- 1 | -module(vg_peer_service). 2 | 3 | -export([join/1, 4 | leave/0, 5 | on_down/2, 6 | members/0, 7 | manager/0, 8 | stop/0, 9 | stop/1]). 10 | 11 | join(Node) -> 12 | partisan_peer_service:join(Node). 13 | 14 | leave() -> 15 | partisan_peer_service:leave([]). 16 | 17 | on_down(Name, Fun) -> 18 | partisan_default_peer_service_manager:on_down(Name, Fun). 19 | 20 | members() -> 21 | partisan_peer_service:members(). 22 | 23 | manager() -> 24 | partisan_peer_service:manager(). 25 | 26 | stop() -> 27 | partisan_peer_service:stop("received stop request"). 28 | 29 | stop(Reason) -> 30 | partisan_peer_service:stop(Reason). 31 | -------------------------------------------------------------------------------- /src/vg_pool.erl: -------------------------------------------------------------------------------- 1 | -module(vg_pool). 2 | 3 | -behaviour(acceptor_pool). 4 | 5 | -export([start_link/1, 6 | accept_socket/2]). 7 | 8 | -export([init/1]). 9 | 10 | %% public api 11 | 12 | start_link(Role) -> 13 | acceptor_pool:start_link({local, ?MODULE}, ?MODULE, [Role]). 14 | 15 | accept_socket(Socket, Acceptors) -> 16 | acceptor_pool:accept_socket(?MODULE, Socket, Acceptors). 
17 | 18 | %% acceptor_pool api 19 | 20 | init([Role]) -> 21 | Conn = #{id => vg_conn, 22 | start => {vg_conn, [Role], []}, 23 | grace => 5000}, % Give connections 5000ms to close before shutdown 24 | {ok, {#{}, [Conn]}}. 25 | -------------------------------------------------------------------------------- /src/vg_pool_sup.erl: -------------------------------------------------------------------------------- 1 | -module(vg_pool_sup). 2 | 3 | -behaviour(supervisor). 4 | 5 | %% public api 6 | 7 | -export([start_link/1]). 8 | 9 | %% supervisor api 10 | 11 | -export([init/1]). 12 | 13 | %% public api 14 | 15 | start_link(Role) -> 16 | supervisor:start_link({local, ?MODULE}, ?MODULE, [Role]). 17 | 18 | %% supervisor api 19 | 20 | init([Role]) -> 21 | Flags = #{strategy => rest_for_one}, 22 | Pool = #{id => vg_pool, 23 | start => {vg_pool, start_link, [Role]}}, 24 | Socket = #{id => vg_socket, 25 | start => {vg_socket, start_link, []}}, 26 | {ok, {Flags, [Pool, Socket]}}. 27 | -------------------------------------------------------------------------------- /src/vg_socket.erl: -------------------------------------------------------------------------------- 1 | -module(vg_socket). 2 | 3 | -behaviour(gen_server). 4 | 5 | %% public api 6 | 7 | -export([start_link/0]). 8 | 9 | %% gen_server api 10 | 11 | -export([init/1, 12 | handle_call/3, 13 | handle_cast/2, 14 | handle_info/2, 15 | code_change/3, 16 | terminate/2]). 17 | 18 | %% public api 19 | 20 | start_link() -> 21 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 
22 | 23 | %% gen_server api 24 | 25 | init([]) -> 26 | Port = vg_config:port(), 27 | AcceptorPoolSize = application:get_env(vonnegut, acceptor_pool_size, 10), 28 | %% Trapping exit so can close socket in terminate/2 29 | _ = process_flag(trap_exit, true), 30 | Opts = [{active, once}, {reuseaddr, true}, {buffer, 65535}, 31 | {nodelay, true}, {mode, binary}, {packet, raw}], 32 | case gen_tcp:listen(Port, Opts) of 33 | {ok, Socket} -> 34 | %% acceptor could close the socket if there is a problem 35 | MRef = monitor(port, Socket), 36 | vg_pool:accept_socket(Socket, AcceptorPoolSize), 37 | {ok, {Socket, MRef}}; 38 | {error, Reason} -> 39 | {stop, Reason} 40 | end. 41 | 42 | handle_call(Req, _, State) -> 43 | {stop, {bad_call, Req}, State}. 44 | 45 | handle_cast(Req, State) -> 46 | {stop, {bad_cast, Req}, State}. 47 | 48 | handle_info({'DOWN', MRef, port, Socket, Reason}, {Socket, MRef} = State) -> 49 | {stop, Reason, State}; 50 | handle_info(_, State) -> 51 | {noreply, State}. 52 | 53 | code_change(_, State, _) -> 54 | {ok, State}. 55 | 56 | terminate(_, {Socket, MRef}) -> 57 | % Socket may already be down but need to ensure it is closed to avoid 58 | % eaddrinuse error on restart 59 | case demonitor(MRef, [flush, info]) of 60 | true -> gen_tcp:close(Socket); 61 | false -> ok 62 | end. 63 | -------------------------------------------------------------------------------- /src/vg_topic_mgr.erl: -------------------------------------------------------------------------------- 1 | %% doesn't need to be constantly running along side the active segment. 2 | %% TODO: turn into a one off proc that triggers when needed. 3 | -module(vg_topic_mgr). 4 | 5 | -behaviour(gen_server). 6 | 7 | %% API 8 | -export([ 9 | start_link/3, 10 | delete_topic/2, 11 | regenerate_index/2 12 | ]). 13 | 14 | %% gen_server callbacks 15 | -export([init/1, handle_call/3, handle_cast/2, handle_info/2, 16 | terminate/2, code_change/3]). 17 | 18 | -define(SERVER, ?MODULE). 
19 | 20 | -record(state, 21 | { 22 | topic :: binary(), 23 | partition :: non_neg_integer(), 24 | next :: atom() 25 | }). 26 | 27 | %%%=================================================================== 28 | %%% API 29 | %%%=================================================================== 30 | 31 | %% need this until an Erlang release with `hibernate_after` spec added to gen option type 32 | -dialyzer({nowarn_function, start_link/3}). 33 | 34 | -define(TOPIC_MGR(Topic, Partition), {via, gproc, {n, l, {mgr, Topic, Partition}}}). 35 | 36 | start_link(Topic, Partition, Next) -> 37 | case gen_server:start_link(?TOPIC_MGR(Topic, Partition), ?MODULE, [Topic, Partition, Next], 38 | [{hibernate_after, timer:minutes(5)}]) of % hibernate after 5 minutes with no messages 39 | {ok, Pid} -> 40 | {ok, Pid}; 41 | {error, {already_started, Pid}} -> 42 | {ok, Pid}; 43 | {error, Reason} -> 44 | {error, Reason} 45 | end. 46 | 47 | delete_topic(Topic, Partition) -> 48 | %% may need to start the topic if this fails? 49 | gen_server:call(?TOPIC_MGR(Topic, Partition), delete_topic, timer:seconds(45)). 50 | 51 | regenerate_index(Topic, Partition) -> 52 | %% may need to start the topic if this fails? 53 | gen_server:call(?TOPIC_MGR(Topic, Partition), regenerate_index, timer:minutes(15)). 54 | 55 | %%%=================================================================== 56 | %%% gen_server callbacks 57 | %%%=================================================================== 58 | 59 | init([Topic, Partition, Next]) -> 60 | {ok, #state{topic = Topic, 61 | partition = Partition, 62 | next = Next}}. 
63 | 64 | handle_call(delete_topic, _From, #state{topic = Topic, next = Next, 65 | partition = Partition} = State) -> 66 | %% halt the active segment 67 | lager:info("halting active segment"), 68 | halted = vg_active_segment:halt(Topic, Partition), 69 | %% delete the segments 70 | lager:info("deleting segments"), 71 | ok = vg_log_segments:delete_segments(Topic, Partition), 72 | %% remove HWM 73 | true = vg_topics:delete_hwm(Topic, Partition), 74 | %% clean the segments table 75 | vg_log_segments:cleanup_segments_table(Topic, Partition), 76 | %% delete the next 77 | case Next of 78 | tail -> ok; 79 | _ -> 80 | lager:info("propagating delete"), 81 | ok = vg_client:delete_topic(next_brick, Topic) 82 | end, 83 | {reply, ok, State}; 84 | %% note that this needs to be done per node, we don't automatically 85 | %% propagate it 86 | handle_call(regenerate_index, _From, #state{topic = Topic, 87 | partition = Partition} = State) -> 88 | %% tell active_segment to stop writing indexes 89 | ok = vg_active_segment:stop_indexing(Topic, Partition), 90 | %% delete all index files 91 | ok = vg_log_segments:delete_indexes(Topic, Partition), 92 | %% fold over segments and restore indexes 93 | ok = vg_log_segments:regenerate_indexes(Topic, Partition), 94 | %% tell active_segment to resume writing indexes 95 | ok = vg_active_segment:resume_indexing(Topic, Partition), 96 | {reply, ok, State}; 97 | handle_call(_Request, _From, State) -> 98 | lager:warning("unexpected call ~p from ~p", [_Request, _From]), 99 | {noreply, State}. 100 | 101 | handle_cast(_Msg, State) -> 102 | lager:warning("unexpected cast ~p", [_Msg]), 103 | {noreply, State}. 104 | 105 | handle_info(_Info, State) -> 106 | lager:warning("unexpected message ~p", [_Info]), 107 | {noreply, State}. 108 | 109 | terminate(_Reason, _State) -> 110 | ok. 111 | 112 | code_change(_OldVsn, State, _Extra) -> 113 | {ok, State}. 
114 | 115 | %%%=================================================================== 116 | %%% Internal functions 117 | %%%=================================================================== 118 | -------------------------------------------------------------------------------- /src/vg_topic_sup.erl: -------------------------------------------------------------------------------- 1 | %%%------------------------------------------------------------------- 2 | %% @doc vonnegut top level supervisor. 3 | %% @end 4 | %%%------------------------------------------------------------------- 5 | 6 | -module(vg_topic_sup). 7 | 8 | -behaviour(supervisor). 9 | 10 | %% API 11 | -export([start_link/2]). 12 | 13 | %% Supervisor callbacks 14 | -export([init/1]). 15 | 16 | -define(SERVER, ?MODULE). 17 | 18 | %%==================================================================== 19 | %% API functions 20 | %%==================================================================== 21 | 22 | start_link(Topic, Partitions) -> 23 | supervisor:start_link(?MODULE, [Topic, Partitions]). 24 | 25 | %%==================================================================== 26 | %% Supervisor callbacks 27 | %%==================================================================== 28 | 29 | %% Child :: {Id,StartFunc,Restart,Shutdown,Type,Modules} 30 | init([Topic, Partitions]) -> 31 | ChildSpecs = lists:flatten([child_specs(Topic, Partition) || Partition <- Partitions]), 32 | {ok, {{one_for_one, 0, 1}, ChildSpecs}}. 33 | 34 | %%==================================================================== 35 | %% Internal functions 36 | %%==================================================================== 37 | 38 | child_specs(Topic, Partition) -> 39 | %% wait for the chain to be active? 
40 | Next = vg_chain_state:next(), 41 | [#{id => {active, Topic, Partition}, 42 | start => {vg_active_segment, start_link, [Topic, Partition, Next]}, 43 | restart => transient, 44 | type => worker}, 45 | #{id => {mgr, Topic, Partition}, 46 | start => {vg_topic_mgr, start_link, [Topic, Partition, Next]}, 47 | restart => transient, 48 | type => worker} 49 | | case application:get_env(vonnegut, log_cleaner, true) of 50 | true -> 51 | [#{id => {cleaner, Topic, Partition}, 52 | start => {vg_cleaner, start_link, [Topic, Partition]}, 53 | restart => permanent, 54 | type => worker}]; 55 | false -> 56 | [] 57 | end]. 58 | -------------------------------------------------------------------------------- /src/vg_topics.erl: -------------------------------------------------------------------------------- 1 | -module(vg_topics). 2 | 3 | -export([init_table/0, 4 | 5 | all/0, 6 | get_chain/1, 7 | 8 | insert_hwm/3, 9 | lookup_hwm/2, 10 | update_hwm/3, 11 | delete_hwm/2]). 12 | 13 | -include("vg.hrl"). 14 | 15 | -define(HWM_POS, 2). %% {{Topic, Partition}, HighWaterMark} 16 | 17 | init_table() -> 18 | ets:new(?WATERMARK_TABLE, [set, public, named_table, {write_concurrency, true}]). 19 | 20 | all() -> 21 | %% replace with ets table keys 22 | {Topics, _Chains, _Epoch} = vg_cluster_mgr:get_map(), 23 | maps:keys(Topics). 24 | 25 | get_chain(Topic) -> 26 | %% replace with ets table lookup 27 | {Topics, Chains, _Epoch} = vg_cluster_mgr:get_map(), 28 | case maps:get(Topic, Topics, not_found) of 29 | not_found -> 30 | lager:info("lookup for non-existant topic ~p", [Topic]), 31 | not_found; 32 | Chain -> 33 | maps:get(Chain, Chains) 34 | end. 35 | 36 | insert_hwm(Topic, Partition, HWM) -> 37 | ets:insert(?WATERMARK_TABLE, {{Topic, Partition}, HWM}). 
38 | 39 | lookup_hwm(Topic, Partition) -> 40 | try ets:lookup_element(?WATERMARK_TABLE, {Topic, Partition}, ?HWM_POS) 41 | catch 42 | error:badarg -> 43 | %% maybe just not loaded, try to get from disk first 44 | TopicDir = vg_utils:topic_dir(Topic, Partition), 45 | try vg_log_segments:find_latest_id(TopicDir, Topic, Partition) of 46 | {HWM, _, _} -> 47 | ets:insert(?WATERMARK_TABLE, {{Topic, Partition}, HWM}), 48 | HWM 49 | catch error:{badmatch,{error,enoent}} -> 50 | throw({topic_not_found, Topic, Partition}) 51 | end 52 | end. 53 | 54 | update_hwm(Topic, Partition, HWMUpdate) -> 55 | try 56 | true = ets:update_element(?WATERMARK_TABLE, {Topic, Partition}, {?HWM_POS, HWMUpdate}) 57 | catch 58 | error:badarg -> 59 | throw(hwm_table_not_loaded) 60 | end. 61 | 62 | delete_hwm(Topic, Partition) -> 63 | ets:delete(?WATERMARK_TABLE, {Topic, Partition}). 64 | -------------------------------------------------------------------------------- /src/vg_topics_sup.erl: -------------------------------------------------------------------------------- 1 | %%%------------------------------------------------------------------- 2 | %% @doc vonnegut topics supervisor. 3 | %% @end 4 | %%%------------------------------------------------------------------- 5 | 6 | -module(vg_topics_sup). 7 | 8 | -behaviour(supervisor). 9 | 10 | %% API 11 | -export([start_link/0, 12 | start_child/1, 13 | start_child/2, 14 | start_child/3, 15 | start_child/4, 16 | list_topics/1, 17 | stop_child/3]). 18 | 19 | %% Supervisor callbacks 20 | -export([init/1]). 21 | 22 | -define(SERVER, ?MODULE). 23 | 24 | %%==================================================================== 25 | %% API functions 26 | %%==================================================================== 27 | 28 | start_link() -> 29 | supervisor:start_link({local, ?SERVER}, ?MODULE, []). 30 | 31 | start_child(Topic) -> 32 | start_child(Topic, [0]). 33 | 34 | start_child(Topic, Partitions) -> 35 | start_child(local, Topic, Partitions). 
36 | 37 | start_child(Server0, Topic, Partitions) -> 38 | %% since it's crucial to start remote children, block for a while 39 | start_child(Server0, Topic, Partitions, 300). 40 | 41 | start_child(_, _, _, 0) -> 42 | {error, remote_node_down}; 43 | start_child(Server0, Topic, Partitions, Retries) -> 44 | Server = case Server0 of 45 | local -> ?SERVER; 46 | _ -> {?SERVER, Server0} 47 | end, 48 | lager:info("at=create_topic node=~p topic=~p partitions=~p target=~p", 49 | [node(), Topic, Partitions, Server0]), 50 | prometheus_gauge:inc(active_topics), 51 | try 52 | case supervisor:start_child(Server, [Topic, Partitions]) of 53 | {ok, Pid} -> 54 | {ok, Pid}; 55 | {error, {already_started, Pid}} -> 56 | {ok, Pid}; 57 | {error, {shutdown, {failed_to_start_child, _, Reason}}} -> 58 | {error, Reason} 59 | end 60 | catch _C:_E-> 61 | lager:info("~p : ~p", [_C,_E]), 62 | timer:sleep(100), 63 | start_child(Server0, Topic, Partitions, Retries - 1) 64 | end. 65 | 66 | stop_child(Server, Topic, Partitions) -> 67 | %% get a list of topic_sup supervisors 68 | Topics = supervisor:which_children({?MODULE, Server}), 69 | Topics1 = [{Pid, supervisor:which_children(Pid)} 70 | || {_, Pid, _, _} <- Topics], 71 | Res = 72 | [[case Topic =:= Top andalso lists:member(Part, Partitions) of 73 | true -> 74 | supervisor:terminate_child({?MODULE, Server}, Pid); 75 | _ -> 76 | ok 77 | end 78 | || {{active, Top, Part}, _, _, _} <- Children] 79 | || {Pid, Children} <- Topics1], 80 | lists:usort(lists:flatten(Res)). 81 | 82 | list_topics(Server0) -> 83 | Server = case Server0 of 84 | local -> ?SERVER; 85 | _ -> {?SERVER, Server0} 86 | end, 87 | Topics = supervisor:which_children(Server), 88 | [{Topic, Partition} || {{active, Topic, Partition}, _, _, _} <- 89 | lists:flatten([supervisor:which_children(Pid) 90 | || {_, Pid, _, _} <- Topics])]. 
91 | 92 | %%==================================================================== 93 | %% Supervisor callbacks 94 | %%==================================================================== 95 | 96 | %% Child :: {Id,StartFunc,Restart,Shutdown,Type,Modules} 97 | init([]) -> 98 | SupFlags = #{strategy => simple_one_for_one, 99 | intensity => 0, 100 | period => 1}, 101 | ChildSpecs = [#{id => vg_topic_sup, 102 | start => {vg_topic_sup, start_link, []}, 103 | restart => permanent, 104 | type => supervisor, 105 | shutdown => 5000}], 106 | {ok, {SupFlags, ChildSpecs}}. 107 | 108 | %%==================================================================== 109 | %% Internal functions 110 | %%==================================================================== 111 | 112 | -------------------------------------------------------------------------------- /src/vg_utils.erl: -------------------------------------------------------------------------------- 1 | -module(vg_utils). 2 | 3 | -export([index_file/2, 4 | index_file/3, 5 | log_file/2, 6 | log_file/3, 7 | topic_dir/2, 8 | open_append/1, 9 | open_read/1, 10 | 11 | topics_on_disk/0, 12 | 13 | to_atom/1, 14 | to_integer/1]). 15 | 16 | %% Convenience functions for creating index and log file names 17 | index_file(TopicDir, Id) -> 18 | filename:join(TopicDir, io_lib:format("~20.10.0b.index", [Id])). 19 | 20 | index_file(Topic, Partition, Id) -> 21 | TopicDir = topic_dir(Topic, Partition), 22 | filename:join(TopicDir, io_lib:format("~20.10.0b.index", [Id])). 23 | 24 | log_file(Topic, Partition, Id) -> 25 | TopicDir = topic_dir(Topic, Partition), 26 | filename:join(TopicDir, io_lib:format("~20.10.0b.log", [Id])). 27 | 28 | log_file(TopicDir, Id) -> 29 | filename:join(TopicDir, io_lib:format("~20.10.0b.log", [Id])). 30 | 31 | topic_dir(Topic, Partition) -> 32 | {ok, [LogDir | _]} = application:get_env(vonnegut, log_dirs), 33 | filename:join(LogDir, [binary_to_list(Topic), "-", integer_to_list(Partition)]). 
34 | 35 | topics_on_disk() -> 36 | {ok, [DataDir| _]} = application:get_env(vonnegut, log_dirs), 37 | TopicPartitions = filelib:wildcard(filename:join(DataDir, "*")), 38 | TPDict = lists:foldl(fun(TP, Acc) -> 39 | case string:tokens(filename:basename(TP), "-") of 40 | [_] -> 41 | Acc; 42 | L -> 43 | [P | TopicR] = lists:reverse(L), 44 | T = string:join(lists:reverse(TopicR), "-"), 45 | dict:append_list(list_to_binary(T), [list_to_integer(P)], Acc) 46 | end 47 | end, dict:new(), TopicPartitions), 48 | dict:to_list(TPDict). 49 | 50 | 51 | open_append(Filename) -> 52 | case application:get_env(vonnegut, delayed_write) of 53 | {ok, true} -> 54 | %% Buffer writes up to DelayedWriteSize bytes or DelayMS milliseconds to save on OS calls 55 | {ok, DelayedWriteSize} = application:get_env(vonnegut, delayed_write_byte_size), 56 | {ok, DelayMS} = application:get_env(vonnegut, delayed_write_milliseconds), 57 | file:open(Filename, [append, raw, binary, {delayed_write, DelayedWriteSize, DelayMS}]); 58 | _ -> 59 | file:open(Filename, [append, raw, binary]) 60 | end. 61 | 62 | open_read(Filename) -> 63 | file:open(Filename, [read, raw, binary]). 64 | 65 | to_integer(I) when is_integer(I) -> I; 66 | to_integer(I) when is_list(I) -> list_to_integer(I); 67 | to_integer(I) when is_binary(I) -> binary_to_integer(I); 68 | to_integer(_) -> throw(badarg). 69 | 70 | to_atom(A) when is_list(A) -> list_to_atom(A); 71 | to_atom(A) when is_binary(A) -> binary_to_atom(A, utf8); 72 | to_atom(A) when is_atom(A) -> A; 73 | to_atom(_) -> throw(badarg). 
74 | -------------------------------------------------------------------------------- /src/vonnegut.app.src: -------------------------------------------------------------------------------- 1 | {application, vonnegut, 2 | [{description, "Replicated append-only log."}, 3 | {vsn, git}, 4 | {registered, []}, 5 | {mod, {vonnegut_app, []}}, 6 | {applications, 7 | [kernel, 8 | stdlib, 9 | sasl, 10 | lager, 11 | crypto, 12 | ssl, 13 | gproc, 14 | acceptor_pool, 15 | shackle, 16 | erlware_commons, 17 | backoff, 18 | partisan, 19 | elli, 20 | elli_prometheus, 21 | prometheus, 22 | 23 | hackney, 24 | jsx, 25 | oc_google_reporter, 26 | opencensus 27 | ]}, 28 | {env,[{log_dirs, ["./data"]}, 29 | {acceptor_pool_size, 10}, 30 | {client_pool_size, 10}, 31 | 32 | {send_buffer_bytes, 102400}, 33 | 34 | %% Log and index file related configs 35 | {segment_bytes, 1073741824}, 36 | {index_max_bytes, 10485760}, 37 | {index_interval_bytes, 4096}, 38 | 39 | {write_delayed, false}, 40 | {delayed_write_byte_size, 64000}, %% 64kb 41 | {delayed_write_milliseconds, 2000}, %% 2 seconds 42 | 43 | {log_cleaner, false}, 44 | {log_retention_check_interval, 5}, %% 5 minutes 45 | {log_retention_minutes, 10080}, %% 7 days 46 | 47 | {num_partitions, 1}]}, 48 | {modules, []}, 49 | 50 | {contributors, []}, 51 | {licenses, []}, 52 | {links, []} 53 | ]}. 54 | -------------------------------------------------------------------------------- /src/vonnegut_app.erl: -------------------------------------------------------------------------------- 1 | %%%------------------------------------------------------------------- 2 | %% @doc vonnegut public API 3 | %% @end 4 | %%%------------------------------------------------------------------- 5 | 6 | -module(vonnegut_app). 7 | 8 | -behaviour(application). 9 | 10 | %% Application callbacks 11 | -export([start/2, 12 | stop/1, 13 | swap_lager/1]). 
14 | 15 | %%==================================================================== 16 | %% API 17 | %%==================================================================== 18 | 19 | start(_StartType, _StartArgs) -> 20 | init_tables(), 21 | vonnegut_sup:start_link(). 22 | 23 | 24 | %%-------------------------------------------------------------------- 25 | stop(_State) -> 26 | ok. 27 | 28 | %%==================================================================== 29 | %% Internal functions 30 | %%==================================================================== 31 | 32 | init_tables() -> 33 | vg_log_segments:init_table(), 34 | vg_topics:init_table(). 35 | 36 | %% TODO: ifdef this out in non-test builds 37 | swap_lager(Pid) -> 38 | %% our testing environment has provided us with a remote 39 | %% lager sink to target messages at, but we can't target 40 | %% it directly, so proxy message through to it. 41 | Proxy = spawn(fun Loop() -> 42 | receive 43 | E -> Pid ! E 44 | end, 45 | Loop() 46 | end), 47 | Lager = whereis(lager_event), 48 | true = unregister(lager_event), 49 | case (catch register(lager_event, Proxy)) of 50 | true -> 51 | lager:info("swapped local lager_event server with: ~p", [Pid]); 52 | Other -> 53 | register(lager_event, Lager), 54 | lager:info("noes we failed: ~p", [Other]) 55 | end. 56 | -------------------------------------------------------------------------------- /src/vonnegut_sup.erl: -------------------------------------------------------------------------------- 1 | %%%------------------------------------------------------------------- 2 | %% @doc vonnegut top level supervisor. 3 | %% @end 4 | %%%------------------------------------------------------------------- 5 | 6 | -module(vonnegut_sup). 7 | 8 | -behaviour(supervisor). 9 | 10 | %% API 11 | -export([start_link/0, 12 | start_cluster_mgr/2, 13 | start_acceptor_pool/1]). 14 | 15 | %% Supervisor callbacks 16 | -export([init/1]). 17 | 18 | -define(SERVER, ?MODULE). 
19 | 20 | %%==================================================================== 21 | %% API functions 22 | %%==================================================================== 23 | 24 | start_link() -> 25 | supervisor:start_link({local, ?SERVER}, ?MODULE, []). 26 | 27 | start_cluster_mgr(Name, Nodes) -> 28 | {ok, [LogDir]} = application:get_env(vonnegut, log_dirs), 29 | ChildSpec = #{id => vg_cluster_mgr, 30 | start => {vg_cluster_mgr, start_link, [Name, Nodes, LogDir]}, 31 | restart => permanent, 32 | type => supervisor}, 33 | supervisor:start_child(?SERVER, ChildSpec). 34 | 35 | start_acceptor_pool(Role) -> 36 | ChildSpec = #{id => vg_pool_sup, 37 | start => {vg_pool_sup, start_link, [Role]}, 38 | restart => permanent, 39 | type => supervisor}, 40 | supervisor:start_child(?SERVER, ChildSpec). 41 | 42 | %%==================================================================== 43 | %% Supervisor callbacks 44 | %%==================================================================== 45 | 46 | %% Child :: {Id,StartFunc,Restart,Shutdown,Type,Modules} 47 | init([]) -> 48 | Port = application:get_env(vonnegut, http_port, 8000), 49 | ElliChild = {vonnegut_http, {elli, start_link, [[{callback, elli_middleware}, 50 | {callback_args, [{mods, [{elli_prometheus, []}, 51 | {vg_elli_handler, []}]}]}, 52 | {port, Port}]]}, 53 | permanent, 5000, worker, dynamic}, 54 | 55 | case application:get_env(vonnegut, chain, []) of 56 | [] -> 57 | {ok, {{one_for_one, 10, 30}, []}}; 58 | _ -> 59 | ChainState = {vg_chain_state, {vg_chain_state, start_link, []}, 60 | permanent, 20000, worker, [vg_chain_state]}, 61 | TopicsSup = {vg_topics_sup, {vg_topics_sup, start_link, []}, 62 | permanent, 20000, supervisor, [vg_topics_sup]}, 63 | 64 | {ok, {{one_for_one, 10, 30}, [ElliChild, TopicsSup, ChainState]}} 65 | end. 
%%====================================================================
%% Internal functions
%%====================================================================
-------------------------------------------------------------------------------- /test/cleanup_SUITE.erl: --------------------------------------------------------------------------------
-module(cleanup_SUITE).

-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-compile(export_all).

-include("vg.hrl").

all() ->
    [delete_policy].

%% Boot a fresh vonnegut per test with tiny segment/index limits so
%% writes roll segments quickly, and the log cleaner enabled with a
%% 5-minute retention window.
init_per_testcase(_, Config) ->
    PrivDir = ?config(priv_dir, Config),
    application:load(vonnegut),
    Env = [{log_dirs, [filename:join(PrivDir, "data")]},
           {segment_bytes, 86},
           {index_max_bytes, 18},
           {log_cleaner, true},
           {index_interval_bytes, 24},
           {log_retention_minutes, 5},
           {chain, [{discovery, local}]}],
    [application:set_env(vonnegut, Key, Value) || {Key, Value} <- Env],
    application:ensure_all_started(vonnegut),
    crypto:start(),
    Config.

%% Unload so these env overrides do not leak into other suites.
end_per_testcase(_, Config) ->
    application:stop(vonnegut),
    application:unload(vonnegut),
    Config.
%% Write two segments, age the first past the retention window via a
%% mocked filelib:last_modified/1, run the cleaner, and verify that only
%% the aged segment was deleted.
delete_policy(_Config) ->
    {ok, LogRetentionMinutes} = application:get_env(vonnegut, log_retention_minutes),
    Topic = vg_test_utils:create_random_name(<<"test_topic">>),
    Partition = 0,
    TopicPartitionDir = vg_utils:topic_dir(Topic, Partition),
    vg:create_topic(Topic),
    ?assert(filelib:is_dir(TopicPartitionDir)),

    [vg:write(Topic, Partition, M) || M <- [crypto:strong_rand_bytes(60), crypto:strong_rand_bytes(60)]],

    %% Verify 2 segments have been created
    Segment0 = filename:join([TopicPartitionDir, "00000000000000000000.log"]),
    Segment1 = filename:join([TopicPartitionDir, "00000000000000000001.log"]),
    ?assert(filelib:is_regular(Segment0)),
    ?assert(filelib:is_regular(Segment1)),

    meck:new(filelib, [unstick, passthrough]),
    %% Mock last_modified return for Segment0 to be >= LogRetentionMinutes so it is deleted
    Now = calendar:local_time(),
    meck:expect(filelib, last_modified, fun(Segment) when Segment =:= Segment0 ->
                                                dec_datetime_by_mins(Now, LogRetentionMinutes+1);
                                           (Segment) ->
                                                meck:passthrough([Segment])
                                        end),
    %% Execute the cleaner
    vg_cleaner:run_cleaner(Topic, 0),
    meck:unload(filelib),

    %% Verify Segment0 has been deleted but not Segment1.
    %% ?assertEqual takes (Expected, Actual); the original had the
    %% arguments reversed, which garbles failure reports.
    ?assertEqual(false, filelib:is_regular(Segment0)),
    ?assertEqual(true, filelib:is_regular(Segment1)).

%%

%% Shift DateTime backwards by Minutes, returning a calendar datetime.
dec_datetime_by_mins(DateTime, Minutes) ->
    Seconds = calendar:datetime_to_gregorian_seconds(DateTime),
    Seconds1 = Seconds - (Minutes * 60),
    calendar:gregorian_seconds_to_datetime(Seconds1).
-------------------------------------------------------------------------------- /test/kafka_client_SUITE.erl: --------------------------------------------------------------------------------
-module(kafka_client_SUITE).

-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-compile(export_all).
-include_lib("brod/include/brod.hrl").

all() ->
    [get_metadata]. %% produce, add back when brod supports >=0.11.0 kafka

%% Boot vonnegut with small segment limits and start a brod (Kafka)
%% client pointed at the local listener, so the suite exercises the
%% Kafka wire compatibility of the server.
init_per_suite(Config) ->
    PrivDir = ?config(priv_dir, Config),
    application:load(vonnegut),
    Env = [{log_dirs, [filename:join(PrivDir, "data")]},
           {segment_bytes, 86},
           {index_max_bytes, 18},
           {index_interval_bytes, 24},
           {chain, [{discovery, local}]}],
    [application:set_env(vonnegut, Key, Value) || {Key, Value} <- Env],
    crypto:start(),

    Port = 5588,
    Host = <<"127.0.0.1">>,
    Hosts = [{"127.0.0.1", Port}],

    application:ensure_all_started(vonnegut),
    application:ensure_all_started(brod),

    ok = brod:start_client(Hosts, brod_client_1, []),

    [{host, Host}, {port, Port}, {hosts, Hosts} | Config].

end_per_suite(Config) ->
    application:stop(vonnegut),
    application:unload(vonnegut),
    Config.

%% Metadata should list the broker (twice: once as head, once as tail)
%% and include the freshly created topic.
get_metadata(Config) ->
    Host = ?config(host, Config),
    Port = ?config(port, Config),
    Hosts = ?config(hosts, Config),

    Topic = vg_test_utils:create_random_name(<<"kafka_get_metadata">>),
    ok = vg:create_topic(Topic),

    %% same host will be in broker list twice because we send the same broker as the tail
    {ok,
     [{brokers,
       [[{node_id,0},{host,Host},{port,Port}],
        [{node_id,0},{host,Host},{port,Port}]]},
      {topic_metadata, TMs}]} = brod:get_metadata(Hosts),
    ?assert(lists:any(fun(TM) -> lists:member({topic,Topic}, TM) end, TMs)).
%% Produce via brod and read the record back through brod:fetch/4.
%% (Currently excluded from all/0 pending brod >= 0.11.0 support.)
produce(Config) ->
    Hosts = ?config(hosts, Config),
    Topic = vg_test_utils:create_random_name(<<"kafka_produce">>),
    ok = vg:create_topic(Topic),
    brod:start_producer(brod_client_1, Topic, []),

    Key = <<"I'm a key">>,
    M = <<"hello from brod">>,
    brod:produce_sync(brod_client_1, Topic, 0, Key, M),

    ?assertMatch({ok, [#kafka_message{key=Key,
                                      value=M} | _]},
                 brod:fetch(Hosts, Topic, 0, 0)),

    ok.
-------------------------------------------------------------------------------- /test/log_roll_SUITE.erl: --------------------------------------------------------------------------------
-module(log_roll_SUITE).

-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-compile(export_all).

-include("vg.hrl").

all() ->
    [records_larger_than_max_segment, regenerate_index_test].

%% regenerate_index_test uses a larger segment/index budget than the
%% default case; everything else about the boot is shared.
init_per_testcase(regenerate_index_test, Config) ->
    boot_with_limits(177, 50, Config);
init_per_testcase(_, Config) ->
    boot_with_limits(86, 18, Config).

boot_with_limits(SegmentBytes, IndexMaxBytes, Config) ->
    PrivDir = ?config(priv_dir, Config),
    application:load(vonnegut),
    application:set_env(vonnegut, log_dirs, [filename:join(PrivDir, "data")]),
    application:set_env(vonnegut, segment_bytes, SegmentBytes),
    application:set_env(vonnegut, index_max_bytes, IndexMaxBytes),
    application:set_env(vonnegut, index_interval_bytes, 24),
    application:set_env(vonnegut, chain, [{discovery, local}]),
    application:ensure_all_started(vonnegut),
    crypto:start(),
    Config.
end_per_testcase(_, Config) ->
    application:stop(vonnegut),
    %% if we don't unload the settings will stick around in other suites
    application:unload(vonnegut),
    Config.

%% Write records larger than the configured segment size and assert the
%% exact on-disk layout of the resulting segment and index files.
records_larger_than_max_segment(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"log_roll_test_topic">>),
    Partition = 0,
    TopicPartitionDir = vg_utils:topic_dir(Topic, Partition),
    vg:create_topic(Topic),
    ?assert(filelib:is_dir(TopicPartitionDir)),

    [vg:write(Topic, 0, M)
     || M <- [crypto:strong_rand_bytes(60), crypto:strong_rand_bytes(60),
              crypto:strong_rand_bytes(6), crypto:strong_rand_bytes(6),
              crypto:strong_rand_bytes(60)]],

    %% Total size of a 60 byte record when written to log becomes 86 bytes
    %% Since index interval is 24 and 86 > 24, 1 index entry of 6 bytes should exist for each as well
    ExpectSize =
        fun(File, Size) ->
                ?assertEqual(Size, filelib:file_size(filename:join([TopicPartitionDir, File])))
        end,
    ExpectSize("00000000000000000000.index", 8),
    ExpectSize("00000000000000000000.log", 127),
    ExpectSize("00000000000000000001.index", 8),
    ExpectSize("00000000000000000001.log", 127),

    %% Next 2 records create a log with 2 records of 6 bytes each (with headers they are 32 bytes)
    %% with ids 2 and 3. The third record (id 4) then goes in a new index and log
    ExpectSize("00000000000000000002.index", 8),
    ExpectSize("00000000000000000002.log", 73),
    ExpectSize("00000000000000000004.index", 8),
    ExpectSize("00000000000000000004.log", 127),

    %% regression test. check that a cold node (no data loaded) finds the right hwm for a topic
    application:stop(vonnegut),
    application:ensure_all_started(vonnegut),

    ?assertEqual(4, vg_topics:lookup_hwm(Topic, Partition)).

%% Regenerating a topic's index files must reproduce byte-identical
%% indexes and leave fetches by offset working.
regenerate_index_test(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"index_regen_test_topic">>),
    Partition = 0,
    TopicDir = vg_utils:topic_dir(Topic, Partition),
    vg:create_topic(Topic),

    [vg:write(Topic, 0, iolist_to_binary(lists:duplicate(rand:uniform(5), <<"A">>)))
     || _ <- lists:seq(1, 50)],

    ReadAll =
        fun(Pattern) ->
                [begin
                     {ok, Bin} = file:read_file(File),
                     Bin
                 end || File <- filelib:wildcard(Pattern)]
        end,
    IndexPattern = filename:join(TopicDir, "*.index"),
    SHAs = ReadAll(IndexPattern),

    vg:regenerate_topic_index(Topic),

    SHAs1 = ReadAll(IndexPattern),
    ?assertMatch({ok,#{high_water_mark := 49,
                       partition := 0,
                       record_batches :=
                           [#{offset := 45}]}},
                 vg:fetch(Topic, 0, 45, 1)),

    ?assertEqual(SHAs, SHAs1),
    ok.
-------------------------------------------------------------------------------- /test/prop_vg.erl: --------------------------------------------------------------------------------
-module(prop_vg).

-include_lib("proper/include/proper.hrl").

-define(MODEL, vg_statem).

%% Run the vg_statem model under PropEr, booting/stopping vonnegut
%% around each command sequence.
prop_test() ->
    ?FORALL(Cmds, more_commands(8, commands(?MODEL)),
            begin
                lager:start(),
                lager:set_loglevel(lager_console_backend, error),
                application:ensure_all_started(vonnegut),
                {History, State, Result} = run_commands(?MODEL, Cmds),
                application:stop(vonnegut),
                ?WHENFAIL(io:format("History: ~p\nState: ~p\nResult: ~p\n",
                                    [History,State,Result]),
                          aggregate(command_names(Cmds), Result =:= ok))
            end).
-------------------------------------------------------------------------------- /test/protocol_SUITE.erl: --------------------------------------------------------------------------------
-module(protocol_SUITE).

-compile(export_all).

%% imo eventually this should be a propEr test

-include_lib("common_test/include/ct.hrl").
-include_lib("eunit/include/eunit.hrl").

-include("include/vg.hrl").

suite() ->
    [{timetrap,{seconds,30}}].

init_per_suite(Config) ->
    Config.

end_per_suite(_Config) ->
    ok.

init_per_group(_GroupName, Config) ->
    Config.

end_per_group(_GroupName, _Config) ->
    ok.

init_per_testcase(_TestCase, Config) ->
    Config.

end_per_testcase(_TestCase, _Config) ->
    ok.

groups() ->
    [].

all() ->
    [
     incomplete_fetch_decode,
     incomplete_produce_decode %,
     %% client_incomplete_handling
    ].

%% Build a full fetch response by hand, check it decodes, then verify
%% every strict prefix of it decodes to `more` (incomplete input).
incomplete_fetch_decode(_Config) ->
    %% do we need the correlation id stuff here? or is that decoded directly?
    Topic = <<"foo">>,
    {_, EncodedSet} =
        lists:foldl(
          fun(Rec, {ID, IOL}) ->
                  #{last_offset_delta := L,
                    record_batch := RecordBatch} = vg_protocol:encode_record_batch(Rec),
                  %% NOTE(review): the framing binary here was lost to an
                  %% extraction garble ("<>"); reconstructed as the Kafka
                  %% record-batch header <<BaseOffset:64, BatchLength:32>>
                  %% implied by the offset assertions below -- TODO confirm
                  %% against vg_protocol:decode_fetch_response/1.
                  {ID+L+1, [IOL | [<<ID:64/signed-integer,
                                     (iolist_size(RecordBatch)):32/signed-integer>>,
                                   RecordBatch]]}
          end,
          {55, []}, [<<"bar1">>, <<"bar2">>, <<"bar3">>, <<"bar4">>, <<"bar5">>]),

    FTR = vg_protocol:encode_fetch_topic_response(0, 0, 99, iolist_size(EncodedSet)),

    RespIO = [<<1:32/signed-integer>>, vg_protocol:encode_string(Topic),
              <<1:32/signed-integer>>, FTR, EncodedSet],

    ct:pal("resp ~p", [RespIO]),

    FullResponse = iolist_to_binary(RespIO),

    %% make sure that the full request is valid before we start breaking it up
    ?assertMatch(#{<<"foo">> :=
                       #{0 :=
                             #{error_code := 0,high_water_mark := 99,
                               record_batches :=
                                   [#{offset := 55,
                                      value := <<"bar1">>},
                                    #{offset := 56,
                                      value := <<"bar2">>},
                                    #{offset := 57,
                                      value := <<"bar3">>},
                                    #{offset := 58,
                                      value := <<"bar4">>},
                                    #{offset := 59,
                                      value := <<"bar5">>}],
                               record_batches_size := 355}}},
                 vg_protocol:decode_fetch_response(FullResponse)),

    [begin
         Head = binary:part(FullResponse, 0, N),
         ?assertEqual(more, vg_protocol:decode_fetch_response(Head))
     end
     || N <- lists:seq(1, byte_size(FullResponse) - 1)],

    ok.
%% Encode a produce response, check it decodes exactly, then verify every
%% strict prefix decodes to `more` (incomplete input handling).
incomplete_produce_decode(_Config) ->
    Topic = <<"foo">>,
    Partition = 0,
    Results = [{Topic, [{Partition, 0, 444}]}],
    %% not sure why it won't use the macro here
    %% Results = [{Topic, [{Partition, ?NO_ERROR, 444}]}],
    ProduceResponse0 = vg_protocol:encode_produce_response(Results),
    ProduceResponse = iolist_to_binary(ProduceResponse0),
    ?assertEqual(#{<<"foo">> =>
                       #{0 => #{error_code => 0,offset => 444}}},
                 vg_protocol:decode_response(?PRODUCE_REQUEST, ProduceResponse)),

    [begin
         Head = binary:part(ProduceResponse, 0, N),
         ?assertEqual(more, vg_protocol:decode_response(?PRODUCE_REQUEST, Head))
     end
     || N <- lists:seq(1, byte_size(ProduceResponse) - 1)],

    ok.
-------------------------------------------------------------------------------- /test/test_utils.hrl: --------------------------------------------------------------------------------
%% Poll X (re-evaluated each iteration) until it is true, sleeping 200ms
%% between tries. Gives up with {fail, X} after 100 tries, i.e. roughly
%% 20 seconds -- not 5 as the original comment claimed (100 * 200ms).
-define(UNTIL(X), (fun Until(100) ->
                           erlang:error({fail, X});
                       Until(I) ->
                           case X of true -> ok;
                               false ->
                                   timer:sleep(200),
                                   Until(I+1)
                           end
                   end)(0)).

%% Retry ?assertMatch(Guard, Expr) every 200ms for roughly Seconds
%% seconds (Seconds * 5 tries at 200ms); the final attempt is made
%% outside the try so its assert failure propagates.
-define(until_match(Guard, Expr, Seconds),
        (fun Until(I) when I =:= (Seconds * 5) ->
                 ?assertMatch(Guard, Expr);
             Until(I) ->
                 try
                     ?assertMatch(Guard, Expr)
                 catch error:_ ->
                         timer:sleep(200),
                         Until(I+1)
                 end
         end)(0)).
-------------------------------------------------------------------------------- /test/topic_SUITE.erl: --------------------------------------------------------------------------------
-module(topic_SUITE).

-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include("test_utils.hrl").
-compile(export_all).
all() ->
    [creation, write_empty, write, index_bug, limit, index_limit,
     many, verify_lazy_load, startup_index_correctness,
     local_client_test, last_in_index, terminate_idle_active_segment,
     delete_topic].

init_per_suite(Config) ->
    Config.

end_per_suite(_Config) ->
    ok.

%% terminate_idle_active_segment additionally needs a short
%% terminate_after so the active segment shuts itself down mid-test;
%% everything else about the boot is shared.
init_per_testcase(terminate_idle_active_segment, Config) ->
    boot(fun() -> application:set_env(vonnegut, terminate_after, timer:seconds(1)) end, Config);
init_per_testcase(_, Config) ->
    boot(fun() -> ok end, Config).

%% Shared per-testcase boot: clear env left over from other suites,
%% point the data dir at priv_dir, start the app and a client pool,
%% and create a fresh default topic for the case.
boot(ExtraEnv, Config) ->
    PrivDir = ?config(priv_dir, Config),
    LogDir = filename:join(PrivDir, "data"),
    %% clear env from other suites
    application:unload(vonnegut),
    application:load(vonnegut),
    application:load(partisan),
    ExtraEnv(),
    application:set_env(partisan, partisan_peer_service_manager, partisan_default_peer_service_manager),
    application:set_env(vonnegut, log_dirs, [LogDir]),
    application:set_env(vonnegut, chain, [{discovery, local}]),
    application:set_env(vonnegut, client, [{endpoints, [{"127.0.0.1", 5588}]}]),
    application:set_env(vonnegut, client_pool_size, 2),
    {ok, _} = application:ensure_all_started(vonnegut),
    ok = vg_client_pool:start(#{reconnect => false}),
    Topic = vg_test_utils:create_random_name(<<"topic_SUITE_default_topic">>),
    {ok, _} = vg_client:ensure_topic(Topic),
    [{topic, Topic}, {log_dir, LogDir} | Config].

end_per_testcase(_, _Config) ->
    vg_client_pool:stop(),
    ok.

creation(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"creation_test_topic">>),
    Partition = 0,
    TopicPartitionDir = vg_utils:topic_dir(Topic, Partition),
    vg:create_topic(Topic),
    ?assert(filelib:is_dir(TopicPartitionDir)).

%% leaving this in as it occasionally hits a quasi race, so if we
%% start hitting intermittent failures here, we might have a regression
write_empty(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"topic_SUITE_default_topic">>),
    {ok, _} = vg_client:ensure_topic(Topic),

    spawn(fun() -> vg_client:produce(Topic, <<"fleerp">>) end),
    {ok, #{Topic := #{0 := #{record_batches := Reply, high_water_mark := HWM}}}} = vg_client:fetch(Topic, 0),
    case Reply of
        [#{value := <<"fleerp">>}] -> % write then read
            ?assertEqual(0, HWM),
            ok;
        [] ->
            %% spawned write could have finished after our sendfile data boundaries
            %% are figured out but before we grabbed the HWM for the response
            ?assert(-1 =:= HWM orelse 0 =:= HWM),
            ok;
        _ ->
            ct:pal("got ~p", [Reply]),
            error(bad_return)
    end.
%% Write several records and verify fetch-by-offset returns the right
%% tail of the log.
write(Config) ->
    Topic = ?config(topic, Config),
    Anarchist = <<"no gods no masters">>,
    [begin
         {ok, R} = vg_client:produce(Topic, Anarchist),
         ct:pal("reply: ~p", [R])
     end
     || _ <- lists:seq(1, rand:uniform(20))],
    Communist = <<"from each according to their abilities, to "
                  "each according to their needs">>,
    {ok, R1} = vg_client:produce(Topic, Communist),
    ct:pal("reply: ~p", [R1]),
    {ok, #{Topic := #{0 := #{record_batches := Reply}}}} = vg_client:fetch(Topic, R1),
    ?assertMatch([#{value := Communist}], Reply),

    {ok, #{Topic := #{0 := #{record_batches := Reply1}}}} = vg_client:fetch(Topic, R1 - 1),
    ?assertMatch([#{value := Anarchist}, #{value := Communist}], Reply1).

%% Regression: fetching from before the first index marker must still
%% return the records.
index_bug(Config) ->
    Topic = ?config(topic, Config),

    %% write enough data to cause index creation but not two entries
    {ok, _} = vg_client:produce(Topic,
                                lists:duplicate(100, <<"123456789abcdef">>)),

    %% fetch from 0 to make sure that they're all there
    {ok, #{Topic := #{0 := #{record_batches := Reply}}}} = vg_client:fetch(Topic, 0),
    ?assertEqual(100, length(Reply)),

    %% now query for something before the first index marker
    {ok, #{Topic := #{0 := #{record_batches := Reply2,
                             high_water_mark := HWM}}}} =
        vg_client:fetch(Topic, 10),

    ?assertEqual(99, HWM),

    %% this is a passing version before the bugfix
    %% ?assertEqual([], Reply2).
    %% ?assertEqual(90, length(Reply2)),
    %% change with 0.11.0 RecorBatch storage
    ?assertEqual(100, length(Reply2)),

    %% write enough more data for another entry to hit the second clause
    {ok, _} = vg_client:produce(Topic,
                                lists:duplicate(100, <<"123456789abcdef">>)),

    {ok, #{Topic := #{0 := #{record_batches := Reply3}}}} = vg_client:fetch(Topic, 0),
    ?assertEqual(200, length(Reply3)),

    {ok, #{Topic := #{0 := #{record_batches := Reply4,
                             high_water_mark := HWM4}}}} = vg_client:fetch(Topic, 10),

    ?assertEqual(199, HWM4),
    ?assertEqual(200, length(Reply4)).

%% After a stop (forced flush), the last entry of the on-disk index must
%% be readable.
last_in_index(Config) ->
    Topic = ?config(topic, Config),

    [{ok, _} = vg_client:produce(Topic,
                                 lists:duplicate(100, <<"123456789abcdef">>))
     || _ <- lists:seq(1, 100)],

    %% try to force flush
    application:stop(vonnegut),

    %% log_dirs is a *list* of directories; the original concatenated the
    %% whole list with "/" producing an accidental deep list -- destructure
    %% the single configured directory instead.
    {ok, [LogDir]} = application:get_env(vonnegut, log_dirs),
    TopicDir = LogDir ++ "/" ++ binary_to_list(Topic) ++ "-0/",
    Filename = vg_utils:index_file(TopicDir, 0),
    ct:pal("topic dir ~p, filename ~p", [TopicDir, Filename]),
    ?assertNotEqual({0, 0}, vg_log_segments:last_in_index(TopicDir, Filename, 1)).

%% max_bytes caps the number of records a fetch returns.
limit(Config) ->
    Topic = ?config(topic, Config),

    {ok, P} = vg_client:produce(Topic,
                                lists:duplicate(100, <<"123456789abcdef">>)),
    ?assertEqual(99, P),
    {ok, #{Topic := #{0 := #{record_batches := Reply}}}} = vg_client:fetch(Topic),
    ?assertEqual(100, length(Reply)),

    {ok, #{Topic := #{0 := #{record_batches := Reply2}}}} =
        vg_client:fetch([{Topic, 0, #{max_bytes => 1000}}]),
    %% ?assertEqual(24, length(Reply2)),
    %% how is it we are geting more with the new format...
    ?assertEqual(37, length(Reply2)),

    {ok, #{Topic := #{0 := #{record_batches := []}}}} =
        vg_client:fetch([{Topic, 0, #{max_bytes => 1}}]),

    ok.
%% Exercise the interaction of limit, max_bytes and the -1 ("from HWM")
%% offset on fetch.
index_limit(Config) ->
    Topic = ?config(topic, Config),

    [{ok, _} = vg_client:produce(Topic,
                                 <<"123456789abcdef">>) || _ <- lists:seq(1, 100)],

    FetchBatches =
        fun(Request) ->
                {ok, #{Topic := #{0 := #{record_batches := Batches}}}} = vg_client:fetch(Request),
                Batches
        end,

    {ok, #{Topic := #{0 := #{record_batches := Reply}}}} = vg_client:fetch(Topic, 0),
    ?assertEqual(100, length(Reply)),

    {ok, #{Topic := #{0 := #{record_batches := Reply2}}}} = vg_client:fetch(Topic, 0, 50),
    ?assertEqual(50, length(Reply2)),
    %% ?assertEqual(50, length(Reply2)),

    %% max_bytes overrides max_index
    Reply3 = FetchBatches([{Topic, 0, #{limit => 50, max_bytes => 1000}}]),
    ?assertEqual(12, length(Reply3)),
    %% ?assertEqual(24, length(Reply3)),

    %% limit returns Offset to Offset+Limit
    Reply4 = FetchBatches([{Topic, 10, #{limit => 20}}]),
    ?assertEqual(20, length(Reply4)),
    ?assertMatch(#{offset := 10}, hd(Reply4)),
    ?assertMatch(#{offset := 29}, lists:last(Reply4)),

    %% -1 Offset returns HWM-Limit to HWM
    Reply5 = FetchBatches([{Topic, -1, #{limit => 20}}]),
    ?assertMatch(#{offset := 80}, hd(Reply5)),
    ?assertMatch(#{offset := 99}, lists:last(Reply5)),
    ?assertEqual(20, length(Reply5)),

    %% -1 Offset with limit larger than HWM starts from 0
    Reply6 = FetchBatches([{Topic, -1, #{limit => 200}}]),
    ?assertEqual(100, length(Reply6)),

    [] = FetchBatches([{Topic, 0, #{max_bytes => 1}}]),

    ok.
%% Smoke/perf check: create and write to 1000 topics and assert the
%% whole run finishes inside a generous time budget.
many(Config) ->
    TopicCount = 1000,
    TimeLimit = 100000,

    Start = erlang:monotonic_time(milli_seconds),
    lists:foreach(
      fun(N0) ->
              N = integer_to_binary(N0),
              Topic = vg_test_utils:create_random_name(<<"many-topic-", N/binary>>),
              %% adding a record to the topic will create it under current settings
              ct:pal("adding to topic: ~p", [Topic]),
              {ok, _} = vg_client:ensure_topic(Topic),
              {ok, _} = vg_client:produce(Topic, [<<"woo">>])
      end, lists:seq(1, TopicCount)),
    Duration = erlang:monotonic_time(milli_seconds) - Start,
    ct:pal("creating ~p topics took ~p ms", [TopicCount, Duration]),
    ?assert(Duration < TimeLimit),
    Config.

wait_for_start(Topic) ->
    wait_for_start(Topic, 5000).

%% Poll a 1-record fetch until the listener answers, giving the node
%% time to finish booting; errors out after N attempts.
wait_for_start(_Topic, 0) ->
    error(waited_too_long);
wait_for_start(Topic, N) ->
    case vg_client:fetch(Topic, 0, 1) of
        {ok, _} = _OK ->
            %%ct:pal("ok ~p", [_OK]),
            timer:sleep(150),
            ok;
        {error, no_socket} ->
            timer:sleep(1),
            wait_for_start(Topic, N - 1)
    end.
%% Restart the node repeatedly between writes and verify offsets keep
%% incrementing correctly across cold starts (index reload correctness).
startup_index_correctness(Config) ->
    %% we actually want the reconnect behavior here
    ok = vg_client_pool:stop(),
    ok = vg_client_pool:start(#{reconnect => true}),

    Topic = ?config(topic, Config),
    ct:pal("STARTING TEST"),

    {ok, _} = vg_client:produce(Topic,
                                lists:duplicate(1, <<"123456789abcdef000000000">>)),
    {ok, _} = vg_client:produce(Topic,
                                lists:duplicate(1, <<"123456789abcdef111111111">>)),

    lists:foreach(
      fun(N) ->
              application:stop(vonnegut),
              %% bleh circle
              timer:sleep(750),
              {ok, _} = application:ensure_all_started(vonnegut),
              wait_for_start(Topic),
              A = integer_to_binary(N),
              B = integer_to_binary(N + 1),
              C = integer_to_binary(N + 2),
              M = N + 1,
              {ok, Q} = vg_client:produce(Topic, [<<"123456789abcdef-", A/binary>>,
                                                  <<"123456789abcdef-", B/binary>>]),
              ?assertEqual(M, Q),
              Y = M + 1,
              {ok, Y} = vg_client:produce(Topic, <<"123456789abcdef-", C/binary>>)
      end, lists:seq(2, 11, 3)),

    %% -1 Offset returns HWM-Limit to HWM
    {ok, #{Topic := #{0 := #{record_batches := ReplyN}}}} = vg_client:fetch([{Topic, 0, #{limit => 2000}}]),
    {ok, #{Topic := #{0 := #{record_batches := Reply0}}}} = vg_client:fetch([{Topic, -1, #{limit => 2}}]),
    ct:pal("reply 0 ~p", [Reply0]),
    ct:pal("whole set ~p", [ReplyN]),

    %% 11 and 12 are in one RecordBatch
    ?assertEqual(3, length(Reply0)),
    ?assertMatch(#{offset := 12}, hd(Reply0)),
    ?assertMatch(#{offset := 13}, lists:last(Reply0)),

    {ok, #{Topic := #{0 := #{record_batches := Reply1}}}} = vg_client:fetch([{Topic, 0, #{limit => 100}}]),
    ?assertEqual(14, length(Reply1)),
    ?assertMatch(#{offset := 13}, lists:last(Reply1)),
    ok.
%% verify the active topic segment process is not started until needed
verify_lazy_load(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"verify_lazy_load">>),
    Partition = 0,
    TopicPartitionDir = vg_utils:topic_dir(Topic, Partition),
    vg:create_topic(Topic),
    ?assert(filelib:is_dir(TopicPartitionDir)),

    {ok, _} = vg_client:produce(Topic,
                                lists:duplicate(100, <<"123456789abcdef">>)),

    %% fetch from 0 to make sure that they're all there
    {ok, #{Topic := #{0 := #{record_batches := Reply}}}} = vg_client:fetch(Topic, 0),
    ?assertEqual(100, length(Reply)),

    application:stop(vonnegut),

    %% delay on getting the elli port back can cause restarting to fail so pause for a bit
    timer:sleep(500),

    {ok, _} = application:ensure_all_started(vonnegut),
    wait_for_start(Topic),

    ActiveName = {n,l,{active, Topic, Partition}},
    ?assertEqual(undefined, gproc:whereis_name(ActiveName)),

    %% reads must not wake the active segment process
    {ok, #{Topic := #{0 := #{record_batches := Reply2}}}} = vg_client:fetch(Topic, 0),
    ?assertEqual(100, length(Reply2)),

    ?assertEqual(undefined, gproc:whereis_name(ActiveName)),

    %% writing starts the process
    {ok, _} = vg_client:produce(Topic,
                                lists:duplicate(100, <<"123456789abcdef">>)),

    ?assertNotEqual(undefined, gproc:whereis_name(ActiveName)).

%% Exercise the server-side (non-socket) client API directly.
local_client_test(Config) ->
    Topic = ?config(topic, Config),
    vg:write(Topic, 0, <<"foo">>),
    {ok, Ret} = vg:fetch(Topic),
    ?assertMatch(#{high_water_mark := 0,
                   partition := 0,
                   record_batches :=
                       [#{offset := 0,
                          value := <<"foo">>}]},
                 Ret),
    ok.
%% The active segment process terminates itself after being idle for
%% the configured terminate_after (1s in this case's setup).
terminate_idle_active_segment(Config) ->
    Topic = ?config(topic, Config),
    %% verify it is running
    {ok, _} = vg:write(Topic, 0, <<"foo">>),
    ?assertNotEqual(undefined, vg_active_segment:where(Topic, 0)),
    %% after a second it should terminate and be undefined
    ?UNTIL(vg_active_segment:where(Topic, 0) =:= undefined),
    ok.

%% Deleting a topic must remove its partition directory from disk.
delete_topic(Config) ->
    Dir = ?config(log_dir, Config),
    Topic = vg_test_utils:create_random_name(<<"topic_SUITE_delete_topic">>),
    {ok, _} = vg_client:ensure_topic(Topic),

    [begin
         {ok, _} = vg_client:produce(Topic, <<"some datas">>)
     end
     || _ <- lists:seq(1, rand:uniform(20))],

    %% NOTE(review): the directory-name binary here was lost to an
    %% extraction garble ("<>"); reconstructed as <<Topic/binary, "-0">>
    %% to match the "<topic>-<partition>" on-disk layout used elsewhere
    %% in this suite -- TODO confirm against vg_utils:topic_dir/2.
    ?assert(filelib:is_dir(filename:join(Dir, <<Topic/binary, "-0">>))),

    vg_client:delete_topic(Topic),

    ?assertNot(filelib:is_dir(filename:join(Dir, <<Topic/binary, "-0">>))).
-------------------------------------------------------------------------------- /test/vg_consumer_SUITE.erl: --------------------------------------------------------------------------------
-module(vg_consumer_SUITE).

-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-compile(export_all).

-include("test_utils.hrl").

all() ->
    [from_zero, multi_topic_fetch, fetch_unknown, fetch_higher_than_hwm, regression_2_23_18].
%% Boot vonnegut once for the whole suite with small segment limits and
%% a local client pointed at the default listener port.
init_per_suite(Config) ->
    PrivDir = ?config(priv_dir, Config),
    application:load(vonnegut),
    Env = [{client_pool_size, 2},
           {log_dirs, [filename:join(PrivDir, "data")]},
           {segment_bytes, 86},
           {index_max_bytes, 18},
           {index_interval_bytes, 24},
           {client, [{endpoints, [{"127.0.0.1", 5588}]}]},
           {chain, [{discovery, local}]}],
    [application:set_env(vonnegut, Key, Value) || {Key, Value} <- Env],
    application:start(shackle),
    application:ensure_all_started(vonnegut),
    crypto:start(),
    Config.

end_per_suite(Config) ->
    application:stop(vonnegut),
    application:unload(vonnegut),
    Config.

init_per_testcase(_, Config) ->
    ok = vg_client_pool:start(#{reconnect => false}),
    Config.

end_per_testcase(_, _Config) ->
    vg_client_pool:stop(),
    ok.

%% Produce two records from offset 0 and fetch each back by offset.
from_zero(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"consumer_SUITE_test_topic">>),
    {ok, _} = vg_client:ensure_topic(Topic),
    Partition = 0,
    TopicPartitionDir = vg_utils:topic_dir(Topic, Partition),
    ?assert(filelib:is_dir(TopicPartitionDir)),

    %% make sure there's enough time for the
    %% listeners to come up
    timer:sleep(250),

    ?assertMatch({ok, 0},
                 vg_client:produce(Topic, [#{key => <<"key">>,
                                             value => <<"record 1 wasn't long enough to make wrapping fail">>}])),
    ?assertMatch({ok, 1},
                 vg_client:produce(Topic, [<<"record 2">>])),
    {ok, #{Topic := #{0 := #{record_batches := Data, high_water_mark := HWM}}}} = vg_client:fetch(Topic, 0),
    ?assertEqual(1, HWM),
    ?assertMatch([#{offset := 0, key := <<"key">>, value := <<"record 1 wasn't long enough to make wrapping fail">>}], Data),
    {ok, #{Topic := #{0 := #{record_batches := Data1, high_water_mark := HWM1}}}} = vg_client:fetch(Topic, 1),
    ?assertEqual(1, HWM1),
    ?assertMatch([#{offset := 1, value := <<"record 2">>}], Data1),

    ok.

%% A single fetch request may address several topics at different
%% offsets; each topic's slice comes back independently.
multi_topic_fetch(_Config) ->
    Topic1 = vg_test_utils:create_random_name(<<"consumer_SUITE_test_topic-1">>),
    Topic2 = vg_test_utils:create_random_name(<<"consumer_SUITE_test_topic-2">>),

    ok = vg:create_topic(Topic1),
    ok = vg:create_topic(Topic2),

    %% make sure there's enough time for the
    %% listeners to come up
    timer:sleep(250),

    ?assertMatch({ok, 0},
                 vg_client:produce(Topic1, [#{timestamp => erlang:system_time(millisecond),
                                              key => <<"key">>, value => <<"topic 1 record 1">>}])),
    ?assertMatch({ok, 1},
                 vg_client:produce(Topic1, [<<"topic 1 record 2">>])),

    ?assertMatch({ok, 0},
                 vg_client:produce(Topic2, [#{timestamp => erlang:system_time(millisecond),
                                              key => <<"key-2">>, value => <<"topic 2 record 1">>}])),
    ?assertMatch({ok, 1},
                 vg_client:produce(Topic2, [<<"topic 2 record 2">>])),

    {ok, #{Topic1 := #{0 := #{record_batches := Data, high_water_mark := HWM}},
           Topic2 := #{0 := #{record_batches := Data2, high_water_mark := HWM2}}}} =
        vg_client:fetch([{Topic1, 0, #{}},
                         {Topic2, 1, #{}}]),

    ?assertEqual(1, HWM),
    ?assertMatch([#{offset := 0, key := <<"key">>, value := <<"topic 1 record 1">>}], Data),

    ?assertEqual(1, HWM2),
    ?assertMatch([#{offset := 1, value := <<"topic 2 record 2">>}], Data2),

    ok.

%% Fetching a topic that was never created yields a tagged error.
fetch_unknown(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"consumer_SUITE_test_topic">>),

    %% make sure there's enough time for the
    %% listeners to come up
    timer:sleep(250),

    ?assertMatch({error, {Topic, not_found}}, vg_client:fetch(Topic, 0)),

    ok.
%% Fetching at an offset past the high watermark — including on a
%% completely empty log — must yield an empty record set and the current
%% high watermark instead of failing or blocking.
fetch_higher_than_hwm(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"consumer_SUITE_fetch_higher_than_hwm">>),
    {ok, _} = vg_client:ensure_topic(Topic),
    Dir = vg_utils:topic_dir(Topic, 0),
    ?assert(filelib:is_dir(Dir)),

    %% an empty log reports HWM -1 and no batches
    {ok, #{Topic := #{0 := #{record_batches := EmptyBatches,
                             high_water_mark := EmptyHwm}}}} = vg_client:fetch(Topic, 1),
    ?assertEqual(-1, EmptyHwm),
    ?assertMatch([], EmptyBatches),

    %% same answer when a fetch limit is supplied; re-matching the
    %% already-bound variables asserts the two responses are identical
    {ok, #{Topic := #{0 := #{record_batches := EmptyBatches,
                             high_water_mark := EmptyHwm}}}} = vg_client:fetch(Topic, 1, 1),
    ?assertEqual(-1, EmptyHwm),
    ?assertMatch([], EmptyBatches),

    %% make sure there's enough time for the
    %% listeners to come up
    timer:sleep(250),

    %% after one write the HWM is 0, but offset 1 is still past it, so
    %% the batch list stays empty
    ?assertMatch({ok, 0},
                 vg_client:produce(Topic, [#{key => <<"some key">>,
                                             value => <<"rsome value">>}])),
    {ok, #{Topic := #{0 := #{record_batches := AfterWriteBatches,
                             high_water_mark := AfterWriteHwm}}}} = vg_client:fetch(Topic, 1),
    ?assertEqual(0, AfterWriteHwm),
    ?assertMatch([], AfterWriteBatches),

    ok.

%% fetch from 5 with limit 1000 was timing out.
%% issue was vonnegut claiming to send more data than it actually would, leaving the client expecting more
%% Regression test: a fetch whose start offset (5) is past the single
%% written record, with a large byte limit (1000), must return promptly
%% instead of timing out.
regression_2_23_18(_Config) ->
    Topic = vg_test_utils:create_random_name(<<"consumer_SUITE_regression_2-23-18">>),
    {ok, _} = vg_client:ensure_topic(Topic),
    Partition = 0,
    TopicPartitionDir = vg_utils:topic_dir(Topic, Partition),
    ?assert(filelib:is_dir(TopicPartitionDir)),

    %% make sure there's enough time for the
    %% listeners to come up
    timer:sleep(250),

    %% NOTE(review): the produce result is ignored here, unlike the other
    %% cases in this suite which assert {ok, Offset} — confirm intent.
    vg_client:produce(Topic, [#{value => <<"some value">>}]),
    {ok, #{Topic := #{0 := #{record_batches := _Data, high_water_mark := HWM1}}}} = vg_client:fetch(Topic, 5, 1000),
    ?assertEqual(0, HWM1),

    ok.
--------------------------------------------------------------------------------
/test/vg_statem.erl:
--------------------------------------------------------------------------------
%% PropEr stateful-property callbacks for exercising the vonnegut log
%% (see prop_vg.erl for the property that drives this module).
-module(vg_statem).

-include_lib("proper/include/proper.hrl").

-export([command/1, initial_state/0, next_state/3,
         precondition/2, postcondition/3]).

%% sigh
-export([restart_server/0]).

%% Model state: topics maps topic name (binary) -> last written offset.
-record(state,
        {
          topics = #{}
%% NOTE(review): the closing "})." of this record appears to have been
%% lost at an extraction/chunk boundary — verify against the repository
%% before treating this text as the real source.
%% Generate the next symbolic command, weighted heavily toward writes
%% and fetches so that segment wraps and close/restart paths are hit
%% often. Picks a random known topic and targets offsets around its HWM.
command(_S = #state{topics = Topics}) ->
    %% replace with proper oneof when maps are supported
    {Topic, Info} = one_of(Topics),
    Index = hwm(Info) + 1,
    frequency(
      [%% to tickle close and restart validation bugs
       {2, {call, ?MODULE, restart_server, []}},
       {2, {call, vg, ensure_topic, [?LET(A, atom(), atom_to_binary(A, utf8))]}},
       %% write more than is common in typical workloads in order to
       %% trigger more wraps
       {40, {call, vg, write,
             [Topic, 0, message(Topic, Index, <<>>)]}},
       %% batched writes of 3..16 consecutive messages
       {10, {call, vg, write,
             [Topic, 0, ?LET(I, integer(2, 15),
                             [message(Topic, Index + N, <<>>)
                              || N <- lists:seq(0, I)])]}},
       {40, {call, vg, fetch, [Topic, 0, -1, integer(1, 5)]}},
       {100, {call, vg, fetch, [Topic, integer(0, Index - 1)]}}
      ]).

%% Initial model value at system start. Should be deterministic.
%% Restarts vonnegut with a per-run log directory and seeds one topic so
%% one_of/1 never has to fall back to its empty-map clause.
initial_state() ->
    application:load(vonnegut),
    %% set this once per run
    application:set_env(vonnegut, log_dirs, [filename:join("properdata", integer_to_list(erlang:system_time()))]),

    _ = application:stop(vonnegut),
    {ok, _} = application:ensure_all_started(vonnegut),
    timer:sleep(500),
    ok = vg:create_topic(<<"seed">>),
    {ok, 0} = vg:write(<<"seed">>, 0, message(<<"seed">>, 0, <<>>)),
    #state{topics = #{<<"seed">> => 0}}.

%% Picks whether a command should be valid under the current state.
%% Bug fix: command/1 emits vg:write and vg:fetch calls, so the original
%% guard on `produce' alone could never block a write against an empty
%% topics map; `write' is now covered as well (`produce' kept for
%% backward compatibility).
precondition(#state{topics = T}, {call, _Mod, Fun, _Args}) when T == #{}
                                                                andalso (Fun == fetch orelse
                                                                         Fun == write orelse
                                                                         Fun == produce) ->
    false;
precondition(#state{}, _C = {call, _Mod, _Fun, _Args}) ->
    %lager:info("calling ~p", [_C]),
    true.

%% Given the state `State' *prior* to the call `{call, Mod, Fun, Args}',
%% determine whether the result `Res' (coming from the actual system)
%% makes sense.
%% NOTE(review): command/1 issues calls through module `vg' (write/fetch),
%% but these clauses match module `vg_client' and function `produce' — as
%% written they never fire and everything falls through to the catch-all
%% `true'. Confirm which module the commands are meant to target.
postcondition(#state{topics = Topics}, {call, vg_client, fetch, [Topic, Index]}, Res) ->
    HWM = maps:get(Topic, Topics, undefined),
    {ok, #{Topic := #{0 := #{record_batches := RecordSet}}}} = Res,
    %% potentially validate what we're seeing here, also this will stop being true once more bytes start being returned
    length(RecordSet) =:= (HWM - Index) + 1;
postcondition(#state{topics = Topics}, {call, vg_client, produce, [Topic, Message]}, Res) ->
    lager:info("check ~p ~p ~p", [Topic, Res, Topics]),
    HWM = maps:get(Topic, Topics),
    {ok, Offset} = Res,
    %% potentially validate what we're seeing here
    %% a list write advances the offset by its length, a single message by 1
    Added =
        case Message of
            L when is_list(L) ->
                length(L);
            _ -> 1
        end,
    Offset =:= HWM + Added;
postcondition(_State, {call, _Mod, _Fun, _Args}, _Res) ->
    true.

%% Assuming the postcondition for a call was true, update the model
%% accordingly for the test to proceed.
%% NOTE(review): same module mismatch as postcondition/3 — these clauses
%% match `vg_client' while command/1 generates `vg' calls, so the model's
%% topics map is presumably never updated after initial_state/0; verify.
next_state(State=#state{topics = Topics}, {ok, Offset}, {call, vg_client, produce, [Topic, _Message]}) ->
    NewTopics = maps:put(Topic, Offset, Topics),
    State#state{topics = NewTopics};
next_state(State=#state{topics = Topics}, {ok, _}, {call, vg_client, ensure_topic, [Topic]}) ->
    NewTopics = maps:put(Topic, 0, Topics),
    State#state{topics = NewTopics};
next_state(State, {error, _}, {call, vg_client, ensure_topic, [_Topic]}) ->
    %% i guess just ignore invalid topic names?
    State;
next_state(State, _Res, {call, _Mod, _Fun, _Args}) ->
    NewState = State,
    NewState.


%%%%%%%%%

%% Full stop/start cycle of the application, used by command/1 to tickle
%% close-and-restart validation bugs.
restart_server() ->
    ok = application:stop(vonnegut),
    ok = application:start(vonnegut),
    timer:sleep(500),
    ok.

%% Build a test message binary for Topic at Index, repeated a random
%% 6..20 times, followed by a native-endian length-prefixed Gen payload.
%% NOTE(review): the two `<>' segments below are NOT valid Erlang — the
%% contents of these `<<...>>' binary expressions were destroyed by the
%% extraction that produced this dump (two interior source lines are
%% missing). Recover the real expressions from the repository; do not
%% treat this text as the actual implementation.
message(Topic, Index, Gen) ->
    Reps = rand:uniform(15) + 5,
    iolist_to_binary(
      [<>,
       lists:duplicate(Reps,
                       <>),
       <<(byte_size(Gen)):32/native, Gen/binary>>]).
%% trivial but opaque for interface reasons
%% Identity accessor: the model stores a bare offset per topic today, but
%% keeping this indirection lets the stored Info grow richer later.
hwm(HWM) ->
    HWM.

%% Pick a uniformly random {Topic, Info} entry from the topics map.
one_of(Empty) when Empty =:= #{} ->
    %% bad feeling here, buttttt
    %% sentinel for the empty map; precondition/2 is expected to veto
    %% commands built from it
    {<<>>, 0};
one_of(Map) ->
    Sz = maps:size(Map),
    %% rand:uniform/1 returns 1..Sz, matching lists:nth/2's 1-based index
    Index = rand:uniform(Sz),
    lists:nth(Index, maps:to_list(Map)).
--------------------------------------------------------------------------------
/test/vg_test_utils.erl:
--------------------------------------------------------------------------------
%% Shared helpers for the test suites.
-module(vg_test_utils).

%% NOTE(review): export_all is discouraged outside quick test scaffolding;
%% consider an explicit -export([create_random_name/1]).
-compile(export_all).

%% Append a random/unique suffix to Name so each test case gets a fresh
%% topic name.
%% NOTE(review): the `<>' below is NOT valid Erlang — the body of this
%% `<<...>>' binary expression was destroyed by the extraction that
%% produced this dump. Recover the real expression from the repository.
create_random_name(Name) ->
    <>.
--------------------------------------------------------------------------------