├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── flake.lock ├── flake.nix ├── pgwrh.control ├── src ├── common.sql ├── master │ ├── api-management.sql │ ├── api-replica.sql │ ├── deps.txt │ ├── ext-config-dump.sql │ ├── helpers.sql │ ├── implementation-views.sql │ ├── monitoring.sql │ ├── publication-sync.sql │ ├── snapshot.sql │ ├── tables.sql │ └── triggers.sql └── replica │ ├── api-management.sql │ ├── daemon.sql │ ├── deps.txt │ ├── ext-config-dump.sql │ ├── fdw.sql │ ├── helpers.sql │ ├── status.sql │ ├── sync.sql │ └── tables.sql └── test ├── master.sql ├── requirements.txt └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | .build 2 | .work 3 | /pgwrh--*.sql 4 | result 5 | /.idea/ 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "plpgsqlLanguageServer.database": "postgres", 3 | "plpgsqlLanguageServer.user": "michal", 4 | "plpgsqlLanguageServer.password": "michal", 5 | "plpgsqlLanguageServer.definitionFiles": [ 6 | "**/*.psql", 7 | "**/*.pgsql", 8 | "**/*.sql" 9 | ], 10 | "files.associations": { 11 | "*.sql": "postgres" 12 | }, 13 | "[makefile]": { 14 | "editor.insertSpaces": false, 15 | "editor.detectIndentation": false 16 | }, 17 | "plpgsqlLanguageServer.workspaceValidationTargetFiles": [ 18 | "**/*.sql" 19 | ] 20 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 
41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 
79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 
110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 
143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 
174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. 
This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 
662 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # pgwrh 2 | # Copyright (C) 2024 Michal Kleczek 3 | 4 | # This program is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU Affero General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU Affero General Public License for more details. 13 | 14 | # You should have received a copy of the GNU Affero General Public License 15 | # along with this program. If not, see . 16 | 17 | EXTENSION = pgwrh 18 | EXTVERSION = $(shell grep default_version $(EXTENSION).control | \ 19 | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/") 20 | BUILD = .build 21 | DATA = $(BUILD)/pgwrh/$(EXTENSION)--$(EXTVERSION).sql 22 | EXTRA_CLEAN = $(BUILD) 23 | 24 | MASTER = $(shell tsort src/master/deps.txt | sed -e 's/^/src\/master\//' -e 's/$$/\.sql/' | xargs echo) 25 | REPLICA = $(shell tsort src/replica/deps.txt | sed -e 's/^/src\/replica\//' -e 's/$$/\.sql/' | xargs echo) 26 | 27 | PG_CONFIG = pg_config 28 | 29 | ifdef NO_PGXS 30 | # Simple install for systems without pgxs 31 | # RedHat packages pgxs in postgresql-devel 32 | # which has a lot of dependencies (compilers etc.) 
33 | # need to make it possible to use make to install 34 | # pgwrh on such systems 35 | EXTDIR := $(shell $(PG_CONFIG) --sharedir)/extension 36 | 37 | clean: 38 | rm -rf $(BUILD) 39 | 40 | install: all 41 | install -c -m 644 ./pgwrh.control $(EXTDIR) 42 | install -c -m 644 $(BUILD)/pgwrh/$(EXTENSION)--$(EXTVERSION).sql $(EXTDIR) 43 | 44 | else # NO_PGXS 45 | # Standard pgxs makefile 46 | PGXS := $(shell $(PG_CONFIG) --pgxs) 47 | include $(PGXS) 48 | 49 | endif # NO_PGXS 50 | 51 | $(BUILD)/pgwrh/$(EXTENSION)--$(EXTVERSION).sql: src/common.sql $(MASTER) $(REPLICA) 52 | cat $^ > $@ 53 | 54 | all: prepare $(EXTENSION).control $(BUILD)/pgwrh/$(EXTENSION)--$(EXTVERSION).sql 55 | prepare: 56 | mkdir -p ${BUILD}/pgwrh 57 | 58 | PHONY: all prepare 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pgwrh 2 | 3 | An extension implementing sharding for PostgreSQL based on logical replication and postgres_fdw. 4 | The goal is to scale **_read queries_** overcoming main limitation of traditional setups based on streaming replication and hot standbys: 5 | lack of sharding and large storage requirements. 6 | 7 | See [Architecture](https://github.com/mkleczek/pgwrh/wiki/Architecture) for more information on inner workings. 8 | 9 | :warning: **WIP**: readme might be incomplete and contain mistakes in usage instructions (as the API is still changing) 10 | 11 | # Features 12 | 13 | ## Horizontal Scalability and High Availability 14 | ### No need for rebalancing 15 | Setting up and maintaining a highly available cluster of sharded storage servers is inherently tricky, especially during changes to cluster topology. 16 | Adding a new replica often requires rebalancing (ie. reorganizing data placement among replicas). 17 | 18 | _pgwrh_ minimizes the need to copy data by utilizing _Weighted Rendezvous Hashing_ algorithm to distribute shards among replicas.
19 | Adding replicas never requires moving data between existing ones. 20 | ### Data redundancy 21 | _pgwrh_ maintains requested level of redundancy of shard data. 22 | 23 | Administrator can specify: 24 | * the percentage of replicas to host each shard 25 | * the minimum number of copies of any shard (regardless of the percentage setting above) 26 | 27 | So it is possible to implement policies like: _"Shards X, Y, Z should be distributed among 20% of replicas in the cluster, but in no fewer than 2 copies"_. 28 | ### Availability zones 29 | Replicas can be assigned to _availability zones_ and _pgwrh_ ensures shard copies are distributed evenly across all of them. 30 | 31 | ### Zero downtime reconfiguration of cluster topology 32 | Changing cluster topology very often requires lengthy process of data copying and indexing. 33 | Exposing replicas that do not have necessary indexes created imposes a risk of downtimes due to long queries causing exhaustion of connection pools. 34 | 35 | _pgwrh_ makes sure the cluster can operate without disruptions and that not-yet-ready replicas are isolated from query traffic. 36 | 37 | ## Sharding policy flexibility and storage tiering 38 | _pgwrh_ does not dictate how data is split into shards. It is possible to implement _any_ sharding policy by utilizing PostgreSQL partitioning. 39 | _pgwrh_ will distribute _leaves_ of partition hierarchy among replicas. 40 | It is also possible to specify different levels of redundancy for different subtrees of partitioning hierarchy. 41 | 42 | Thanks to this it is possible to have more replicas maintain _hot_ data and have _cold_ data storage requirements minimized. 43 | 44 | ## Ease of deployment and cluster administration 45 | 46 | 47 | ## Pure SQL/PGSQL 48 | This makes it easy to use _pgwrh_ in cloud environments that limit possibilities of custom extension installation. 
49 | 50 | *** 51 | _Caveat_ at the moment _pgwrh_ requires _pg_background_ to operate as it needs a way to execute SQL commands 52 | outside current transaction (_CREATE/ALTER SUBSCRIPTION_ must not be executed in transaction). 53 | 54 | ## Based on built-in PostgreSQL facilities - no need for custom query parser/planner 55 | Contrary to other PostgreSQL sharding solutions that implement a query parser and interpreter to direct queries to 56 | the right replicas, _pgwrh_ reuses built-in PostgreSQL features: partitioning and postgres_fdw. 57 | 58 | PostgreSQL query planner and executor - while still somewhat limited - have capabilities to distribute computing among 59 | multiple machines by: 60 | * _pushing down_ filtering and aggregates (see https://www.postgresql.org/docs/current/runtime-config-query.html#GUC-ENABLE-PARTITIONWISE-AGGREGATE) 61 | * skip execution of unnecessary query plan nodes (see https://www.postgresql.org/docs/current/runtime-config-query.html#GUC-ENABLE-PARTITION-PRUNING) 62 | 63 | # Installation 64 | 65 | ## Prerequisites 66 | 67 | | Name | Version | 68 | | :---- | :---: | 69 | | PostgreSQL | 16+ | 70 | | pg_background | 1.2+ | 71 | 72 | ## Extension installation 73 | 74 | Clone the Git repository. 75 | ```sh 76 | git clone https://github.com/mkleczek/pgwrh.git 77 | ``` 78 | Install the extension. 79 | ```sh 80 | cd pgwrh 81 | make install 82 | ``` 83 | Create extension in PostgreSQL database. 
84 | ```sh 85 | psql -c "CREATE EXTENSION pgwrh CASCADE" 86 | ``` 87 | 88 | # Usage 89 | 90 | ## On master server 91 | 92 | ### Create your sharded table partitioning hierarchy 93 | 94 | The below example would create a two-level partition hierarchy for `test.my_data`: 95 | * First level by dates in `col3` (split by year) 96 | * Second level by hash on `col2` 97 | ```pgsql 98 | CREATE SCHEMA IF NOT EXISTS test; 99 | 100 | CREATE TABLE test.my_data (col1 text, col2 text, col3 date) PARTITION BY RANGE (col3); 101 | CREATE TABLE test.my_data_2023 PARTITION OF test.my_data FOR VALUES FROM (make_date(2023, 1, 1)) TO (make_date(2024, 1, 1)) PARTITION BY HASH (col2); 102 | CREATE TABLE test.my_data_2024 PARTITION OF test.my_data FOR VALUES FROM (make_date(2024, 1, 1)) TO (make_date(2025, 1, 1)) PARTITION BY HASH (col2); 103 | CREATE TABLE test.my_data_2025 PARTITION OF test.my_data FOR VALUES FROM (make_date(2025, 1, 1)) TO (make_date(2026, 1, 1)) PARTITION BY HASH (col2); 104 | 105 | CREATE SCHEMA IF NOT EXISTS test_shards; 106 | DO$$ 107 | DECLARE 108 | r record; 109 | BEGIN 110 | FOR r IN 111 | SELECT 112 | format('CREATE TABLE test_shards.my_data_%1$s_%2$s PARTITION OF test.my_data_%1$s (PRIMARY KEY (col1)) FOR VALUES WITH (MODULUS 16, REMAINDER %2$s)', year, rem) stmt 113 | FROM generate_series(2023, 2025) year, generate_series(0, 15) rem 114 | LOOP 115 | EXECUTE r.stmt; 116 | END LOOP; 117 | END$$; 118 | ``` 119 | 120 | That gives 48 (16 * 3) shards in total. 121 | 122 | **Note** that there are no specific requirements for the partitioning hierarchy and any partitioned table can be sharded - the above is only for illustration purposes. 123 | 124 | ### Create a replica cluster 125 | 126 | Example: 127 | ```pgsql 128 | SELECT pgwrh.create_replica_cluster('c01'); 129 | ``` 130 | 131 | ### Configure roles and user accounts for replicas 132 | 133 | (Optional) Create a role for your cluster replicas and grant rights to SELECT from shards.
134 | ```pgsql 135 | CREATE ROLE c01_replica; 136 | 137 | GRANT SELECT ON ALL TABLES IN SCHEMA test_shards TO c01_replica; 138 | ``` 139 | 140 | Create account for each replica. 141 | ```pgsql 142 | CREATE USER c01r01 PASSWORD 'c01r01Password' REPLICATION IN ROLE c01_replica; 143 | ``` 144 | 145 | ## On every replica 146 | 147 | Make sure `pgwrh` extension is installed. 148 | 149 | ### Configure connection to master server 150 | 151 | Call `configure_controller` function providing username and password of this replica account created on master. 152 | ```pgsql 153 | SELECT configure_controller( 154 | host => 'master.myorg', 155 | port => '5432', 156 | username => 'c01r01', -- same as above 157 | password => 'c01r01Password' -- same as above 158 | ); 159 | ``` 160 | 161 | ## Create and deploy replica cluster configuration 162 | 163 | ### Specify what tables to replicate 164 | 165 | Example below would configure distribution of every partition of `test.my_data` to half (50%) of replicas, 166 | except partitions of `test.my_data_2024` which will be copied to all (100%) replicas. 167 | ```pgsql 168 | WITH st(schema_name, table_name, replication_factor) AS ( 169 | VALUES 170 | ('test', 'my_data', 50), 171 | ('test', 'my_data_2024', 100) 172 | ) 173 | INSERT INTO pgwrh.sharded_table (replication_group_id, sharded_table_schema, sharded_table_name, replication_factor) 174 | SELECT 175 | 'c01', schema_name, table_name, replication_factor 176 | FROM 177 | st; 178 | ``` 179 | 180 | ### Configure replicas 181 | Add replica to configuration: 182 | ```pgsql 183 | SELECT pgwrh.add_replica('c01', 'c01r01', 'replica01.cluster01.myorg', 5432); 184 | ``` 185 | 186 | ### Start deployment 187 | ```pgsql 188 | SELECT pgwrh.start_rollout('c01'); 189 | ``` 190 | 191 | New configuration is now visible to connected replicas which will start data replication.
192 | 193 | ### Commit configuration 194 | Once all replicas have confirmed configuration changes, execute: 195 | ```pgsql 196 | SELECT pgwrh.commit_rollout('c01'); 197 | ``` 198 | (this will fail if some replicas are not reconfigured yet) 199 | 200 | ### Add more replicas 201 | ```pgsql 202 | CREATE USER c01r02 PASSWORD 'c01r02Password' REPLICATION IN ROLE c01_replica; 203 | CREATE USER c01r03 PASSWORD 'c01r03Password' REPLICATION IN ROLE c01_replica; 204 | CREATE USER c01r04 PASSWORD 'c01r04Password' REPLICATION IN ROLE c01_replica; 205 | 206 | select pgwrh.add_replica( 207 | _replication_group_id := 'c01', 208 | _replica_id := 'c01r02', 209 | _host_name := 'replica02.cluster01.myorg', 210 | _port := 5432); 211 | select pgwrh.add_replica( 212 | _replication_group_id := 'c01', 213 | _replica_id := 'c01r03', 214 | _host_name := 'replica03.cluster01.myorg', 215 | _port := 5432, 216 | _weight := 70); 217 | select pgwrh.add_replica( 218 | _replication_group_id := 'c01', 219 | _replica_id := 'c01r04', 220 | _host_name := 'replica04.cluster01.myorg', 221 | _port := 5432); 222 | ``` 223 | It is possible to adjust the number of shards assigned to replicas by setting replica weight: 224 | ```pgsql 225 | SELECT pgwrh.set_replica_weight('c01', 'default', 'c01r04', 200); 226 | ``` 227 | 228 | To deploy new configuration: 229 | ```pgsql 230 | SELECT pgwrh.start_rollout('c01'); 231 | ``` 232 | And then: 233 | ```pgsql 234 | SELECT pgwrh.commit_rollout('c01'); 235 | ``` 236 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "flakelight": { 4 | "inputs": { 5 | "nixpkgs": "nixpkgs" 6 | }, 7 | "locked": { 8 | "lastModified": 1738586370, 9 | "narHash": "sha256-oNDm2sfLm9jdfOskRq2ABn85gwXusbsHEOC181peno4=", 10 | "owner": "nix-community", 11 | "repo": "flakelight", 12 | "rev": "d05bcabfc1efb84a7d8689de6e50b84d7f23b427", 13 | "type": "github" 14 | }, 15 |
"original": { 16 | "owner": "nix-community", 17 | "repo": "flakelight", 18 | "type": "github" 19 | } 20 | }, 21 | "nixpkgs": { 22 | "locked": { 23 | "lastModified": 1738410390, 24 | "narHash": "sha256-xvTo0Aw0+veek7hvEVLzErmJyQkEcRk6PSR4zsRQFEc=", 25 | "owner": "NixOS", 26 | "repo": "nixpkgs", 27 | "rev": "3a228057f5b619feb3186e986dbe76278d707b6e", 28 | "type": "github" 29 | }, 30 | "original": { 31 | "owner": "NixOS", 32 | "ref": "nixos-unstable", 33 | "repo": "nixpkgs", 34 | "type": "github" 35 | } 36 | }, 37 | "nixpkgs_2": { 38 | "locked": { 39 | "lastModified": 1738961098, 40 | "narHash": "sha256-yWNBf6VDW38tl179FEuJ0qukthVfB02kv+mRsfUsWC0=", 41 | "owner": "nixos", 42 | "repo": "nixpkgs", 43 | "rev": "a3eaf5e8eca7cab680b964138fb79073704aca75", 44 | "type": "github" 45 | }, 46 | "original": { 47 | "owner": "nixos", 48 | "ref": "nixos-unstable", 49 | "repo": "nixpkgs", 50 | "type": "github" 51 | } 52 | }, 53 | "root": { 54 | "inputs": { 55 | "flakelight": "flakelight", 56 | "nixpkgs": "nixpkgs_2" 57 | } 58 | } 59 | }, 60 | "root": "root", 61 | "version": 7 62 | } 63 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "Simple flake to set up env"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; 6 | flakelight.url = "github:nix-community/flakelight"; 7 | }; 8 | 9 | outputs = { flakelight, nixpkgs, ... }: 10 | flakelight ./. 
({lib, ...}: { 11 | inputs.nixpkgs = nixpkgs; 12 | systems = lib.systems.flakeExposed; 13 | package = { stdenv, defaultMeta, pkgs }: 14 | stdenv.mkDerivation { 15 | pname = "pgwrh"; 16 | version = "0.2.0"; 17 | src = ./.; 18 | buildInputs = [ pkgs.coreutils pkgs.postgresql ]; 19 | buildPhase = '' 20 | USEPGXS=1 make DESTDIR=$out all 21 | ''; 22 | meta = defaultMeta; 23 | }; 24 | 25 | devShell.packages = pkgs: with pkgs; [ coreutils postgresql ]; 26 | }); 27 | } 28 | -------------------------------------------------------------------------------- /pgwrh.control: -------------------------------------------------------------------------------- 1 | # pgwrh 2 | # Copyright (C) 2024 Michal Kleczek 3 | 4 | # This program is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU Affero General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU Affero General Public License for more details. 13 | 14 | # You should have received a copy of the GNU Affero General Public License 15 | # along with this program. If not, see . 
16 | 17 | relocatable = false 18 | default_version = '0.2.1' 19 | 20 | schema = pgwrh 21 | requires = 'postgres_fdw,pg_background' 22 | -------------------------------------------------------------------------------- /src/common.sql: -------------------------------------------------------------------------------- 1 | -- name: common 2 | 3 | -- pgwrh 4 | -- Copyright (C) 2024 Michal Kleczek 5 | 6 | -- This program is free software: you can redistribute it and/or modify 7 | -- it under the terms of the GNU Affero General Public License as published by 8 | -- the Free Software Foundation, either version 3 of the License, or 9 | -- (at your option) any later version. 10 | 11 | -- This program is distributed in the hope that it will be useful, 12 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | -- GNU Affero General Public License for more details. 15 | 16 | -- You should have received a copy of the GNU Affero General Public License 17 | -- along with this program. If not, see . 18 | 19 | \echo Use "CREATE EXTENSION pgwrh CASCADE" to load this file. 
\quit 20 | 21 | GRANT USAGE ON SCHEMA "@extschema@" TO PUBLIC; 22 | 23 | CREATE FUNCTION pgwrh_replica_role_name() RETURNS text IMMUTABLE LANGUAGE sql AS 24 | $$ 25 | SELECT format('pgwrh_replica_%s', current_database()); 26 | $$; 27 | CREATE FUNCTION exec_dynamic(cmd text) RETURNS void LANGUAGE plpgsql AS 28 | $$ 29 | BEGIN 30 | EXECUTE cmd; 31 | END; 32 | $$; 33 | 34 | DO 35 | $$ 36 | DECLARE 37 | r record; 38 | BEGIN 39 | FOR r IN SELECT format('CREATE ROLE %I', pgwrh_replica_role_name()) AS stmt WHERE NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = pgwrh_replica_role_name()) LOOP 40 | EXECUTE r.stmt; 41 | END LOOP; 42 | END 43 | $$; 44 | 45 | CREATE OR REPLACE FUNCTION add_ext_dependency(_classid regclass, _objid oid) RETURNS void LANGUAGE sql AS 46 | $$ 47 | INSERT INTO pg_depend (classid, objid, refclassid, refobjid, deptype, objsubid, refobjsubid) 48 | SELECT _classid, _objid, 'pg_extension'::regclass, e.oid, 'n', 0 ,0 49 | FROM pg_extension e WHERE e.extname = 'pgwrh' 50 | $$; 51 | 52 | CREATE OR REPLACE FUNCTION select_add_ext_dependency(_classid regclass, oidexpr text) RETURNS text LANGUAGE sql AS 53 | $$SELECT format('SELECT "@extschema@".add_ext_dependency(%L, %s)', _classid, oidexpr)$$; 54 | 55 | CREATE OR REPLACE FUNCTION select_add_ext_dependency(_classid regclass, name_attr text, name text) RETURNS text LANGUAGE sql AS 56 | $$SELECT format('SELECT "@extschema@".add_ext_dependency(%1$L, (SELECT oid FROM %1$s WHERE %I = %L))', _classid, name_attr, name)$$; 57 | 58 | CREATE OR REPLACE FUNCTION is_dependent_object(_classid regclass, _objid oid) RETURNS boolean STABLE LANGUAGE sql AS 59 | $$ 60 | SELECT EXISTS (SELECT 1 FROM 61 | pg_depend 62 | JOIN pg_extension e ON refclassid = 'pg_extension'::regclass AND refobjid = e.oid 63 | WHERE 64 | e.extname = 'pgwrh' 65 | AND 66 | classid = _classid 67 | AND 68 | objid = _objid 69 | ) 70 | $$; 71 | 72 | CREATE VIEW owned_obj AS 73 | SELECT 74 | classid, 75 | objid 76 | FROM 77 | pg_depend d JOIN pg_extension 
e ON 78 | refclassid = 'pg_extension'::regclass 79 | AND refobjid = e.oid 80 | WHERE 81 | d.deptype = 'n' 82 | AND e.extname = 'pgwrh' 83 | ; 84 | 85 | CREATE VIEW owned_server AS 86 | SELECT 87 | s.* 88 | FROM 89 | pg_foreign_server s JOIN owned_obj ON 90 | classid = 'pg_foreign_server'::regclass 91 | AND objid = s.oid 92 | ; -------------------------------------------------------------------------------- /src/master/api-management.sql: -------------------------------------------------------------------------------- 1 | -- name: master-api-management 2 | -- requires: master-tables 3 | 4 | -- pgwrh 5 | -- Copyright (C) 2024 Michal Kleczek 6 | 7 | -- This program is free software: you can redistribute it and/or modify 8 | -- it under the terms of the GNU Affero General Public License as published by 9 | -- the Free Software Foundation, either version 3 of the License, or 10 | -- (at your option) any later version. 11 | 12 | -- This program is distributed in the hope that it will be useful, 13 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | -- GNU Affero General Public License for more details. 16 | 17 | -- You should have received a copy of the GNU Affero General Public License 18 | -- along with this program. If not, see . 
19 | 20 | CREATE FUNCTION start_rollout( 21 | _replication_group_id text) 22 | RETURNS void 23 | SET SEARCH_PATH FROM CURRENT 24 | LANGUAGE sql 25 | AS 26 | $$ 27 | INSERT INTO replication_group_config_lock (replication_group_id, version) 28 | SELECT 29 | replication_group_id, version 30 | FROM 31 | replication_group_config 32 | JOIN replication_group USING (replication_group_id) 33 | WHERE 34 | replication_group_id = $1 35 | AND current_version = target_version AND version <> current_version 36 | ON CONFLICT DO NOTHING; 37 | UPDATE replication_group g 38 | SET target_version = l.version 39 | FROM replication_group_config_lock l 40 | WHERE 41 | g.replication_group_id = l.replication_group_id 42 | AND l.version = next_version(current_version) 43 | AND g.replication_group_id = $1; 44 | $$; 45 | COMMENT ON FUNCTION start_rollout(_replication_group_id text) IS 46 | $$ 47 | Starts rollout of group's next configuration version. 48 | 49 | The new version is locked and marked as target version in replication_group record. 50 | If there is no new configuration version the function is a noop. 51 | 52 | # Parameters 53 | ## _replication_group_id 54 | Identifier of the replication group to start rollout. 55 | $$; 56 | 57 | CREATE FUNCTION create_replica_cluster( 58 | _replication_group_id text) 59 | RETURNS void 60 | SET SEARCH_PATH FROM CURRENT 61 | LANGUAGE sql 62 | AS 63 | $$ 64 | INSERT INTO replication_group (replication_group_id) 65 | VALUES ($1); 66 | $$; 67 | COMMENT ON FUNCTION create_replica_cluster(_replication_group_id text) IS 68 | $$ 69 | Creates new replica cluster. 
70 | $$; 71 | 72 | CREATE FUNCTION add_replica( 73 | _replication_group_id text, 74 | _replica_id text, 75 | _host_name text, 76 | _port int, 77 | _member_role regrole DEFAULT NULL, 78 | _availability_zone text DEFAULT 'default', 79 | _weight int DEFAULT 100) 80 | RETURNS void 81 | SET SEARCH_PATH FROM CURRENT 82 | LANGUAGE sql 83 | AS 84 | $$ 85 | WITH m AS ( 86 | INSERT INTO replication_group_member (replication_group_id, host_id, member_role, availability_zone) 87 | VALUES (_replication_group_id, _replica_id, coalesce(_member_role::text, _replica_id::regrole::text), _availability_zone) 88 | ), 89 | h AS ( 90 | INSERT INTO shard_host (replication_group_id, availability_zone, host_id, host_name, port) 91 | VALUES (_replication_group_id, _availability_zone, _replica_id, _host_name, _port) 92 | ) 93 | INSERT INTO shard_host_weight (replication_group_id, availability_zone, host_id, weight) 94 | VALUES (_replication_group_id, _availability_zone, _replica_id, _weight) 95 | $$; 96 | COMMENT ON FUNCTION add_replica(_replication_group_id text, _host_id text, _host_name text, _port int, _member_role regrole, _availability_zone text, _weight int) IS 97 | $$ 98 | Adds new replica to a cluster. 
99 | $$; 100 | 101 | CREATE FUNCTION set_replica_weight( 102 | _replication_group_id text, 103 | _availability_zone text, 104 | _replica_id text, 105 | _weight int) 106 | RETURNS void 107 | SET SEARCH_PATH FROM CURRENT 108 | LANGUAGE sql 109 | AS 110 | $$ 111 | INSERT INTO shard_host_weight (replication_group_id, availability_zone, host_id, weight) 112 | VALUES (_replication_group_id, _availability_zone, _replica_id, _weight) 113 | ON CONFLICT (replication_group_id, availability_zone, host_id, version) 114 | DO UPDATE SET weight = EXCLUDED.weight; 115 | $$; 116 | 117 | CREATE OR REPLACE FUNCTION commit_rollout( 118 | group_id text, keep_old_config boolean DEFAULT false) 119 | RETURNS void 120 | SET SEARCH_PATH FROM CURRENT 121 | LANGUAGE plpgsql 122 | AS 123 | $$ 124 | BEGIN 125 | UPDATE replication_group g 126 | SET current_version = target_version 127 | WHERE 128 | replication_group_id = group_id; 129 | DELETE FROM replication_group_config cfg 130 | USING replication_group g 131 | WHERE 132 | g.replication_group_id = cfg.replication_group_id 133 | AND g.replication_group_id = group_id 134 | AND cfg.version <> g.current_version 135 | AND NOT keep_old_config; 136 | END 137 | $$; 138 | COMMENT ON FUNCTION commit_rollout(group_id text, keep_old_config boolean) IS 139 | $$ 140 | Marks the version being rolled out as current. 141 | If any of the replicas did not report all remote and local shards as ready error is raised. 142 | 143 | # WARNING 144 | This is destructive operation. During rollout replicas maintain shards from both versions. 145 | After marking new version as current they will delete no longer needed shards. 
146 | $$; 147 | 148 | CREATE FUNCTION rollback_rollout(_replication_group_id text, unlock boolean DEFAULT TRUE) 149 | RETURNS void 150 | SET SEARCH_PATH FROM CURRENT 151 | LANGUAGE sql 152 | AS 153 | $$ 154 | UPDATE replication_group 155 | SET target_version = current_version 156 | WHERE replication_group_id = _replication_group_id; 157 | DELETE FROM replication_group_config_lock l 158 | USING replication_group g 159 | WHERE 160 | g.replication_group_id = _replication_group_id 161 | AND l.replication_group_id = g.replication_group_id 162 | AND l.version <> g.current_version 163 | AND unlock; 164 | $$; 165 | COMMENT ON FUNCTION rollback_rollout(_replication_group_id text, unlock boolean) IS 166 | $$ 167 | Rolls back any changes that are effects of roll out of new configuration version. 168 | Unlocks configuration version being rolled out. 169 | $$; -------------------------------------------------------------------------------- /src/master/api-replica.sql: -------------------------------------------------------------------------------- 1 | -- name: api-replica 2 | -- requires: master-implementation-views 3 | 4 | -- pgwrh 5 | -- Copyright (C) 2024 Michal Kleczek 6 | 7 | -- This program is free software: you can redistribute it and/or modify 8 | -- it under the terms of the GNU Affero General Public License as published by 9 | -- the Free Software Foundation, either version 3 of the License, or 10 | -- (at your option) any later version. 11 | 12 | -- This program is distributed in the hope that it will be useful, 13 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | -- GNU Affero General Public License for more details. 16 | 17 | -- You should have received a copy of the GNU Affero General Public License 18 | -- along with this program. If not, see . 
19 | 20 | /* For the member whose member_role is CURRENT_ROLE: one CREATE TABLE IF NOT EXISTS statement per relation in the partition trees rooted at the group's sharded tables (partitioned tables always, plain tables only when assigned in the current or target version). */ CREATE OR REPLACE VIEW shard_structure AS 21 | WITH stc AS ( 22 | SELECT 23 | st.replication_group_id, 24 | c.oid::regclass 25 | FROM 26 | pg_class c 27 | JOIN pg_namespace n ON relnamespace = n.oid 28 | JOIN sharded_table st ON (nspname, relname) = (sharded_table_schema, sharded_table_name) 29 | ), 30 | roots AS ( 31 | SELECT * 32 | FROM stc r 33 | WHERE NOT EXISTS (SELECT 1 FROM stc WHERE replication_group_id = r.replication_group_id AND oid <> r.oid AND oid = ANY (SELECT * FROM pg_partition_ancestors(r.oid))) 34 | ) 35 | SELECT 36 | n.nspname AS schema_name, 37 | c.relname AS table_name, 38 | level, 39 | format('CREATE TABLE IF NOT EXISTS %I.%I %s%s', 40 | n.nspname, c.relname, 41 | CASE WHEN level = 0 42 | -- root of the partition tree - need to define attributes 43 | THEN 44 | '(' || 45 | ( 46 | SELECT string_agg(format('%I %s', attname, format_type(atttypid, atttypmod)), ',' ORDER BY attnum) /* format_type keeps type modifiers such as varchar(10); ORDER BY attnum makes the column order deterministic */ 47 | FROM pg_attribute WHERE attrelid = t.relid AND attnum >= 1 AND NOT attisdropped /* dropped columns keep a pg_attribute row with atttypid = 0 and must not be emitted */ 48 | ) || 49 | coalesce( 50 | ', ' || (SELECT string_agg(pg_get_constraintdef(c.oid), ', ') FROM pg_constraint c WHERE conrelid = t.relid AND conislocal), 51 | '' 52 | ) || 53 | ')' 54 | -- partition - no attributes necessary 55 | ELSE 56 | format('PARTITION OF %I.%I%s %s', 57 | pn.nspname, p.relname, 58 | coalesce( 59 | ' (' || (SELECT string_agg(pg_get_constraintdef(c.oid), ', ') FROM pg_constraint c WHERE conrelid = t.relid AND conislocal) || ')', 60 | '' 61 | ), 62 | pg_get_expr(c.relpartbound, c.oid)) 63 | END, 64 | CASE WHEN t.isleaf 65 | THEN 66 | '' 67 | ELSE 68 | ' PARTITION BY ' || pg_get_partkeydef(t.relid) 69 | END 70 | ) AS create_table 71 | FROM 72 | roots r 73 | JOIN replication_group_member m USING (replication_group_id) 74 | JOIN replication_group USING (replication_group_id), 75 | pg_partition_tree(oid) t 76 | JOIN pg_class c ON t.relid = c.oid JOIN pg_namespace n ON c.relnamespace = n.oid 77 | LEFT JOIN pg_class p ON t.parentrelid = p.oid LEFT JOIN pg_namespace pn ON p.relnamespace = pn.oid 78 | WHERE 79 | ( 80 
| c.relkind = 'p' 81 | OR 82 | c.relkind = 'r' 83 | AND 84 | EXISTS (SELECT 1 FROM 85 | shard_assigned_host 86 | WHERE 87 | replication_group_id = m.replication_group_id 88 | AND 89 | schema_name = n.nspname AND table_name = c.relname 90 | AND 91 | version IN (current_version, target_version) 92 | ) 93 | ) 94 | AND 95 | member_role = CURRENT_ROLE; 96 | 97 | GRANT SELECT ON shard_structure TO PUBLIC; 98 | 99 | 100 | CREATE OR REPLACE VIEW shard_assignment AS 101 | SELECT 102 | schema_name, 103 | table_name, 104 | local, 105 | shard_server_name, 106 | host, 107 | port, 108 | dbname, 109 | shard_server_user, 110 | pubname, 111 | connect_remote, 112 | retained_shard_server_name 113 | FROM 114 | shard_assignment_per_member 115 | WHERE 116 | member_role = CURRENT_ROLE 117 | ; 118 | GRANT SELECT ON shard_assignment TO PUBLIC; 119 | 120 | COMMENT ON VIEW shard_assignment IS 121 | 'Main view implementing shard assignment logic. 122 | 123 | Presents a particular replication_group_member (as identified by member_role) view of the cluster (replication_group). 
124 | Each member sees all shards with the following information for each shard: 125 | * "local" flag saying if this shard should be replicated to this member 126 | * information on how to connect to remote replicas for this shard: host, port, dbname, user, password'; 127 | 128 | CREATE OR REPLACE VIEW shard_index AS 129 | SELECT 130 | schema_name, 131 | table_name, 132 | index_name, 133 | index_template, 134 | optional 135 | FROM 136 | shard_index_per_member 137 | WHERE 138 | member_role = CURRENT_ROLE 139 | ; 140 | GRANT SELECT ON shard_index TO PUBLIC; 141 | 142 | CREATE VIEW replica_state AS 143 | SELECT 144 | subscribed_local_shards, 145 | indexes, 146 | connected_local_shards, 147 | connected_remote_shards, 148 | users 149 | FROM replication_group_member 150 | WHERE 151 | member_role = CURRENT_ROLE 152 | ; 153 | 154 | -- CREATE FUNCTION update_replica_state() RETURNS trigger LANGUAGE plpgsql AS 155 | -- $$ 156 | -- BEGIN 157 | -- INSERT INTO replica_state_per_member (member_role, subscribed_local_shards, indexes, connected_local_shards, connected_remote_shards) 158 | -- VALUES (CURRENT_ROLE, NEW.subscribed_local_shards, NEW.indexes, NEW.connected_local_shards, NEW.connected_remote_shards) 159 | -- ON CONFLICT (member_role) DO UPDATE SET 160 | -- subscribed_local_shards = REJECTED.subscribed_local_shards, 161 | -- indexes = REJECTED.indexes, 162 | -- connected_local_shards = REJECTED.connected_local_shards, 163 | -- connected_remote_shards = REJECTED.connected_remote_shards; 164 | -- RETURN NEW; 165 | -- END 166 | -- $$; 167 | -- CREATE TRIGGER update_replica_state_trigger INSTEAD OF INSERT OR UPDATE ON replica_state FOR EACH ROW EXECUTE FUNCTION update_replica_state(); 168 | GRANT SELECT, INSERT, UPDATE ON replica_state TO PUBLIC; 169 | 170 | CREATE VIEW credentials AS 171 | SELECT 172 | creds.username, 173 | creds.password 174 | FROM 175 | replication_group_member 176 | JOIN replication_group_credentials creds USING (replication_group_id) 177 | WHERE 178 | 
member_role = CURRENT_ROLE; 179 | GRANT SELECT ON credentials TO PUBLIC; 180 | -------------------------------------------------------------------------------- /src/master/deps.txt: -------------------------------------------------------------------------------- 1 | implementation-views api-replica 2 | tables ext-config-dump 3 | tables helpers 4 | helpers implementation-views 5 | helpers publication-sync 6 | publication-sync triggers 7 | snapshot triggers 8 | tables api-management 9 | implementation-views snapshot 10 | tables monitoring 11 | -------------------------------------------------------------------------------- /src/master/ext-config-dump.sql: -------------------------------------------------------------------------------- 1 | -- name: ext-config-dump 2 | -- requires: tables 3 | 4 | SELECT pg_catalog.pg_extension_config_dump('replication_group_config_lock', ''); 5 | SELECT pg_catalog.pg_extension_config_dump('replication_group_config_clone', ''); 6 | SELECT pg_catalog.pg_extension_config_dump('replication_group_config', ''); 7 | SELECT pg_catalog.pg_extension_config_dump('replication_group', ''); 8 | SELECT pg_catalog.pg_extension_config_dump('replication_group_member', ''); 9 | SELECT pg_catalog.pg_extension_config_dump('shard_host', ''); 10 | SELECT pg_catalog.pg_extension_config_dump('shard_host_weight', ''); 11 | SELECT pg_catalog.pg_extension_config_dump('sharded_table', ''); 12 | SELECT pg_catalog.pg_extension_config_dump('shard_index_template', ''); 13 | SELECT pg_catalog.pg_extension_config_dump('shard', ''); 14 | SELECT pg_catalog.pg_extension_config_dump('shard_assigned_host', ''); 15 | SELECT pg_catalog.pg_extension_config_dump('shard_assigned_index', ''); 16 | -------------------------------------------------------------------------------- /src/master/helpers.sql: -------------------------------------------------------------------------------- 1 | -- name: master-helpers 2 | -- requires: tables 3 | 4 | CREATE FUNCTION pubname(schema_name text, 
table_name text) RETURNS text IMMUTABLE LANGUAGE sql AS 5 | $$ 6 | SELECT 'pgwrh_' || md5(schema_name || table_name); 7 | $$; 8 | GRANT EXECUTE ON FUNCTION pubname(schema_name text, table_name text) TO PUBLIC; 9 | 10 | CREATE FUNCTION usernamegen(replication_group_id text, version config_version, seed uuid) 11 | RETURNS text 12 | IMMUTABLE 13 | LANGUAGE sql 14 | AS 15 | $$ 16 | SELECT 'pgwrh_' || current_database() || '_' || replication_group_id || '_' || right(md5(version || seed::text), 5); 17 | $$; 18 | GRANT EXECUTE ON FUNCTION usernamegen(replication_group_id text, version config_version, seed uuid) TO PUBLIC; 19 | CREATE FUNCTION passgen(replication_group_id text, version config_version, seed uuid) 20 | RETURNS text 21 | IMMUTABLE 22 | LANGUAGE sql 23 | AS 24 | $$ 25 | SELECT encode(sha256(convert_to(replication_group_id || version || seed::text, 'UTF8')), 'hex'); 26 | $$; 27 | GRANT EXECUTE ON FUNCTION passgen(replication_group_id text, version config_version, seed uuid) TO PUBLIC; 28 | 29 | CREATE OR REPLACE FUNCTION next_version(version config_version) RETURNS config_version 30 | IMMUTABLE 31 | LANGUAGE sql AS 32 | $$ 33 | SELECT CASE version WHEN 'FLIP' THEN 'FLOP' ELSE 'FLIP' END::"@extschema@".config_version 34 | $$; 35 | 36 | 37 | CREATE OR REPLACE FUNCTION prev_version(version config_version) RETURNS config_version 38 | IMMUTABLE 39 | LANGUAGE sql AS 40 | $$ 41 | SELECT "@extschema@".next_version(version) 42 | $$; 43 | 44 | CREATE OR REPLACE FUNCTION is_locked(group_id text, version config_version) RETURNS boolean 45 | LANGUAGE sql STABLE AS 46 | $$ 47 | SELECT EXISTS (SELECT 1 FROM 48 | "@extschema@".replication_group_config_lock l 49 | WHERE replication_group_id = $1 AND l.version = $2 50 | ) 51 | $$; 52 | 53 | 54 | CREATE OR REPLACE FUNCTION next_pending_version(group_id text) RETURNS config_version 55 | LANGUAGE sql AS 56 | $$ 57 | INSERT INTO "@extschema@".replication_group_config 58 | SELECT 59 | replication_group_id, 
"@extschema@".next_version(current_version) 60 | FROM 61 | "@extschema@".replication_group 62 | WHERE 63 | replication_group_id = group_id 64 | ON CONFLICT DO NOTHING; 65 | 66 | SELECT 67 | "@extschema@".next_version(current_version) 68 | FROM 69 | "@extschema@".replication_group 70 | WHERE 71 | replication_group_id = group_id 72 | $$; 73 | COMMENT ON FUNCTION next_pending_version(group_id text) IS 74 | 'Inserts next pending version into replication_group_config and returns it.'; 75 | 76 | 77 | CREATE OR REPLACE FUNCTION stable_hash(VARIADIC text[]) RETURNS int IMMUTABLE LANGUAGE sql AS 78 | $$ 79 | SELECT ('x' || substr(md5(array_to_string($1, '', '')), 1, 8))::bit(32)::int 80 | $$; 81 | 82 | CREATE OR REPLACE FUNCTION score(weight int, VARIADIC text[]) RETURNS double precision IMMUTABLE LANGUAGE sql AS 83 | $$ 84 | SELECT weight / -ln("@extschema@".stable_hash(VARIADIC $2)::double precision / ((2147483649)::bigint - (-2147483648)::bigint) + 0.5::double precision) 85 | $$; 86 | 87 | CREATE OR REPLACE FUNCTION extract_sharding_key_value(schema_name text, table_name text, sharding_key_expression text) RETURNS text IMMUTABLE LANGUAGE plpgsql AS 88 | $$ 89 | DECLARE 90 | result text; 91 | BEGIN 92 | EXECUTE sharding_key_expression INTO result USING schema_name, table_name; 93 | RETURN result; 94 | END 95 | $$; 96 | 97 | /* Resolves a sharded_table row to the regclass of the table it names; yields NULL when no such relation exists. */ CREATE OR REPLACE FUNCTION to_regclass(st sharded_table) RETURNS regclass STABLE LANGUAGE sql AS 98 | $$ 99 | /* %I quotes each identifier so mixed-case or special-character names are not case-folded or mis-parsed */ SELECT to_regclass(format('%I.%I', st.sharded_table_schema, 
st.sharded_table_name)) 100 | $$; 101 | -------------------------------------------------------------------------------- /src/master/implementation-views.sql: -------------------------------------------------------------------------------- 1 | -- name: master-implementation-views 2 | -- requires: core 3 | 4 | CREATE VIEW shard_index_definition AS 5 | SELECT 6 | replication_group_id, 7 | version, 8 | schema_name, 9 | table_name, 10 | table_name 11 | || '_' 12 | || index_template_name 13 | || '_' 14 | || substr(md5(index_template_schema || index_template_table_name || index_template), 1, 5) AS index_name, 15 | index_template 16 | FROM 17 | shard_assigned_index 18 | JOIN shard_index_template USING (replication_group_id, version, index_template_schema, index_template_table_name, index_template_name) 19 | ; 20 | 21 | CREATE OR REPLACE VIEW shard_index_per_member AS 22 | WITH shard_class_index AS ( 23 | SELECT 24 | replication_group_id, 25 | schema_name, 26 | table_name, 27 | index_name, 28 | index_template, 29 | bool_or(version = current_version) is_current, 30 | bool_or(version = target_version AND current_version <> target_version) AS is_target 31 | FROM 32 | shard_index_definition 33 | JOIN replication_group USING (replication_group_id) 34 | GROUP BY 35 | 1, 2, 3, 4, 5 36 | ), 37 | member_shard AS ( 38 | SELECT 39 | replication_group_id, 40 | member_role, 41 | schema_name, 42 | table_name, 43 | bool_or(version = current_version) is_current, 44 | bool_or(version = target_version AND current_version <> target_version) is_target 45 | FROM 46 | shard_assigned_host 47 | JOIN replication_group USING (replication_group_id) 48 | JOIN replication_group_member USING (replication_group_id, availability_zone, host_id) 49 | GROUP BY 50 | 1, 2, 3, 4 51 | ), 52 | member_shard_index AS ( 53 | SELECT 54 | replication_group_id, 55 | member_role, 56 | schema_name, 57 | table_name, 58 | index_name, 59 | index_template, 60 | s.is_current AS optional 61 | FROM 62 | member_shard s 63 | 
JOIN shard_class_index i USING (replication_group_id, schema_name, table_name) 64 | WHERE 65 | s.is_current AND i.is_current 66 | OR 67 | s.is_target AND i.is_target 68 | ) 69 | SELECT 70 | replication_group_id, 71 | member_role, 72 | schema_name, 73 | table_name, 74 | index_name, 75 | index_template, 76 | optional 77 | FROM 78 | member_shard_index 79 | ; 80 | COMMENT ON VIEW shard_index_per_member IS 81 | 'Provides definitions of indexes that should be created for each shard.'; 82 | 83 | CREATE VIEW replication_group_credentials AS 84 | SELECT 85 | replication_group_id, 86 | version, 87 | usernamegen(replication_group_id, version, seed) AS username, 88 | passgen(replication_group_id, version, seed) AS password 89 | FROM 90 | replication_group_config_lock 91 | ; 92 | 93 | CREATE OR REPLACE VIEW shard_assignment_per_member AS 94 | SELECT 95 | replication_group_id, 96 | availability_zone, 97 | host_id, 98 | member_role, 99 | schema_name, 100 | table_name, 101 | local, 102 | -- foreign server hosting shard 103 | -- use target configuration server only when transitioning and all remote replicas subscribed to the shard (ie. 
we can run ANALYZE) 104 | CASE WHEN current_version <> target_version AND target_subscribed AND target_online AND target_user_created 105 | THEN target_server_name 106 | ELSE current_server_name 107 | END AS shard_server_name, 108 | CASE WHEN current_version <> target_version AND target_subscribed AND target_online AND target_user_created 109 | THEN target_host 110 | ELSE coalesce(current_host, '') 111 | END AS host, 112 | CASE WHEN current_version <> target_version AND target_subscribed AND target_online AND target_user_created 113 | THEN target_port 114 | ELSE coalesce(current_port, '') 115 | END AS port, 116 | current_database() AS dbname, 117 | CASE WHEN current_version <> target_version AND target_subscribed AND target_online AND target_user_created 118 | THEN target_credentials.username 119 | ELSE current_username 120 | END AS shard_server_user, 121 | -- If shard is remote in target version, and it is ready, connect it to slot instead of the local one 122 | -- (but keep the local one if it is still be marked as "local" above) 123 | CASE WHEN current_version <> target_version 124 | THEN target_remote AND target_subscribed AND target_online AND target_user_created 125 | ELSE NOT local 126 | END AS connect_remote, 127 | pubname(schema_name, table_name) AS pubname, 128 | current_server_name AS retained_shard_server_name, -- do not drop foreign tables with this server name (to keep current tables during transition) 129 | --local AND hosted_shard_subscribed_confirmation IS NULL AS subscription_confirmation_required -- whether confirmation from this member is required 130 | m AS replication_group_member 131 | FROM 132 | replication_group_member m 133 | JOIN replication_group g USING (replication_group_id) 134 | JOIN replication_group_credentials current_credentials USING (replication_group_id) 135 | JOIN replication_group_credentials target_credentials USING (replication_group_id) 136 | CROSS JOIN LATERAL ( 137 | SELECT 138 | schema_name, 139 | table_name, 140 | -- 
is m among assigned hosts regardless of version 141 | -- every host has to retain shards from both current and target version 142 | bool_or(member_role = m.member_role) AS local, 143 | bool_and(member_role <> m.member_role) 144 | FILTER ( WHERE version = target_version) AS target_remote, 145 | -- server names are independent of shard 146 | md5(string_agg(sah.availability_zone || sah.host_id, ',' ORDER BY sah.availability_zone, sah.host_id) 147 | FILTER (WHERE member_role <> m.member_role AND version = current_version)) AS current_server_name, 148 | md5(string_agg(sah.availability_zone || sah.host_id, ',' ORDER BY sah.availability_zone, sah.host_id) 149 | FILTER (WHERE member_role <> m.member_role AND version = target_version)) AS target_server_name, 150 | -- is any of target version hosts online? 151 | bool_or(online) FILTER (WHERE member_role <> m.member_role AND version = target_version) AS target_online, 152 | -- status of this particular shard 153 | -- did all target hosts confirmed subscription (so that clients can execute analyze) 154 | bool_and(subscribes_local_shard) 155 | FILTER (WHERE member_role <> m.member_role AND version = target_version) AS target_subscribed, 156 | -- did all target version hosts confirm target version indexes (so that clients can expose them as foreign tables) 157 | -- we want to avoid situation when clients issue queries to hosts that don't have required indexes 158 | -- as that might disrupt whole cluster due to slow queries, that in turn cause 159 | -- a) high resource usage and cache thrashing 160 | -- b) exhausted connection pools 161 | bool_and(has_all_indexes) 162 | FILTER (WHERE member_role <> m.member_role AND version = target_version) AS target_indexed, 163 | -- If all current hosts confirmed creation of target version user 164 | -- then we rotate credentials 165 | CASE WHEN bool_and(target_user_created) FILTER ( WHERE member_role <> m.member_role AND version = current_version) 166 | THEN target_credentials.username 167 | 
ELSE current_credentials.username 168 | END AS current_username, 169 | bool_and(target_user_created) 170 | FILTER ( WHERE member_role <> m.member_role AND version = target_version) AS target_user_created 171 | FROM 172 | shard_assigned_host sah 173 | JOIN shard_host USING (replication_group_id, availability_zone, host_id) 174 | JOIN replication_group_member USING (replication_group_id, availability_zone, host_id) 175 | -- check if all required indexes are created 176 | CROSS JOIN LATERAL (SELECT NOT EXISTS (SELECT 1 FROM 177 | shard_index_definition i 178 | WHERE 179 | ( i.replication_group_id, i.version, i.schema_name, i.table_name) = 180 | ( sah.replication_group_id, sah.version, sah.schema_name, sah.table_name) 181 | AND 182 | NOT EXISTS (SELECT 1 FROM 183 | json_to_recordset(indexes) AS mi(schema_name text, index_name text) 184 | WHERE 185 | ( schema_name, index_name) = 186 | ( i.schema_name, i.index_name) 187 | ) 188 | )) i(has_all_indexes) 189 | -- check if shard is subscribed 190 | CROSS JOIN LATERAL ( 191 | SELECT EXISTS (SELECT 1 FROM 192 | json_to_recordset(subscribed_local_shards) AS t(schema_name text, table_name text) 193 | WHERE 194 | ( schema_name, table_name) = 195 | (sah.schema_name, sah.table_name) 196 | )) s(subscribes_local_shard) 197 | CROSS JOIN LATERAL ( 198 | SELECT EXISTS (SELECT 1 FROM 199 | json_array_elements_text(users) AS t(username) 200 | WHERE username = target_credentials.username) 201 | ) u(target_user_created) 202 | WHERE 203 | sah.replication_group_id = m.replication_group_id 204 | AND 205 | version IN (current_version, target_version) 206 | GROUP BY 207 | 1, 2 208 | ) s 209 | -- calculate current version foreign server host and port based on _online_ assigned hosts and this member availability zone 210 | LEFT JOIN LATERAL ( 211 | SELECT 212 | schema_name, 213 | table_name, 214 | string_agg(host_name, ',' ORDER BY sah.host_id) AS current_host, 215 | string_agg(port::text, ',' ORDER BY sah.host_id) AS current_port 216 | FROM 217 | 
shard_assigned_host sah 218 | JOIN shard_host USING (replication_group_id, availability_zone, host_id) 219 | JOIN replication_group_member shm USING (replication_group_id, availability_zone, host_id), 220 | -- multiply hosts in the same availability zone by same_zone_multiplier 221 | generate_series(1, CASE WHEN m.availability_zone = sah.availability_zone THEN m.same_zone_multiplier ELSE 1 END) 222 | WHERE 223 | sah.replication_group_id = m.replication_group_id 224 | AND 225 | version = current_version 226 | AND 227 | (availability_zone, host_id) <> (m.availability_zone, m.host_id) 228 | AND 229 | online 230 | AND 231 | -- isolate hosts that for some reason are missing current version indexes 232 | -- condition is: 233 | -- there are no current version indexes that this host did not report 234 | -- ideally we could use a function, but it is problematic due to permissions 235 | NOT EXISTS (SELECT 1 FROM 236 | shard_assigned_host 237 | JOIN shard_index_definition i USING (replication_group_id, version, schema_name, table_name) 238 | WHERE 239 | (availability_zone, host_id) = (sah.availability_zone, sah.host_id) 240 | AND version = g.current_version 241 | AND NOT EXISTS (SELECT 1 FROM 242 | json_to_recordset(shm.indexes) AS mi(schema_name text, index_name text) 243 | WHERE 244 | ( schema_name, index_name) = 245 | ( i.schema_name, i.index_name) 246 | ) 247 | ) 248 | GROUP BY 249 | 1, 2 250 | ) current_host_port USING (schema_name, table_name) 251 | LEFT JOIN LATERAL ( 252 | SELECT 253 | schema_name, 254 | table_name, 255 | string_agg(host_name, ',' ORDER BY sah.host_id) AS target_host, 256 | string_agg(port::text, ',' ORDER BY sah.host_id) AS target_port 257 | FROM 258 | shard_assigned_host sah 259 | JOIN shard_host USING (replication_group_id, availability_zone, host_id), 260 | -- multiply hosts in the same availability zone by same_zone_multiplier 261 | generate_series(1, CASE WHEN m.availability_zone = sah.availability_zone THEN m.same_zone_multiplier ELSE 1 END) 
262 | WHERE 263 | sah.replication_group_id = m.replication_group_id 264 | AND 265 | version = target_version 266 | AND 267 | (availability_zone, host_id) <> (m.availability_zone, m.host_id) 268 | AND 269 | online 270 | GROUP BY 271 | 1, 2 272 | ) target_host_port USING (schema_name, table_name) 273 | WHERE 274 | current_credentials.version = current_version 275 | AND target_credentials.version = target_version 276 | ; 277 | 278 | CREATE VIEW missing_subscribed_shard AS 279 | SELECT 280 | replication_group_id, version, availability_zone, host_id, schema_name, table_name 281 | FROM 282 | shard_assigned_host a 283 | JOIN replication_group_member USING (replication_group_id, availability_zone, host_id) 284 | WHERE 285 | NOT EXISTS (SELECT 1 FROM 286 | json_to_recordset(subscribed_local_shards) AS c(schema_name text, table_name text) 287 | WHERE (schema_name, table_name) = (a.schema_name, a.table_name) 288 | ) 289 | ; 290 | 291 | CREATE VIEW missing_connected_local_shard AS 292 | SELECT 293 | replication_group_id, version, availability_zone, host_id, schema_name, table_name 294 | FROM 295 | shard_assigned_host a 296 | JOIN replication_group_member USING (replication_group_id, availability_zone, host_id) 297 | WHERE 298 | NOT EXISTS (SELECT 1 FROM 299 | json_to_recordset(connected_local_shards) AS c(schema_name text, table_name text) 300 | WHERE (schema_name, table_name) = (a.schema_name, a.table_name) 301 | ) 302 | ; 303 | 304 | CREATE VIEW missing_connected_remote_shard AS 305 | WITH remote_shard AS ( 306 | SELECT 307 | m.*, 308 | version, 309 | schema_name, 310 | table_name 311 | FROM 312 | replication_group_member m 313 | JOIN shard ms USING (replication_group_id) 314 | WHERE 315 | NOT EXISTS (SELECT 1 FROM shard_assigned_host WHERE 316 | ( replication_group_id, version, availability_zone, host_id, schema_name, table_name) = 317 | (m.replication_group_id, ms.version, m.availability_zone, m.host_id, ms.schema_name, ms.table_name)) 318 | ) 319 | SELECT 320 | 
replication_group_id, version, availability_zone, host_id, schema_name, table_name 321 | FROM 322 | remote_shard s 323 | WHERE 324 | NOT EXISTS (SELECT 1 FROM 325 | json_to_recordset(connected_remote_shards) AS c(schema_name text, table_name text) 326 | WHERE (schema_name, table_name) = (s.schema_name, s.table_name) 327 | ) 328 | ; 329 | -------------------------------------------------------------------------------- /src/master/monitoring.sql: -------------------------------------------------------------------------------- 1 | -- name: master-monitoring 2 | -- requires: tables 3 | 4 | CREATE VIEW replication_status AS 5 | WITH sessions AS ( 6 | SELECT 7 | usename AS member_role, count(*) AS num_sessions 8 | FROM 9 | pg_stat_activity 10 | GROUP BY usename 11 | ) 12 | SELECT 13 | replication_group_id, 14 | availability_zone, 15 | host_id, 16 | pg_size_pretty(pg_current_wal_lsn() - confirmed_flush_lsn) AS lag, 17 | coalesce(num_sessions, 0) AS num_sessions 18 | FROM 19 | replication_group_member m 20 | LEFT JOIN pg_replication_slots ON 21 | array_to_string(trim_array(regexp_split_to_array(slot_name, '_'), 1), '_') = m.member_role 22 | LEFT JOIN sessions USING (member_role) 23 | ; 24 | COMMENT ON VIEW replication_status IS 25 | $$ 26 | Shows replication status of all replicas. 
27 | $$; 28 | -------------------------------------------------------------------------------- /src/master/publication-sync.sql: -------------------------------------------------------------------------------- 1 | -- name: publication-sync 2 | -- requires: core 3 | -- requires: master-helpers 4 | 5 | CREATE PUBLICATION pgwrh_controller_ping FOR TABLE ping WITH (PUBLISH = 'insert'); 6 | SELECT add_ext_dependency('pg_publication', (SELECT oid FROM pg_publication WHERE pubname = 'pgwrh_controller_ping')); 7 | 8 | CREATE OR REPLACE FUNCTION sync_publications() RETURNS void 9 | SET SEARCH_PATH FROM CURRENT 10 | LANGUAGE plpgsql AS 11 | $$DECLARE 12 | r record; 13 | BEGIN 14 | FOR r IN 15 | SELECT format('CREATE PUBLICATION %I FOR TABLE %s WITH ( publish = %L )', 16 | pubname, 17 | c.oid::regclass, 18 | 'insert,update,delete') stmt, 19 | pubname 20 | FROM 21 | pg_class c 22 | JOIN pg_namespace n ON c.relnamespace = n.oid, 23 | pubname(nspname, relname) AS pubname 24 | WHERE 25 | EXISTS (SELECT 1 FROM 26 | shard 27 | JOIN replication_group USING (replication_group_id) 28 | WHERE 29 | (schema_name, table_name) = (nspname, relname) 30 | AND 31 | version IN (current_version, target_version) 32 | ) 33 | AND 34 | NOT EXISTS (SELECT 1 FROM 35 | pg_publication_rel 36 | WHERE 37 | prrelid = c.oid 38 | AND 39 | is_dependent_object('pg_publication', prpubid) 40 | ) 41 | LOOP 42 | EXECUTE r.stmt; 43 | PERFORM add_ext_dependency('pg_publication', (SELECT oid FROM pg_publication WHERE pubname = r.pubname::text)); 44 | END LOOP; 45 | FOR r IN 46 | SELECT format('DROP PUBLICATION %I CASCADE', 47 | pubname) stmt 48 | FROM 49 | pg_publication p 50 | WHERE 51 | is_dependent_object('pg_publication', oid) 52 | AND 53 | pubname NOT IN ('pgwrh_controller_ping') 54 | AND 55 | NOT EXISTS (SELECT 1 FROM 56 | shard s 57 | JOIN replication_group USING (replication_group_id) 58 | WHERE 59 | version IN (current_version, target_version) 60 | AND pubname(schema_name, table_name) = p.pubname 61 | ) 62 | 
LOOP 63 | EXECUTE r.stmt; 64 | END LOOP; 65 | RETURN; 66 | END 67 | $$; 68 | 69 | CREATE OR REPLACE FUNCTION sync_publications_trigger() RETURNS TRIGGER 70 | SET SEARCH_PATH FROM CURRENT 71 | LANGUAGE plpgsql AS 72 | $$BEGIN 73 | PERFORM sync_publications(); 74 | RETURN NULL; 75 | END$$; 76 | 77 | CREATE OR REPLACE TRIGGER sync_publications AFTER INSERT OR UPDATE OR DELETE OR TRUNCATE ON replication_group 78 | FOR EACH STATEMENT EXECUTE FUNCTION sync_publications_trigger(); 79 | -------------------------------------------------------------------------------- /src/master/snapshot.sql: -------------------------------------------------------------------------------- 1 | -- name: master-snapshot 2 | -- requires: master-implementation-views 3 | 4 | -- pgwrh 5 | -- Copyright (C) 2024 Michal Kleczek 6 | 7 | -- This program is free software: you can redistribute it and/or modify 8 | -- it under the terms of the GNU Affero General Public License as published by 9 | -- the Free Software Foundation, either version 3 of the License, or 10 | -- (at your option) any later version. 11 | 12 | -- This program is distributed in the hope that it will be useful, 13 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | -- GNU Affero General Public License for more details. 16 | 17 | -- You should have received a copy of the GNU Affero General Public License 18 | -- along with this program. If not, see . 
19 | 20 | CREATE FUNCTION replication_group_config_snapshot(_replication_group_id text, _version config_version) 21 | RETURNS void 22 | SET SEARCH_PATH FROM CURRENT 23 | LANGUAGE sql 24 | AS 25 | $$ 26 | WITH sharded_pg_class AS ( 27 | SELECT 28 | c.oid::regclass, 29 | st.replication_group_id, 30 | version, 31 | sharded_table_schema, 32 | sharded_table_name, 33 | replication_factor, 34 | sharding_key_expression 35 | FROM 36 | pg_class c 37 | JOIN pg_namespace n ON relnamespace = n.oid 38 | JOIN sharded_table st ON (nspname, relname) = (sharded_table_schema, sharded_table_name) 39 | WHERE 40 | (replication_group_id, version) = (_replication_group_id, _version) 41 | ), 42 | shard_snapshot AS ( 43 | SELECT 44 | c.oid, 45 | st.replication_group_id, 46 | version, 47 | nspname AS schema_name, 48 | relname AS table_name, 49 | st.sharded_table_schema, 50 | st.sharded_table_name, 51 | replication_factor, 52 | "@extschema@".extract_sharding_key_value( 53 | nspname, 54 | relname, 55 | sharding_key_expression) AS sharding_key_value 56 | FROM 57 | pg_class c 58 | JOIN pg_namespace n ON n.oid = relnamespace 59 | JOIN sharded_pg_class st ON 60 | st.oid = ANY ( 61 | SELECT * FROM pg_partition_ancestors(c.oid) 62 | ) 63 | AND 64 | NOT EXISTS (SELECT 1 FROM 65 | sharded_pg_class des 66 | WHERE 67 | (des.replication_group_id, des.version) = (st.replication_group_id, st.version) 68 | AND des.oid = ANY (SELECT * FROM pg_partition_ancestors(c.oid)) 69 | AND des.oid <> st.oid 70 | AND st.oid = ANY (SELECT * FROM pg_partition_ancestors(des.oid)) 71 | ) 72 | WHERE 73 | c.relkind = 'r' 74 | ), 75 | saved_shard AS ( 76 | INSERT INTO shard 77 | (replication_group_id, version, schema_name, table_name, sharded_table_schema, sharded_table_name) 78 | SELECT 79 | replication_group_id, 80 | version, 81 | schema_name, 82 | table_name, 83 | sharded_table_schema, 84 | sharded_table_name 85 | FROM 86 | shard_snapshot 87 | ), 88 | saved_index AS ( 89 | INSERT INTO shard_assigned_index 90 | 
(replication_group_id, version, schema_name, table_name, index_template_schema, index_template_table_name, index_template_name) 91 | SELECT 92 | replication_group_id, version, ss.schema_name, ss.table_name, t.index_template_schema, t.index_template_table_name, t.index_template_name 93 | FROM 94 | shard_snapshot ss 95 | JOIN shard_index_template t USING (replication_group_id, version) 96 | JOIN pg_namespace itn ON itn.nspname = t.index_template_schema 97 | JOIN pg_class itc ON itc.relnamespace = itn.oid AND itc.relname = t.index_template_table_name 98 | WHERE 99 | itc.oid = ANY (SELECT * FROM pg_partition_ancestors(ss.oid)) 100 | ), 101 | group_counts AS ( 102 | SELECT 103 | replication_group_id, 104 | version, 105 | count(DISTINCT availability_zone) AS az_count, 106 | count(*) AS host_count 107 | FROM 108 | shard_host_weight 109 | GROUP BY 110 | 1, 2 111 | ), 112 | replicated_shard AS ( 113 | SELECT 114 | replication_group_id, 115 | version, 116 | schema_name, 117 | table_name, 118 | greatest( 119 | ceil((replication_factor * host_count) / 100), 120 | least(min_replica_count, host_count), 121 | least(min_replica_count_per_availability_zone * az_count, host_count)) AS replica_count, 122 | sharding_key_value 123 | FROM 124 | shard_snapshot sc 125 | JOIN replication_group_config USING (replication_group_id, version) 126 | JOIN group_counts USING (replication_group_id, version) 127 | ) 128 | INSERT INTO shard_assigned_host (replication_group_id, version, schema_name, table_name, availability_zone, host_id) 129 | SELECT 130 | replication_group_id, 131 | version, 132 | schema_name, 133 | table_name, 134 | availability_zone, 135 | host_id 136 | FROM 137 | replicated_shard s 138 | CROSS JOIN LATERAL ( 139 | SELECT 140 | availability_zone, 141 | host_id, 142 | row_number() OVER ( 143 | PARTITION BY availability_zone 144 | ORDER BY "@extschema@".score(weight, sharding_key_value, host_id) DESC) AS group_rank 145 | FROM 146 | shard_host_weight 147 | JOIN shard_host USING 
(replication_group_id, availability_zone, host_id) 148 | JOIN replication_group_member m USING (replication_group_id, availability_zone, host_id) 149 | WHERE 150 | (replication_group_id, version) = (s.replication_group_id, s.version) 151 | ORDER BY 152 | group_rank, "@extschema@".score(100, sharding_key_value, availability_zone) DESC 153 | LIMIT 154 | s.replica_count 155 | ) h 156 | $$; 157 | -------------------------------------------------------------------------------- /src/master/tables.sql: -------------------------------------------------------------------------------- 1 | -- name: tables 2 | -- requires: common 3 | 4 | -- pgwrh 5 | -- Copyright (C) 2024 Michal Kleczek 6 | 7 | -- This program is free software: you can redistribute it and/or modify 8 | -- it under the terms of the GNU Affero General Public License as published by 9 | -- the Free Software Foundation, either version 3 of the License, or 10 | -- (at your option) any later version. 11 | 12 | -- This program is distributed in the hope that it will be useful, 13 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | -- GNU Affero General Public License for more details. 16 | 17 | -- You should have received a copy of the GNU Affero General Public License 18 | -- along with this program. If not, see https://www.gnu.org/licenses/. 19 | 20 | CREATE TYPE config_version AS ENUM ('FLIP', 'FLOP'); 21 | COMMENT ON TYPE config_version IS 22 | 'A FLIP/FLOP enum to use as configuration version identifier.'; 23 | 24 | CREATE TABLE replication_group ( 25 | replication_group_id text NOT NULL PRIMARY KEY, 26 | current_version config_version NOT NULL DEFAULT 'FLIP', 27 | target_version config_version NOT NULL DEFAULT 'FLIP' 28 | ); 29 | COMMENT ON TABLE replication_group IS 30 | 'Represents a specific cluster (replica group) configuration. 31 | A single server may be a source of data for multiple groups of replicas. 
32 | Each group may have different configuration, in particular: 33 | * what tables should be sharded 34 | * number of desired copies per shard 35 | * member servers and shard hosts topology 36 | '; 37 | COMMENT ON COLUMN replication_group.replication_group_id IS 38 | 'Unique identifier of a replication group.'; 39 | COMMENT ON COLUMN replication_group.current_version IS 40 | 'Identifier of currently deployed configuration version.'; 41 | COMMENT ON COLUMN replication_group.target_version IS 42 | 'Identifier of pending configuration version that is currently being deployed.'; 43 | 44 | CREATE TABLE replication_group_lock ( 45 | replication_group_id text NOT NULL PRIMARY KEY REFERENCES replication_group(replication_group_id) 46 | ); 47 | COMMENT ON TABLE replication_group_lock IS 48 | $$ 49 | Having a lock on replication_group ensures accidental DELETE on a group cannot happen. 50 | 51 | To delete a replication group it is necessary to delete replication_group_lock first. 52 | $$; 53 | 54 | CREATE TABLE replication_group_config ( 55 | replication_group_id text NOT NULL REFERENCES replication_group(replication_group_id) ON DELETE CASCADE, 56 | version config_version NOT NULL, 57 | 58 | min_replica_count int NOT NULL CHECK ( min_replica_count >= 0 ) DEFAULT 1, 59 | min_replica_count_per_availability_zone int NOT NULL CHECK ( min_replica_count_per_availability_zone >= 0 ) DEFAULT 1, 60 | 61 | PRIMARY KEY (replication_group_id, version) 62 | ); 63 | COMMENT ON TABLE replication_group_config IS 64 | 'Represents a version of configuration of a replication group. 65 | 66 | Each cluster (replication group) configuration is versioned to make sure 67 | changes in cluster topology and shards configuration does not cause any downtime. 68 | 69 | There may be two versions of configuration present at the same time. 70 | A configuration version might be "pending" or "ready". 
71 | 72 | Version marked as "ready" (pending = false) is a configuration version that all 73 | replicas installed and configured successfully. The shards assigned to replicas in that version are copied, indexed and available to use. 74 | 75 | Version marked as "pending" (pending = true) is a configuration version that is under installaction/configuration by the replicas. 76 | 77 | A replica keeps all shards from "ready" configuration even if a shard might be no longer assigned to it in "pending" configuration version. 78 | '; 79 | 80 | CREATE TABLE replication_group_config_clone ( 81 | replication_group_id text NOT NULL, 82 | source_version config_version NOT NULL, 83 | target_version config_version NOT NULL, 84 | 85 | PRIMARY KEY (replication_group_id, target_version), 86 | CHECK ( source_version <> target_version ), 87 | FOREIGN KEY (replication_group_id, source_version) 88 | REFERENCES replication_group_config(replication_group_id, version) ON DELETE CASCADE, 89 | FOREIGN KEY (replication_group_id, target_version) 90 | REFERENCES replication_group_config(replication_group_id, version) ON DELETE CASCADE 91 | ); 92 | 93 | CREATE TABLE replication_group_config_lock ( 94 | replication_group_id text NOT NULL, 95 | version config_version NOT NULL, 96 | -- most probably it should be separate 97 | -- but for now it is simpler here 98 | seed uuid NOT NULL DEFAULT gen_random_uuid(), 99 | 100 | PRIMARY KEY (replication_group_id, version), 101 | FOREIGN KEY (replication_group_id, version) 102 | REFERENCES replication_group_config(replication_group_id, version) 103 | ON DELETE CASCADE 104 | ); 105 | 106 | ALTER TABLE replication_group ADD FOREIGN KEY (replication_group_id, current_version) 107 | REFERENCES replication_group_config_lock(replication_group_id, version) DEFERRABLE INITIALLY DEFERRED; 108 | 109 | ALTER TABLE replication_group ADD FOREIGN KEY (replication_group_id, target_version) 110 | REFERENCES replication_group_config_lock(replication_group_id, version) 
DEFERRABLE INITIALLY DEFERRED; 111 | 112 | CREATE TABLE replication_group_member ( 113 | replication_group_id text NOT NULL REFERENCES replication_group(replication_group_id), 114 | availability_zone text NOT NULL, 115 | host_id text NOT NULL, 116 | member_role text NOT NULL UNIQUE, 117 | same_zone_multiplier smallint NOT NULL CHECK ( same_zone_multiplier BETWEEN 1 AND 5 ) DEFAULT 2, 118 | 119 | subscribed_local_shards json NOT NULL DEFAULT '[]', 120 | indexes json NOT NULL DEFAULT '[]', 121 | connected_local_shards json NOT NULL DEFAULT '[]', 122 | connected_remote_shards json NOT NULL DEFAULT '[]', 123 | users json NOT NULL DEFAULT '[]', 124 | 125 | PRIMARY KEY (replication_group_id, availability_zone, host_id) 126 | ); 127 | COMMENT ON TABLE replication_group_member IS 128 | 'Represents a node in a cluster (replication group). 129 | 130 | A cluster consists of two types of nodes: 131 | 132 | * shard hosts - nodes that replicate and serve data 133 | * non replicating members - nodes that act only as proxies (ie. not hosting any shards)'; 134 | 135 | CREATE TABLE shard_host ( 136 | replication_group_id text NOT NULL, 137 | availability_zone text NOT NULL, 138 | host_id text NOT NULL, 139 | host_name text NOT NULL, 140 | port int NOT NULL CHECK ( port > 0 ), 141 | 142 | online boolean NOT NULL DEFAULT true, 143 | 144 | PRIMARY KEY (replication_group_id, availability_zone, host_id), 145 | FOREIGN KEY (replication_group_id, availability_zone, host_id) 146 | REFERENCES replication_group_member(replication_group_id, availability_zone, host_id) 147 | ON DELETE CASCADE, 148 | UNIQUE (host_name, port) 149 | ); 150 | COMMENT ON TABLE shard_host IS 151 | 'Represents a data replicating node in a cluster (replication group).'; 152 | COMMENT ON COLUMN shard_host.online IS 153 | 'Shard host marked as offline is not going to receive any requests for data from other nodes. 154 | It is still replicating shards assigned to it. 
155 | 156 | This flag is supposed to be used in situation when a particular node must be 157 | temporarily disconnected from a cluster for maintenance purposes.'; 158 | 159 | CREATE TABLE shard_host_weight ( 160 | replication_group_id text NOT NULL, 161 | availability_zone text NOT NULL, 162 | host_id text NOT NULL, 163 | version config_version NOT NULL, 164 | weight int NOT NULL CHECK ( weight > 0 ), 165 | 166 | PRIMARY KEY (replication_group_id, availability_zone, host_id, version), 167 | FOREIGN KEY (replication_group_id, availability_zone, host_id) 168 | REFERENCES shard_host(replication_group_id, availability_zone, host_id) 169 | ON DELETE CASCADE, 170 | FOREIGN KEY (replication_group_id, version) 171 | REFERENCES replication_group_config(replication_group_id, version) 172 | ON DELETE CASCADE 173 | ); 174 | COMMENT ON TABLE shard_host_weight IS 175 | 'Weight of a shard host in a specific configuration version'; 176 | 177 | CREATE TABLE sharded_table ( 178 | replication_group_id text NOT NULL, 179 | sharded_table_schema text NOT NULL, 180 | sharded_table_name text NOT NULL, 181 | version config_version NOT NULL, 182 | replication_factor decimal(5, 2) NOT NULL CHECK ( replication_factor BETWEEN 0 AND 100 ), 183 | sharding_key_expression text NOT NULL DEFAULT 'SELECT $1 || $2', 184 | 185 | PRIMARY KEY (replication_group_id, sharded_table_schema, sharded_table_name, version), 186 | FOREIGN KEY (replication_group_id, version) 187 | REFERENCES replication_group_config(replication_group_id, version) 188 | ON DELETE CASCADE 189 | ); 190 | 191 | CREATE TABLE shard_index_template ( 192 | replication_group_id text NOT NULL, 193 | version config_version NOT NULL, 194 | index_template_schema text NOT NULL, 195 | index_template_table_name text NOT NULL, 196 | index_template_name name NOT NULL, 197 | index_template text NOT NULL, 198 | 199 | PRIMARY KEY (replication_group_id, version, index_template_schema, index_template_table_name, index_template_name), 200 | FOREIGN KEY 
(replication_group_id, version) REFERENCES replication_group_config(replication_group_id, version) ON DELETE CASCADE 201 | ); 202 | 203 | -- SNAPSHOT 204 | 205 | CREATE TABLE shard ( 206 | replication_group_id text NOT NULL, 207 | version config_version NOT NULL, 208 | schema_name text NOT NULL, 209 | table_name text NOT NULL, 210 | sharded_table_schema text NOT NULL, 211 | sharded_table_name text NOT NULL, 212 | 213 | PRIMARY KEY (replication_group_id, version, schema_name, table_name), 214 | FOREIGN KEY (replication_group_id, version, sharded_table_schema, sharded_table_name) 215 | REFERENCES sharded_table (replication_group_id, version, sharded_table_schema, sharded_table_name), 216 | FOREIGN KEY (replication_group_id, version) 217 | REFERENCES replication_group_config_lock(replication_group_id, version) 218 | ON DELETE CASCADE 219 | ); 220 | 221 | CREATE TABLE shard_assigned_host ( 222 | replication_group_id text NOT NULL, 223 | version config_version NOT NULL, 224 | schema_name text NOT NULL, 225 | table_name text NOT NULL, 226 | availability_zone text NOT NULL, 227 | host_id text NOT NULL, 228 | 229 | PRIMARY KEY (replication_group_id, version, schema_name, table_name, availability_zone, host_id), 230 | FOREIGN KEY (replication_group_id, version, schema_name, table_name) 231 | REFERENCES shard(replication_group_id, version, schema_name, table_name) 232 | ON DELETE CASCADE, 233 | FOREIGN KEY (replication_group_id, version, availability_zone, host_id) 234 | REFERENCES shard_host_weight(replication_group_id, version, availability_zone, host_id) 235 | DEFERRABLE INITIALLY DEFERRED 236 | ); 237 | 238 | CREATE TABLE shard_assigned_index ( 239 | replication_group_id text NOT NULL, 240 | version config_version NOT NULL, 241 | schema_name text NOT NULL, 242 | table_name text NOT NULL, 243 | index_template_schema text NOT NULL, 244 | index_template_table_name text NOT NULL, 245 | index_template_name name NOT NULL, 246 | 247 | PRIMARY KEY (replication_group_id, version, 
schema_name, table_name, index_template_schema, index_template_table_name, index_template_name), 248 | FOREIGN KEY (replication_group_id, version, index_template_schema, index_template_table_name, index_template_name) 249 | REFERENCES shard_index_template(replication_group_id, version, index_template_schema, index_template_table_name, index_template_name) 250 | DEFERRABLE INITIALLY DEFERRED, 251 | FOREIGN KEY (replication_group_id, version, schema_name, table_name) 252 | REFERENCES shard(replication_group_id, version, schema_name, table_name) 253 | ON DELETE CASCADE 254 | ); 255 | 256 | -------------------- 257 | -------------------- 258 | CREATE TABLE ping ( 259 | last_time timestamptz NOT NULL PRIMARY KEY DEFAULT clock_timestamp() 260 | ); 261 | SELECT exec_dynamic(format('GRANT SELECT ON ping TO %I', pgwrh_replica_role_name())); 262 | -------------------------------------------------------------------------------- /src/master/triggers.sql: -------------------------------------------------------------------------------- 1 | -- name: master-triggers 2 | -- requires: core 3 | -- requires: publication-sync 4 | -- requires: master-snapshot 5 | 6 | -- pgwrh 7 | -- Copyright (C) 2024 Michal Kleczek 8 | 9 | -- This program is free software: you can redistribute it and/or modify 10 | -- it under the terms of the GNU Affero General Public License as published by 11 | -- the Free Software Foundation, either version 3 of the License, or 12 | -- (at your option) any later version. 13 | 14 | -- This program is distributed in the hope that it will be useful, 15 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | -- GNU Affero General Public License for more details. 18 | 19 | -- You should have received a copy of the GNU Affero General Public License 20 | -- along with this program. If not, see . 


-- Bootstraps a newly inserted replication group: default config, its lock, and the group lock.
CREATE OR REPLACE FUNCTION replication_group_prepare() RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
    INSERT INTO "@extschema@".replication_group_config VALUES (NEW.replication_group_id, NEW.current_version)
        ON CONFLICT DO NOTHING;
    INSERT INTO "@extschema@".replication_group_config_lock VALUES (NEW.replication_group_id, NEW.current_version)
        ON CONFLICT DO NOTHING;
    INSERT INTO "@extschema@".replication_group_lock VALUES (NEW.replication_group_id);
    RETURN NEW;
END
$$;
CREATE OR REPLACE TRIGGER replication_group_prepare
    AFTER INSERT ON replication_group
    FOR EACH ROW EXECUTE FUNCTION replication_group_prepare();
COMMENT ON TRIGGER replication_group_prepare ON replication_group IS
$$
* Creates a default, empty configuration that is locked and marked as current.
* Inserts a replication_group_lock to prevent accidental deletes of newly created replication_group.
$$;

-- Validates transitions of (current_version, target_version); see the trigger comment below.
CREATE FUNCTION version_lifecycle_check() RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
    IF OLD.current_version = OLD.target_version THEN
        -- No change in progress: only target_version may be modified.
        IF NEW.current_version <> OLD.current_version THEN
            RAISE 'Cannot switch current version directly. Please update target version first.';
        END IF;
    ELSE
        -- Change in progress: current_version may only move if it ends up equal to target_version (commit).
        IF NEW.target_version <> NEW.current_version AND NEW.current_version <> OLD.current_version THEN
            RAISE 'Cannot swap version. Please rollback target version first.';
        END IF;
    END IF;
    RETURN NEW;
END
$$;
CREATE TRIGGER version_lifecycle_check
    BEFORE UPDATE ON replication_group
    FOR EACH ROW EXECUTE FUNCTION version_lifecycle_check();
COMMENT ON TRIGGER version_lifecycle_check ON replication_group IS
$comment$
Makes sure it is only possible to change current_version and target_version according to the following rules:
* If there is no configuration change in progress (ie. current_version = target_version) then it is only possible to change target_version
* If there is configuration change in progress it is possible to either
** commit configuration change (ie. set current_version to target_version)
** or rollback configuration change (ie. set target_version to current_version)

It must not be possible to change current_version without changing target_version first.
$comment$;

-- Rejects a change of current_version while any host still has unconfirmed shards for that version.
CREATE FUNCTION check_replication_group_rollout_done() RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
    -- NOTE(review): both checks filter by version only, not by replication group.
    -- If the views expose replication_group_id, an unfinished rollout in another group
    -- using the same FLIP/FLOP value would block this update -- confirm against the view definitions.
    IF EXISTS (SELECT 1 FROM
            "@extschema@".missing_connected_local_shard
        WHERE
            version = NEW.current_version
    ) THEN
        RAISE 'Not all hosts confirmed configuration of required local shards'
            USING HINT = 'Check missing_connected_local_shard view for details';
    END IF;
    IF EXISTS (SELECT 1 FROM
            "@extschema@".missing_connected_remote_shard
        WHERE
            version = NEW.current_version
    ) THEN
        RAISE 'Not all hosts confirmed configuration of required remote shards'
            USING HINT = 'Check missing_connected_remote_shard view for details';
    END IF;
    RETURN NEW;
END;
$$;
CREATE TRIGGER check_replication_group_rollout_done
    BEFORE UPDATE ON replication_group
    FOR EACH ROW
    WHEN ( NEW.current_version <> OLD.current_version )
    EXECUTE FUNCTION check_replication_group_rollout_done();
COMMENT ON TRIGGER check_replication_group_rollout_done ON replication_group IS
$comment$
Prevents committing a configuration change (ie. changing current_version) before the rollout is complete.

The trigger fires only when current_version changes. It raises an error if either
missing_connected_local_shard or missing_connected_remote_shard contains any row for the new current_version,
ie. when some hosts have not yet confirmed configuration of all required local or remote shards.
$comment$;

-- Records a ping row; now() is the transaction start time, so repeated changes
-- within one transaction collapse into a single row (ON CONFLICT DO NOTHING).
CREATE FUNCTION ping_on_version_change_trigger() RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
    INSERT INTO "@extschema@".ping VALUES (now()) ON CONFLICT DO NOTHING;
    RETURN NEW;
END;
$$;
CREATE TRIGGER ping_on_version_change AFTER UPDATE ON replication_group
    FOR EACH ROW
    WHEN ( NEW.current_version <> OLD.current_version OR NEW.target_version <> OLD.target_version )
    EXECUTE FUNCTION ping_on_version_change_trigger();
COMMENT ON TRIGGER ping_on_version_change ON replication_group IS
$$
Inserts a row into the ping table whenever current_version or target_version of a replication group changes.
$$;

-- Overrides the version of an inserted configuration row with the group's next pending version.
CREATE OR REPLACE FUNCTION next_pending_version_trigger() RETURNS TRIGGER
    LANGUAGE plpgsql AS
$$BEGIN
    NEW.version := "@extschema@".next_pending_version(NEW.replication_group_id);
    RETURN NEW;
END$$;

-- Unconditionally raises; attached below with WHEN clauses that test is_locked(...).
CREATE OR REPLACE FUNCTION forbid_locked_version_modifications() RETURNS TRIGGER
    LANGUAGE plpgsql AS
$$
BEGIN
    RAISE 'This config version is locked. Modifications in % are forbidden.', TG_RELID::regclass;
    RETURN NULL;
END
$$;

-- Registers a clone of the previous version into the version of the inserted row (no-op if already registered).
CREATE OR REPLACE FUNCTION clone_config_trigger() RETURNS TRIGGER
    LANGUAGE plpgsql AS
$$BEGIN
    INSERT INTO "@extschema@".replication_group_config_clone (replication_group_id, source_version, target_version)
    VALUES (NEW.replication_group_id, "@extschema@".prev_version(NEW.version), NEW.version)
    ON CONFLICT DO NOTHING;
    RETURN NEW;
END$$;

CREATE OR REPLACE TRIGGER forbid_not_pending_version_update BEFORE UPDATE ON replication_group_config
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version) OR is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();

-- The same trigger set is attached to each per-version configuration table:
-- shard_host_weight, sharded_table and shard_index_template.
CREATE OR REPLACE TRIGGER "00_next_pending_version" BEFORE INSERT ON shard_host_weight
    FOR EACH ROW EXECUTE FUNCTION next_pending_version_trigger();

CREATE OR REPLACE TRIGGER forbid_not_pending_version_insert BEFORE INSERT ON shard_host_weight
    FOR EACH ROW
    WHEN (is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();
CREATE OR REPLACE TRIGGER forbid_not_pending_version_update BEFORE UPDATE ON shard_host_weight
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version) OR is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();
CREATE OR REPLACE TRIGGER forbid_not_pending_version_delete BEFORE DELETE ON shard_host_weight
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();

CREATE OR REPLACE TRIGGER clone_config AFTER INSERT ON shard_host_weight
    FOR EACH ROW EXECUTE FUNCTION clone_config_trigger();

CREATE OR REPLACE TRIGGER "00_next_pending_version" BEFORE INSERT ON sharded_table
    FOR EACH ROW EXECUTE FUNCTION next_pending_version_trigger();

CREATE OR REPLACE TRIGGER forbid_not_pending_version_insert BEFORE INSERT ON sharded_table
    FOR EACH ROW
    WHEN (is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();
CREATE OR REPLACE TRIGGER forbid_not_pending_version_update BEFORE UPDATE ON sharded_table
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version) OR is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();
CREATE OR REPLACE TRIGGER forbid_not_pending_version_delete BEFORE DELETE ON sharded_table
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();

CREATE OR REPLACE TRIGGER clone_config AFTER INSERT ON sharded_table
    FOR EACH ROW EXECUTE FUNCTION clone_config_trigger();

CREATE OR REPLACE TRIGGER "00_next_pending_version" BEFORE INSERT ON shard_index_template
    FOR EACH ROW EXECUTE FUNCTION next_pending_version_trigger();

CREATE OR REPLACE TRIGGER forbid_not_pending_version_insert BEFORE INSERT ON shard_index_template
    FOR EACH ROW
    WHEN (is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();
CREATE OR REPLACE TRIGGER forbid_not_pending_version_update BEFORE UPDATE ON shard_index_template
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version) OR is_locked(NEW.replication_group_id, NEW.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();
CREATE OR REPLACE TRIGGER forbid_not_pending_version_delete BEFORE DELETE ON shard_index_template
    FOR EACH ROW
    WHEN (is_locked(OLD.replication_group_id, OLD.version))
    EXECUTE FUNCTION forbid_locked_version_modifications();

CREATE OR REPLACE TRIGGER clone_config AFTER INSERT ON shard_index_template
    FOR EACH ROW EXECUTE FUNCTION clone_config_trigger();

-- Takes a snapshot of a configuration version as soon as it gets locked.
CREATE OR REPLACE FUNCTION replication_group_config_snapshot_trigger() RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
    PERFORM "@extschema@".replication_group_config_snapshot(NEW.replication_group_id, NEW.version);
    RETURN NEW;
END
$$;
CREATE OR REPLACE TRIGGER replication_group_config_snapshot AFTER INSERT ON replication_group_config_lock
    FOR EACH ROW EXECUTE FUNCTION replication_group_config_snapshot_trigger();
--------------------
--------------------
-- Before a clone is registered: make sure the target replication_group_config row exists,
-- copying the replica-count settings from the source version.
CREATE FUNCTION before_clone_insert_trigger() RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
    INSERT INTO "@extschema@".replication_group_config (replication_group_id, version, min_replica_count, min_replica_count_per_availability_zone)
    SELECT replication_group_id, NEW.target_version, min_replica_count, min_replica_count_per_availability_zone FROM
        "@extschema@".replication_group_config
    WHERE
        replication_group_id = NEW.replication_group_id
        AND version = NEW.source_version
    ON CONFLICT DO NOTHING;
    RETURN NEW;
END
$$;
--------------------
--------------------
-- After a clone is registered: copy host weights, sharded tables and index templates
-- from the source version to the target version.
CREATE FUNCTION after_clone_insert_trigger()
    RETURNS trigger
    SET SEARCH_PATH FROM CURRENT
    LANGUAGE plpgsql AS
$$
BEGIN
    INSERT INTO shard_host_weight (replication_group_id, availability_zone, host_id, version, weight)
    SELECT
        replication_group_id, availability_zone, host_id, NEW.target_version, weight
    FROM
        shard_host_weight
    WHERE
        (replication_group_id, version) = (NEW.replication_group_id, NEW.source_version)
    ON CONFLICT DO NOTHING;
    -- sharding_key_expression must be copied explicitly; otherwise the clone
    -- silently falls back to the column default and loses a customised expression.
    INSERT INTO sharded_table (replication_group_id, sharded_table_schema, sharded_table_name, version, replication_factor, sharding_key_expression)
    SELECT replication_group_id, sharded_table_schema, sharded_table_name, NEW.target_version, replication_factor, sharding_key_expression
    FROM
        sharded_table
    WHERE
        (replication_group_id, version) = (NEW.replication_group_id, NEW.source_version)
    ON CONFLICT DO NOTHING;
    INSERT INTO shard_index_template (replication_group_id, version, index_template_schema, index_template_table_name, index_template_name, index_template)
    SELECT replication_group_id, NEW.target_version, index_template_schema, index_template_table_name, index_template_name, index_template
    FROM
        shard_index_template
    WHERE
        (replication_group_id, version) = (NEW.replication_group_id, NEW.source_version)
    ON CONFLICT DO NOTHING;

    RETURN NEW;
END
$$;
COMMENT ON FUNCTION after_clone_insert_trigger() IS
'Copies configuration from one version to another. Ignores already existing items.';

CREATE TRIGGER before_insert BEFORE INSERT ON replication_group_config_clone
    FOR EACH ROW EXECUTE FUNCTION before_clone_insert_trigger();
CREATE TRIGGER after_insert AFTER INSERT ON replication_group_config_clone
    FOR EACH ROW EXECUTE FUNCTION after_clone_insert_trigger();

-- Trigger function; the first trigger argument is passed to start_sync_daemon as the refresh interval.
CREATE FUNCTION make_sure_daemon_started_on_ping_trigger() RETURNS TRIGGER LANGUAGE plpgsql AS
$$
BEGIN
    PERFORM "@extschema@".start_sync_daemon(tg_argv[0]::real);
    RETURN NEW;
END
$$;
COMMENT ON FUNCTION make_sure_daemon_started_on_ping_trigger() IS
'Starts sync daemon if it is not running.';

--------------------------------------------------------------------------------
/src/replica/api-management.sql:
--------------------------------------------------------------------------------
-- name: replica-api-management
-- requires: replica-daemon
-- requires: replica-helpers

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.

-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU Affero General Public License for more details.

-- You should have received a copy of the GNU Affero General Public License
-- along with this program. If not, see .

-- Configures this replica to use the given controller (master) server.
--
-- Steps, in order:
--   1. applies the commands returned by update_server_options / update_user_mapping
--      for the 'replica_controller' foreign server,
--   2. creates and enables (as a REPLICA trigger) the trigger on ping that starts the sync daemon,
--   3. registers and creates the pgwrh_replica_subscription subscription to the
--      pgwrh_controller_ping publication (in a background worker, since CREATE SUBSCRIPTION
--      with a replication slot cannot run inside a transaction block),
--   4. optionally starts the sync daemon.
--
-- Parameters:
--   host, port, username, password - controller connection details
--   start_daemon                   - start the sync daemon immediately (default true)
--   refresh_seconds                - sync daemon refresh interval (default 20)
CREATE OR REPLACE FUNCTION configure_controller(host text, port text, username text, password text, start_daemon boolean DEFAULT true, refresh_seconds real DEFAULT 20)
    RETURNS void
    SET SEARCH_PATH FROM CURRENT
    SECURITY DEFINER
    LANGUAGE plpgsql AS
$$
DECLARE
    r record;
    conninfo text;
BEGIN
    FOR r IN SELECT * FROM "@extschema@".update_server_options('replica_controller', host, port) AS u(cmd) LOOP
        EXECUTE r.cmd;
    END LOOP;
    FOR r IN SELECT * FROM "@extschema@".update_user_mapping('replica_controller', username, password) AS u(cmd) LOOP
        EXECUTE r.cmd;
    END LOOP;
    PERFORM exec_dynamic(format('CREATE TRIGGER make_sure_daemon_started_on_ping AFTER INSERT ON ping
        FOR ROW EXECUTE FUNCTION make_sure_daemon_started_on_ping_trigger(%s)', refresh_seconds));
    ALTER TABLE ping ENABLE REPLICA TRIGGER make_sure_daemon_started_on_ping;
    INSERT INTO shard_subscription (subname) VALUES ('pgwrh_replica_subscription');

    -- Build a libpq connection string with every value single-quoted and with
    -- backslashes and single quotes escaped, so that spaces, quotes or backslashes
    -- (e.g. in the password) cannot break or alter the connection string.
    -- Keyword order is irrelevant to libpq.
    conninfo := (
        SELECT string_agg(
            format('%s=''%s''', kw, replace(replace(val, E'\\', E'\\\\'), '''', E'\\''')),
            ' ')
        FROM (VALUES
            ('host', host),
            ('port', port),
            ('user', username),
            ('password', password),
            ('dbname', current_database()::text)
        ) AS kv(kw, val)
    ) || ' target_session_attrs=primary';

    PERFORM * FROM "@extschema:pg_background@".pg_background_result(
        "@extschema:pg_background@".pg_background_launch(
            -- %L embeds the whole connection string as a properly quoted SQL literal.
            format('CREATE SUBSCRIPTION pgwrh_replica_subscription CONNECTION %L PUBLICATION %I WITH (copy_data = false, %s)',
                conninfo, 'pgwrh_controller_ping',
                (
                    SELECT string_agg(format('%s = %L', key, val), ', ') FROM (
                        SELECT
                            'slot_name' AS key,
                            username || '_' || (random() * 10000000)::bigint::text AS val-- random slot_name
                        UNION ALL
                        -- add failover = 'true' option for PostgreSQL >= 17
                        SELECT
                            'failover' AS key,
                            'true' AS val
                        WHERE
                            -- server_version_num is the machine-readable integer form (e.g. 170002)
                            current_setting('server_version_num')::int >= 170000
                    ) opts
                )
            )
        )
    ) AS discarded(result text);
    IF start_daemon THEN
        PERFORM "@extschema@".start_sync_daemon(refresh_seconds);
    END IF;
END
$$;

--------------------------------------------------------------------------------
/src/replica/daemon.sql:
--------------------------------------------------------------------------------
-- name: replica-daemon
-- requires: replica-sync
-- requires: replica-status

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.

-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU Affero General Public License for more details.

-- You should have received a copy of the GNU Affero General Public License
-- along with this program. If not, see .

-- Launches the given commands in a pg_background worker and detaches from it
-- (fire and forget; the result is never read).
CREATE OR REPLACE FUNCTION launch_in_background(commands text) RETURNS void LANGUAGE plpgsql AS
$$
DECLARE
    pid int;
BEGIN
    pid := (select "@extschema:pg_background@".pg_background_launch(commands));
    -- NOTE(review): presumably gives the worker time to start before detaching -- confirm.
    PERFORM pg_sleep(0.1);
    PERFORM "@extschema:pg_background@".pg_background_detach(pid);
END
$$;

-- Runs a single sync_replica_worker() pass in the background.
CREATE OR REPLACE FUNCTION launch_sync() RETURNS void LANGUAGE sql AS
$$
SELECT "@extschema@".launch_in_background('CAll "@extschema@".sync_replica_worker();')
$$;

-- Daemon loop: repeatedly runs sync_replica_worker(), sleeping `seconds` between passes.
-- Only the session that obtains advisory lock 517384732 runs the loop; any other caller returns immediately.
-- The loop ends once the pgwrh extension no longer exists.
CREATE OR REPLACE PROCEDURE sync_daemon(seconds real, _application_name text DEFAULT 'pgwrh_sync_daemon') LANGUAGE plpgsql AS
$$
DECLARE
    err text;
BEGIN
    IF pg_try_advisory_lock(517384732) THEN
        PERFORM set_config('application_name', _application_name, FALSE);
        LOOP
            BEGIN
                CALL "@extschema@".sync_replica_worker();
            EXCEPTION
                WHEN OTHERS THEN
                    -- Best effort: report the failure and keep the daemon alive.
                    GET STACKED DIAGNOSTICS err = MESSAGE_TEXT;
                    raise WARNING '%', err;
            END;
            COMMIT;
            PERFORM pg_sleep(seconds);
            EXIT WHEN NOT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pgwrh');
        END LOOP;
    END IF;
END
$$;

-- Starts sync_daemon in a background worker.
CREATE OR REPLACE FUNCTION start_sync_daemon(seconds real, application_name text DEFAULT 'pgwrh_sync_daemon') RETURNS void LANGUAGE sql AS
$$
SELECT "@extschema@".launch_in_background(format('
    CALL "@extschema@".sync_daemon(%s, %L);
', seconds, application_name))
$$;

-- Runs a script in a background worker and waits for it.
-- Returns TRUE on success; on failure reports the error as a WARNING and returns FALSE.
CREATE OR REPLACE FUNCTION exec_script(script text) RETURNS boolean LANGUAGE plpgsql AS
$$
DECLARE
    err text;
BEGIN
    PERFORM * FROM "@extschema:pg_background@".pg_background_result("@extschema:pg_background@".pg_background_launch(script)) AS discarded(result text);
    RETURN TRUE;
EXCEPTION
    WHEN OTHERS THEN
        GET STACKED DIAGNOSTICS err = MESSAGE_TEXT;
        raise WARNING '%', err;
        RETURN FALSE;
END
$$;

-- Runs each script in its own background worker, one after another, stopping at the first failure.
-- Returns TRUE if all succeeded; on failure reports the error as a WARNING and returns FALSE.
CREATE OR REPLACE FUNCTION exec_non_tx_scripts(scripts text[]) RETURNS boolean LANGUAGE plpgsql AS
$$
DECLARE
    cmd text;
    err text;
BEGIN
    FOREACH cmd IN ARRAY scripts LOOP
        PERFORM * FROM "@extschema:pg_background@".pg_background_result("@extschema:pg_background@".pg_background_launch(cmd)) AS discarded(result text);
    END LOOP;
    RETURN TRUE;
EXCEPTION
    WHEN OTHERS THEN
        GET STACKED DIAGNOSTICS err = MESSAGE_TEXT;
        -- WARNING (not NOTICE) to match exec_script/sync_step and to stay visible
        -- with default log_min_messages when running in a background worker.
        raise WARNING '%', err;
        RETURN FALSE;
END
$$;

-- Executes one batch of pending commands read from the "sync" relation.
-- Returns:
--   FALSE if the advisory lock is held elsewhere or there was nothing to do,
--   TRUE  if at least one command group was processed, or if an error occurred
--         (after a 1 second pause), so that the caller tries again.
CREATE OR REPLACE FUNCTION sync_step() RETURNS boolean LANGUAGE plpgsql AS
$$
DECLARE
    r record;
    cmd text;
    err text;
BEGIN
    IF pg_try_advisory_xact_lock(2895359559) THEN
        -- Select commands to execute in a separate transaction so that we don't keep any locks here
        FOR r IN SELECT * FROM "@extschema:pg_background@".pg_background_result("@extschema:pg_background@".pg_background_launch('select async, transactional, description, commands from "@extschema@".sync')) AS (async boolean, transactional boolean, description text, commands text[]) LOOP
            RAISE NOTICE '%', r.description;
            IF r.transactional THEN
                -- Transactional groups run as a single ';'-joined script.
                IF r.async THEN
                    PERFORM "@extschema@".launch_in_background(array_to_string(r.commands, ';'));
                ELSE
                    -- A trailing SELECT is appended so the script ends with a statement
                    -- that returns a row matching the (result text) shape read by exec_script.
                    PERFORM "@extschema@".exec_script(array_to_string(r.commands || 'SELECT '''''::text, ';'));
                END IF;
            ELSE
                -- Non-transactional groups run command by command.
                IF r.async THEN
                    IF array_length(r.commands, 1) > 1 THEN
                        PERFORM "@extschema@".launch_in_background(format('SELECT "@extschema@".exec_non_tx_scripts(ARRAY[%s])', (SELECT string_agg(format('%L', c), ',') FROM unnest(r.commands) AS c)));
                    ELSE
                        PERFORM "@extschema@".launch_in_background(r.commands[1]);
                    END IF;
                ELSE
                    FOREACH cmd IN ARRAY r.commands LOOP
                        PERFORM "@extschema@".exec_script(cmd);
                    END LOOP;
                END IF;
            END IF;
        END LOOP;
        RETURN FOUND;
    ELSE
        RETURN FALSE;
    END IF;
EXCEPTION
    WHEN OTHERS THEN
        GET STACKED DIAGNOSTICS err = MESSAGE_TEXT;
        raise WARNING '%', err;
        PERFORM pg_sleep(1);
        RETURN TRUE;
END
$$;

-- One full sync pass: runs sync_step() (each call in its own background worker)
-- until it returns false, then reports state and cleans up.
CREATE OR REPLACE PROCEDURE sync_replica_worker() LANGUAGE plpgsql AS
$$
BEGIN
    -- PL/pgSQL evaluates the WHILE condition as "SELECT <condition>", so this
    -- effectively runs "SELECT r FROM pg_background_result(...) AS r(r boolean)" on every iteration.
    WHILE r FROM "@extschema:pg_background@".pg_background_result("@extschema:pg_background@".pg_background_launch('SELECT "@extschema@".sync_step()')) AS r(r boolean) LOOP
    END LOOP;
    PERFORM * FROM "@extschema:pg_background@".pg_background_result("@extschema:pg_background@".pg_background_launch('SELECT ''ignored'' FROM "@extschema@".report_state()')) AS r(ignored text);
    PERFORM * FROM "@extschema:pg_background@".pg_background_result("@extschema:pg_background@".pg_background_launch('SELECT ''ignored'' FROM "@extschema@".cleanup_analyzed_pg_class()')) AS r(ignored text);
END
$$;


-- -- CREATE OR REPLACE FUNCTION sync_trigger() RETURNS trigger LANGUAGE plpgsql AS
-- -- $$BEGIN
-- -- PERFORM @extschema@.launch_sync();
-- -- RETURN NULL;
-- -- END$$;
-- -- CREATE OR REPLACE TRIGGER sync_trigger AFTER INSERT ON config_change FOR EACH ROW EXECUTE FUNCTION sync_trigger();
-- -- ALTER TABLE config_change ENABLE REPLICA TRIGGER sync_trigger;

--------------------------------------------------------------------------------
/src/replica/deps.txt:
--------------------------------------------------------------------------------
tables helpers
helpers sync
fdw sync
fdw status
helpers status
tables ext-config-dump
sync daemon
status daemon
daemon api-management
helpers api-management

--------------------------------------------------------------------------------
/src/replica/ext-config-dump.sql:
--------------------------------------------------------------------------------
-- name: replica-ext-config-dump
-- requires: replica-tables

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.

-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU Affero General Public License for more details.

-- You should have received a copy of the GNU Affero General Public License
-- along with this program.  If not, see <https://www.gnu.org/licenses/>.

-- Register shard_subscription as an extension configuration table so that
-- pg_dump includes its contents. The second argument is the row filter;
-- it is empty here, which per the PostgreSQL documentation means no rows
-- are filtered out.
SELECT pg_catalog.pg_extension_config_dump('shard_subscription', '');
--------------------------------------------------------------------------------
/src/replica/fdw.sql:
--------------------------------------------------------------------------------
-- name: replica-fdw

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.

-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU Affero General Public License for more details.

-- You should have received a copy of the GNU Affero General Public License
-- along with this program.  If not, see <https://www.gnu.org/licenses/>.

-- Foreign server through which the replica talks to its controller.
-- Only load_balance_hosts is set here; host/port/dbname and credentials are
-- presumably configured later by the management API -- TODO confirm.
CREATE SERVER IF NOT EXISTS replica_controller FOREIGN DATA WRAPPER postgres_fdw OPTIONS (load_balance_hosts 'random');
CREATE USER MAPPING FOR PUBLIC SERVER replica_controller;

-- Controller relation "shard_assignment": one row per shard table with the
-- information where it is hosted. Read by the shard_assignment_r view.
CREATE FOREIGN TABLE IF NOT EXISTS fdw_shard_assignment (
    schema_name text,
    table_name text,
    local boolean,
    shard_server_name text,
    host text,
    port text,
    dbname text,
    shard_server_user text,
    pubname text,
    connect_remote boolean,
    retained_shard_server_name text
)
SERVER replica_controller
OPTIONS (table_name 'shard_assignment');

-- Controller relation "shard_index": index definitions (index_template) for
-- shard tables; `optional` marks indexes not required for a shard to be ready.
CREATE FOREIGN TABLE IF NOT EXISTS fdw_shard_index (
    schema_name text,
    table_name text,
    index_name text,
    index_template text,
    optional boolean
)
SERVER replica_controller
OPTIONS (table_name 'shard_index');

-- Controller relation "shard_structure": CREATE TABLE statements for the
-- tables to create on the replica; the sync view executes them ordered by
-- `level`.
CREATE FOREIGN TABLE IF NOT EXISTS fdw_shard_structure (
    schema_name text,
    table_name text,
    level int,
    create_table text
)
SERVER replica_controller
OPTIONS (table_name 'shard_structure');

-- Controller relation "replica_state": updated by report_state() with JSON
-- arrays describing the current state of this replica.
CREATE FOREIGN TABLE fdw_replica_state (
    subscribed_local_shards json,
    indexes json,
    connected_local_shards json,
    connected_remote_shards json,
    users json
) SERVER replica_controller
OPTIONS (table_name 'replica_state');

-- Controller relation "credentials": user names and passwords used by the
-- sync view to create local roles and user mappings for shard servers.
CREATE FOREIGN TABLE fdw_credentials (
    username text,
    password text
) SERVER replica_controller
OPTIONS (table_name 'credentials');
--------------------------------------------------------------------------------
/src/replica/helpers.sql:
--------------------------------------------------------------------------------
-- name: replica-helpers

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software
Foundation, either version 3 of the License, or
-- (at your option) any later version.

-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU Affero General Public License for more details.

-- You should have received a copy of the GNU Affero General Public License
-- along with this program.  If not, see <https://www.gnu.org/licenses/>.

-- options parsing
-- Splits an options array of 'key=value' strings (as found in
-- pg_foreign_server.srvoptions or pg_user_mappings.umoptions) into rows:
--   key    text before the first '='
--   value  everything after the first '=' (NULL when there is no '=')
--   vals   value split on ','
-- Only the FIRST '=' separates key from value, so values that themselves
-- contain '=' (e.g. passwords) are returned intact. Splitting on every '='
-- would truncate them and make update_user_mapping/update_server_options
-- report a difference on every call.
CREATE OR REPLACE FUNCTION opts(arr text[]) RETURNS TABLE(key text, value text, vals text[]) LANGUAGE sql AS
$$
    SELECT
        split_part(o.val, '=', 1) AS key,
        v.value AS value,
        string_to_array(v.value, ',') AS vals
    FROM
        unnest(arr) AS o(val),
        LATERAL (
            -- nullif(..., 0) keeps value NULL for entries without '='
            SELECT substr(o.val, nullif(strpos(o.val, '='), 0) + 1)
        ) AS v(value)
$$;

-- Produces the ALTER SERVER statement needed to make host/port/dbname of
-- server _srvname match the given values, based on its current options
-- (srvoptions). Options already present are SET, missing ones are ADDed.
-- Returns no rows when nothing has to change.
CREATE OR REPLACE FUNCTION update_server_options(_srvname text, srvoptions text[], host text, port text, dbname text DEFAULT current_database())
RETURNS SETOF text
STABLE
LANGUAGE sql AS
$$
    SELECT format('ALTER SERVER %I OPTIONS (%s)', srvname, string_agg(opts.cmd, ', '))
    FROM
        (
            SELECT
                _srvname,
                CASE
                    WHEN opt.key IS NOT NULL THEN format('SET %s %L', toset.key, toset.val)
                    ELSE format('ADD %s %L', toset.key, toset.val)
                END
            FROM
                unnest(ARRAY['host', 'port', 'dbname'], ARRAY[host, port, dbname]) AS toset(key, val)
                    LEFT JOIN (SELECT * FROM "@extschema@".opts(srvoptions)) AS opt USING (key)
            WHERE
                opt.key IS NULL OR toset.val <> opt.value
        ) AS opts(srvname, cmd)
    GROUP BY
        srvname
$$;
-- Same as above, but reads the current options of _srvname from
-- pg_foreign_server.
CREATE OR REPLACE FUNCTION update_server_options(_srvname text, host text, port text, dbname text DEFAULT current_database())
RETURNS SETOF text
STABLE
LANGUAGE sql
AS
$$
    SELECT "@extschema@".update_server_options(srvname, srvoptions, host, port, dbname)
    FROM
        pg_foreign_server
    WHERE
        srvname = _srvname;
$$;

-- Produces the ALTER USER MAPPING FOR PUBLIC statement needed to make
-- user/password of the mapping for server_name match the given values, based
-- on its current options (umoptions). Returns no rows when nothing has to
-- change.
CREATE OR REPLACE FUNCTION update_user_mapping(server_name text, umoptions text[], username text, password text)
RETURNS SETOF text
STABLE
LANGUAGE sql AS
$$
    SELECT format('ALTER USER MAPPING FOR PUBLIC SERVER %I OPTIONS (%s)', srvname, string_agg(opts.cmd, ', '))
    FROM
        (
            SELECT
                server_name,
                CASE
                    WHEN opt.key IS NOT NULL THEN format('SET %s %L', toset.key, toset.val)
                    ELSE format('ADD %s %L', toset.key, toset.val)
                END
            FROM
                unnest(ARRAY['user', 'password'], ARRAY[username, password]) AS toset(key, val)
                    LEFT JOIN (SELECT * FROM "@extschema@".opts(umoptions)) AS opt USING (key)
            WHERE
                opt.key IS NULL OR toset.val <> opt.value
        ) AS opts(srvname, cmd)
    GROUP BY
        srvname
$$;
-- Same as above, but reads the current options from pg_user_mappings for
-- every mapping defined on server _srvname.
CREATE OR REPLACE FUNCTION update_user_mapping(_srvname text, username text, password text)
RETURNS SETOF text
STABLE
LANGUAGE sql
AS
$$
    SELECT "@extschema@".update_user_mapping(srvname, umoptions, username, password)
    FROM
        pg_user_mappings
    WHERE
        srvname = _srvname;
$$;

-- rel_id type and functions
-- Identifier of a relation: schema name + relation name.
CREATE TYPE rel_id AS (schema_name text, table_name text);

-- Quoted, schema-qualified name of the relation.
CREATE FUNCTION fqn(rel_id) RETURNS text LANGUAGE sql AS
$$
    SELECT format('%I.%I', $1.schema_name, $1.table_name)
$$;
-- Statement registering the relation as an object depending on the extension
-- (delegates to select_add_ext_dependency, defined elsewhere).
CREATE FUNCTION add_ext_dependency(rel_id) RETURNS text LANGUAGE sql AS
$$
    SELECT "@extschema@".select_add_ext_dependency('pg_class'::regclass, format('%L::regclass', "@extschema@".fqn($1)))
$$;

-- catalog views keyed by rel_id
-- Every pg_class entry together with its namespace row, names, rel_id and
-- regclass.
CREATE VIEW rel AS
    SELECT
        pc, pn,
        nspname AS schema_name,
        relname AS table_name,
        (nspname, relname)::rel_id AS rel_id,
        pc.oid::regclass AS reg_class
    FROM
        pg_class pc
            JOIN pg_namespace pn ON pn.oid = pc.relnamespace;
-- rel extended with the partition bound expression, the inheritance parent
-- (NULL when the relation has none) and the rel_id of the relation's slot:
-- same table name in schema "<schema>_slot".
CREATE OR REPLACE VIEW local_rel AS
    SELECT
        r.*,
        pg_get_expr((r).pc.relpartbound, (r).pc.oid) AS bound,
        parent,
        ((r.rel_id).schema_name || '_' || 'slot', (r.rel_id).table_name)::rel_id AS slot_rel_id
    FROM
        rel r
            LEFT JOIN pg_inherits pi ON (r).pc.oid = pi.inhrelid
            LEFT JOIN rel AS parent ON (parent).pc.oid = pi.inhparent;

-- Controller shard assignment (fdw_shard_assignment) joined with the local
-- relation of the same schema/table name, enriched with derived names:
--   "<schema>_<shard_server_name>"           schema of remote (foreign) shards
--   "<schema>_<retained_shard_server_name>"  schema of retained remote shards
--   "<schema>_template"                      schema of template tables
--   "<schema>_shield"                        schema of views exposed to other replicas
-- `connected` is true when the local relation's parent is its slot table.
CREATE OR REPLACE VIEW shard_assignment_r AS
    SELECT
        lr.rel_id AS rel_id,
        lr.slot_rel_id AS slot_rel_id,
        (lr).slot_rel_id.schema_name AS slot_schema_name,
        remote_rel_id,
        shard_server_name,
        shard_server_schema AS shard_server_schema_name,
        template_rel_id,
        shard_template_schema AS template_schema_name,
        sa.local,
        CASE WHEN local THEN rel_id ELSE remote_rel_id END AS shard_rel_id,
        'pgwrh_replica_subscription' AS subname,
        sa.pubname,
        sa.shard_server_user,
        sa.dbname,
        host,
        port,
        connect_remote,
        retained_shard_server_name,
        retained_shard_server_schema,
        retained_remote_rel_id,
        view_schema AS view_schema_name,
        view_rel_id,
        lr.reg_class,
        lr.parent,
        lr,
        parent IS NOT NULL AND (parent).rel_id = slot_rel_id AS connected
    FROM
        fdw_shard_assignment sa
            JOIN local_rel lr ON (sa.schema_name, sa.table_name) = ((lr).rel_id.schema_name, (lr).rel_id.table_name),
        format('%s_%s', sa.schema_name, shard_server_name) AS shard_server_schema,
        format('%s_%s', sa.schema_name, retained_shard_server_name) AS retained_shard_server_schema,
        format('%s_template', sa.schema_name) AS shard_template_schema,
        format('%s_shield', sa.schema_name) AS view_schema
            CROSS JOIN LATERAL (
                SELECT
                    (shard_server_schema, (rel_id).table_name)::rel_id AS remote_rel_id,
                    (shard_template_schema, (rel_id).table_name)::rel_id AS template_rel_id,
                    (retained_shard_server_schema, (rel_id).table_name)::rel_id AS retained_remote_rel_id,
                    (view_schema, (rel_id).table_name)::rel_id AS view_rel_id
            ) AS rels;

-- Local relations that belong to one of the subscriptions listed in
-- shard_subscription and whose subscription state is 'r'.
CREATE VIEW subscribed_local_shard AS
    SELECT
        *
    FROM
        local_rel
    WHERE
        EXISTS (SELECT 1 FROM
            pg_subscription_rel sr
                JOIN pg_subscription s ON srsubid = s.oid
                JOIN shard_subscription USING (subname)
            WHERE srrelid = reg_class AND srsubstate = 'r'
        )
;

-- Indexes outside of the extension schema for which is_dependent_object()
-- (defined elsewhere) returns true.
CREATE VIEW created_index AS
    SELECT
        schema_name,
        table_name AS index_name
    FROM
        pg_index i
            JOIN rel r ON i.indexrelid = r.reg_class
    WHERE
        schema_name <> '@extschema@'
        AND is_dependent_object('pg_class'::regclass, i.indexrelid)
;

-- Foreign tables whose foreign server is listed in owned_server (defined
-- elsewhere), together with the server name.
CREATE VIEW remote_shard AS
    SELECT
        lr.*,
        s.srvname
    FROM
        local_rel lr
            JOIN pg_foreign_table ft ON ft.ftrelid = reg_class
            JOIN owned_server s ON
                s.oid = ft.ftserver
;
--------------------------------------------------------------------------------
/src/replica/status.sql:
--------------------------------------------------------------------------------
-- name: replica-status
-- requires: replica-fdw
-- requires: replica-helpers

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.

-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU Affero General Public License for more details.

-- You should have received a copy of the GNU Affero General Public License
-- along with this program.  If not, see <https://www.gnu.org/licenses/>.

-- Subscribed local shards whose parent table is their slot table
-- (same rel_id as slot_rel_id and same regclass as the shard's parent).
-- NOTE(review): index readiness mentioned in the COMMENT below is not checked
-- by this view itself; the sync view only attaches shards taken from its
-- ready_local_shard CTE, which is where missing required indexes are checked.
CREATE VIEW connected_local_shard AS
    SELECT
        ls.rel_id
    FROM
        subscribed_local_shard ls
            JOIN rel slot ON ls.slot_rel_id = slot.rel_id AND (ls).parent.reg_class = slot.reg_class
;
COMMENT ON VIEW connected_local_shard IS
$$
Local shards ready and connected to slots.

Local shard is considered ready if
* it is subscribed and its subscription state is 'r'
* all non-optional indexes are created
$$;

-- TODO maybe better would be to use pg_depend to link local and foreign tables for the same shard
-- Local relations whose slot table is the parent of some remote (foreign)
-- shard. The result is the rel_id of the local relation, not of the foreign
-- table.
-- NOTE(review): the ANALYZE condition mentioned in the COMMENT below is not
-- checked by this view itself; the sync view only attaches foreign tables
-- taken from its ready_remote_shard CTE.
CREATE VIEW connected_remote_shard AS
    SELECT
        ls.rel_id
    FROM
        remote_shard rs
            JOIN rel slot ON slot.reg_class = (rs).parent.reg_class
            JOIN local_rel ls ON ls.slot_rel_id = slot.rel_id
;
COMMENT ON VIEW connected_remote_shard IS
$$
Remote shards ready to use and connected to slots.

Remote shard is considered ready if ANALYZE was performed on corresponding foreign table.
$$;

-- Indexes of subscribed local shards, excluding indexes that back a
-- constraint (referenced by pg_constraint.conindid).
CREATE VIEW local_shard_index AS
    SELECT
        (ic).schema_name,
        (ic).table_name AS index_name
    FROM
        subscribed_local_shard ls
            JOIN pg_index i ON i.indrelid = ls.reg_class
            JOIN rel ic ON ic.reg_class = i.indexrelid
    WHERE
        NOT EXISTS (SELECT 1 FROM
            pg_constraint
            WHERE conindid = i.indexrelid
        )
;
COMMENT ON VIEW local_shard_index IS
$$
Indexes on local shards except constraint indexes.
$$;

-- Pushes the current replica state to the controller by updating
-- fdw_replica_state. Every column is set to a JSON array; an empty result is
-- reported as '[]' rather than NULL.
-- Besides the shard and index lists, `users` is set to the names of roles
-- that are members of the role named 'pgwrh_replica_<current database>'.
CREATE FUNCTION report_state() RETURNS void LANGUAGE sql AS
$$
    UPDATE "@extschema@".fdw_replica_state
    SET
        subscribed_local_shards = (SELECT coalesce((SELECT json_agg(rel_id) FROM "@extschema@".subscribed_local_shard), '[]')),
        connected_local_shards = (SELECT coalesce((SELECT json_agg(rel_id) FROM "@extschema@".connected_local_shard), '[]')),
        connected_remote_shards = (SELECT coalesce((SELECT json_agg(rel_id) FROM "@extschema@".connected_remote_shard), '[]')),
        indexes = (SELECT coalesce((SELECT json_agg(i) FROM "@extschema@".local_shard_index i), '[]')),
        users = (SELECT coalesce((SELECT json_agg(u.rolname)
            FROM pg_roles u
                JOIN pg_auth_members ON member = u.oid
                JOIN pg_roles gr ON
                    gr.oid = roleid
                    AND gr.rolname = format('pgwrh_replica_%s', current_database())),
            '[]'));
$$;
COMMENT ON FUNCTION report_state() IS
$$
Updates controller with information about current state of a replica.
# Details
Function performs UPDATE on controller replica_state view setting
subscribed_local_shards, connected_local_shards, connected_remote_shards, indexes
columns to JSON arrays containing lists of tables and indexes having
corresponding state.
$$;
--------------------------------------------------------------------------------
/src/replica/sync.sql:
--------------------------------------------------------------------------------
-- name: replica-sync
-- requires: replica-tables
-- requires: replica-helpers
-- requires: replica-fdw

-- pgwrh
-- Copyright (C) 2024 Michal Kleczek

-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
13 | 14 | -- This program is distributed in the hope that it will be useful, 15 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | -- GNU Affero General Public License for more details. 18 | 19 | -- You should have received a copy of the GNU Affero General Public License 20 | -- along with this program. If not, see . 21 | 22 | CREATE OR REPLACE VIEW sync(async, transactional, description, commands) AS 23 | WITH shard_assignment AS MATERIALIZED ( 24 | SELECT * FROM shard_assignment_r 25 | ), 26 | local_shard AS ( 27 | SELECT * FROM shard_assignment WHERE local 28 | ), 29 | slot_schema AS ( 30 | SELECT DISTINCT slot_schema_name FROM shard_assignment 31 | ), 32 | template_schema AS ( 33 | SELECT DISTINCT template_schema_name FROM shard_assignment 34 | ), 35 | view_schema AS ( 36 | SELECT DISTINCT view_schema_name FROM shard_assignment 37 | ), 38 | shard_structure AS MATERIALIZED ( 39 | SELECT * FROM fdw_shard_structure 40 | ), 41 | shard_schema AS ( 42 | SELECT DISTINCT schema_name FROM shard_structure 43 | ), 44 | shard_server AS ( 45 | SELECT DISTINCT 46 | shard_server_name, 47 | shard_server_schema_name, 48 | host, 49 | port, 50 | dbname, 51 | shard_server_user 52 | FROM 53 | shard_assignment 54 | WHERE 55 | shard_server_name IS NOT NULL 56 | ), 57 | shard_server_schema AS ( 58 | SELECT DISTINCT shard_server_schema_name 59 | FROM shard_assignment 60 | WHERE shard_server_name IS NOT NULL 61 | ), 62 | server_host_port AS ( 63 | SELECT 64 | s.*, 65 | host, 66 | port 67 | FROM 68 | pg_foreign_server s, 69 | LATERAL ( 70 | SELECT h.value AS host, p.value AS port 71 | FROM opts(srvoptions) AS h, opts(srvoptions) AS p 72 | WHERE h.key = 'host' AND p.key = 'port' 73 | ) AS opts 74 | ), 75 | owned_namespace AS ( 76 | SELECT 77 | n.* 78 | FROM 79 | pg_namespace n JOIN owned_obj ON classid = 'pg_namespace'::regclass AND objid = n.oid 80 | ), 81 | owned_subscription AS ( 82 | SELECT * 
FROM pg_subscription s JOIN shard_subscription USING (subname) 83 | ), 84 | shard_index AS ( 85 | SELECT 86 | reg_class, 87 | rel_id, 88 | si.* 89 | FROM 90 | fdw_shard_index si 91 | JOIN local_rel lr ON (si.schema_name, si.table_name) = ((rel_id).schema_name, (rel_id).table_name) 92 | ), 93 | missing_index AS ( 94 | SELECT 95 | * 96 | FROM 97 | shard_index si 98 | WHERE 99 | NOT EXISTS ( 100 | SELECT 1 FROM pg_index i JOIN pg_class ic ON i.indexrelid = ic.oid 101 | WHERE 102 | i.indrelid = si.reg_class AND 103 | ic.relname = si.index_name 104 | ) 105 | ), 106 | missing_required_index AS ( 107 | SELECT 108 | * 109 | FROM 110 | missing_index 111 | WHERE 112 | NOT optional 113 | ), 114 | ready_remote_shard AS ( 115 | SELECT 116 | * 117 | FROM 118 | remote_shard 119 | WHERE 120 | EXISTS (SELECT 1 FROM 121 | pg_statistic s 122 | WHERE s.starelid = reg_class 123 | ) 124 | OR 125 | EXISTS (SELECT 1 FROM 126 | analyzed_remote_pg_class 127 | WHERE oid = reg_class 128 | ) 129 | ), 130 | ready_local_shard AS ( 131 | SELECT 132 | * 133 | FROM 134 | subscribed_local_shard s 135 | WHERE 136 | NOT EXISTS ( 137 | SELECT 1 FROM missing_required_index 138 | WHERE 139 | reg_class = s.reg_class 140 | ) 141 | ), 142 | roles AS ( 143 | SELECT * FROM fdw_credentials 144 | ), 145 | scripts (async, transactional, description, commands) AS ( 146 | SELECT 147 | FALSE, 148 | TRUE, 149 | format('Found schemas [%s] to create.', 150 | string_agg(format('%I', schema_name), ', ')), 151 | array_agg(format('CREATE SCHEMA IF NOT EXISTS %I', schema_name)) 152 | || 153 | array_agg(select_add_ext_dependency('pg_namespace', format('%L::regnamespace', schema_name))) 154 | FROM 155 | ( 156 | SELECT schema_name FROM shard_schema 157 | UNION ALL 158 | SELECT slot_schema_name FROM slot_schema 159 | UNION ALL 160 | SELECT template_schema_name FROM template_schema 161 | UNION ALL 162 | SELECT view_schema_name FROM view_schema 163 | ) s(schema_name) 164 | WHERE NOT EXISTS (SELECT 1 FROM 165 | pg_namespace 166 | 
WHERE nspname = schema_name 167 | ) 168 | GROUP BY 1, 2 169 | 170 | UNION ALL 171 | SELECT 172 | FALSE, 173 | TRUE, 174 | format('Found tables [%s] to create.', 175 | string_agg(format('%I.%I', schema_name, table_name), ', ')), 176 | array_agg(create_table ORDER BY level) 177 | || 178 | array_agg(add_ext_dependency((schema_name, table_name))) 179 | FROM 180 | shard_structure s JOIN pg_namespace n ON nspname = s.schema_name 181 | WHERE 182 | NOT EXISTS (SELECT 1 FROM local_rel WHERE (schema_name, table_name) = (s.schema_name, s.table_name)) 183 | GROUP BY 1, 2 -- make sure we produce empty set when no results 184 | 185 | UNION ALL 186 | -- CLEANUP: DROP unnecessary slot and remote (per shard server) schemas 187 | SELECT 188 | FALSE, 189 | TRUE, 190 | format('Removing unused schemas [%s]', string_agg(nspname, ', ')), 191 | ARRAY[ 192 | format('DROP SCHEMA IF EXISTS %s CASCADE', string_agg(quote_ident(nspname), ',')) 193 | ] 194 | FROM 195 | owned_namespace n 196 | WHERE 197 | n.nspname <> '@extschema@' 198 | AND NOT EXISTS ( 199 | SELECT 1 FROM shard_schema WHERE n.nspname = schema_name 200 | ) 201 | AND NOT EXISTS ( 202 | SELECT 1 FROM slot_schema WHERE n.nspname = slot_schema_name 203 | ) 204 | AND NOT EXISTS ( 205 | SELECT 1 FROM template_schema WHERE n.nspname = template_schema_name 206 | ) 207 | AND NOT EXISTS ( 208 | SELECT 1 FROM view_schema WHERE n.nspname = view_schema_name 209 | ) 210 | AND NOT EXISTS ( 211 | SELECT 1 FROM shard_assignment WHERE n.nspname IN (shard_server_schema_name, retained_shard_server_schema) 212 | ) 213 | -- Make sure not to drop schemas that contain subscribed tables 214 | -- This can happen because dropping publications from subscription 215 | -- is done in separate transaction so there is a race condition. 216 | -- Adding this condition resolves that by postponing dropping 217 | -- schemas until after publications drop. 
218 | AND NOT EXISTS (SELECT 1 FROM 219 | pg_subscription_rel JOIN pg_class c ON srrelid = c.oid 220 | WHERE 221 | c.relnamespace = n.oid 222 | ) 223 | GROUP BY 1, 2 224 | 225 | UNION ALL 226 | -- Make sure user accounts for local shards are created 227 | SELECT 228 | FALSE, 229 | TRUE, 230 | format('User accounts [%s] to access local shards need to be created.', string_agg(username, ', ')), 231 | array_agg(format('CREATE USER %I PASSWORD %L IN ROLE %I', username, password, "@extschema@".pgwrh_replica_role_name())) 232 | FROM 233 | roles 234 | WHERE 235 | NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = username) 236 | AND EXISTS (SELECT 1 FROM pg_roles WHERE rolname = "@extschema@".pgwrh_replica_role_name()) 237 | GROUP BY 1, 2 -- make sure we produce empty set when no results 238 | 239 | UNION ALL 240 | -- Clean up 241 | SELECT 242 | FALSE, 243 | TRUE, 244 | format('Dropping no longer needed roles [%s]', string_agg(u.rolname, ', ')), 245 | array_agg(format('DROP ROLE %I', u.rolname)) 246 | FROM 247 | pg_roles u 248 | JOIN pg_auth_members ON member = u.oid 249 | JOIN pg_roles gr ON gr.oid = roleid AND gr.rolname = "@extschema@".pgwrh_replica_role_name() 250 | WHERE 251 | NOT EXISTS (SELECT 1 FROM roles WHERE u.rolname = username) 252 | GROUP BY 1, 2 -- make sure we produce empty set when no results 253 | 254 | UNION ALL 255 | -- Grant USAGE on local shards view schemas 256 | SELECT 257 | FALSE, 258 | TRUE, 259 | format('Found view shard schemas [%s] without proper access rights for other replicas', string_agg(view_schema_name, ', ')), 260 | ARRAY[ 261 | format('GRANT USAGE ON SCHEMA %s TO %I', string_agg(quote_ident(view_schema_name), ', '), pgwrh_replica_role_name()) 262 | ] 263 | FROM 264 | view_schema s 265 | JOIN pg_namespace n ON n.nspname = s.view_schema_name 266 | JOIN pg_roles ON 267 | rolname = pgwrh_replica_role_name() 268 | AND NOT has_schema_privilege(rolname, n.oid, 'USAGE') 269 | 270 | GROUP BY 1, 2 271 | 272 | UNION ALL 273 | -- Create single 
table infrastructure: slot, template tables and views 274 | SELECT 275 | FALSE, 276 | TRUE, 277 | format('Found new shards [%s]. Preparing slot tables.', string_agg(reg_class::text, ', ')), 278 | array_agg( 279 | format('ALTER TABLE %s DETACH PARTITION %s', 280 | (parent).reg_class, 281 | reg_class 282 | ) 283 | ) 284 | || 285 | array_agg( 286 | format('CREATE TABLE %s PARTITION OF %s %s PARTITION BY %s', 287 | fqn(slot_rel_id), 288 | (parent).reg_class, 289 | (lr).bound, 290 | pg_get_partkeydef((parent).pc.oid) 291 | ) 292 | ) 293 | || 294 | array_agg(add_ext_dependency(slot_rel_id)) 295 | || 296 | array_agg( 297 | format('CREATE TABLE %s PARTITION OF %s %s PARTITION BY %s', 298 | fqn(template_rel_id), 299 | fqn(slot_rel_id), 300 | (lr).bound, 301 | pg_get_partkeydef((parent).pc.oid) 302 | ) 303 | ) 304 | || 305 | array_agg(add_ext_dependency(template_rel_id)) 306 | || 307 | array_agg( 308 | format('CREATE VIEW %s AS SELECT * FROM %s', fqn(view_rel_id), (lr).reg_class) 309 | ) 310 | || 311 | array_agg( 312 | format('GRANT SELECT ON %s TO %I', fqn(view_rel_id), pgwrh_replica_role_name()) 313 | ) 314 | || 315 | array_agg(add_ext_dependency(view_rel_id)) 316 | FROM 317 | shard_assignment sc 318 | JOIN pg_namespace sns ON sns.nspname = slot_schema_name 319 | JOIN pg_namespace tns ON tns.nspname = template_schema_name 320 | JOIN pg_namespace vns ON vns.nspname = view_schema_name 321 | WHERE 322 | parent IS NOT NULL 323 | AND (parent).pn.oid <> sns.oid 324 | GROUP BY 325 | 1, 2 326 | 327 | UNION ALL 328 | -- Attach ready local shards to slots replacing existing attachments if necessary 329 | -- TODO partition check constraints handling to speed up attaching local shards 330 | SELECT 331 | FALSE, 332 | TRUE, 333 | format('Attaching local shards [%s] to slots', string_agg(format('%s', ready_shard.reg_class), ', ')), 334 | array_agg(format('ALTER TABLE %s DETACH PARTITION %s', 335 | slot.reg_class, 336 | i.inhrelid::regclass 337 | ) 338 | ) FILTER (WHERE i IS NOT NULL) 339 
| || 340 | array_agg(format('ALTER TABLE %s ATTACH PARTITION %s %s', 341 | slot.reg_class, 342 | ready_shard.reg_class, 343 | slot.bound 344 | ) 345 | ) 346 | FROM 347 | shard_assignment sa 348 | JOIN local_rel slot ON slot.rel_id = sa.slot_rel_id 349 | JOIN ready_local_shard ready_shard ON sa.rel_id = ready_shard.rel_id 350 | LEFT JOIN pg_inherits i ON i.inhparent = slot.reg_class 351 | WHERE 352 | ready_shard.reg_class IS DISTINCT FROM i.inhrelid 353 | AND sa.local 354 | AND NOT sa.connect_remote 355 | GROUP BY 1, 2 356 | 357 | UNION ALL 358 | -- Attach ready remote shards to slots replacing 359 | -- existing attachments if necessary 360 | SELECT 361 | FALSE, 362 | TRUE, 363 | format('Attaching remote shards [%s] to slots', string_agg(format('%s', ready_shard.reg_class), ', ')), 364 | array_agg(format('ALTER TABLE %s DETACH PARTITION %s', 365 | slot.reg_class, 366 | i.inhrelid::regclass 367 | ) 368 | ) FILTER (WHERE i IS NOT NULL) 369 | || 370 | array_agg(format('ALTER TABLE %s ATTACH PARTITION %s %s', 371 | slot.reg_class, 372 | ready_shard.reg_class, 373 | slot.bound 374 | ) 375 | ) 376 | FROM 377 | shard_assignment sa 378 | JOIN local_rel slot ON slot.rel_id = sa.slot_rel_id 379 | JOIN ready_remote_shard ready_shard ON sa.remote_rel_id = ready_shard.rel_id 380 | LEFT JOIN pg_inherits i ON i.inhparent = slot.reg_class 381 | WHERE 382 | ready_shard.reg_class IS DISTINCT FROM i.inhrelid 383 | AND 384 | sa.connect_remote 385 | GROUP BY 1, 2 386 | 387 | UNION ALL 388 | -- Subscriptions 389 | SELECT 390 | FALSE, 391 | FALSE, 392 | format('Adding missing shards [%s] to subscription [%s]', string_agg((sc).reg_class::text, ', '), s.subname), 393 | ARRAY[ 394 | format('TRUNCATE %s', 395 | string_agg((sc).reg_class::text, ', ') 396 | ), 397 | format('ALTER SUBSCRIPTION %I ADD PUBLICATION %s WITH (copy_data = true)', 398 | s.subname, 399 | string_agg(quote_ident(sc.pubname), ', ') 400 | ) 401 | ] 402 | FROM 403 | local_shard sc JOIN owned_subscription s USING (subname) 
404 | WHERE 405 | NOT EXISTS ( 406 | SELECT 1 FROM unnest(s.subpublications) AS pub(name) 407 | WHERE pub.name = sc.pubname 408 | ) 409 | GROUP BY 410 | s.subname 411 | 412 | UNION ALL 413 | -- create missing indexes 414 | SELECT * FROM 415 | ( 416 | SELECT 417 | TRUE, 418 | TRUE, 419 | format('Creating missing index [%s] ON [%s]', index_name, reg_class), 420 | ARRAY[ 421 | format('CREATE INDEX IF NOT EXISTS %I ON %s %s', 422 | index_name, 423 | reg_class, 424 | index_template 425 | ), 426 | add_ext_dependency(((rel_id).schema_name, index_name)::rel_id) 427 | ] 428 | FROM 429 | missing_index 430 | WHERE 431 | -- there is no way to find out what index is being created 432 | -- so we only allow one concurrent indexing for any given table 433 | NOT EXISTS ( 434 | SELECT 1 FROM pg_stat_progress_create_index WHERE relid = reg_class 435 | ) 436 | LIMIT 437 | -- make sure no more than max_worker_processes/2 indexing operations at the same time 438 | greatest(0, current_setting('max_worker_processes')::int/2 - (SELECT count(*) FROM pg_stat_progress_create_index)) 439 | ) AS sub 440 | 441 | UNION ALL 442 | -- DROP indexes not defined in index_template 443 | -- make sure we do not drop constraint indexes 444 | SELECT 445 | FALSE, 446 | TRUE, 447 | format('Dropping unnecessary indexes [%s] on %s', string_agg(i.indexrelid::regclass::text, ', '), string_agg(reg_class::text, ', ')), 448 | ARRAY[ 449 | format('DROP INDEX %s', string_agg(i.indexrelid::regclass::text, ', ')) 450 | ] 451 | FROM 452 | pg_index i 453 | JOIN pg_class ic ON ic.oid = i.indexrelid 454 | JOIN shard_assignment sa ON sa.reg_class = i.indrelid 455 | WHERE 456 | NOT EXISTS (SELECT 1 FROM 457 | shard_index t 458 | WHERE ic.relname = t.index_name AND i.indrelid = reg_class 459 | ) 460 | AND NOT EXISTS (SELECT 1 FROM 461 | pg_constraint 462 | WHERE conindid = i.indexrelid 463 | ) 464 | GROUP BY 1, 2 465 | 466 | UNION ALL 467 | -- DROP subscriptions for no longer hosted shards 468 | SELECT 469 | FALSE, 470 | 
FALSE, 471 | format('Dropping subscribed publications for no longer hosted shards [%s]', string_agg(pub.name, ', ')), 472 | ARRAY[ 473 | format('ALTER SUBSCRIPTION %I DROP PUBLICATION %s', 474 | s.subname, 475 | string_agg(quote_ident(pub.name), ', ') 476 | ), 477 | -- FIXME There is a race condition here when cascade delete shard schemas 478 | ( 479 | SELECT format('TRUNCATE %s', string_agg(srrelid::regclass::text, ', ')) 480 | FROM 481 | pg_subscription_rel 482 | WHERE 483 | srsubid = s.oid 484 | AND NOT EXISTS ( 485 | SELECT 1 FROM local_shard WHERE reg_class = srrelid 486 | ) 487 | ) 488 | ] 489 | FROM 490 | owned_subscription s, unnest(s.subpublications) pub(name) 491 | WHERE 492 | NOT EXISTS ( 493 | SELECT 1 FROM local_shard WHERE pubname = pub.name 494 | ) 495 | AND pub.name NOT IN ('pgwrh_controller_ping') 496 | GROUP BY 497 | s.oid, s.subname 498 | 499 | ----- REMOTE SHARDS ------ 500 | UNION ALL 501 | -- create missing foreign servers 502 | SELECT 503 | FALSE, 504 | TRUE, 505 | format('Found foreign servers [%s] to create.', string_agg(format('%I', shard_server_name), ', ')), 506 | array_agg( 507 | format('CREATE SERVER IF NOT EXISTS %I FOREIGN DATA WRAPPER postgres_fdw OPTIONS 508 | ( host %L, port %L, dbname %L, 509 | load_balance_hosts ''random'', 510 | async_capable ''true'', 511 | updatable ''false'', 512 | truncatable ''false'', 513 | extensions %L, 514 | fdw_tuple_cost ''99999'', 515 | analyze_sampling ''system'')', 516 | shard_server_name, 517 | host, port, 518 | dbname, 519 | (SELECT string_agg(extname, ', ') FROM pg_extension) -- assume remote server has all the same extensions 520 | ) 521 | ) 522 | || 523 | array_agg( 524 | format('CREATE USER MAPPING FOR PUBLIC SERVER %I OPTIONS (user %L, password %L)', 525 | shard_server_name, 526 | username, 527 | password 528 | )) 529 | || 530 | array_agg(select_add_ext_dependency('pg_foreign_server'::regclass, 'srvname', shard_server_name)) 531 | FROM 532 | shard_server 533 | JOIN roles ON 
shard_server_user = username 534 | WHERE 535 | NOT EXISTS (SELECT 1 FROM pg_foreign_server WHERE srvname = shard_server_name) 536 | GROUP BY 1, 2 537 | 538 | UNION ALL 539 | -- create missing remote schemas 540 | SELECT 541 | FALSE, 542 | TRUE, 543 | format('Found remote schemas [%s] to create.', string_agg(shard_server_schema_name, ', ')), 544 | array_agg(format('CREATE SCHEMA IF NOT EXISTS %I', shard_server_schema_name)) 545 | || 546 | array_agg(select_add_ext_dependency('pg_namespace'::regclass, format('%L::regnamespace', shard_server_schema_name))) 547 | FROM 548 | shard_server_schema 549 | WHERE 550 | NOT EXISTS (SELECT 1 FROM pg_namespace WHERE nspname = shard_server_schema_name) 551 | GROUP BY 1, 2 552 | 553 | UNION ALL 554 | -- Create missing remote shards 555 | SELECT 556 | FALSE, 557 | TRUE, 558 | format('Creating missing remote shards [%s]', string_agg(fqn(remote_rel_id), ', ')), 559 | array_agg( 560 | format('CREATE FOREIGN TABLE %s PARTITION OF %s %s SERVER %I OPTIONS (schema_name %L)', 561 | fqn(remote_rel_id), 562 | template.reg_class, 563 | slot.bound, 564 | shard_server_name, 565 | (sa).view_schema_name 566 | ) 567 | ) 568 | || 569 | array_agg(add_ext_dependency(remote_rel_id)) 570 | || 571 | array_agg( 572 | format('ALTER TABLE %s DETACH PARTITION %s', 573 | template.reg_class, 574 | fqn(remote_rel_id) 575 | ) 576 | ) 577 | FROM 578 | shard_assignment sa 579 | JOIN local_rel template ON template.rel_id = sa.template_rel_id 580 | JOIN local_rel slot ON slot.rel_id = sa.slot_rel_id 581 | JOIN pg_namespace ns ON ns.nspname = shard_server_schema_name 582 | JOIN pg_foreign_server fs ON fs.srvname = shard_server_name 583 | WHERE 584 | NOT EXISTS (SELECT 1 FROM 585 | rel 586 | WHERE rel_id = remote_rel_id 587 | ) 588 | GROUP BY 1, 2 589 | 590 | UNION ALL 591 | -- Analyze remote shards in parallel 592 | SELECT 593 | TRUE, 594 | TRUE, 595 | format('Analyze remote shards [%s]', reg_class), 596 | ARRAY [ 597 | format('ANALYZE %s', reg_class), 598 | 
format('INSERT INTO "@extschema@".analyzed_remote_pg_class (oid) VALUES (%s) ON CONFLICT DO NOTHING', reg_class::oid) 599 | ] 600 | FROM ( 601 | SELECT 602 | rs.reg_class 603 | FROM 604 | remote_shard rs 605 | JOIN shard_assignment ON rs.rel_id IN (remote_rel_id, retained_remote_rel_id) 606 | WHERE 607 | NOT EXISTS (SELECT 1 FROM 608 | pg_statistic s 609 | WHERE s.starelid = rs.reg_class 610 | ) 611 | AND NOT EXISTS (SELECT 1 FROM 612 | analyzed_remote_pg_class 613 | WHERE oid = rs.reg_class 614 | ) 615 | AND NOT EXISTS (SELECT 1 FROM 616 | pg_stat_progress_analyze 617 | WHERE 618 | datname = current_database() 619 | AND relid = rs.reg_class 620 | ) 621 | -- run maximum 5 background analysis concurrently 622 | LIMIT greatest( 623 | 0, 624 | least( 625 | 5, 626 | current_setting('max_worker_processes')::int - 6 - (SELECT count(*) FROM pg_stat_progress_analyze WHERE datname = current_database()))) 627 | ) sub 628 | 629 | UNION ALL 630 | -- DROP remote shards no longer in use 631 | SELECT 632 | FALSE, 633 | TRUE, 634 | format('Dropping remote shards [%s] no longer in use', string_agg(reg_class::text, ', ')), 635 | ARRAY[ 636 | format('DROP FOREIGN TABLE IF EXISTS %s', string_agg(reg_class::text, ', ')) 637 | ] 638 | FROM 639 | remote_shard rs 640 | WHERE 641 | NOT EXISTS (SELECT 1 FROM 642 | shard_assignment 643 | WHERE 644 | rs.rel_id IN (remote_rel_id, retained_remote_rel_id) 645 | ) 646 | GROUP BY 1, 2 647 | 648 | UNION ALL 649 | -- Update foreign servers with updated host/port if changed 650 | SELECT 651 | FALSE, 652 | TRUE, 653 | format('Found modified host and port for server %I', srvname), 654 | ARRAY[ 655 | cmd 656 | ] 657 | FROM 658 | owned_server 659 | JOIN shard_server ON srvname = shard_server_name, 660 | update_server_options(srvname, srvoptions, host, port) AS cmd 661 | 662 | UNION ALL 663 | -- Update user mapping with updated user/pass if changed 664 | SELECT 665 | FALSE, 666 | TRUE, 667 | format('Found modified user and pass for server %I', s.srvname), 
668 | ARRAY[ 669 | cmd 670 | ] 671 | FROM 672 | owned_server s 673 | JOIN pg_user_mappings um ON um.srvid = s.oid AND um.umuser = 0 674 | JOIN shard_server ON s.srvname = shard_server_name 675 | JOIN roles ON shard_server_user = username, 676 | update_user_mapping(s.srvname, umoptions, username, password) AS cmd 677 | 678 | UNION ALL 679 | -- DROP remote servers (and all dependent objects) for non-existent remote shards 680 | SELECT 681 | FALSE, 682 | TRUE, 683 | format('Found server %s for non-existent shard. Dropping.', string_agg(srvname, ', ')), 684 | array_agg(format('DROP SERVER IF EXISTS %I CASCADE', srvname)) 685 | FROM 686 | owned_server fs 687 | WHERE 688 | fs.srvname <> 'replica_controller' 689 | AND NOT EXISTS (SELECT 1 FROM 690 | shard_assignment WHERE fs.srvname IN (shard_server_name, retained_shard_server_name) 691 | ) 692 | GROUP BY 1, 2 -- make sure we produce empty set when no results 693 | 694 | 695 | ) 696 | SELECT 697 | * 698 | FROM 699 | scripts 700 | ; 701 | -- FIXME should it be PUBLIC? 702 | GRANT SELECT ON sync TO PUBLIC; 703 | 704 | CREATE FUNCTION cleanup_analyzed_pg_class() RETURNS void LANGUAGE sql AS 705 | $$ 706 | DELETE 707 | FROM "@extschema@".analyzed_remote_pg_class ac 708 | WHERE 709 | NOT EXISTS (SELECT 1 FROM pg_class WHERE oid = ac.oid) 710 | $$; 711 | -------------------------------------------------------------------------------- /src/replica/tables.sql: -------------------------------------------------------------------------------- 1 | -- name: replica-tables 2 | 3 | -- pgwrh 4 | -- Copyright (C) 2024 Michal Kleczek 5 | 6 | -- This program is free software: you can redistribute it and/or modify 7 | -- it under the terms of the GNU Affero General Public License as published by 8 | -- the Free Software Foundation, either version 3 of the License, or 9 | -- (at your option) any later version. 
10 | 11 | -- This program is distributed in the hope that it will be useful, 12 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | -- GNU Affero General Public License for more details. 15 | 16 | -- You should have received a copy of the GNU Affero General Public License 17 | -- along with this program. If not, see . 18 | 19 | CREATE TABLE IF NOT EXISTS shard_subscription ( 20 | subname text NOT NULL PRIMARY KEY 21 | ); 22 | 23 | CREATE TABLE IF NOT EXISTS analyzed_remote_pg_class ( 24 | oid oid NOT NULL PRIMARY KEY 25 | ); 26 | -------------------------------------------------------------------------------- /test/master.sql: -------------------------------------------------------------------------------- 1 | -- pgwrh 2 | -- Copyright (C) 2024 Michal Kleczek 3 | 4 | -- This program is free software: you can redistribute it and/or modify 5 | -- it under the terms of the GNU Affero General Public License as published by 6 | -- the Free Software Foundation, either version 3 of the License, or 7 | -- (at your option) any later version. 8 | 9 | -- This program is distributed in the hope that it will be useful, 10 | -- but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | -- GNU Affero General Public License for more details. 13 | 14 | -- You should have received a copy of the GNU Affero General Public License 15 | -- along with this program. If not, see . 
16 | 17 | CREATE ROLE test_replica; 18 | 19 | CREATE SCHEMA IF NOT EXISTS test; 20 | CREATE SCHEMA IF NOT EXISTS test_shards; 21 | ALTER SCHEMA test_shards OWNER TO test_replica; 22 | 23 | --GRANT USAGE ON SCHEMA test_shards TO test_replica; 24 | --ALTER DEFAULT PRIVILEGES IN SCHEMA test_shards GRANT SELECT ON TABLES TO test_replica; 25 | 26 | CREATE TABLE test.my_data (col1 text, col2 text, col3 date) PARTITION BY RANGE (col3); 27 | 28 | CREATE OR REPLACE PROCEDURE test.create_year_shard(year int) LANGUAGE plpgsql AS 29 | $$ 30 | DECLARE 31 | r record; 32 | BEGIN 33 | EXECUTE format('CREATE TABLE test.my_data_%1$s PARTITION OF test.my_data FOR VALUES FROM (make_date(%1$s, 1, 1)) TO (make_date(%2$s, 1, 1)) PARTITION BY HASH (col2)', year, year + 1); 34 | FOR r IN 35 | SELECT 36 | format('CREATE TABLE test_shards.my_data_%1$s_%2$s PARTITION OF test.my_data_%1$s (PRIMARY KEY (col1)) FOR VALUES WITH (MODULUS 16, REMAINDER %2$s)', year, rem) stmt, 37 | format('ALTER TABLE test_shards.my_data_%1$s_%2$s OWNER TO test_replica', year, rem) own 38 | FROM generate_series(0, 15) rem 39 | LOOP 40 | EXECUTE r.stmt; 41 | EXECUTE r.own; 42 | END LOOP; 43 | END 44 | $$; 45 | 46 | DO 47 | $$ 48 | DECLARE 49 | year int; 50 | BEGIN 51 | FOR year IN 2022..2025 LOOP 52 | CALL test.create_year_shard(year); 53 | END LOOP; 54 | END 55 | $$; 56 | 57 | CREATE OR REPLACE PROCEDURE test.insert_test_data(years VARIADIC int[]) LANGUAGE sql AS 58 | $$ 59 | INSERT INTO test.my_data 60 | SELECT 'col1: ' || n, 'col2: ' || n, make_date(year, 1, 1) + n FROM unnest(years) AS year, generate_series(1, 300, 1) as n; 61 | $$; 62 | 63 | CALL test.insert_test_data(2022, 2023, 2024, 2025); 64 | 65 | INSERT INTO pgwrh.replication_group 66 | (replication_group_id, username, password) 67 | VALUES 68 | ('g1', 'u', 'p'); 69 | INSERT INTO pgwrh.sharded_table 70 | (replication_group_id, sharded_table_schema, sharded_table_name, replication_factor) 71 | VALUES 72 | ('g1', 'test', 'my_data', 20), 73 | ('g1', 'test', 
'my_data_2025', 100), 74 | ('g1', 'test', 'my_data_2024', 50); 75 | 76 | -- SELECT pgwrh.add_shard_host('g1', 'h1', 'localhost', 5533); 77 | -- SELECT pgwrh.add_shard_host('g1', 'h2', 'localhost', 5534); 78 | -- SELECT pgwrh.add_shard_host('g1', 'h3', 'localhost', 5535); 79 | -- SELECT pgwrh.add_shard_host('g1', 'h4', 'localhost', 5536); 80 | -------------------------------------------------------------------------------- /test/requirements.txt: -------------------------------------------------------------------------------- 1 | # pgwrh 2 | # Copyright (C) 2024 Michal Kleczek 3 | 4 | # This program is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU Affero General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU Affero General Public License for more details. 13 | 14 | # You should have received a copy of the GNU Affero General Public License 15 | # along with this program. If not, see . 
16 | 17 | asn1crypto==1.5.1 18 | iniconfig==2.0.0 19 | logging==0.4.9.6 20 | packaging==24.2 21 | pg8000==1.31.2 22 | pluggy==1.5.0 23 | port-for==0.7.4 24 | psutil==6.1.1 25 | pytest==8.3.4 26 | python-dateutil==2.9.0.post0 27 | scramp==1.4.5 28 | six==1.17.0 29 | testgres==1.10.3 30 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | # pgwrh 2 | # Copyright (C) 2024 Michal Kleczek 3 | 4 | # This program is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU Affero General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU Affero General Public License for more details. 13 | 14 | # You should have received a copy of the GNU Affero General Public License 15 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
import pytest
import time

from pathlib import Path

from testgres import get_new_node, scoped_config
from contextlib import ExitStack

# SQL script that seeds the master (test schema, sharded tables, replication
# group 'g1').  Resolved relative to this module so that the tests do not
# depend on the directory pytest is started from.
MASTER_SQL = Path(__file__).with_name('master.sql')


@pytest.fixture
def new_postgres_node():
    """Yield a factory ``do(name, master=False)`` that returns a started node.

    Every node created through the factory is entered into a shared
    ``ExitStack``, so all of them are exited when the fixture is torn down.
    Each node gets the ``pgwrh`` extension installed before it is returned.
    """
    with ExitStack() as stack:
        with scoped_config(use_python_logging=True):
            def do(name, master=False):
                node = get_new_node(name)
                stack.enter_context(node)
                # Only the master is initialised with allow_logical=True.
                node.init(allow_logical=master)
                (node
                    .append_conf('max_worker_processes = 100')
                    .append_conf('max_replication_slots = 100')
                    .append_conf('max_wal_senders = 100'))
                node.start()
                node.execute('CREATE EXTENSION pgwrh CASCADE')
                return node
            yield do


@pytest.fixture
def master(new_postgres_node):
    """Create the master node, seed it from ``master.sql`` and wrap it.

    Returns a small helper object exposing the operations the tests need
    (registering replicas, publishing/deleting configuration versions and
    comparing query results between master and replicas).
    """
    pg_node = new_postgres_node('master', True)
    # psql() reports failures through its return value instead of raising, so
    # an unreadable script would otherwise leave the master silently unseeded.
    code, _, err = pg_node.psql(filename=str(MASTER_SQL))
    if code != 0:
        pytest.fail(f'Could not load {MASTER_SQL} into master (exit code {code}): {err!r}')

    class Master:
        # Port the master listens on; replicas use it to reach the controller.
        port = pg_node.port

        def with_node(self, action):
            """Run ``action`` against the underlying master node."""
            action(pg_node)

        def publish_config_version(self):
            """Mark the pending configuration version of group 'g1' as ready."""
            pg_node.execute('select pgwrh.mark_pending_version_ready(\'g1\')')

        def register_replica(self, replica):
            """Create a replication user for ``replica`` and add it as a shard host.

            Both statements run in one transaction, so a failure of
            ``add_shard_host`` does not leave the user behind.  The replica
            name is used as both user name and password.

            Returns the ``(user, password)`` pair.
            """
            user = replica.name
            password = replica.name
            with pg_node.connect() as mc:
                mc.begin()
                mc.execute(f'CREATE USER {user} PASSWORD \'{password}\' REPLICATION IN ROLE test_replica;')
                mc.execute(f'SELECT pgwrh.add_shard_host(\'g1\', \'{user}\', \'localhost\', {replica.port})')
                mc.commit()
            return (replica.name, replica.name)

        def delete_pending_version(self):
            """Delete the pending configuration version of group 'g1'."""
            pg_node.execute('select pgwrh.delete_pending_version(\'g1\')')

        def assert_same_result(self, query, replicas):
            """Assert that ``query`` yields the same result on master and every replica."""
            # Run the query on the master once instead of once per replica.
            expected = query(pg_node)
            for replica in replicas:
                actual = query(replica)
                assert actual == expected, (
                    f'{replica.name} returned {actual!r}, master returned {expected!r}')

    return Master()


@pytest.fixture
def register_replicas(master):
    """Return a function that registers replicas on the master and points
    each of them at the master as its controller."""
    def do(replicas):
        for replica in replicas:
            (user, password) = master.register_replica(replica)
            replica.execute(f'SELECT pgwrh.configure_controller(\'localhost\', \'{master.port}\', \'{user}\', \'{password}\', refresh_seconds := 0)')
    return do


def poll_ready(replicas):
    """Block until ``pgwrh.replica_ready()`` holds on every replica."""
    for replica in replicas:
        replica.poll_query_until('SELECT pgwrh.replica_ready()')


def _expect_failure(action, message):
    """Call ``action`` and fail the test with ``message`` unless it raises.

    ``pytest.fail`` raises an exception derived from ``BaseException``; it is
    deliberately called outside of the ``try`` block (and only ``Exception``
    is caught) so the failure can never be swallowed by the handler.
    """
    try:
        action()
    except Exception:
        return
    pytest.fail(message)


def test_dummy(master, register_replicas, new_postgres_node):
    """End-to-end scenario: two replicas, publish, add a third, publish again."""
    replica1 = new_postgres_node('replica1')
    replica2 = new_postgres_node('replica2')

    replicas = [replica1, replica2]

    register_replicas(replicas)

    poll_ready(replicas)

    # Before the configuration version is published, querying the sharded
    # table through a replica is expected to fail.
    def count_on_replica1():
        print(f"Count: {replica1.execute('select count(*) from test.my_data')[0]}")

    _expect_failure(count_on_replica1, 'Should have failed with fdw connection error')

    master.publish_config_version()

    poll_ready(replicas)

    query = lambda r: r.execute('select count(*) from test.my_data')[0]
    master.assert_same_result(query, replicas)

    # Registering another replica is expected to be rejected until the
    # pending version is deleted.
    replica3 = new_postgres_node('replica3')
    _expect_failure(lambda: register_replicas([replica3]), 'Should faile with locked version')

    master.delete_pending_version()
    register_replicas([replica3])
    replicas.append(replica3)

    poll_ready(replicas)

    master.assert_same_result(query, replicas)

    master.publish_config_version()

    poll_ready(replicas)

    master.assert_same_result(query, replicas)