├── Cargo.toml ├── LICENSE ├── README.md ├── rustmerger_config.json └── src ├── app_state.rs ├── cli.rs ├── commands.rs ├── config.rs ├── config_validator.rs ├── core.rs ├── display.rs ├── errors.rs ├── file_utils.rs ├── lib.rs ├── logging.rs ├── main.rs ├── processing.rs ├── progress.rs └── signal_handler.rs /Cargo.toml: -------------------------------------------------------------------------------- 1 | # This section defines the package metadata for the Rust project 2 | [package] 3 | name = "rustmerger" # The name of the package 4 | version = "0.1.1" # The current version of the package 5 | edition = "2021" # The Rust edition to use (2021 edition) 6 | authors = ["Robert Pimentel @pr0b3r7 | github.com/pr0b3r7 | linkedin.com/in/pimentelrobert1 | www.hackerhermanos.com"] # Cargo expects the plural `authors` key as an array; the singular `author` is not a recognized manifest key and is ignored with a warning 7 | 8 | # This section lists the dependencies required by the project 9 | [dependencies] 10 | zip = "2.2.0" # Library for working with ZIP archives 11 | uuid = { version = "1.11.0", features = ["v4"] } # Library for generating and handling UUIDs, using version 4 12 | url = "2.5.2" # Library for URL parsing and manipulation 13 | unrar = "0.5.6" # Library for working with RAR archives 14 | tokio-util = "0.7.12" # Utilities for working with the Tokio async runtime 15 | tokio = { version = "1.36", features = ["full"] } # Tokio async runtime with full feature set 16 | thiserror = "1.0.65" # Library for deriving custom error types 17 | terminal_size = "0.4.0" # Library for getting the terminal size 18 | tempfile = "3.13" # Library for creating temporary files 19 | tar = "0.4.42" # Library for working with TAR archives 20 | signal-hook = "0.3.17" # Library for handling OS signals 21 | sha2 = "0.10.8" # Library for SHA-2 hashing 22 | sevenz-rust = "0.6.1" # Library for working with 7z archives 23 | serde_json = "1.0.132" # Library for JSON serialization and deserialization using Serde 24 | serde = { version = "1.0", features = ["derive"] } # Serde library for serialization and deserialization, with derive 
feature 25 | reqwest = { version = "0.12.9", features = ["json", "stream"] } # HTTP client library with JSON and streaming support 26 | log = "0.4.22" # Logging library 27 | lazy_static = "1.5.0" # Library for defining statics that require code to be executed at runtime 28 | indicatif = "0.17" # Library for creating progress bars and spinners 29 | hex = "0.4.3" # Library for encoding and decoding hexadecimal 30 | futures = "0.3" # Library for working with asynchronous computations 31 | env_logger = "0.11.5" # Library for logging with environment variable configuration 32 | encoding_rs = "0.8.35" # Library for encoding and decoding character sets 33 | dialoguer = "0.11.0" # Library for creating interactive command-line prompts 34 | ctrlc = { version = "3.4.5", features = ["termination"] } # Library for handling Ctrl+C signals with termination feature 35 | crossterm = "0.28.1" # Library for cross-platform terminal manipulation 36 | clap = { version = "4.4", features = ["derive"] } # Library for command-line argument parsing with derive feature 37 | chrono = { version = "0.4.38", features = ["serde"] } # Library for date and time handling with Serde support 38 | bytes = "1.8.0" # Library for working with byte buffers 39 | async-compression = { version = "0.4.17", features = ["tokio", "bzip2", "gzip", "xz"] } # Library for async compression with support for multiple formats 40 | anyhow = "1.0.91" # Library for error handling with context support 41 | sys-info = "0.9.1" # Library for system information 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 
43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". 
"Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # File Merger Tool 2 | 3 | ## Overview 4 | 5 | A robust command-line tool built in Rust that makes merging and deduplicating text files a breeze. Whether you're dealing with small files or massive datasets, this tool handles the heavy lifting with parallel processing and smart error handling. 6 | 7 | ## Key Features 8 | 9 | ### Core Functionality 10 | 11 | - **Smart File Merging**: Feed it a list of wordlist paths via `-w/--wordlists-file`, and it'll combine them into a single output file (`--output-wordlist`).
12 | - **No More Duplicates**: Uses a `HashSet` under the hood to ensure each line appears exactly once in your final output. 13 | - **Memory-Friendly**: Processes files in 10MB chunks by default, so your RAM stays happy. 14 | - **Optimized I/O**: Uses generous buffer sizes (32MB read, 16MB write) to keep things moving quickly. 15 | 16 | ### Performance Features 17 | 18 | - **Parallel Processing**: Spreads the work across 10 threads by default (but you can adjust this). 19 | - **Resource-Conscious**: Chunks files to keep memory usage in check, even with large files. 20 | - **Know What's Happening**: Shows you exactly where you are with progress bars for: 21 | - Overall progress 22 | - Current file 23 | - Deduplication status 24 | - **Your Tool, Your Rules**: Tweak buffer sizes and other settings to match your needs. 25 | 26 | ### Error Handling & Reliability 27 | 28 | - **Keeps Going**: Logs errors without stopping, because one bad file shouldn't ruin everything. 29 | - **UTF-8 Problems? No Problem**: Skips problematic lines and keeps moving. 30 | - **Checks First**: Makes sure all your input files exist and are readable before starting. 31 | - **Safe Writes**: Uses atomic writing to protect your output file from corruption. 32 | 33 | ### Resume Capability 34 | 35 | - **Never Lose Progress**: Creates checkpoint files as it works. 36 | - **Ctrl+C Friendly**: Saves its state when interrupted so you can pick up where you left off. 37 | - **Easy Resumption**: Just run the `resume <progress-file>` subcommand to continue an interrupted job. 38 | - **Knows Its Place**: Keeps track of exactly where it stopped, down to the line.
39 | 40 | ## Author 41 | 42 | Robert Pimentel 43 | 44 | - GitHub: [@pr0b3r7](https://github.com/pr0b3r7) 45 | - LinkedIn: [pimentelrobert1](https://linkedin.com/in/pimentelrobert1) 46 | - Website: [hackerhermanos.com](https://www.hackerhermanos.com) 47 | 48 | ## Dependencies 49 | 50 | This project relies on several high-quality Rust crates to provide its functionality: 51 | 52 | ### Core Dependencies 53 | 54 | - **tokio** (1.36) - Asynchronous runtime powering parallel processing 55 | - **clap** (4.4) - Command-line argument parsing 56 | - **serde** (1.0) - Serialization framework for configuration 57 | - **anyhow** (1.0.91) - Error handling with context 58 | 59 | ### File Processing 60 | 61 | - **async-compression** (0.4.17) - Handles various compression formats (bzip2, gzip, xz) 62 | - **zip** (2.2.0) - ZIP archive support 63 | - **unrar** (0.5.6) - RAR archive support 64 | - **sevenz-rust** (0.6.1) - 7z archive support 65 | - **tar** (0.4.42) - TAR archive support 66 | 67 | ### User Interface 68 | 69 | - **indicatif** (0.17) - Progress bars and spinners 70 | - **dialoguer** (0.11.0) - Interactive command prompts 71 | - **crossterm** (0.28.1) - Terminal manipulation 72 | - **terminal_size** (0.4.0) - Terminal dimensions detection 73 | 74 | ### Utilities 75 | 76 | - **chrono** (0.4.38) - Date and time handling 77 | - **uuid** (1.11.0) - Unique identifier generation 78 | - **sha2** (0.10.8) - Cryptographic hashing 79 | - **encoding_rs** (0.8.35) - Character encoding support 80 | - **sys-info** (0.9.1) - System information gathering 81 | 82 | ### Networking 83 | 84 | - **reqwest** (0.12.9) - HTTP client with streaming support 85 | - **url** (2.5.2) - URL parsing and manipulation 86 | 87 | ### Logging and Error Handling 88 | 89 | - **env_logger** (0.11.5) - Environment-based logging 90 | - **log** (0.4.22) - Logging framework 91 | - **thiserror** (1.0.65) - Custom error types 92 | 93 | ### Signal Handling 94 | 95 | - **ctrlc** (3.4.5) - Ctrl+C signal handling 96 | - 
**signal-hook** (0.3.17) - OS signal handling 97 | 98 | ## Installation 99 | 100 | ### You'll Need 101 | 102 | - Rust toolchain (1.70+) 103 | - Cargo package manager 104 | 105 | ### Getting Started 106 | 107 | 1. Grab the code: 108 | ```sh 109 | git clone https://github.com/yourusername/file-merger-tool.git 110 | cd file-merger-tool 111 | ``` 112 | 113 | 2. Build it: 114 | ```sh 115 | cargo build --release 116 | ``` 117 | 118 | 3. Want it system-wide? (Optional): 119 | ```sh 120 | sudo cp target/release/rustmerger /usr/local/bin/ 121 | ``` 122 | 123 | ## Usage 124 | 125 | ### Quick Start 126 | 127 | ```sh 128 | rustmerger merge -w input_list.txt --output-wordlist merged_output.txt 129 | ``` 130 | 131 | ### Command Reference 132 | 133 | ``` 134 | Usage: rustmerger [OPTIONS] <COMMAND> 135 | 136 | Commands: 137 | merge Merge wordlists and rules 138 | generate-config Generate configuration file 139 | guided-setup Run guided setup 140 | resume Resume interrupted operation 141 | help Print this message or the help of the given subcommand(s) 142 | 143 | Options: 144 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace) 145 | --log-level <LOG_LEVEL> [default: info] 146 | -h, --help Print help 147 | -V, --version Print version 148 | ``` 149 | 150 | #### Merge Command 151 | 152 | ``` 153 | Usage: rustmerger merge [OPTIONS] 154 | 155 | Options: 156 | -v, --verbose...
Set verbosity level (-v: debug, -vv: trace) 157 | -w, --wordlists-file Text file containing one wordlist path per line 158 | -r, --rules-file Text file containing one rule path per line 159 | --output-wordlist Destination path for merged and deduplicated wordlist 160 | --output-rules Destination path for merged and deduplicated rules 161 | -c, --config JSON configuration file with default settings 162 | --progress-file Save progress state for resume capability 163 | -d, --debug Enable detailed progress output 164 | -h, --help Print help 165 | ``` 166 | 167 | #### Generate Config Command 168 | 169 | ``` 170 | Usage: rustmerger generate-config [OPTIONS] 171 | 172 | Arguments: 173 | Destination path for configuration file 174 | 175 | Options: 176 | -t, --template Generate default configuration template 177 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace) 178 | -h, --help Print help 179 | ``` 180 | 181 | #### Guided Setup Command 182 | 183 | ``` 184 | Usage: rustmerger guided-setup [OPTIONS] 185 | 186 | Arguments: 187 | Destination path for interactive configuration 188 | 189 | Options: 190 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace) 191 | -h, --help Print help 192 | ``` 193 | 194 | #### Sample Configuration File 195 | 196 | ```json 197 | { 198 | "input_files": "/tmp/wordlists_to_merge_dev.txt", 199 | "output_files": "/tmp/merged_wordlist.txt", 200 | "threads": 90, 201 | "verbose": true, 202 | "debug": true 203 | } 204 | ``` 205 | 206 | ### Under the Hood 207 | 208 | #### How It Works 209 | 210 | The heavy lifting happens in the `FileProcessor` struct (`src/processing.rs`). Here's what makes it tick: 211 | 212 | 1. **Smart File Reading**: 213 | - Uses async I/O with `tokio` for non-blocking file access 214 | - Buffers reads to minimize system calls 215 | 216 | 2. **Reliable Error Handling**: 217 | - Logs issues but keeps going 218 | - Won't let one bad file stop the whole show 219 | 220 | 3. 
**Line-by-Line Processing**: 221 | - Handles each line individually 222 | - Gracefully skips UTF-8 issues 223 | 224 | 4. **Progress Tracking**: 225 | - Keeps tabs on processed files 226 | - Makes resuming interrupted jobs seamless 227 | 228 | #### Performance Tricks 229 | 230 | 1. **Parallel Power**: 231 | - Spreads work across multiple threads (default: 10) 232 | - Built on `tokio` for efficient async processing 233 | 234 | 2. **Smart Deduplication**: 235 | - Uses `HashSet` for O(1) lookups 236 | - Keeps memory usage in check 237 | 238 | 3. **Visual Feedback**: 239 | - Real-time progress bars 240 | - Shows you exactly what's happening 241 | 242 | 4. **Interruption-Proof**: 243 | - Handles Ctrl+C gracefully 244 | - Saves progress for later 245 | - Managed by `AppState` in `src/app_state.rs` 246 | 247 | 5. **Flexible Configuration**: 248 | - JSON config support via `--config ` 249 | - Interactive setup with `--guided-setup` 250 | 251 | This tool is built to be reliable, efficient, and adaptable to your needs. Whether you're merging a few files or processing thousands, it's got you covered. 
-------------------------------------------------------------------------------- /rustmerger_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_files": "/tmp/wordlists_to_merge_dev.txt", 3 | "output_files": "/tmp/merged_wordlist.txt", 4 | "threads": 90, 5 | "verbose": true, 6 | "debug": true 7 | } -------------------------------------------------------------------------------- /src/app_state.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; // Importing Result type from anyhow crate for error handling 2 | use std::path::PathBuf; // Importing PathBuf to handle file paths 3 | use tokio::sync::RwLock; // Importing RwLock from tokio for async read-write lock 4 | use std::sync::Arc; // Importing Arc for atomic reference counting 5 | use crate::progress::Progress; // Importing Progress struct from the local crate 6 | 7 | #[allow(dead_code)] 8 | // AppState struct holds the state of the application 9 | pub struct AppState { 10 | pub input_file: PathBuf, // Path to the input file 11 | pub output_file: PathBuf, // Path to the output file 12 | pub threads: usize, // Number of threads to use for processing 13 | pub progress: Arc>, // Progress tracking wrapped in an async read-write lock and atomic reference counter 14 | pub shutdown_requested: Arc>, // Flag to indicate if shutdown is requested, wrapped in an async read-write lock and atomic reference counter 15 | } 16 | 17 | impl AppState { 18 | // Asynchronous function to create a new AppState instance 19 | pub async fn new(input_file: PathBuf, output_file: PathBuf, threads: usize) -> Result { 20 | Ok(Self { 21 | input_file, // Set input file path 22 | output_file, // Set output file path 23 | threads, // Set number of threads 24 | progress: Arc::new(RwLock::new(Progress::default())), // Initialize progress with default value, wrapped in Arc and RwLock 25 | shutdown_requested: Arc::new(RwLock::new(false)), 
// Initialize shutdown_requested to false, wrapped in Arc and RwLock 26 | }) 27 | } 28 | 29 | // Asynchronous function to create an AppState instance from a resume file 30 | pub async fn from_resume(resume_file: PathBuf) -> Result { 31 | let progress = Progress::load(&resume_file).await?; // Load progress from the resume file 32 | Ok(Self { 33 | input_file: progress.input_file.clone(), // Set input file path from progress 34 | output_file: progress.output_file.clone(), // Set output file path from progress 35 | threads: progress.threads, // Set number of threads from progress 36 | progress: Arc::new(RwLock::new(progress)), // Wrap loaded progress in Arc and RwLock 37 | shutdown_requested: Arc::new(RwLock::new(false)), // Initialize shutdown_requested to false, wrapped in Arc and RwLock 38 | }) 39 | } 40 | 41 | // Asynchronous function to save the current progress 42 | pub async fn save_progress(&self) -> Result<()> { 43 | let progress = self.progress.read().await; // Acquire read lock on progress 44 | progress.save().await // Save the progress 45 | } 46 | 47 | // Asynchronous function to request shutdown 48 | pub async fn request_shutdown(&self) { 49 | *self.shutdown_requested.write().await = true; // Acquire write lock and set shutdown_requested to true 50 | } 51 | 52 | // Asynchronous function to check if shutdown is requested 53 | pub async fn should_shutdown(&self) -> bool { 54 | *self.shutdown_requested.read().await // Acquire read lock and return the value of shutdown_requested 55 | } 56 | } -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | // Import required dependencies 2 | use clap::{Parser, Subcommand}; // For command-line argument parsing 3 | use std::path::PathBuf; // For handling file paths 4 | use log::LevelFilter; // For controlling log levels 5 | 6 | // Main CLI structure that defines the application's command-line 
interface 7 | #[derive(Parser)] 8 | #[command( 9 | name = "rustmerger", 10 | about = "Fast parallel merging and deduplication of wordlists and rules", 11 | version, 12 | author, 13 | long_about = None 14 | )] 15 | pub struct Cli { 16 | // Global verbose flag that can be used multiple times (-v, -vv, etc.) 17 | // Each occurrence increases the verbosity level 18 | #[arg( 19 | global = true, // Available to all subcommands 20 | short = 'v', // Can be used as -v 21 | long = "verbose", // Can be used as --verbose 22 | action = clap::ArgAction::Count, // Counts number of occurrences 23 | help = "Set verbosity level (-v: debug, -vv: trace)" 24 | )] 25 | verbose: u8, 26 | 27 | #[command(subcommand)] 28 | pub command: Commands, 29 | 30 | #[arg(long, default_value = "info")] 31 | log_level: String, 32 | } 33 | 34 | // Enum defining all available subcommands 35 | #[derive(Subcommand)] 36 | pub enum Commands { 37 | // Merge subcommand for combining wordlists and rules 38 | #[command(about = "Merge wordlists and rules")] 39 | Merge(MergeArgs), 40 | 41 | // Generate configuration file subcommand 42 | #[command(about = "Generate configuration file")] 43 | GenerateConfig(GenerateConfigArgs), 44 | 45 | // Interactive setup subcommand 46 | #[command(about = "Run guided setup")] 47 | GuidedSetup(GuidedSetupArgs), 48 | 49 | // Resume interrupted operations subcommand 50 | #[command(about = "Resume interrupted operation")] 51 | Resume(ResumeArgs), 52 | } 53 | 54 | // Structure defining all possible arguments for the merge command 55 | #[derive(Parser, Clone)] 56 | pub struct MergeArgs { 57 | // Input file containing list of wordlist paths 58 | #[arg( 59 | short = 'w', 60 | long = "wordlists-file", 61 | help = "Text file containing one wordlist path per line", 62 | value_name = "FILE" 63 | )] 64 | pub wordlists_file: Option, 65 | 66 | // Input file containing list of rule paths 67 | #[arg( 68 | short = 'r', 69 | long = "rules-file", 70 | help = "Text file containing one rule path per 
line", 71 | value_name = "FILE" 72 | )] 73 | pub rules_file: Option, 74 | 75 | // Output path for merged wordlist 76 | #[arg( 77 | long = "output-wordlist", 78 | help = "Destination path for merged and deduplicated wordlist", 79 | value_name = "FILE" 80 | )] 81 | pub output_wordlist: Option, 82 | 83 | // Output path for merged rules 84 | #[arg( 85 | long = "output-rules", 86 | help = "Destination path for merged and deduplicated rules", 87 | value_name = "FILE" 88 | )] 89 | pub output_rules: Option, 90 | 91 | // Configuration file path 92 | #[arg( 93 | short = 'c', 94 | long = "config", 95 | help = "JSON configuration file with default settings", 96 | value_name = "FILE" 97 | )] 98 | pub config: Option, 99 | 100 | // Progress state file for resume capability 101 | #[arg( 102 | long = "progress-file", 103 | help = "Save progress state for resume capability", 104 | value_name = "FILE" 105 | )] 106 | pub progress_file: Option, 107 | 108 | // Debug mode flag 109 | #[arg( 110 | short = 'd', 111 | long = "debug", 112 | help = "Enable detailed progress output" 113 | )] 114 | pub debug: bool, 115 | } 116 | 117 | // Arguments for the generate-config command 118 | #[derive(Parser, Clone)] 119 | pub struct GenerateConfigArgs { 120 | // Output path for the configuration file 121 | #[arg( 122 | help = "Destination path for configuration file", 123 | value_name = "FILE" 124 | )] 125 | pub output: PathBuf, 126 | 127 | // Flag to generate template configuration 128 | #[arg( 129 | short = 't', 130 | long = "template", 131 | help = "Generate default configuration template" 132 | )] 133 | pub template: bool, 134 | } 135 | 136 | // Arguments for the guided-setup command 137 | #[derive(Parser, Clone)] 138 | pub struct GuidedSetupArgs { 139 | // Output path for the generated configuration 140 | #[arg( 141 | help = "Destination path for interactive configuration", 142 | value_name = "FILE" 143 | )] 144 | pub output: PathBuf, 145 | } 146 | 147 | // Arguments for the resume command 148 | 
#[derive(Parser, Clone)] 149 | pub struct ResumeArgs { 150 | // Path to the progress state file 151 | #[arg( 152 | help = "Path to saved progress state file", 153 | value_name = "FILE" 154 | )] 155 | pub progress_file: PathBuf, 156 | } 157 | 158 | // Implementation of helper methods for the Cli struct 159 | impl Cli { 160 | // Convert verbose flag count to appropriate log level 161 | pub fn log_level(&self) -> LevelFilter { 162 | match self.log_level.as_str() { 163 | "error" => LevelFilter::Error, 164 | "warn" => LevelFilter::Warn, 165 | "info" => LevelFilter::Info, 166 | "debug" => LevelFilter::Debug, 167 | "trace" => LevelFilter::Trace, 168 | _ => LevelFilter::Info, 169 | } 170 | } 171 | 172 | // Add this new method 173 | pub fn verbose_count(&self) -> u8 { 174 | self.verbose 175 | } 176 | } -------------------------------------------------------------------------------- /src/commands.rs: -------------------------------------------------------------------------------- 1 | // Import required dependencies 2 | use anyhow::Result; // For error handling 3 | use std::path::PathBuf; // For file path operations 4 | use std::sync::Arc; // For thread-safe reference counting 5 | use log::{info, warn}; // For logging 6 | use crate::errors::{MergerError, MergerResult}; 7 | 8 | // Import local modules 9 | use crate::{ 10 | app_state::AppState, // Application state management 11 | config::Config, // Configuration handling 12 | core::ProcessingCore, // Core processing logic 13 | cli::{Cli, MergeArgs, GenerateConfigArgs, GuidedSetupArgs, ResumeArgs}, // CLI arguments 14 | signal_handler::SignalHandler, // Add this with other imports 15 | }; 16 | 17 | // Command handler for processing CLI commands 18 | pub struct CommandHandler; 19 | 20 | impl CommandHandler { 21 | // Handle the merge command - combines wordlists and rules 22 | pub async fn handle_merge(cli: &Cli, args: MergeArgs) -> Result<()> { 23 | info!("Starting merge operation"); 24 | 25 | // Load existing config or create 
default template 26 | let config = if let Some(config_path) = args.config { 27 | Config::load(&config_path).await? 28 | } else { 29 | Config::default() 30 | }; 31 | 32 | // Create thread-safe application state 33 | let app_state = Arc::new(AppState::new( 34 | args.wordlists_file 35 | .or(config.input_files) 36 | .ok_or_else(|| anyhow::anyhow!("No wordlists file specified"))?, 37 | args.output_wordlist 38 | .or(config.output_files) 39 | .ok_or_else(|| anyhow::anyhow!("No output file specified"))?, 40 | if let Some(threads) = config.threads { 41 | threads 42 | } else { 43 | 10 // Default to 10 threads if not specified 44 | } 45 | ).await?); 46 | 47 | // Fix debug and verbose settings 48 | let debug_enabled = args.debug || config.debug; // Enable debug if specified in args or config 49 | let verbose_enabled = cli.verbose_count() > 0 || config.verbose; // Enable verbose if specified in CLI or config 50 | 51 | // Set up signal handler 52 | let signal_handler = SignalHandler::new(app_state.clone())?; 53 | signal_handler.setup_handlers()?; 54 | 55 | // Create processing core and start processing 56 | let mut core = ProcessingCore::new( 57 | app_state.clone(), 58 | debug_enabled, 59 | verbose_enabled 60 | ).await?; 61 | 62 | if let Err(e) = core.process().await { 63 | warn!("Error during processing: {}", e); 64 | } 65 | 66 | info!("Merge operation completed"); 67 | Ok(()) 68 | } 69 | 70 | // Handle configuration file generation 71 | pub async fn handle_generate_config(args: GenerateConfigArgs) -> Result<()> { 72 | info!("Generating configuration file"); 73 | 74 | // Create default template config 75 | let config = if args.template { 76 | Config::template() 77 | } else { 78 | Config::template() 79 | }; 80 | 81 | // Save configuration to specified path 82 | config.save(&args.output).await?; 83 | 84 | info!("Configuration file generated at: {:?}", args.output); 85 | Ok(()) 86 | } 87 | 88 | // Handle interactive setup process 89 | pub async fn handle_guided_setup(args: 
GuidedSetupArgs) -> Result<()> { 90 | info!("Starting guided setup"); 91 | 92 | // Run interactive configuration 93 | let config = Config::guided_setup().await?; 94 | config.save(&args.output).await?; 95 | 96 | info!("Configuration saved to: {:?}", args.output); 97 | Ok(()) 98 | } 99 | 100 | // Handle resuming from a previous state 101 | #[allow(dead_code)] 102 | pub async fn handle_resume(args: ResumeArgs) -> Result<()> { 103 | info!("Resuming from progress file: {:?}", args.progress_file); 104 | 105 | // Create application state with default values 106 | let app_state = Arc::new(AppState::new( 107 | args.progress_file.clone(), 108 | PathBuf::from("/tmp/output.txt"), // Default output path 109 | 10 // Default threads 110 | ).await?); 111 | 112 | // Initialize processing core with minimal logging 113 | let mut core = ProcessingCore::new( 114 | app_state.clone(), 115 | false, // Debug disabled 116 | false // Verbose disabled 117 | ).await?; 118 | 119 | // Resume processing and handle errors 120 | if let Err(e) = core.process().await { 121 | warn!("Error during processing: {}", e); 122 | } 123 | 124 | info!("Resume operation completed"); 125 | Ok(()) 126 | } 127 | } -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | // Import required dependencies 2 | use serde::{Serialize, Deserialize}; // For JSON serialization/deserialization 3 | use std::path::PathBuf; // For file path handling 4 | use anyhow::Result; // For error handling 5 | use tokio::fs; // For async file operations 6 | use dialoguer::{Input, Confirm}; // For interactive CLI prompts 7 | use anyhow::Context; 8 | use std::io::{BufReader, BufWriter, BufRead, Write}; 9 | use crate::errors::{MergerError, MergerResult, ConfigError}; 10 | 11 | // Configuration structure that can be serialized to/from JSON 12 | #[derive(Debug, Serialize, Deserialize)] 13 | pub struct Config { 14 | pub 
input_files: Option, // Path to file containing list of input files 15 | pub output_files: Option, // Path where merged output will be written 16 | pub threads: Option, // Number of parallel processing threads 17 | pub verbose: bool, // Enable detailed logging 18 | pub debug: bool, // Enable debug mode 19 | } 20 | 21 | impl Default for Config { 22 | fn default() -> Self { 23 | Self { 24 | input_files: None, 25 | output_files: None, 26 | threads: Some(10), 27 | verbose: true, 28 | debug: true, 29 | } 30 | } 31 | } 32 | 33 | impl Config { 34 | // Load configuration from a JSON file 35 | pub async fn load(path: &PathBuf) -> MergerResult { 36 | let content = fs::read_to_string(path).await 37 | .map_err(MergerError::Io)?; 38 | serde_json::from_str(&content) 39 | .map_err(|e| MergerError::Config(ConfigError::InvalidFormat(e.to_string()))) 40 | } 41 | 42 | // Save configuration to a JSON file 43 | pub async fn save(&self, path: &PathBuf) -> MergerResult<()> { 44 | let content = serde_json::to_string_pretty(self) 45 | .map_err(|e| MergerError::Config(ConfigError::SerializationError(e.to_string())))?; 46 | fs::write(path, content).await 47 | .map_err(MergerError::Io) 48 | } 49 | 50 | // Create a default configuration template 51 | pub fn template() -> Self { 52 | Self { 53 | input_files: None, 54 | output_files: None, 55 | threads: Some(10), 56 | verbose: true, 57 | debug: true, 58 | } 59 | } 60 | 61 | // Interactive configuration setup using command-line prompts 62 | pub async fn guided_setup() -> MergerResult { 63 | // Prompt for input files path with default value 64 | let input_files: String = Input::new() 65 | .with_prompt("Enter path to input files list") 66 | .default("/tmp/wordlists_to_merge.txt".into()) 67 | .interact()?; 68 | 69 | // Prompt for output file path with default value 70 | let output_files: String = Input::new() 71 | .with_prompt("Enter path for output file") 72 | .default("/tmp/merged_wordlist.txt".into()) 73 | .interact()?; 74 | 75 | // Prompt for 
number of processing threads 76 | let threads: String = Input::new() 77 | .with_prompt("Enter number of threads") 78 | .default("50".into()) 79 | .interact()?; 80 | 81 | // Confirm whether to enable verbose logging 82 | let verbose = Confirm::new() 83 | .with_prompt("Enable verbose logging?") 84 | .default(true) 85 | .interact()?; 86 | 87 | // Confirm whether to enable debug mode 88 | let debug = Confirm::new() 89 | .with_prompt("Enable debug logging?") 90 | .default(false) 91 | .interact()?; 92 | 93 | // Parse threads with proper error handling 94 | let threads = threads.parse::() 95 | .map_err(|_| MergerError::Config(ConfigError::InvalidThreadCount(0)))?; 96 | 97 | if threads == 0 || threads > 100 { 98 | return Err(MergerError::Config(ConfigError::InvalidThreadCount(threads))); 99 | } 100 | 101 | // Create and return configuration with user-provided values 102 | Ok(Self { 103 | input_files: Some(PathBuf::from(input_files)), 104 | output_files: Some(PathBuf::from(output_files)), 105 | threads: Some(threads), 106 | verbose, 107 | debug, 108 | }) 109 | } 110 | 111 | // Replace the existing validate method with this implementation 112 | pub fn validate(&self) -> Result<(), ConfigError> { 113 | // Validate thread count 114 | if let Some(threads) = self.threads { 115 | if threads == 0 || threads > 100 { 116 | return Err(ConfigError::InvalidThreadCount(threads)); 117 | } 118 | } 119 | 120 | // Validate input files path exists 121 | let input_path = self.input_files 122 | .as_ref() 123 | .ok_or(ConfigError::MissingInputFiles)?; 124 | 125 | if !input_path.exists() { 126 | return Err(ConfigError::InputFileNotFound(input_path.clone())); 127 | } 128 | 129 | // Validate output files path 130 | let output_path = self.output_files 131 | .as_ref() 132 | .ok_or(ConfigError::MissingOutputFiles)?; 133 | 134 | // Check if input and output paths are the same 135 | if input_path == output_path { 136 | return Err(ConfigError::InputOutputPathsEqual); 137 | } 138 | 139 | // Validate 
output directory exists and is writable 140 | if let Some(parent) = output_path.parent() { 141 | if !parent.exists() { 142 | return Err(ConfigError::OutputDirectoryNotWritable(parent.to_path_buf())); 143 | } 144 | 145 | // Check if directory is writable by attempting to create a temporary file 146 | if let Ok(temp_path) = tempfile::Builder::new() 147 | .prefix(".test-write-") 148 | .tempfile_in(parent) 149 | { 150 | // Clean up temporary file 151 | let _ = temp_path.close(); 152 | } else { 153 | return Err(ConfigError::OutputDirectoryNotWritable(parent.to_path_buf())); 154 | } 155 | } 156 | 157 | Ok(()) 158 | } 159 | } -------------------------------------------------------------------------------- /src/config_validator.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; // Importing Context and Result from the anyhow crate for error handling 2 | use std::path::Path; // Importing Path from the standard library for file path handling 3 | use crate::Config; // Importing the Config struct from the current crate 4 | 5 | // Define a struct for configuration validation 6 | pub struct ConfigValidator; 7 | 8 | impl ConfigValidator { 9 | // Function to validate the entire configuration 10 | pub fn validate_config(config: &Config) -> Result<()> { 11 | // Validate input files path 12 | Self::validate_input_file(&config.input_files) 13 | .context("Invalid input files configuration")?; 14 | 15 | // Validate output files path 16 | if let Some(parent) = config.output_files.parent() { 17 | Self::validate_directory(parent) 18 | .context("Invalid output directory")?; 19 | } 20 | 21 | // Validate thread count 22 | if config.threads == 0 { 23 | return Err(anyhow::anyhow!("Thread count must be greater than 0")); 24 | } 25 | 26 | Ok(()) 27 | } 28 | 29 | // Function to validate an input file path 30 | fn validate_input_file(path: &Path) -> Result<()> { 31 | // Check if the file exists 32 | if !path.exists() { 33 | return 
Err(anyhow::anyhow!("File does not exist: {:?}", path)); 34 | } 35 | // Check if the path is a file 36 | if !path.is_file() { 37 | return Err(anyhow::anyhow!("Path is not a file: {:?}", path)); 38 | } 39 | Ok(()) 40 | } 41 | 42 | // Function to validate a directory path 43 | fn validate_directory(path: &Path) -> Result<()> { 44 | // Check if the path exists and is a directory 45 | if path.exists() && !path.is_dir() { 46 | return Err(anyhow::anyhow!("Path exists but is not a directory: {:?}", path)); 47 | } 48 | Ok(()) 49 | } 50 | } -------------------------------------------------------------------------------- /src/core.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; // Import Result type from anyhow crate for error handling 2 | use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; // Import progress bar utilities from indicatif crate 3 | use std::path::{Path, PathBuf}; // Import Path and PathBuf for file path handling 4 | use std::sync::Arc; // Import Arc for thread-safe reference counting 5 | use tokio::io::{AsyncWriteExt, AsyncSeekExt, BufWriter, BufReader, AsyncBufReadExt}; // Remove duplicate AsyncWriteExt 6 | use tokio::fs::File; 7 | use futures::StreamExt; 8 | use std::collections::HashSet; 9 | use tokio::sync::mpsc; 10 | use futures::stream::{self, FuturesUnordered}; 11 | use crate::app_state::AppState; 12 | use serde::{Serialize, Deserialize}; 13 | use tokio::fs::OpenOptions; 14 | use tokio::io::SeekFrom; 15 | use sys_info; 16 | use crate::progress::ProgressTracker; 17 | use std::sync::atomic::{AtomicUsize, Ordering}; 18 | use crate::errors::{MergerError, MergerResult}; 19 | 20 | const CHUNK_SIZE: usize = 1024 * 1024 * 10; // 10MB chunks 21 | const BUFFER_SIZE: usize = 1024 * 1024 * 32; // 32MB buffer 22 | const CHANNEL_SIZE: usize = 1000; // Number of chunks to keep in memory 23 | const PARALLEL_FILES: usize = 4; // Number of files to process in parallel 24 | const LINE_BUFFER_CAPACITY: 
usize = 1024 * 64; // 64KB initial line buffer 25 | const OUTPUT_CHUNKS: usize = 8; 26 | const OUTPUT_BUFFER_SIZE: usize = 1024 * 1024 * 16; // 16MB output buffer 27 | 28 | // Define a struct to manage the core processing logic 29 | #[allow(dead_code)] 30 | pub struct ProcessingCore { 31 | app_state: Arc, // Shared application state 32 | tracker: ProgressTracker, // Replace progress: MultiProgress with tracker 33 | verbose: bool, // Flag to enable verbose logging 34 | debug: bool, // Flag to enable debug mode 35 | } 36 | 37 | // Implement methods for ProcessingCore 38 | impl ProcessingCore { 39 | // Asynchronous constructor for ProcessingCore 40 | pub async fn new(app_state: Arc, verbose: bool, debug: bool) -> MergerResult { 41 | // Estimate total files and lines 42 | let input_file = &app_state.input_file; 43 | let content = tokio::fs::read_to_string(input_file).await?; 44 | let total_files = content.lines().count(); 45 | 46 | // Rough estimation of lines (can be adjusted based on your needs) 47 | let estimated_lines = total_files * 1000; // Assuming average 1000 lines per file 48 | 49 | Ok(Self { 50 | app_state, 51 | tracker: ProgressTracker::new(total_files, estimated_lines), 52 | verbose, 53 | debug, 54 | }) 55 | } 56 | 57 | // Main processing function 58 | pub async fn process(&mut self) -> MergerResult<()> { 59 | if self.verbose { 60 | println!("Starting the processing of files..."); 61 | } 62 | 63 | let input_path = self.app_state.input_file.clone(); 64 | let files = match Self::read_input_files(&input_path).await { 65 | Ok(f) => f, 66 | Err(e) => { 67 | self.log_error(&format!("Failed to read input files: {}", e)).await?; 68 | return Ok(()); 69 | } 70 | }; 71 | 72 | let mut files_processed = 0; 73 | let app_state = Arc::clone(&self.app_state); 74 | 75 | for file in files { 76 | if app_state.should_shutdown().await { 77 | self.tracker.finish(); 78 | return Ok(()); 79 | } 80 | 81 | let file_path = file.clone(); 82 | let result = 
self.process_single_file(file_path.clone(), &app_state).await; 83 | if let Err(e) = result { 84 | let error_msg = format!("Error processing file {:?}: {}", file_path, e); 85 | self.log_error(&error_msg).await?; 86 | continue; 87 | } 88 | 89 | files_processed += 1; 90 | self.tracker.update_overall_progress(files_processed); 91 | } 92 | 93 | println!("Starting merge and deduplication process..."); 94 | self.merge_and_deduplicate().await?; 95 | 96 | self.tracker.finish(); 97 | println!("Processing completed successfully"); 98 | 99 | Ok(()) 100 | } 101 | 102 | // Function to merge files and remove duplicates 103 | async fn merge_and_deduplicate(&mut self) -> MergerResult<()> { 104 | let files = self.validate_and_collect_metadata(&self.app_state.progress.read().await.processed_files).await?; 105 | let optimized_files = optimize_processing_order(files).await; 106 | 107 | // Calculate optimal batch size based on available system memory 108 | let mem_info = sys_info::mem_info()?; 109 | let available_memory = (mem_info.avail as usize * 1024) / 2; 110 | let batch_size = (available_memory / std::mem::size_of::()).min(CHUNK_SIZE); 111 | 112 | let (tx, mut rx) = mpsc::channel::>(CHANNEL_SIZE); 113 | let unique_count = Arc::new(AtomicUsize::new(0)); 114 | 115 | // Spawn writer task with optimized batching 116 | let writer_task = tokio::spawn({ 117 | let unique_count = unique_count.clone(); 118 | async move { 119 | let mut final_set = HashSet::with_capacity(batch_size); 120 | 121 | while let Some(mut chunk_set) = rx.recv().await { 122 | final_set.extend(chunk_set.drain()); 123 | unique_count.store(final_set.len(), Ordering::Relaxed); 124 | } 125 | final_set 126 | } 127 | }); 128 | 129 | // Process files in parallel with optimized ordering 130 | let mut total_lines_processed = 0; 131 | 132 | // Process files in chunks 133 | for chunk in optimized_files.chunks(PARALLEL_FILES) { 134 | let tx = tx.clone(); 135 | let chunk_files = chunk.to_vec(); 136 | 137 | for file in chunk_files { 
138 | if let Ok(lines_count) = Self::process_large_file(&file, tx.clone(), batch_size).await { 139 | total_lines_processed += lines_count; 140 | let current_unique = unique_count.load(Ordering::Relaxed); 141 | self.tracker.update_dedup_progress(current_unique, total_lines_processed); 142 | } 143 | } 144 | } 145 | 146 | drop(tx); // Close the channel 147 | 148 | // Get the final set and write results 149 | let unique_lines = writer_task.await?; 150 | let file = File::create(&self.app_state.output_file).await?; 151 | let mut writer = BufWriter::with_capacity(BUFFER_SIZE, file); 152 | let total_unique = unique_lines.len(); 153 | 154 | println!("Writing {} unique lines to output file", total_unique); 155 | 156 | let mut buffer = String::with_capacity(CHUNK_SIZE); 157 | for line in unique_lines { 158 | buffer.push_str(&line); 159 | buffer.push('\n'); 160 | 161 | if buffer.len() >= CHUNK_SIZE { 162 | writer.write_all(buffer.as_bytes()).await?; 163 | buffer.clear(); 164 | } 165 | } 166 | 167 | if !buffer.is_empty() { 168 | writer.write_all(buffer.as_bytes()).await?; 169 | } 170 | 171 | writer.flush().await?; 172 | self.tracker.update_dedup_progress(total_unique, total_lines_processed); 173 | 174 | Ok(()) 175 | } 176 | 177 | // Move process_large_file into the impl block and make it an associated function 178 | async fn process_large_file( 179 | path: &PathBuf, 180 | tx: mpsc::Sender>, 181 | chunk_size: usize, 182 | ) -> MergerResult { 183 | let file = File::open(path).await?; 184 | let mut reader = BufReader::with_capacity(BUFFER_SIZE, file); 185 | let mut buffer = Vec::with_capacity(LINE_BUFFER_CAPACITY); 186 | let mut current_set = HashSet::with_capacity(chunk_size); 187 | let mut bytes_processed = 0; 188 | let mut total_lines = 0; 189 | 190 | loop { 191 | buffer.clear(); 192 | match reader.read_until(b'\n', &mut buffer).await? 
{ 193 | 0 => break, 194 | n => { 195 | bytes_processed += n; 196 | if !buffer.is_empty() { 197 | if let Ok(line) = String::from_utf8(buffer[..n-1].to_vec()) { 198 | if !line.is_empty() { 199 | current_set.insert(line); 200 | total_lines += 1; 201 | } 202 | } 203 | } 204 | } 205 | } 206 | 207 | if bytes_processed >= CHUNK_SIZE || current_set.len() >= chunk_size { 208 | tx.send(current_set).await?; 209 | current_set = HashSet::with_capacity(chunk_size); 210 | bytes_processed = 0; 211 | } 212 | } 213 | 214 | if !current_set.is_empty() { 215 | tx.send(current_set).await?; 216 | } 217 | 218 | Ok(total_lines) 219 | } 220 | 221 | // Function to read input files from the provided path 222 | async fn read_input_files(input_file: &Path) -> Result> { 223 | let content = tokio::fs::read_to_string(input_file).await?; 224 | Ok(content.lines() 225 | .map(PathBuf::from) 226 | .collect()) 227 | } 228 | 229 | // Function to process a single file 230 | async fn process_single_file(&mut self, file: PathBuf, app_state: &Arc) -> Result<()> { 231 | if app_state.should_shutdown().await { 232 | return Err(anyhow::anyhow!("Processing interrupted by shutdown signal")); // Return an error if shutdown is requested 233 | } 234 | 235 | let content = match tokio::fs::read_to_string(&file).await { 236 | Ok(content) => content, 237 | Err(e) => { 238 | self.log_error(&format!("Error reading {}: {}", file.display(), e)).await?; 239 | return Ok(()); 240 | } 241 | }; 242 | 243 | // Process the content here 244 | let mut progress = app_state.progress.write().await; // Acquire a write lock on the progress state 245 | progress.processed_files.push(file.clone()); // Add the file to the list of processed files 246 | progress.current_position += content.lines().count(); // Update the current position 247 | progress.save().await?; // Save the progress state 248 | 249 | if self.verbose { 250 | log::debug!("Processed file: {}", file.display()); // Log the processed file if verbose is enabled 251 | } 252 | 253 | 
Ok(()) 254 | } 255 | 256 | // Function to validate the input files 257 | async fn validate_files(&mut self, files: &[PathBuf]) -> Result<()> { 258 | for (i, file) in files.iter().enumerate() { 259 | if !file.exists() { 260 | self.log_error(&format!("File not found: {}", file.display())).await?; 261 | continue; 262 | } 263 | self.tracker.update_overall_progress(i + 1); 264 | } 265 | Ok(()) 266 | } 267 | 268 | // Function to log errors to a file 269 | async fn log_error(&self, message: &str) -> Result<()> { 270 | let mut file = tokio::fs::OpenOptions::new() 271 | .create(true) 272 | .append(true) 273 | .open("error.log") 274 | .await?; 275 | 276 | let error_message = format!("[{}] {}\n", 277 | chrono::Local::now().format("%Y-%m-%d %H:%M:%S"), // Get the current timestamp 278 | message 279 | ); 280 | 281 | file.write_all(error_message.as_bytes()).await?; // Write the error message to the file 282 | file.sync_all().await?; // Sync the file to ensure all data is written 283 | Ok(()) 284 | } 285 | 286 | async fn validate_and_collect_metadata(&self, files: &[PathBuf]) -> Result> { 287 | let mut valid_files = Vec::with_capacity(files.len()); 288 | 289 | // Process files in parallel batches 290 | let batch_size = 50; // Validate 50 files at a time 291 | for chunk in files.chunks(batch_size) { 292 | let futures: FuturesUnordered<_> = chunk.iter().map(|path| async move { 293 | match tokio::fs::metadata(path).await { 294 | Ok(meta) => Some((path.clone(), meta.len())), 295 | Err(e) => { 296 | eprintln!("Error accessing file {}: {}", path.display(), e); 297 | None 298 | } 299 | } 300 | }).collect(); 301 | 302 | // Collect results from this batch 303 | let batch_results: Vec<_> = futures 304 | .filter_map(|result| async move { result }) 305 | .collect() 306 | .await; 307 | 308 | // Extend valid_files with batch results 309 | valid_files.extend(batch_results); 310 | } 311 | 312 | Ok(valid_files) 313 | } 314 | } 315 | 316 | // Enum to represent different processing stages 317 | 
#[derive(Debug, Clone, Serialize, Deserialize)] 318 | pub enum ProcessingStage { 319 | Initializing, // Initializing stage 320 | ValidatingFiles, // Validating files stage 321 | ProcessingFiles, // Processing files stage 322 | Merging, // Merging stage 323 | Completed, // Completed stage 324 | Failed, // Failed stage 325 | } 326 | 327 | async fn write_chunk( 328 | lines: Vec, 329 | file: &Path, 330 | offset: u64, 331 | ) -> Result<()> { 332 | let mut file = OpenOptions::new() 333 | .write(true) 334 | .create(true) 335 | .open(file) 336 | .await?; 337 | file.seek(SeekFrom::Start(offset)).await?; 338 | let mut writer = BufWriter::with_capacity(OUTPUT_BUFFER_SIZE, file); 339 | 340 | for line in lines { 341 | writer.write_all(line.as_bytes()).await?; 342 | writer.write_all(b"\n").await?; 343 | } 344 | writer.flush().await?; 345 | Ok(()) 346 | } 347 | 348 | async fn optimize_processing_order(files: Vec<(PathBuf, u64)>) -> Vec { 349 | // Sort files by size in descending order for better memory utilization 350 | let mut sorted_files = files; 351 | sorted_files.sort_by(|a, b| b.1.cmp(&a.1)); 352 | 353 | // Group files by size ranges to process similar-sized files together 354 | let mut optimized = Vec::with_capacity(sorted_files.len()); 355 | let mut small = Vec::new(); 356 | let mut medium = Vec::new(); 357 | let mut large = Vec::new(); 358 | 359 | for (path, size) in sorted_files { 360 | match size { 361 | s if s < 1024 * 1024 * 100 => small.push(path), // < 100MB 362 | s if s < 1024 * 1024 * 1000 => medium.push(path), // < 1GB 363 | _ => large.push(path), // >= 1GB 364 | } 365 | } 366 | 367 | // Process largest files first when memory is fresh 368 | optimized.extend(large); 369 | optimized.extend(medium); 370 | optimized.extend(small); 371 | optimized 372 | } -------------------------------------------------------------------------------- /src/display.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Stdout, 
Write}; // Importing necessary modules from the standard library 2 | use std::time::Instant; // Importing Instant for tracking elapsed time 3 | 4 | // Struct to manage status display on the terminal 5 | pub struct StatusDisplay { 6 | stdout: Stdout, // Standard output handle 7 | last_line_length: usize, // Length of the last printed line 8 | terminal_width: usize, // Width of the terminal 9 | start_time: Instant, // Start time to track elapsed time 10 | } 11 | 12 | impl StatusDisplay { 13 | // Function to create a new StatusDisplay instance 14 | pub fn new() -> io::Result { 15 | let stdout = io::stdout(); // Get the standard output handle 16 | let terminal_width = terminal_size::terminal_size() // Get the terminal size 17 | .map(|(w, _)| w.0 as usize) // Extract the width and convert to usize 18 | .unwrap_or(80); // Default to 80 if terminal size is not available 19 | let start_time = Instant::now(); // Record the current time as start time 20 | 21 | Ok(Self { 22 | stdout, // Initialize stdout 23 | last_line_length: 0, // Initialize last line length to 0 24 | terminal_width, // Initialize terminal width 25 | start_time, // Initialize start time 26 | }) 27 | } 28 | 29 | // Function to update the status message on the terminal 30 | pub fn update_status(&mut self, message: &str) -> io::Result<()> { 31 | // Clear the previous line 32 | write!(self.stdout, "\r")?; // Move cursor to the beginning of the line 33 | for _ in 0..self.last_line_length { 34 | write!(self.stdout, " ")?; // Overwrite the previous line with spaces 35 | } 36 | write!(self.stdout, "\r")?; // Move cursor to the beginning of the line again 37 | 38 | // Write the new message 39 | write!(self.stdout, "{}", message)?; // Print the new message 40 | self.stdout.flush()?; // Flush the output to ensure it is displayed 41 | 42 | // Update the last line length 43 | self.last_line_length = message.len(); // Store the length of the new message 44 | 45 | Ok(()) 46 | } 47 | 48 | // Function to update the progress 
bar on the terminal 49 | pub fn update_progress(&mut self, current: usize, total: usize, message: &str) -> io::Result<()> { 50 | let percentage = (current as f64 / total as f64 * 100.0) as usize; // Calculate the progress percentage 51 | let bar_width = 30; // Width of the progress bar 52 | let filled = (bar_width as f64 * (current as f64 / total as f64)) as usize; // Calculate the filled portion of the bar 53 | 54 | // Create the progress bar string 55 | let bar: String = format!( 56 | "[{}{}] {}/{} ({}%) {}", 57 | "=".repeat(filled), // Filled portion of the bar 58 | " ".repeat(bar_width - filled), // Empty portion of the bar 59 | current, // Current progress 60 | total, // Total progress 61 | percentage, // Progress percentage 62 | message // Additional message 63 | ); 64 | 65 | self.update_status(&self.truncate_message(&bar)) // Update the status with the progress bar 66 | } 67 | 68 | // Function to truncate the message if it exceeds the terminal width 69 | fn truncate_message(&self, message: &str) -> String { 70 | if message.len() > self.terminal_width { 71 | format!("{}...", &message[..self.terminal_width - 3]) // Truncate and add ellipsis 72 | } else { 73 | message.to_string() // Return the original message if it fits 74 | } 75 | } 76 | 77 | // Function to finish the status display 78 | pub fn finish(&mut self) -> io::Result<()> { 79 | writeln!(self.stdout)?; // Print a newline to finish the status display 80 | self.stdout.flush() // Flush the output to ensure it is displayed 81 | } 82 | 83 | // Function to log the elapsed time since the start 84 | pub fn log_elapsed_time(&self) { 85 | let elapsed = self.start_time.elapsed(); // Calculate the elapsed time 86 | println!("Elapsed time: {:.2?}", elapsed); // Print the elapsed time 87 | } 88 | } -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | use 
std::path::PathBuf; 3 | use dialoguer; 4 | use tokio::task::JoinError; 5 | use tokio::sync::mpsc::error::SendError; 6 | use std::collections::HashSet; 7 | use ctrlc; 8 | 9 | /// Type alias for Result with MergerError as the error type 10 | pub type MergerResult = Result; 11 | 12 | /// Custom error types for the file merger application 13 | #[derive(Error, Debug)] 14 | pub enum MergerError { 15 | /// Standard IO errors 16 | #[error("IO error: {0}")] 17 | Io(#[from] std::io::Error), 18 | 19 | /// Generic error handling via anyhow 20 | #[error("Internal error: {0}")] 21 | Anyhow(#[from] anyhow::Error), 22 | 23 | /// Configuration related errors 24 | #[error("Config error: {0}")] 25 | Config(#[from] ConfigError), 26 | 27 | /// System resource errors 28 | #[error("System error: {0}")] 29 | SysInfo(#[from] sys_info::Error), 30 | 31 | /// File processing errors 32 | #[error("Processing error: {0}")] 33 | Processing(String), 34 | 35 | /// Thread communication errors 36 | #[error("Channel error: {0}")] 37 | Channel(String), 38 | 39 | /// Input file validation errors 40 | #[error("Input validation error: {0}")] 41 | InputValidation(String), 42 | 43 | /// Progress tracking errors 44 | #[error("Progress tracking error: {0}")] 45 | Progress(String), 46 | 47 | /// Resume operation errors 48 | #[error("Resume error: {source}")] 49 | Resume { 50 | #[from] 51 | source: ResumeError, 52 | }, 53 | 54 | /// Deduplication errors 55 | #[error("Deduplication error: {0}")] 56 | Deduplication(String), 57 | 58 | /// UTF-8 encoding errors 59 | #[error("Invalid UTF-8 in file {path}: {message}")] 60 | InvalidUtf8 { 61 | path: PathBuf, 62 | message: String, 63 | }, 64 | } 65 | 66 | /// Specific errors related to resume functionality 67 | #[derive(Error, Debug)] 68 | pub enum ResumeError { 69 | #[error("Progress file not found: {0}")] 70 | ProgressFileNotFound(PathBuf), 71 | 72 | #[error("Invalid progress file format")] 73 | InvalidProgressFormat, 74 | 75 | #[error("Progress file is corrupted")] 
76 | CorruptedProgress, 77 | 78 | #[error("Cannot resume: input files have changed")] 79 | InputFilesChanged, 80 | } 81 | 82 | /// Specific errors related to configuration 83 | #[derive(Error, Debug)] 84 | pub enum ConfigError { 85 | #[error("Invalid thread count: {0}. Must be between 1 and 100")] 86 | InvalidThreadCount(usize), 87 | 88 | #[error("Input files path must be specified")] 89 | MissingInputFiles, 90 | 91 | #[error("Output files path must be specified")] 92 | MissingOutputFiles, 93 | 94 | #[error("Input file not found: {0}")] 95 | InputFileNotFound(PathBuf), 96 | 97 | #[error("Output directory is not writable: {0}")] 98 | OutputDirectoryNotWritable(PathBuf), 99 | 100 | #[error("Input and output paths cannot be the same")] 101 | InputOutputPathsEqual, 102 | 103 | #[error("Invalid configuration format: {0}")] 104 | InvalidFormat(String), 105 | 106 | #[error("Serialization error: {0}")] 107 | SerializationError(String), 108 | } 109 | 110 | impl From for MergerError { 111 | fn from(err: dialoguer::Error) -> Self { 112 | MergerError::Processing(err.to_string()) 113 | } 114 | } 115 | 116 | impl From for MergerError { 117 | fn from(err: JoinError) -> Self { 118 | MergerError::Processing(format!("Task join error: {}", err)) 119 | } 120 | } 121 | 122 | impl From> for MergerError { 123 | fn from(err: SendError) -> Self { 124 | MergerError::Channel(err.to_string()) 125 | } 126 | } 127 | 128 | impl From for MergerError { 129 | fn from(err: serde_json::Error) -> Self { 130 | MergerError::Config(ConfigError::InvalidFormat(err.to_string())) 131 | } 132 | } 133 | 134 | impl From> for MergerError { 135 | fn from(err: std::sync::mpsc::SendError) -> Self { 136 | MergerError::Channel(err.to_string()) 137 | } 138 | } 139 | 140 | impl From for MergerError { 141 | fn from(err: ctrlc::Error) -> Self { 142 | MergerError::Processing(format!("Ctrl+C handler error: {}", err)) 143 | } 144 | } -------------------------------------------------------------------------------- 
/src/file_utils.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; // Import the Result type from the anyhow crate for error handling 2 | use std::{ 3 | path::Path, // Import the Path struct for handling file paths 4 | fs::{File, OpenOptions}, // Import File and OpenOptions for file operations 5 | io::{BufRead, BufReader, BufWriter, Write}, // Import I/O traits and structs for reading and writing files 6 | }; 7 | use log::warn; // Import the warn macro from the log crate for logging warnings 8 | 9 | // Define a struct for file utility functions 10 | pub struct FileUtils; 11 | 12 | impl FileUtils { 13 | // Ensure a directory exists, creating it if necessary 14 | pub async fn ensure_dir(path: &Path) -> Result<()> { 15 | // Check if the directory does not exist 16 | if !path.exists() { 17 | // Create the directory and all its parent directories 18 | tokio::fs::create_dir_all(path).await?; 19 | } 20 | Ok(()) 21 | } 22 | 23 | // Atomically write content to a file 24 | pub async fn atomic_write(path: &Path, content: &[u8]) -> Result<()> { 25 | // Create a temporary file path with a ".tmp" extension 26 | let temp_path = path.with_extension("tmp"); 27 | // Write the content to the temporary file 28 | tokio::fs::write(&temp_path, content).await?; 29 | // Rename the temporary file to the target file path 30 | tokio::fs::rename(temp_path, path).await?; 31 | Ok(()) 32 | } 33 | 34 | // Read lines from a file and return them as a vector of strings 35 | pub fn read_lines(path: &Path) -> Result> { 36 | // Open the file for reading 37 | let file = File::open(path)?; 38 | // Create a buffered reader for the file 39 | let reader = BufReader::new(file); 40 | // Initialize an empty vector to store the lines 41 | let mut lines = Vec::new(); 42 | 43 | // Iterate over the lines in the file 44 | for line in reader.lines() { 45 | match line { 46 | // If the line is read successfully, add it to the vector 47 | Ok(line) => 
lines.push(line), 48 | // If there is an error reading the line, log a warning 49 | Err(e) => warn!("Error reading line: {}", e), 50 | } 51 | } 52 | 53 | Ok(lines) 54 | } 55 | 56 | // Append unique lines to a file, avoiding duplicates 57 | pub async fn append_unique_lines(path: &Path, lines: &[String]) -> Result<()> { 58 | // Read existing lines from the file into a HashSet to avoid duplicates 59 | let mut existing = if path.exists() { 60 | Self::read_lines(path)? 61 | .into_iter() 62 | .collect::>() 63 | } else { 64 | std::collections::HashSet::new() 65 | }; 66 | 67 | // Open the file for appending, creating it if it doesn't exist 68 | let mut writer = BufWriter::new( 69 | OpenOptions::new() 70 | .create(true) 71 | .append(true) 72 | .open(path)? 73 | ); 74 | 75 | // Iterate over the new lines to be added 76 | for line in lines { 77 | // If the line is not already in the HashSet, add it and write it to the file 78 | if existing.insert(line.clone()) { 79 | if let Err(e) = writeln!(writer, "{}", line) { 80 | warn!("Failed to write line: {}", e); 81 | } 82 | } 83 | } 84 | // Flush the writer to ensure all data is written to the file 85 | if let Err(e) = writer.flush() { 86 | warn!("Failed to flush writer: {}", e); 87 | } 88 | 89 | Ok(()) 90 | } 91 | 92 | // Clean up temporary files in a directory with a specific prefix 93 | pub async fn cleanup_temp_files(dir: &Path, prefix: &str) -> Result<()> { 94 | // Read the directory entries 95 | let mut entries = tokio::fs::read_dir(dir).await?; 96 | // Iterate over the directory entries 97 | while let Some(entry) = entries.next_entry().await? 
{ 98 | let path = entry.path(); 99 | // Check if the file name starts with the specified prefix 100 | if path.file_name() 101 | .and_then(|n| n.to_str()) 102 | .map(|n| n.starts_with(prefix)) 103 | .unwrap_or(false) 104 | { 105 | // Remove the file and log a warning if there is an error 106 | if let Err(e) = tokio::fs::remove_file(&path).await { 107 | warn!("Failed to remove temp file {:?}: {}", path, e); 108 | } 109 | } 110 | } 111 | Ok(()) 112 | } 113 | } -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Declare the display module, which handles displaying information to the user 2 | pub mod display; 3 | 4 | // Declare the core module, which contains the core processing logic of the application 5 | pub mod core; 6 | 7 | // Declare the app_state module, which manages the state of the application 8 | pub mod app_state; 9 | 10 | // Declare the progress module, which tracks and displays progress information 11 | pub mod progress; 12 | 13 | // Declare the config module, which handles configuration management 14 | pub mod config; 15 | 16 | // Declare the file_utils module, which provides utility functions for file operations 17 | pub mod file_utils; 18 | 19 | // Declare the logging module, which handles logging of messages and errors 20 | pub mod logging; 21 | 22 | // Declare the processing module, which contains the main processing logic 23 | pub mod processing; 24 | 25 | // Declare the signal_handler module, which handles OS signals and manages application state 26 | pub mod signal_handler; 27 | 28 | // Declare the errors module, which contains custom error types 29 | pub mod errors; 30 | -------------------------------------------------------------------------------- /src/logging.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; // Importing Result type from anyhow for error 
handling 2 | use chrono::Local; // Importing Local from chrono to get the current date and time 3 | use crossterm::style::Stylize; // Importing Stylize from crossterm to style log levels 4 | use log::{Level, LevelFilter, Metadata, Record}; // Importing logging types from the log crate 5 | use std::{ 6 | fs::{File, OpenOptions}, // Importing File and OpenOptions for file operations 7 | io::Write, // Importing Write trait for writing to files 8 | path::PathBuf, // Importing PathBuf to handle file paths 9 | sync::Mutex, // Importing Mutex for thread-safe access to files 10 | }; 11 | 12 | // Define a struct for the Logger 13 | pub struct Logger { 14 | log_file: Option>, // Optional log file wrapped in a Mutex for thread-safe access 15 | error_file: Option>, // Optional error file wrapped in a Mutex for thread-safe access 16 | level: LevelFilter, // Log level filter to control which log messages are recorded 17 | } 18 | 19 | impl Logger { 20 | // Initialize the logger with optional log and error file paths and a log level 21 | pub fn init( 22 | log_path: Option, // Optional path for the log file 23 | error_path: Option, // Optional path for the error file 24 | level: LevelFilter, // Log level filter 25 | ) -> Result<()> { 26 | // Create the log file if a path is provided 27 | let log_file = log_path.map(|path| { 28 | Mutex::new( 29 | OpenOptions::new() 30 | .create(true) // Create the file if it doesn't exist 31 | .append(true) // Append to the file if it exists 32 | .open(path) // Open the file at the given path 33 | .unwrap(), // Unwrap the result, panicking if there's an error 34 | ) 35 | }); 36 | 37 | // Create the error file if a path is provided 38 | let error_file = error_path.map(|path| { 39 | Mutex::new( 40 | OpenOptions::new() 41 | .create(true) // Create the file if it doesn't exist 42 | .append(true) // Append to the file if it exists 43 | .open(path) // Open the file at the given path 44 | .unwrap(), // Unwrap the result, panicking if there's an error 45 | 
) 46 | }); 47 | 48 | // Create a new Logger instance 49 | let logger = Logger { 50 | log_file, 51 | error_file, 52 | level, 53 | }; 54 | 55 | // Set the global logger to the newly created logger 56 | log::set_boxed_logger(Box::new(logger))?; 57 | // Set the maximum log level 58 | log::set_max_level(level); 59 | 60 | Ok(()) 61 | } 62 | 63 | // Format a log record into a string 64 | fn format_log(&self, record: &Record) -> String { 65 | // Style the log level based on its severity 66 | let level_str = match record.level() { 67 | Level::Error => record.level().to_string().red(), // Red for errors 68 | Level::Warn => record.level().to_string().yellow(), // Yellow for warnings 69 | Level::Info => record.level().to_string().green(), // Green for info 70 | Level::Debug => record.level().to_string().blue(), // Blue for debug 71 | Level::Trace => record.level().to_string().magenta(), // Magenta for trace 72 | }; 73 | 74 | // Format the log message with the current time, log level, target, and message 75 | format!( 76 | "[{}] {} - {}: {}\n", 77 | Local::now().format("%Y-%m-%d %H:%M:%S"), // Current date and time 78 | level_str, // Styled log level 79 | record.target(), // Target of the log message 80 | record.args() // Log message 81 | ) 82 | } 83 | } 84 | 85 | // Implement the Log trait for the Logger struct 86 | impl log::Log for Logger { 87 | // Check if a log message should be logged based on its metadata 88 | fn enabled(&self, metadata: &Metadata) -> bool { 89 | metadata.level() <= self.level // Only log messages at or below the set log level 90 | } 91 | 92 | // Log a message 93 | fn log(&self, record: &Record) { 94 | if self.enabled(record.metadata()) { // Check if the log message should be logged 95 | let formatted = self.format_log(record); // Format the log message 96 | 97 | // Print the log message to the console 98 | print!("{}", formatted); 99 | 100 | // Write the log message to the log file if it exists 101 | if let Some(log_file) = &self.log_file { 102 | if let 
Ok(mut file) = log_file.lock() { // Lock the file for thread-safe access 103 | let _ = file.write_all(formatted.as_bytes()); // Write the log message to the file 104 | } 105 | } 106 | 107 | // Write error messages to the error file if it exists 108 | if record.level() == Level::Error { 109 | if let Some(error_file) = &self.error_file { 110 | if let Ok(mut file) = error_file.lock() { // Lock the file for thread-safe access 111 | let _ = file.write_all(formatted.as_bytes()); // Write the error message to the file 112 | } 113 | } 114 | } 115 | } 116 | } 117 | 118 | // Flush the log files 119 | fn flush(&self) { 120 | // Flush the log file if it exists 121 | if let Some(log_file) = &self.log_file { 122 | if let Ok(mut file) = log_file.lock() { // Lock the file for thread-safe access 123 | let _ = file.flush(); // Flush the file 124 | } 125 | } 126 | // Flush the error file if it exists 127 | if let Some(error_file) = &self.error_file { 128 | if let Ok(mut file) = error_file.lock() { // Lock the file for thread-safe access 129 | let _ = file.flush(); // Flush the file 130 | } 131 | } 132 | } 133 | } -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; // Import the Result type from the anyhow crate for error handling 2 | use clap::Parser; // Import the Parser trait from the clap crate for command-line argument parsing 3 | use log::{info, error}; // Import the info macro and error macro from the log crate for logging 4 | use std::sync::Arc; // Import the Arc type from the std::sync crate for shared ownership 5 | use ctrlc; // Import the ctrlc crate for handling Ctrl+C signals 6 | 7 | // Declare the modules used in the application 8 | mod cli; // Module for command-line interface definitions 9 | mod commands; // Module for handling different commands 10 | mod config; // Module for configuration management 11 | mod core; // 
Module for core processing logic 12 | mod app_state; // Module for application state management 13 | mod progress; // Module for progress tracking 14 | mod signal_handler; // Module for signal handling 15 | mod errors; // Add this line 16 | 17 | // Import specific items from the cli and commands modules 18 | use cli::{Cli, Commands}; // Import the Cli struct and Commands enum from the cli module 19 | use commands::CommandHandler; // Import the CommandHandler struct from the commands module 20 | use crate::core::ProcessingCore; 21 | use crate::app_state::AppState; 22 | use crate::errors::{MergerError, MergerResult}; 23 | 24 | // Main asynchronous function 25 | #[tokio::main] // Macro to set up the Tokio runtime 26 | async fn main() -> MergerResult<()> { 27 | // Parse command-line arguments into the Cli struct 28 | let cli = Cli::parse(); 29 | 30 | // Initialize the logger with the log level specified in the command-line arguments 31 | env_logger::builder().filter_level(cli.log_level()).init(); 32 | 33 | // Match on the command provided in the command-line arguments 34 | match cli.command { 35 | // Handle the "merge" command 36 | Commands::Merge(ref args) => { 37 | CommandHandler::handle_merge(&cli, args.clone()).await?; 38 | } 39 | // Handle the "generate-config" command 40 | Commands::GenerateConfig(args) => { 41 | CommandHandler::handle_generate_config(args).await?; 42 | } 43 | // Handle the "guided-setup" command 44 | Commands::GuidedSetup(args) => { 45 | CommandHandler::handle_guided_setup(args).await?; 46 | } 47 | // Handle the "resume" command 48 | Commands::Resume(args) => { 49 | let state: AppState = AppState::from_resume(args.progress_file).await?; 50 | let state = Arc::new(state); 51 | 52 | // Set up Ctrl+C handler 53 | let state_clone = Arc::clone(&state); 54 | ctrlc::set_handler(move || { 55 | let state = state_clone.clone(); 56 | tokio::spawn(async move { 57 | info!("Received Ctrl+C, saving progress..."); 58 | if let Err(e) = state.save_progress().await 
{ 59 | error!("Failed to save progress: {}", e); 60 | } 61 | state.request_shutdown().await; 62 | }); 63 | })?; 64 | 65 | // Resume merger 66 | let mut core = ProcessingCore::new(state.clone(), true, true).await?; 67 | core.process().await?; 68 | } 69 | } 70 | 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /src/processing.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use tokio::io::{AsyncBufReadExt, BufReader}; 3 | use tokio::fs::File; 4 | use log::warn; 5 | use std::path::PathBuf; 6 | use anyhow::Result; 7 | use crate::progress::Progress; 8 | 9 | #[derive(Debug, Clone, Serialize, Deserialize)] 10 | pub enum ProcessingStage { 11 | Initializing, 12 | ValidatingFiles, 13 | ProcessingFiles, 14 | Merging, 15 | Completed, 16 | Failed, 17 | } 18 | 19 | pub struct FileProcessor; 20 | 21 | impl FileProcessor { 22 | pub async fn process_file(progress: &mut Progress, file: PathBuf) -> Result<()> { 23 | let file_path = file.clone(); 24 | 25 | let file = match File::open(&file).await { 26 | Ok(f) => f, 27 | Err(e) => { 28 | warn!("Failed to open file {:?}: {}", file, e); 29 | return Ok(()); 30 | } 31 | }; 32 | 33 | let reader = BufReader::new(file); 34 | let mut lines = reader.lines(); 35 | 36 | while let Some(line) = lines.next_line().await? 
/// Running counters for a processing session.
///
/// Tracks wall-clock time since construction plus file, line, and error
/// counts. Snapshot the current values with [`ProcessingMetrics::get_summary`].
pub struct ProcessingMetrics {
    start_time: Instant,    // When this metrics session began
    files_processed: usize, // Files fully processed so far
    lines_processed: usize, // Total lines seen across all files
    errors_count: usize,    // Errors recorded so far
}

impl ProcessingMetrics {
    /// Start a new metrics session; the elapsed-time clock starts now.
    pub fn new() -> Self {
        Self {
            start_time: Instant::now(),
            files_processed: 0,
            lines_processed: 0,
            errors_count: 0,
        }
    }

    /// Record one completed file.
    pub fn increment_files(&mut self) {
        self.files_processed += 1;
    }

    /// Record `count` additional processed lines.
    pub fn add_lines(&mut self, count: usize) {
        self.lines_processed += count;
    }

    /// Record one error.
    ///
    /// New method: the struct previously carried `errors_count` but
    /// offered no way to ever increment it, so the field was dead.
    pub fn increment_errors(&mut self) {
        self.errors_count += 1;
    }

    /// Snapshot the current counters.
    ///
    /// `memory_usage` is always reported as 0 — actual memory sampling
    /// is not implemented yet.
    pub fn get_summary(&self) -> ProcessingSummary {
        ProcessingSummary {
            elapsed_time: self.start_time.elapsed(),
            files_processed: self.files_processed,
            lines_processed: self.lines_processed,
            errors_count: self.errors_count,
            memory_usage: 0,
        }
    }
}

// `new()` is the canonical constructor; provide `Default` for idiomatic use.
impl Default for ProcessingMetrics {
    fn default() -> Self {
        Self::new()
    }
}

/// Point-in-time snapshot produced by [`ProcessingMetrics::get_summary`].
pub struct ProcessingSummary {
    pub elapsed_time: Duration,  // Wall-clock time since session start
    pub files_processed: usize,  // Files completed
    pub lines_processed: usize,  // Lines seen
    pub errors_count: usize,     // Errors recorded
    pub memory_usage: usize,     // Always 0 until memory sampling is wired up
}
of files to process 58 | pub output_file: PathBuf, // Destination file for merged content 59 | pub threads: usize, // Number of parallel processing threads 60 | pub processed_files: Vec, // List of successfully processed files 61 | pub current_position: usize, // Current processing position for resume capability 62 | pub save_path: Option, // Path where progress state is saved 63 | } 64 | 65 | // Implement Default trait for Progress 66 | impl Default for Progress { 67 | fn default() -> Self { 68 | Self { 69 | input_file: PathBuf::new(), 70 | output_file: PathBuf::new(), 71 | threads: 10, // Default to 10 threads 72 | processed_files: Vec::new(), 73 | current_position: 0, 74 | save_path: None, 75 | } 76 | } 77 | } 78 | 79 | impl Progress { 80 | // Save current progress state to JSON file 81 | pub async fn save(&self) -> Result<()> { 82 | if let Some(path) = &self.save_path { 83 | // Convert progress state to pretty-printed JSON 84 | let content = serde_json::to_string_pretty(&self)?; 85 | // Write to file asynchronously 86 | fs::write(path, content).await?; 87 | } 88 | Ok(()) 89 | } 90 | 91 | // Load progress state from a JSON file 92 | pub async fn load(path: &PathBuf) -> Result { 93 | // Read file content asynchronously 94 | let content = fs::read_to_string(path).await?; 95 | // Parse JSON into Progress struct 96 | let mut progress: Progress = serde_json::from_str(&content)?; 97 | // Store save path for future updates 98 | progress.save_path = Some(path.clone()); 99 | Ok(progress) 100 | } 101 | 102 | // Add a processed file to the progress tracking 103 | #[allow(dead_code)] // Suppress unused function warning 104 | pub async fn add_processed_file(&mut self, file: PathBuf) -> Result<()> { 105 | // Add file to processed list 106 | self.processed_files.push(file); 107 | // Increment position counter 108 | self.current_position += 1; 109 | // Save updated progress state 110 | self.save().await 111 | } 112 | } 113 | 114 | pub struct ProgressTracker { 115 | 
multi_progress: MultiProgress, 116 | overall_progress: ProgressBar, 117 | dedup_progress: ProgressBar, 118 | metrics: ProcessingMetrics, 119 | refresh_rate: Duration, 120 | } 121 | 122 | impl ProgressTracker { 123 | pub fn new(total_files: usize, estimated_lines: usize) -> Self { 124 | let multi = MultiProgress::new(); 125 | 126 | // Overall progress bar style 127 | let overall_style = ProgressStyle::default_bar() 128 | .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}") 129 | .unwrap() 130 | .progress_chars("#>-"); 131 | 132 | // Deduplication progress bar style 133 | let dedup_style = ProgressStyle::default_bar() 134 | .template("{spinner:.yellow} [{elapsed_precise}] [{bar:40.yellow/blue}] {pos}/{len} lines | {msg}") 135 | .unwrap() 136 | .progress_chars("#>-"); 137 | 138 | let overall_pb = multi.add(ProgressBar::new(total_files as u64)); 139 | overall_pb.set_style(overall_style); 140 | 141 | let dedup_pb = multi.add(ProgressBar::new(estimated_lines as u64)); 142 | dedup_pb.set_style(dedup_style); 143 | 144 | Self { 145 | multi_progress: multi, 146 | overall_progress: overall_pb, 147 | dedup_progress: dedup_pb, 148 | metrics: ProcessingMetrics::new(), 149 | refresh_rate: Duration::from_millis(100), 150 | } 151 | } 152 | 153 | pub fn update_overall_progress(&mut self, files_processed: usize) { 154 | self.metrics.increment_files(); 155 | let summary = self.metrics.get_summary(); 156 | 157 | self.overall_progress.set_position(files_processed as u64); 158 | self.overall_progress.set_message(format!( 159 | "Speed: {:.2} files/s | Memory: {:.2} MB | Errors: {}", 160 | files_processed as f64 / summary.elapsed_time.as_secs_f64(), 161 | summary.memory_usage as f64 / 1_048_576.0, // Convert bytes to MB 162 | summary.errors_count 163 | )); 164 | } 165 | 166 | pub fn update_dedup_progress(&mut self, lines_processed: usize, total_lines: usize) { 167 | self.metrics.add_lines(lines_processed); 168 | let summary = 
self.metrics.get_summary(); 169 | 170 | self.dedup_progress.set_length(total_lines as u64); 171 | self.dedup_progress.set_position(lines_processed as u64); 172 | self.dedup_progress.set_message(format!( 173 | "Speed: {:.2} lines/s | Unique lines: {}", 174 | summary.lines_processed as f64 / summary.elapsed_time.as_secs_f64(), 175 | lines_processed 176 | )); 177 | } 178 | 179 | pub fn finish(&self) { 180 | let summary = self.metrics.get_summary(); 181 | self.overall_progress.finish_with_message(format!( 182 | "Completed in {}s | Files: {} | Lines: {} | Errors: {}", 183 | summary.elapsed_time.as_secs(), 184 | summary.files_processed, 185 | summary.lines_processed, 186 | summary.errors_count 187 | )); 188 | self.dedup_progress.finish(); 189 | } 190 | 191 | pub fn get_metrics(&self) -> &ProcessingMetrics { 192 | &self.metrics 193 | } 194 | } -------------------------------------------------------------------------------- /src/signal_handler.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; // Importing Arc for thread-safe reference counting 2 | use tokio::sync::broadcast; // Importing broadcast channel from tokio for sending shutdown signals 3 | use anyhow::Result; // Importing Result type from anyhow for error handling 4 | use log::{info, error}; // Importing logging macros for info and error messages 5 | use crate::app_state::AppState; // Importing the AppState struct from the app_state module 6 | 7 | // Struct to handle OS signals and manage application state 8 | pub struct SignalHandler { 9 | app_state: Arc, // Shared and mutable application state 10 | shutdown_tx: broadcast::Sender<()>, // Broadcast channel sender for shutdown signals 11 | } 12 | 13 | impl SignalHandler { 14 | // Function to create a new instance of SignalHandler 15 | pub fn new(app_state: Arc) -> Result { 16 | // Create a new broadcast channel with a buffer size of 1 17 | let (shutdown_tx, _) = broadcast::channel(1); 18 | 19 | // Return a 
new SignalHandler instance with the provided app_state and broadcast channel 20 | Ok(Self { 21 | app_state, 22 | shutdown_tx, 23 | }) 24 | } 25 | 26 | // Function to subscribe to the shutdown broadcast channel 27 | #[allow(dead_code)] 28 | pub fn subscribe(&self) -> broadcast::Receiver<()> { 29 | // Return a new receiver for the broadcast channel 30 | self.shutdown_tx.subscribe() 31 | } 32 | 33 | // Function to set up signal handlers 34 | pub fn setup_handlers(&self) -> Result<()> { 35 | // Clone the broadcast channel sender for use in the signal handler 36 | let shutdown_tx = self.shutdown_tx.clone(); 37 | // Clone the app_state for use in the signal handler 38 | let app_state = self.app_state.clone(); 39 | 40 | // Set up a handler for the Ctrl+C signal 41 | ctrlc::set_handler(move || { 42 | // Log that an interrupt signal was received 43 | info!("Received interrupt signal, initiating graceful shutdown"); 44 | 45 | // Clone app_state and shutdown_tx again before moving into async block 46 | let app_state = app_state.clone(); 47 | let shutdown_tx = shutdown_tx.clone(); 48 | 49 | tokio::spawn(async move { 50 | // Attempt to save the progress 51 | if let Err(e) = app_state.save_progress().await { 52 | error!("Failed to save progress: {}", e); 53 | } 54 | 55 | // Attempt to send the shutdown signal 56 | if let Err(e) = shutdown_tx.send(()) { 57 | error!("Failed to send shutdown signal: {}", e); 58 | } 59 | }); 60 | })?; 61 | 62 | Ok(()) 63 | } 64 | } --------------------------------------------------------------------------------