├── Cargo.toml
├── LICENSE
├── README.md
├── rustmerger_config.json
└── src
    ├── app_state.rs
    ├── cli.rs
    ├── commands.rs
    ├── config.rs
    ├── config_validator.rs
    ├── core.rs
    ├── display.rs
    ├── errors.rs
    ├── file_utils.rs
    ├── lib.rs
    ├── logging.rs
    ├── main.rs
    ├── processing.rs
    ├── progress.rs
    └── signal_handler.rs
/Cargo.toml:
--------------------------------------------------------------------------------
1 | # This section defines the package metadata for the Rust project
2 | [package]
3 | name = "rustmerger" # The name of the package
4 | version = "0.1.1" # The current version of the package
5 | edition = "2021" # The Rust edition to use (2021 edition)
6 | authors = ["Robert Pimentel @pr0b3r7 | github.com/pr0b3r7 | linkedin.com/in/pimentelrobert1 | www.hackerhermanos.com"] # NOTE(review): Cargo requires the plural `authors` key as an array; the singular `author` is not a recognized manifest key
7 |
8 | # This section lists the dependencies required by the project
9 | [dependencies]
10 | zip = "2.2.0" # Library for working with ZIP archives
11 | uuid = { version = "1.11.0", features = ["v4"] } # Library for generating and handling UUIDs, using version 4
12 | url = "2.5.2" # Library for URL parsing and manipulation
13 | unrar = "0.5.6" # Library for working with RAR archives
14 | tokio-util = "0.7.12" # Utilities for working with the Tokio async runtime
15 | tokio = { version = "1.36", features = ["full"] } # Tokio async runtime with full feature set
16 | thiserror = "1.0.65" # Library for deriving custom error types
17 | terminal_size = "0.4.0" # Library for getting the terminal size
18 | tempfile = "3.13" # Library for creating temporary files
19 | tar = "0.4.42" # Library for working with TAR archives
20 | signal-hook = "0.3.17" # Library for handling OS signals
21 | sha2 = "0.10.8" # Library for SHA-2 hashing
22 | sevenz-rust = "0.6.1" # Library for working with 7z archives
23 | serde_json = "1.0.132" # Library for JSON serialization and deserialization using Serde
24 | serde = { version = "1.0", features = ["derive"] } # Serde library for serialization and deserialization, with derive feature
25 | reqwest = { version = "0.12.9", features = ["json", "stream"] } # HTTP client library with JSON and streaming support
26 | log = "0.4.22" # Logging library
27 | lazy_static = "1.5.0" # Library for defining statics that require code to be executed at runtime
28 | indicatif = "0.17" # Library for creating progress bars and spinners
29 | hex = "0.4.3" # Library for encoding and decoding hexadecimal
30 | futures = "0.3" # Library for working with asynchronous computations
31 | env_logger = "0.11.5" # Library for logging with environment variable configuration
32 | encoding_rs = "0.8.35" # Library for encoding and decoding character sets
33 | dialoguer = "0.11.0" # Library for creating interactive command-line prompts
34 | ctrlc = { version = "3.4.5", features = ["termination"] } # Library for handling Ctrl+C signals with termination feature
35 | crossterm = "0.28.1" # Library for cross-platform terminal manipulation
36 | clap = { version = "4.4", features = ["derive"] } # Library for command-line argument parsing with derive feature
37 | chrono = { version = "0.4.38", features = ["serde"] } # Library for date and time handling with Serde support
38 | bytes = "1.8.0" # Library for working with byte buffers
39 | async-compression = { version = "0.4.17", features = ["tokio", "bzip2", "gzip", "xz"] } # Library for async compression with support for multiple formats
40 | anyhow = "1.0.91" # Library for error handling with context support
41 | sys-info = "0.9.1" # Library for system information
42 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year>  <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # File Merger Tool
2 |
3 | ## Overview
4 |
5 | A robust command-line tool built in Rust that makes merging and deduplicating text files a breeze. Whether you're dealing with small files or massive datasets, this tool handles the heavy lifting with parallel processing and smart error handling.
6 |
7 | ## Key Features
8 |
9 | ### Core Functionality
10 |
11 | - **Smart File Merging**: Feed it a list of file paths via `-i/--input-files`, and it'll combine them into a single output file (`-o/--output-files`).
12 | - **No More Duplicates**: Uses a `HashSet` under the hood to ensure each line appears exactly once in your final output.
13 | - **Memory-Friendly**: Processes files in 10MB chunks by default, so your RAM stays happy.
14 | - **Optimized I/O**: Uses generous buffer sizes (32MB read, 16MB write) to keep things moving quickly.
15 |
16 | ### Performance Features
17 |
18 | - **Parallel Processing**: Spreads the work across 10 threads by default (but you can adjust this).
19 | - **Resource-Conscious**: Chunks files to keep memory usage in check, even with large files.
20 | - **Know What's Happening**: Shows you exactly where you are with progress bars for:
21 | - Overall progress
22 | - Current file
23 | - Deduplication status
24 | - **Your Tool, Your Rules**: Tweak buffer sizes and other settings to match your needs.
25 |
26 | ### Error Handling & Reliability
27 |
28 | - **Keeps Going**: Logs errors without stopping, because one bad file shouldn't ruin everything.
29 | - **UTF-8 Problems? No Problem**: Skips problematic lines and keeps moving.
30 | - **Checks First**: Makes sure all your input files exist and are readable before starting.
31 | - **Safe Writes**: Uses atomic writing to protect your output file from corruption.
32 |
33 | ### Resume Capability
34 |
35 | - **Never Lose Progress**: Creates checkpoint files as it works.
36 | - **Ctrl+C Friendly**: Saves its state when interrupted so you can pick up where you left off.
37 | - **Easy Resumption**: Just use `resume <progress_file>` to continue an interrupted job.
38 | - **Knows Its Place**: Keeps track of exactly where it stopped, down to the line.
39 |
40 | ## Author
41 |
42 | Robert Pimentel
43 |
44 | - GitHub: [@pr0b3r7](https://github.com/pr0b3r7)
45 | - LinkedIn: [pimentelrobert1](https://linkedin.com/in/pimentelrobert1)
46 | - Website: [hackerhermanos.com](https://www.hackerhermanos.com)
47 |
48 | ## Dependencies
49 |
50 | This project relies on several high-quality Rust crates to provide its functionality:
51 |
52 | ### Core Dependencies
53 |
54 | - **tokio** (1.36) - Asynchronous runtime powering parallel processing
55 | - **clap** (4.4) - Command-line argument parsing
56 | - **serde** (1.0) - Serialization framework for configuration
57 | - **anyhow** (1.0.91) - Error handling with context
58 |
59 | ### File Processing
60 |
61 | - **async-compression** (0.4.17) - Handles various compression formats (bzip2, gzip, xz)
62 | - **zip** (2.2.0) - ZIP archive support
63 | - **unrar** (0.5.6) - RAR archive support
64 | - **sevenz-rust** (0.6.1) - 7z archive support
65 | - **tar** (0.4.42) - TAR archive support
66 |
67 | ### User Interface
68 |
69 | - **indicatif** (0.17) - Progress bars and spinners
70 | - **dialoguer** (0.11.0) - Interactive command prompts
71 | - **crossterm** (0.28.1) - Terminal manipulation
72 | - **terminal_size** (0.4.0) - Terminal dimensions detection
73 |
74 | ### Utilities
75 |
76 | - **chrono** (0.4.38) - Date and time handling
77 | - **uuid** (1.11.0) - Unique identifier generation
78 | - **sha2** (0.10.8) - Cryptographic hashing
79 | - **encoding_rs** (0.8.35) - Character encoding support
80 | - **sys-info** (0.9.1) - System information gathering
81 |
82 | ### Networking
83 |
84 | - **reqwest** (0.12.9) - HTTP client with streaming support
85 | - **url** (2.5.2) - URL parsing and manipulation
86 |
87 | ### Logging and Error Handling
88 |
89 | - **env_logger** (0.11.5) - Environment-based logging
90 | - **log** (0.4.22) - Logging framework
91 | - **thiserror** (1.0.65) - Custom error types
92 |
93 | ### Signal Handling
94 |
95 | - **ctrlc** (3.4.5) - Ctrl+C signal handling
96 | - **signal-hook** (0.3.17) - OS signal handling
97 |
98 | ## Installation
99 |
100 | ### You'll Need
101 |
102 | - Rust toolchain (1.70+)
103 | - Cargo package manager
104 |
105 | ### Getting Started
106 |
107 | 1. Grab the code:
108 | ```sh
109 | git clone https://github.com/yourusername/file-merger-tool.git
110 | cd file-merger-tool
111 | ```
112 |
113 | 2. Build it:
114 | ```sh
115 | cargo build --release
116 | ```
117 |
118 | 3. Want it system-wide? (Optional):
119 | ```sh
120 | sudo cp target/release/file-merger-tool /usr/local/bin/
121 | ```
122 |
123 | ## Usage
124 |
125 | ### Quick Start
126 |
127 | ```sh
128 | file-merger-tool merge -w input_list.txt -o merged_output.txt
129 | ```
130 |
131 | ### Command Reference
132 |
133 | ```
134 | Usage: rustmerger [OPTIONS] <COMMAND>
135 |
136 | Commands:
137 | merge Merge wordlists and rules
138 | generate-config Generate configuration file
139 | guided-setup Run guided setup
140 | resume Resume interrupted operation
141 | help Print this message or the help of the given subcommand(s)
142 |
143 | Options:
144 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace)
145 |       --log-level <LOG_LEVEL>  [default: info]
146 | -h, --help Print help
147 | -V, --version Print version
148 | ```
149 |
150 | #### Merge Command
151 |
152 | ```
153 | Usage: rustmerger merge [OPTIONS]
154 |
155 | Options:
156 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace)
157 |   -w, --wordlists-file <FILE>   Text file containing one wordlist path per line
158 |   -r, --rules-file <FILE>       Text file containing one rule path per line
159 |       --output-wordlist <FILE>  Destination path for merged and deduplicated wordlist
160 |       --output-rules <FILE>     Destination path for merged and deduplicated rules
161 |   -c, --config <FILE>           JSON configuration file with default settings
162 |       --progress-file <FILE>    Save progress state for resume capability
163 | -d, --debug Enable detailed progress output
164 | -h, --help Print help
165 | ```
166 |
167 | #### Generate Config Command
168 |
169 | ```
170 | Usage: rustmerger generate-config [OPTIONS]
171 |
172 | Arguments:
173 |   <FILE>  Destination path for configuration file
174 |
175 | Options:
176 | -t, --template Generate default configuration template
177 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace)
178 | -h, --help Print help
179 | ```
180 |
181 | #### Guided Setup Command
182 |
183 | ```
184 | Usage: rustmerger guided-setup [OPTIONS]
185 |
186 | Arguments:
187 |   <FILE>  Destination path for interactive configuration
188 |
189 | Options:
190 | -v, --verbose... Set verbosity level (-v: debug, -vv: trace)
191 | -h, --help Print help
192 | ```
193 |
194 | #### Sample Configuration File
195 |
196 | ```json
197 | {
198 | "input_files": "/tmp/wordlists_to_merge_dev.txt",
199 | "output_files": "/tmp/merged_wordlist.txt",
200 | "threads": 90,
201 | "verbose": true,
202 | "debug": true
203 | }
204 | ```
205 |
206 | ### Under the Hood
207 |
208 | #### How It Works
209 |
210 | The heavy lifting happens in the `FileProcessor` struct (`src/processing.rs`). Here's what makes it tick:
211 |
212 | 1. **Smart File Reading**:
213 | - Uses async I/O with `tokio` for non-blocking file access
214 | - Buffers reads to minimize system calls
215 |
216 | 2. **Reliable Error Handling**:
217 | - Logs issues but keeps going
218 | - Won't let one bad file stop the whole show
219 |
220 | 3. **Line-by-Line Processing**:
221 | - Handles each line individually
222 | - Gracefully skips UTF-8 issues
223 |
224 | 4. **Progress Tracking**:
225 | - Keeps tabs on processed files
226 | - Makes resuming interrupted jobs seamless
227 |
228 | #### Performance Tricks
229 |
230 | 1. **Parallel Power**:
231 | - Spreads work across multiple threads (default: 10)
232 | - Built on `tokio` for efficient async processing
233 |
234 | 2. **Smart Deduplication**:
235 | - Uses `HashSet` for O(1) lookups
236 | - Keeps memory usage in check
237 |
238 | 3. **Visual Feedback**:
239 | - Real-time progress bars
240 | - Shows you exactly what's happening
241 |
242 | 4. **Interruption-Proof**:
243 | - Handles Ctrl+C gracefully
244 | - Saves progress for later
245 | - Managed by `AppState` in `src/app_state.rs`
246 |
247 | 5. **Flexible Configuration**:
248 | - JSON config support via `--config <FILE>`
249 | - Interactive setup with `--guided-setup`
250 |
251 | This tool is built to be reliable, efficient, and adaptable to your needs. Whether you're merging a few files or processing thousands, it's got you covered.
--------------------------------------------------------------------------------
/rustmerger_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_files": "/tmp/wordlists_to_merge_dev.txt",
3 | "output_files": "/tmp/merged_wordlist.txt",
4 | "threads": 90,
5 | "verbose": true,
6 | "debug": true
7 | }
--------------------------------------------------------------------------------
/src/app_state.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result; // Importing Result type from anyhow crate for error handling
2 | use std::path::PathBuf; // Importing PathBuf to handle file paths
3 | use tokio::sync::RwLock; // Importing RwLock from tokio for async read-write lock
4 | use std::sync::Arc; // Importing Arc for atomic reference counting
5 | use crate::progress::Progress; // Importing Progress struct from the local crate
6 |
7 | #[allow(dead_code)]
8 | // AppState struct holds the state of the application
9 | pub struct AppState {
10 | pub input_file: PathBuf, // Path to the input file
11 | pub output_file: PathBuf, // Path to the output file
12 | pub threads: usize, // Number of threads to use for processing
13 | pub progress: Arc>, // Progress tracking wrapped in an async read-write lock and atomic reference counter
14 | pub shutdown_requested: Arc>, // Flag to indicate if shutdown is requested, wrapped in an async read-write lock and atomic reference counter
15 | }
16 |
17 | impl AppState {
18 | // Asynchronous function to create a new AppState instance
19 | pub async fn new(input_file: PathBuf, output_file: PathBuf, threads: usize) -> Result {
20 | Ok(Self {
21 | input_file, // Set input file path
22 | output_file, // Set output file path
23 | threads, // Set number of threads
24 | progress: Arc::new(RwLock::new(Progress::default())), // Initialize progress with default value, wrapped in Arc and RwLock
25 | shutdown_requested: Arc::new(RwLock::new(false)), // Initialize shutdown_requested to false, wrapped in Arc and RwLock
26 | })
27 | }
28 |
29 | // Asynchronous function to create an AppState instance from a resume file
30 | pub async fn from_resume(resume_file: PathBuf) -> Result {
31 | let progress = Progress::load(&resume_file).await?; // Load progress from the resume file
32 | Ok(Self {
33 | input_file: progress.input_file.clone(), // Set input file path from progress
34 | output_file: progress.output_file.clone(), // Set output file path from progress
35 | threads: progress.threads, // Set number of threads from progress
36 | progress: Arc::new(RwLock::new(progress)), // Wrap loaded progress in Arc and RwLock
37 | shutdown_requested: Arc::new(RwLock::new(false)), // Initialize shutdown_requested to false, wrapped in Arc and RwLock
38 | })
39 | }
40 |
41 | // Asynchronous function to save the current progress
42 | pub async fn save_progress(&self) -> Result<()> {
43 | let progress = self.progress.read().await; // Acquire read lock on progress
44 | progress.save().await // Save the progress
45 | }
46 |
47 | // Asynchronous function to request shutdown
48 | pub async fn request_shutdown(&self) {
49 | *self.shutdown_requested.write().await = true; // Acquire write lock and set shutdown_requested to true
50 | }
51 |
52 | // Asynchronous function to check if shutdown is requested
53 | pub async fn should_shutdown(&self) -> bool {
54 | *self.shutdown_requested.read().await // Acquire read lock and return the value of shutdown_requested
55 | }
56 | }
--------------------------------------------------------------------------------
/src/cli.rs:
--------------------------------------------------------------------------------
1 | // Import required dependencies
2 | use clap::{Parser, Subcommand}; // For command-line argument parsing
3 | use std::path::PathBuf; // For handling file paths
4 | use log::LevelFilter; // For controlling log levels
5 |
// Main CLI structure that defines the application's command-line interface.
// Parsed via clap's derive API; `verbose` and `log_level` are global options
// available to every subcommand.
#[derive(Parser)]
#[command(
    name = "rustmerger",
    about = "Fast parallel merging and deduplication of wordlists and rules",
    version,
    author,
    long_about = None
)]
pub struct Cli {
    // Global verbose flag that can be repeated (-v, -vv, ...).
    // Each occurrence increases the verbosity level; see `Cli::log_level`
    // and `Cli::verbose_count` for how the count is consumed.
    #[arg(
        global = true, // Available to all subcommands
        short = 'v', // Can be used as -v
        long = "verbose", // Can be used as --verbose
        action = clap::ArgAction::Count, // Counts number of occurrences
        help = "Set verbosity level (-v: debug, -vv: trace)"
    )]
    verbose: u8,

    // The selected subcommand (merge / generate-config / guided-setup / resume).
    #[command(subcommand)]
    pub command: Commands,

    // Textual log level (error|warn|info|debug|trace); unrecognized values
    // fall back to "info" in `Cli::log_level`.
    #[arg(long, default_value = "info")]
    log_level: String,
}
33 |
// Enum defining all available subcommands; each variant carries its own
// argument struct (defined below in this module).
#[derive(Subcommand)]
pub enum Commands {
    // Merge subcommand for combining and deduplicating wordlists and rules.
    #[command(about = "Merge wordlists and rules")]
    Merge(MergeArgs),

    // Generate a JSON configuration file (optionally from the default template).
    #[command(about = "Generate configuration file")]
    GenerateConfig(GenerateConfigArgs),

    // Interactive, prompt-driven configuration setup.
    #[command(about = "Run guided setup")]
    GuidedSetup(GuidedSetupArgs),

    // Resume an interrupted operation from a saved progress file.
    #[command(about = "Resume interrupted operation")]
    Resume(ResumeArgs),
}
53 |
54 | // Structure defining all possible arguments for the merge command
55 | #[derive(Parser, Clone)]
56 | pub struct MergeArgs {
57 | // Input file containing list of wordlist paths
58 | #[arg(
59 | short = 'w',
60 | long = "wordlists-file",
61 | help = "Text file containing one wordlist path per line",
62 | value_name = "FILE"
63 | )]
64 | pub wordlists_file: Option,
65 |
66 | // Input file containing list of rule paths
67 | #[arg(
68 | short = 'r',
69 | long = "rules-file",
70 | help = "Text file containing one rule path per line",
71 | value_name = "FILE"
72 | )]
73 | pub rules_file: Option,
74 |
75 | // Output path for merged wordlist
76 | #[arg(
77 | long = "output-wordlist",
78 | help = "Destination path for merged and deduplicated wordlist",
79 | value_name = "FILE"
80 | )]
81 | pub output_wordlist: Option,
82 |
83 | // Output path for merged rules
84 | #[arg(
85 | long = "output-rules",
86 | help = "Destination path for merged and deduplicated rules",
87 | value_name = "FILE"
88 | )]
89 | pub output_rules: Option,
90 |
91 | // Configuration file path
92 | #[arg(
93 | short = 'c',
94 | long = "config",
95 | help = "JSON configuration file with default settings",
96 | value_name = "FILE"
97 | )]
98 | pub config: Option,
99 |
100 | // Progress state file for resume capability
101 | #[arg(
102 | long = "progress-file",
103 | help = "Save progress state for resume capability",
104 | value_name = "FILE"
105 | )]
106 | pub progress_file: Option,
107 |
108 | // Debug mode flag
109 | #[arg(
110 | short = 'd',
111 | long = "debug",
112 | help = "Enable detailed progress output"
113 | )]
114 | pub debug: bool,
115 | }
116 |
// Arguments for the generate-config command.
#[derive(Parser, Clone)]
pub struct GenerateConfigArgs {
    // Positional argument: destination path where the configuration file
    // will be written.
    #[arg(
        help = "Destination path for configuration file",
        value_name = "FILE"
    )]
    pub output: PathBuf,

    // When set, emit the default configuration template.
    #[arg(
        short = 't',
        long = "template",
        help = "Generate default configuration template"
    )]
    pub template: bool,
}
135 |
// Arguments for the guided-setup command.
#[derive(Parser, Clone)]
pub struct GuidedSetupArgs {
    // Positional argument: destination path for the configuration produced
    // by the interactive prompts.
    #[arg(
        help = "Destination path for interactive configuration",
        value_name = "FILE"
    )]
    pub output: PathBuf,
}
146 |
// Arguments for the resume command.
#[derive(Parser, Clone)]
pub struct ResumeArgs {
    // Positional argument: path to a previously saved progress state file.
    #[arg(
        help = "Path to saved progress state file",
        value_name = "FILE"
    )]
    pub progress_file: PathBuf,
}
157 |
158 | // Implementation of helper methods for the Cli struct
159 | impl Cli {
160 | // Convert verbose flag count to appropriate log level
161 | pub fn log_level(&self) -> LevelFilter {
162 | match self.log_level.as_str() {
163 | "error" => LevelFilter::Error,
164 | "warn" => LevelFilter::Warn,
165 | "info" => LevelFilter::Info,
166 | "debug" => LevelFilter::Debug,
167 | "trace" => LevelFilter::Trace,
168 | _ => LevelFilter::Info,
169 | }
170 | }
171 |
172 | // Add this new method
173 | pub fn verbose_count(&self) -> u8 {
174 | self.verbose
175 | }
176 | }
--------------------------------------------------------------------------------
/src/commands.rs:
--------------------------------------------------------------------------------
1 | // Import required dependencies
2 | use anyhow::Result; // For error handling
3 | use std::path::PathBuf; // For file path operations
4 | use std::sync::Arc; // For thread-safe reference counting
5 | use log::{info, warn}; // For logging
6 | use crate::errors::{MergerError, MergerResult};
7 |
8 | // Import local modules
9 | use crate::{
10 | app_state::AppState, // Application state management
11 | config::Config, // Configuration handling
12 | core::ProcessingCore, // Core processing logic
13 | cli::{Cli, MergeArgs, GenerateConfigArgs, GuidedSetupArgs, ResumeArgs}, // CLI arguments
14 | signal_handler::SignalHandler, // Add this with other imports
15 | };
16 |
17 | // Command handler for processing CLI commands
18 | pub struct CommandHandler;
19 |
20 | impl CommandHandler {
21 | // Handle the merge command - combines wordlists and rules
22 | pub async fn handle_merge(cli: &Cli, args: MergeArgs) -> Result<()> {
23 | info!("Starting merge operation");
24 |
25 | // Load existing config or create default template
26 | let config = if let Some(config_path) = args.config {
27 | Config::load(&config_path).await?
28 | } else {
29 | Config::default()
30 | };
31 |
32 | // Create thread-safe application state
33 | let app_state = Arc::new(AppState::new(
34 | args.wordlists_file
35 | .or(config.input_files)
36 | .ok_or_else(|| anyhow::anyhow!("No wordlists file specified"))?,
37 | args.output_wordlist
38 | .or(config.output_files)
39 | .ok_or_else(|| anyhow::anyhow!("No output file specified"))?,
40 | if let Some(threads) = config.threads {
41 | threads
42 | } else {
43 | 10 // Default to 10 threads if not specified
44 | }
45 | ).await?);
46 |
47 | // Fix debug and verbose settings
48 | let debug_enabled = args.debug || config.debug; // Enable debug if specified in args or config
49 | let verbose_enabled = cli.verbose_count() > 0 || config.verbose; // Enable verbose if specified in CLI or config
50 |
51 | // Set up signal handler
52 | let signal_handler = SignalHandler::new(app_state.clone())?;
53 | signal_handler.setup_handlers()?;
54 |
55 | // Create processing core and start processing
56 | let mut core = ProcessingCore::new(
57 | app_state.clone(),
58 | debug_enabled,
59 | verbose_enabled
60 | ).await?;
61 |
62 | if let Err(e) = core.process().await {
63 | warn!("Error during processing: {}", e);
64 | }
65 |
66 | info!("Merge operation completed");
67 | Ok(())
68 | }
69 |
70 | // Handle configuration file generation
71 | pub async fn handle_generate_config(args: GenerateConfigArgs) -> Result<()> {
72 | info!("Generating configuration file");
73 |
74 | // Create default template config
75 | let config = if args.template {
76 | Config::template()
77 | } else {
78 | Config::template()
79 | };
80 |
81 | // Save configuration to specified path
82 | config.save(&args.output).await?;
83 |
84 | info!("Configuration file generated at: {:?}", args.output);
85 | Ok(())
86 | }
87 |
88 | // Handle interactive setup process
89 | pub async fn handle_guided_setup(args: GuidedSetupArgs) -> Result<()> {
90 | info!("Starting guided setup");
91 |
92 | // Run interactive configuration
93 | let config = Config::guided_setup().await?;
94 | config.save(&args.output).await?;
95 |
96 | info!("Configuration saved to: {:?}", args.output);
97 | Ok(())
98 | }
99 |
100 | // Handle resuming from a previous state
101 | #[allow(dead_code)]
102 | pub async fn handle_resume(args: ResumeArgs) -> Result<()> {
103 | info!("Resuming from progress file: {:?}", args.progress_file);
104 |
105 | // Create application state with default values
106 | let app_state = Arc::new(AppState::new(
107 | args.progress_file.clone(),
108 | PathBuf::from("/tmp/output.txt"), // Default output path
109 | 10 // Default threads
110 | ).await?);
111 |
112 | // Initialize processing core with minimal logging
113 | let mut core = ProcessingCore::new(
114 | app_state.clone(),
115 | false, // Debug disabled
116 | false // Verbose disabled
117 | ).await?;
118 |
119 | // Resume processing and handle errors
120 | if let Err(e) = core.process().await {
121 | warn!("Error during processing: {}", e);
122 | }
123 |
124 | info!("Resume operation completed");
125 | Ok(())
126 | }
127 | }
--------------------------------------------------------------------------------
/src/config.rs:
--------------------------------------------------------------------------------
1 | // Import required dependencies
2 | use serde::{Serialize, Deserialize}; // For JSON serialization/deserialization
3 | use std::path::PathBuf; // For file path handling
4 | use anyhow::Result; // For error handling
5 | use tokio::fs; // For async file operations
6 | use dialoguer::{Input, Confirm}; // For interactive CLI prompts
7 | use anyhow::Context;
8 | use std::io::{BufReader, BufWriter, BufRead, Write};
9 | use crate::errors::{MergerError, MergerResult, ConfigError};
10 |
11 | // Configuration structure that can be serialized to/from JSON
12 | #[derive(Debug, Serialize, Deserialize)]
13 | pub struct Config {
14 | pub input_files: Option, // Path to file containing list of input files
15 | pub output_files: Option, // Path where merged output will be written
16 | pub threads: Option, // Number of parallel processing threads
17 | pub verbose: bool, // Enable detailed logging
18 | pub debug: bool, // Enable debug mode
19 | }
20 |
21 | impl Default for Config {
22 | fn default() -> Self {
23 | Self {
24 | input_files: None,
25 | output_files: None,
26 | threads: Some(10),
27 | verbose: true,
28 | debug: true,
29 | }
30 | }
31 | }
32 |
33 | impl Config {
34 | // Load configuration from a JSON file
35 | pub async fn load(path: &PathBuf) -> MergerResult {
36 | let content = fs::read_to_string(path).await
37 | .map_err(MergerError::Io)?;
38 | serde_json::from_str(&content)
39 | .map_err(|e| MergerError::Config(ConfigError::InvalidFormat(e.to_string())))
40 | }
41 |
42 | // Save configuration to a JSON file
43 | pub async fn save(&self, path: &PathBuf) -> MergerResult<()> {
44 | let content = serde_json::to_string_pretty(self)
45 | .map_err(|e| MergerError::Config(ConfigError::SerializationError(e.to_string())))?;
46 | fs::write(path, content).await
47 | .map_err(MergerError::Io)
48 | }
49 |
50 | // Create a default configuration template
51 | pub fn template() -> Self {
52 | Self {
53 | input_files: None,
54 | output_files: None,
55 | threads: Some(10),
56 | verbose: true,
57 | debug: true,
58 | }
59 | }
60 |
61 | // Interactive configuration setup using command-line prompts
62 | pub async fn guided_setup() -> MergerResult {
63 | // Prompt for input files path with default value
64 | let input_files: String = Input::new()
65 | .with_prompt("Enter path to input files list")
66 | .default("/tmp/wordlists_to_merge.txt".into())
67 | .interact()?;
68 |
69 | // Prompt for output file path with default value
70 | let output_files: String = Input::new()
71 | .with_prompt("Enter path for output file")
72 | .default("/tmp/merged_wordlist.txt".into())
73 | .interact()?;
74 |
75 | // Prompt for number of processing threads
76 | let threads: String = Input::new()
77 | .with_prompt("Enter number of threads")
78 | .default("50".into())
79 | .interact()?;
80 |
81 | // Confirm whether to enable verbose logging
82 | let verbose = Confirm::new()
83 | .with_prompt("Enable verbose logging?")
84 | .default(true)
85 | .interact()?;
86 |
87 | // Confirm whether to enable debug mode
88 | let debug = Confirm::new()
89 | .with_prompt("Enable debug logging?")
90 | .default(false)
91 | .interact()?;
92 |
93 | // Parse threads with proper error handling
94 | let threads = threads.parse::()
95 | .map_err(|_| MergerError::Config(ConfigError::InvalidThreadCount(0)))?;
96 |
97 | if threads == 0 || threads > 100 {
98 | return Err(MergerError::Config(ConfigError::InvalidThreadCount(threads)));
99 | }
100 |
101 | // Create and return configuration with user-provided values
102 | Ok(Self {
103 | input_files: Some(PathBuf::from(input_files)),
104 | output_files: Some(PathBuf::from(output_files)),
105 | threads: Some(threads),
106 | verbose,
107 | debug,
108 | })
109 | }
110 |
111 | // Replace the existing validate method with this implementation
112 | pub fn validate(&self) -> Result<(), ConfigError> {
113 | // Validate thread count
114 | if let Some(threads) = self.threads {
115 | if threads == 0 || threads > 100 {
116 | return Err(ConfigError::InvalidThreadCount(threads));
117 | }
118 | }
119 |
120 | // Validate input files path exists
121 | let input_path = self.input_files
122 | .as_ref()
123 | .ok_or(ConfigError::MissingInputFiles)?;
124 |
125 | if !input_path.exists() {
126 | return Err(ConfigError::InputFileNotFound(input_path.clone()));
127 | }
128 |
129 | // Validate output files path
130 | let output_path = self.output_files
131 | .as_ref()
132 | .ok_or(ConfigError::MissingOutputFiles)?;
133 |
134 | // Check if input and output paths are the same
135 | if input_path == output_path {
136 | return Err(ConfigError::InputOutputPathsEqual);
137 | }
138 |
139 | // Validate output directory exists and is writable
140 | if let Some(parent) = output_path.parent() {
141 | if !parent.exists() {
142 | return Err(ConfigError::OutputDirectoryNotWritable(parent.to_path_buf()));
143 | }
144 |
145 | // Check if directory is writable by attempting to create a temporary file
146 | if let Ok(temp_path) = tempfile::Builder::new()
147 | .prefix(".test-write-")
148 | .tempfile_in(parent)
149 | {
150 | // Clean up temporary file
151 | let _ = temp_path.close();
152 | } else {
153 | return Err(ConfigError::OutputDirectoryNotWritable(parent.to_path_buf()));
154 | }
155 | }
156 |
157 | Ok(())
158 | }
159 | }
--------------------------------------------------------------------------------
/src/config_validator.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Context, Result}; // Importing Context and Result from the anyhow crate for error handling
2 | use std::path::Path; // Importing Path from the standard library for file path handling
3 | use crate::Config; // Importing the Config struct from the current crate
4 |
5 | // Define a struct for configuration validation
6 | pub struct ConfigValidator;
7 |
8 | impl ConfigValidator {
9 | // Function to validate the entire configuration
10 | pub fn validate_config(config: &Config) -> Result<()> {
11 | // Validate input files path
12 | Self::validate_input_file(&config.input_files)
13 | .context("Invalid input files configuration")?;
14 |
15 | // Validate output files path
16 | if let Some(parent) = config.output_files.parent() {
17 | Self::validate_directory(parent)
18 | .context("Invalid output directory")?;
19 | }
20 |
21 | // Validate thread count
22 | if config.threads == 0 {
23 | return Err(anyhow::anyhow!("Thread count must be greater than 0"));
24 | }
25 |
26 | Ok(())
27 | }
28 |
29 | // Function to validate an input file path
30 | fn validate_input_file(path: &Path) -> Result<()> {
31 | // Check if the file exists
32 | if !path.exists() {
33 | return Err(anyhow::anyhow!("File does not exist: {:?}", path));
34 | }
35 | // Check if the path is a file
36 | if !path.is_file() {
37 | return Err(anyhow::anyhow!("Path is not a file: {:?}", path));
38 | }
39 | Ok(())
40 | }
41 |
42 | // Function to validate a directory path
43 | fn validate_directory(path: &Path) -> Result<()> {
44 | // Check if the path exists and is a directory
45 | if path.exists() && !path.is_dir() {
46 | return Err(anyhow::anyhow!("Path exists but is not a directory: {:?}", path));
47 | }
48 | Ok(())
49 | }
50 | }
--------------------------------------------------------------------------------
/src/core.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result; // Import Result type from anyhow crate for error handling
2 | use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; // Import progress bar utilities from indicatif crate
3 | use std::path::{Path, PathBuf}; // Import Path and PathBuf for file path handling
4 | use std::sync::Arc; // Import Arc for thread-safe reference counting
5 | use tokio::io::{AsyncWriteExt, AsyncSeekExt, BufWriter, BufReader, AsyncBufReadExt}; // Remove duplicate AsyncWriteExt
6 | use tokio::fs::File;
7 | use futures::StreamExt;
8 | use std::collections::HashSet;
9 | use tokio::sync::mpsc;
10 | use futures::stream::{self, FuturesUnordered};
11 | use crate::app_state::AppState;
12 | use serde::{Serialize, Deserialize};
13 | use tokio::fs::OpenOptions;
14 | use tokio::io::SeekFrom;
15 | use sys_info;
16 | use crate::progress::ProgressTracker;
17 | use std::sync::atomic::{AtomicUsize, Ordering};
18 | use crate::errors::{MergerError, MergerResult};
19 |
20 | const CHUNK_SIZE: usize = 1024 * 1024 * 10; // 10MB chunks
21 | const BUFFER_SIZE: usize = 1024 * 1024 * 32; // 32MB buffer
22 | const CHANNEL_SIZE: usize = 1000; // Number of chunks to keep in memory
23 | const PARALLEL_FILES: usize = 4; // Number of files to process in parallel
24 | const LINE_BUFFER_CAPACITY: usize = 1024 * 64; // 64KB initial line buffer
25 | const OUTPUT_CHUNKS: usize = 8;
26 | const OUTPUT_BUFFER_SIZE: usize = 1024 * 1024 * 16; // 16MB output buffer
27 |
28 | // Define a struct to manage the core processing logic
29 | #[allow(dead_code)]
30 | pub struct ProcessingCore {
31 | app_state: Arc, // Shared application state
32 | tracker: ProgressTracker, // Replace progress: MultiProgress with tracker
33 | verbose: bool, // Flag to enable verbose logging
34 | debug: bool, // Flag to enable debug mode
35 | }
36 |
37 | // Implement methods for ProcessingCore
38 | impl ProcessingCore {
39 | // Asynchronous constructor for ProcessingCore
40 | pub async fn new(app_state: Arc, verbose: bool, debug: bool) -> MergerResult {
41 | // Estimate total files and lines
42 | let input_file = &app_state.input_file;
43 | let content = tokio::fs::read_to_string(input_file).await?;
44 | let total_files = content.lines().count();
45 |
46 | // Rough estimation of lines (can be adjusted based on your needs)
47 | let estimated_lines = total_files * 1000; // Assuming average 1000 lines per file
48 |
49 | Ok(Self {
50 | app_state,
51 | tracker: ProgressTracker::new(total_files, estimated_lines),
52 | verbose,
53 | debug,
54 | })
55 | }
56 |
57 | // Main processing function
58 | pub async fn process(&mut self) -> MergerResult<()> {
59 | if self.verbose {
60 | println!("Starting the processing of files...");
61 | }
62 |
63 | let input_path = self.app_state.input_file.clone();
64 | let files = match Self::read_input_files(&input_path).await {
65 | Ok(f) => f,
66 | Err(e) => {
67 | self.log_error(&format!("Failed to read input files: {}", e)).await?;
68 | return Ok(());
69 | }
70 | };
71 |
72 | let mut files_processed = 0;
73 | let app_state = Arc::clone(&self.app_state);
74 |
75 | for file in files {
76 | if app_state.should_shutdown().await {
77 | self.tracker.finish();
78 | return Ok(());
79 | }
80 |
81 | let file_path = file.clone();
82 | let result = self.process_single_file(file_path.clone(), &app_state).await;
83 | if let Err(e) = result {
84 | let error_msg = format!("Error processing file {:?}: {}", file_path, e);
85 | self.log_error(&error_msg).await?;
86 | continue;
87 | }
88 |
89 | files_processed += 1;
90 | self.tracker.update_overall_progress(files_processed);
91 | }
92 |
93 | println!("Starting merge and deduplication process...");
94 | self.merge_and_deduplicate().await?;
95 |
96 | self.tracker.finish();
97 | println!("Processing completed successfully");
98 |
99 | Ok(())
100 | }
101 |
102 | // Function to merge files and remove duplicates
103 | async fn merge_and_deduplicate(&mut self) -> MergerResult<()> {
104 | let files = self.validate_and_collect_metadata(&self.app_state.progress.read().await.processed_files).await?;
105 | let optimized_files = optimize_processing_order(files).await;
106 |
107 | // Calculate optimal batch size based on available system memory
108 | let mem_info = sys_info::mem_info()?;
109 | let available_memory = (mem_info.avail as usize * 1024) / 2;
110 | let batch_size = (available_memory / std::mem::size_of::()).min(CHUNK_SIZE);
111 |
112 | let (tx, mut rx) = mpsc::channel::>(CHANNEL_SIZE);
113 | let unique_count = Arc::new(AtomicUsize::new(0));
114 |
115 | // Spawn writer task with optimized batching
116 | let writer_task = tokio::spawn({
117 | let unique_count = unique_count.clone();
118 | async move {
119 | let mut final_set = HashSet::with_capacity(batch_size);
120 |
121 | while let Some(mut chunk_set) = rx.recv().await {
122 | final_set.extend(chunk_set.drain());
123 | unique_count.store(final_set.len(), Ordering::Relaxed);
124 | }
125 | final_set
126 | }
127 | });
128 |
129 | // Process files in parallel with optimized ordering
130 | let mut total_lines_processed = 0;
131 |
132 | // Process files in chunks
133 | for chunk in optimized_files.chunks(PARALLEL_FILES) {
134 | let tx = tx.clone();
135 | let chunk_files = chunk.to_vec();
136 |
137 | for file in chunk_files {
138 | if let Ok(lines_count) = Self::process_large_file(&file, tx.clone(), batch_size).await {
139 | total_lines_processed += lines_count;
140 | let current_unique = unique_count.load(Ordering::Relaxed);
141 | self.tracker.update_dedup_progress(current_unique, total_lines_processed);
142 | }
143 | }
144 | }
145 |
146 | drop(tx); // Close the channel
147 |
148 | // Get the final set and write results
149 | let unique_lines = writer_task.await?;
150 | let file = File::create(&self.app_state.output_file).await?;
151 | let mut writer = BufWriter::with_capacity(BUFFER_SIZE, file);
152 | let total_unique = unique_lines.len();
153 |
154 | println!("Writing {} unique lines to output file", total_unique);
155 |
156 | let mut buffer = String::with_capacity(CHUNK_SIZE);
157 | for line in unique_lines {
158 | buffer.push_str(&line);
159 | buffer.push('\n');
160 |
161 | if buffer.len() >= CHUNK_SIZE {
162 | writer.write_all(buffer.as_bytes()).await?;
163 | buffer.clear();
164 | }
165 | }
166 |
167 | if !buffer.is_empty() {
168 | writer.write_all(buffer.as_bytes()).await?;
169 | }
170 |
171 | writer.flush().await?;
172 | self.tracker.update_dedup_progress(total_unique, total_lines_processed);
173 |
174 | Ok(())
175 | }
176 |
177 | // Move process_large_file into the impl block and make it an associated function
178 | async fn process_large_file(
179 | path: &PathBuf,
180 | tx: mpsc::Sender>,
181 | chunk_size: usize,
182 | ) -> MergerResult {
183 | let file = File::open(path).await?;
184 | let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
185 | let mut buffer = Vec::with_capacity(LINE_BUFFER_CAPACITY);
186 | let mut current_set = HashSet::with_capacity(chunk_size);
187 | let mut bytes_processed = 0;
188 | let mut total_lines = 0;
189 |
190 | loop {
191 | buffer.clear();
192 | match reader.read_until(b'\n', &mut buffer).await? {
193 | 0 => break,
194 | n => {
195 | bytes_processed += n;
196 | if !buffer.is_empty() {
197 | if let Ok(line) = String::from_utf8(buffer[..n-1].to_vec()) {
198 | if !line.is_empty() {
199 | current_set.insert(line);
200 | total_lines += 1;
201 | }
202 | }
203 | }
204 | }
205 | }
206 |
207 | if bytes_processed >= CHUNK_SIZE || current_set.len() >= chunk_size {
208 | tx.send(current_set).await?;
209 | current_set = HashSet::with_capacity(chunk_size);
210 | bytes_processed = 0;
211 | }
212 | }
213 |
214 | if !current_set.is_empty() {
215 | tx.send(current_set).await?;
216 | }
217 |
218 | Ok(total_lines)
219 | }
220 |
221 | // Function to read input files from the provided path
222 | async fn read_input_files(input_file: &Path) -> Result> {
223 | let content = tokio::fs::read_to_string(input_file).await?;
224 | Ok(content.lines()
225 | .map(PathBuf::from)
226 | .collect())
227 | }
228 |
229 | // Function to process a single file
230 | async fn process_single_file(&mut self, file: PathBuf, app_state: &Arc) -> Result<()> {
231 | if app_state.should_shutdown().await {
232 | return Err(anyhow::anyhow!("Processing interrupted by shutdown signal")); // Return an error if shutdown is requested
233 | }
234 |
235 | let content = match tokio::fs::read_to_string(&file).await {
236 | Ok(content) => content,
237 | Err(e) => {
238 | self.log_error(&format!("Error reading {}: {}", file.display(), e)).await?;
239 | return Ok(());
240 | }
241 | };
242 |
243 | // Process the content here
244 | let mut progress = app_state.progress.write().await; // Acquire a write lock on the progress state
245 | progress.processed_files.push(file.clone()); // Add the file to the list of processed files
246 | progress.current_position += content.lines().count(); // Update the current position
247 | progress.save().await?; // Save the progress state
248 |
249 | if self.verbose {
250 | log::debug!("Processed file: {}", file.display()); // Log the processed file if verbose is enabled
251 | }
252 |
253 | Ok(())
254 | }
255 |
256 | // Function to validate the input files
257 | async fn validate_files(&mut self, files: &[PathBuf]) -> Result<()> {
258 | for (i, file) in files.iter().enumerate() {
259 | if !file.exists() {
260 | self.log_error(&format!("File not found: {}", file.display())).await?;
261 | continue;
262 | }
263 | self.tracker.update_overall_progress(i + 1);
264 | }
265 | Ok(())
266 | }
267 |
268 | // Function to log errors to a file
269 | async fn log_error(&self, message: &str) -> Result<()> {
270 | let mut file = tokio::fs::OpenOptions::new()
271 | .create(true)
272 | .append(true)
273 | .open("error.log")
274 | .await?;
275 |
276 | let error_message = format!("[{}] {}\n",
277 | chrono::Local::now().format("%Y-%m-%d %H:%M:%S"), // Get the current timestamp
278 | message
279 | );
280 |
281 | file.write_all(error_message.as_bytes()).await?; // Write the error message to the file
282 | file.sync_all().await?; // Sync the file to ensure all data is written
283 | Ok(())
284 | }
285 |
286 | async fn validate_and_collect_metadata(&self, files: &[PathBuf]) -> Result> {
287 | let mut valid_files = Vec::with_capacity(files.len());
288 |
289 | // Process files in parallel batches
290 | let batch_size = 50; // Validate 50 files at a time
291 | for chunk in files.chunks(batch_size) {
292 | let futures: FuturesUnordered<_> = chunk.iter().map(|path| async move {
293 | match tokio::fs::metadata(path).await {
294 | Ok(meta) => Some((path.clone(), meta.len())),
295 | Err(e) => {
296 | eprintln!("Error accessing file {}: {}", path.display(), e);
297 | None
298 | }
299 | }
300 | }).collect();
301 |
302 | // Collect results from this batch
303 | let batch_results: Vec<_> = futures
304 | .filter_map(|result| async move { result })
305 | .collect()
306 | .await;
307 |
308 | // Extend valid_files with batch results
309 | valid_files.extend(batch_results);
310 | }
311 |
312 | Ok(valid_files)
313 | }
314 | }
315 |
// Lifecycle stages of a processing run; serializable so the current stage
// can be persisted alongside progress state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProcessingStage {
    Initializing, // Start-up: state and trackers are being created
    ValidatingFiles, // Checking that every listed input file is accessible
    ProcessingFiles, // Reading input files and recording per-file progress
    Merging, // Combining results and deduplicating lines
    Completed, // Pipeline finished successfully
    Failed, // Pipeline aborted after an unrecoverable error
}
326 |
327 | async fn write_chunk(
328 | lines: Vec,
329 | file: &Path,
330 | offset: u64,
331 | ) -> Result<()> {
332 | let mut file = OpenOptions::new()
333 | .write(true)
334 | .create(true)
335 | .open(file)
336 | .await?;
337 | file.seek(SeekFrom::Start(offset)).await?;
338 | let mut writer = BufWriter::with_capacity(OUTPUT_BUFFER_SIZE, file);
339 |
340 | for line in lines {
341 | writer.write_all(line.as_bytes()).await?;
342 | writer.write_all(b"\n").await?;
343 | }
344 | writer.flush().await?;
345 | Ok(())
346 | }
347 |
/// Order files for merging: all large files (>= 1 GB) first — while memory
/// is freshest — then medium (< 1 GB), then small (< 100 MB); within each
/// bucket, largest first. `async` only to match its `.await` call sites;
/// it performs no awaiting itself.
async fn optimize_processing_order(files: Vec<(PathBuf, u64)>) -> Vec<PathBuf> {
    // Sort by size descending so every bucket ends up internally largest-first
    let mut sorted_files = files;
    sorted_files.sort_by(|a, b| b.1.cmp(&a.1));

    // Bucket files by size range so similar-sized files are processed together
    let mut optimized = Vec::with_capacity(sorted_files.len());
    let mut small = Vec::new();
    let mut medium = Vec::new();
    let mut large = Vec::new();

    for (path, size) in sorted_files {
        match size {
            s if s < 1024 * 1024 * 100 => small.push(path), // < 100MB
            s if s < 1024 * 1024 * 1000 => medium.push(path), // < 1GB
            _ => large.push(path), // >= 1GB
        }
    }

    // Process largest files first when memory is fresh
    optimized.extend(large);
    optimized.extend(medium);
    optimized.extend(small);
    optimized
}
--------------------------------------------------------------------------------
/src/display.rs:
--------------------------------------------------------------------------------
1 | use std::io::{self, Stdout, Write}; // Importing necessary modules from the standard library
2 | use std::time::Instant; // Importing Instant for tracking elapsed time
3 |
// Manages an in-place, single-line status display on the terminal:
// each update overwrites the previous line rather than scrolling.
pub struct StatusDisplay {
    stdout: Stdout, // Standard output handle
    last_line_length: usize, // Length of the last printed line, used to blank it out
    terminal_width: usize, // Detected terminal width in columns (falls back to 80)
    start_time: Instant, // Creation time, read by log_elapsed_time()
}
11 |
12 | impl StatusDisplay {
13 | // Function to create a new StatusDisplay instance
14 | pub fn new() -> io::Result {
15 | let stdout = io::stdout(); // Get the standard output handle
16 | let terminal_width = terminal_size::terminal_size() // Get the terminal size
17 | .map(|(w, _)| w.0 as usize) // Extract the width and convert to usize
18 | .unwrap_or(80); // Default to 80 if terminal size is not available
19 | let start_time = Instant::now(); // Record the current time as start time
20 |
21 | Ok(Self {
22 | stdout, // Initialize stdout
23 | last_line_length: 0, // Initialize last line length to 0
24 | terminal_width, // Initialize terminal width
25 | start_time, // Initialize start time
26 | })
27 | }
28 |
29 | // Function to update the status message on the terminal
30 | pub fn update_status(&mut self, message: &str) -> io::Result<()> {
31 | // Clear the previous line
32 | write!(self.stdout, "\r")?; // Move cursor to the beginning of the line
33 | for _ in 0..self.last_line_length {
34 | write!(self.stdout, " ")?; // Overwrite the previous line with spaces
35 | }
36 | write!(self.stdout, "\r")?; // Move cursor to the beginning of the line again
37 |
38 | // Write the new message
39 | write!(self.stdout, "{}", message)?; // Print the new message
40 | self.stdout.flush()?; // Flush the output to ensure it is displayed
41 |
42 | // Update the last line length
43 | self.last_line_length = message.len(); // Store the length of the new message
44 |
45 | Ok(())
46 | }
47 |
48 | // Function to update the progress bar on the terminal
49 | pub fn update_progress(&mut self, current: usize, total: usize, message: &str) -> io::Result<()> {
50 | let percentage = (current as f64 / total as f64 * 100.0) as usize; // Calculate the progress percentage
51 | let bar_width = 30; // Width of the progress bar
52 | let filled = (bar_width as f64 * (current as f64 / total as f64)) as usize; // Calculate the filled portion of the bar
53 |
54 | // Create the progress bar string
55 | let bar: String = format!(
56 | "[{}{}] {}/{} ({}%) {}",
57 | "=".repeat(filled), // Filled portion of the bar
58 | " ".repeat(bar_width - filled), // Empty portion of the bar
59 | current, // Current progress
60 | total, // Total progress
61 | percentage, // Progress percentage
62 | message // Additional message
63 | );
64 |
65 | self.update_status(&self.truncate_message(&bar)) // Update the status with the progress bar
66 | }
67 |
68 | // Function to truncate the message if it exceeds the terminal width
69 | fn truncate_message(&self, message: &str) -> String {
70 | if message.len() > self.terminal_width {
71 | format!("{}...", &message[..self.terminal_width - 3]) // Truncate and add ellipsis
72 | } else {
73 | message.to_string() // Return the original message if it fits
74 | }
75 | }
76 |
77 | // Function to finish the status display
78 | pub fn finish(&mut self) -> io::Result<()> {
79 | writeln!(self.stdout)?; // Print a newline to finish the status display
80 | self.stdout.flush() // Flush the output to ensure it is displayed
81 | }
82 |
83 | // Function to log the elapsed time since the start
84 | pub fn log_elapsed_time(&self) {
85 | let elapsed = self.start_time.elapsed(); // Calculate the elapsed time
86 | println!("Elapsed time: {:.2?}", elapsed); // Print the elapsed time
87 | }
88 | }
--------------------------------------------------------------------------------
/src/errors.rs:
--------------------------------------------------------------------------------
1 | use thiserror::Error;
2 | use std::path::PathBuf;
3 | use dialoguer;
4 | use tokio::task::JoinError;
5 | use tokio::sync::mpsc::error::SendError;
6 | use std::collections::HashSet;
7 | use ctrlc;
8 |
9 | /// Type alias for Result with MergerError as the error type
10 | pub type MergerResult = Result;
11 |
/// Custom error types for the file merger application.
///
/// Most variants wrap a lower-level error via `#[from]`, so `?` converts
/// automatically at call sites; the `String`-payload variants carry a
/// pre-formatted message.
#[derive(Error, Debug)]
pub enum MergerError {
    /// Standard IO errors
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Generic error handling via anyhow
    #[error("Internal error: {0}")]
    Anyhow(#[from] anyhow::Error),

    /// Configuration related errors
    #[error("Config error: {0}")]
    Config(#[from] ConfigError),

    /// System resource errors (e.g. memory-info lookup failures)
    #[error("System error: {0}")]
    SysInfo(#[from] sys_info::Error),

    /// File processing errors
    #[error("Processing error: {0}")]
    Processing(String),

    /// Thread/channel communication errors
    #[error("Channel error: {0}")]
    Channel(String),

    /// Input file validation errors
    #[error("Input validation error: {0}")]
    InputValidation(String),

    /// Progress tracking errors
    #[error("Progress tracking error: {0}")]
    Progress(String),

    /// Resume operation errors (wraps [`ResumeError`])
    #[error("Resume error: {source}")]
    Resume {
        #[from]
        source: ResumeError,
    },

    /// Deduplication errors
    #[error("Deduplication error: {0}")]
    Deduplication(String),

    /// UTF-8 encoding errors, tagged with the offending file
    #[error("Invalid UTF-8 in file {path}: {message}")]
    InvalidUtf8 {
        path: PathBuf,
        message: String,
    },
}
65 |
/// Errors specific to resuming an interrupted run from a progress file.
#[derive(Error, Debug)]
pub enum ResumeError {
    /// The progress file given on the command line does not exist
    #[error("Progress file not found: {0}")]
    ProgressFileNotFound(PathBuf),

    /// The progress file exists but does not parse as expected
    #[error("Invalid progress file format")]
    InvalidProgressFormat,

    /// The progress file parsed but its contents are inconsistent
    #[error("Progress file is corrupted")]
    CorruptedProgress,

    /// The input files no longer match what the progress file recorded
    #[error("Cannot resume: input files have changed")]
    InputFilesChanged,
}
81 |
/// Errors specific to loading, saving, and validating the configuration.
#[derive(Error, Debug)]
pub enum ConfigError {
    /// Thread count outside the accepted 1..=100 range (0 also doubles as
    /// the sentinel for an unparseable value during guided setup)
    #[error("Invalid thread count: {0}. Must be between 1 and 100")]
    InvalidThreadCount(usize),

    /// `input_files` was left unset
    #[error("Input files path must be specified")]
    MissingInputFiles,

    /// `output_files` was left unset
    #[error("Output files path must be specified")]
    MissingOutputFiles,

    /// The configured input list does not exist on disk
    #[error("Input file not found: {0}")]
    InputFileNotFound(PathBuf),

    /// The output directory is missing or failed the writability probe
    #[error("Output directory is not writable: {0}")]
    OutputDirectoryNotWritable(PathBuf),

    /// Input and output resolve to the same path
    #[error("Input and output paths cannot be the same")]
    InputOutputPathsEqual,

    /// The config file could not be deserialized
    #[error("Invalid configuration format: {0}")]
    InvalidFormat(String),

    /// The config could not be serialized for saving
    #[error("Serialization error: {0}")]
    SerializationError(String),
}
109 |
110 | impl From for MergerError {
111 | fn from(err: dialoguer::Error) -> Self {
112 | MergerError::Processing(err.to_string())
113 | }
114 | }
115 |
116 | impl From for MergerError {
117 | fn from(err: JoinError) -> Self {
118 | MergerError::Processing(format!("Task join error: {}", err))
119 | }
120 | }
121 |
122 | impl From> for MergerError {
123 | fn from(err: SendError) -> Self {
124 | MergerError::Channel(err.to_string())
125 | }
126 | }
127 |
128 | impl From for MergerError {
129 | fn from(err: serde_json::Error) -> Self {
130 | MergerError::Config(ConfigError::InvalidFormat(err.to_string()))
131 | }
132 | }
133 |
134 | impl From> for MergerError {
135 | fn from(err: std::sync::mpsc::SendError) -> Self {
136 | MergerError::Channel(err.to_string())
137 | }
138 | }
139 |
140 | impl From for MergerError {
141 | fn from(err: ctrlc::Error) -> Self {
142 | MergerError::Processing(format!("Ctrl+C handler error: {}", err))
143 | }
144 | }
--------------------------------------------------------------------------------
/src/file_utils.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result; // Import the Result type from the anyhow crate for error handling
2 | use std::{
3 | path::Path, // Import the Path struct for handling file paths
4 | fs::{File, OpenOptions}, // Import File and OpenOptions for file operations
5 | io::{BufRead, BufReader, BufWriter, Write}, // Import I/O traits and structs for reading and writing files
6 | };
7 | use log::warn; // Import the warn macro from the log crate for logging warnings
8 |
9 | // Define a struct for file utility functions
10 | pub struct FileUtils;
11 |
12 | impl FileUtils {
13 | // Ensure a directory exists, creating it if necessary
14 | pub async fn ensure_dir(path: &Path) -> Result<()> {
15 | // Check if the directory does not exist
16 | if !path.exists() {
17 | // Create the directory and all its parent directories
18 | tokio::fs::create_dir_all(path).await?;
19 | }
20 | Ok(())
21 | }
22 |
23 | // Atomically write content to a file
24 | pub async fn atomic_write(path: &Path, content: &[u8]) -> Result<()> {
25 | // Create a temporary file path with a ".tmp" extension
26 | let temp_path = path.with_extension("tmp");
27 | // Write the content to the temporary file
28 | tokio::fs::write(&temp_path, content).await?;
29 | // Rename the temporary file to the target file path
30 | tokio::fs::rename(temp_path, path).await?;
31 | Ok(())
32 | }
33 |
34 | // Read lines from a file and return them as a vector of strings
35 | pub fn read_lines(path: &Path) -> Result> {
36 | // Open the file for reading
37 | let file = File::open(path)?;
38 | // Create a buffered reader for the file
39 | let reader = BufReader::new(file);
40 | // Initialize an empty vector to store the lines
41 | let mut lines = Vec::new();
42 |
43 | // Iterate over the lines in the file
44 | for line in reader.lines() {
45 | match line {
46 | // If the line is read successfully, add it to the vector
47 | Ok(line) => lines.push(line),
48 | // If there is an error reading the line, log a warning
49 | Err(e) => warn!("Error reading line: {}", e),
50 | }
51 | }
52 |
53 | Ok(lines)
54 | }
55 |
56 | // Append unique lines to a file, avoiding duplicates
57 | pub async fn append_unique_lines(path: &Path, lines: &[String]) -> Result<()> {
58 | // Read existing lines from the file into a HashSet to avoid duplicates
59 | let mut existing = if path.exists() {
60 | Self::read_lines(path)?
61 | .into_iter()
62 | .collect::>()
63 | } else {
64 | std::collections::HashSet::new()
65 | };
66 |
67 | // Open the file for appending, creating it if it doesn't exist
68 | let mut writer = BufWriter::new(
69 | OpenOptions::new()
70 | .create(true)
71 | .append(true)
72 | .open(path)?
73 | );
74 |
75 | // Iterate over the new lines to be added
76 | for line in lines {
77 | // If the line is not already in the HashSet, add it and write it to the file
78 | if existing.insert(line.clone()) {
79 | if let Err(e) = writeln!(writer, "{}", line) {
80 | warn!("Failed to write line: {}", e);
81 | }
82 | }
83 | }
84 | // Flush the writer to ensure all data is written to the file
85 | if let Err(e) = writer.flush() {
86 | warn!("Failed to flush writer: {}", e);
87 | }
88 |
89 | Ok(())
90 | }
91 |
92 | // Clean up temporary files in a directory with a specific prefix
93 | pub async fn cleanup_temp_files(dir: &Path, prefix: &str) -> Result<()> {
94 | // Read the directory entries
95 | let mut entries = tokio::fs::read_dir(dir).await?;
96 | // Iterate over the directory entries
97 | while let Some(entry) = entries.next_entry().await? {
98 | let path = entry.path();
99 | // Check if the file name starts with the specified prefix
100 | if path.file_name()
101 | .and_then(|n| n.to_str())
102 | .map(|n| n.starts_with(prefix))
103 | .unwrap_or(false)
104 | {
105 | // Remove the file and log a warning if there is an error
106 | if let Err(e) = tokio::fs::remove_file(&path).await {
107 | warn!("Failed to remove temp file {:?}: {}", path, e);
108 | }
109 | }
110 | }
111 | Ok(())
112 | }
113 | }
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // Declare the display module, which handles displaying information to the user
2 | pub mod display;
3 |
4 | // Declare the core module, which contains the core processing logic of the application
5 | pub mod core;
6 |
7 | // Declare the app_state module, which manages the state of the application
8 | pub mod app_state;
9 |
10 | // Declare the progress module, which tracks and displays progress information
11 | pub mod progress;
12 |
13 | // Declare the config module, which handles configuration management
14 | pub mod config;
15 |
16 | // Declare the file_utils module, which provides utility functions for file operations
17 | pub mod file_utils;
18 |
19 | // Declare the logging module, which handles logging of messages and errors
20 | pub mod logging;
21 |
22 | // Declare the processing module, which contains the main processing logic
23 | pub mod processing;
24 |
25 | // Declare the signal_handler module, which handles OS signals and manages application state
26 | pub mod signal_handler;
27 |
28 | // Declare the errors module, which contains custom error types
29 | pub mod errors;
30 |
--------------------------------------------------------------------------------
/src/logging.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result; // Importing Result type from anyhow for error handling
2 | use chrono::Local; // Importing Local from chrono to get the current date and time
3 | use crossterm::style::Stylize; // Importing Stylize from crossterm to style log levels
4 | use log::{Level, LevelFilter, Metadata, Record}; // Importing logging types from the log crate
5 | use std::{
6 | fs::{File, OpenOptions}, // Importing File and OpenOptions for file operations
7 | io::Write, // Importing Write trait for writing to files
8 | path::PathBuf, // Importing PathBuf to handle file paths
9 | sync::Mutex, // Importing Mutex for thread-safe access to files
10 | };
11 |
12 | // Define a struct for the Logger
13 | pub struct Logger {
14 | log_file: Option>, // Optional log file wrapped in a Mutex for thread-safe access
15 | error_file: Option>, // Optional error file wrapped in a Mutex for thread-safe access
16 | level: LevelFilter, // Log level filter to control which log messages are recorded
17 | }
18 |
19 | impl Logger {
20 | // Initialize the logger with optional log and error file paths and a log level
21 | pub fn init(
22 | log_path: Option, // Optional path for the log file
23 | error_path: Option, // Optional path for the error file
24 | level: LevelFilter, // Log level filter
25 | ) -> Result<()> {
26 | // Create the log file if a path is provided
27 | let log_file = log_path.map(|path| {
28 | Mutex::new(
29 | OpenOptions::new()
30 | .create(true) // Create the file if it doesn't exist
31 | .append(true) // Append to the file if it exists
32 | .open(path) // Open the file at the given path
33 | .unwrap(), // Unwrap the result, panicking if there's an error
34 | )
35 | });
36 |
37 | // Create the error file if a path is provided
38 | let error_file = error_path.map(|path| {
39 | Mutex::new(
40 | OpenOptions::new()
41 | .create(true) // Create the file if it doesn't exist
42 | .append(true) // Append to the file if it exists
43 | .open(path) // Open the file at the given path
44 | .unwrap(), // Unwrap the result, panicking if there's an error
45 | )
46 | });
47 |
48 | // Create a new Logger instance
49 | let logger = Logger {
50 | log_file,
51 | error_file,
52 | level,
53 | };
54 |
55 | // Set the global logger to the newly created logger
56 | log::set_boxed_logger(Box::new(logger))?;
57 | // Set the maximum log level
58 | log::set_max_level(level);
59 |
60 | Ok(())
61 | }
62 |
63 | // Format a log record into a string
64 | fn format_log(&self, record: &Record) -> String {
65 | // Style the log level based on its severity
66 | let level_str = match record.level() {
67 | Level::Error => record.level().to_string().red(), // Red for errors
68 | Level::Warn => record.level().to_string().yellow(), // Yellow for warnings
69 | Level::Info => record.level().to_string().green(), // Green for info
70 | Level::Debug => record.level().to_string().blue(), // Blue for debug
71 | Level::Trace => record.level().to_string().magenta(), // Magenta for trace
72 | };
73 |
74 | // Format the log message with the current time, log level, target, and message
75 | format!(
76 | "[{}] {} - {}: {}\n",
77 | Local::now().format("%Y-%m-%d %H:%M:%S"), // Current date and time
78 | level_str, // Styled log level
79 | record.target(), // Target of the log message
80 | record.args() // Log message
81 | )
82 | }
83 | }
84 |
85 | // Implement the Log trait for the Logger struct
86 | impl log::Log for Logger {
87 | // Check if a log message should be logged based on its metadata
88 | fn enabled(&self, metadata: &Metadata) -> bool {
89 | metadata.level() <= self.level // Only log messages at or below the set log level
90 | }
91 |
92 | // Log a message
93 | fn log(&self, record: &Record) {
94 | if self.enabled(record.metadata()) { // Check if the log message should be logged
95 | let formatted = self.format_log(record); // Format the log message
96 |
97 | // Print the log message to the console
98 | print!("{}", formatted);
99 |
100 | // Write the log message to the log file if it exists
101 | if let Some(log_file) = &self.log_file {
102 | if let Ok(mut file) = log_file.lock() { // Lock the file for thread-safe access
103 | let _ = file.write_all(formatted.as_bytes()); // Write the log message to the file
104 | }
105 | }
106 |
107 | // Write error messages to the error file if it exists
108 | if record.level() == Level::Error {
109 | if let Some(error_file) = &self.error_file {
110 | if let Ok(mut file) = error_file.lock() { // Lock the file for thread-safe access
111 | let _ = file.write_all(formatted.as_bytes()); // Write the error message to the file
112 | }
113 | }
114 | }
115 | }
116 | }
117 |
118 | // Flush the log files
119 | fn flush(&self) {
120 | // Flush the log file if it exists
121 | if let Some(log_file) = &self.log_file {
122 | if let Ok(mut file) = log_file.lock() { // Lock the file for thread-safe access
123 | let _ = file.flush(); // Flush the file
124 | }
125 | }
126 | // Flush the error file if it exists
127 | if let Some(error_file) = &self.error_file {
128 | if let Ok(mut file) = error_file.lock() { // Lock the file for thread-safe access
129 | let _ = file.flush(); // Flush the file
130 | }
131 | }
132 | }
133 | }
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result; // Import the Result type from the anyhow crate for error handling
2 | use clap::Parser; // Import the Parser trait from the clap crate for command-line argument parsing
3 | use log::{info, error}; // Import the info macro and error macro from the log crate for logging
4 | use std::sync::Arc; // Import the Arc type from the std::sync crate for shared ownership
5 | use ctrlc; // Import the ctrlc crate for handling Ctrl+C signals
6 |
7 | // Declare the modules used in the application
8 | mod cli; // Module for command-line interface definitions
9 | mod commands; // Module for handling different commands
10 | mod config; // Module for configuration management
11 | mod core; // Module for core processing logic
12 | mod app_state; // Module for application state management
13 | mod progress; // Module for progress tracking
14 | mod signal_handler; // Module for signal handling
15 | mod errors; // Add this line
16 |
17 | // Import specific items from the cli and commands modules
18 | use cli::{Cli, Commands}; // Import the Cli struct and Commands enum from the cli module
19 | use commands::CommandHandler; // Import the CommandHandler struct from the commands module
20 | use crate::core::ProcessingCore;
21 | use crate::app_state::AppState;
22 | use crate::errors::{MergerError, MergerResult};
23 |
24 | // Main asynchronous function
25 | #[tokio::main] // Macro to set up the Tokio runtime
26 | async fn main() -> MergerResult<()> {
27 | // Parse command-line arguments into the Cli struct
28 | let cli = Cli::parse();
29 |
30 | // Initialize the logger with the log level specified in the command-line arguments
31 | env_logger::builder().filter_level(cli.log_level()).init();
32 |
33 | // Match on the command provided in the command-line arguments
34 | match cli.command {
35 | // Handle the "merge" command
36 | Commands::Merge(ref args) => {
37 | CommandHandler::handle_merge(&cli, args.clone()).await?;
38 | }
39 | // Handle the "generate-config" command
40 | Commands::GenerateConfig(args) => {
41 | CommandHandler::handle_generate_config(args).await?;
42 | }
43 | // Handle the "guided-setup" command
44 | Commands::GuidedSetup(args) => {
45 | CommandHandler::handle_guided_setup(args).await?;
46 | }
47 | // Handle the "resume" command
48 | Commands::Resume(args) => {
49 | let state: AppState = AppState::from_resume(args.progress_file).await?;
50 | let state = Arc::new(state);
51 |
52 | // Set up Ctrl+C handler
53 | let state_clone = Arc::clone(&state);
54 | ctrlc::set_handler(move || {
55 | let state = state_clone.clone();
56 | tokio::spawn(async move {
57 | info!("Received Ctrl+C, saving progress...");
58 | if let Err(e) = state.save_progress().await {
59 | error!("Failed to save progress: {}", e);
60 | }
61 | state.request_shutdown().await;
62 | });
63 | })?;
64 |
65 | // Resume merger
66 | let mut core = ProcessingCore::new(state.clone(), true, true).await?;
67 | core.process().await?;
68 | }
69 | }
70 |
71 | Ok(())
72 | }
73 |
--------------------------------------------------------------------------------
/src/processing.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | use tokio::io::{AsyncBufReadExt, BufReader};
3 | use tokio::fs::File;
4 | use log::warn;
5 | use std::path::PathBuf;
6 | use anyhow::Result;
7 | use crate::progress::Progress;
8 |
/// Named phases of a processing run; serializable to/from JSON via serde so
/// the stage can be recorded alongside other persisted state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ProcessingStage {
    /// Initial setup, before any work has started.
    Initializing,
    /// Input validation phase.
    ValidatingFiles,
    /// Per-file processing phase.
    ProcessingFiles,
    /// Combining results into the final output.
    Merging,
    /// Run finished successfully.
    Completed,
    /// Run ended with an error.
    Failed,
}
18 |
19 | pub struct FileProcessor;
20 |
21 | impl FileProcessor {
22 | pub async fn process_file(progress: &mut Progress, file: PathBuf) -> Result<()> {
23 | let file_path = file.clone();
24 |
25 | let file = match File::open(&file).await {
26 | Ok(f) => f,
27 | Err(e) => {
28 | warn!("Failed to open file {:?}: {}", file, e);
29 | return Ok(());
30 | }
31 | };
32 |
33 | let reader = BufReader::new(file);
34 | let mut lines = reader.lines();
35 |
36 | while let Some(line) = lines.next_line().await? {
37 | if !line.is_empty() {
38 | // Process line here if needed
39 | }
40 | }
41 |
42 | progress.add_processed_file(file_path).await?;
43 | Ok(())
44 | }
45 | }
--------------------------------------------------------------------------------
/src/progress.rs:
--------------------------------------------------------------------------------
1 | // Import required dependencies
2 | use anyhow::Result; // For error handling
3 | use serde::{Serialize, Deserialize}; // For JSON serialization/deserialization
4 | use std::path::PathBuf; // For file path handling
5 | use tokio::fs; // For async file operations
6 | use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
7 | use std::time::{Duration, Instant};
8 |
// Metrics tracking structures
/// Running counters for one processing session, anchored to the instant the
/// session began.
pub struct ProcessingMetrics {
    /// When this session started; elapsed time is derived from it.
    start_time: Instant,
    /// Number of files completed so far.
    files_processed: usize,
    /// Total lines handled so far.
    lines_processed: usize,
    /// Number of errors recorded so far.
    errors_count: usize,
}

impl ProcessingMetrics {
    /// Start a fresh session with all counters at zero.
    pub fn new() -> Self {
        Self {
            start_time: Instant::now(),
            files_processed: 0,
            lines_processed: 0,
            errors_count: 0,
        }
    }

    /// Record one completed file.
    pub fn increment_files(&mut self) {
        self.files_processed += 1;
    }

    /// Record `count` additional processed lines.
    pub fn add_lines(&mut self, count: usize) {
        self.lines_processed += count;
    }

    /// Record one error.
    ///
    /// Added because `errors_count` previously had no mutator anywhere, so
    /// the reported error count could never advance past zero.
    pub fn increment_errors(&mut self) {
        self.errors_count += 1;
    }

    /// Snapshot the current counters.
    ///
    /// `memory_usage` is not measured and is always reported as 0.
    pub fn get_summary(&self) -> ProcessingSummary {
        ProcessingSummary {
            elapsed_time: self.start_time.elapsed(),
            files_processed: self.files_processed,
            lines_processed: self.lines_processed,
            errors_count: self.errors_count,
            memory_usage: 0,
        }
    }
}

impl Default for ProcessingMetrics {
    /// Equivalent to [`ProcessingMetrics::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Point-in-time snapshot produced by [`ProcessingMetrics::get_summary`].
pub struct ProcessingSummary {
    pub elapsed_time: Duration,
    pub files_processed: usize,
    pub lines_processed: usize,
    pub errors_count: usize,
    /// Placeholder; always 0 (memory tracking is not implemented).
    pub memory_usage: usize,
}
53 |
54 | // Progress tracking structure that can be serialized to/from JSON
55 | #[derive(Debug, Serialize, Deserialize)]
56 | pub struct Progress {
57 | pub input_file: PathBuf, // Source file containing list of files to process
58 | pub output_file: PathBuf, // Destination file for merged content
59 | pub threads: usize, // Number of parallel processing threads
60 | pub processed_files: Vec, // List of successfully processed files
61 | pub current_position: usize, // Current processing position for resume capability
62 | pub save_path: Option, // Path where progress state is saved
63 | }
64 |
65 | // Implement Default trait for Progress
66 | impl Default for Progress {
67 | fn default() -> Self {
68 | Self {
69 | input_file: PathBuf::new(),
70 | output_file: PathBuf::new(),
71 | threads: 10, // Default to 10 threads
72 | processed_files: Vec::new(),
73 | current_position: 0,
74 | save_path: None,
75 | }
76 | }
77 | }
78 |
79 | impl Progress {
80 | // Save current progress state to JSON file
81 | pub async fn save(&self) -> Result<()> {
82 | if let Some(path) = &self.save_path {
83 | // Convert progress state to pretty-printed JSON
84 | let content = serde_json::to_string_pretty(&self)?;
85 | // Write to file asynchronously
86 | fs::write(path, content).await?;
87 | }
88 | Ok(())
89 | }
90 |
91 | // Load progress state from a JSON file
92 | pub async fn load(path: &PathBuf) -> Result {
93 | // Read file content asynchronously
94 | let content = fs::read_to_string(path).await?;
95 | // Parse JSON into Progress struct
96 | let mut progress: Progress = serde_json::from_str(&content)?;
97 | // Store save path for future updates
98 | progress.save_path = Some(path.clone());
99 | Ok(progress)
100 | }
101 |
102 | // Add a processed file to the progress tracking
103 | #[allow(dead_code)] // Suppress unused function warning
104 | pub async fn add_processed_file(&mut self, file: PathBuf) -> Result<()> {
105 | // Add file to processed list
106 | self.processed_files.push(file);
107 | // Increment position counter
108 | self.current_position += 1;
109 | // Save updated progress state
110 | self.save().await
111 | }
112 | }
113 |
// Two-bar console progress display (overall files + dedup lines) backed by
// the shared ProcessingMetrics counters.
pub struct ProgressTracker {
    // Groups the two bars so they render together.
    multi_progress: MultiProgress,
    // Per-file progress across the whole run.
    overall_progress: ProgressBar,
    // Line-level progress for the deduplication phase.
    dedup_progress: ProgressBar,
    // Counters backing the status messages.
    metrics: ProcessingMetrics,
    // NOTE(review): not read anywhere in the visible impl — presumably an
    // intended UI refresh interval; confirm before removing.
    refresh_rate: Duration,
}
121 |
122 | impl ProgressTracker {
123 | pub fn new(total_files: usize, estimated_lines: usize) -> Self {
124 | let multi = MultiProgress::new();
125 |
126 | // Overall progress bar style
127 | let overall_style = ProgressStyle::default_bar()
128 | .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}")
129 | .unwrap()
130 | .progress_chars("#>-");
131 |
132 | // Deduplication progress bar style
133 | let dedup_style = ProgressStyle::default_bar()
134 | .template("{spinner:.yellow} [{elapsed_precise}] [{bar:40.yellow/blue}] {pos}/{len} lines | {msg}")
135 | .unwrap()
136 | .progress_chars("#>-");
137 |
138 | let overall_pb = multi.add(ProgressBar::new(total_files as u64));
139 | overall_pb.set_style(overall_style);
140 |
141 | let dedup_pb = multi.add(ProgressBar::new(estimated_lines as u64));
142 | dedup_pb.set_style(dedup_style);
143 |
144 | Self {
145 | multi_progress: multi,
146 | overall_progress: overall_pb,
147 | dedup_progress: dedup_pb,
148 | metrics: ProcessingMetrics::new(),
149 | refresh_rate: Duration::from_millis(100),
150 | }
151 | }
152 |
153 | pub fn update_overall_progress(&mut self, files_processed: usize) {
154 | self.metrics.increment_files();
155 | let summary = self.metrics.get_summary();
156 |
157 | self.overall_progress.set_position(files_processed as u64);
158 | self.overall_progress.set_message(format!(
159 | "Speed: {:.2} files/s | Memory: {:.2} MB | Errors: {}",
160 | files_processed as f64 / summary.elapsed_time.as_secs_f64(),
161 | summary.memory_usage as f64 / 1_048_576.0, // Convert bytes to MB
162 | summary.errors_count
163 | ));
164 | }
165 |
166 | pub fn update_dedup_progress(&mut self, lines_processed: usize, total_lines: usize) {
167 | self.metrics.add_lines(lines_processed);
168 | let summary = self.metrics.get_summary();
169 |
170 | self.dedup_progress.set_length(total_lines as u64);
171 | self.dedup_progress.set_position(lines_processed as u64);
172 | self.dedup_progress.set_message(format!(
173 | "Speed: {:.2} lines/s | Unique lines: {}",
174 | summary.lines_processed as f64 / summary.elapsed_time.as_secs_f64(),
175 | lines_processed
176 | ));
177 | }
178 |
179 | pub fn finish(&self) {
180 | let summary = self.metrics.get_summary();
181 | self.overall_progress.finish_with_message(format!(
182 | "Completed in {}s | Files: {} | Lines: {} | Errors: {}",
183 | summary.elapsed_time.as_secs(),
184 | summary.files_processed,
185 | summary.lines_processed,
186 | summary.errors_count
187 | ));
188 | self.dedup_progress.finish();
189 | }
190 |
191 | pub fn get_metrics(&self) -> &ProcessingMetrics {
192 | &self.metrics
193 | }
194 | }
--------------------------------------------------------------------------------
/src/signal_handler.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc; // Importing Arc for thread-safe reference counting
2 | use tokio::sync::broadcast; // Importing broadcast channel from tokio for sending shutdown signals
3 | use anyhow::Result; // Importing Result type from anyhow for error handling
4 | use log::{info, error}; // Importing logging macros for info and error messages
5 | use crate::app_state::AppState; // Importing the AppState struct from the app_state module
6 |
7 | // Struct to handle OS signals and manage application state
8 | pub struct SignalHandler {
9 | app_state: Arc, // Shared and mutable application state
10 | shutdown_tx: broadcast::Sender<()>, // Broadcast channel sender for shutdown signals
11 | }
12 |
13 | impl SignalHandler {
14 | // Function to create a new instance of SignalHandler
15 | pub fn new(app_state: Arc) -> Result {
16 | // Create a new broadcast channel with a buffer size of 1
17 | let (shutdown_tx, _) = broadcast::channel(1);
18 |
19 | // Return a new SignalHandler instance with the provided app_state and broadcast channel
20 | Ok(Self {
21 | app_state,
22 | shutdown_tx,
23 | })
24 | }
25 |
26 | // Function to subscribe to the shutdown broadcast channel
27 | #[allow(dead_code)]
28 | pub fn subscribe(&self) -> broadcast::Receiver<()> {
29 | // Return a new receiver for the broadcast channel
30 | self.shutdown_tx.subscribe()
31 | }
32 |
33 | // Function to set up signal handlers
34 | pub fn setup_handlers(&self) -> Result<()> {
35 | // Clone the broadcast channel sender for use in the signal handler
36 | let shutdown_tx = self.shutdown_tx.clone();
37 | // Clone the app_state for use in the signal handler
38 | let app_state = self.app_state.clone();
39 |
40 | // Set up a handler for the Ctrl+C signal
41 | ctrlc::set_handler(move || {
42 | // Log that an interrupt signal was received
43 | info!("Received interrupt signal, initiating graceful shutdown");
44 |
45 | // Clone app_state and shutdown_tx again before moving into async block
46 | let app_state = app_state.clone();
47 | let shutdown_tx = shutdown_tx.clone();
48 |
49 | tokio::spawn(async move {
50 | // Attempt to save the progress
51 | if let Err(e) = app_state.save_progress().await {
52 | error!("Failed to save progress: {}", e);
53 | }
54 |
55 | // Attempt to send the shutdown signal
56 | if let Err(e) = shutdown_tx.send(()) {
57 | error!("Failed to send shutdown signal: {}", e);
58 | }
59 | });
60 | })?;
61 |
62 | Ok(())
63 | }
64 | }
--------------------------------------------------------------------------------