├── LICENSE ├── Methylotl.py ├── README.md ├── environment.yml ├── img └── logo.jpg ├── index_genome.py ├── install.sh ├── process_reads.py └── test_data ├── sequence.fasta ├── test_100bp_0.5err_1.fastq └── test_100bp_0.5err_2.fastq /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | including for purposes of Section 3(b); and 307 | 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public 411 | licenses. Notwithstanding, Creative Commons may elect to apply one of 412 | its public licenses to material it publishes and in those instances 413 | will be considered the “Licensor.” The text of the Creative Commons 414 | public licenses is dedicated to the public domain under the CC0 Public 415 | Domain Dedication. Except for the limited purpose of indicating that 416 | material is shared under a Creative Commons public license or as 417 | otherwise permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. For 424 | the avoidance of doubt, this paragraph does not form part of the 425 | public licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. 428 | -------------------------------------------------------------------------------- /Methylotl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | 4 | 5 | def index_genome(genome: str) -> None: 6 | subprocess.run( 7 | [ 8 | "python", 9 | "index_genome.py", 10 | "--genome", 11 | genome, 12 | ] 13 | ) 14 | 15 | 16 | def process_reads( 17 | threads: int, input_dir: str, output_dir: str, ref: str, logs: str, q: int 18 | ) -> None: 19 | subprocess.run( 20 | [ 21 | "python", 22 | "process_reads.py", 23 | "--threads", 24 | str(threads), 25 | "--input_dir", 26 | input_dir, 27 | "--output_dir", 28 | output_dir, 29 | "--ref", 30 | ref, 31 | "--logs", 32 | logs, 33 | "--q", 34 | str(q), 35 | ] 36 | ) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser( 41 | description="Main script for indexing genome or processing reads" 42 | ) 43 | 44 | subparsers = parser.add_subparsers( 45 | dest="action", required=True, help="Action to perform: index or process" 46 | ) 47 | 48 | # Subparser for indexing genome 49 | index_parser = subparsers.add_parser("index", help="Index the genome") 50 | index_parser.add_argument( 51 | "--genome", required=True, type=str, help="Path to the reference genome file" 52 | ) 53 | 54 | # Subparser for processing reads 55 | process_parser = subparsers.add_parser("process", help="Process reads") 56 | process_parser.add_argument( 57 | "--threads", type=int, default=8, help="Number of threads" 58 | ) 59 | process_parser.add_argument( 60 | "--input_dir", type=str, required=True, help="Abs path to input folder" 61 | ) 62 | process_parser.add_argument( 63 | "--output_dir", type=str, required=True, help="Abs path to output folder" 64 | ) 65 | process_parser.add_argument( 66 | "--ref", type=str, required=True, help="Abs path to genome file" 67 | ) 68 | process_parser.add_argument( 69 | "--logs", type=str, required=True, help="Abs path to logs folder" 70 | ) 71 | process_parser.add_argument( 72 | "--q", type=int, default=20, help="Quality score for trim_galore" 73 | ) 74 | 75 | args = parser.parse_args() 76 | 77 | if args.action == "index": 78 | index_genome(genome=args.genome) 79 | elif args.action == "process": 80 | process_reads( 81 | threads=args.threads, 82 | input_dir=args.input_dir, 83 | output_dir=args.output_dir, 84 | ref=args.ref, 85 | logs=args.logs, 86 | q=args.q, 87 | ) 88 | 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
Methylotl
2 | 3 | # Methylotl 4 | 5 | 6 | 7 | Methylotl is a Python package designed to process raw bisulfite sequencing data. It performs essential steps to convert, align, and analyze bisulfite-treated DNA sequences, providing insights into DNA methylation patterns. 8 | 9 | ## Features 10 | 11 | - Aligns bisulfite-treated reads to a reference genome 12 | - Analyzes methylation patterns 13 | 14 | ## Requirements 15 | 16 | - Python 3.10 or higher 17 | 18 | - Required Python packages: `pysam` 19 | 20 | - External tools: `fastqc`, `trimgalore`, `bsmapz`, `samtools`, `methyldackel` 21 | 22 | **Note:** This tool has been tested and is confirmed to work correctly on Debian-like distributions. 23 | 24 | ## Installation 25 | 26 | 1. Clone the repository: 27 | 28 | ```{bash} 29 | git clone https://github.com/yourusername/Methylotl.git 30 | cd Methylotl 31 | ``` 32 | 33 | 2. Run the install.sh script to create a conda environment and install the necessary tools: 34 | 35 | ```{bash} 36 | chmod +x install.sh 37 | ./install.sh 38 | ``` 39 | 40 | ## Usage 41 | 42 | ### Basic Usage 43 | 44 | To index the genome, run the following command: 45 | 46 | ```{bash} 47 | python Methylotl.py index --genome /path/to/genome.fa 48 | ``` 49 | To process reads with default settings, use the following command: 50 | 51 | ```{bash} 52 | python Methylotl.py process --input_dir /path/to/input --output_dir /path/to/output --ref /path/to/genome.fa --logs /path/to/logs --q 20 53 | ``` 54 | 55 | ### Command-Line Arguments 56 | 57 | * `index`: Index the genome. 58 | - `--genome`: Path to the reference genome file. 59 | * `process`: Process reads. 60 | - `--threads`: Number of threads to use (default: 8). 61 | - `--input_dir`: Absolute path to the input folder. 62 | - `--output_dir`: Absolute path to the output folder. 63 | - `--ref`: Absolute path to the genome file. 64 | - `--logs`: Absolute path to the logs folder. 65 | - `--q`: Quality score for trim_galore (default: 20). 66 | 67 | ## Output 68 | 69 | Methylotl generates the following output files: 70 | 71 | - FastQC reports: Before and after trimming 72 | - Trimmed FASTQ files 73 | - Unsorted BAM files 74 | - Sorted BAM files 75 | - Deduplicated BAM files 76 | - BedGraph report 77 | - Methylation bias (MBias) plot for each strand (in SVG format) 78 | - Log files for each step of the process 79 | 80 | ## Contributing 81 | 82 | Contributions are welcome! Please fork the repository and submit pull requests with detailed descriptions of your changes. 83 | 84 | ## License 85 | 86 | This project is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. See the `LICENSE` file for more details. 87 | 88 | ## Contact 89 | 90 | For questions or comments, please contact us ttnlsc@gmail.com , stacy.petukhova@gmail.com. 91 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: methylotl_env 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | dependencies: 6 | - python=3.10 7 | - fastqc 8 | - trim-galore 9 | - samtools 10 | - pysam 11 | - methyldackel 12 | -------------------------------------------------------------------------------- /img/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ComputationalAgingLab/Methylotl/327ada4ef561dfe59dcee32430ce8ecbfdb5f3cd/img/logo.jpg -------------------------------------------------------------------------------- /index_genome.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import os 4 | 5 | def index_genome(genome_file: str) -> None: 6 | """ 7 | Indexes the genome file using samtools faidx. 8 | Args: 9 | genome_file (str): Path to the genome file. 10 | Returns: 11 | None. Prints a message indicating the completion of indexing. 12 | Raises: 13 | FileNotFoundError: If the specified genome file is not found. 14 | """ 15 | index_file = genome_file + ".fai" 16 | subprocess.run(["samtools", "faidx", genome_file]) 17 | print(f"Genome indexed. Index file saved as: {index_file}") 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser(description="Genome indexing script using samtools faidx") 21 | parser.add_argument("--genome", required=True, help="Path to the genome file") 22 | args = parser.parse_args() 23 | 24 | genome_file = args.genome 25 | 26 | if not os.path.isfile(genome_file): 27 | raise FileNotFoundError("Genome file not found.") 28 | else: 29 | index_genome(genome_file) 30 | 31 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit immediately if a command exits with a non-zero status 4 | set -e 5 | 6 | # Define the environment file 7 | ENV_FILE="environment.yml" 8 | 9 | # Create the Conda environment from the YAML file 10 | echo "Creating Conda environment from ${ENV_FILE}..." 11 | conda env create -f $ENV_FILE 12 | 13 | # Activate the newly created environment 14 | echo "Activating the Conda environment..." 15 | source $(conda info --base)/etc/profile.d/conda.sh 16 | conda activate methylotl_env 17 | 18 | # Clone the BSMAPz repository 19 | echo "Cloning BSMAPz repository..." 20 | git clone https://github.com/zyndagj/BSMAPz.git 21 | 22 | # Navigate into the BSMAPz directory 23 | cd BSMAPz 24 | 25 | # Build BSMAPz 26 | echo "Building BSMAPz..." 27 | make bsmapz 28 | 29 | # Add BSMAPz to the PATH within the environment 30 | echo "Adding BSMAPz to PATH within the environment..." 31 | export PATH=$PATH:$(pwd) 32 | 33 | # Confirm installation 34 | echo "BSMAPz installed successfully. Current PATH:" 35 | echo $PATH 36 | 37 | # Stay in the Conda environment 38 | echo "Staying in the Conda environment: methylotl_env" 39 | $SHELL 40 | 41 | -------------------------------------------------------------------------------- /process_reads.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | import argparse 5 | import sys 6 | import shutil 7 | from typing import List, Tuple, Optional 8 | 9 | 10 | def get_file_extension(folder: str) -> Tuple[str, str]: 11 | """ 12 | Determines the platform-specific output file extensions from files in the given folder. 13 | Args: 14 | folder (str): The directory to scan for files. 15 | Returns: 16 | Tuple[str, str]: A tuple containing two file extensions. 17 | Raises: 18 | ValueError: If no valid file extensions are found. 19 | """ 20 | platform_output1 = None 21 | platform_output2 = None 22 | 23 | valid_extensions = ( 24 | "_1.fastq", "_1.fastq.gz", 25 | "_1.fq.gz", "_1.fq", 26 | "_R1.fq.gz", "_R1.fq", 27 | "_R1.fastq.gz", "_R1.fastq", 28 | "_R1_001.fq.gz", "_R1_001.fq", 29 | "_R1_001.fastq.gz", "_R1_001.fastq" 30 | ) 31 | 32 | for filename in os.listdir(folder): 33 | for extension in valid_extensions: 34 | if filename.endswith(extension): 35 | platform_output1 = extension 36 | platform_output2 = platform_output1.replace("1", "2", 1) 37 | 38 | if platform_output1 and platform_output2: 39 | break 40 | 41 | if platform_output1 and platform_output2: 42 | return platform_output1, platform_output2 43 | else: 44 | raise ValueError("Wrong file format") 45 | 46 | 47 | def process_filename(filename: str, platform_output: str) -> Tuple[str, Optional[str]]: 48 | """ 49 | Extracts the sample name and line from the given filename. 50 | Args: 51 | filename (str): The filename to process. 52 | platform_output (str): The platform-specific file extension. 53 | Returns: 54 | Tuple[str, Optional[str]]: A tuple containing the sample name and the line (if found). 55 | """ 56 | sample_name = filename.replace(platform_output, "") 57 | match = re.search(r'(L00[1-4])', sample_name) 58 | 59 | if match: 60 | line = match.group(1) 61 | sample_name = sample_name.split(line)[0] 62 | else: 63 | line = None 64 | 65 | return sample_name, line 66 | 67 | 68 | def process_oneline_files(files: List[str], platform_output1: str, platform_output2: str, logpath: str, input_dir: str, 69 | output_dir: str, args) -> None: 70 | """ 71 | Processes files that do not have corresponding lines. 72 | Args: 73 | files (List[str]): List of file names to process. 74 | platform_output1 (str): First file output pattern. 75 | platform_output2 (str): Second file output pattern. 76 | logpath (str): Path to the log file. 77 | input_dir (str): Directory containing input files. 78 | output_dir (str): Directory to store output files. 79 | """ 80 | r1 = None 81 | r2 = None 82 | name = None 83 | 84 | for file in files: 85 | if file.endswith(platform_output1): 86 | name, line = process_filename(file, platform_output1) 87 | if line: 88 | r1 = f"{name}{line}{platform_output1}" 89 | else: 90 | r1 = f"{name}{platform_output1}" 91 | elif file.endswith(platform_output2): 92 | name, line = process_filename(file, platform_output2) 93 | if line: 94 | r2 = f"{name}{line}{platform_output2}" 95 | else: 96 | r2 = f"{name}{platform_output2}" 97 | 98 | # Perform quality control 99 | print("QC...") 100 | qc_dir = os.path.join(output_dir, "FastQC") 101 | os.makedirs(qc_dir, exist_ok=True) 102 | qc_command = [ 103 | "fastqc", 104 | "-o", qc_dir, 105 | "-t", str(args.threads), 106 | os.path.join(input_dir, r1), 107 | os.path.join(input_dir, r2) 108 | ] 109 | if platform_output1.endswith(".gz"): 110 | qc_command.insert(1, "--noextract") 111 | 112 | with open(logpath, "w") as logfile: 113 | subprocess.run(qc_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 114 | 115 | # Adapter trimming 116 | print("Adapter trimming...") 117 | trimgalore_dir = os.path.join(output_dir, "trim-galore") 118 | os.makedirs(trimgalore_dir, exist_ok=True) 119 | trim_command = [ 120 | "trim_galore", 121 | "-q", str(args.q), 122 | "--fastqc", 123 | "-o", trimgalore_dir, 124 | "-j", str(args.threads), 125 | "--paired", 126 | "--gzip", 127 | os.path.join(input_dir, r1), 128 | os.path.join(input_dir, r2), 129 | ] 130 | with open(logpath, "a") as logfile: 131 | subprocess.run(trim_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 132 | 133 | # Aligning 134 | print("Aligning...") 135 | aligning_dir = os.path.join(output_dir, "aligning") 136 | os.makedirs(aligning_dir, exist_ok=True) 137 | align_command = [ 138 | "bsmapz", 139 | "-a", os.path.join(trimgalore_dir, f"{name}_1_val_1.fq.gz"), 140 | "-b", os.path.join(trimgalore_dir, f"{name}_2_val_2.fq.gz"), 141 | "-d", args.ref, 142 | "-o", os.path.join(aligning_dir, f"{name}.bam"), 143 | "-p", str(args.threads), 144 | ] 145 | with open(logpath, "a") as logfile: 146 | subprocess.run(align_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 147 | 148 | # Fixing mates 149 | fixmate_command = [ 150 | "samtools", "fixmate", "-m", 151 | os.path.join(aligning_dir, f"{name}.bam"), 152 | os.path.join(aligning_dir, f"{name}.fixmate.bam"), 153 | ] 154 | with open(logpath, "a") as logfile: 155 | subprocess.run(fixmate_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 156 | 157 | # Sorting BAM files 158 | sort_command = [ 159 | "samtools", "sort", "-@", str(args.threads), 160 | "-o", os.path.join(aligning_dir, f"{name}.sorted.bam"), 161 | os.path.join(aligning_dir, f"{name}.fixmate.bam"), 162 | ] 163 | with open(logpath, "a") as logfile: 164 | subprocess.run(sort_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 165 | 166 | # Deduplicating BAM files 167 | print("Deduplicating...") 168 | deduplicate_command = [ 169 | "samtools", "markdup", 170 | os.path.join(aligning_dir, f"{name}.sorted.bam"), 171 | os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam"), 172 | ] 173 | with open(logpath, "a") as logfile: 174 | subprocess.run(deduplicate_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 175 | 176 | # Collect stats 177 | collect_stats_command = [ 178 | "samtools", "flagstat", os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam") 179 | ] 180 | with open(logpath, "a") as logfile: 181 | subprocess.run(collect_stats_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 182 | 183 | # Methylation extraction 184 | print("Methylation extraction...") 185 | extraction_dir = os.path.join(output_dir, "methyl_extraction") 186 | os.makedirs(extraction_dir, exist_ok=True) 187 | extraction_command = [ 188 | "MethylDackel", "extract", "--fraction", "--minDepth", "10", 189 | args.ref, 190 | os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam"), 191 | ] 192 | with open(logpath, "a") as logfile: 193 | subprocess.run(extraction_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 194 | bedgraph_file = f"{name}.sorted.deduplicated_CpG.meth.bedGraph" 195 | extracted_bedgraph = os.path.join(aligning_dir, bedgraph_file) 196 | shutil.move(extracted_bedgraph, os.path.join(extraction_dir, bedgraph_file)) 197 | 198 | # M-bias plotting 199 | mbias_command = [ 200 | "MethylDackel", "mbias", 201 | args.ref, 202 | os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam"), 203 | f"{name}" 204 | ] 205 | with open(logpath, "a") as logfile: 206 | subprocess.run(mbias_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 207 | svg_files = [f"{name}_OB.svg", f"{name}_OT.svg"] 208 | for svg_file in svg_files: 209 | shutil.move(svg_file, os.path.join(extraction_dir, svg_file)) 210 | 211 | 212 | def process_multiline_files(files: List[str], platform_output1: str, platform_output2: str, logpath: str, 213 | input_dir: str, output_dir: str, args) -> None: 214 | """ 215 | Processes files where each file has a corresponding line value. 216 | 217 | Args: 218 | files (List[str]): List of file names to process. 219 | platform_output1 (str): First file output pattern. 220 | platform_output2 (str): Second file output pattern. 221 | logpath (str): Path to the log file. 222 | input_dir (str): Directory containing input files. 223 | output_dir (str): Directory to store output files. 224 | """ 225 | name = None 226 | lines = [] 227 | bams = [] 228 | 229 | # Extract lines and names from files 230 | for file in files: 231 | if file.endswith(platform_output1): 232 | name, line = process_filename(file, platform_output1) 233 | lines.append(line) 234 | 235 | # Process each line 236 | for line in lines: 237 | r1 = f"{name}{line}{platform_output1}" 238 | r2 = f"{name}{line}{platform_output2}" 239 | 240 | # Perform quality control 241 | print("QC...") 242 | qc_dir = os.path.join(output_dir, "FastQC") 243 | os.makedirs(qc_dir, exist_ok=True) 244 | qc_command = [ 245 | "fastqc", 246 | "-o", qc_dir, 247 | "-t", str(args.threads), 248 | os.path.join(input_dir, r1), 249 | os.path.join(input_dir, r2) 250 | ] 251 | if platform_output1.endswith(".gz"): 252 | qc_command.insert(1, "--noextract") 253 | 254 | with open(logpath, "w") as logfile: 255 | subprocess.run(qc_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 256 | 257 | # Adapter trimming 258 | print("Adapter trimming...") 259 | trimgalore_dir = os.path.join(output_dir, "trim-galore") 260 | os.makedirs(trimgalore_dir, exist_ok=True) 261 | trim_command = [ 262 | "trim_galore", 263 | "-q", str(args.q), 264 | "--fastqc", 265 | "-o", trimgalore_dir, 266 | "-j", str(args.threads), 267 | "--paired", 268 | "--gzip", 269 | os.path.join(input_dir, r1), 270 | os.path.join(input_dir, r2), 271 | ] 272 | with open(logpath, "a") as logfile: 273 | subprocess.run(trim_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 274 | 275 | # Aligning 276 | print("Aligning...") 277 | aligning_dir = os.path.join(output_dir, "aligning") 278 | os.makedirs(aligning_dir, exist_ok=True) 279 | align_command = [ 280 | "bsmapz", 281 | "-a", os.path.join(trimgalore_dir, f"{name}{line}_1_val_1.fq.gz"), 282 | "-b", os.path.join(trimgalore_dir, f"{name}{line}_2_val_2.fq.gz"), 283 | "-d", args.ref, 284 | "-o", os.path.join(aligning_dir, f"{name}{line}.bam"), 285 | "-p", str(args.threads), 286 | ] 287 | with open(logpath, "a") as logfile: 288 | subprocess.run(align_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 289 | 290 | # Fixing mates 291 | fixmate_command = [ 292 | "samtools", "fixmate", "-m", 293 | os.path.join(aligning_dir, f"{name}{line}.bam"), 294 | os.path.join(aligning_dir, f"{name}{line}.fixmate.bam"), 295 | ] 296 | with open(logpath, "a") as logfile: 297 | subprocess.run(fixmate_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 298 | 299 | # Sorting BAM files 300 | sort_command = [ 301 | "samtools", "sort", "-@", str(args.threads), 302 | "-o", os.path.join(aligning_dir, f"{name}{line}.sorted.bam"), 303 | os.path.join(aligning_dir, f"{name}{line}.fixmate.bam"), 304 | ] 305 | with open(logpath, "a") as logfile: 306 | subprocess.run(sort_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 307 | 308 | # Deduplicating BAM files 309 | print("Deduplicating...") 310 | deduplicate_command = [ 311 | "samtools", "markdup", 312 | os.path.join(aligning_dir, f"{name}{line}.sorted.bam"), 313 | os.path.join(aligning_dir, f"{name}{line}.sorted.deduplicated.bam"), 314 | ] 315 | with open(logpath, "a") as logfile: 316 | subprocess.run(deduplicate_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 317 | 318 | # Collect BAM files for merging 319 | bams.append(os.path.join(aligning_dir, f"{name}{line}.sorted.deduplicated.bam")) 320 | 321 | # Merging BAM files 322 | merge_bams_command = [ 323 | "samtools", "merge", "-@", str(args.threads), 324 | os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam"), *bams 325 | ] 326 | with open(logpath, "a") as logfile: 327 | subprocess.run(merge_bams_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 328 | 329 | #Collect stats 330 | collect_stats_command = [ 331 | "samtools", "flagstat", os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam") 332 | ] 333 | with open(logpath, "a") as logfile: 334 | subprocess.run(collect_stats_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 335 | 336 | # Methylation extraction 337 | print("Methylation extraction...") 338 | extraction_dir = os.path.join(output_dir, "methyl_extraction") 339 | os.makedirs(extraction_dir, exist_ok=True) 340 | extraction_command = [ 341 | "MethylDackel", "extract", "--fraction", "--minDepth", "10", 342 | args.ref, 343 | os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam"), 344 | ] 345 | with open(logpath, "a") as logfile: 346 | subprocess.run(extraction_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 347 | bedgraph_file = f"{name}.sorted.deduplicated_CpG.meth.bedGraph" 348 | extracted_bedgraph = os.path.join(aligning_dir, bedgraph_file) 349 | shutil.move(extracted_bedgraph, os.path.join(extraction_dir, bedgraph_file)) 350 | 351 | # M-bias plotting 352 | mbias_command = [ 353 | "MethylDackel", "mbias", 354 | args.ref, 355 | os.path.join(aligning_dir, f"{name}.sorted.deduplicated.bam"), 356 | f"{name}" 357 | ] 358 | with open(logpath, "a") as logfile: 359 | subprocess.run(mbias_command, stdout=logfile, stderr=subprocess.STDOUT, text=True) 360 | svg_files = [f"{name}_OB.svg", f"{name}_OT.svg"] 361 | for svg_file in svg_files: 362 | shutil.move(svg_file, os.path.join(extraction_dir, svg_file)) 363 | 364 | 365 | def main(args): 366 | print("CPU count:", args.threads) 367 | 368 | input_dir = args.input_dir 369 | 370 | log_dir = args.logs 371 | os.makedirs(log_dir, exist_ok=True) 372 | 373 | output_dir = args.output_dir 374 | os.makedirs(output_dir, exist_ok=True) 375 | 376 | if not os.listdir(input_dir): 377 | print("Empty input directory") 378 | sys.exit(1) 379 | else: 380 | files_grouped = {} 381 | platform_output1, platform_output2 = get_file_extension(input_dir) 382 | for infile in os.listdir(input_dir): 383 | if infile.endswith(platform_output1): 384 | name, line = process_filename(infile, platform_output1) 385 | if name not in files_grouped: 386 | files_grouped[name] = [] 387 | files_grouped[name].append(infile) 388 | elif infile.endswith(platform_output2): 389 | name, line = process_filename(infile, platform_output2) 390 | if name not in files_grouped: 391 | files_grouped[name] = [] 392 | files_grouped[name].append(infile) 393 | 394 | for name, files in files_grouped.items(): 395 | logpath = os.path.join(log_dir, f"{name}.log") 396 | if len(files) == 2: 397 | process_oneline_files(files, platform_output1, platform_output2, logpath, input_dir, output_dir, args) 398 | elif len(files) > 2: 399 | if len(files) % 2 == 0: 400 | process_multiline_files(files, platform_output1, platform_output2, logpath, input_dir, 401 | output_dir, args) 402 | else: 403 | raise ValueError(f"Number of files for '{name}' does not match paired reads") 404 | 405 | 406 | if __name__ == "__main__": 407 | parser = argparse.ArgumentParser(description="Run pipeline") 408 | parser.add_argument("--threads", type=int, default=8, help="Number of threads") 409 | parser.add_argument("--input_dir", type=str, required=True, help="Abs path to input folder") 410 | parser.add_argument("--output_dir", type=str, required=True, help="Abs path to output folder") 411 | parser.add_argument("--ref", type=str, required=True, help="Abs path to genome filer") 412 | parser.add_argument("--logs", type=str, required=True, help="Abs path to logs folder") 413 | parser.add_argument("--q", type=int, default=20, help="Quality score for trim_galore") 414 | args = parser.parse_args() 415 | 416 | main(args) 417 | 418 | --------------------------------------------------------------------------------