├── .gitmodules ├── hapog ├── __init__.py ├── mapping.py ├── cli.py └── pipeline.py ├── requirements.txt ├── setup.cfg ├── .gitignore ├── hapog.py ├── conda_files ├── build.sh ├── cross-linux.cmake └── meta.yaml ├── src ├── find_htslib.cmake ├── CMakeLists.txt ├── hash.h ├── polished.h ├── hash.c ├── alipile.h ├── polished.c ├── polish_consensus.c ├── LibFindMacros.cmake └── alipile.c ├── setup.py ├── README.md └── LICENSE.md /.gitmodules: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hapog/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | biopython 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = "1.3.8" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | package_build 4 | hapog.egg-info 5 | -------------------------------------------------------------------------------- /hapog.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from hapog import cli 3 | from hapog import mapping 4 | from hapog import pipeline 5 | 6 | import argparse 7 | import os 8 | import sys 9 | import time 10 | 11 | 12 | if __name__ == "__main__": 13 | cli.main() -------------------------------------------------------------------------------- /conda_files/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export C_INCLUDE_PATH=${PREFIX}/include 4 | export LIBRARY_PATH=${PREFIX}/lib 5 | 6 | python setup.py install --single-version-externally-managed --record=record.txt 7 | 8 | mkdir hapog_build && cd hapog_build 9 | export HTSLIB_ROOT=${LIBRARY_PATH} 10 | CMAKE_PLATFORM_FLAGS+=(-DCMAKE_TOOLCHAIN_FILE="${RECIPE_DIR}/cross-linux.cmake") 11 | cmake -DCMAKE_INSTALL_PREFIX=${PREFIX} \ 12 | ${CMAKE_PLATFORM_FLAGS[@]} \ 13 | ../src 14 | make 15 | cd .. 16 | cp -r hapog_build/hapog ${PREFIX}/bin/hapog_bin 17 | -------------------------------------------------------------------------------- /conda_files/cross-linux.cmake: -------------------------------------------------------------------------------- 1 | # this one is important 2 | set(CMAKE_SYSTEM_NAME Linux) 3 | set(CMAKE_PLATFORM Linux) 4 | #this one not so much 5 | set(CMAKE_SYSTEM_VERSION 1) 6 | 7 | # specify the cross compiler 8 | set(CMAKE_C_COMPILER $ENV{CC}) 9 | 10 | # where is the target environment 11 | set(CMAKE_FIND_ROOT_PATH $ENV{PREFIX} $ENV{BUILD_PREFIX}/$ENV{HOST}/sysroot) 12 | 13 | # search for programs in the build host directories 14 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 15 | # for libraries and headers in the target directories 16 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 17 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 18 | 19 | # god-awful hack because it seems to not run correct tests to determine this: 20 | set(__CHAR_UNSIGNED___EXITCODE 1) 21 | -------------------------------------------------------------------------------- /conda_files/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = "1.3.5" %} 2 | 3 | package: 4 | name: hapog 5 | version: {{ version }} 6 | 7 | build: 8 | number: 0 9 | 10 | source: 11 | url: https://github.com/institut-de-genomique/HAPO-G/archive/refs/tags/{{ version }}.tar.gz 12 | sha256: "46e0d102467276d0aee5c493e5e678c76120760f0dbe57caf4d880d366dcb285" 13 | 14 | requirements: 15 | build: 16 | - {{ compiler('c') }} 17 | - cmake 18 | - make 19 | host: 20 | - python>=3.7 21 | - htslib 22 | - setuptools 23 | run: 24 | - htslib 25 | - python>=3.7 26 | - setuptools 27 | - biopython 28 | - bwa 29 | - samtools 30 | - minimap2 31 | 32 | test: 33 | commands: 34 | - hapog -h 35 | - hapog_bin -h 36 | imports: 37 | - hapog 38 | 39 | about: 40 | home: https://github.com/institut-de-genomique/HAPO-G 41 | license: CECILL-2.1 42 | license_family: OTHER 43 | license_file: LICENSE.md 44 | summary: Haplotype-Aware Polishing of Genomes 45 | 46 | extra: 47 | identifiers: 48 | - doi:10.1093/nargab/lqab034 49 | -------------------------------------------------------------------------------- /src/find_htslib.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find htslib 2 | # Once done, this will define 3 | # 4 | # htslib_found - system has htslib 5 | # htslib_INCLUDE_DIR - the htslib include directories 6 | # htslib_LIBRARY - link these to use htslib 7 | 8 | set(HTSLIB_SEARCH_DIRS 9 | ${HTSLIB_SEARCH_DIRS} 10 | $ENV{HTLSIB_ROOT} 11 | ) 12 | 13 | set(_htslib_ver_path "htslib-${htslib_FIND_VERSION}") 14 | include(LibFindMacros.cmake) 15 | 16 | # Dependencies 17 | #libfind_package(HTSlib) 18 | 19 | # Include dir 20 | find_path(htslib_INCLUDE_DIR 21 | NAMES include/htslib/sam.h 22 | PATHS ${HTSLIB_SEARCH_DIRS} 23 | HINTS ENV HTSLIB_ROOT 24 | ) 25 | 26 | # Finally the library itself 27 | find_library(htslib_LIBRARY 28 | NAMES libhts.a hts.a 29 | PATHS ${htslib_INCLUDE_DIR} ${HTSLIB_SEARCH_DIRS} 30 | NO_DEFAULT_PATH 31 | PATH_SUFFIXES lib lib64 ${_htslib_ver_path} 32 | HINTS ENV HTSLIB_ROOT 33 | ) 34 | 35 | if(NOT "${htslib_INCLUDE_DIR}" STREQUAL "htslib_INCLUDE_DIR-NOTFOUND" AND NOT "${htslib_LIBRARY}" STREQUAL "htslib_LIBRARY-NOTFOUND") 36 | set(htslib_found "TRUE") 37 | else() 38 | set(htslib_found "FALSE") 39 | endif() 40 | message(${htslib_INCLUDE_DIR}) 41 | message(${htslib_LIBRARY}) 42 | message("htslib_found ${htslib_found}\n") 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import distutils.command.build 2 | 3 | from setuptools import setup 4 | from os import path 5 | 6 | # Override build command 7 | class BuildCommand(distutils.command.build.build): 8 | def initialize_options(self): 9 | distutils.command.build.build.initialize_options(self) 10 | self.build_base = 'package_build' 11 | 12 | this_directory = path.abspath(path.dirname(__file__)) 13 | with open(path.join(this_directory, "README.md")) as f: 14 | long_description = f.read() 15 | 16 | setup( 17 | name="hapog", 18 | packages=["hapog"], 19 | version="1.3.8", 20 | license="CeCILL", 21 | description="Haplotype-Aware Polishing of Genomes", 22 | long_description=long_description, 23 | long_description_content_type="text/markdown", 24 | author=["Jean-Marc Aury", "Benjamin Istace"], 25 | author_email=["jmaury@genoscope.cns.fr", "bistace@genoscope.cns.fr"], 26 | url="https://github.com/institut-de-genomique/HAPO-G", 27 | download_url="https://github.com/institut-de-genomique/HAPO-G", 28 | keywords=[ 29 | "bioinformatics", 30 | "genomics", 31 | "genome", 32 | "polishing", 33 | ], 34 | install_requires=["biopython"], 35 | classifiers=[ 36 | "Development Status :: 4 - Beta", 37 | "Intended Audience :: Science/Research", 38 | "Topic :: Scientific/Engineering :: Bio-Informatics", 39 | "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)", 40 | "Programming Language :: Python :: 3.7", 41 | ], 42 | entry_points={"console_scripts" : ["hapog = hapog.cli:main"]}, 43 | cmdclass={"build": BuildCommand}, 44 | ) 45 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | include(ExternalProject) 3 | 4 | # Set gcc and g++ compiler 5 | set(CMAKE_C_COMPILER gcc) 6 | set(CMAKE_CXX_COMPILER g++) 7 | 8 | # Set the project name 9 | project(HAPoG C) 10 | enable_language(C) 11 | 12 | # Add htslib 13 | if(NOT "$ENV{HTSLIB_ROOT}" STREQUAL "") 14 | message("\nWill attempt to find HTSlib at the provided path $ENV{HTSLIB_ROOT}") 15 | include(find_htslib.cmake) 16 | if("${htslib_found}" STREQUAL "TRUE") 17 | message("Found htslib at ${htslib_INCLUDE_DIR}") 18 | include_directories("${htslib_INCLUDE_DIR}/include") 19 | link_directories(${htslib_INCLUDE_DIR}/lib) 20 | else() 21 | message(FATAL_ERROR "Could not find the required files at the given $ENV{HTSLIB_ROOT} path.") 22 | endif() 23 | else() 24 | message("\nNo htslib path was provided, trying to find it on system") 25 | include(find_htslib.cmake) 26 | if("${htslib_found}" STREQUAL "TRUE") 27 | message("Found htslib at ${htslib_INCLUDE_DIR}") 28 | include_directories("${htslib_INCLUDE_DIR}") 29 | link_directories(${htslib_LIBRARY}) 30 | else() 31 | message("htslib was not found on system, attempting to download and compile it") 32 | set(HTSLIB_PREFIX ${CMAKE_CURRENT_SOURCE_DIR}/htslib) 33 | set(HTSLIB_INSTALL make install prefix=${HTSLIB_PREFIX}/src/htslib/) 34 | 35 | ExternalProject_Add(htslib 36 | PREFIX ${HTSLIB_PREFIX} 37 | GIT_REPOSITORY "https://github.com/samtools/htslib" 38 | GIT_TAG "1.11" 39 | BUILD_IN_SOURCE 1 40 | CONFIGURE_COMMAND autoheader && autoconf && ${HTSLIB_PREFIX}/src/htslib/configure --prefix=${HTSLIB_PREFIX}/src/htslib/ 41 | BUILD_COMMAND make 42 | INSTALL_COMMAND ${HTSLIB_INSTALL} 43 | LOG_DOWNLOAD 0 44 | LOG_UPDATE 0 45 | LOG_CONFIGURE 0 46 | LOG_BUILD 0 47 | LOG_TEST 0 48 | LOG_INSTALL 0 49 | ) 50 | 51 | ExternalProject_Get_Property(htslib install_dir) 52 | include_directories(${HTSLIB_PREFIX}/src/htslib) 53 | link_directories(${HTSLIB_PREFIX}/src/htslib) 54 | endif() 55 | endif() 56 | 57 | 58 | # Add the executable 59 | set(CMAKE_C_FLAGS "-O3 -g") 60 | add_executable(hapog polish_consensus.c alipile.c hash.c polished.c) 61 | 62 | # Links 63 | target_link_libraries(hapog pthread) 64 | target_link_libraries(hapog curl) 65 | target_link_libraries(hapog crypto) 66 | target_link_libraries(hapog m) 67 | target_link_libraries(hapog hts) 68 | target_link_libraries(hapog z) 69 | target_link_libraries(hapog lzma) 70 | target_link_libraries(hapog bz2) 71 | -------------------------------------------------------------------------------- /src/hash.h: -------------------------------------------------------------------------------- 1 | /// @file hash.h 2 | /// Hash table for string entries 3 | 4 | /* 5 | ################################################################################ 6 | # * Copyright Jean-Marc Aury / Genoscope / DRF / CEA 7 | # * 8 | # * 9 | # * This software is governed by the CeCILL license under French law and 10 | # * abiding by the rules of distribution of free software. You can use, 11 | # * modify and/ or redistribute the software under the terms of the CeCILL 12 | # * license as circulated by CEA, CNRS and INRIA at the following URL 13 | # * "http://www.cecill.info". 14 | # * 15 | # * As a counterpart to the access to the source code and rights to copy, 16 | # * modify and redistribute granted by the license, users are provided only 17 | # * with a limited warranty and the software's author, the holder of the 18 | # * economic rights, and the successive licensors have only limited 19 | # * liability. 20 | # * 21 | # * In this respect, the user's attention is drawn to the risks associated 22 | # * with loading, using, modifying and/or developing or reproducing the 23 | # * software by the user in light of its specific status of free software, 24 | # * that may mean that it is complicated to manipulate, and that also 25 | # * therefore means that it is reserved for developers and experienced 26 | # * professionals having in-depth computer knowledge. Users are therefore 27 | # * encouraged to load and test the software's suitability as regards their 28 | # * requirements in conditions enabling the security of their systems and/or 29 | # * data to be ensured and, more generally, to use and operate it in the 30 | # * same conditions as regards security. 31 | # * 32 | # * The fact that you are presently reading this means that you have had 33 | # * knowledge of the CeCILL license and that you accept its terms. 34 | ################################################################################ 35 | */ 36 | 37 | #ifndef HASH_H 38 | #define HASH_H 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | /*! @typedef 46 | @abstract Structure for a hash entry 47 | @field next Next entry to manage colision 48 | @field key The key of the entry 49 | */ 50 | typedef struct elm { 51 | struct elm *next; 52 | char *key; 53 | } elm_t; 54 | 55 | /*! @typedef 56 | @abstract Structure for a hash table 57 | @field m_size Size of the table 58 | @field size The number of entries 59 | @field table The hash table 60 | */ 61 | typedef struct hash { 62 | int msize; 63 | int size; 64 | elm_t **htable; 65 | } hash_t; 66 | 67 | 68 | /* create and initialize an empty hash table */ 69 | hash_t* hash_init(void); 70 | hash_t* hash_init1(int); 71 | 72 | /* destroy a hash table */ 73 | void hash_free(hash_t*); 74 | 75 | /* insert a new key into a hash table */ 76 | void hash_insert(hash_t*, const char*); 77 | 78 | /* return 1 if the key is present, 0 otherwise */ 79 | int hash_search(hash_t*, const char*); 80 | 81 | /* delete the record with the given key */ 82 | /* if there is no such record, has no effect */ 83 | void hash_delete(hash_t*, const char*); 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /src/polished.h: -------------------------------------------------------------------------------- 1 | /// @file polished.h 2 | /// Polished (error-corrected) sequence 3 | 4 | /* 5 | ################################################################################ 6 | # * Copyright Jean-Marc Aury / Genoscope / DRF / CEA 7 | # * 8 | # * 9 | # * This software is governed by the CeCILL license under French law and 10 | # * abiding by the rules of distribution of free software. You can use, 11 | # * modify and/ or redistribute the software under the terms of the CeCILL 12 | # * license as circulated by CEA, CNRS and INRIA at the following URL 13 | # * "http://www.cecill.info". 14 | # * 15 | # * As a counterpart to the access to the source code and rights to copy, 16 | # * modify and redistribute granted by the license, users are provided only 17 | # * with a limited warranty and the software's author, the holder of the 18 | # * economic rights, and the successive licensors have only limited 19 | # * liability. 20 | # * 21 | # * In this respect, the user's attention is drawn to the risks associated 22 | # * with loading, using, modifying and/or developing or reproducing the 23 | # * software by the user in light of its specific status of free software, 24 | # * that may mean that it is complicated to manipulate, and that also 25 | # * therefore means that it is reserved for developers and experienced 26 | # * professionals having in-depth computer knowledge. Users are therefore 27 | # * encouraged to load and test the software's suitability as regards their 28 | # * requirements in conditions enabling the security of their systems and/or 29 | # * data to be ensured and, more generally, to use and operate it in the 30 | # * same conditions as regards security. 31 | # * 32 | # * The fact that you are presently reading this means that you have had 33 | # * knowledge of the CeCILL license and that you accept its terms. 34 | ################################################################################ 35 | */ 36 | 37 | #ifndef POLISHED_H 38 | #define POLISHED_H 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include "htslib/hts.h" 46 | #include "htslib/sam.h" 47 | 48 | //const unsigned int BUFF_SIZE = 1000; 49 | 50 | /*! @typedef 51 | @abstract Structure for alignment pile information. 52 | @field max_cov Maximal coverage of the pile 53 | @field min_cov Minimal coverage required to correct the consensus 54 | @field pile Array of bam alignment 55 | @field seqpile Array of padded alignment 56 | @field current_read Index of the selected Read 57 | @field current_seq Index of the reference sequence 58 | @field name_seq Name of the reference sequence 59 | @field seq Reference sequence 60 | @field nb_ali Number of alignment 61 | */ 62 | typedef struct { 63 | char* name_seq; 64 | char* seq; 65 | int size_alloc; 66 | int pos_seq; 67 | int chunk; 68 | 69 | char buffer[1000+1]; 70 | int buffer_level; 71 | } polished_t; 72 | 73 | 74 | // Create a alipile_t structure 75 | /** 76 | @return An empty alipile_t structure on success, NULL on failure 77 | */ 78 | polished_t *polished_init(void); 79 | 80 | // Free a alipile_t structure 81 | /** 82 | @param b structure to free 83 | Does nothing if @p b is NULL. 84 | */ 85 | void polished_free(polished_t *s); 86 | 87 | // Add nucleotide base to the polished sequence 88 | void add_base(polished_t *s, char base); 89 | 90 | // Add string str to the polished sequence 91 | void add_str(polished_t *s, char* str); 92 | 93 | // Print polished sequence in FILE 94 | void print_seq(polished_t *s, FILE* fo); 95 | 96 | // Substr function 97 | char* _substr(const char *src,int pos,int len); 98 | // Add a char ath the end of src 99 | void _addchar(char **src, char c); 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /src/hash.c: -------------------------------------------------------------------------------- 1 | /// @file hash.c 2 | /// Hash table for string entries 3 | 4 | /* 5 | ################################################################################ 6 | # * Copyright Jean-Marc Aury / Genoscope / DRF / CEA 7 | # * 8 | # * 9 | # * This software is governed by the CeCILL license under French law and 10 | # * abiding by the rules of distribution of free software. You can use, 11 | # * modify and/ or redistribute the software under the terms of the CeCILL 12 | # * license as circulated by CEA, CNRS and INRIA at the following URL 13 | # * "http://www.cecill.info". 14 | # * 15 | # * As a counterpart to the access to the source code and rights to copy, 16 | # * modify and redistribute granted by the license, users are provided only 17 | # * with a limited warranty and the software's author, the holder of the 18 | # * economic rights, and the successive licensors have only limited 19 | # * liability. 20 | # * 21 | # * In this respect, the user's attention is drawn to the risks associated 22 | # * with loading, using, modifying and/or developing or reproducing the 23 | # * software by the user in light of its specific status of free software, 24 | # * that may mean that it is complicated to manipulate, and that also 25 | # * therefore means that it is reserved for developers and experienced 26 | # * professionals having in-depth computer knowledge. Users are therefore 27 | # * encouraged to load and test the software's suitability as regards their 28 | # * requirements in conditions enabling the security of their systems and/or 29 | # * data to be ensured and, more generally, to use and operate it in the 30 | # * same conditions as regards security. 31 | # * 32 | # * The fact that you are presently reading this means that you have had 33 | # * knowledge of the CeCILL license and that you accept its terms. 34 | ################################################################################ 35 | */ 36 | 37 | #include "hash.h" 38 | 39 | const int INIT_SIZE = 268288; 40 | const int EXPAND = 2; 41 | const float MAX_LOAD_FACTOR = 0.7; 42 | const int HASH_FN_OFFSET = 85; 43 | 44 | 45 | hash_t* hash_init() { 46 | return hash_init1(INIT_SIZE); 47 | } 48 | 49 | hash_t* hash_init1(int size) { 50 | hash_t* h; 51 | int i = 0; 52 | 53 | h = malloc(sizeof(hash_t)); 54 | 55 | assert(h != 0); 56 | 57 | h->msize = size; 58 | h->size = 0; 59 | h->htable = malloc(sizeof(elm_t) * h->msize); 60 | 61 | assert(h->htable != 0); 62 | 63 | for( ; i < h->msize; i++) h->htable[i] = 0; 64 | 65 | return h; 66 | } 67 | 68 | void hash_free(hash_t* h) { 69 | int i; 70 | elm_t *current; 71 | elm_t *next; 72 | 73 | for(i = 0; i < h->msize; i++) { 74 | for(current = h->htable[i]; current != 0; current = next) { 75 | next = current->next; 76 | free(current->key); 77 | free(current); 78 | } 79 | } 80 | free(h->htable); 81 | free(h); 82 | } 83 | 84 | unsigned long hash_fn(const char *s) { 85 | unsigned const char *it = (unsigned const char *) s; 86 | unsigned long idx = 0; 87 | 88 | for( ; *it; it++) idx = HASH_FN_OFFSET * idx + *it; 89 | 90 | return idx; 91 | } 92 | 93 | void grow(hash_t* h) { 94 | hash_t* htmp = hash_init1(h->msize * EXPAND);; 95 | struct hash swap; 96 | int i = 0; 97 | elm_t* current; 98 | 99 | for( ; i < h->msize; i++) 100 | for(current = h->htable[i]; current != 0; current = current->next) 101 | hash_insert(htmp, current->key); 102 | 103 | swap = *h; 104 | *h = *htmp; 105 | *htmp = swap; 106 | hash_free(htmp); 107 | } 108 | 109 | void hash_insert(hash_t* h, const char *key) { 110 | elm_t *ins; 111 | unsigned long hkey; 112 | 113 | assert(key); 114 | 115 | ins = malloc(sizeof(elm_t)); 116 | 117 | assert(ins); 118 | 119 | //printf("please delete read= %s\n", key); 120 | 121 | ins->key = strdup(key); 122 | hkey = hash_fn(key) % h->msize; 123 | ins->next = h->htable[hkey]; 124 | h->htable[hkey] = ins; 125 | h->size++; 126 | 127 | if(h->size >= (int)(h->msize * MAX_LOAD_FACTOR)) grow(h); 128 | } 129 | 130 | int hash_search(hash_t* h, const char *key) { 131 | unsigned long hkey = hash_fn(key) % h->msize; 132 | elm_t *se = h->htable[hkey]; 133 | 134 | for( ; se != 0; se = se->next) 135 | if(!strcmp(se->key, key)) return 1; 136 | 137 | return 0; 138 | } 139 | 140 | void hash_delete(hash_t* h, const char *key) { 141 | unsigned long hkey = hash_fn(key) % h->msize; 142 | elm_t **prev = &(h->htable[hkey]); 143 | elm_t *del; 144 | 145 | while(*prev != 0) { 146 | if(!strcmp((*prev)->key, key)) { 147 | del = *prev; 148 | *prev = del->next; 149 | 150 | free(del->key); 151 | free(del); 152 | h->size--; 153 | 154 | return; 155 | } 156 | prev = &((*prev)->next); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /hapog/mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | import warnings 5 | 6 | 7 | def launch_PE_mapping(genome, pe1, pe2, threads, samtools_memory): 8 | print("\nGenerating bwa index...", flush=True) 9 | cmd = ["bwa", "index", genome] 10 | 11 | start = time.perf_counter() 12 | with open("cmds/bwa_index.cmds", "w") as cmd_file: 13 | print(" ".join(cmd), flush=True, file=cmd_file) 14 | 15 | try: 16 | with warnings.catch_warnings(): 17 | warnings.simplefilter("ignore") 18 | _ = subprocess.run( 19 | cmd, 20 | stdout=open("logs/bwa_index.o", "w"), 21 | stderr=open("logs/bwa_index.e", "w"), 22 | check=True, 23 | ) 24 | except Exception as e: 25 | print("\nERROR: Couldn't index genome", flush=True) 26 | print(e) 27 | exit(1) 28 | 29 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 30 | 31 | print("\nLaunching mapping on genome...", flush=True) 32 | cmd = "bash -c 'bwa mem -t %s %s " % (threads, genome) 33 | 34 | streamer = "cat" 35 | if pe1[0].endswith(".gz"): 36 | streamer = "zcat" 37 | 38 | if len(pe1) > 1: 39 | cmd += f"<({streamer}" 40 | for pe in pe1: 41 | cmd += " %s" % pe 42 | cmd += ") " 43 | cmd += f"<({streamer}" 44 | for pe in pe2: 45 | cmd += " %s" % pe 46 | cmd += ") 2> logs/bwa_mem.e" 47 | else: 48 | cmd += pe1[0] + " " + pe2[0] + " 2> logs/bwa_mem.e" 49 | 50 | cmd += f" | samtools sort -m {samtools_memory} -@ {threads} -o bam/aln.sorted.bam - 2> logs/samtools_sort.e'" 51 | 52 | start = time.perf_counter() 53 | 54 | with open("cmds/mapping.cmds", "w") as cmd_file: 55 | print(cmd, flush=True, file=cmd_file) 56 | 57 | return_code = os.system(cmd) 58 | if return_code != 0: 59 | print(f"Error in bwa mem and samtools sort, return code: {return_code}") 60 | print(f"Faulty command: {cmd}") 61 | exit(1) 62 | else: 63 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 64 | 65 | index_bam() 66 | 67 | 68 | def launch_LR_mapping(genome, long_reads, threads, samtools_memory): 69 | print("\nLaunching mapping on genome...", flush=True) 70 | cmd = f"bash -c 'minimap2 -t {threads} -a --secondary=no -x map-pb {genome} {long_reads} 2> logs/minimap2.e" 71 | cmd += f" | samtools sort -m {samtools_memory} -@ {threads} -o bam/aln.sorted.bam - 2> logs/samtools_sort.e'" 72 | 73 | start = time.perf_counter() 74 | 75 | with open("cmds/mapping.cmds", "w") as cmd_file: 76 | print(cmd, flush=True, file=cmd_file) 77 | 78 | return_code = os.system(cmd) 79 | if return_code != 0: 80 | print(f"Error in minimap2 and samtools sort, return code: {return_code}") 81 | print(f"Faulty command: {cmd}") 82 | exit(1) 83 | else: 84 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 85 | 86 | index_bam() 87 | 88 | 89 | def remove_secondary_alignments(bam, output_dir): 90 | print("\nRemoving secondary alignments from BAM file...", flush=True) 91 | cmd = ["samtools", "view", "-b", "-h", "-F", "0x900", bam] 92 | 93 | start = time.perf_counter() 94 | with open(f"{output_dir}/cmds/samtools_view.cmds", "w") as cmd_file: 95 | print(" ".join(cmd), flush=True, file=cmd_file) 96 | 97 | try: 98 | with warnings.catch_warnings(): 99 | warnings.simplefilter("ignore") 100 | _ = subprocess.run( 101 | cmd, 102 | stdout=open(f"{output_dir}/bam/aln.sorted.bam", "w"), 103 | stderr=open(f"{output_dir}/logs/samtools_view.e", "w"), 104 | check=True, 105 | ) 106 | except Exception as e: 107 | print("\nERROR: Couldn't remove secondary alignments") 108 | print(e) 109 | exit(1) 110 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 111 | 112 | 113 | def index_bam(): 114 | print("\nIndexing the BAM file...", flush=True) 115 | cmd = ["samtools", "index", "bam/aln.sorted.bam"] 116 | 117 | start = time.perf_counter() 118 | with open("cmds/samtools_index.cmds", "w") as cmd_file: 119 | print(" ".join(cmd), flush=True, file=cmd_file) 120 | 121 | try: 122 | with warnings.catch_warnings(): 123 | warnings.simplefilter("ignore") 124 | _ = subprocess.run( 125 | cmd, 126 | stdout=open("logs/samtools_index.o", "w"), 127 | stderr=open("logs/samtools_index.e", "w"), 128 | check=True, 129 | ) 130 | except Exception as e: 131 | print("\nERROR: Couldn't index bam file") 132 | print(e) 133 | exit(1) 134 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 135 | -------------------------------------------------------------------------------- /src/alipile.h: -------------------------------------------------------------------------------- 1 | /// @file alipile.h 2 | /// Pile of alignment and padded alignments 3 | 4 | /* 5 | ################################################################################ 6 | # * Copyright Jean-Marc Aury / Genoscope / DRF / CEA 7 | # * 8 | # * 9 | # * This software is governed by the CeCILL license under French law and 10 | # * abiding by the rules of distribution of free software. You can use, 11 | # * modify and/ or redistribute the software under the terms of the CeCILL 12 | # * license as circulated by CEA, CNRS and INRIA at the following URL 13 | # * "http://www.cecill.info". 14 | # * 15 | # * As a counterpart to the access to the source code and rights to copy, 16 | # * modify and redistribute granted by the license, users are provided only 17 | # * with a limited warranty and the software's author, the holder of the 18 | # * economic rights, and the successive licensors have only limited 19 | # * liability. 20 | # * 21 | # * In this respect, the user's attention is drawn to the risks associated 22 | # * with loading, using, modifying and/or developing or reproducing the 23 | # * software by the user in light of its specific status of free software, 24 | # * that may mean that it is complicated to manipulate, and that also 25 | # * therefore means that it is reserved for developers and experienced 26 | # * professionals having in-depth computer knowledge. Users are therefore 27 | # * encouraged to load and test the software's suitability as regards their 28 | # * requirements in conditions enabling the security of their systems and/or 29 | # * data to be ensured and, more generally, to use and operate it in the 30 | # * same conditions as regards security. 31 | # * 32 | # * The fact that you are presently reading this means that you have had 33 | # * knowledge of the CeCILL license and that you accept its terms. 34 | ################################################################################ 35 | */ 36 | 37 | #ifndef ALIPILE_H 38 | #define ALIPILE_H 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | #include "polished.h" 47 | #include "hash.h" 48 | 49 | #include "htslib/hts.h" 50 | #include "htslib/sam.h" 51 | 52 | //const unsigned int MAX_COV = 50; 53 | //const unsigned int MIN_COV = 3; 54 | 55 | /*! @typedef 56 | @abstract Structure for alignment pile information. 57 | @field max_cov Maximal coverage of the pile 58 | @field min_cov Minimal coverage required to correct the consensus 59 | @field pile Array of bam alignment 60 | @field seqpile Array of padded alignment 61 | @field current_read Index of the selected Read 62 | @field current_seq Index of the reference sequence 63 | @field name_seq Name of the reference sequence 64 | @field seq Reference sequence 65 | @field len_seq Reference sequence length 66 | @field nb_ali Number of alignment 67 | @field nbhapB Number of removed PE reads from hapB 68 | @field changes FILE output changes 69 | */ 70 | typedef struct { 71 | int max_cov; 72 | int min_cov; 73 | 74 | bam1_t* pile[500]; 75 | char* seqpile[500]; 76 | 77 | int current_read; 78 | int read_choice; 79 | 80 | int current_seq; 81 | char* name_seq; 82 | char* seq; 83 | int len_seq; 84 | 85 | int nb_ali; 86 | int debug; 87 | 88 | int nb_changes; 89 | int nbhapB; 90 | 91 | FILE* changes; 92 | } alipile_t; 93 | 94 | 95 | // Create a alipile_t structure 96 | /** 97 | @return An empty alipile_t structure on success, NULL on failure 98 | */ 99 | alipile_t *alipile_init(FILE* out); 100 | 101 | // Free a alipile_t structure 102 | /** 103 | @param b structure to free 104 | Does nothing if @p b is NULL. 105 | */ 106 | void alipile_free(alipile_t *ap); 107 | 108 | // Clean pile by erasing alignment that do not overlap pos 109 | void clean(alipile_t *ap, int pos); 110 | 111 | // Add an alignment to the alipile_t structure 112 | void add(alipile_t *ap, bam1_t *aln); 113 | 114 | // Change current read in the alipile_t structure 115 | void change_currentread(alipile_t *ap, int pos); 116 | 117 | // Change current read in the alipile_t structure 118 | void change_currentread2(alipile_t *ap, int pos); 119 | 120 | // Delete an alignment from the alipile_t structure 121 | void delete(alipile_t *ap, int elem); 122 | 123 | // Get padded-alignment length 124 | int get_lseqali(bam1_t *aln); 125 | 126 | // Get padded-alignment 127 | char* get_seqali(bam1_t *aln); 128 | 129 | // Get sequence from bam alignment 130 | char* get_bamseq(bam1_t *aln); 131 | 132 | // Return the base of the read at position pos 133 | char* get_base(alipile_t *ap, int read, int pos); 134 | 135 | // return the proportion of ali with base nt at given position 136 | float get_ratio_base(alipile_t *ap, char *nt, int pos); 137 | 138 | // Remove alignments from haplotype B 139 | void removehapB_read(alipile_t *ap, char* str, int pos, hash_t* readname); 140 | 141 | // Get the corrected nucleotide at position current_pos 142 | void select_base(alipile_t *ap, alipile_t *allali, int current_pos, 143 | polished_t* s, hash_t* readname); 144 | 145 | 146 | #endif 147 | -------------------------------------------------------------------------------- /src/polished.c: -------------------------------------------------------------------------------- 1 | /// @file polished.c 2 | /// Polished (error-corrected) sequence 3 | 4 | /* 5 | ################################################################################ 6 | # * Copyright Jean-Marc Aury / Genoscope / DRF / CEA 7 | # * 8 | # * 9 | # * This software is governed by the CeCILL license under French law and 10 | # * abiding by the rules of distribution of free software. You can use, 11 | # * modify and/ or redistribute the software under the terms of the CeCILL 12 | # * license as circulated by CEA, CNRS and INRIA at the following URL 13 | # * "http://www.cecill.info". 14 | # * 15 | # * As a counterpart to the access to the source code and rights to copy, 16 | # * modify and redistribute granted by the license, users are provided only 17 | # * with a limited warranty and the software's author, the holder of the 18 | # * economic rights, and the successive licensors have only limited 19 | # * liability. 20 | # * 21 | # * In this respect, the user's attention is drawn to the risks associated 22 | # * with loading, using, modifying and/or developing or reproducing the 23 | # * software by the user in light of its specific status of free software, 24 | # * that may mean that it is complicated to manipulate, and that also 25 | # * therefore means that it is reserved for developers and experienced 26 | # * professionals having in-depth computer knowledge. Users are therefore 27 | # * encouraged to load and test the software's suitability as regards their 28 | # * requirements in conditions enabling the security of their systems and/or 29 | # * data to be ensured and, more generally, to use and operate it in the 30 | # * same conditions as regards security. 31 | # * 32 | # * The fact that you are presently reading this means that you have had 33 | # * knowledge of the CeCILL license and that you accept its terms. 34 | ################################################################################ 35 | */ 36 | 37 | #include "polished.h" 38 | 39 | polished_t *polished_init(void) { 40 | polished_t* s = (polished_t*) calloc(1, sizeof(polished_t)); 41 | s->chunk = 1000; 42 | s->size_alloc = s->chunk; 43 | s->seq = (char *) malloc((s->chunk + 1) * sizeof(char)); 44 | s->seq[s->chunk]='\0'; 45 | s->pos_seq = 0; 46 | s->buffer_level = 0; 47 | return s; 48 | } 49 | 50 | void polished_free(polished_t *s) { 51 | if(s == 0) return; 52 | //free(s->name_seq); 53 | // on redonne la taille allouee 54 | //s->seq[strlen(s->seq)] = 'e'; 55 | //s->seq[s->size_alloc] = '\0'; 56 | free(s->seq); 57 | free(s); 58 | } 59 | 60 | 61 | void add_base(polished_t *s, char base) { 62 | char tmp[2] = { base }; 63 | add_str(s, tmp); 64 | //if(s->seq == NULL) s->seq = (char *) malloc(2 * sizeof(char)); 65 | //else { 66 | // int oldl = strlen(s->seq); 67 | // s->seq = (char *) realloc(s->seq, (oldl + 1) * sizeof(char)); 68 | //} 69 | //s->seq[strlen(s->seq)-1] = base; 70 | //s->seq[strlen(s->seq)] = '\0'; 71 | } 72 | 73 | void add_str(polished_t *s, char *str) { 74 | assert(str != NULL && s->seq != NULL); 75 | int len = strlen(str); 76 | if(s->pos_seq + len >= s->size_alloc) { 77 | int extend = s->chunk; 78 | while(len >= extend) { extend += s->chunk; } 79 | void* tmpseq = (char *) realloc(s->seq, (s->size_alloc + extend) * sizeof(char)); 80 | assert(tmpseq); 81 | s->seq = tmpseq; 82 | s->size_alloc += extend; 83 | } 84 | int i = 0; 85 | for(i = 0 ; i < len ; i++) 86 | s->seq[s->pos_seq + i ] = str[i]; 87 | s->pos_seq += len; 88 | s->seq[s->pos_seq]='\0'; 89 | } 90 | 91 | /*void addBase(polished_t *s, char base) { 92 | assert(buffer_level >= BUFF_SIZE); 93 | 94 | s->buffer[s->buffer_level] = base; 95 | s->buffer_level++; 96 | 97 | if(buffer_level + 1 == BUFF_SIZE) { 98 | if(s->seq == NULL) s->seq = (char *) malloc(BUFF_SIZE * sizeof(char)); 99 | else { 100 | int oldl = strlen(s->seq); 101 | s->seq = (char *) realloc(s->seq, (oldl + BUFF_SIZE) * sizeof(char)); 102 | } 103 | strcat(s->seq, buffer); 104 | buffer_level = 0; 105 | } 106 | }*/ 107 | 108 | void print_seq(polished_t *s, FILE* fo) { 109 | if(s->seq != NULL) { 110 | char *cs = s->seq; 111 | fprintf(fo, ">%s_polished\n", s->name_seq); 112 | int linesize = 60; 113 | int len = strlen(s->seq); 114 | int write = len; 115 | while(len>0) { 116 | write = len; 117 | if(len > linesize) write = linesize; 118 | char* line = _substr(s->seq, 0, write); 119 | fprintf(fo, "%s\n", line); 120 | free(line); 121 | s->seq += write; 122 | len -= write; 123 | } 124 | s->seq = cs; 125 | } 126 | } 127 | 128 | char* _substr(const char *src, int pos, int len) { 129 | char *dest = NULL; 130 | if (len>0) { 131 | dest = calloc(len+1, sizeof(char)); 132 | if(NULL != dest) strncat(dest, src+pos, len); 133 | } 134 | return dest; 135 | } 136 | 137 | 138 | void _addchar(char **src, char c) { 139 | if(*src == NULL) { 140 | *src = malloc(2 * sizeof(char)); 141 | *src[0] = c; 142 | *src[1] = '\0'; 143 | } else { 144 | int oldl = strlen(*src); 145 | *src = realloc(*src, (oldl + 1) * sizeof(char)); 146 | //printf("realloc seq= \"%s\"\t char= %c\t len=%i\n", *src, c, strlen(*src)); 147 | *src[strlen(*src)-1] = c; 148 | *src[strlen(*src)] = '\0'; 149 | } 150 | //printf("seq= \"%s\"\t char= %c\t len=%i\n", *src, c, strlen(*src)); 151 | } 152 | 153 | -------------------------------------------------------------------------------- /src/polish_consensus.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "htslib/hts.h" 10 | #include "htslib/sam.h" 11 | #include "htslib/faidx.h" 12 | 13 | #include "alipile.h" 14 | #include "polished.h" 15 | #include "hash.h" 16 | 17 | const unsigned int MAX_COV = 50; 18 | const unsigned int MIN_COV = 3; 19 | const unsigned int BUFFER = 1000; 20 | 21 | void usage(); 22 | int parse_bam(char*, char*, char*, char*, int); 23 | void print_step(int, int); 24 | void error(char*, ...); 25 | 26 | int main(int argc, char* argv[]) { 27 | char *BAMfile = NULL, *outfile = NULL, *changefile = NULL, *FAfile = NULL; 28 | int silent = 0; 29 | int c; 30 | 31 | // Invokes member function `int operator ()(void);' 32 | while ((c = getopt(argc, argv, "f:b:c:o:hs")) != -1) { 33 | switch (c) { 34 | case 'b': 35 | BAMfile = optarg; 36 | break; 37 | case 'o': 38 | outfile = optarg; 39 | break; 40 | case 'c': 41 | changefile = optarg; 42 | break; 43 | case 'f': 44 | FAfile = optarg; 45 | break; 46 | /* case 'u': 47 | unmasked_len = atoi(optarg); 48 | break; 49 | case 'l': 50 | level = atoi(optarg); 51 | break;*/ 52 | case 'h': 53 | usage(); 54 | case 's': 55 | silent = 1; 56 | break; 57 | default : 58 | abort(); 59 | } 60 | } 61 | 62 | if (BAMfile==NULL) error("Could not load bam: %s\n", BAMfile); 63 | if (FAfile==NULL) error("Could not load faidx: %s\n", FAfile); 64 | if (outfile==NULL) error("Could not open output fasta file: %s\n", outfile); 65 | if (changefile==NULL) error("Could not open output changes file: %s\n", changefile); 66 | return parse_bam(BAMfile, FAfile, outfile, changefile, silent); 67 | } 68 | 69 | void error(char* message, ...) { 70 | va_list argp; 71 | va_start(argp, message); 72 | vprintf(message, argp); 73 | va_end(argp); 74 | exit(-1); 75 | } 76 | 77 | int parse_bam(char* bam, char* fa, char* outfa, char *changefile, int silent) { 78 | setvbuf(stdout, NULL, _IONBF, 0); 79 | 80 | //open BAM for reading 81 | samFile *in = sam_open(bam, "r"); 82 | if(in == NULL) error("Unable to open BAM/SAM file: %s\n", bam); 83 | 84 | // open fasta file and index 85 | faidx_t *ref = fai_load(fa); 86 | if (ref == NULL) error("Could not load faidx: %s\n", fa); 87 | 88 | // open output fasta file 89 | FILE* out = fopen(outfa,"w"); 90 | if (out == NULL) error("Could not open output fasta file: %s\n", outfa); 91 | 92 | // open output changes file 93 | FILE* changes = fopen(changefile,"w"); 94 | if (changes == NULL) error("Could not open output changes file: %s\n", changefile); 95 | 96 | //Get the header 97 | bam_hdr_t *header = sam_hdr_read(in); 98 | //Initiate the alignment record 99 | bam1_t *aln = bam_init1(); 100 | bam1_t *aln2 = bam_init1(); 101 | int ret=0, i=0; 102 | int current_position = 0, previous_position = 0; 103 | alipile_t* ap = alipile_init(changes); 104 | alipile_t* allali = alipile_init(NULL); 105 | polished_t* s = polished_init(); 106 | hash_t* readname = hash_init(); 107 | int tot_read = 0, added_read = 0, too_short = 0; 108 | int current_seq = -1, progress = 0, nb_changes = 0, nb_hapB = 0; 109 | 110 | while ((ret = sam_read1(in, header, aln)) >= 0) { 111 | bam_copy1(aln2, aln); 112 | // exclude unmapped reads 113 | if (aln->core.tid < 0 || (aln->core.flag&BAM_FUNMAP)) continue; 114 | //printf("Read: %s\t%s\t%i\n", bam_get_qname(aln), get_bamseq(aln), aln->core.l_qseq); 115 | 116 | if(current_seq != aln->core.tid) { 117 | progress = 0; 118 | if(current_seq != -1) { 119 | if(!silent) { 120 | print_step(ap->len_seq, ap->len_seq); 121 | printf("\n"); 122 | } 123 | int i = 0; 124 | for(i = previous_position ; i < ap->len_seq ; i++) 125 | select_base(ap, allali, i, s, readname); 126 | print_seq(s, out); 127 | nb_changes += ap->nb_changes; 128 | nb_hapB += ap->nbhapB; 129 | alipile_free(ap); 130 | alipile_free(allali); 131 | polished_free(s); 132 | hash_free(readname); 133 | previous_position = 0; 134 | ap = alipile_init(changes); 135 | allali = alipile_init(NULL); 136 | s = polished_init(); 137 | readname = hash_init(); 138 | } 139 | if(!silent) printf("Reference sequence : %s\n", header->target_name[aln->core.tid]); 140 | current_seq = aln->core.tid; 141 | ap->name_seq = header->target_name[aln->core.tid]; 142 | s->name_seq = header->target_name[aln->core.tid]; 143 | ap->len_seq = header->target_len[aln->core.tid]; 144 | int len = 0; 145 | ap->seq = faidx_fetch_seq(ref, ap->name_seq, 0, ap->len_seq, &len); 146 | } 147 | 148 | current_position = aln->core.pos; 149 | if(tot_read%10000 == 0 && !silent) print_step(current_position, ap->len_seq); 150 | 151 | //if(!strcmp(header->target_name[aln->core.tid], "HS_assemblymutate") && current_position > 96300 && current_position < 96400) ap->debug = 1; 152 | //else ap->debug = 0; 153 | //ap->debug = 1; 154 | int i; 155 | for(i = previous_position ; i < current_position ; i++) 156 | select_base(ap, allali, i, s, readname); 157 | 158 | // exclude too short alignments 159 | if(get_lseqali(aln) >= 31) { 160 | if(!hash_search(readname, bam_get_qname(aln))) { 161 | add(ap, aln); 162 | added_read++; 163 | } 164 | else { 165 | //printf("======> hapremove %s\n", bam_get_qname(aln)); 166 | hash_delete(readname, bam_get_qname(aln)); 167 | bam_destroy1(aln); 168 | } 169 | add(allali, aln2); 170 | aln = bam_init1(); 171 | aln2 = bam_init1(); 172 | tot_read++; 173 | } else too_short++; 174 | previous_position = current_position; 175 | } 176 | for(i = previous_position ; i < ap->len_seq ; i++) 177 | select_base(ap, allali, i, s, readname); 178 | 179 | if(ap->len_seq != 0) { 180 | print_seq(s, out); 181 | if(!silent) print_step(ap->len_seq, ap->len_seq); 182 | } 183 | nb_changes += ap->nb_changes; 184 | nb_hapB += ap->nbhapB; 185 | printf("\n\nNumber of reads : %i\n", tot_read); 186 | printf("Number of too short alignment : %i\n", too_short); 187 | printf("Number of hapB reads found : %i\n", nb_hapB*2); 188 | printf("Number of changes : %i\n", nb_changes); 189 | 190 | fai_destroy(ref); 191 | bam_hdr_destroy(header); 192 | alipile_free(ap); 193 | alipile_free(allali); 194 | polished_free(s); 195 | hash_free(readname); 196 | bam_destroy1(aln); 197 | bam_destroy1(aln2); 198 | sam_close(in); 199 | fclose(changes); 200 | fclose(out); 201 | return 0; 202 | } 203 | 204 | void print_step(int pos, int len_seq) { 205 | int p = (int)((pos * 100)/ len_seq); 206 | if(p == 99) p = 100; 207 | int r = p, i = 50; 208 | printf("\r|"); 209 | while(p >= 2) { 210 | printf("*"); 211 | p -= 2; 212 | i--; 213 | } 214 | while(i > 0) { printf(" "); i--; } 215 | printf("| %i%%", r); 216 | } 217 | 218 | void usage() { 219 | fprintf(stderr, "--------------------------------------------------------------------------------------------\n"); 220 | fprintf(stderr, "\n"); 221 | fprintf(stderr, "Usage: polish_consensus -b -f -o -c \n\n"); 222 | fprintf(stderr, "Parameters: -b FILE : input BAM file\n"); 223 | fprintf(stderr, " -f FILE : input fasta file\n"); 224 | fprintf(stderr, " -o FILE : output fasta file\n"); 225 | fprintf(stderr, " -c FILE : output file that describes the corrections made.\n"); 226 | fprintf(stderr, " -h : this help\n"); 227 | fprintf(stderr, "\n"); 228 | fprintf(stderr, "--------------------------------------------------------------------------------------------\n"); 229 | exit(0); 230 | } 231 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hapo-G - Haplotype-Aware Polishing of Genomes 2 | 3 | Hapo-G (pronounced like apogee) is a tool that aims to improve the quality of genome assemblies by polishing the consensus with accurate reads. 4 | 5 | Publication : [link](https://academic.oup.com/nargab/article/3/2/lqab034/6262629 "Hapo-G publication") 6 | 7 | In case of troubles when using or installing the software, please open up an issue by clicking [here](https://github.com/institut-de-genomique/Hapo-G/issues/new "Github issue page"). 8 | 9 | 10 | ## Dependencies 11 | 12 | Hapo-G depends on some software and libraries: 13 | - GCC and G++ (Hapo-G has been tested with GCC 4.9.2 and GCC 7.3.0) 14 | - Autoconf with minimum 2.69 version (to build HTSlib) 15 | - Python3 (minimum version 3.6) 16 | - [HTSlib](https://github.com/samtools/htslib "HTSlib github") 17 | - [BioPython](https://biopython.org/wiki/Download "BioPython") 18 | - [BWA](https://github.com/lh3/bwa "BWA") 19 | - [Samtools](https://github.com/samtools/samtools "Samtools") 20 | 21 | 22 | ## Installation 23 | ### Installation with conda 24 | ``` 25 | conda create -n hapog 26 | conda activate hapog 27 | conda install -c bioconda hapog 28 | ``` 29 | 30 | ### Installing from Github 31 | First, clone this repository: 32 | ``` 33 | git clone https://github.com/institut-de-genomique/HAPO-G hapog 34 | ``` 35 | 36 | If htslib is already installed on your system, go to the next point `Build with existing htslib`. If you want Hapo-G to download and install htslib for you, go to the point `Build with a new htslib install` 37 | 38 | #### Build with existing htslib 39 | Building with an existing htslib ensures that Hapo-G and Samtools are using the same version of the library and should reduce compatibility issues. To build with an existing htslib, do: 40 | ``` 41 | cd hapog 42 | bash build.sh -l path_to_htslib 43 | ``` 44 | If samtools is already installed on your system at `/home/user/samtools`, htslib is probably installed at `/home/user/samtools/htslib`. 45 | 46 | #### Build with a new htslib 47 | Hapo-G can download and compile htslib for you, to do so, please run: 48 | ``` 49 | cd hapog 50 | bash build.sh 51 | ``` 52 | 53 | If everything went as expected, a binary of Hapo-G was created in `build/` and a symlink was created in the `bin/` folder 54 | 55 | 56 | ## Using Hapo-G 57 | Before running Hapo-G, you should make sure that BWA and Samtools are in your `$PATH`: 58 | ``` 59 | which bwa 60 | which samtools 61 | ``` 62 | 63 | ### Standard pipeline 64 | You can launch Hapo-G by using the Python3 script in its root directory: 65 | ``` 66 | python3 HAPOG_ROOT/hapog.py \ 67 | --genome assembly.fasta \ # Fasta file of the genome to polish 68 | --pe1 R1.fastq.gz \ # Illumina R1 reads in .fastq or .fastq.gz, can be given multiple times 69 | --pe2 R2.fastq.gz \ # Illumina R2 reads in .fastq or .fastq.gz, can be given multiple times 70 | -o polishing \ # Output directory 71 | -t 36 \ # Number of threads to use 72 | -u # Include unpolished sequences in the output 73 | ``` 74 | 75 | **NOTE**: If you installed Hapo-G using conda, you can invoke it by directly running `hapog`. 76 | 77 | ### Skipping the mapping step 78 | The mapping step can be skipped if a sorted BAM file is provided via the `-b` switch. Please verify that your fasta headers don't contain any non-alphanumerical characters (`-`and `_`are accepted) before launching Hapo-G. 79 | 80 | **IMPORTANT**: The BAM file should not contain secondary alignments, these could lead to non-ACTG characters being introduced in the consensus sequence. You can use Minimap2's option `secondary=no` to produce a SAM file with no secondary alignments. 81 | 82 | A typical command line with a bam file would look like this: 83 | ``` 84 | python3 HAPOG_ROOT/hapog.py \ 85 | --genome assembly.fasta \ # Fasta file of the genome to polish 86 | -b mapping.sorted.bam \ # Sorted BAM file 87 | -o polishing \ # Output directory 88 | -t 36 \ # Number of threads to use 89 | -u # Include unpolished sequences in the output 90 | ``` 91 | 92 | ## Output files 93 | #### `hapog_results/hapog.fasta` 94 | The corrected sequences. Hapo-G will parse the read alignments to the genome and focus on phasing errors (i.e the assembly switched from one haplotype to the other) and base errors (insertions, deletions, mismatches) that may be related or not to phasing errors. Remember to include the `-u` flag to tell Hapo-G to output sequences with no reads mapped and thus could not be changed. 95 | 96 | Hapo-G will not add any new contigs or scaffolds to the assembly if, as an example, one of the haplotype is missing in the input assembly file. Instead, it will correct the haplotype that is present in the input file and output a corrected version of the sequence that is phased as best as we could with the data at hand. 97 | 98 | As an example, let’s consider the following heterozygous genome: 99 | ```text 100 | maternal hap.: ACCGTTA 101 | paternal hap.: ATCGTGA 102 | ``` 103 | If the assembler outputted a version with one phasing error (the C in 2nd position is associated with a G in 6th position) and one deletion in 4th position: 104 | ```text 105 | assembly: ACC-TGA 106 | ``` 107 | Then, if Hapo-G was able to correct all the errors, the `hapog.fasta` file will contain: 108 | ```text 109 | hapog.fasta: ACCGTTA 110 | ``` 111 | 112 | ## `hapog_results/hapog.changes` 113 | This file is a tabulated file that gives more information on what Hapo-G did to the input assembly. It has eight columns that show: 114 | - Name of the input sequence where the change was made 115 | - Position in the input sequence where the change was made 116 | - Nucleotide(s) at the position in column 2 117 | - Nucleotide(s) that will replace the nucleotide(s) shown in column 3 118 | - Name of the read that is used as the current template. In the Hapo-G algorithm, we try to follow a read for as long as possible to not switch haplotypes. If an error is found in the template read, we switch to a different read of the same haplotype 119 | `hetero` if the change is only present in one of the two possible haplotypes (i.e a phasing error). `homo` if the change is present in both haplotypes 120 | - Ratio of reads from the same haplotypes as the template read that validate the changes 121 | - Ratio of reads from both haplotypes that validate the changes 122 | 123 | Here is an example: 124 | ```text 125 | Contig_1 1000 ref=A read=TA readname=read_2 hetero ratio1=0.7419 ratio2=0.4237 126 | Contig_1 2000 ref=T read=G readname=read_2 homo ratio1=0.8142 ratio2=0.8323 127 | ``` 128 | We can see that on the contig `Contig_1`, Hapo-G found a phasing error (`hetero` on the first line) and replaced a A at position 1000 by TA. This change was validated by 74.19% of reads of the same haplotype as the template read (ratio1) and by 42.37% of reads if we do not discriminate on which haplotype they belong to. It also found a mismatch at position 2000 (`homo`) and replaced a T by a G. This change was validated by 83% of reads of no matter which haplotype (ratio2). 129 | 130 | 131 | ## Rerunning specific chunks 132 | When using multiple `--threads`, Hapo-G splits the assembly into numbered chunks (`chunks_1.fasta`, `chunks_2.fasta`, …) and processes each one independently. If a subset of chunks failed, you can rerun only those pieces by passing their numbers as a comma-separated list: 133 | ``` 134 | hapog --chunk-list 3,7,12 \ 135 | -o polishing \ 136 | [other options] 137 | ``` 138 | 139 | Re-run the command with the same output directory that contains the previous run. Hapo-G will reuse the existing mapping data, regenerate only the requested chunk files and refresh the merged `hapog_results/` outputs. 140 | 141 | ## Acknowledgements 142 | 143 | Some Cmake files have been taken and/or modified from several projects. We would like to thank: 144 | - [panguangze](https://delta.cs.cityu.edu.hk/gzpan2) for their `FindHTSLIB.cmake` library 145 | - [L. Kärkkäinen](https://github.com/Tronic) for their `LibFindMacros.cmake` library 146 | -------------------------------------------------------------------------------- /hapog/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from hapog import mapping 3 | from hapog import pipeline 4 | 5 | import argparse 6 | import os 7 | import sys 8 | import time 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser( 13 | prog="hapog", 14 | description="\n\nHapo-G uses alignments produced by BWA (or any other aligner that produces SAM files) to polish the consensus of a genome assembly.", 15 | formatter_class=argparse.RawTextHelpFormatter, 16 | add_help=True, 17 | ) 18 | 19 | mandatory_args = parser.add_argument_group("Mandatory arguments") 20 | mandatory_args.add_argument( 21 | "--genome", 22 | "-g", 23 | action="store", 24 | dest="input_genome", 25 | help="Input genome file to map reads to", 26 | default=None, 27 | required=True, 28 | ) 29 | mandatory_args.add_argument( 30 | "--pe1", 31 | action="append", 32 | dest="pe1", 33 | help="Fastq.gz paired-end file (pair 1, can be given multiple times)", 34 | default=None, 35 | required=False, 36 | ) 37 | mandatory_args.add_argument( 38 | "--pe2", 39 | action="append", 40 | dest="pe2", 41 | help="Fastq.gz paired-end file (pair 2, can be given multiple times)", 42 | default=None, 43 | required=False, 44 | ) 45 | mandatory_args.add_argument( 46 | "--single", 47 | action="store", 48 | dest="long_reads", 49 | help="Use long reads instead of short reads (can only be given one time, please concatenate all read files into one)", 50 | default=None, 51 | required=False, 52 | ) 53 | 54 | optional_args = parser.add_argument_group("Optional arguments") 55 | optional_args.add_argument( 56 | "-b", 57 | action="store", 58 | dest="bam_file", 59 | help="Skip mapping step and provide a sorted bam file. Important: the BAM file must not contain secondary alignments, please use the 'secondary=no' option in Minimap2.", 60 | default="", 61 | required=False, 62 | ) 63 | optional_args.add_argument( 64 | "-u", 65 | action="store_true", 66 | dest="include_unpolished", 67 | help="Include unpolished sequences in final output", 68 | default=False, 69 | required=False, 70 | ) 71 | optional_args.add_argument( 72 | "--output", 73 | "-o", 74 | action="store", 75 | dest="output_dir", 76 | help="Output directory name", 77 | default="hapog_results", 78 | required=False, 79 | ) 80 | optional_args.add_argument( 81 | "--threads", 82 | "-t", 83 | action="store", 84 | dest="threads", 85 | help="Number of threads (used in BWA, Samtools and Hapo-G)", 86 | default="8", 87 | required=False, 88 | ) 89 | optional_args.add_argument( 90 | "--hapog-threads", 91 | action="store", 92 | dest="hapog_threads", 93 | help="Maximum number of Hapo-G jobs to launch in parallel (Defaults to the same value as --threads)", 94 | default=0, 95 | type=int, 96 | required=False, 97 | ) 98 | optional_args.add_argument( 99 | "--bin", 100 | action="store", 101 | dest="hapog_bin", 102 | help="Use a different Hapo-G binary (for debug purposes)", 103 | default=None, 104 | required=False, 105 | ) 106 | optional_args.add_argument( 107 | "--samtools-mem", 108 | action="store", 109 | dest="samtools_mem", 110 | help="Amount of memory to use per samtools thread (Default: '5G')", 111 | default="5G", 112 | required=False, 113 | ) 114 | optional_args.add_argument( 115 | "--chunk-list", 116 | action="store", 117 | dest="chunk_list", 118 | help="Comma-separated list of chunk numbers to process (e.g., '12,18'). Useful for rerunning failed chunks.", 119 | default=None, 120 | required=False, 121 | ) 122 | 123 | args = parser.parse_args() 124 | pipeline.check_dependencies() 125 | 126 | args.input_genome = os.path.abspath(args.input_genome) 127 | args.output_dir = os.path.abspath(args.output_dir) 128 | if args.hapog_threads == 0: 129 | args.hapog_threads = args.threads 130 | if args.bam_file: 131 | args.bam_file = os.path.abspath(args.bam_file) 132 | 133 | # Parse chunk list if provided 134 | chunk_list = None 135 | if args.chunk_list: 136 | try: 137 | chunk_list = [int(x.strip()) for x in args.chunk_list.split(",")] 138 | print(f"\nProcessing only chunks: {chunk_list}", flush=True) 139 | except ValueError: 140 | print( 141 | f"ERROR: Invalid chunk list format. Please use comma-separated numbers (e.g., '12,18')" 142 | ) 143 | sys.exit(1) 144 | 145 | pe1 = [] 146 | pe2 = [] 147 | use_short_reads = False 148 | 149 | if not args.chunk_list: 150 | try: 151 | os.mkdir(args.output_dir) 152 | except: 153 | print( 154 | f"\nOutput directory {args.output_dir} can't be created, please erase it before launching Hapo-G.\n" 155 | ) 156 | sys.exit(1) 157 | 158 | os.mkdir(f"{args.output_dir}/bam") 159 | os.mkdir(f"{args.output_dir}/logs") 160 | os.mkdir(f"{args.output_dir}/cmds") 161 | 162 | if args.bam_file: 163 | mapping.remove_secondary_alignments(args.bam_file, args.output_dir) 164 | args.bam_file = os.path.abspath("no_secondary.bam") 165 | else: 166 | if not args.long_reads and (not args.pe1 or not args.pe2): 167 | print("You need to specify the paths to paired-end or long reads files.") 168 | sys.exit(-1) 169 | 170 | if not args.long_reads: 171 | for pe in args.pe1: 172 | pe1.append(os.path.abspath(pe)) 173 | for pe in args.pe2: 174 | pe2.append(os.path.abspath(pe)) 175 | use_short_reads = True 176 | else: 177 | args.long_reads = os.path.abspath(args.long_reads) 178 | if not os.path.exists(args.long_reads): 179 | print("Long reads not found: %s" % (args.long_reads)) 180 | sys.exit(-1) 181 | 182 | global_start = time.perf_counter() 183 | os.chdir(args.output_dir) 184 | 185 | non_alphanumeric_chars = False 186 | if args.chunk_list and os.path.exists("correspondance.txt"): 187 | non_alphanumeric_chars = True 188 | 189 | if not args.bam_file and not args.chunk_list: 190 | non_alphanumeric_chars = pipeline.check_fasta_headers(args.input_genome) 191 | if non_alphanumeric_chars: 192 | print( 193 | "\nNon alphanumeric characters detected in fasta headers. Renaming sequences.", 194 | flush=True, 195 | ) 196 | pipeline.rename_assembly(args.input_genome) 197 | else: 198 | os.system(f"ln -s {args.input_genome} assembly.fasta") 199 | 200 | if use_short_reads: 201 | mapping.launch_PE_mapping( 202 | "assembly.fasta", pe1, pe2, args.threads, args.samtools_mem 203 | ) 204 | else: 205 | mapping.launch_LR_mapping( 206 | "assembly.fasta", args.long_reads, args.threads, args.samtools_mem 207 | ) 208 | 209 | else: 210 | if not args.chunk_list and pipeline.check_fasta_headers(args.input_genome): 211 | print( 212 | "\nERROR: Non-alphanumeric characters detected in fasta headers will cause samtools view to crash.", 213 | flush=True, 214 | ) 215 | print( 216 | "Please remove these characters before launching Hapo-G with -b or let Hapo-G do the mapping by itself.", 217 | flush=True, 218 | ) 219 | print( 220 | "Authorized characters belong to this list: 'a-z', 'A-Z', '0-9', '_-'.", 221 | flush=True, 222 | ) 223 | sys.exit(-1) 224 | 225 | if not args.chunk_list: 226 | os.system(f"ln -s {args.input_genome} assembly.fasta") 227 | mapping.index_bam() 228 | 229 | if not args.chunk_list and int(args.hapog_threads) > 1: 230 | pipeline.create_chunks("assembly.fasta", args.threads) 231 | pipeline.extract_bam(int(args.threads)) 232 | elif not args.chunk_list: 233 | os.mkdir("chunks") 234 | os.mkdir("chunks_bam") 235 | if non_alphanumeric_chars: 236 | os.system("ln -s ../assembly.fasta chunks/chunks_1.fasta") 237 | else: 238 | os.system(f"ln -s {args.input_genome} chunks/chunks_1.fasta") 239 | os.system(f"ln -s ../bam/aln.sorted.bam chunks_bam/chunks_1.bam") 240 | 241 | pipeline.launch_hapog(args.hapog_bin, args.hapog_threads, chunk_list) 242 | pipeline.merge_results(int(args.threads)) 243 | 244 | if non_alphanumeric_chars: 245 | pipeline.rename_results() 246 | else: 247 | os.system("mv hapog_results/hapog.changes.tmp hapog_results/hapog.changes") 248 | os.system("mv hapog_results/hapog.fasta.tmp hapog_results/hapog.fasta") 249 | 250 | if args.include_unpolished: 251 | pipeline.include_unpolished(args.input_genome) 252 | 253 | print("\nResults can be found in the hapog_results directory") 254 | print(f"Total running time: {int(time.perf_counter() - global_start)} seconds") 255 | print("\nThanks for using Hapo-G, have a great day :-)\n") 256 | -------------------------------------------------------------------------------- /hapog/pipeline.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | 3 | import glob 4 | import os 5 | import shutil 6 | import subprocess 7 | import time 8 | import warnings 9 | 10 | 11 | def is_in_path(tool): 12 | return shutil.which(tool) is not None 13 | 14 | 15 | def check_dependencies(): 16 | missing_dependency = False 17 | print("\nChecking dependencies...", flush=True) 18 | 19 | tools = ["bwa", "samtools"] 20 | for tool in tools: 21 | if not is_in_path(tool): 22 | print(f"\tWARNING: {tool} not found.", flush=True) 23 | missing_dependency = True 24 | else: 25 | print(f"\tFound {tool}", flush=True) 26 | 27 | if missing_dependency: 28 | exit(-1) 29 | 30 | 31 | def check_fasta_headers(genome): 32 | authorized_chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_" 33 | with open(genome) as genome_file: 34 | for line in genome_file: 35 | if line.startswith(">"): 36 | header = line[1:].rstrip("\n") 37 | for char in header: 38 | if char not in authorized_chars: 39 | return True 40 | return False 41 | 42 | 43 | def get_genome_size(genome): 44 | cumul_size = 0 45 | with open(genome) as genome_file: 46 | for record in SeqIO.parse(genome_file, "fasta"): 47 | cumul_size += len(record.seq) 48 | return cumul_size 49 | 50 | 51 | def rename_assembly(genome): 52 | correspondance_file = open("correspondance.txt", "w") 53 | with open("assembly.fasta", "w") as out: 54 | counter = 0 55 | with open(genome) as genome_file: 56 | for line in genome_file: 57 | if line.startswith(">"): 58 | out.write(f">Contig{counter}\n") 59 | correspondance_file.write(f"Contig{counter}\t{line[1:]}") 60 | counter += 1 61 | else: 62 | out.write(line) 63 | correspondance_file.close() 64 | 65 | 66 | def create_chunks(genome, threads): 67 | cumul_size = get_genome_size(genome) 68 | 69 | chunk_size = cumul_size / int(threads) 70 | print( 71 | f"\nFragmenting the genome into {threads} chunks of {int(chunk_size):,} bases (depending of scaffold sizes)", 72 | flush=True, 73 | ) 74 | try: 75 | os.mkdir("chunks") 76 | except: 77 | pass 78 | 79 | current_chunk = 1 80 | current_chunk_size = 0 81 | current_chunk_file = open("chunks/chunks_1.fasta", "w") 82 | current_bed_file = open("chunks/chunks_1.bed", "w") 83 | 84 | start = time.perf_counter() 85 | with warnings.catch_warnings(): 86 | warnings.simplefilter("ignore") 87 | 88 | for record in SeqIO.parse(open(genome), "fasta"): 89 | if current_chunk_size >= chunk_size and current_chunk != threads: 90 | current_chunk_file.close() 91 | current_chunk_file = open(f"chunks/chunks_{current_chunk + 1}.fasta", "w") 92 | current_bed_file = open(f"chunks/chunks_{current_chunk + 1}.bed", "w") 93 | current_chunk += 1 94 | current_chunk_size = 0 95 | 96 | current_chunk_file.write(record.format("fasta")) 97 | current_bed_file.write(f"{record.id}\t0\t{len(record.seq)}\n") 98 | current_chunk_size += len(record.seq) 99 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 100 | 101 | current_chunk_file.close() 102 | current_bed_file.close() 103 | 104 | 105 | def extract_bam(processes): 106 | print("\nExtracting bam for each chunk", flush=True) 107 | try: 108 | os.mkdir("chunks_bam") 109 | except: 110 | pass 111 | 112 | start = time.perf_counter() 113 | 114 | cmds = [] 115 | nb_beds = 0 116 | for bed in glob.glob("chunks/*.bed"): 117 | nb_beds += 1 118 | cmds.append(["samtools", "view", "-ML", bed, "-b", "bam/aln.sorted.bam"]) 119 | 120 | procs = [] 121 | for j in range(0, len(cmds)): 122 | cmd = cmds.pop(0) 123 | bam = "chunks_bam/" + cmd[3].split("/")[1].replace(".bed", ".bam") 124 | with open("cmds/extract_bam.cmds", "a") as cmd_file: 125 | print(" ".join(cmd), flush=True, file=cmd_file) 126 | 127 | # Ignore the ResourceWarning about unclosed files 128 | with warnings.catch_warnings(): 129 | warnings.simplefilter("ignore") 130 | procs.append( 131 | subprocess.Popen( 132 | cmd, 133 | stdout=open(bam, "w"), 134 | stderr=open("logs/samtools_split.e", "a"), 135 | ) 136 | ) 137 | 138 | has_failed = False 139 | for p in procs: 140 | p.wait() 141 | 142 | if p.returncode != 0: 143 | print(f"ERROR: Samtools view didn't finish correctly, return code: {p.returncode}") 144 | print("Faulty command: {p.args}") 145 | has_failed = True 146 | 147 | if has_failed: 148 | exit(1) 149 | 150 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 151 | 152 | 153 | def launch_hapog(hapog_bin, parallel_jobs, chunk_list=None): 154 | if chunk_list: 155 | print(f"\nLaunching Hapo-G on specified chunks: {chunk_list}", flush=True) 156 | else: 157 | print(f"\nLaunching Hapo-G on each chunk", flush=True) 158 | try: 159 | os.mkdir("hapog_chunks") 160 | except: 161 | pass 162 | 163 | start = time.perf_counter() 164 | 165 | script_path = os.path.realpath(__file__).replace("/hapog/pipeline.py", "") 166 | if not hapog_bin: 167 | if not is_in_path("hapog_bin"): 168 | hapog_bin = f"{script_path}/hapog_build/hapog" 169 | else: 170 | hapog_bin = "hapog_bin" 171 | else: 172 | print(f"Using this bin: {hapog_bin}") 173 | 174 | procs = [] 175 | for chunk in glob.glob("chunks/*.fasta"): 176 | chunk_prefix = chunk.split("/")[-1].replace(".fasta", "") 177 | 178 | # Extract chunk number from chunk_prefix (e.g., "chunks_12" -> 12) 179 | chunk_number = int(chunk_prefix.split("_")[-1]) 180 | 181 | # Skip this chunk if chunk_list is specified and this chunk is not in the list 182 | if chunk_list and chunk_number not in chunk_list: 183 | continue 184 | 185 | cmd = [ 186 | hapog_bin, 187 | "-b", 188 | f"chunks_bam/{chunk_prefix}.bam", 189 | "-f", 190 | chunk, 191 | "-o", 192 | f"hapog_chunks/{chunk_prefix}.fasta", 193 | "-c", 194 | f"hapog_chunks/{chunk_prefix}.changes", 195 | ] 196 | with open("cmds/hapog.cmds", "a") as cmd_file: 197 | print(" ".join(cmd), flush=True, file=cmd_file) 198 | 199 | with warnings.catch_warnings(): 200 | warnings.simplefilter("ignore") 201 | procs.append( 202 | subprocess.Popen( 203 | cmd, 204 | stdout=open(f"logs/hapog_{chunk_prefix}.o", "w"), 205 | stderr=open(f"logs/hapog_{chunk_prefix}.e", "w"), 206 | ) 207 | ) 208 | 209 | # Only launch a job if there is less than 'parallel_jobs' running 210 | # Otherwise, wait for any to finish before launching a new one 211 | while len([p for p in procs if p.poll() is None]) >= int(parallel_jobs): 212 | time.sleep(1) 213 | 214 | has_failed = False 215 | for p in procs: 216 | p.wait() 217 | 218 | return_code = p.returncode 219 | if return_code != 0: 220 | print(f"ERROR: Hapo-G didn't finish successfully, exit code: {return_code}") 221 | print("Faulty command: %s" % (" ".join(p.args))) 222 | has_failed = True 223 | 224 | if has_failed: 225 | exit(1) 226 | 227 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 228 | 229 | 230 | def merge_results(threads): 231 | print("\nMerging results", flush=True) 232 | try: 233 | os.mkdir("hapog_results") 234 | except: 235 | pass 236 | 237 | try: 238 | for f in glob.glob("hapog_results/*"): 239 | os.remove(f) 240 | except: 241 | pass 242 | 243 | start = time.perf_counter() 244 | 245 | with open("hapog_results/hapog.fasta.tmp", "w") as out: 246 | for i in range(1, threads + 1): 247 | f = f"hapog_chunks/chunks_{i}.fasta" 248 | if os.path.exists(f): 249 | with open(f, "r") as fd: 250 | shutil.copyfileobj(fd, out) 251 | out.write("\n") 252 | 253 | with open("hapog_results/hapog.changes.tmp", "w") as out: 254 | for i in range(1, threads + 1): 255 | f = f"hapog_chunks/chunks_{i}.changes" 256 | if os.path.exists(f): 257 | with open(f, "r") as fd: 258 | shutil.copyfileobj(fd, out) 259 | out.write("\n") 260 | 261 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 262 | 263 | 264 | def rename_results(): 265 | correspondance_file = open("correspondance.txt") 266 | dict_correspondance = {} 267 | for line in correspondance_file: 268 | new, original = line.strip("\n").split("\t") 269 | dict_correspondance[new] = original 270 | correspondance_file.close() 271 | 272 | with open("hapog_results/hapog.fasta", "w") as out: 273 | hapog_tmp = open("hapog_results/hapog.fasta.tmp") 274 | for record in SeqIO.parse(hapog_tmp, "fasta"): 275 | out.write(f">{dict_correspondance[str(record.id).replace('_polished', '')]}\n{record.seq}\n") 276 | hapog_tmp.close() 277 | 278 | with open("hapog_results/hapog.changes", "w") as out: 279 | hapog_tmp = open("hapog_results/hapog.changes.tmp") 280 | for line in hapog_tmp: 281 | line = line.strip("\n").split("\t") 282 | try: 283 | line[0] = dict_correspondance[line[0]] 284 | except: 285 | continue 286 | line = "\t".join(line) 287 | out.write(f"{line}\n") 288 | hapog_tmp.close() 289 | 290 | for f in glob.glob("assembly.fasta*"): 291 | os.remove(f) 292 | os.remove("hapog_results/hapog.fasta.tmp") 293 | os.remove("hapog_results/hapog.changes.tmp") 294 | 295 | 296 | def include_unpolished(genome): 297 | print("\nWriting unpolished contigs to final output...") 298 | start = time.perf_counter() 299 | 300 | initial_contig_names = set() 301 | if os.path.exists("correspondance.txt"): 302 | with open("correspondance.txt") as corr: 303 | for line in corr: 304 | _, initial = line.strip("\n").split("\t") 305 | initial_contig_names.add(initial) 306 | else: 307 | with open(genome) as genome_file: 308 | for line in genome_file: 309 | if line.startswith(">"): 310 | initial_contig_names.add(line[1:].strip("\n")) 311 | 312 | polished_contig_names = set() 313 | with open("hapog_results/hapog.fasta") as fasta: 314 | for line in fasta: 315 | if line.startswith(">"): 316 | contig_name = line[1:].strip("\n").replace("_polished", "") 317 | polished_contig_names.add(contig_name) 318 | 319 | with open("hapog_results/hapog.fasta", "a") as out: 320 | genome_file = open(genome) 321 | for record in SeqIO.parse(genome_file, "fasta"): 322 | if record.description.replace("_polished", "") not in polished_contig_names: 323 | out.write(record.format("fasta")) 324 | genome_file.close() 325 | 326 | print(f"Done in {int(time.perf_counter() - start)} seconds", flush=True) 327 | -------------------------------------------------------------------------------- /src/LibFindMacros.cmake: -------------------------------------------------------------------------------- 1 | # Version 2.3 2 | # Public Domain, originally written by Lasse Kärkkäinen 3 | # Maintained at https://github.com/Tronic/cmake-modules 4 | # Please send your improvements as pull requests on Github. 5 | 6 | # Find another package and make it a dependency of the current package. 7 | # This also automatically forwards the "REQUIRED" argument. 8 | # Usage: libfind_package( [extra args to find_package]) 9 | macro (libfind_package PREFIX PKG) 10 | set(${PREFIX}_args ${PKG} ${ARGN}) 11 | if (${PREFIX}_FIND_REQUIRED) 12 | set(${PREFIX}_args ${${PREFIX}_args} REQUIRED) 13 | endif() 14 | find_package(${${PREFIX}_args}) 15 | set(${PREFIX}_DEPENDENCIES ${${PREFIX}_DEPENDENCIES};${PKG}) 16 | unset(${PREFIX}_args) 17 | endmacro() 18 | 19 | # A simple wrapper to make pkg-config searches a bit easier. 20 | # Works the same as CMake's internal pkg_check_modules but is always quiet. 21 | macro (libfind_pkg_check_modules) 22 | find_package(PkgConfig QUIET) 23 | if (PKG_CONFIG_FOUND) 24 | pkg_check_modules(${ARGN} QUIET) 25 | endif() 26 | endmacro() 27 | 28 | # Avoid useless copy&pasta by doing what most simple libraries do anyway: 29 | # pkg-config, find headers, find library. 30 | # Usage: libfind_pkg_detect( FIND_PATH [other args] FIND_LIBRARY [other args]) 31 | # E.g. libfind_pkg_detect(SDL2 sdl2 FIND_PATH SDL.h PATH_SUFFIXES SDL2 FIND_LIBRARY SDL2) 32 | function (libfind_pkg_detect PREFIX) 33 | # Parse arguments 34 | set(argname pkgargs) 35 | foreach (i ${ARGN}) 36 | if ("${i}" STREQUAL "FIND_PATH") 37 | set(argname pathargs) 38 | elseif ("${i}" STREQUAL "FIND_LIBRARY") 39 | set(argname libraryargs) 40 | else() 41 | set(${argname} ${${argname}} ${i}) 42 | endif() 43 | endforeach() 44 | if (NOT pkgargs) 45 | message(FATAL_ERROR "libfind_pkg_detect requires at least a pkg_config package name to be passed.") 46 | endif() 47 | # Find library 48 | libfind_pkg_check_modules(${PREFIX}_PKGCONF ${pkgargs}) 49 | if (pathargs) 50 | find_path(${PREFIX}_INCLUDE_DIR NAMES ${pathargs} HINTS ${${PREFIX}_PKGCONF_INCLUDE_DIRS}) 51 | endif() 52 | if (libraryargs) 53 | find_library(${PREFIX}_LIBRARY NAMES ${libraryargs} HINTS ${${PREFIX}_PKGCONF_LIBRARY_DIRS}) 54 | endif() 55 | # Read pkg-config version 56 | if (${PREFIX}_PKGCONF_VERSION) 57 | set(${PREFIX}_VERSION ${${PREFIX}_PKGCONF_VERSION} PARENT_SCOPE) 58 | endif() 59 | endfunction() 60 | 61 | # Extracts a version #define from a version.h file, output stored to _VERSION. 62 | # Usage: libfind_version_header(Foobar foobar/version.h FOOBAR_VERSION_STR) 63 | # Fourth argument "QUIET" may be used for silently testing different define names. 64 | # This function does nothing if the version variable is already defined. 65 | function (libfind_version_header PREFIX VERSION_H DEFINE_NAME) 66 | # Skip processing if we already have a version or if the include dir was not found 67 | if (${PREFIX}_VERSION OR NOT ${PREFIX}_INCLUDE_DIR) 68 | return() 69 | endif() 70 | set(quiet ${${PREFIX}_FIND_QUIETLY}) 71 | # Process optional arguments 72 | foreach(arg ${ARGN}) 73 | if (arg STREQUAL "QUIET") 74 | set(quiet TRUE) 75 | else() 76 | message(AUTHOR_WARNING "Unknown argument ${arg} to libfind_version_header ignored.") 77 | endif() 78 | endforeach() 79 | # Read the header and parse for version number 80 | set(filename "${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") 81 | if (NOT EXISTS ${filename}) 82 | if (NOT quiet) 83 | message(AUTHOR_WARNING "Unable to find ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") 84 | endif() 85 | return() 86 | endif() 87 | file(READ "${filename}" header) 88 | string(REGEX REPLACE ".*#[ \t]*define[ \t]*${DEFINE_NAME}[ \t]*\"([^\n]*)\".*" "\\1" match "${header}") 89 | # No regex match? 90 | if (match STREQUAL header) 91 | if (NOT quiet) 92 | message(AUTHOR_WARNING "Unable to find \#define ${DEFINE_NAME} \"\" from ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") 93 | endif() 94 | return() 95 | endif() 96 | # Export the version string 97 | set(${PREFIX}_VERSION "${match}" PARENT_SCOPE) 98 | endfunction() 99 | 100 | # Do the final processing once the paths have been detected. 101 | # If include dirs are needed, ${PREFIX}_PROCESS_INCLUDES should be set to contain 102 | # all the variables, each of which contain one include directory. 103 | # Ditto for ${PREFIX}_PROCESS_LIBS and library files. 104 | # Will set ${PREFIX}_FOUND, ${PREFIX}_INCLUDE_DIRS and ${PREFIX}_LIBRARIES. 105 | # Also handles errors in case library detection was required, etc. 106 | function (libfind_process PREFIX) 107 | # Skip processing if already processed during this configuration run 108 | if (${PREFIX}_FOUND) 109 | return() 110 | endif() 111 | 112 | set(found TRUE) # Start with the assumption that the package was found 113 | 114 | # Did we find any files? Did we miss includes? These are for formatting better error messages. 115 | set(some_files FALSE) 116 | set(missing_headers FALSE) 117 | 118 | # Shorthands for some variables that we need often 119 | set(quiet ${${PREFIX}_FIND_QUIETLY}) 120 | set(required ${${PREFIX}_FIND_REQUIRED}) 121 | set(exactver ${${PREFIX}_FIND_VERSION_EXACT}) 122 | set(findver "${${PREFIX}_FIND_VERSION}") 123 | set(version "${${PREFIX}_VERSION}") 124 | 125 | # Lists of config option names (all, includes, libs) 126 | unset(configopts) 127 | set(includeopts ${${PREFIX}_PROCESS_INCLUDES}) 128 | set(libraryopts ${${PREFIX}_PROCESS_LIBS}) 129 | 130 | # Process deps to add to 131 | foreach (i ${PREFIX} ${${PREFIX}_DEPENDENCIES}) 132 | if (DEFINED ${i}_INCLUDE_OPTS OR DEFINED ${i}_LIBRARY_OPTS) 133 | # The package seems to export option lists that we can use, woohoo! 134 | list(APPEND includeopts ${${i}_INCLUDE_OPTS}) 135 | list(APPEND libraryopts ${${i}_LIBRARY_OPTS}) 136 | else() 137 | # If plural forms don't exist or they equal singular forms 138 | if ((NOT DEFINED ${i}_INCLUDE_DIRS AND NOT DEFINED ${i}_LIBRARIES) OR 139 | (${i}_INCLUDE_DIR STREQUAL ${i}_INCLUDE_DIRS AND ${i}_LIBRARY STREQUAL ${i}_LIBRARIES)) 140 | # Singular forms can be used 141 | if (DEFINED ${i}_INCLUDE_DIR) 142 | list(APPEND includeopts ${i}_INCLUDE_DIR) 143 | endif() 144 | if (DEFINED ${i}_LIBRARY) 145 | list(APPEND libraryopts ${i}_LIBRARY) 146 | endif() 147 | else() 148 | # Oh no, we don't know the option names 149 | message(FATAL_ERROR "We couldn't determine config variable names for ${i} includes and libs. Aieeh!") 150 | endif() 151 | endif() 152 | endforeach() 153 | 154 | if (includeopts) 155 | list(REMOVE_DUPLICATES includeopts) 156 | endif() 157 | 158 | if (libraryopts) 159 | list(REMOVE_DUPLICATES libraryopts) 160 | endif() 161 | 162 | string(REGEX REPLACE ".*[ ;]([^ ;]*(_INCLUDE_DIRS|_LIBRARIES))" "\\1" tmp "${includeopts} ${libraryopts}") 163 | if (NOT tmp STREQUAL "${includeopts} ${libraryopts}") 164 | message(AUTHOR_WARNING "Plural form ${tmp} found in config options of ${PREFIX}. This works as before but is now deprecated. Please only use singular forms INCLUDE_DIR and LIBRARY, and update your find scripts for LibFindMacros > 2.0 automatic dependency system (most often you can simply remove the PROCESS variables entirely).") 165 | endif() 166 | 167 | # Include/library names separated by spaces (notice: not CMake lists) 168 | unset(includes) 169 | unset(libs) 170 | 171 | # Process all includes and set found false if any are missing 172 | foreach (i ${includeopts}) 173 | list(APPEND configopts ${i}) 174 | if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND") 175 | list(APPEND includes "${${i}}") 176 | else() 177 | set(found FALSE) 178 | set(missing_headers TRUE) 179 | endif() 180 | endforeach() 181 | 182 | # Process all libraries and set found false if any are missing 183 | foreach (i ${libraryopts}) 184 | list(APPEND configopts ${i}) 185 | if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND") 186 | list(APPEND libs "${${i}}") 187 | else() 188 | set (found FALSE) 189 | endif() 190 | endforeach() 191 | 192 | # Version checks 193 | if (found AND findver) 194 | if (NOT version) 195 | message(WARNING "The find module for ${PREFIX} does not provide version information, so we'll just assume that it is OK. Please fix the module or remove package version requirements to get rid of this warning.") 196 | elseif (version VERSION_LESS findver OR (exactver AND NOT version VERSION_EQUAL findver)) 197 | set(found FALSE) 198 | set(version_unsuitable TRUE) 199 | endif() 200 | endif() 201 | 202 | # If all-OK, hide all config options, export variables, print status and exit 203 | if (found) 204 | foreach (i ${configopts}) 205 | mark_as_advanced(${i}) 206 | endforeach() 207 | if (NOT quiet) 208 | message(STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}") 209 | if (LIBFIND_DEBUG) 210 | message(STATUS " ${PREFIX}_DEPENDENCIES=${${PREFIX}_DEPENDENCIES}") 211 | message(STATUS " ${PREFIX}_INCLUDE_OPTS=${includeopts}") 212 | message(STATUS " ${PREFIX}_INCLUDE_DIRS=${includes}") 213 | message(STATUS " ${PREFIX}_LIBRARY_OPTS=${libraryopts}") 214 | message(STATUS " ${PREFIX}_LIBRARIES=${libs}") 215 | endif() 216 | endif() 217 | set (${PREFIX}_INCLUDE_OPTS ${includeopts} PARENT_SCOPE) 218 | set (${PREFIX}_LIBRARY_OPTS ${libraryopts} PARENT_SCOPE) 219 | set (${PREFIX}_INCLUDE_DIRS ${includes} PARENT_SCOPE) 220 | set (${PREFIX}_LIBRARIES ${libs} PARENT_SCOPE) 221 | set (${PREFIX}_FOUND TRUE PARENT_SCOPE) 222 | return() 223 | endif() 224 | 225 | # Format messages for debug info and the type of error 226 | set(vars "Relevant CMake configuration variables:\n") 227 | foreach (i ${configopts}) 228 | mark_as_advanced(CLEAR ${i}) 229 | set(val ${${i}}) 230 | if ("${val}" STREQUAL "${i}-NOTFOUND") 231 | set (val "") 232 | elseif (val AND NOT EXISTS ${val}) 233 | set (val "${val} (does not exist)") 234 | else() 235 | set(some_files TRUE) 236 | endif() 237 | set(vars "${vars} ${i}=${val}\n") 238 | endforeach() 239 | set(vars "${vars}You may use CMake GUI, cmake -D or ccmake to modify the values. Delete CMakeCache.txt to discard all values and force full re-detection if necessary.\n") 240 | if (version_unsuitable) 241 | set(msg "${PREFIX} ${${PREFIX}_VERSION} was found but") 242 | if (exactver) 243 | set(msg "${msg} only version ${findver} is acceptable.") 244 | else() 245 | set(msg "${msg} version ${findver} is the minimum requirement.") 246 | endif() 247 | else() 248 | if (missing_headers) 249 | set(msg "We could not find development headers for ${PREFIX}. Do you have the necessary dev package installed?") 250 | elseif (some_files) 251 | set(msg "We only found some files of ${PREFIX}, not all of them. Perhaps your installation is incomplete or maybe we just didn't look in the right place?") 252 | if(findver) 253 | set(msg "${msg} This could also be caused by incompatible version (if it helps, at least ${PREFIX} ${findver} should work).") 254 | endif() 255 | else() 256 | set(msg "We were unable to find package ${PREFIX}.") 257 | endif() 258 | endif() 259 | 260 | # Fatal error out if REQUIRED 261 | if (required) 262 | set(msg "REQUIRED PACKAGE NOT FOUND\n${msg} This package is REQUIRED and you need to install it or adjust CMake configuration in order to continue building ${CMAKE_PROJECT_NAME}.") 263 | message(FATAL_ERROR "${msg}\n${vars}") 264 | endif() 265 | # Otherwise just print a nasty warning 266 | if (NOT quiet) 267 | message(WARNING "WARNING: MISSING PACKAGE\n${msg} This package is NOT REQUIRED and you may ignore this warning but by doing so you may miss some functionality of ${CMAKE_PROJECT_NAME}. \n${vars}") 268 | endif() 269 | endfunction() 270 | -------------------------------------------------------------------------------- /src/alipile.c: -------------------------------------------------------------------------------- 1 | /// @file alipile.c 2 | /// Pile of alignment and padded alignments 3 | 4 | /* 5 | ################################################################################ 6 | # * Copyright Jean-Marc Aury / Genoscope / DRF / CEA 7 | # * 8 | # * 9 | # * This software is governed by the CeCILL license under French law and 10 | # * abiding by the rules of distribution of free software. You can use, 11 | # * modify and/ or redistribute the software under the terms of the CeCILL 12 | # * license as circulated by CEA, CNRS and INRIA at the following URL 13 | # * "http://www.cecill.info". 14 | # * 15 | # * As a counterpart to the access to the source code and rights to copy, 16 | # * modify and redistribute granted by the license, users are provided only 17 | # * with a limited warranty and the software's author, the holder of the 18 | # * economic rights, and the successive licensors have only limited 19 | # * liability. 20 | # * 21 | # * In this respect, the user's attention is drawn to the risks associated 22 | # * with loading, using, modifying and/or developing or reproducing the 23 | # * software by the user in light of its specific status of free software, 24 | # * that may mean that it is complicated to manipulate, and that also 25 | # * therefore means that it is reserved for developers and experienced 26 | # * professionals having in-depth computer knowledge. Users are therefore 27 | # * encouraged to load and test the software's suitability as regards their 28 | # * requirements in conditions enabling the security of their systems and/or 29 | # * data to be ensured and, more generally, to use and operate it in the 30 | # * same conditions as regards security. 31 | # * 32 | # * The fact that you are presently reading this means that you have had 33 | # * knowledge of the CeCILL license and that you accept its terms. 34 | ################################################################################ 35 | */ 36 | 37 | #include "alipile.h" 38 | 39 | int compare( const void* a, const void* b) { 40 | int int_a = * ( (int*) a ); 41 | int int_b = * ( (int*) b ); 42 | 43 | // an easy expression for comparing 44 | return (int_a > int_b) - (int_a < int_b); 45 | } 46 | 47 | alipile_t *alipile_init(FILE* out) { 48 | alipile_t* ap = (alipile_t*) calloc(1, sizeof(alipile_t)); 49 | ap->max_cov = 500; 50 | ap->current_read = -1; 51 | ap->read_choice = 0; 52 | ap->current_seq = -1; 53 | ap->min_cov = 3; 54 | ap->nb_ali = 0; 55 | ap->debug = 0; 56 | ap->nb_changes = 0; 57 | ap->changes = out; 58 | ap->nbhapB = 0; 59 | return ap; 60 | } 61 | 62 | void alipile_free(alipile_t *ap) { 63 | int i; 64 | for (i = 0; i < ap->nb_ali; i++) free(ap->seqpile[i]); 65 | //free(ap->seqpile); 66 | for (i = 0; i < ap->nb_ali; i++) bam_destroy1(ap->pile[i]); 67 | //free(ap->pile); 68 | //free(ap->name_seq); 69 | free(ap->seq); 70 | free(ap); 71 | } 72 | 73 | void clean(alipile_t *ap, int pos) { 74 | int idread[ap->nb_ali]; 75 | int nbdel = 0, c = 0; 76 | // retrieve all possibilities 77 | for( c = 0 ; c < ap->nb_ali ; c++) { 78 | bam1_t *b = ap->pile[c]; 79 | if(bam_endpos(b) <= pos+10) { 80 | idread[nbdel] = c; 81 | nbdel++; 82 | } 83 | } 84 | for( c = nbdel-1 ; c >= 0 ; c-- ) 85 | delete(ap, idread[c]); 86 | } 87 | 88 | void add(alipile_t *ap, bam1_t *aln) { 89 | int c = 0, smallest = -1, value = 0; 90 | clean(ap, aln->core.pos); 91 | 92 | //need to delete an alignment 93 | if(ap->nb_ali == ap->max_cov) { 94 | for( c = 0, value = 0 ; c < ap->nb_ali ; c++) { 95 | bam1_t *b = ap->pile[c]; 96 | int size = bam_endpos(b) - aln->core.pos; 97 | if(smallest == -1 || size < value) { smallest = c; value = size; } 98 | } 99 | delete(ap, smallest); 100 | } 101 | 102 | //add the new alignment 103 | ap->pile[ap->nb_ali] = aln; 104 | ap->seqpile[ap->nb_ali] = get_seqali(aln); 105 | //printf("->add: %s\tseq=%s\tnbreads=%i\tcurrentpos=%i\n", bam_get_qname(aln), ap->seqpile[ap->nb_ali], ap->nb_ali, aln->core.pos); 106 | ap->nb_ali++; 107 | 108 | // if current_read has been deleted, need to select a new read 109 | if(ap->current_read == -1 || smallest == ap->current_read) 110 | change_currentread(ap, aln->core.pos); 111 | } 112 | 113 | void change_currentread(alipile_t *ap, int pos) { 114 | int c = 0, smallest = -1, value = 0; 115 | for( c = 0 ; c < ap->nb_ali ; c++) { 116 | bam1_t *b = ap->pile[c]; 117 | int size = bam_endpos(b) - pos; 118 | if(smallest == -1 || size < value) { smallest = c; value = size; } 119 | } 120 | ap->current_read = smallest; 121 | } 122 | 123 | void change_currentread2(alipile_t *ap, int pos) { 124 | int c = 0, smallest = -1; 125 | float value = 0.0; 126 | for( c = 0 ; c < ap->nb_ali ; c++) { 127 | char *nt = get_base(ap, c, pos); 128 | float ratio = get_ratio_base(ap, nt, pos); 129 | free(nt); 130 | if(smallest == -1 || ratio > value) { smallest = c; value = ratio; } 131 | } 132 | ap->current_read = smallest; 133 | } 134 | 135 | void delete(alipile_t *ap, int c) { 136 | assert( c < ap->nb_ali ); 137 | bam1_t *remove = ap->pile[c]; 138 | //printf("->remove: %s\tseq=%s\tindex=%i\n", bam_get_qname(remove), get_bamseq(remove), elem); 139 | bam_destroy1(remove); 140 | free(ap->seqpile[c]); 141 | while(c+1 < ap->max_cov) { 142 | ap->pile[c] = ap->pile[c+1]; 143 | ap->seqpile[c] = ap->seqpile[c+1]; 144 | c++; 145 | } 146 | ap->nb_ali--; 147 | if(ap->current_read >= ap->nb_ali) 148 | ap->current_read = 0; 149 | } 150 | 151 | int get_lseqali(bam1_t *aln) { 152 | int k, c; 153 | uint32_t *cigar = bam_get_cigar(aln); 154 | //calculate size of ali string 155 | for (k = 0, c = 0; k < aln->core.n_cigar; ++k) { 156 | int op, ol; 157 | op = bam_cigar_op(cigar[k]); 158 | ol = bam_cigar_oplen(cigar[k]); 159 | if(op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || 160 | op == BAM_CINS || op == BAM_CDEL) c+=ol; 161 | } 162 | return c; 163 | } 164 | 165 | char* get_seqali(bam1_t *aln) { 166 | int i, j, k, c, qlen = 0; 167 | uint32_t *cigar = bam_get_cigar(aln); 168 | uint8_t *seq = bam_get_seq(aln); 169 | int length = get_lseqali(aln); 170 | char *r = malloc((length+1) * sizeof(char)); 171 | 172 | for (k = 0, j = 0, c = 0; k < aln->core.n_cigar; ++k) { 173 | int op, ol; 174 | op = bam_cigar_op(cigar[k]); 175 | ol = bam_cigar_oplen(cigar[k]); 176 | assert(c <= length); 177 | if(op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { 178 | for (i = 0; i < ol; ++i, ++j, ++c) r[c] = seq_nt16_str[bam_seqi(seq, j)]; 179 | } 180 | if(op == BAM_CSOFT_CLIP) { 181 | for (i = 0; i < ol; ++i, ++j); 182 | } 183 | if(op == BAM_CINS) { 184 | for (i = 0; i < ol; ++i, ++j, ++c) r[c] = tolower(seq_nt16_str[bam_seqi(seq, j)]); 185 | } 186 | if(op == BAM_CDEL) { 187 | for (i = 0; i < ol; ++i, ++c) r[c] = '-'; 188 | } 189 | } 190 | r[length]='\0'; 191 | return r; 192 | } 193 | 194 | char* get_bamseq(bam1_t *aln) { 195 | int i = 0; 196 | const bam1_core_t *c = &aln->core; 197 | char *cp = malloc(c->l_qseq + 1); 198 | uint8_t *s = bam_get_seq(aln); 199 | for (i = 0; i < c->l_qseq; i++) 200 | cp[i] = seq_nt16_str[bam_seqi(s, i)]; 201 | cp[i] = '\0'; 202 | 203 | return cp; 204 | } 205 | 206 | char* get_base(alipile_t *ap, int read, int pos) { 207 | char* region = malloc(100000 * sizeof(char)); 208 | region[0]= '\0'; 209 | char* aliseq = ap->seqpile[read]; 210 | bam1_t* aln = ap->pile[read]; 211 | int i = 0, c = 0, len = strlen(aliseq); 212 | for (i = aln->core.pos , c = 0; c < len ; c++) { 213 | char nt = aliseq[c]; 214 | if(i == pos) { 215 | int l = strlen(region); 216 | if(l == 0 || nt != '-') { 217 | region[l] = nt; 218 | region[l+1] = '\0'; 219 | } 220 | } 221 | // lowercase indicate insertion in read 222 | if(!(nt >= 'a' && nt <= 'z')) i++; 223 | } 224 | return region; 225 | } 226 | 227 | float get_ratio_base(alipile_t *ap, char* str, int pos) { 228 | int i = 0, nb_id = 0, tot = 0; 229 | for( i = 0 ; i < ap->nb_ali ; i++ ) { 230 | char *seq = get_base(ap, i, pos); 231 | bam1_t *b = ap->pile[i]; 232 | // check if the alignment start before the current base minus the size of the insertion 233 | if(b->core.pos < pos - strlen(str)) { 234 | if(strcmp(str, seq) == 0) nb_id++; 235 | tot++; 236 | } 237 | free(seq); 238 | } 239 | return ((float)nb_id/(float)tot); 240 | } 241 | 242 | void removehapB_read(alipile_t *ap, char* str, int pos, hash_t* readname) { 243 | int i = 0, j = 0, id = -1, nbdiff = 0, vu = 0; 244 | if(ap->debug) printf("**** Remove hapB **** hapA=%s read=%s, seqpile=%s aliseq=%s\n", str, bam_get_qname(ap->pile[ap->current_read]), ap->seqpile[ap->current_read], get_seqali(ap->pile[ap->current_read])); 245 | char *seen[ap->nb_ali]; 246 | int nbocc[ap->nb_ali]; 247 | int idread[ap->nb_ali][ap->nb_ali]; 248 | // retrieve all possibilities 249 | for( i = 0 ; i < ap->nb_ali ; i++ ) { 250 | char *seq = get_base(ap, i, pos); 251 | vu = 0; 252 | for( j = 0 ; j < nbdiff ; j++) 253 | if(strcmp(seen[j], seq) == 0) { vu = 1; idread[j][nbocc[j]]=i; nbocc[j]++; } 254 | if(!vu) { 255 | seen[nbdiff] = seq; 256 | idread[nbdiff][0]=i; 257 | nbocc[nbdiff] = 1; 258 | if(strcmp(str, seq) == 0) id = nbdiff; 259 | nbdiff++; 260 | } else { free(seq); } 261 | } 262 | // erase reads from hapB 263 | int nbdelread = 0, delread[ap->nb_ali]; 264 | for( j = 0 ; j < nbdiff ; j++) { 265 | float ratio = ((float)nbocc[j] / (float)ap->nb_ali); 266 | if(ap->debug) 267 | printf("======> found base=%s\t nbocc=%i\t ratio=%.4f\n", 268 | seen[j], nbocc[j], ratio); 269 | if( j == id ) continue; 270 | for( i = nbocc[j]-1 ; i >= 0 ; i-- ) { 271 | bam1_t *aln = ap->pile[idread[j][i]]; 272 | 273 | if(ap->debug) 274 | printf("======> erase readid= %s\t base=%s\t ratio=%.4f\n", 275 | bam_get_qname(aln), seen[j], ratio); 276 | 277 | if(!hash_search(readname, bam_get_qname(aln))) { 278 | hash_insert(readname, bam_get_qname(aln)); 279 | ap->nbhapB++; 280 | } else 281 | hash_delete(readname, bam_get_qname(aln)); 282 | 283 | delread[nbdelread++] = idread[j][i]; 284 | } 285 | } 286 | qsort(delread, nbdelread, sizeof(int), compare); 287 | for( i = nbdelread - 1 ; i >= 0 ; i--) delete(ap, delread[i]); 288 | 289 | for( j = 0 ; j < nbdiff ; j++) free(seen[j]); 290 | } 291 | 292 | void select_base(alipile_t *ap, alipile_t *allali, int current_pos, 293 | polished_t *s, hash_t* readname) { 294 | char ref[2] = {toupper(ap->seq[current_pos])}; 295 | char *read = NULL; 296 | int ret = 0; 297 | 298 | // clean the pile 299 | clean(ap, current_pos); 300 | clean(allali, current_pos); 301 | 302 | // coverage is too low, keep reference sequence 303 | if(ap->nb_ali < ap->min_cov) { 304 | // coverage is too low in both pile, keep reference sequence 305 | if(ap->nb_ali == 0 || allali->nb_ali < ap->min_cov) { 306 | if(ap->debug) 307 | printf("==> seq=%s pos=%i base_ref=%s nbali=%i [COV TOO LOW]\n", 308 | ap->name_seq, current_pos, ref, ap->nb_ali); 309 | add_str(s, ref); 310 | ret = 1; 311 | } else { 312 | read = get_base(ap, ap->current_read, current_pos); 313 | float ratio = get_ratio_base(ap, read, current_pos); 314 | float ratioall = get_ratio_base(allali, read, current_pos); 315 | if(ratioall >= 0.8) { 316 | if(strcmp(read, "-") != 0) add_str(s, read); 317 | if(strcmp(ref, read) != 0) { 318 | fprintf(ap->changes, "%s\t%i\tref=%s\tread=%s\treadname=%s\thomo\tratio1=%.4f\tratio2=%.4f\n", 319 | ap->name_seq, current_pos, ref, read, 320 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 321 | 322 | ap->nb_changes++; 323 | if(ap->debug) 324 | printf("%s\t%i\tref=%s\tread=%s\treadname=%s\thomo-lowcov\tratio1=%.4f\tratio2=%.4f\n", 325 | ap->name_seq, current_pos, ref, read, 326 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 327 | } 328 | ret = 1; 329 | } 330 | 331 | } 332 | } 333 | 334 | if(!ret) { 335 | read = get_base(ap, ap->current_read, current_pos); 336 | 337 | float ratio = get_ratio_base(ap, read, current_pos); 338 | float ratioall = get_ratio_base(allali, read, current_pos); 339 | if(ap->debug) 340 | printf("==> seq=%s pos=%i base_ref=%s base_read=%s ratio=%f ratioall=%f nbali=%i read=%s\n", 341 | ap->name_seq, current_pos, ref, read, ratio, ratioall, ap->nb_ali, 342 | bam_get_qname(ap->pile[ap->current_read])); 343 | 344 | // keep read base (homo diff) 345 | if(!ret && ratioall >= 0.8 && (ratio * ap->nb_ali) > ap->min_cov) { 346 | if(strcmp(read, "-") != 0) add_str(s, read); 347 | if(strcmp(ref, read) != 0) { 348 | fprintf(ap->changes, "%s\t%i\tref=%s\tread=%s\treadname=%s\thomo\tratio1=%.4f\tratio2=%.4f\n", 349 | ap->name_seq, current_pos, ref, read, 350 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 351 | 352 | ap->nb_changes++; 353 | if(ap->debug) 354 | printf("%s\t%i\tref=%s\tread=%s\treadname=%s\thomo\tratio1=%.4f\tratio2=%.4f\n", 355 | ap->name_seq, current_pos, ref, read, 356 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 357 | } 358 | ret = 1; 359 | } 360 | 361 | // keep read base (hetero diff) 362 | if(!ret && ap->nb_ali > 2*ap->min_cov && ratioall > 0.2 && ratioall < 0.8) { 363 | // if both ratios are equal (first hetero variation), we select the haplotype with the highest coverage 364 | float r = (ratiocurrent_read, current_pos); 369 | if(ap->debug) 370 | printf("CHANGE %s\t%i\tref=%s\tread=%s\treadname=%s\thetero\tratio1=%.4f\tratio2=%.4f\n", 371 | ap->name_seq, current_pos, ref, read, 372 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 373 | } 374 | 375 | removehapB_read(ap, read, current_pos, readname); 376 | if(strcmp(read, "-") != 0) add_str(s, read); 377 | if(strcmp(ref, read) != 0) { 378 | fprintf(ap->changes, "%s\t%i\tref=%s\tread=%s\treadname=%s\thetero\tratio1=%.4f\tratio2=%.4f\n", 379 | ap->name_seq, current_pos, ref, read, 380 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 381 | ap->nb_changes++; 382 | if(ap->debug) 383 | printf("%s\t%i\tref=%s\tread=%s\treadname=%s\thetero\tratio1=%.4f\tratio2=%.4f\n", 384 | ap->name_seq, current_pos, ref, read, 385 | bam_get_qname(ap->pile[ap->current_read]), ratio, ratioall); 386 | } 387 | ret = 1; 388 | } 389 | 390 | // sequencing error, delete the read and select a new read 391 | if(!ret && ratioall <= 0.2) { 392 | delete(ap, ap->current_read); 393 | ap->current_read = -1; 394 | change_currentread2(ap, current_pos); 395 | return select_base(ap, allali, current_pos, s, readname); 396 | } 397 | 398 | // special case : keep reference base 399 | if(!ret) add_str(s, ref); 400 | } 401 | 402 | free(read); 403 | } 404 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | CeCILL FREE SOFTWARE LICENSE AGREEMENT 3 | 4 | Version 2.1 dated 2013-06-21 5 | 6 | 7 | Notice 8 | 9 | This Agreement is a Free Software license agreement that is the result 10 | of discussions between its authors in order to ensure compliance with 11 | the two main principles guiding its drafting: 12 | 13 | * firstly, compliance with the principles governing the distribution 14 | of Free Software: access to source code, broad rights granted to users, 15 | * secondly, the election of a governing law, French law, with which it 16 | is conformant, both as regards the law of torts and intellectual 17 | property law, and the protection that it offers to both authors and 18 | holders of the economic rights over software. 19 | 20 | The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) 21 | license are: 22 | 23 | Commissariat à l'énergie atomique et aux énergies alternatives - CEA, a 24 | public scientific, technical and industrial research establishment, 25 | having its principal place of business at 25 rue Leblanc, immeuble Le 26 | Ponant D, 75015 Paris, France. 27 | 28 | Centre National de la Recherche Scientifique - CNRS, a public scientific 29 | and technological establishment, having its principal place of business 30 | at 3 rue Michel-Ange, 75794 Paris cedex 16, France. 31 | 32 | Institut National de Recherche en Informatique et en Automatique - 33 | Inria, a public scientific and technological establishment, having its 34 | principal place of business at Domaine de Voluceau, Rocquencourt, BP 35 | 105, 78153 Le Chesnay cedex, France. 36 | 37 | 38 | Preamble 39 | 40 | The purpose of this Free Software license agreement is to grant users 41 | the right to modify and redistribute the software governed by this 42 | license within the framework of an open source distribution model. 43 | 44 | The exercising of this right is conditional upon certain obligations for 45 | users so as to preserve this status for all subsequent redistributions. 46 | 47 | In consideration of access to the source code and the rights to copy, 48 | modify and redistribute granted by the license, users are provided only 49 | with a limited warranty and the software's author, the holder of the 50 | economic rights, and the successive licensors only have limited liability. 51 | 52 | In this respect, the risks associated with loading, using, modifying 53 | and/or developing or reproducing the software by the user are brought to 54 | the user's attention, given its Free Software status, which may make it 55 | complicated to use, with the result that its use is reserved for 56 | developers and experienced professionals having in-depth computer 57 | knowledge. Users are therefore encouraged to load and test the 58 | suitability of the software as regards their requirements in conditions 59 | enabling the security of their systems and/or data to be ensured and, 60 | more generally, to use and operate it in the same conditions of 61 | security. This Agreement may be freely reproduced and published, 62 | provided it is not altered, and that no provisions are either added or 63 | removed herefrom. 64 | 65 | This Agreement may apply to any or all software for which the holder of 66 | the economic rights decides to submit the use thereof to its provisions. 67 | 68 | Frequently asked questions can be found on the official website of the 69 | CeCILL licenses family (http://www.cecill.info/index.en.html) for any 70 | necessary clarification. 71 | 72 | 73 | Article 1 - DEFINITIONS 74 | 75 | For the purpose of this Agreement, when the following expressions 76 | commence with a capital letter, they shall have the following meaning: 77 | 78 | Agreement: means this license agreement, and its possible subsequent 79 | versions and annexes. 80 | 81 | Software: means the software in its Object Code and/or Source Code form 82 | and, where applicable, its documentation, "as is" when the Licensee 83 | accepts the Agreement. 84 | 85 | Initial Software: means the Software in its Source Code and possibly its 86 | Object Code form and, where applicable, its documentation, "as is" when 87 | it is first distributed under the terms and conditions of the Agreement. 88 | 89 | Modified Software: means the Software modified by at least one 90 | Contribution. 91 | 92 | Source Code: means all the Software's instructions and program lines to 93 | which access is required so as to modify the Software. 94 | 95 | Object Code: means the binary files originating from the compilation of 96 | the Source Code. 97 | 98 | Holder: means the holder(s) of the economic rights over the Initial 99 | Software. 100 | 101 | Licensee: means the Software user(s) having accepted the Agreement. 102 | 103 | Contributor: means a Licensee having made at least one Contribution. 104 | 105 | Licensor: means the Holder, or any other individual or legal entity, who 106 | distributes the Software under the Agreement. 107 | 108 | Contribution: means any or all modifications, corrections, translations, 109 | adaptations and/or new functions integrated into the Software by any or 110 | all Contributors, as well as any or all Internal Modules. 111 | 112 | Module: means a set of sources files including their documentation that 113 | enables supplementary functions or services in addition to those offered 114 | by the Software. 115 | 116 | External Module: means any or all Modules, not derived from the 117 | Software, so that this Module and the Software run in separate address 118 | spaces, with one calling the other when they are run. 119 | 120 | Internal Module: means any or all Module, connected to the Software so 121 | that they both execute in the same address space. 122 | 123 | GNU GPL: means the GNU General Public License version 2 or any 124 | subsequent version, as published by the Free Software Foundation Inc. 125 | 126 | GNU Affero GPL: means the GNU Affero General Public License version 3 or 127 | any subsequent version, as published by the Free Software Foundation Inc. 128 | 129 | EUPL: means the European Union Public License version 1.1 or any 130 | subsequent version, as published by the European Commission. 131 | 132 | Parties: mean both the Licensee and the Licensor. 133 | 134 | These expressions may be used both in singular and plural form. 135 | 136 | 137 | Article 2 - PURPOSE 138 | 139 | The purpose of the Agreement is the grant by the Licensor to the 140 | Licensee of a non-exclusive, transferable and worldwide license for the 141 | Software as set forth in Article 5 <#scope> hereinafter for the whole 142 | term of the protection granted by the rights over said Software. 143 | 144 | 145 | Article 3 - ACCEPTANCE 146 | 147 | 3.1 The Licensee shall be deemed as having accepted the terms and 148 | conditions of this Agreement upon the occurrence of the first of the 149 | following events: 150 | 151 | * (i) loading the Software by any or all means, notably, by 152 | downloading from a remote server, or by loading from a physical medium; 153 | * (ii) the first time the Licensee exercises any of the rights granted 154 | hereunder. 155 | 156 | 3.2 One copy of the Agreement, containing a notice relating to the 157 | characteristics of the Software, to the limited warranty, and to the 158 | fact that its use is restricted to experienced users has been provided 159 | to the Licensee prior to its acceptance as set forth in Article 3.1 160 | <#accepting> hereinabove, and the Licensee hereby acknowledges that it 161 | has read and understood it. 162 | 163 | 164 | Article 4 - EFFECTIVE DATE AND TERM 165 | 166 | 167 | 4.1 EFFECTIVE DATE 168 | 169 | The Agreement shall become effective on the date when it is accepted by 170 | the Licensee as set forth in Article 3.1 <#accepting>. 171 | 172 | 173 | 4.2 TERM 174 | 175 | The Agreement shall remain in force for the entire legal term of 176 | protection of the economic rights over the Software. 177 | 178 | 179 | Article 5 - SCOPE OF RIGHTS GRANTED 180 | 181 | The Licensor hereby grants to the Licensee, who accepts, the following 182 | rights over the Software for any or all use, and for the term of the 183 | Agreement, on the basis of the terms and conditions set forth hereinafter. 184 | 185 | Besides, if the Licensor owns or comes to own one or more patents 186 | protecting all or part of the functions of the Software or of its 187 | components, the Licensor undertakes not to enforce the rights granted by 188 | these patents against successive Licensees using, exploiting or 189 | modifying the Software. If these patents are transferred, the Licensor 190 | undertakes to have the transferees subscribe to the obligations set 191 | forth in this paragraph. 192 | 193 | 194 | 5.1 RIGHT OF USE 195 | 196 | The Licensee is authorized to use the Software, without any limitation 197 | as to its fields of application, with it being hereinafter specified 198 | that this comprises: 199 | 200 | 1. permanent or temporary reproduction of all or part of the Software 201 | by any or all means and in any or all form. 202 | 203 | 2. loading, displaying, running, or storing the Software on any or all 204 | medium. 205 | 206 | 3. entitlement to observe, study or test its operation so as to 207 | determine the ideas and principles behind any or all constituent 208 | elements of said Software. This shall apply when the Licensee 209 | carries out any or all loading, displaying, running, transmission or 210 | storage operation as regards the Software, that it is entitled to 211 | carry out hereunder. 212 | 213 | 214 | 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS 215 | 216 | The right to make Contributions includes the right to translate, adapt, 217 | arrange, or make any or all modifications to the Software, and the right 218 | to reproduce the resulting software. 219 | 220 | The Licensee is authorized to make any or all Contributions to the 221 | Software provided that it includes an explicit notice that it is the 222 | author of said Contribution and indicates the date of the creation thereof. 223 | 224 | 225 | 5.3 RIGHT OF DISTRIBUTION 226 | 227 | In particular, the right of distribution includes the right to publish, 228 | transmit and communicate the Software to the general public on any or 229 | all medium, and by any or all means, and the right to market, either in 230 | consideration of a fee, or free of charge, one or more copies of the 231 | Software by any means. 232 | 233 | The Licensee is further authorized to distribute copies of the modified 234 | or unmodified Software to third parties according to the terms and 235 | conditions set forth hereinafter. 236 | 237 | 238 | 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION 239 | 240 | The Licensee is authorized to distribute true copies of the Software in 241 | Source Code or Object Code form, provided that said distribution 242 | complies with all the provisions of the Agreement and is accompanied by: 243 | 244 | 1. a copy of the Agreement, 245 | 246 | 2. a notice relating to the limitation of both the Licensor's warranty 247 | and liability as set forth in Articles 8 and 9, 248 | 249 | and that, in the event that only the Object Code of the Software is 250 | redistributed, the Licensee allows effective access to the full Source 251 | Code of the Software for a period of at least three years from the 252 | distribution of the Software, it being understood that the additional 253 | acquisition cost of the Source Code shall not exceed the cost of the 254 | data transfer. 255 | 256 | 257 | 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE 258 | 259 | When the Licensee makes a Contribution to the Software, the terms and 260 | conditions for the distribution of the resulting Modified Software 261 | become subject to all the provisions of this Agreement. 262 | 263 | The Licensee is authorized to distribute the Modified Software, in 264 | source code or object code form, provided that said distribution 265 | complies with all the provisions of the Agreement and is accompanied by: 266 | 267 | 1. a copy of the Agreement, 268 | 269 | 2. a notice relating to the limitation of both the Licensor's warranty 270 | and liability as set forth in Articles 8 and 9, 271 | 272 | and, in the event that only the object code of the Modified Software is 273 | redistributed, 274 | 275 | 3. a note stating the conditions of effective access to the full source 276 | code of the Modified Software for a period of at least three years 277 | from the distribution of the Modified Software, it being understood 278 | that the additional acquisition cost of the source code shall not 279 | exceed the cost of the data transfer. 280 | 281 | 282 | 5.3.3 DISTRIBUTION OF EXTERNAL MODULES 283 | 284 | When the Licensee has developed an External Module, the terms and 285 | conditions of this Agreement do not apply to said External Module, that 286 | may be distributed under a separate license agreement. 287 | 288 | 289 | 5.3.4 COMPATIBILITY WITH OTHER LICENSES 290 | 291 | The Licensee can include a code that is subject to the provisions of one 292 | of the versions of the GNU GPL, GNU Affero GPL and/or EUPL in the 293 | Modified or unmodified Software, and distribute that entire code under 294 | the terms of the same version of the GNU GPL, GNU Affero GPL and/or EUPL. 295 | 296 | The Licensee can include the Modified or unmodified Software in a code 297 | that is subject to the provisions of one of the versions of the GNU GPL, 298 | GNU Affero GPL and/or EUPL and distribute that entire code under the 299 | terms of the same version of the GNU GPL, GNU Affero GPL and/or EUPL. 300 | 301 | 302 | Article 6 - INTELLECTUAL PROPERTY 303 | 304 | 305 | 6.1 OVER THE INITIAL SOFTWARE 306 | 307 | The Holder owns the economic rights over the Initial Software. Any or 308 | all use of the Initial Software is subject to compliance with the terms 309 | and conditions under which the Holder has elected to distribute its work 310 | and no one shall be entitled to modify the terms and conditions for the 311 | distribution of said Initial Software. 312 | 313 | The Holder undertakes that the Initial Software will remain ruled at 314 | least by this Agreement, for the duration set forth in Article 4.2 <#term>. 315 | 316 | 317 | 6.2 OVER THE CONTRIBUTIONS 318 | 319 | The Licensee who develops a Contribution is the owner of the 320 | intellectual property rights over this Contribution as defined by 321 | applicable law. 322 | 323 | 324 | 6.3 OVER THE EXTERNAL MODULES 325 | 326 | The Licensee who develops an External Module is the owner of the 327 | intellectual property rights over this External Module as defined by 328 | applicable law and is free to choose the type of agreement that shall 329 | govern its distribution. 330 | 331 | 332 | 6.4 JOINT PROVISIONS 333 | 334 | The Licensee expressly undertakes: 335 | 336 | 1. not to remove, or modify, in any manner, the intellectual property 337 | notices attached to the Software; 338 | 339 | 2. to reproduce said notices, in an identical manner, in the copies of 340 | the Software modified or not. 341 | 342 | The Licensee undertakes not to directly or indirectly infringe the 343 | intellectual property rights on the Software of the Holder and/or 344 | Contributors, and to take, where applicable, vis-à-vis its staff, any 345 | and all measures required to ensure respect of said intellectual 346 | property rights of the Holder and/or Contributors. 347 | 348 | 349 | Article 7 - RELATED SERVICES 350 | 351 | 7.1 Under no circumstances shall the Agreement oblige the Licensor to 352 | provide technical assistance or maintenance services for the Software. 353 | 354 | However, the Licensor is entitled to offer this type of services. The 355 | terms and conditions of such technical assistance, and/or such 356 | maintenance, shall be set forth in a separate instrument. Only the 357 | Licensor offering said maintenance and/or technical assistance services 358 | shall incur liability therefor. 359 | 360 | 7.2 Similarly, any Licensor is entitled to offer to its licensees, under 361 | its sole responsibility, a warranty, that shall only be binding upon 362 | itself, for the redistribution of the Software and/or the Modified 363 | Software, under terms and conditions that it is free to decide. Said 364 | warranty, and the financial terms and conditions of its application, 365 | shall be subject of a separate instrument executed between the Licensor 366 | and the Licensee. 367 | 368 | 369 | Article 8 - LIABILITY 370 | 371 | 8.1 Subject to the provisions of Article 8.2, the Licensee shall be 372 | entitled to claim compensation for any direct loss it may have suffered 373 | from the Software as a result of a fault on the part of the relevant 374 | Licensor, subject to providing evidence thereof. 375 | 376 | 8.2 The Licensor's liability is limited to the commitments made under 377 | this Agreement and shall not be incurred as a result of in particular: 378 | (i) loss due the Licensee's total or partial failure to fulfill its 379 | obligations, (ii) direct or consequential loss that is suffered by the 380 | Licensee due to the use or performance of the Software, and (iii) more 381 | generally, any consequential loss. In particular the Parties expressly 382 | agree that any or all pecuniary or business loss (i.e. loss of data, 383 | loss of profits, operating loss, loss of customers or orders, 384 | opportunity cost, any disturbance to business activities) or any or all 385 | legal proceedings instituted against the Licensee by a third party, 386 | shall constitute consequential loss and shall not provide entitlement to 387 | any or all compensation from the Licensor. 388 | 389 | 390 | Article 9 - WARRANTY 391 | 392 | 9.1 The Licensee acknowledges that the scientific and technical 393 | state-of-the-art when the Software was distributed did not enable all 394 | possible uses to be tested and verified, nor for the presence of 395 | possible defects to be detected. In this respect, the Licensee's 396 | attention has been drawn to the risks associated with loading, using, 397 | modifying and/or developing and reproducing the Software which are 398 | reserved for experienced users. 399 | 400 | The Licensee shall be responsible for verifying, by any or all means, 401 | the suitability of the product for its requirements, its good working 402 | order, and for ensuring that it shall not cause damage to either persons 403 | or properties. 404 | 405 | 9.2 The Licensor hereby represents, in good faith, that it is entitled 406 | to grant all the rights over the Software (including in particular the 407 | rights set forth in Article 5 <#scope>). 408 | 409 | 9.3 The Licensee acknowledges that the Software is supplied "as is" by 410 | the Licensor without any other express or tacit warranty, other than 411 | that provided for in Article 9.2 <#good-faith> and, in particular, 412 | without any warranty as to its commercial value, its secured, safe, 413 | innovative or relevant nature. 414 | 415 | Specifically, the Licensor does not warrant that the Software is free 416 | from any error, that it will operate without interruption, that it will 417 | be compatible with the Licensee's own equipment and software 418 | configuration, nor that it will meet the Licensee's requirements. 419 | 420 | 9.4 The Licensor does not either expressly or tacitly warrant that the 421 | Software does not infringe any third party intellectual property right 422 | relating to a patent, software or any other property right. Therefore, 423 | the Licensor disclaims any and all liability towards the Licensee 424 | arising out of any or all proceedings for infringement that may be 425 | instituted in respect of the use, modification and redistribution of the 426 | Software. Nevertheless, should such proceedings be instituted against 427 | the Licensee, the Licensor shall provide it with technical and legal 428 | expertise for its defense. Such technical and legal expertise shall be 429 | decided on a case-by-case basis between the relevant Licensor and the 430 | Licensee pursuant to a memorandum of understanding. The Licensor 431 | disclaims any and all liability as regards the Licensee's use of the 432 | name of the Software. No warranty is given as regards the existence of 433 | prior rights over the name of the Software or as regards the existence 434 | of a trademark. 435 | 436 | 437 | Article 10 - TERMINATION 438 | 439 | 10.1 In the event of a breach by the Licensee of its obligations 440 | hereunder, the Licensor may automatically terminate this Agreement 441 | thirty (30) days after notice has been sent to the Licensee and has 442 | remained ineffective. 443 | 444 | 10.2 A Licensee whose Agreement is terminated shall no longer be 445 | authorized to use, modify or distribute the Software. However, any 446 | licenses that it may have granted prior to termination of the Agreement 447 | shall remain valid subject to their having been granted in compliance 448 | with the terms and conditions hereof. 449 | 450 | 451 | Article 11 - MISCELLANEOUS 452 | 453 | 454 | 11.1 EXCUSABLE EVENTS 455 | 456 | Neither Party shall be liable for any or all delay, or failure to 457 | perform the Agreement, that may be attributable to an event of force 458 | majeure, an act of God or an outside cause, such as defective 459 | functioning or interruptions of the electricity or telecommunications 460 | networks, network paralysis following a virus attack, intervention by 461 | government authorities, natural disasters, water damage, earthquakes, 462 | fire, explosions, strikes and labor unrest, war, etc. 463 | 464 | 11.2 Any failure by either Party, on one or more occasions, to invoke 465 | one or more of the provisions hereof, shall under no circumstances be 466 | interpreted as being a waiver by the interested Party of its right to 467 | invoke said provision(s) subsequently. 468 | 469 | 11.3 The Agreement cancels and replaces any or all previous agreements, 470 | whether written or oral, between the Parties and having the same 471 | purpose, and constitutes the entirety of the agreement between said 472 | Parties concerning said purpose. No supplement or modification to the 473 | terms and conditions hereof shall be effective as between the Parties 474 | unless it is made in writing and signed by their duly authorized 475 | representatives. 476 | 477 | 11.4 In the event that one or more of the provisions hereof were to 478 | conflict with a current or future applicable act or legislative text, 479 | said act or legislative text shall prevail, and the Parties shall make 480 | the necessary amendments so as to comply with said act or legislative 481 | text. All other provisions shall remain effective. Similarly, invalidity 482 | of a provision of the Agreement, for any reason whatsoever, shall not 483 | cause the Agreement as a whole to be invalid. 484 | 485 | 486 | 11.5 LANGUAGE 487 | 488 | The Agreement is drafted in both French and English and both versions 489 | are deemed authentic. 490 | 491 | 492 | Article 12 - NEW VERSIONS OF THE AGREEMENT 493 | 494 | 12.1 Any person is authorized to duplicate and distribute copies of this 495 | Agreement. 496 | 497 | 12.2 So as to ensure coherence, the wording of this Agreement is 498 | protected and may only be modified by the authors of the License, who 499 | reserve the right to periodically publish updates or new versions of the 500 | Agreement, each with a separate number. These subsequent versions may 501 | address new issues encountered by Free Software. 502 | 503 | 12.3 Any Software distributed under a given version of the Agreement may 504 | only be subsequently distributed under the same version of the Agreement 505 | or a subsequent version, subject to the provisions of Article 5.3.4 506 | <#compatibility>. 507 | 508 | 509 | Article 13 - GOVERNING LAW AND JURISDICTION 510 | 511 | 13.1 The Agreement is governed by French law. The Parties agree to 512 | endeavor to seek an amicable solution to any disagreements or disputes 513 | that may arise during the performance of the Agreement. 514 | 515 | 13.2 Failing an amicable solution within two (2) months as from their 516 | occurrence, and unless emergency proceedings are necessary, the 517 | disagreements or disputes shall be referred to the Paris Courts having 518 | jurisdiction, by the more diligent Party. 519 | --------------------------------------------------------------------------------