├── explainer.pdf
├── package.sh
├── README.md
├── repro.sh
├── retr.sh
├── pnasresearcharticle.sty
├── skewdb-readme
├── pnas-new.cls
├── gcskew-article.py
├── skewdb-uses.tex
├── references.bib
└── gcskew-article-sd.tex


/explainer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berthubert/skewdb-articles/master/explainer.pdf


--------------------------------------------------------------------------------
/package.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Bundle the SkewDB distribution into antonie2/genomes/auto/skewdb.tar.bz2.
3 | # Run from the repository root; expects gcskewdb.csv, skplot.csv and the
4 | # per-sequence *_fit.csv files to already exist in antonie2/genomes/auto.
5 | 
6 | cp skewdb-readme antonie2/genomes/auto/README || exit 1
7 | cd antonie2/genomes/auto || exit 1  # never run tar in the wrong directory
8 | tar cjf skewdb.tar.bz2 README gcskewdb.csv skplot.csv *_fit.csv
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GC skew articles
2 | I am working on two GC skew articles. This repository contains the TeX
3 | source for both. In addition, there is code to create all the graphs and
4 | numerical claims found in the articles.
5 | 
6 | Finally, there is a complete reproduction setup that recreates the software
7 | and the database from scratch.
8 | 
9 | 


--------------------------------------------------------------------------------
/repro.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Rebuild the SkewDB from scratch: build the Antonie tools, download the
 3 | # genomes with retr.sh, run the analysis tools and finally the article script.
 4 | # Run from the root of this repository, with antonie2 checked out next to it
 5 | # (see the commented-out 'git clone' below).
 6 | 
 7 | # apt-get install libboost-dev
 8 | #git clone https://github.com/berthubert/antonie2.git
 9 | cd antonie2 || exit 1   # a failed cd would run everything below in the wrong place
10 | git pull
11 | make -j4 skfit gcstats genehisto || exit 1   # the analysis needs these binaries
12 | mkdir -p genomes        # -p: do not complain when re-running in an existing tree
13 | cd genomes || exit 1
14 | ../../retr.sh auto
15 | 
16 | echo Starting analysis - will take several hours
17 | 
18 | tar xzf new_taxdump.tar.gz fullnamelineage.dmp
19 | cd auto || exit 1
20 | 
21 | ../../gcstats ../fullnamelineage.dmp *.fna.gz
22 | ../../skfit
23 | ../../genehisto *.fna.gz
24 | cp genomes.csv genomes-$(date +"%Y-%m-%d").csv
25 | # diff against previous one
26 | # diff -uBb <(sort genomes-2021-10-01.csv | cut -f1,2 -d\;) <(sort genomes-2021-10-11.csv | cut -f1,2 -d\;) | grep ^\+ | cut -f2 -d\;
27 | echo *.fna.gz | xargs -n 5000 -P 4 zgrep ^\> > manifest.txt
28 | cd ../../.. || exit 1   # back to the repository root, where the article script lives
29 | python3 gcskew-article.py
30 | 


--------------------------------------------------------------------------------
/retr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | rsync --times --progress -zv rsync://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz . && \
 4 | rsync --times -zv rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt bacteria_assembly_summary.txt && \
 5 | rsync --times -zv rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/assembly_summary.txt archaea_assembly_summary.txt && \
 6 | cat bacteria_assembly_summary.txt archaea_assembly_summary.txt > assembly_summary.txt && \
 7 | awk -F "\t" '$12=="Complete Genome" && $11=="latest"{print $20}' assembly_summary.txt > ftpdirpaths && \
 8 | awk 'BEGIN{FS=OFS="/";filesuffix="genomic.fna.gz"}{ftpdir=$0;asm=$10;file=asm"_"filesuffix;print ftpdir,file}' ftpdirpaths | sed s,ftp://,rsync://, >  ftpfilepaths && \
 9 | awk 'BEGIN{FS=OFS="/";filesuffix="genomic.gff.gz"}{ftpdir=$0;asm=$10;file=asm"_"filesuffix;print ftpdir,file}' ftpdirpaths | sed s,ftp://,rsync://, >>  ftpfilepaths && \
10 | sort < ftpfilepaths | grep ^rsync:// >  ftpfilepaths.srt && \
11 | 
12 | cat > wrsync <<EOM
13 | #!/bin/sh
14 | DEST=\$1
15 | shift
16 | rsync --times -v \$@ \$DEST
17 | EOM
18 | chmod +x ./wrsync
19 | 
20 | cat ftpfilepaths.srt | xargs -P 3 -n 500 ./wrsync $1
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/pnasresearcharticle.sty:
--------------------------------------------------------------------------------
 1 | %%% PNAS two column research article  style file
 2 | %%% For use with pnas-new.cls
 3 | \NeedsTeXFormat{LaTeX2e}
 4 | \ProvidesPackage{pnasresearcharticle}[2018/05/06 v1.3 PNAS two column research article style]
 5 | 
 6 | %% Set whether the abstract is set into the first column
 7 | \setboolean{shortarticle}{true} 
 8 | % true = set into first column
 9 | % false = spans page width
10 | 
11 | %% Set colors
12 | \definecolor{color2}{RGB}{130,0,0} % dark red; note pnas-new.cls also defines color2, as RGB 16,131,16
13 | 
14 | %% Set up the first page footnote/fact box here
15 | \RequirePackage{float}
16 | \floatstyle{plain}
17 | \newfloat{sigstatement}{b!}{sst}
18 | 
19 | \additionalelement{%
20 | \afterpage{\begin{sigstatement}
21 | \sffamily
22 | \mdfdefinestyle{pnassigstyle}{linewidth=0.7pt,backgroundcolor=pnasblueback,linecolor=pnasbluetext,fontcolor=pnasbluetext,innertopmargin=6pt,innerrightmargin=6pt,innerbottommargin=6pt,innerleftmargin=6pt}
23 | \@ifundefined{@significancestatement}{}{%	
24 | 	\begin{mdframed}[style=pnassigstyle]%
25 | 	\section*{Significance Statement}%
26 | 	\@significancestatement
27 | 	\end{mdframed}}
28 | %     \medskip
29 | \scriptsize
30 | \@ifundefined{@authorcontributions}{}{\@authorcontributions}
31 | \vskip5pt%
32 | \@ifundefined{@authordeclaration}{}{\@authordeclaration}
33 | \vskip5pt%
34 | \@ifundefined{@equalauthors}{}{\@equalauthors}
35 | \vskip5pt%
36 | \@ifundefined{@correspondingauthor}{}{\@correspondingauthor}
37 | \end{sigstatement}}
38 | }
39 | 
40 | \endinput


--------------------------------------------------------------------------------
/skewdb-readme:
--------------------------------------------------------------------------------
  1 | This is an archival copy of the SkewDB.
  2 | 
  3 | Details of this database can be found on:
  4 | 
  5 | 	* https://skewdb.org/
  6 | 	* https://berthub.eu/articles/posts/skewdb-an-open-database-of-gc-and-other-microbial-skews/
  7 | 	* https://doi.org/10.1101/2021.09.09.459602 ("SkewDB: A comprehensive database of GC and 10 other skews for over 28,000 chromosomes and plasmids")
  8 | 
  9 | In this document you'll find an abstract with a high-level description, an
 10 | explanation of data sources & regeneration details, followed by a per-field
 11 | description of the files in the distribution.
 12 | 
 13 | # Abstract
 14 | 
 15 | GC skew denotes the relative excess of G nucleotides over C nucleotides on
 16 | the leading versus the lagging replication strand of eubacteria.  While the
 17 | effect is small, typically around 2.5%, it is robust and pervasive.  GC skew
 18 | and the analogous TA skew are a localized deviation from Chargaff's second
 19 | parity rule, which states that G and C, and T and A occur with (mostly)
 20 | equal frequency even within a strand.  Most bacteria also show the analogous
 21 | TA skew.  
 22 | 
 23 | Different phyla show different kinds of skew and differing relations between
 24 | TA and GC skew.  This article introduces an open access database
 25 | (https://skewdb.org) of GC and 10 other skews for over 28,000 chromosomes
 26 | and plasmids.
 27 | 
 28 | Further details like codon bias, strand bias, strand lengths and taxonomic
 29 | data are also included.  The SkewDB database can be used to generate or
 30 | verify hypotheses.  Since the origins of both the second parity rule, as
 31 | well as GC skew itself, are not yet satisfactorily explained, such a
 32 | database may enhance our understanding of microbial DNA.
 33 | 
 34 | # Sources & Regeneration
 35 | As explained more fully in the preprint, all data is sourced from the NCBI
 36 | genome repository. No further data is required. This whole database can be
 37 | recreated using the open source Antonie software and the 'repro.sh' script
 38 | from https://github.com/berthubert/skewdb-articles/blob/master/repro.sh
 39 | 
 40 | # Contents
 41 | This distribution contains the following files:
 42 | 
 43 |  * gcskewdb.csv: one line per DNA sequence, containing a high-level description of skews and biases, plus phylogenetic data
 44 |  * skplot.csv: raw skew data for all DNA sequences, at 4096-nucleotide intervals
 45 |  * ...fit.csv: one file per DNA sequence, containing the data from skplot.csv, but also plotted fits of all the skews
 46 | 
 47 | 
 48 | gcskewdb has the following defined fields:
 49 | 
 50 | name		Name of DNA sequence (symbolic, like NC_123234.1)
 51 | fullname	Full name of sequence, often including strain. Sourced from FASTA
 52 | a/c/g/tcount	Number of 'A/C/G/T' nucleotides in sequence
 53 | plasmid		Set to 1 if this is a plasmid
 54 | realm1/2/3/4/5	Phylogenetic information at 5 levels	
 55 | protgenecount	Total nucleotides found in coding regions
 56 | stopTAG		Number of stop codons that are TAG
 57 | stopTAA		Number of stop codons that are TAA
 58 | stopTGA		Number of stop codons that are TGA
 59 | stopXXX		Number of stop codons that are something else
 60 | startATG	Number of start codons that are ATG	
 61 | startGTG	Number of start codons that are GTG	
 62 | startTTG	Number of start codons that are TTG	
 63 | startXXX	Number of start codons that are something else
 64 | dnaApos		Locus of the dnaA gene in the DNA sequence, -1 if not found
 65 | dnaAsense	Sense of the dnaA gene
 66 | siz		Size of DNA sequence in nucleotides
 67 | gccount		Equal to gcount+ccount
 68 | ngcount		Number of nucleotides outside of protein coding regions
 69 | a/c/g/tcounts2	Number of A/C/G/T nucleotides in the final codon position
 70 | alpha1gc	GC excess ratio per nucleotide, leading strand
 71 | alpha2gc	CG excess ratio per nucleotide, lagging strand
 72 | shift		Position in DNA sequence where the leading strand starts
 73 | div		Relative length of the leading strand versus genome length
 74 | alpha1/2ta	AT/TA excess ratio per nucleotide, leading/lagging strand
 75 | alpha1/2sb	Excess ratio of coding nucleotides, leading/lagging strand
 76 | alpha1gc0/1/2	Excess ratio of GC on 1st, 2nd, 3rd codon positions, leading strand
 77 | alpha2gc0/1/2	Excess ratio of CG on 1st, 2nd, 3rd codon positions, lagging strand
 78 | alpha1ta0/1/2	Excess ratio of TA on 1st, 2nd, 3rd codon positions, leading strand
 79 | alpha2ta0/1/2	Excess ratio of AT on 1st, 2nd, 3rd codon positions, lagging strand
 80 | alpha1gcNG	Excess ratio of GC on non-protein coding nucleotides, leading strand
 81 | alpha2gcNG	Excess ratio of CG on non-protein coding nucleotides, lagging strand
 82 | alpha1taNG	Excess ratio of TA on non-protein coding nucleotides, leading strand
 83 | alpha2taNG	Excess ratio of AT on non-protein coding nucleotides, lagging strand
 84 | rmsGC,TA,SB	Root mean squared error of fits	
 85 | rmsGC0/1/2	Root mean squared error of fits	
 86 | rmsTA0/1/2	Root mean squared error of fits	
 87 | rmsGC/TANG	Root mean squared error of fits	
 88 | gccontent	GC% of DNA sequence - equal to (gcount+ccount)/siz
 89 | a/c/g/tfrac	Fraction of nucleotides that are A, C, G or T
 90 | leada/c/g/tfrac	Fraction of leading strand coding nucleotides that are A, C, G or T
 91 | laga/c/g/tfrac  Fraction of lagging strand coding nucleotides that are A, C, G or T	
 92 | 
 93 | For historical reasons some other fields are also present; these should not
 94 | be used until they are defined here.
 95 | 
 96 | The raw, unmodelled, skews are available in skplot.csv, at 4096 nucleotide
 97 | resolution, with the following fields:
 98 | 
 99 | name		Name of this DNA sequence
100 | relpos		Relative position in sequence
101 | abspos		Absolute position in sequence
102 | gc/taskew	Cumulative GC/TA skews, in raw nucleotides
103 | gcskew0/1/2	Cumulative GC skew on 1st, 2nd and 3rd codon positions of coding nucleotides	
104 | gcskewNG	Cumulative GC skew on non-protein coding nucleotides
105 | taskew0/1/2	Cumulative TA skew on 1st, 2nd and 3rd codon positions of coding nucleotides	
106 | taskewNG	Cumulative TA skew on non-protein coding nucleotides
107 | pospos		Cumulative excess of positive sense genes (for strand bias)	
108 | gccount		Cumulative count of GC nucleotides
109 | ngcount		Cumulative count of non-protein coding nucleotides
110 | a/c/g/tcounts0	Counts of A/C/G/T nucleotides on first codon position of protein coding nucleotides
111 | a/c/g/tcounts1	Counts of A/C/G/T nucleotides on second codon position of protein coding nucleotides
112 | a/c/g/tcounts2	Counts of A/C/G/T nucleotides on third codon position of protein coding nucleotides
113 | 
114 | Per DNA sequence, there is a _fit.csv file, named after the 'name' field in gcskewdb.csv (for example NC_123234.1_fit.csv).
115 | The _fit.csv files contain the following fields:
116 | 
117 | pos		Position in genome (as relative to the FASTA). Data is provided at 4096 nucleotide intervals.	
118 | gcskew		Cumulative GC skew up to this point	
119 | predgcskew	Predicted cumulative GC skew based on the fit, up to this point	
120 | taskew		Same, but for TA
121 | predtaskew	Same, but for TA
122 | sbskew		Same, but for Strand Bias
123 | predsbskew	Same, but for Strand Bias
124 | gc0/1/2skew	Same but for GC skew on 0/1/2 codon position
125 | predgc0/1/2skew	Same but for GC skew on 0/1/2 codon position
126 | ta0/1/2skew	Same but for TA skew on 0/1/2 codon position
127 | predta0/1/2skew	Same but for TA skew on 0/1/2 codon position
128 | gcNGskew	Same but for GC skew on non-protein coding nucleotides
129 | predgcNGskew	Same but for GC skew on non-protein coding nucleotides
130 | taNGskew	Same but for TA skew on non-protein coding nucleotides
131 | predtaNGskew	Same but for TA skew on non-protein coding nucleotides
132 | predleading	Set to 1 if this position (locus) is modelled to be on the leading strand
133 | 
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/pnas-new.cls:
--------------------------------------------------------------------------------
  1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  2 | % pnas-new.cls, v1.44, 2018/05/06
  3 | %
  4 | % This class file enables authors to prepare research 
  5 | % articles for submission to PNAS.
  6 | % 
  7 | % Please note that whilst this template provides a 
  8 | % preview of the typeset manuscript for submission, it 
  9 | % will not necessarily be the final publication layout.
 10 | %
 11 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 12 | % This work may be distributed and/or modified under the
 13 | % conditions of the LaTeX Project Public License, either
 14 | % version 1.3 of this license or any later version.
 15 | % The latest version of this license is in
 16 | % http://www.latex-project.org/lppl.txt and
 17 | % version 1.3 or later is part of all distributions
 18 | % of LaTeX version 2005/12/01 or later.
 19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 20 | %
 21 | % For use with latex+dvipdfm, pdflatex, xelatex & lualatex 
 22 | % For compiling with plain latex, please use latex+dvipdfm 
 23 | % to produce the PDF, not dvis -> ps -> pdf nor dvipdf
 24 | % 
 25 | \NeedsTeXFormat{LaTeX2e}
 26 | \ProvidesClass{pnas-new}[2018/05/06, v1.44]
 27 | \AtEndOfClass{\RequirePackage{microtype}}
 28 | % Option for line numbers
 29 | \newif\if@pnaslineno
 30 | \DeclareOption{lineno}{\@pnaslinenotrue}
 31 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{extarticle}}
 32 | \ProcessOptions*
 33 | \LoadClass{extarticle}
 34 | 
 35 | %% Fonts and language
 36 | \RequirePackage[utf8]{inputenc}
 37 | \RequirePackage[english]{babel}
 38 | \RequirePackage{amsmath,amsfonts,amssymb}
 39 | \RequirePackage{lmodern}
 40 | \RequirePackage[scaled]{helvet}
 41 | \RequirePackage[T1]{fontenc}
 42 | \RequirePackage{lettrine} % For dropped capitals
 43 | 
 44 | %% For the Significance Statement & footnote on the first page
 45 | \RequirePackage{afterpage}
 46 | \RequirePackage{ifpdf,ifxetex}
 47 | \ifpdf\else
 48 |   \ifxetex\else
 49 |     \def\pgfsysdriver{pgfsys-dvipdfm.def}
 50 |     \pdfpagewidth=\paperwidth
 51 |     \pdfpageheight=\paperheight
 52 | \fi\fi
 53 | \RequirePackage{xcolor}
 54 | \RequirePackage{tikz}
 55 | \RequirePackage[framemethod=tikz]{mdframed}
 56 | 
 57 | %% Hyperlinking
 58 | \RequirePackage[colorlinks=true, allcolors=blue]{hyperref}
 59 | 
 60 | %% Set up main title page fonts 
 61 | \newcommand{\headerfont}{\normalfont\sffamily\fontsize{7}{9} \selectfont}
 62 | \newcommand{\footerfont}{\normalfont\sffamily\fontsize{7}{9} \selectfont}
 63 | \newcommand{\titlefont}{\fontfamily{lmss}\bfseries\fontsize{22pt}{24pt}\selectfont}
 64 | \newcommand{\dropcapfont}{\fontfamily{lmss}\bfseries\fontsize{26pt}{28pt}\selectfont}
 65 | \newcommand{\datesfont}{\normalfont\sffamily\fontsize{7}{8}\selectfont}
 66 | \newcommand{\absfont}{\normalfont\sffamily\bfseries\fontsize{8}{11}\selectfont}
 67 | \newcommand{\keywordsfont}{\normalfont\rmfamily\fontsize{7}{10}\selectfont}
 68 | \newcommand{\copyrightfont}{\normalfont\rmfamily\fontsize{6}{8}\selectfont}
 69 | 
 70 | %% Set URL link color & font
 71 | \renewcommand\UrlFont{\color{black}\sffamily}
 72 | 
 73 | %% Author and affiliation
 74 | \RequirePackage{authblk}
 75 | \setlength{\affilsep}{8.5pt} % 16.5pts between base of author line and base of affil line
 76 | \renewcommand\Authfont{\color{color0}\normalfont\sffamily\bfseries\fontsize{9}{11}\selectfont}
 77 | \renewcommand\Affilfont{\color{color0}\normalfont\sffamily\fontsize{7}{8}\selectfont}
 78 | 
 79 | \renewcommand\AB@affilsepx{; \protect\Affilfont}
 80 | 
 81 | \renewcommand\Authands{, and }
 82 | 
 83 | %% Choose template type
 84 | \newcommand*{\templatetype}[1]{%
 85 |   \RequirePackage{#1}}
 86 | 
 87 | %% Options for element switching
 88 | \RequirePackage{xifthen}
 89 | \newboolean{shortarticle}
 90 | \newboolean{singlecolumn}
 91 | 
 92 | %% For numbering just one line of an equation
 93 | \newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
 94 | 
 95 | %% Watermark 
 96 | \newboolean{displaywatermark}
 97 | \setboolean{displaywatermark}{false} % Set to false to remove the watermark
 98 | \AtBeginDocument{%
 99 |   \ifthenelse{\boolean{displaywatermark}}{%
100 |   \RequirePackage{draftwatermark}
101 |   \SetWatermarkAngle{45}
102 |   \SetWatermarkColor{gray!20}
103 |   \SetWatermarkFontSize{3cm}
104 |   \SetWatermarkText{{\fontfamily{bch}\bfseries DRAFT}}
105 | }{}
106 | }
107 | 
108 | %% Copyright statement (not used)
109 | \newboolean{displaycopyright}
110 | \setboolean{displaycopyright}{false} % Confirmed as not required
111 | \RequirePackage{textcomp} % For copyright symbol styling
112 | \newcommand{\copyrightstatement}{\, \textcopyright\, 2015 by The National Academy of Sciences of the USA}
113 | 
114 | %% Graphics, tables and other formatting
115 | \RequirePackage{graphicx,xcolor}
116 | \RequirePackage{colortbl}
117 | \RequirePackage{booktabs}
118 | \RequirePackage{algorithm}
119 | \RequirePackage[noend]{algpseudocode}
120 | \RequirePackage{changepage}
121 | \RequirePackage[twoside,%
122 | 				letterpaper,includeheadfoot,%
123 | 				layoutsize={8.125in,10.875in},%
124 |                 layouthoffset=0.1875in,%
125 |                 layoutvoffset=0.0625in,%
126 |                 left=38.5pt,%
127 |                 right=43pt,%
128 |                 top=43pt,% 10pt provided by headsep
129 |                 bottom=32pt,%
130 |                 headheight=0pt,% No Header
131 |                 headsep=10pt,%
132 |                 footskip=25pt,
133 |                 marginparwidth=38pt]{geometry}
134 | \RequirePackage[labelfont={bf,sf},%
135 |                 labelsep=period,%
136 |                 figurename=Fig.]{caption}
137 | \setlength{\columnsep}{13.5pt} % Distance between the two columns of text
138 | \setlength{\parindent}{12pt} % Paragraph indent
139 | 
140 | %% Set document color scheme
141 | \definecolor{black50}{gray}{0.5} % 50% black for hrules
142 | \definecolor{color0}{RGB}{0,0,0} % Base
143 | \definecolor{color1}{RGB}{59,90,198} % author email, doi
144 | \definecolor{color2}{RGB}{16,131,16} %
145 | % For sig statement box
146 | \definecolor{pnasbluetext}{RGB}{0,101,165} %
147 | \definecolor{pnasblueback}{RGB}{205,217,235} %
148 | %\definecolor{pnasbluetext}{RGB}{0,115,209} % Not used
149 | %\definecolor{pnasblueback}{RGB}{210,230,247} % Not used
150 | 
151 | %% Bibliography
152 | %% v1.41 (15/11/2016)
153 | %% Use pnas-new.bst if available
154 | %% If not, try to use pnas2011.bst instead
155 | %% If both aren't available, fall back to
156 | %% unsrtnat
157 | \RequirePackage[numbers,sort&compress,merge,round]{natbib}
158 | \setlength{\bibsep}{0.0pt}
159 | \IfFileExists{pnas-new.bst}{
160 |   \bibliographystyle{pnas-new}
161 | }{
162 |   \IfFileExists{pnas2011.bst}{
163 |     \bibliographystyle{pnas2011}
164 |     \PackageWarning{pnas-new}{pnas-new.bst not found; using pnas2011.bst instead}
165 |   }{
166 |     \bibliographystyle{unsrtnat}
167 |     \PackageWarning{pnas-new}{pnas-new.bst and pnas2011.bst not found; falling back to unsrtnat instead}
168 |   }
169 | }
170 | \renewcommand{\bibsection}{} % Remove header
171 | \renewcommand\bibfont{\normalfont\sffamily\fontsize{6}{8}\selectfont} % set font to be sans serif
172 | 
173 |  
174 | \renewcommand\@biblabel[1]{ #1.} % Remove brackets from label
175 | \def\tagform@#1{\maketag@@@{\bfseries(\ignorespaces#1\unskip\@@italiccorr)}}
176 | \renewcommand{\eqref}[1]{\textup{{\normalfont Eq.~(\ref{#1}}\normalfont)}}
177 | 
178 | 
179 | %% Figure caption style
180 | \DeclareCaptionFormat{pnasformat}{\normalfont\sffamily\fontsize{7}{9}\selectfont#1#2#3}
181 | \captionsetup*{format=pnasformat}
182 | 
183 | %% Table style
184 | \RequirePackage{etoolbox}
185 | \captionsetup*[table]{labelfont+={small},textfont+={small,sf,bf},skip=10pt,position=above}
186 | % booktabs provide nice spacing, but rule widths and distances need fixing
187 | \setlength{\heavyrulewidth}{0.5pt}
188 | \setlength{\lightrulewidth}{0.5pt}
189 | \setlength{\aboverulesep}{1.5pt}
190 | \setlength{\belowrulesep}{1.5pt}
191 | \setlength{\belowbottomsep}{10pt}
192 | \AtBeginEnvironment{tabular}{
193 | \sffamily\fontsize{7.5}{10}\selectfont
194 | }
195 | \newcommand{\addtabletext}[1]{{\setlength{\leftskip}{9pt}\fontsize{7}{9}\selectfont#1}}
196 | 
197 | %% Equation numbering - use square brackets (replaces the bold, parenthesised \tagform@ defined earlier in this class)
198 | 
199 | \renewcommand\tagform@[1]{\maketag@@@ {[\ignorespaces #1\unskip \@@italiccorr ]}}
200 | 
201 | 
202 | %% Headers and footers
203 | \RequirePackage{fancyhdr}  % custom headers/footers
204 | \RequirePackage{lastpage}  % Number of pages in the document
205 | \pagestyle{fancy}          % Enables the custom headers/footers
206 | 
207 | \fancypagestyle{firststyle}{
208 |    \fancyfoot[R]{\footerfont BioRXiv\hspace{7pt}|\hspace{7pt}\textbf{\today}\hspace{7pt}|\hspace{7pt}vol. XXX\hspace{7pt}|\hspace{7pt}no. XX\hspace{7pt}|\hspace{7pt}\textbf{\thepage\textendash\pageref{LastPage}}}
209 |    \fancyfoot[L]{\footerfont\@ifundefined{@doi}{}{\@doi}}
210 | }
211 | 
212 | 
213 | % Headers
214 | \fancyhead[LE,RO]{}
215 | \fancyhead[LO,RE]{}
216 | % Footers
217 | \lfoot{}%
218 | \cfoot{}%
219 | \rfoot{}%
220 | 
221 | \fancyfoot[LE]{\footerfont\textbf{\thepage}\hspace{7pt}|\hspace{7pt}\@ifundefined{@doi}{}{\@doi}}
222 | \fancyfoot[RO]{\footerfont BioRXiv\hspace{7pt}|\hspace{7pt}\textbf{\today}\hspace{7pt}|\hspace{7pt}vol. XXX\hspace{7pt}|\hspace{7pt}no. XX\hspace{7pt}|\hspace{7pt}\textbf{\thepage}}
223 | \fancyfoot[RE,LO]{\footerfont\@ifundefined{@leadauthor}{}{\@leadauthor}\ifnum \value{authors} > 1\hspace{5pt}\textit{et al.}\fi}
224 | 
225 | \renewcommand{\headrulewidth}{0pt}% % No header rule
226 | \renewcommand{\footrulewidth}{0pt}% % No footer rule
227 | 
228 | %% Section/subsection/paragraph set-up
229 | \RequirePackage[explicit]{titlesec}
230 | \setcounter{secnumdepth}{5}
231 | \renewcommand{\thesubsection}{\Alph{subsection}}
232 | 
233 | \titleformat{\section}
234 |   {\large\sffamily\bfseries}
235 |   {\thesection.}
236 |   {0.5em}
237 |   {#1}
238 |   []
239 | \titleformat{name=\section,numberless}
240 |   {\large\sffamily\bfseries}
241 |   {}
242 |   {0em}
243 |   {#1}
244 |   []  
245 | \titleformat{\subsection}[runin]
246 |   {\sffamily\bfseries}
247 |   {\thesubsection.}
248 |   {0.5em}
249 |   {#1. }
250 |   []
251 | \titleformat{\subsubsection}[runin]
252 |   {\sffamily\small\bfseries\itshape}
253 |   {\thesubsubsection.}
254 |   {0.5em}
255 |   {#1. }
256 |   []    
257 | \titleformat{\paragraph}[runin]
258 |   {\sffamily\small\bfseries}
259 |   {}
260 |   {0em}
261 |   {#1} 
262 | \titlespacing*{\section}{0pc}{3ex \@plus4pt \@minus3pt}{5pt}
263 | \titlespacing*{\subsection}{0pc}{2.5ex \@plus3pt \@minus2pt}{2pt}
264 | \titlespacing*{\subsubsection}{0pc}{2ex \@plus2.5pt \@minus1.5pt}{2pt}
265 | \titlespacing*{\paragraph}{0pc}{1.5ex \@plus2pt \@minus1pt}{12pt}
266 | 
267 | %% Article meta data additional fields
268 | \newcommand{\additionalelement}[1]{\def\@additionalelement{#1}}
269 | \newcommand{\dates}[1]{\def\@dates{#1}}
270 | \newcommand{\doi}[1]{\def\@doi{#1}}
271 | \newcommand{\leadauthor}[1]{\def\@leadauthor{#1}}
272 | \newcommand{\etal}[1]{\def\@etal{#1}}
273 | \newcommand{\keywords}[1]{\def\@keywords{#1}}
274 | \newcommand{\authorcontributions}[1]{\def\@authorcontributions{#1}}
275 | \newcommand{\authordeclaration}[1]{\def\@authordeclaration{#1}}
276 | \newcommand{\equalauthors}[1]{\def\@equalauthors{#1}}
277 | \newcommand{\correspondingauthor}[1]{\def\@correspondingauthor{#1}}
278 | \newcommand{\significancestatement}[1]{\def\@significancestatement{#1}}
279 | \newcommand{\matmethods}[1]{\def\@matmethods{#1}}
280 | \newcommand{\acknow}[1]{\def\@acknow{#1}}
281 | 
282 | %% Dropped capital for first letter of main text
283 | \newcommand{\dropcap}[1]{\lettrine[lines=2,lraise=0.05,findent=0.1em, nindent=0em]{{\dropcapfont{#1}}}{}}
284 | 
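285 | %% Abstract formatting: \abstract stores everything up to the first \end in \theabstract; if that \end does not close the abstract (a nested \begin...\end was used), a warning is printed and \theabstract becomes an error text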
286 | \def\xabstract{abstract}
287 | \long\def\abstract#1\end#2{\def\two{#2}\ifx\two\xabstract 
288 | \long\gdef\theabstract{\ignorespaces#1}
289 | \def\go{\end{abstract}}\else
290 | \typeout{^^J^^J PLEASE DO NOT USE ANY \string\begin\space \string\end^^J
291 | COMMANDS WITHIN ABSTRACT^^J^^J}#1\end{#2}
292 | \gdef\theabstract{\vskip12pt BADLY FORMED ABSTRACT: PLEASE DO
293 | NOT USE {\tt\string\begin...\string\end} COMMANDS WITHIN
294 | THE ABSTRACT\vskip12pt}\let\go\relax\fi
295 | \go}
296 | 
297 | % Define an environment with abstract content and styling
298 | \newcommand{\abscontent}{
299 | \noindent
300 | \parbox{\dimexpr\linewidth}{%
301 |     \vskip3pt%
302 | 	\absfont \theabstract
303 | }%
304 | \vskip10pt%
305 | \noindent
306 | \parbox{\dimexpr\linewidth}{%
307 | {
308 |  \keywordsfont \@ifundefined{@keywords}{}{\@keywords}}%
309 | }
310 | \vskip12pt%
311 | }
312 | 
313 | % Option to format abstract differently for certain layouts (not used)
314 | \newcommand{\abscontentformatted}{
315 | \abscontent
316 | }
317 | 
318 | %% Manual adjustment to line up main content with line numbers
319 | \newlength\pnas@vertadjust
320 | \newcommand\verticaladjustment[1]{\setlength{\pnas@vertadjust}{#1}}
321 | 
322 | %% Custom title page 
323 | \renewcommand{\@maketitle}{%
324 | {%
325 | \ifthenelse{\boolean{shortarticle}}
326 |   {\ifthenelse{\boolean{singlecolumn}}{}{
327 |     {\raggedright\baselineskip= 24pt\titlefont \@title\par}%
328 |     \vskip10pt% 21pts between base of title and base of author line
329 |     {\raggedright \@author\par}
330 |     \vskip8pt% 16pts between base of affiliations and base of dates line 
331 |     {\raggedright \datesfont \@ifundefined{@dates}{}{\@dates}\par}
332 |     \vskip12pt%
333 |     }}
334 |   {% else
335 |     %
336 |     \vskip10pt%
337 |     {\raggedright\baselineskip= 24pt\titlefont \@title\par}%
338 |     \vskip10pt% 21pts between base of title and base of author line
339 |     {\raggedright \@author\par}
340 |     \vskip8pt% 16pts between base of affiliations and base of dates line 
341 |     {\raggedright \datesfont \@ifundefined{@dates}{}{\@dates}\par}
342 |     \vskip12pt
343 |     {%
344 |     \abscontent
345 |     }%
346 |     \vskip25pt%
347 |   }%
348 | %%%
349 | \@additionalelement
350 | }%
351 | \vskip\pnas@vertadjust
352 | }
353 | 
354 | %%%% Adding line numbers
355 | \if@twocolumn
356 |   \RequirePackage[switch,mathlines]{lineno}
357 | \else
358 |   \RequirePackage[mathlines]{lineno}
359 | \fi
360 | 
361 | \if@pnaslineno
362 |   \linenumbers
363 | 
364 |   \patchcmd{\abscontent}{\noindent}{\noindent\nolinenumbers}{}{}
365 |   \patchcmd{\abscontent}{\theabstract}{\internallinenumbers\theabstract}{}{}
366 |   \appto{\abscontent}{\linenumbers*}
367 |     
368 |   \if@twocolumn
369 |   \else
370 |     \preto{\@maketitle}{\nolinenumbers}
371 |   \fi
372 | \fi
373 | 
374 | 
375 | %% Footnotes set up
376 | \RequirePackage[flushmargin,ragged,symbol*]{footmisc}
377 | \renewcommand*{\footnotelayout}{\normalfont\sffamily\fontsize{6}{8}\selectfont} % set the footnote font
378 | \renewcommand{\footnoterule}{% Set the footnote hrule style
379 |   \kern -3pt
380 |   {\color{black50} \hrule width 72pt height 0.25pt}
381 |   \kern 2.5pt
382 | }
383 | 
384 | %% Set up the acknowledgments field
385 | \titleclass{\acknow@section}{straight}[\part]
386 | \newcounter{acknow@section}
387 | \providecommand*{\toclevel@acknow@section}{0}
388 | % Format is set for research articles by default
389 | \titleformat{\acknow@section}[runin]
390 |    {\sffamily\normalsize\bfseries}
391 |    {}
392 |    {0em}
393 |    {#1.}
394 |    []
395 | \titlespacing{\acknow@section}
396 | 	{0pt}
397 | 	{3.25ex plus 1ex minus .2ex}
398 | 	{1.5ex plus .2ex}
399 | 
400 | 
401 | \newcommand{\showacknow}{% Display acknowledgments section
402 | \@ifundefined{@acknow}{}{
403 | \vskip 3.25ex plus 1ex minus .2ex
404 | \noindent{\sffamily\normalsize\bfseries ACKNOWLEDGMENTS.\hspace{1.5ex plus .2ex}}
405 | \small\@acknow}
406 | }
407 | 
408 | 
409 | %% Set up the materials&methods field
410 | \titleclass{\matmethods@section}{straight}[\part]
411 | \newcounter{matmethods@section}
412 | \providecommand*{\toclevel@matmethods@section}{0}
413 | % Format is set for research articles by default
414 | \titleformat{\matmethods@section}
415 |    {\sffamily\normalsize\bfseries}
416 |    {}
417 |    {0em}
418 |    {#1}
419 |    []
420 | \titlespacing{\matmethods@section}
421 | 	{0pt}
422 | 	{3.25ex plus 1ex minus .2ex}
423 | 	{1.5ex plus .2ex}
424 | \newcommand{\showmatmethods}{% Display materials&methods section
425 | \@ifundefined{@matmethods}{}{\matmethods@section{Materials and Methods}{\small\noindent\@matmethods}}
426 | }
427 | 
428 | %% Other packages
429 | \RequirePackage{enumitem} % For reducing bullet list item separation
430 | 
431 | %% For sidecaptions
432 | \RequirePackage[rightcaption]{sidecap}
433 | 
434 | %% Define widetext as a double-column float, with a warning
435 | \RequirePackage{float}
436 | \RequirePackage{stfloats}
437 | \RequirePackage{marginnote}
438 | \floatstyle{plain}
439 | \newfloat{@widetext}{hbt!}{wtt}
440 | \newenvironment{widetext}{%
441 |   \PackageWarning{pnas-new}{Use of `widetext` is not recommended. We will now place it at the top or bottom of a page.}
442 |   \begin{@widetext*}[bt!]
443 |   \marginnote{\itshape\footnotesize\color{red}Use of \texttt{widetext} is not recommended.}
444 |   \hrule
445 | }{
446 |   \hrule
447 |   \end{@widetext*}
448 | }
449 | 
450 | %% For backward compatibility; does nothing
451 | \def\pnasbreak{}
452 | 
453 | \endinput
454 | 


--------------------------------------------------------------------------------
/gcskew-article.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | # In[104]:
  5 | 
  6 | 
  7 | 
  8 | from IPython.display import set_matplotlib_formats
  9 | from pandas.plotting import register_matplotlib_converters
 10 | register_matplotlib_converters()
 11 | from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
 12 |                                AutoMinorLocator)
 13 | 
 14 | import matplotlib
 15 | import math
 16 | import numpy as np
 17 | import seaborn as sb
 18 | import matplotlib.pyplot as plt
 19 | import os
 20 | plt.rcParams['figure.figsize'] = [9.5, 7]
 21 | plt.rcParams['font.size'] = '13'
 22 | plt.rcParams.update({'font.family':'sans-serif'})
 23 | 
 24 | #plt.rcParams['figure.figsize'] = [15, 6]
 25 | import datetime
 26 | import pandas
 27 | from random import shuffle
 28 | from scipy.stats import poisson
 29 | import statsmodels.api as sm
 30 | import statsmodels.formula.api as smf
 31 | 
 32 | 
 33 | # In[105]:
 34 | 
 35 | 
 36 | # The goal of this notebook is to recreate all the graphs from the skewdb article
 37 | 
 38 | 
 39 | # In[106]:
 40 | 
 41 | 
 42 | 
 43 | prefix="./antonie2/genomes/auto/"
 44 | #data=pandas.read_csv(prefix+"/skplot.csv")
 45 | #data.describe()
 46 | 
 47 | 
 48 | # In[107]:
 49 | 
 50 | 
 51 | results = pandas.read_csv(prefix+"/results.csv")  # fit results table; merged below and also queried directly by the per-chromosome plots
 52 | results.sort_values(["rmsGC","rmsTA"], inplace=True)
 53 | results["gccontent"]=results.gccount/results.siz  # derived column: gccount divided by siz
 54 | results.describe()  # notebook leftover: the return value is discarded when this file runs as a script
 55 | gennames=pandas.read_csv(prefix+"/genomes.csv", sep=';')
 56 | codongc=pandas.read_csv(prefix+"/codongc.csv", sep=';')
 57 | genehisto=pandas.read_csv(prefix+"/genehisto.csv", sep=',')
 58 | # Join the four tables on "name" (pandas merge defaults to an inner join) and write the combined table to gcskewdb.csv.
 59 | m=gennames.merge(results, on="name").merge(codongc, on="name").merge(genehisto, on="name")
 60 | m.to_csv(prefix+"/gcskewdb.csv", float_format='%g')
 61 | print(len(m))
 62 | 
 63 | # From here on `m` only holds rows with plasmid == 0; `results` itself is left unfiltered.
 64 | noplasmids=m[m.plasmid==0]
 65 | 
 66 | # we don't want plasmids in these graphs
 67 | m=noplasmids
 68 | 
 69 | # In[108]:
 70 | 
 71 | 
 72 | plt.figure()  # cumulative distribution of rmsGC over the plasmid-free table `m`
 73 | logbins = np.geomspace(0.005, m.rmsGC.max(), 100)  # 100 geometrically spaced bin edges from 0.005 up to the largest rmsGC
 74 | # cumulative=True with density=True normalises the last bin to 1; rmsGC values below 0.005 fall outside the bins
 75 | plt.hist(m.rmsGC, bins=logbins, cumulative=True, density=True)
 76 | plt.xscale('log')
 77 | 
 78 | 
 79 | # In[109]:
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 | # In[146]:
 86 | 
 87 | 
 88 | #chromoname="NZ_CP032359.1"  # this is one of the neatest in the whole database
 89 | chromoname="NZ_CP012122.1" # also very neat
 90 | #chromoname="NZ_AP023049.1" #test
 91 | #chromoname="NZ_CP044177.1" # anomalous
 92 | print(chromoname)
 93 | fitted = pandas.read_csv(prefix+"/"+chromoname+"_fit.csv")
 94 | # Explainer figure: cumulative GC skew and its fit for one chromosome, read from <chromoname>_fit.csv and saved as explainer.svg.
 95 | plt.figure()
 96 | us=results[results.name==chromoname]  # the .item() calls below raise unless exactly one row matches
 97 | plt.plot(fitted.pos, fitted.gcskew, label="Cumulative GC skew")
 98 | plt.plot(fitted.pos, fitted.predgcskew, label="Fitted GC skew")
 99 | # Dotted vertical line at `shift`; a non-positive shift is drawn relative to the largest position instead.
100 | leshift=us["shift"].item()
101 | if leshift > 0:
102 |     plt.axvline(leshift, ls=':', color='black')
103 | else:
104 |     plt.axvline(fitted.pos.max() + leshift, ls=':', color='black')
105 | 
106 | plt.xlabel("Locus")
107 | plt.ylabel("Skew")
108 | plt.legend()
109 | plt.title(chromoname + " alpha1 " + str(round(us["alpha1gc"].item(),3)) + " alpha2 " + str(round(us["alpha2gc"].item(),3)) + 
110 |           " div " + str(round(us["div"].item(),3)) + " rmsGC "+str(round(us["rmsGC"].item(),3)) )
111 | plt.grid()
112 | print(fitted.tail(1).gcskew.item())
113 | plt.hlines([0, fitted.tail(1).gcskew.item()], 
114 |            [0, fitted.tail(1).pos.item()] ,
115 |            [fitted.pos.max() * m[m.name==chromoname]["div"].item(), 
116 |             fitted.pos.max() * m[m.name==chromoname]["div"].item()])
117 | # ^ two horizontal guides: y=0 starting at x=0 and y=<last gcskew> starting at the last position, both ending at x = max(pos) * div
118 | plt.gca().yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))
119 | 
120 | # The label positions below are hard-coded data coordinates. NOTE(review): presumably hand-tuned for NZ_CP012122.1 -- revisit if chromoname changes.
121 | plt.annotate('alpha1',
122 |             xy=(190000, 1500), xytext=(190000, 1500), xycoords='data')
123 | 
124 | plt.annotate('alpha2',
125 |             xy=(fitted.tail(1).pos.item() - 420000, fitted.tail(1).gcskew.item()+1400),
126 |              xytext=(fitted.tail(1).pos.item() - 420000, fitted.tail(1).gcskew.item()+1400),
127 |              xycoords='data')
128 | 
129 | 
130 | #plt.annotate("",
131 | #            xy=(105000, 5000), xycoords='data',
132 | #            xytext=(170000, 0), textcoords='data',
133 | #            arrowprops=dict(arrowstyle="-",
134 | #                            connectionstyle="arc3,rad=0.3")
135 | #            )
136 | 
137 | plt.savefig("explainer.svg") # needs to be postprocessed with Inkscape; the article includes explainer.pdf (presumably the converted result)
138 | 
139 | 
140 | # In[111]:
141 | 
142 | 
143 | 
144 | 
145 | 
146 | # In[124]:
147 | 
148 | 
149 | chromoname="NZ_CP044177.1" # anomalous
150 | print(chromoname)
151 | fitted = pandas.read_csv(prefix+"/"+chromoname+"_fit.csv")
152 | results[results.name==chromoname]  # notebook leftover: the value is discarded when this file runs as a script
153 | plt.figure()
154 | us=results[results.name==chromoname]  # the .item() calls below raise unless exactly one row matches
155 | plt.plot(fitted.pos, fitted.gcskew, label="Cumulative GC skew")
156 | plt.plot(fitted.pos, fitted.predgcskew, label="Fitted GC skew")
157 | plt.plot(fitted.pos, fitted.taskew, label="Cumulative TA skew")
158 | plt.plot(fitted.pos, fitted.predtaskew, label="Fitted TA skew")
159 | plt.gca().yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))
160 | #plt.plot(fitted.pos, fitted.sbskew, label="Cumulative SB skew")
161 | #plt.plot(fitted.pos, fitted.predsbskew, label="Fitted SB skew")
162 | # GC and TA cumulative skews with their fits for one chromosome; saved as anomalous.pdf.
163 | # Dotted vertical line at `shift`; a non-positive shift is drawn relative to the largest position instead.
164 | leshift=us["shift"].item()
165 | if leshift > 0:
166 |     plt.axvline(leshift, ls=':', color='black')
167 | else:
168 |     plt.axvline(fitted.pos.max() + leshift, ls=':', color='black')
169 | 
170 | plt.xlabel("Locus")
171 | plt.ylabel("Skew")
172 | plt.legend()
173 | plt.title(chromoname + " alpha1 " + str(round(us["alpha1gc"].item(),3)) + " alpha2 " + str(round(us["alpha2gc"].item(),3)) + 
174 |           " div " + str(round(us["div"].item(),3)) + " rmsGC "+str(round(us["rmsGC"].item(),3)) )
175 | plt.grid()
176 | plt.savefig("anomalous.pdf")
177 | 
178 | 
179 | # In[65]:
180 | 
181 | 
182 | 
183 | 
184 | 
185 | # In[143]:
186 | 
187 | 
188 | chromoname="NZ_CP019870.1" # c difficile
189 | fitted = pandas.read_csv(prefix+"/"+chromoname+"_fit.csv")  # `fitted` and `chromoname` are reused by the cdif-histo.pdf statements further down
190 | plt.figure()
191 | us=results[results.name==chromoname]  # the .item() calls below raise unless exactly one row matches
192 | print(us.alpha1ta)
193 | plt.plot(fitted.pos, fitted.gcskew, label="Cumulative GC skew")
194 | plt.plot(fitted.pos, fitted.predgcskew, label="Fitted GC skew")
195 | plt.plot(fitted.pos, fitted.taskew, label="Cumulative TA skew")
196 | plt.plot(fitted.pos, fitted.predtaskew, label="Fitted TA skew")
197 | 
198 | #plt.plot(fitted.pos, fitted.sbskew, label="Cumulative SB skew")
199 | #plt.plot(fitted.pos, fitted.predsbskew, label="Fitted SB skew")
200 | # GC and TA cumulative skews with their fits for one chromosome; saved as cdif.pdf.
201 | # Dotted vertical line at `shift`; a non-positive shift is drawn relative to the largest position instead.
202 | leshift=us["shift"].item()
203 | if leshift > 0:
204 |     plt.axvline(leshift, ls=':', color='black')
205 | else:
206 |     plt.axvline(fitted.pos.max() + leshift, ls=':', color='black')
207 | 
208 | plt.xlabel("Locus")
209 | plt.ylabel("Skew")
210 | plt.legend()
211 | plt.title(chromoname + " alpha1 " + str(round(us["alpha1gc"].item(),3)) + " alpha2 " + str(round(us["alpha2gc"].item(),3)) + 
212 |           " div " + str(round(us["div"].item(),3)) + " rmsGC "+str(round(us["rmsGC"].item(),3)) )
213 | plt.grid()
214 | plt.savefig("cdif.pdf")
215 | 
216 | 
217 | # In[144]:
218 | 
219 | 
220 | plt.figure()  # works on the `fitted` table (and later `chromoname`) left behind by the statements that produced cdif.pdf
221 | # Local slopes of the cumulative skews: difference between consecutive rows, averaged over a centred 10-row window, divided by 4096.
222 | leX=fitted.sbskew.diff().rolling(10, center=True).mean()/4096
223 | leY=fitted.gcskew.diff().rolling(10, center=True).mean()/4096
224 | # NOTE(review): 4096 is presumably the spacing of the _fit.csv rows in nucleotides (the article text mentions 4096-nucleotide resolution) -- confirm.
225 | 
226 | frame = { 'predleading': fitted.predleading, 'x': leX, 'y': leY }
227 | result = pandas.DataFrame(frame)
228 | #print(result.tail(20))
229 | 
230 | # Median regression (quantreg, q=0.5) of y on x for rows with predleading == 1; the </> comparisons drop rows where x or y is zero or NaN.
231 | sub=result[(result.predleading==1) & ((result.x < 0) | (result.x>0)) & ((result.y < 0) | (result.y > 0)) ]
232 | #z = np.polyfit(sub.x, sub.y, 1)
233 | #p = np.poly1d(z)
234 | #xp = np.linspace(leX.min(), leX.max(), 100)
235 | #plt.plot(xp, p(xp), ':', color='red')
236 | 
237 | model = smf.quantreg('y ~ x', sub).fit(q=0.5)
238 | print(model.summary())
239 | 
240 | print(model.params['Intercept'], model.params['x'])
241 | # draw the fitted line at the x values of the rows that went into the fit
242 | get_y = lambda a, b: a + b * sub.x
243 | y = get_y(model.params['Intercept'], model.params['x'])
244 | plt.plot(sub.x, y, color='black')
245 | 
246 | # Same fit and line for the rows with predleading == 0.
247 | sub=result[(result.predleading==0) & ((result.x < 0) | (result.x>0)) & ((result.y < 0) | (result.y > 0))]
248 | 
249 | #z = np.polyfit(sub.x, sub.y, 1)
250 | #p = np.poly1d(z)
251 | #xp = np.linspace(leX.min(), leX.max(), 100)
252 | #plt.plot(xp, p(xp), ':', color='red')
253 | 
254 | model = smf.quantreg('y ~ x', sub).fit(q=0.5)
255 | print(model.summary())
256 | print(model.params['Intercept'], model.params['x'])
257 | get_y = lambda a, b: a + b * sub.x
258 | y = get_y(model.params['Intercept'], model.params['x'])
259 | plt.plot(sub.x, y, color='black')
260 | 
261 | # The scatter shows all rows of each group (not only those used in the fits); both groups share one colour and differ by marker.
262 | plt.scatter(result[result.predleading==0].x, result[result.predleading==0].y, marker="+", color='#1f77b4', label="GC lagging")
263 | plt.scatter(result[result.predleading==1].x, result[result.predleading==1].y, marker="^", color='#1f77b4', label="GC leading")
264 | 
265 | leX=fitted.sbskew.diff().rolling(10, center=True).mean()/4096
266 | leY=fitted.taskew.diff().rolling(10, center=True).mean()/4096
267 | 
268 | z = np.polyfit(leX.dropna(), leY.dropna(), 1)
269 | p = np.poly1d(z)
270 | xp = np.linspace(leX.min(), leX.max(), 100)
271 | plt.plot(xp, p(xp), ':', color='red')
272 | 
273 | plt.scatter(leX, leY,  s=10, label="TA", color='#ff7f0e')
274 | plt.xlabel("SB skew")
275 | plt.ylabel("GC/TA skew")
276 | plt.title(chromoname)
277 | plt.legend()
278 | 
279 | 
280 | plt.grid()
281 | plt.savefig("cdif-histo.pdf")
282 | 
283 | 
284 | # In[49]:
285 | 
286 | 
287 | plt.figure()
288 | plt.hist(m["div"], bins=50, density=True)
289 | plt.grid()
290 | plt.title("Division between leading/lagging strand")
291 | 
292 | 
293 | # In[155]:
294 | 
295 | 
296 | # how many chromosomes show one strand being 3 times smaller than the other, for good fit quality?
297 | m[(m.rmsGC <0.1) & ((m["div"] <0.25) | (m["div"] > 0.75))]["div"].describe()
298 | 
299 | 
300 | # In[163]:
301 | 
302 | 
303 | # how many chromosomes show one strand having 3 times more/less skew than the ohther?
304 | m[(m.rmsGC <0.1) & ((m["alpha1gc"]/m["alpha2gc"] < 0.33) | (m["alpha1gc"]/m["alpha2gc"] > 3))].alpha1gc
305 | 
306 | 
307 | # In[50]:
308 | 
309 | 
310 | plt.figure()
311 | plt.scatter(results.alpha1gc, results.alpha2gc)
312 | plt.grid()
313 | 
314 | 
315 | # In[51]:
316 | 
317 | 
318 | plt.figure()
319 | plt.hist((results.alpha1gc-results.alpha2gc)/results.alpha1gc, range=(-0.5, 0.5), bins=20)
320 | plt.grid()
321 | 
322 | 
323 | # In[941]:
324 | 
325 | 
326 | results[results["div"] < 0.35]
327 | 
328 | 
329 | # In[225]:
330 | 
331 | 
332 | # you can pick if you want flat genomes:
333 | # rmsGC < 0.05, div between 0.3 and 0.7, and exactly one of alpha1gc / alpha2gc inside (0, 0.0014) -- `^` is the element-wise XOR
334 | sel=m[(m.rmsGC < 0.05) & (m["div"] > 0.3) & (m["div"] < 0.7) & (
335 |         ((m["alpha1gc"] < 0.0014) & (m["alpha1gc"] > 0))
336 |     ^
337 |        ((m["alpha2gc"] < 0.0014) & (m["alpha2gc"] > 0))
338 |     ) ]  #  & (m.realm3=="Firmicutes")
339 | 
340 | 
341 | # Plot up to d x d randomly chosen chromosomes from the selection; no random seed is set in this script, so the panels differ between runs.
342 | #or unequally distributed ones:
343 | #sel=m[(m["div"] < 0.2) & (m.rmsGC < 0.002) ]
344 | print(len(sel))
345 | d=2
346 | fig,axs=plt.subplots(d,d, sharex=False, sharey=False, constrained_layout=True)
347 | a=0
348 | b=0
349 | names=sel.name.unique()
350 | print(len(names))
351 | shuffle(names)
352 | for k in names:
353 |         print(a,b,k)
354 |         fitted = pandas.read_csv(prefix+"/"+k+"_fit.csv")
355 |         t=results[results.name==k]  # only referenced by the commented-out print below
356 |         #print(t.alpha1gc, t.minpos, t.maxpos)
357 |         axs[b,a].plot(fitted.pos, fitted.gcskew)
358 |         axs[b,a].plot(fitted.pos, fitted.predgcskew)
359 |         #axs[b,a].plot(fitted.pos, fitted.taskew)
360 |         #axs[b,a].plot(fitted.pos, fitted.predtaskew)
361 | 
362 |         #axs[b,a].get_yaxis().set_ticks([])
363 |         axs[b,a].set_title(k) #  + " "+str(1000*results[results.name==k].rmsGC.mean()), 
364 |         axs[b,a].yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))
365 | 
366 |         # a is the column and b the row: fill left to right, then move down; stop once the d x d grid is full
367 |         #axs[b,a].grid()
368 |         a=a+1
369 |         if(a>=d):
370 |             a=0
371 |             b=b+1
372 |         if(b == d):
373 |             break
374 | #fig.suptitle("GC/TA skew in random bacterial chromosomes + fit")
375 | plt.savefig("flat-skew.pdf")
376 | 
377 | 
378 | # In[224]:
379 | 
380 | 
381 | # pick chromosomes with a very uneven strand division (div < 0.20 or div > 0.8) and rmsGC < 0.035:
382 | sel=m[((m["div"] < 0.20) | (m["div"] > 0.8)) & (m.rmsGC < 0.035) ]
383 | print(len(sel))
384 | d=2
385 | fig,axs=plt.subplots(d,d, sharex=False, sharey=False, constrained_layout=True)
386 | a=0
387 | b=0
388 | names=sel.name.unique()
389 | print(len(names))
390 | shuffle(names)  # no random seed is set in this script, so the chosen panels differ between runs
391 | for k in names:
392 |         print(a,b,k)
393 |         fitted = pandas.read_csv(prefix+"/"+k+"_fit.csv")
394 |         t=results[results.name==k]  # only referenced by the commented-out print below
395 |         #print(t.alpha1gc, t.minpos, t.maxpos)
396 |         axs[b,a].plot(fitted.pos, fitted.gcskew)
397 |         axs[b,a].plot(fitted.pos, fitted.predgcskew)
398 |         #axs[b,a].plot(fitted.pos, fitted.taskew)
399 |         #axs[b,a].plot(fitted.pos, fitted.predtaskew)
400 |         # a is the column and b the row: fill left to right, then move down; stop once the d x d grid is full
401 |         #axs[b,a].get_yaxis().set_ticks([])
402 |         axs[b,a].yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))        
403 |         axs[b,a].set_title(k) #  + " "+str(1000*results[results.name==k].rmsGC.mean()), 
404 |         #axs[b,a].grid()
405 |         a=a+1
406 |         if(a>=d):
407 |             a=0
408 |             b=b+1
409 |         if(b == d):
410 |             break
411 | #fig.suptitle("GC/TA skew in random bacterial chromosomes + fit")
412 | plt.savefig("strand-div.pdf")
413 | 
414 | 
415 | # In[220]:
416 | 
417 | 
418 | # one example panel per rmsGC quantile edge, to show what the fit looks like at each quality level:
419 | d=4
420 | fig,axs=plt.subplots(d,d, sharex=False, sharey=False, constrained_layout=True)
421 | a=0
422 | b=0
423 | # qcut into 16 quantile bins returns 17 edges; the loop below stops after the first 16, when the 4 x 4 grid is full
424 | res,edges=pandas.qcut(m.rmsGC, 16, retbins=True)
425 | edges
426 | for q in edges:
427 |         sel = m[m.rmsGC>q]
428 |         k=sel.sort_values(["rmsGC"]).head(1).name.item()  # the row with the smallest rmsGC strictly above edge q
429 |         fitted = pandas.read_csv(prefix+"/"+k+"_fit.csv")
430 |         t=results[results.name==k]  # only referenced by the commented-out print below
431 |         #print(t.alpha1gc, t.minpos, t.maxpos)
432 |         axs[b,a].plot(fitted.pos, fitted.gcskew)
433 |         axs[b,a].plot(fitted.pos, fitted.predgcskew)
434 |         axs[b,a].get_xaxis().set_visible(False)
435 |         axs[b,a].get_yaxis().set_visible(False)
436 | 
437 |         #axs[b,a].plot(fitted.pos, fitted.taskew)
438 |         #axs[b,a].plot(fitted.pos, fitted.predtaskew)
439 |         # the panel title shows the quantile edge q, not the rmsGC of the plotted row
440 |         #axs[b,a].get_yaxis().set_ticks([])
441 |         axs[b,a].set_title(("rmsGC = %.4f" % q )) #  + " "+str(1000*results[results.name==k].rmsGC.mean()), 
442 |         #axs[b,a].grid()
443 |         a=a+1
444 |         if(a>=d):
445 |             a=0
446 |             b=b+1
447 |         if(b == d):
448 |             break
449 | #fig.suptitle("GC/TA skew fit for 16 equally sized quality limits")
450 | plt.savefig("rms-samples.pdf")
451 | 
452 | 
453 | # In[223]:
454 | 
455 | 
456 | len(m[m.rmsGC<0.1])/len(m)
457 | 
458 | 
459 | # In[79]:
460 | 
461 | 
462 | for k in m.groupby(["realm2"]).name.count().reset_index().sort_values(["name"], ascending=False).realm2.head(10):
463 |     print(k)
464 | 
465 | 
466 | # In[115]:
467 | 
468 | 
469 | plt.figure()
470 | for k in m.groupby(["realm2"]).name.count().reset_index().sort_values(["name"], ascending=False).realm2.head(5):
471 |     sel=m[m.realm2==k]
472 |     plt.scatter(sel.alpha1gc, sel.alpha1ta, alpha=0.2, label=k)
473 |     
474 | plt.xlabel("alpha1 of GC skew")
475 | plt.ylabel("alpha1 of TA skew")
476 | leg=plt.legend()
477 | for lh in leg.legendHandles: 
478 |     lh.set_alpha(1)
479 | plt.grid()
480 | plt.xlim((0,0.09))
481 | plt.ylim(-0.12, 0.12)
482 | plt.savefig("phylo-histo.png", dpi=300)
483 | plt.savefig("phylo-histo.tiff", dpi=300)
484 | os.system("convert -compress lzw phylo-histo.tiff phylo-histo-lzw.tiff")
485 | 
486 | # In[118]:
487 | 
488 | 
489 | firmi=m[(m.realm3=="Firmicutes") & (m.rmsGC < 0.3) & (m.rmsTA < 0.3)]
490 | plt.figure()  # exploratory figure: straight-line fits of alpha1gc / alpha1ta against a product of bias columns; not saved to a file
491 | #leX = firmi.acounts2/(firmi.acounts2 + firmi.ccounts2 + firmi.gcounts2 + firmi.tcounts2)
492 | #leX = firmi.gccount/firmi.siz
493 | leX =  -(firmi.cfrac - firmi.gfrac)*(1-firmi.ngcount/firmi.siz)*firmi.alpha2sb
494 | #leX =  firmi.alpha1sb
495 | # NOTE(review): presumably (codon G/C imbalance) x (coding fraction) x (strand bias) -- confirm against the column definitions
496 | leY=firmi.alpha1gc
497 | z1 = np.polyfit(leX, leY, 1)
498 | print(z1)
499 | plt.scatter(leX, leY, s=1.5, label="GC")
500 | p = np.poly1d(z1)
501 | xp = np.linspace(leX.min(), leX.max(), 100)
502 | plt.plot(xp, p(xp), color="red")
503 | 
504 | #
505 | # same construction for TA: tfrac/afrac replace cfrac/gfrac and alpha1ta replaces alpha1gc
506 | leX =  -(firmi.tfrac - firmi.afrac)*(1-firmi.ngcount/firmi.siz)*firmi.alpha2sb
507 | #leX =  (firmi.tfrac-0.5)*firmi.alpha1sb
508 | 
509 | leY=firmi.alpha1ta
510 | 
511 | z2 = np.polyfit(leX, leY, 1)
512 | print(z2)
513 | plt.scatter(leX, leY, s=1.5, label="TA")
514 | p = np.poly1d(z2)
515 | xp = np.linspace(leX.min(), leX.max(), 100)
516 | plt.plot(xp, p(xp), color="red")
517 | 
518 | plt.grid()
519 | #plt.scatter(firmi.ccounts2-firmi.tcounts2, firmi.alpha1gc)
520 | print(z1[0]/z2[0])  # ratio of the GC slope to the TA slope
521 | plt.legend()
522 | 
523 | 
524 | # In[226]:
525 | 
526 | 
527 | firmi=m[(m.realm3=="Firmicutes") & (m.rmsGC < 0.25) & (m.rmsTA < 0.25)]
528 | plt.figure()  # figure saved as firmi.pdf: alpha1gc and alpha1ta against a product of bias columns, each with a median-regression line
529 | leX =  -(firmi.cfrac - firmi.gfrac)*(1-firmi.ngcount/firmi.siz)*firmi.alpha2sb
530 | # NOTE(review): per the x-axis label this is presumably codon bias skew x percentage coding x gene strand bias -- confirm the column meanings
531 | leY=firmi.alpha1gc
532 | 
533 | frame = {  'x': leX, 'y': leY }
534 | sub = pandas.DataFrame(frame)
535 | # quantile regression at q=0.5, i.e. a fit to the conditional median of y given x
536 | model = smf.quantreg('y ~ x', sub).fit(q=0.5)
537 | print(model.summary())
538 | 
539 | print(model.params['Intercept'], model.params['x'])
540 | # evaluate the fitted line at the observed x values
541 | get_y = lambda a, b: a + b * sub.x
542 | y = get_y(model.params['Intercept'], model.params['x'])
543 | 
544 | plt.scatter(leX, leY, s=2, label="GC")
545 | plt.plot(sub.x, y, color='black')
546 | 
547 | #
548 | # same construction for TA: tfrac/afrac replace cfrac/gfrac and alpha1ta replaces alpha1gc
549 | leX =  -(firmi.tfrac - firmi.afrac)*(1-firmi.ngcount/firmi.siz)*firmi.alpha2sb
550 | leY=firmi.alpha1ta
551 | 
552 | frame = {  'x': leX, 'y': leY }
553 | sub = pandas.DataFrame(frame)
554 | 
555 | model = smf.quantreg('y ~ x', sub).fit(q=0.5)
556 | print(model.summary())
557 | 
558 | print(model.params['Intercept'], model.params['x'])
559 | 
560 | get_y = lambda a, b: a + b * sub.x
561 | y = get_y(model.params['Intercept'], model.params['x'])
562 | plt.scatter(leX, leY, s=2, label="TA")
563 | # only this second line carries a label, so the legend shows a single 'fit' entry for both black lines
564 | plt.plot(sub.x, y, color='black', label='fit')
565 | 
566 | 
567 | 
568 | 
569 | 
570 | plt.grid()
571 | #plt.scatter(firmi.ccounts2-firmi.tcounts2, firmi.alpha1gc)
572 | plt.legend()
573 | plt.xlabel("Product of gene strand bias, codon bias skew, percentage coding")
574 | plt.ylabel("GC/TA skew fraction")
575 | #plt.title("Data for "+str(len(firmi))+ " Firmicute chromosomes")
576 | plt.savefig("firmi.pdf")
577 | 
578 | 
579 | # In[120]:
580 | 
581 | 
582 | firmi=m[(m.realm3=="Firmicutes") & (m.rmsGC < 0.2) ]
583 | plt.figure()  # exploratory figure: like the alpha2sb polyfit figure, but using alpha1sb and filtering on rmsGC only; not saved to a file
584 | #leX = firmi.acounts2/(firmi.acounts2 + firmi.ccounts2 + firmi.gcounts2 + firmi.tcounts2)
585 | #leX = firmi.gccount/firmi.siz
586 | leX =  -(firmi.cfrac - firmi.gfrac)*(1-firmi.ngcount/firmi.siz)*firmi.alpha1sb
587 | 
588 | 
589 | leY=firmi.alpha1gc
590 | 
591 | # least-squares straight line of alpha1gc against the product above
592 | z1 = np.polyfit(leX, leY, 1)
593 | print(z1)
594 | plt.scatter(leX, leY, s=1.5, label="GC")
595 | p = np.poly1d(z1)
596 | xp = np.linspace(leX.min(), leX.max(), 100)
597 | plt.plot(xp, p(xp), color="red")
598 | 
599 | #
600 | # same construction for TA: tfrac/afrac replace cfrac/gfrac and alpha1ta replaces alpha1gc
601 | leX =  -(firmi.tfrac - firmi.afrac)*(1-firmi.ngcount/firmi.siz)*firmi.alpha1sb
602 | leY=firmi.alpha1ta
603 | 
604 | z2 = np.polyfit(leX, leY, 1)
605 | print(z2)
606 | plt.scatter(leX, leY, s=1.5, label="TA")
607 | p = np.poly1d(z2)
608 | xp = np.linspace(leX.min(), leX.max(), 100)
609 | plt.plot(xp, p(xp), color="red")
610 | 
611 | plt.grid()
612 | #plt.scatter(firmi.ccounts2-firmi.tcounts2, firmi.alpha1gc)
613 | print(z1[0]/z2[0])  # ratio of the GC slope to the TA slope
614 | plt.legend()
615 | 
616 | 
617 | # In[ ]:
618 | 
619 | 
620 | 
621 | 
622 | 
623 | 


--------------------------------------------------------------------------------
/skewdb-uses.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[9pt,twocolumn,twoside]{pnas-new}
  2 | % Use the lineno option to display guide line numbers if required.
  3 | % twocolumn
  4 | 
  5 | \templatetype{pnasresearcharticle} % Choose template 
  6 | % {pnasresearcharticle} = Template for a two-column research article
  7 | % {pnasmathematics} %= Template for a one-column mathematics article
  8 | % {pnasinvited} %= Template for a PNAS invited submission
  9 | 
 10 | \title{\emph{Skew}DB: A comprehensive database of GC and a dozen other skews for over 23,000 chromosomes}
 11 | 
 12 | % Use letters for affiliations, numbers to show equal authorship (if applicable) and to indicate the corresponding author
 13 | \author[a]{Bert Hubert}
 14 | %\affil[a]{Independent}
 15 | 
 16 | % Please give the surname of the lead author for the running footer
 17 | \leadauthor{Hubert} 
 18 | 
 19 | % Please add here a significance statement to explain the relevance of your work
 20 | \significancestatement{GC and TA skew are small ($\pm2.5\%$) deviations in nucleotide frequencies between the leading and lagging replication strands of eubacteria. These deviations show patterns within and across phyla, thus highlighting differences between superficially similar organisms. Such differences may guide research into bacterial replication mechanisms. This work describes a database with details of many skews for over 23,000 bacterial chromosomes.}
 21 | 
 22 | % Please include corresponding author, author contribution and author declaration information
 23 | %\authorcontributions{Please provide details of author contributions here.}
 24 | %\authordeclaration{Please declare any conflict of interest here.}
 25 | %\equalauthors{\textsuperscript{1}A.O.(Author One) and A.T. (Author Two) contributed equally to this work (remove if not applicable).}
 26 | \correspondingauthor{\textsuperscript{a}To whom correspondence should be addressed. E-mail: bert@hubertnet.nl}
 27 | 
 28 | % Keywords are not mandatory, but authors are strongly encouraged to provide them. If provided, please include two to five keywords, separated by the pipe symbol, e.g:
 29 | \keywords{GC skew $|$ TA skew $|$ strand bias $|$ codon bias} 
 30 | 
 31 | \begin{abstract}
 32 |   GC skew \cite{lobry_asymmetric_1996} denotes the relative excess of G nucleotides over C nucleotides on the leading versus the lagging replication strand of eubacteria. While small, typically around 2.5\%, the effect is robust and pervasive, showing up in almost all circular chromosomes. GC skew and the analogous TA skew are a localized deviation from Chargaff's second parity rule \cite{rudner_separation_1968}, which states that G and C, and T and A occur with (mostly) equal frequency even within a strand.
 33 | 
 34 |   Unlike archaea \cite{arakawa_quantitative_2009}, bacteria show very well defined GC skew, often highly stable over the entire leading and lagging strands. The excess of G on the leading strand is on average negated by the excess of C on the lagging strand.
 35 | 
 36 |   Most bacteria also show the analogous TA skew, although this is often noisier than GC skew, with more local deviations. Different phyla show different relations between TA and GC skew \cite{charneski_atypical_2011}.
 37 | 
 38 |   This article introduces an open access database (\url{https://skewdb.org}) of GC and 12 other skews for over 23,000 chromosomes. Many other details like codon bias, strand bias, strand lengths and taxonomic data are also included. The \emph{Skew}DB database can be used to generate or verify hypotheses. Since the origins of both the second parity rule, as well as GC skew itself, are not yet satisfactorily explained, such a database may enhance our understanding of microbial DNA.
 39 | 
 40 |   In this work various interesting chromosomes are highlighted which show asymmetric skews and strands, or other hard to explain phenomena. I also show how GC skew is an interesting forensic tool for studying DNA assembly problems, but also possible chromosome or plasmid mergers, which show up clearly in the graphs generated using the \emph{Skew}DB. 
 41 | \end{abstract}
 42 | 
 43 | \dates{This manuscript was compiled on \today}
 44 | \doi{\url{www.berthub.eu/cgi/doi/10.1073/bhub.XXXXXXXXXX}}
 45 | 
 46 | \begin{document}
 47 | 
 48 | \maketitle
 49 | \thispagestyle{firststyle}
 50 | \ifthenelse{\boolean{shortarticle}}{\ifthenelse{\boolean{singlecolumn}}{\abscontentformatted}{\abscontent}}{}
 51 | 
 52 | The phenomenon of GC skew is tantalizing because it enables a simple numerical analysis that accurately predicts the loci of both the origin and terminus of replication in most bacteria \cite{lu_skewit_2020,luo_doric_2019}.
 53 | 
 54 | Bacterial DNA is typically replicated simultaneously on both strands, starting at the origin of replication. Both copying forks (naturally) travel in the 5' to 3' direction, but given that the replichores are on opposite strands, topologically they are traveling in opposite directions. This continues until the forks meet again at the terminus.
 55 | 
 56 | The excess of G over C on the leading strand is in itself only remarkable because of Chargaff's somewhat mysterious second parity rule, which states that within a DNA strand, there are near equal numbers of G's and C's, and similarly, T's and A's. This rule does not directly follow from the first parity rule, which is a simple statement of base pairing rules.
 57 | 
 58 | Depending on who is asked, Chargaff's second parity rule is so trivial that it needs no explanation, or it requires complex mathematics and entropy considerations to explain its existence \cite{fariselli_dna_2020}.
 59 | 
 60 | The origins of GC skew are still being debated. The leading and lagging strands of circular bacterial chromosomes are replicated very differently; it is at least plausible that this leads to different mutational biases. In addition, there are selection biases that are theorized to be involved \cite{tillier_contributions_2000}. No single mechanism may be exclusively responsible.
 61 | 
 62 | This article does not attempt to explain or further mystify \cite{zhang_brief_2014} the second parity rule or GC skew, but the contents of the \emph{Skew}DB may contribute to our further understanding.
 63 | 
 64 | The \emph{Skew}DB also contains hard to explain data on many chromosomes. These include highly asymmetric skew, but also very disparate strand lengths. On the positive side, the \emph{Skew}DB confirms earlier work on skews in the Firmicute phylum  \cite{charneski_atypical_2011}, and also expands on these earlier findings.
 65 | 
 66 | GC skew has often been investigated by looking at windows of DNA of a certain size. It has been found that this chosen size impacts the results of the analysis. In what follows, I only look at cumulative skew \cite{grigoriev_analyzing_1998}. This sidesteps window size issues. For example, the sequence GGGCCC has a cumulative GC skew of zero. However, as we progress through the sequence, this skew takes on values 0, 1, 2, 3, 2, 1, 0.
 67 | 
 68 | The result of such an analysis is shown in figure \ref{fig:explainer-graph}. The analysis software fits a linear model on the skews, where it also compensates for chromosome files sequenced in the non-canonical direction, or where the origin of replication is not at the start of the file.
 69 | 
 70 | \section*{\emph{Skew}DB: GC and TA skew across 23,725 chromosomes}
 71 | Using simple statistical techniques and open source software, I analysed 23,725 chromosomes for their GC, TA and other skews.
 72 | These chromosomes have been sourced from the NCBI download service. Using the associated annotation data (GFF3), the skews are also reported split out over the first, second and third codon positions, and also for non-coding regions.
 73 | 
 74 | In addition, the analysis also includes statistics on codon bias and strand bias (the phenomenon where genes tend to reside more on the leading replication strand). To make life easier, the database also features further statistics on GC-percentage, as well as taxonomic details.
 75 | 
 76 | The statistical model also captures interesting asymmetries. Although no plausible mechanism is known, some chromosomes feature very different GC skews between leading and lagging strands. Similarly, some strands appear to differ in length by whole factors, something which would negatively impact chromosome replication speed.
 77 | 
 78 | To prevent plasmids from skewing the numbers, only chromosomes of 1 million nucleotides or more are included. 
 79 | 
 80 | The \emph{Skew}DB software (``Antonie'') is fully open source, and the database itself is fully open access. The generation of the database requires a few hours of CPU time. More information on the software and on how to recreate the database can be found in the methods section.
 81 | 
 82 | \section*{What is in the database}
 83 | The \emph{Skew}DB contains several kinds of data. First, for each chromosome, there are raw counts of nucleotides, also split out by codon position, coding sense and coding status. These numbers are provided with 4096-nucleotide resolution, as well as whole-chromosome totals. From these counts, detailed graphs can be generated per chromosome.
 84 | 
 85 | At one level higher, all the skews are summarised down to three biologically relevant parameters which describe the skews on the leading and lagging replication strands, plus the relative length of these strands. A goodness-of-fit parameter (relative RMS) is also provided. This parameter is not only for quality assurance, it also reliably spots exotic or anomalous chromosomes.
 86 | 
 87 | For each chromosome, codon bias data is also included. Finally, for ease of processing, taxonomic data is sourced and merged, enabling analyses per phylum or along other phylogenetic demarcations. 
 88 | 
 89 | \section*{Skew details}
 90 | 
 91 | \begin{figure}[ht]
 92 | \centering
 93 | \includegraphics[width=.9\linewidth]{explainer.pdf}
 94 | \caption{Sample graph showing \emph{Skew}DB data for \emph{Lactiplantibacillus plantarum} strain LZ95 chromosome}
 95 | \label{fig:explainer-graph}
 96 | \end{figure}
 97 | 
 98 | The fits are based on four parameters, as shown in figure \ref{fig:explainer-graph}. {\tt Alpha1} and {\tt alpha2} denote the relative excess of G over C on the leading and lagging strands. If {\tt alpha1} is $0.046$, this means that for every 1000 nucleotides on the leading strand, the cumulative count of G excess increases by 46.
 99 | 
100 | The third parameter is {\tt div} and it describes how the chromosome is divided over leading and lagging strands. If this number is $0.557$, the leading replication strand is modeled to make up $55.7\%$ of the chromosome.
101 | 
102 | The final parameter is {\tt shift} (the dotted vertical line), and denotes the offset of the origin of replication compared to the DNA FASTA file. This parameter has no biological meaning, and is an artifact of the DNA assembly process. 
103 | 
104 | The goodness-of-fit number consists of the root mean squared error of the fit, divided by the absolute mean skew. This latter correction is made to not penalize good fits for bacteria showing significant skew.
105 | 
106 | Both GC and TA skews are reported, for the whole genome, but also split out on first, second or third codon positions. In addition, separate numbers are included for non-genomic and genomic skews. Finally, a ``strand bias'' skew is reported, in which each positive sense nucleotide counts upwards, and each anti-sense nucleotide counts down.
107 | 
108 | Exhaustive details on all the metrics can be found on \url{https://skewdb.org}. This also includes links to a Jupyter \cite{Kluyver:2016aa} notebook that uses Matplotlib \cite{Hunter:2007} and Pandas \cite{jeff_reback_2021_5203279} to create all the graphs from this article, and many more. In addition, this notebook shows all numerical claims made in this work.
109 | 
110 | \section*{Sample findings}
111 | In what follows, I discuss some findings that can trivially be extracted from the \emph{Skew}DB. 
112 | 
113 | \subsection*{GC and TA skews}
114 | Most bacteria show concordant GC and TA skew, with an excess of G correlating with an excess of T. This does not need to be the case however. Figure \ref{fig:gc-ta-scatter} is a scatterplot that shows the correlation between the skews for various major superphyla. Firmicutes (part of the Terrabacteria group) show clearly discordant skews.
115 | \begin{figure}[ht]
116 | \centering
117 | \includegraphics[width=.9\linewidth]{phylo-histo.png}
118 | \caption{Scatter graph of 25,000 chromosomes by superphylum, GC skew versus TA skew}
119 | \label{fig:gc-ta-scatter}
120 | \end{figure}
121 | 
122 | \subsection*{Firmicute prediction}
123 | In many bacteria, genes tend to concentrate on the leading replication strand. If the codon bias of genes is such that they are relatively rich in one nucleotide, the ``strand bias'' may itself give rise to GC or TA bias. Or in other words, if genes contain a lot of G's and they huddle on the leading strand, that strand will show GC skew. To test this hypothesis, we can plot the observed GC and TA skews for all Firmicutes for which we have data.
124 | 
125 | \begin{figure}[ht]
126 | \centering
127 | \includegraphics[width=.9\linewidth]{firmi.pdf}
128 | \caption{Predicted versus actual GC/TA skew for 4093 Firmicutes}
129 | \label{fig:the-big-graph}
130 | \end{figure}
131 | 
132 | 
133 | Mathematically the relation between the codon bias, strand bias and predicted GC skew turns out to be a simple multiplication. In figure \ref{fig:the-big-graph}, the x-axis represents this multiplication. The y-axis represents the GC and TA skew ratio. 
134 | 
135 | It can clearly be seen that both GC and TA skew correlate strongly with the codon/strand bias product. TA skew goes to zero with the two biases, but GC skew appears to persist even in the absence of such biases.
136 | 
137 | \begin{figure}[ht]
138 | \centering
139 | \includegraphics[width=.9\linewidth]{cdif-histo.pdf}
140 | \caption{Scatter graph of codon/strand bias versus GC/TA skew for \emph{C. difficile}}
141 | \label{fig:cdif-scatter}
142 | \end{figure}
143 | 
144 | 
145 | Figure \ref{fig:cdif-scatter} shows the situation for an individual chromosome (\emph{C. difficile}), based on overlapping 40960-nucleotide segments. On the x-axis we find the strand bias for these segments, running from entirely negative sense genes to entirely positive sense genes. The skew is meanwhile plotted on the y-axis, and here too we see that TA skew goes to zero in the absence of strand bias, while GC skew persists and clearly has an independent strand-based component.
146 | 
147 | \subsection*{Asymmetric skew}
148 | The vast majority of chromosomes show similar skews on the leading and the lagging replication strands, something that makes sense given the pairing rules. There are however many chromosomes that have very asymmetric skews, with one strand sometimes showing no skew at all. In figure \ref{fig:asym-skew} four chromosomes are shown that exhibit such behavior. The \emph{Skew}DB lists around 250 chromosomes where one strand shows a GC skew at least 3 times bigger/smaller than the other one.
149 | 
150 | \begin{figure}[ht]
151 | \centering
152 | \includegraphics[width=.9\linewidth]{flat-skew.pdf}
153 | \caption{Chromosomes with asymmetric skews}
154 | \label{fig:asym-skew}
155 | \end{figure}
156 | 
157 | 
158 | \subsection*{Asymmetric strands}
159 | Bacteria tend to have very equally sized replication strands, which is also an optimum for the duration of replication. It is therefore interesting to observe that GC skew analysis finds many chromosomes where one strand is four times larger than the other strand.  In  figure \ref{fig:strand-div} four such chromosomes are shown. The \emph{Skew}DB lists around 100 chromosomes where one strand is at least three times as large as the other strand.
160 | \begin{figure}[ht]
161 | \centering
162 | \includegraphics[width=.9\linewidth]{strand-div.pdf}
163 | \caption{Chromosomes with differing strand lengths}
164 | \label{fig:strand-div}
165 | \end{figure}
166 | 
167 | 
168 | \subsection*{Anomalies}
169 | In many ways, GC skew is like a forensic record of the historical developments in a chromosome. Horizontal gene transfer, inversions, integration of plasmids, excisions can all leave traces. In addition, DNA sequencing or assembly artifacts will also reliably show up in GC graphs.
170 | 
171 | Figure \ref{fig:anomalous} shows the GC and TA skews for Salmonella enterica subsp. enterica serovar Concord strain AR-0407 (NZ\_CP044177.1), and many things could be going on here. The peaks might correspond to multiple origins of replication, but might also indicate inversions or DNA assembly problems.
172 | 
173 | To find such anomalies, the \emph{Skew}DB viewer (\url{https://skewdb.org/view}) offers various buttons to show random anomalous chromosomes. 
174 | 
175 | \begin{figure}[ht]
176 | \centering
177 | \includegraphics[width=.9\linewidth]{anomalous.pdf}
178 | \caption{GC and TA skew for Salmonella enterica subsp. enterica serovar Concord strain AR-0407}
179 | \label{fig:anomalous}
180 | \end{figure}
181 | 
182 | \section*{Limitations and quality}
183 | The existential limitation of any database like the \emph{Skew}DB is that it does not represent nature. The sequence and annotation databases are dominated by easily culturable microbes. And even within that selection, specific (model) bacteria are heavily oversampled because of their economic or medical relevance.
184 | 
185 | Because of this, care should be taken not to interpret numbers in a way that does not take such over- and undersampling into account. This leaves enough room however for finding correlations. Some metrics are sampled so heavily that it would be a miracle if the unculturable organisms were collectively conspiring to skew the statistics away from what we are seeing.
186 | 
187 | The \emph{Skew}DB fits skews to a relatively simple model of only three parameters. This prevents overfitting, and this model has proven to be robust in practice. Yet, when doing automated analysis of tens of thousands of chromosomes, mistakes will be made. 
188 | 
189 | \begin{figure}[tbhp]
190 | \centering
191 | \includegraphics[width=.9\linewidth]{rms-samples.pdf}
192 | \caption{\emph{Skew}DB fits for 16 equal sized quality categories}
193 | \label{fig:rms-samples}
194 | \end{figure}
195 | 
196 | Figure \ref{fig:rms-samples} shows 16 equal sized quality categories, where it is visually clear that the 88\% best fits are excellent. It is therefore reasonable to filter the database on $RMS_{gc}<0.1067$. Or conversely, above that limit, interesting anomalous chromosomes can be found.
197 | 
198 | Finally, it should be noted that the \emph{Skew}DB tries to precisely measure the skew, but it makes no effort to pin down the origin of replication exactly. For such uses, please refer to the DoriC database \cite{luo_doric_2019}.
199 | \section*{Discussion}
200 | Chargaff's second parity rule, GC and other skews are still open to further research. Previously, no database was available that covers such a wide variety of skews and genomes.
201 | 
202 | The \emph{Skew}DB is aimed at biologists so they can use the data to generate or even reject hypotheses. In addition, it may be that by going through the chromosomes with anomalous skews, badly assembled sequences can be detected and removed or redone.
203 | 
204 | Nucleotide skews, especially when split out by codon position, coding status and strand, may be able to tell us more about microbiology than previously thought, especially given that such a robust effect remains at best partially explained.
205 | 
206 | Specifically, given the abundance of data found in the \emph{Skew}DB, it may be possible to use statistics to constrain the possible origins of the various skews. If GC skew is truly a random process, the resulting distribution of skews can be calculated from first principles and compared to the biological reality.
207 | 
208 | \matmethods{All software and processes to generate the \emph{Skew}DB are open source. The source code and the database itself can be found through \url{https://skewdb.org}, where details on how to rebuild the database are also available. All annotations and sequences are obtained from the NCBI download service. The bioinformatics of the software are also described in a separate article.}
209 | 
210 | \showmatmethods{} % Display the Materials and Methods section
211 | 
212 | \acknow{I would like to thank Bertus Beaumont for helping me to think like a biologist, and Jason Piper for regularly pointing me to the relevant literature. In addition, I am grateful that Felix Hol kindly allowed me to field test my software on his DNA sequences \cite{hol_density-dependent_2016}. Twitter users @halvorz and @Suddenly\_a\_goat also provided valuable feedback.}
213 | 
214 | \showacknow{} % Display the acknowledgments section
215 | 
216 | % Bibliography
217 | \bibliography{references}
218 | 
219 | \end{document}
220 | 


--------------------------------------------------------------------------------
/references.bib:
--------------------------------------------------------------------------------
  1 | 
  2 | @article{albrecht-buehler_asymptotically_2006,
  3 | 	title = {Asymptotically increasing compliance of genomes with {Chargaff}'s second parity rules through inversions and inverted transpositions},
  4 | 	volume = {103},
  5 | 	copyright = {© 2006 by The National Academy of Sciences of the USA. 
  6 |                   Freely available online through the PNAS open access option.},
  7 | 	issn = {0027-8424, 1091-6490},
  8 | 	url = {https://www.pnas.org/content/103/47/17828},
  9 | 	doi = {10.1073/pnas.0605553103},
 10 | 	abstract = {Chargaff's second parity rules for mononucleotides and oligonucleotides (CII mono and CII oligo rules) state that a sufficiently long ({\textgreater}100 kb) strand of genomic DNA that contains N copies of a mono- or oligonucleotide, also contains N copies of its reverse complementary mono- or oligonucleotide on the same strand. There is very strong support in the literature for the validity of the rules in coding and noncoding regions, especially for the CII mono rule. Because the experimental support for the CII oligo rule is much less complete, the present article, focusing on the special case of trinucleotides (triplets), examined several gigabases of genome sequences from a wide range of species and kingdoms including organelles such as mitochondria and chloroplasts. I found that all genomes, with the only exception of certain mitochondria, complied with the CII triplet rule at a very high level of accuracy in coding and noncoding regions alike. Based on the growing evidence that genomes may contain up to millions of copies of interspersed repetitive elements, I propose in this article a quantitative formulation of the hypothesis that inversions and inverted transposition could be a major contributing if not dominant factor in the almost universal validity of the rules.},
 11 | 	language = {en},
 12 | 	number = {47},
 13 | 	urldate = {2021-02-14},
 14 | 	journal = {Proceedings of the National Academy of Sciences},
 15 | 	author = {Albrecht-Buehler, Guenter},
 16 | 	month = nov,
 17 | 	year = {2006},
 18 | 	pmid = {17093051},
 19 | 	keywords = {chloroplasts, genomics, mitochondria, base composition, oligonucleotide composition},
 20 | 	pages = {17828--17833},
 21 | }
 22 | 
 23 | @article{charneski_atypical_2011,
 24 | 	title = {Atypical {AT} {Skew} in {Firmicute} {Genomes} {Results} from {Selection} and {Not} from {Mutation}},
 25 | 	volume = {7},
 26 | 	issn = {1553-7404},
 27 | 	url = {https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1002283},
 28 | 	doi = {10.1371/journal.pgen.1002283},
 29 | 	abstract = {The second parity rule states that, if there is no bias in mutation or selection, then within each strand of DNA complementary bases are present at approximately equal frequencies. In bacteria, however, there is commonly an excess of G (over C) and, to a lesser extent, T (over A) in the replicatory leading strand. The low G+C Firmicutes, such as Staphylococcus aureus, are unusual in displaying an excess of A over T on the leading strand. As mutation has been established as a major force in the generation of such skews across various bacterial taxa, this anomaly has been assumed to reflect unusual mutation biases in Firmicute genomes. Here we show that this is not the case and that mutation bias does not explain the atypical AT skew seen in S. aureus. First, recently arisen intergenic SNPs predict the classical replication-derived equilibrium enrichment of T relative to A, contrary to what is observed. Second, sites predicted to be under weak purifying selection display only weak AT skew. Third, AT skew is primarily associated with largely non-synonymous first and second codon sites and is seen with respect to their sense direction, not which replicating strand they lie on. The atypical AT skew we show to be a consequence of the strong bias for genes to be co-oriented with the replicating fork, coupled with the selective avoidance of both stop codons and costly amino acids, which tend to have T-rich codons. That intergenic sequence has more A than T, while at mutational equilibrium a preponderance of T is expected, points to a possible further unresolved selective source of skew.},
 30 | 	language = {en},
 31 | 	number = {9},
 32 | 	urldate = {2021-02-14},
 33 | 	journal = {PLOS Genetics},
 34 | 	author = {Charneski, Catherine A. and Honti, Frank and Bryant, Josephine M. and Hurst, Laurence D. and Feil, Edward J.},
 35 | 	month = sep,
 36 | 	year = {2011},
 37 | 	keywords = {Genomics, Single nucleotide polymorphisms, Staphylococcus aureus, Mutation, Nucleotides, Sense strands, Phylogenetics, Bacterial genomics},
 38 | 	pages = {e1002283},
 39 | }
 40 | 
 41 | @article{arakawa_quantitative_2009,
 42 | 	title = {Quantitative analysis of replication-related mutation and selection pressures in bacterial chromosomes and plasmids using generalised {GC} skew index},
 43 | 	volume = {10},
 44 | 	issn = {1471-2164},
 45 | 	url = {http://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-10-640},
 46 | 	doi = {10.1186/1471-2164-10-640},
 47 | 	language = {en},
 48 | 	number = {1},
 49 | 	urldate = {2021-02-14},
 50 | 	journal = {BMC Genomics},
 51 | 	author = {Arakawa, Kazuharu and Suzuki, Haruo and Tomita, Masaru},
 52 | 	year = {2009},
 53 | 	pages = {640},
 54 | }
 55 | 
 56 | @article{gao_bacteria_2015,
 57 | 	title = {Bacteria may have multiple replication origins},
 58 | 	volume = {6},
 59 | 	issn = {1664-302X},
 60 | 	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4403523/},
 61 | 	doi = {10.3389/fmicb.2015.00324},
 62 | 	urldate = {2021-02-14},
 63 | 	journal = {Frontiers in Microbiology},
 64 | 	author = {Gao, Feng},
 65 | 	month = apr,
 66 | 	year = {2015},
 67 | 	pmid = {25941523},
 68 | 	pmcid = {PMC4403523},
 69 | }
 70 | 
 71 | @article{fariselli_dna_2020,
 72 | 	title = {{DNA} sequence symmetries from randomness: the origin of the {Chargaff}’s second parity rule},
 73 | 	issn = {1477-4054},
 74 | 	shorttitle = {{DNA} sequence symmetries from randomness},
 75 | 	url = {https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbaa041/5817481},
 76 | 	doi = {10.1093/bib/bbaa041},
 77 | 	abstract = {Abstract
 78 |             Most living organisms rely on double-stranded DNA (dsDNA) to store their genetic information and perpetuate themselves. This biological information has been considered as the main target of evolution. However, here we show that symmetries and patterns in the dsDNA sequence can emerge from the physical peculiarities of the dsDNA molecule itself and the maximum entropy principle alone, rather than from biological or environmental evolutionary pressure. The randomness justifies the human codon biases and context-dependent mutation patterns in human populations. Thus, the DNA ‘exceptional symmetries,’ emerged from the randomness, have to be taken into account when looking for the DNA encoded information. Our results suggest that the double helix energy constraints and, more generally, the physical properties of the dsDNA are the hard drivers of the overall DNA sequence architecture, whereas the selective biological processes act as soft drivers, which only under extraordinary circumstances overtake the overall entropy content of the genome.},
 79 | 	language = {en},
 80 | 	urldate = {2021-02-14},
 81 | 	journal = {Briefings in Bioinformatics},
 82 | 	author = {Fariselli, Piero and Taccioli, Cristian and Pagani, Luca and Maritan, Amos},
 83 | 	month = apr,
 84 | 	year = {2020},
 85 | 	pages = {bbaa041},
 86 | }
 87 | 
 88 | @article{rosandic_novel_2019,
 89 | 	title = {Novel look at {DNA} and life—{Symmetry} as evolutionary forcing},
 90 | 	volume = {483},
 91 | 	issn = {00225193},
 92 | 	url = {https://linkinghub.elsevier.com/retrieve/pii/S0022519319303285},
 93 | 	doi = {10.1016/j.jtbi.2019.08.016},
 94 | 	language = {en},
 95 | 	urldate = {2021-02-14},
 96 | 	journal = {Journal of Theoretical Biology},
 97 | 	author = {Rosandić, Marija and Vlahović, Ines and Paar, Vladimir},
 98 | 	month = dec,
 99 | 	year = {2019},
100 | 	pages = {109985},
101 | }
102 | 
103 | @article{lobry_asymmetric_1996,
104 | 	title = {Asymmetric substitution patterns in the two {DNA} strands of bacteria},
105 | 	volume = {13},
106 | 	issn = {0737-4038, 1537-1719},
107 | 	url = {https://academic.oup.com/mbe/article-lookup/doi/10.1093/oxfordjournals.molbev.a025626},
108 | 	doi = {10.1093/oxfordjournals.molbev.a025626},
109 | 	language = {en},
110 | 	number = {5},
111 | 	urldate = {2021-02-14},
112 | 	journal = {Molecular Biology and Evolution},
113 | 	author = {Lobry, J. R.},
114 | 	month = may,
115 | 	year = {1996},
116 | 	pages = {660--665},
117 | }
118 | 
119 | @article{marin_gc_2008,
120 | 	title = {{GC} skew in protein-coding genes between the leading and lagging strands in bacterial genomes: {New} substitution models incorporating strand bias},
121 | 	volume = {253},
122 | 	issn = {00225193},
123 | 	shorttitle = {{GC} skew in protein-coding genes between the leading and lagging strands in bacterial genomes},
124 | 	url = {https://linkinghub.elsevier.com/retrieve/pii/S0022519308001793},
125 | 	doi = {10.1016/j.jtbi.2008.04.004},
126 | 	language = {en},
127 | 	number = {3},
128 | 	urldate = {2021-02-14},
129 | 	journal = {Journal of Theoretical Biology},
130 | 	author = {Marín, Antonio and Xia, Xuhua},
131 | 	month = aug,
132 | 	year = {2008},
133 | 	pages = {508--513},
134 | }
135 | 
136 | @article{hubert_physical_1935,
137 | 	title = {The {Physical} state of {Chlorophyll} in the {Living} {Plastid}},
138 | 	volume = {32},
139 | 	issn = {0370-7504},
140 | 	url = {https://natuurtijdschriften.nl/pub/552276},
141 | 	abstract = {I. A photographic spectrophotometric method has been used to estimate the position of the band-maximum of chlorophyll (a + b) in different media and in the living leaf. A description of the methods and instruments is given. II. The pigment was prepared after the method of Willstätter and Stoll. III. The pigment follows Beer and Lambert’s law over a considerable range of concentrations in all the media studied. IV. The effect of photooxidation is studied. Spectroscopic measurements of the position of the first (main) absorption band are not influenced by photooxidation. An approximate absorption spectrum of the photooxidised product is given. V. The shifting of the band maxima in organic media follows, in general, Kundt’s “law”. The specific extinction is not altered in these media. The band moves “bodily” through the spectrum, except in methyl alcohol. In this medium the absorption band shows a kind of wing to the longer wavelengths. VI. A special method is described to prepare colloidal solutions of chlorophyll. The position of the band-maximum of the pigment in the colloidal state is dependent upon the degree of dispersion of the pigment. Addition of bivalent ions to solutions of colloidal chlorophyll decreases the dispersion. With decreasing dispersion of the pigment, the band maximum shifts towards the longer wavelengths. VII. Dry chlorophyll has the same absorption maximum as the living plastid, but the pigment in this state is non-fluorescent. VIII. The system chlorophyll-lecithin-water is strongly fluorescent but shows a shift of 100 A to the shorter wavelengths. IX. The band maxima of several leaves are found to be practically identical. The maximum is found between 6800—6810 A. When leaves are ground the maximum shows no appreciable shift. On boiling the leaves, the maximum shifts to the shorter wavelengths. 
The maximum of the band of boiled leaves is found practically on the same place as the maximum of a system chlorophyll-lecithin-water. X. Plastids photographed in red light show that the absorbing complex is distributed inhomogeneously in the stroma of the chloroplastid. Some calculations have been made on the amount of chlorophyll in one single chloroplastid. The pigment cannot be present in one monomolecular layer around the surface of the plastid. XI. An attempt has been made to explain the structure of the plastid. Based on our experiments and the work of Noack (jo) and Mestre (45) the tentative conclusion is reached that the “phyllochlorin” complex consists of a protein complex, lecithinoids and the four well known pigments.},
142 | 	language = {en},
143 | 	number = {1},
144 | 	urldate = {2021-03-23},
145 | 	journal = {Recueil des travaux botaniques néerlandais},
146 | 	author = {Hubert, B.},
147 | 	month = jan,
148 | 	year = {1935},
149 | 	pages = {323--390},
150 | }
151 | 
152 | @article{lu_skewit:_2020,
153 | 	title = {{SkewIT}: {The} {Skew} {Index} {Test} for large-scale {GC} {Skew} analysis of bacterial genomes},
154 | 	volume = {16},
155 | 	issn = {1553-7358},
156 | 	shorttitle = {{SkewIT}},
157 | 	url = {https://dx.plos.org/10.1371/journal.pcbi.1008439},
158 | 	doi = {10.1371/journal.pcbi.1008439},
159 | 	abstract = {GC skew is a phenomenon observed in many bacterial genomes, wherein the two replication strands of the same chromosome contain different proportions of guanine and cytosine nucleotides. Here we demonstrate that this phenomenon, which was first discovered in the mid-1990s, can be used today as an analysis tool for the 15,000+ complete bacterial genomes in NCBI’s Refseq library. In order to analyze all 15,000+ genomes, we introduce a new method, SkewIT (Skew Index Test), that calculates a single metric representing the degree of GC skew for a genome. Using this metric, we demonstrate how GC skew patterns are conserved within certain bacterial phyla, e.g. Firmicutes, but show different patterns in other phylogenetic groups such as Actinobacteria. We also discovered that outlier values of SkewIT highlight potential bacterial mis-assemblies. Using our newly defined metric, we identify multiple mis-assembled chromosomal sequences in previously published complete bacterial genomes. We provide a SkewIT web app
160 |               https://jenniferlu717.shinyapps.io/SkewIT/
161 |               that calculates SkewI for any user-provided bacterial sequence. The web app also provides an interactive interface for the data generated in this paper, allowing users to further investigate the SkewI values and thresholds of the Refseq-97 complete bacterial genomes. Individual scripts for analysis of bacterial genomes are provided in the following repository:
162 |               https://github.com/jenniferlu717/SkewIT
163 |               .},
164 | 	language = {en},
165 | 	number = {12},
166 | 	urldate = {2021-08-30},
167 | 	journal = {PLOS Computational Biology},
168 | 	author = {Lu, Jennifer and Salzberg, Steven L.},
169 | 	editor = {Rzhetsky, Andrey},
170 | 	month = dec,
171 | 	year = {2020},
172 | 	pages = {e1008439},
173 | }
174 | 
175 | @article{luo_doric_2019,
176 | 	title = {{DoriC} 10.0: an updated database of replication origins in prokaryotic genomes including chromosomes and plasmids},
177 | 	volume = {47},
178 | 	issn = {0305-1048, 1362-4962},
179 | 	shorttitle = {{DoriC} 10.0},
180 | 	url = {https://academic.oup.com/nar/article/47/D1/D74/5144951},
181 | 	doi = {10.1093/nar/gky1014},
182 | 	language = {en},
183 | 	number = {D1},
184 | 	urldate = {2021-08-30},
185 | 	journal = {Nucleic Acids Research},
186 | 	author = {Luo, Hao and Gao, Feng},
187 | 	month = jan,
188 | 	year = {2019},
189 | 	pages = {D74--D77},
190 | }
191 | 
192 | @article{grigoriev_analyzing_1998,
193 | 	title = {Analyzing genomes with cumulative skew diagrams},
194 | 	volume = {26},
195 | 	issn = {0305-1048, 1362-4962},
196 | 	url = {https://academic.oup.com/nar/article-lookup/doi/10.1093/nar/26.10.2286},
197 | 	doi = {10.1093/nar/26.10.2286},
198 | 	language = {en},
199 | 	number = {10},
200 | 	urldate = {2021-09-03},
201 | 	journal = {Nucleic Acids Research},
202 | 	author = {Grigoriev, A.},
203 | 	month = may,
204 | 	year = {1998},
205 | 	pages = {2286--2290},
206 | }
207 | 
208 | @article{bell_deviations_1999,
209 | 	title = {Deviations from {Chargaff}'s {Second} {Parity} {Rule} {Correlate} with {Direction} of {Transcription}},
210 | 	volume = {197},
211 | 	issn = {00225193},
212 | 	url = {https://linkinghub.elsevier.com/retrieve/pii/S002251939890858X},
213 | 	doi = {10.1006/jtbi.1998.0858},
214 | 	language = {en},
215 | 	number = {1},
216 | 	urldate = {2021-09-03},
217 | 	journal = {Journal of Theoretical Biology},
218 | 	author = {Bell, S.J. and Forsdyke, D.R.},
219 | 	month = mar,
220 | 	year = {1999},
221 | 	pages = {63--76},
222 | }
223 | 
224 | @article{rudner_separation_1968,
225 | 	title = {Separation of {B}. subtilis {DNA} into complementary strands. 3. {Direct} analysis.},
226 | 	volume = {60},
227 | 	issn = {0027-8424, 1091-6490},
228 | 	url = {http://www.pnas.org/cgi/doi/10.1073/pnas.60.3.921},
229 | 	doi = {10.1073/pnas.60.3.921},
230 | 	language = {en},
231 | 	number = {3},
232 | 	urldate = {2021-09-03},
233 | 	journal = {Proceedings of the National Academy of Sciences},
234 | 	author = {Rudner, R. and Karkas, J. D. and Chargaff, E.},
235 | 	month = jul,
236 | 	year = {1968},
237 | 	pages = {921--922},
238 | }
239 | 
240 | @article{tillier_contributions_2000,
241 | 	title = {The {Contributions} of {Replication} {Orientation}, {Gene} {Direction}, and {Signal} {Sequences} to {Base}-{Composition} {Asymmetries} in {Bacterial} {Genomes}},
242 | 	volume = {50},
243 | 	issn = {0022-2844},
244 | 	url = {http://link.springer.com/10.1007/s002399910029},
245 | 	doi = {10.1007/s002399910029},
246 | 	language = {en},
247 | 	number = {3},
248 | 	urldate = {2021-09-04},
249 | 	journal = {Journal of Molecular Evolution},
250 | 	author = {Tillier, Elisabeth R.M. and Collins, Richard A.},
251 | 	month = mar,
252 | 	year = {2000},
253 | 	pages = {249--257},
254 | }
255 | 
256 | @article{hol_density-dependent_2016,
257 | 	title = {Density-dependent adaptive resistance allows swimming bacteria to colonize an antibiotic gradient},
258 | 	volume = {10},
259 | 	issn = {1751-7362, 1751-7370},
260 | 	url = {http://www.nature.com/articles/ismej2015107},
261 | 	doi = {10.1038/ismej.2015.107},
262 | 	language = {en},
263 | 	number = {1},
264 | 	urldate = {2021-09-04},
265 | 	journal = {The ISME Journal},
266 | 	author = {Hol, Felix J H and Hubert, Bert and Dekker, Cees and Keymer, Juan E},
267 | 	month = jan,
268 | 	year = {2016},
269 | 	pages = {30--38},
270 | }
271 | 
272 | @article{lu_skewit_2020,
273 | 	title = {{SkewIT}: {The} {Skew} {Index} {Test} for large-scale {GC} {Skew} analysis of bacterial genomes},
274 | 	volume = {16},
275 | 	issn = {1553-7358},
276 | 	shorttitle = {{SkewIT}},
277 | 	url = {https://dx.plos.org/10.1371/journal.pcbi.1008439},
278 | 	doi = {10.1371/journal.pcbi.1008439},
279 | 	abstract = {GC skew is a phenomenon observed in many bacterial genomes, wherein the two replication strands of the same chromosome contain different proportions of guanine and cytosine nucleotides. Here we demonstrate that this phenomenon, which was first discovered in the mid-1990s, can be used today as an analysis tool for the 15,000+ complete bacterial genomes in NCBI’s Refseq library. In order to analyze all 15,000+ genomes, we introduce a new method, SkewIT (Skew Index Test), that calculates a single metric representing the degree of GC skew for a genome. Using this metric, we demonstrate how GC skew patterns are conserved within certain bacterial phyla, e.g. Firmicutes, but show different patterns in other phylogenetic groups such as Actinobacteria. We also discovered that outlier values of SkewIT highlight potential bacterial mis-assemblies. Using our newly defined metric, we identify multiple mis-assembled chromosomal sequences in previously published complete bacterial genomes. We provide a SkewIT web app https://jenniferlu717.shinyapps.io/SkewIT/ that calculates SkewI for any user-provided bacterial sequence. The web app also provides an interactive interface for the data generated in this paper, allowing users to further investigate the SkewI values and thresholds of the Refseq-97 complete bacterial genomes. Individual scripts for analysis of bacterial genomes are provided in the following repository: https://github.com/jenniferlu717/SkewIT .},
280 | 	language = {en},
281 | 	number = {12},
282 | 	urldate = {2021-08-30},
283 | 	journal = {PLOS Computational Biology},
284 | 	author = {Lu, Jennifer and Salzberg, Steven L.},
285 | 	editor = {Rzhetsky, Andrey},
286 | 	month = dec,
287 | 	year = {2020},
288 | 	pages = {e1008439},
289 | }
290 | 
291 | @conference{Kluyver:2016aa,
292 | 	Author = {Thomas Kluyver and Benjamin Ragan-Kelley and Fernando P{\'e}rez and Brian Granger and Matthias Bussonnier and Jonathan Frederic and Kyle Kelley and Jessica Hamrick and Jason Grout and Sylvain Corlay and Paul Ivanov and Dami{\'a}n Avila and Safia Abdalla and Carol Willing},
293 | 	Booktitle = {Positioning and Power in Academic Publishing: Players, Agents and Agendas},
294 | 	Editor = {F. Loizides and B. Schmidt},
295 | 	Organization = {IOS Press},
296 | 	Pages = {87--90},
297 | 	Title = {Jupyter Notebooks -- a publishing format for reproducible computational workflows},
298 | 	Year = {2016}}
299 | 
300 | @Article{Hunter:2007,
301 |   Author    = {Hunter, J. D.},
302 |   Title     = {Matplotlib: A 2D graphics environment},
303 |   Journal   = {Computing in Science \& Engineering},
304 |   Volume    = {9},
305 |   Number    = {3},
306 |   Pages     = {90--95},
307 |   abstract  = {Matplotlib is a 2D graphics package used for Python for
308 |   application development, interactive scripting, and publication-quality
309 |   image generation across user interfaces and operating systems.},
310 |   publisher = {IEEE COMPUTER SOC},
311 |   doi       = {10.1109/MCSE.2007.55},
312 |   year      = 2007
313 | }
314 | 
315 | @misc{jeff_reback_2021_5203279,
316 |   author       = {Jeff Reback and
317 |                   jbrockmendel and
318 |                   Wes McKinney and
319 |                   Joris Van den Bossche and
320 |                   Tom Augspurger and
321 |                   Phillip Cloud and
322 |                   Simon Hawkins and
323 |                   gfyoung and
324 |                   Sinhrks and
325 |                   Matthew Roeschke and
326 |                   Adam Klein and
327 |                   Terji Petersen and
328 |                   Jeff Tratner and
329 |                   Chang She and
330 |                   William Ayd and
331 |                   Patrick Hoefler and
332 |                   Shahar Naveh and
333 |                   Marc Garcia and
334 |                   Jeremy Schendel and
335 |                   Andy Hayden and
336 |                   Daniel Saxton and
337 |                   Richard Shadrach and
338 |                   Marco Edward Gorelli and
339 |                   Fangchen Li and
340 |                   Vytautas Jancauskas and
341 |                   attack68 and
342 |                   Ali McMaster and
343 |                   Pietro Battiston and
344 |                   Skipper Seabold and
345 |                   Kaiqi Dong},
346 |   title        = {pandas-dev/pandas: Pandas 1.3.2},
347 |   month        = aug,
348 |   year         = 2021,
349 |   publisher    = {Zenodo},
350 |   version      = {v1.3.2},
351 |   doi          = {10.5281/zenodo.5203279},
352 |   url          = {https://doi.org/10.5281/zenodo.5203279}
353 | }
354 | 
355 | @article{zhang_brief_2014,
356 | 	title = {A {Brief} {Review}: {The} {Z}-curve {Theory} and its {Application} in {Genome} {Analysis}},
357 | 	volume = {15},
358 | 	issn = {1389-2029},
359 | 	url = {https://pubmed.ncbi.nlm.nih.gov/24822026},
360 | 	doi = {10.2174/1389202915999140328162433},
361 | 	abstract = {In theoretical physics, there exist two basic mathematical approaches, algebraic and geometrical methods, which, in most cases, are complementary. In the area of genome sequence analysis, however, algebraic approaches have been widely used, while geometrical approaches have been less explored for a long time. The Z-curve theory is a geometrical approach to genome analysis. The Z-curve is a three-dimensional curve that represents a given DNA sequence in the sense that each can be uniquely reconstructed given the other. The Z-curve, therefore, contains all the information that the corresponding DNA sequence carries. The analysis of a DNA sequence can then be performed through studying the corresponding Z-curve. The Z-curve method has found applications in a wide range of areas in the past two decades, including the identifications of protein-coding genes, replication origins, horizontally-transferred genomic islands, promoters, translational start sides and isochores, as well as studies on phylogenetics, genome visualization and comparative genomics. Here, we review the progress of Z-curve studies from aspects of both theory and applications in genome analysis.},
362 | 	language = {eng},
363 | 	number = {2},
364 | 	journal = {Current genomics},
365 | 	author = {Zhang, Ren and Zhang, Chun-Ting},
366 | 	month = apr,
367 | 	year = {2014},
368 | 	note = {Publisher: Bentham Science Publishers},
369 | 	keywords = {GC profile, Gene finding, Genomic island, Replication origin, Z-curve.},
370 | 	pages = {78--94},
371 | }
372 | 
373 | @article{ODonnell2013,
374 |   doi = {10.1101/cshperspect.a010108},
375 |   url = {https://doi.org/10.1101/cshperspect.a010108},
376 |   year = {2013},
377 |   month = jul,
378 |   publisher = {Cold Spring Harbor Laboratory},
379 |   volume = {5},
380 |   number = {7},
381 |   pages = {a010108--a010108},
382 |   author = {M. O{\textquotesingle}Donnell and L. Langston and B. Stillman},
383 |   title = {Principles and Concepts of {DNA} Replication in Bacteria,  Archaea,  and Eukarya},
384 |   journal = {Cold Spring Harbor Perspectives in Biology}
385 | }
386 | 
387 | @article{Lilly2015,
388 |   doi = {10.1128/microbiolspec.plas-0029-2014},
389 |   url = {https://doi.org/10.1128/microbiolspec.plas-0029-2014},
390 |   year = {2015},
391 |   month = feb,
392 |   publisher = {American Society for Microbiology},
393 |   volume = {3},
394 |   number = {1},
395 |   author = {Joshua Lilly and Manel Camps},
396 |   editor = {Marcelo Tolmasky and Juan Carlos Alonso},
397 |   title = {Mechanisms of Theta Plasmid Replication},
398 |   journal = {Microbiology Spectrum}
399 | }
400 | @article{Nelder1965,
401 |   doi = {10.1093/comjnl/7.4.308},
402 |   url = {https://doi.org/10.1093/comjnl/7.4.308},
403 |   year = {1965},
404 |   month = jan,
405 |   publisher = {Oxford University Press ({OUP})},
406 |   volume = {7},
407 |   number = {4},
408 |   pages = {308--313},
409 |   author = {J. A. Nelder and R. Mead},
410 |   title = {A Simplex Method for Function Minimization},
411 |   journal = {The Computer Journal}
412 | }
413 | 
414 | @article{pmid:10570985
415 | ,	title	= {Asymmetric substitution patterns: a review of possible underlying mutational or selective mechanisms.}
416 | ,	author	= {Frank, A C and Lobry, J R}
417 | ,	journal	= {Gene}
418 | ,	journal-iso	= {Gene}
419 | ,	volume	= {238}
420 | ,	number	= {1}
421 | ,	year	= {1999}
422 | ,	month	= {Sep}
423 | ,	pages	= {65--77}
424 | ,	pmid	= {10570985}
425 | }
426 | 
427 | @article{Marn2008,
428 |   doi = {10.1016/j.jtbi.2008.04.004},
429 |   url = {https://doi.org/10.1016/j.jtbi.2008.04.004},
430 |   year = {2008},
431 |   month = aug,
432 |   publisher = {Elsevier {BV}},
433 |   volume = {253},
434 |   number = {3},
435 |   pages = {508--513},
436 |   author = {Antonio Mar{\'{\i}}n and Xuhua Xia},
437 |   title = {{GC} skew in protein-coding genes between the leading and lagging strands in bacterial genomes: New substitution models incorporating strand bias},
438 |   journal = {Journal of Theoretical Biology}
439 | }
440 | 
441 | @article{Quan2019,
442 |   doi = {10.1002/1873-3468.13374},
443 |   url = {https://doi.org/10.1002/1873-3468.13374},
444 |   year = {2019},
445 |   month = apr,
446 |   publisher = {Wiley},
447 |   volume = {593},
448 |   number = {9},
449 |   pages = {918--925},
450 |   author = {Chun-Lan Quan and Feng Gao},
451 |   title = {Quantitative analysis and assessment of base composition asymmetry and gene orientation bias in bacterial genomes},
452 |   journal = {{FEBS} Letters}
453 | }
454 | 
455 | @article{Roten2002,
456 |   doi = {10.1093/nar/30.1.142},
457 |   url = {https://doi.org/10.1093/nar/30.1.142},
458 |   year = {2002},
459 |   month = jan,
460 |   publisher = {Oxford University Press ({OUP})},
461 |   volume = {30},
462 |   number = {1},
463 |   pages = {142--144},
464 |   author = {C.-A. H. Roten},
465 |   title = {Comparative Genometrics ({CG}): a database dedicated to biometric comparisons of whole genomes},
466 |   journal = {Nucleic Acids Research}
467 | }
468 | 
469 | @article{Zhang2003,
470 |   doi = {10.1093/bioinformatics/btg041},
471 |   url = {https://doi.org/10.1093/bioinformatics/btg041},
472 |   year = {2003},
473 |   month = mar,
474 |   publisher = {Oxford University Press ({OUP})},
475 |   volume = {19},
476 |   number = {5},
477 |   pages = {593--599},
478 |   author = {C.-T. Zhang and R. Zhang and H.-Y. Ou},
479 |   title = {The Z curve database: a graphic representation of genome sequences},
480 |   journal = {Bioinformatics}
481 | }
482 | 
483 | @misc{https://doi.org/10.5061/dryad.g4f4qrfr6,
484 |   doi = {10.5061/DRYAD.G4F4QRFR6},
485 |   url = {http://datadryad.org/stash/dataset/doi:10.5061/dryad.g4f4qrfr6},
486 |   author = {Hubert,  Bert},
487 |   keywords = {GC content,  GC skew,  DNA replication,  Origins of DNA replication,  FOS: Biological sciences,  FOS: Biological sciences},
488 |   language = {en},
489 |   title = {{SkewDB}: A comprehensive database of {GC} and 10 other skews for over 28,000 chromosomes and plasmids},
490 |   publisher = {Dryad},
491 |   year = {2021},
492 |   copyright = {Creative Commons Zero v1.0 Universal}
493 | }
494 | 
495 | @article{Thomas2007,
496 |   doi = {10.1186/1471-2105-8-21},
497 |   url = {https://doi.org/10.1186/1471-2105-8-21},
498 |   year = {2007},
499 |   month = jan,
500 |   publisher = {Springer Science and Business Media {LLC}},
501 |   volume = {8},
502 |   number = {1},
503 |   author = {Jamie M Thomas and Daniel Horspool and Gordon Brown and Vasily Tcherepanov and Chris Upton},
504 |   title = {{GraphDNA}: a Java program for graphical display of {DNA} composition analyses},
505 |   journal = {{BMC} Bioinformatics}
506 | }
507 | 
508 | @article{Grigoriev1998,
509 |   doi = {10.1093/nar/26.10.2286},
510 |   url = {https://doi.org/10.1093/nar/26.10.2286},
511 |   year = {1998},
512 |   month = may,
513 |   publisher = {Oxford University Press ({OUP})},
514 |   volume = {26},
515 |   number = {10},
516 |   pages = {2286--2290},
517 |   author = {A. Grigoriev},
518 |   title = {Analyzing genomes with cumulative skew diagrams},
519 |   journal = {Nucleic Acids Research}
520 | }
521 | 
522 | @misc{https://doi.org/10.5281/zenodo.5516524,
523 |   doi = {10.5281/ZENODO.5516524},
524 |   url = {https://zenodo.org/record/5516524},
525 |   author = {Hubert,  Bert and {Beaumont Lab}},
526 |   title = {berthubert/antonie2: SkewVersion 1.0},
527 |   publisher = {Zenodo},
528 |   year = {2021},
529 |   copyright = {Open Access}
530 | }


--------------------------------------------------------------------------------
/gcskew-article-sd.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[fleqn,10pt]{wlscirep}
  2 | \usepackage[utf8]{inputenc}
  3 | \usepackage[T1]{fontenc}
  4 | \usepackage{float}
  5 | \usepackage{lineno}
  6 | \usepackage{graphicx}
  7 | \linenumbers
  8 | 
  9 | \title{SkewDB, a comprehensive database of GC and 10 other skews for over 30,000 chromosomes and plasmids}
 10 | % CHANGE: New title, now 30,000
 11 | 
 12 | \author[*]{Bert Hubert}
 13 | \affil[*]{corresponding author: Bert Hubert (bert@hubertnet.nl)}
 14 | 
 15 | \begin{abstract}
 16 |   GC skew denotes the relative excess of G nucleotides over C nucleotides on the leading versus the lagging replication strand of eubacteria. While the effect is small, typically around 2.5\%, it is robust and pervasive. GC skew and the analogous TA skew are a localized deviation from Chargaff's second parity rule, which states that G and C, and T and A occur with (mostly) equal frequency even within a strand.
 17 | 
 18 |   Different bacterial phyla show different kinds of skew, and differing relations between TA and GC skew.
 19 | 
 20 | % CHANGE: Update to 30,000  
 21 |   This article introduces an open access database (https://skewdb.org) of GC and 10 other skews for over 30,000 chromosomes and plasmids. Further details like codon bias, strand bias, strand lengths and taxonomic data are also included. The \emph{Skew}DB can be used to generate or verify hypotheses. Since the origins of both the second parity rule and GC skew itself are not yet satisfactorily explained, such a database may enhance our understanding of prokaryotic DNA.
 22 | \end{abstract}
 23 | \begin{document}
 24 | 
 25 | \flushbottom
 26 | \maketitle
 27 | \thispagestyle{empty}
 28 | 
 29 | \section*{Background \& Summary}
 30 | % CHANGE: Added three references that introduce GC skew
 31 | The phenomenon of GC skew\cite{pmid:10570985, Marn2008, Quan2019} is tantalizing because it enables a simple numerical analysis that accurately predicts the loci of both the origin and terminus of replication in most bacteria and some archaea \cite{lu_skewit_2020,luo_doric_2019}.
 32 | 
 33 | Bacterial DNA is typically replicated simultaneously on both strands, starting at the origin of replication \cite{ODonnell2013}. Both replication forks travel in the 5' to 3' direction, but given that the replichores are on opposite strands, topologically they are traveling in opposite directions. This continues until the forks meet again at the terminus. This means that roughly one half of every strand is replicated in the opposite direction of the other half. The forward direction is called the leading strand. Many plasmids also replicate in this way \cite{Lilly2015}.
 34 | 
 35 | The excess of G over C on the leading strand is in itself only remarkable because of Chargaff's somewhat mysterious second parity rule\cite{rudner_separation_1968}, which states that within a DNA strand, there are nearly equal numbers of G's and C's, and similarly, T's and A's. This rule does not directly follow from the first parity rule, which is a simple statement of base pairing rules.
 36 | 
 37 | Depending on who is asked, Chargaff's second parity rule is so trivial that it needs no explanation, or it requires complex mathematics and entropy considerations to explain its existence \cite{fariselli_dna_2020}.
 38 | 
 39 | The origins of GC skew are still being debated. The leading and lagging strands of circular bacterial chromosomes are replicated very differently; it is at least plausible that this leads to different mutational biases. In addition, there are selection biases that are theorized to be involved \cite{tillier_contributions_2000}. No single mechanism may be exclusively responsible.
 40 | 
 41 | This article does not attempt to explain or further mystify \cite{zhang_brief_2014} the second parity rule or GC skew, but it may be that the contents of the \emph{Skew}DB can contribute to our further understanding.
 42 | 
 43 | The \emph{Skew}DB also contains some hard to explain data on many chromosomes. These include highly asymmetric skew, but also very disparate strand lengths. Conversely, the \emph{Skew}DB confirms earlier work on skews in the Firmicute phylum  \cite{charneski_atypical_2011}, and also expands on these earlier findings.
 44 | 
 45 | % CHANGE: Added the classical GC Skew formula and how it relates to cumulative skew
 46 | GC skew has often been investigated by looking at windows of DNA of a certain size. GC skew is computed as $(G-C)/(G+C)$ in a window of $N$ bases, where $G$ is the number of guanines and $C$ the number of cytosines in that window. It has been found that the choice of window size impacts the results of the analysis. The \emph{Skew}DB is therefore based exclusively on cumulative skew \cite{grigoriev_analyzing_1998}, which sidesteps window size issues. For example, the sequence GGGCCC ends with a cumulative GC skew of zero: as we progress through the sequence, the cumulative skew takes on the values 1, 2, 3, 2, 1, 0. If the window size $N$ is 6, the non-cumulative skew is also 0.
 47 | 
 48 | The result of a cumulative GC skew analysis is shown in figure \ref{fig:explainer-graph}. The analysis software fits a linear model on the skews, where it also compensates for chromosome files sequenced in the non-canonical direction, or where the origin of replication is not at the start of the file.
 49 | 
 50 | GC skew analysis is not new. As noted below, the DoriC database, for example, contains related data that is more precise for its stated purpose (finding the origin of replication). The SkewIT database \cite{lu_skewit_2020} similarly provides a metric of skew, and also comes with an online analysis tool.
 51 | 
 52 | Other work, like the Comparative Genometrics Database\cite{Roten2002} and the Z Curve Database \cite{Zhang2003}, has been foundational, but by dint of their age these databases lack an analysis of the tens of thousands of DNA sequences that have become available since their initial release.
 53 | 
 54 | \emph{Skew}DB is funded to be updated monthly with the latest sequences from NCBI until 2026.
 55 | 
 56 | Other software that calculates GC skew is available, for example GraphDNA\cite{Thomas2007}, GC Skewing\cite{Grigoriev1998} and GenSkew. The \emph{Skew}DB delivers far more metrics, however, in part because it involves annotation data in its calculations. For ease of use, the \emph{Skew}DB is made available as a ready-to-use database, as well as in software form that reproduces this database exactly.
 57 | 
 58 | \section*{Methods}
 59 | The \emph{Skew}DB analysis relies exclusively on the tens of thousands of FASTA and GFF3 files available through the NCBI download service, which covers both GenBank and RefSeq. The database includes bacteria, archaea and their plasmids.
 60 | 
 61 | Furthermore, to ease analysis, the NCBI Taxonomy database is sourced and merged so output data can quickly be related to (super)phyla or specific species.
 62 | 
 63 | No other data is used, which greatly simplifies processing. Data is read directly in the compressed format provided by NCBI. All results are emitted as standard CSV files.
 64 | 
 65 | In the first step of the analysis, for each organism the FASTA sequence and the GFF3 annotation file are parsed. Every chromosome in the FASTA file is traversed from beginning to end, while a running total is kept for cumulative GC and TA skew. In addition, within protein coding genes, such totals are also kept separately for these skews on the first, second and third codon position. Furthermore, separate totals are kept for regions which do not code for proteins.
 66 | 
 67 | In addition, to enable strand bias measurements, a cumulative count is maintained of nucleotides that are part of a positive or negative sense gene. The counter is increased for positive sense nucleotides, decreased for negative sense nucleotides, and left alone for non-genic regions. A separate counter is kept for non-genic nucleotides. 
 68 | 
 69 | Finally, G and C nucleotides are counted, regardless of whether they are part of a gene.
 70 | 
 71 | These running totals are emitted at 4096 nucleotide intervals, a resolution suitable for determining skews and shifts.
 72 | 
 73 | % CHANGE: Fixed a typo
 74 | In addition, one-line summaries are stored for each chromosome. These lines include the RefSeq identifier of the chromosome, the full name mentioned in the FASTA file, plus counts of A, C, G and T nucleotides. Finally, five levels of taxonomic data are stored.
 75 | 
 76 | Chromosomes and plasmids of fewer than 100 thousand nucleotides are ignored, as these are too noisy to model faithfully. Plasmids are clearly marked in the database, enabling researchers to focus on chromosomes if so desired.
 77 | 
 78 | \subsection*{Fitting}
 79 | Once the genomes have been summarised at 4096-nucleotide resolution, the skews are fitted to a simple model.
 80 | 
 81 | \begin{figure}[ht]
 82 | \centering
 83 | \includegraphics[width=0.7\linewidth]{explainer.pdf}
 84 | % CHANGE: Added italics
 85 | \caption{Sample graph showing \emph{Skew}DB data for \emph{Lactiplantibacillus plantarum} strain LZ95 chromosome}
 86 | \label{fig:explainer-graph}
 87 | \end{figure}
 88 | 
 89 | The fits are based on four parameters, as shown in figure \ref{fig:explainer-graph}. The parameters {\tt alpha1} and {\tt alpha2} denote the relative excess of G over C on the leading and lagging strands, respectively. If {\tt alpha1} is $0.046$, this means that for every 1000 nucleotides on the leading strand, the cumulative excess of G over C increases by 46.
 90 | 
 91 | The third parameter is {\tt div} and it describes how the chromosome is divided over leading and lagging strands. If this number is $0.557$, the leading replication strand is modeled to make up $55.7\%$ of the chromosome.
 92 | 
 93 | The final parameter is {\tt shift} (the dotted vertical line), which denotes the offset of the origin of replication relative to the start of the DNA FASTA file. This parameter has no biological meaning in itself, and is an artifact of the DNA assembly process.
 94 | 
 95 | The goodness-of-fit number consists of the root mean squared error of the fit, divided by the absolute mean skew. This latter correction is made to not penalize good fits for bacteria showing significant skew.
 96 | 
 97 | GC skew tends to be defined very strongly, and it is therefore used to pick the {\tt div} and {\tt shift} parameters of the DNA sequence, which are then kept as a fixed constraint for all the other skews, which might not be present as clearly.
 98 | 
 99 | The fitting process itself is a downhill simplex method optimization\cite{Nelder1965} over the four dimensions, seeded with the average observed skew over the whole genome, and assuming that there is no shift and that the leading and lagging strands are evenly distributed. To ensure that the globally optimal fit is (very likely) achieved, ten optimization attempts are made from different starting points. This fitting process is remarkably robust in the sense that even significant changes in parameters or fitting strategies cause no appreciable change in the results.
100 | 
101 | % CHANGE: Fixed a broken sentence
102 | For every chromosome and plasmid the model parameters are stored, plus the adjusted root mean squared error.
103 | 
104 | Both for quality assurance and ease of plotting, individual CSV files are generated for each chromosome, again at 4096 nucleotide resolution, but this time containing both the actual counts of skews as well as the fitted result.
105 |   
106 | \subsection*{Some sample findings}
107 | To popularize the database, an online viewer is available on https://skewdb.org/view. While this article makes no independent claims to new biological discoveries, the following sections show some results gathered from a brief study of the database. Some of these observations may be of interest for other researchers.
108 | \subsubsection*{GC and TA skews}
109 | Most bacteria show concordant GC and TA skew, with an excess of G correlating with an excess of T. This does not need to be the case however. Figure \ref{fig:gc-ta-scatter} is a scatterplot that shows the correlation between the skews for various major superphyla. Firmicutes (part of the Terrabacteria group) show clearly discordant skews.
110 | 
111 | \begin{figure}[ht]
112 | \centering
113 | \includegraphics[width=.8\linewidth]{phylo-histo.png}
114 | \caption{Scatter graph of 25,000 chromosomes by superphylum, GC skew versus TA skew}
115 | \label{fig:gc-ta-scatter}
116 | \end{figure}
117 | 
118 | 
119 | \subsubsection*{Firmicute prediction}
120 | In many bacteria, genes tend to concentrate on the leading replication strand. If the codon bias of genes is such that they are relatively rich in one nucleotide, the ``strand bias'' may itself give rise to GC or TA bias. In other words, if genes contain a lot of G's and they huddle on the leading strand, that strand will show GC skew. To explore this hypothesis, we can plot the observed GC and TA skews for all Firmicutes for which we have data.
121 | 
122 | \begin{figure}[ht]
123 |   \centering
124 |     \begin{minipage}[b]{0.45\linewidth}
125 |       \includegraphics[width=\linewidth]{firmi.pdf}
126 |       \caption{Predicted versus actual GC/TA skew for 4093 Firmicutes}
127 |       \label{fig:the-big-graph}
128 |     \end{minipage}
129 |     \quad
130 |     \begin{minipage}[b]{0.45\linewidth}
131 |       \includegraphics[width=\linewidth]{cdif-histo.pdf}
132 |       \caption{Scatter graph of codon/strand bias versus GC/TA skew for \emph{C. difficile}}
133 |       \label{fig:cdif-scatter}
134 |     \end{minipage}
135 | \end{figure}
136 | 
137 | 
138 | Mathematically the relation between the codon bias, strand bias and predicted GC skew turns out to be a simple multiplication. In figure \ref{fig:the-big-graph}, the x-axis represents this multiplication. The y-axis represents the GC and TA skew ratio. 
139 | 
140 | It can clearly be seen that both GC and TA skew correlate strongly with the codon/strand bias product. TA skew goes to zero with the two biases, but GC skew appears to persist even in the absence of such biases.
141 | 
142 | Figure \ref{fig:cdif-scatter} shows the situation within an individual chromosome (\emph{C. difficile}), based on overlapping 40960-nucleotide segments. On the x-axis we find the strand bias for these segments, running from entirely negative sense genes to entirely positive sense genes. The skew is meanwhile plotted on the y-axis, and here too we see that TA skew goes to zero in the absence of strand bias, while GC skew persists and clearly has an independent strand-based component.
143 | 
144 | \subsubsection*{Asymmetric skew}
145 | The vast majority of chromosomes show similar skews on the leading and the lagging replication strands, something that makes sense given the pairing rules. There are however many chromosomes that have very asymmetric skews, with one strand sometimes showing no skew at all. In figure \ref{fig:asym-skew} four chromosomes are shown that exhibit such behavior. The \emph{Skew}DB lists around 250 chromosomes where one strand has a GC skew at least 3 times bigger/smaller than the other one.
146 | 
147 | \begin{figure}[ht]
148 |   \centering
149 |   \begin{minipage}[b]{0.45\linewidth}
150 |     \includegraphics[width=\linewidth]{flat-skew.pdf}
151 |     \caption{Chromosomes with asymmetric skews}
152 |     \label{fig:asym-skew}
153 |   \end{minipage}
154 |   \quad
155 |   \begin{minipage}[b]{0.45\linewidth}
156 |     \includegraphics[width=\linewidth]{strand-div.pdf}
157 |     \caption{Chromosomes with differing strand lengths}
158 |     \label{fig:strand-div}
159 |   \end{minipage}
160 | \end{figure}
161 | 
162 | \subsubsection*{Asymmetric strands}
163 | Bacteria tend to have very equally sized replication strands, which is also an optimum for the duration of replication. It is therefore interesting to observe that GC skew analysis finds many chromosomes where one strand is four times larger than the other strand.  In  figure \ref{fig:strand-div} four such chromosomes are shown. The \emph{Skew}DB lists around 100 chromosomes where one strand is at least three times as large as the other strand.
164 | 
165 | 
166 | \subsubsection*{Anomalies}
167 | % CHANGE: Cite lu_skewit_2020 to acknowledge the origin of the idea of using GC skew to detect mis-assemblies
168 | In many ways, GC skew is like a forensic record of the historical developments in a chromosome. Horizontal gene transfer, inversions, integration of plasmids, excisions can all leave traces. In addition, DNA sequencing or assembly artifacts will also reliably show up in GC graphs, as elucidated with examples in \cite{lu_skewit_2020}.
169 | 
170 | Figure \ref{fig:anomalous} shows GC and TA skews for \emph{Salmonella enterica subsp. enterica serovar Concord} strain AR-0407 (NZ\_CP044177.1), and many things could be going on here. The peaks might correspond to multiple origins of replication, but might also indicate inversions or DNA assembly problems.
171 | 
172 | \begin{figure}[ht]
173 | \centering
174 | \includegraphics[width=.5\linewidth]{anomalous.pdf}
175 | % CHANGE: Add italics to species name
176 | \caption{GC and TA skew for \emph{Salmonella enterica subsp. enterica serovar Concord} strain AR-0407}
177 | \label{fig:anomalous}
178 | \end{figure}
179 | 
180 | \section*{Data Records}
181 | % CHANGE: Add proper citation for the dataset and the software
182 | The \emph{Skew}DB is available through https://skewdb.org, where it is frequently (\& automatically) refreshed. A snapshot of the data has also been deposited on Dryad\cite{https://doi.org/10.5061/dryad.g4f4qrfr6}.
183 | 
184 | The \emph{Skew}DB consists of several CSV files: skplot.csv, results.csv, genomes.csv and codongc.csv.
185 | In addition, for each chromosome or plasmid, a separate \_fit.csv file is generated, which contains data at 4096-nucleotide resolution.
186 | 
187 | skplot.csv contains all the 4096-nucleotide resolution data as one big file for all processed chromosomes and plasmids. The parameters are described in table \ref{tab:skplot}.
188 | \begin{table}[ht]
189 | \begin{tabular}{|l|l|l|l|}
190 | \hline
191 | abspos     & locus in chromosome                       & name      & RefSeq ID                                   \\ \hline
192 | acounts0-4 & A nucleotide counter                      & ngcount   & Counter of non-coding nucleotides           \\ \hline
193 | ccounts0-4 & C nucleotide counter                      & pospos    & cumulative positive sense nucleotide counter \\ \hline
194 | gcounts0-4 & G nucleotide counter                      & relpos    & relative position within chromosome/plasmid \\ \hline
195 | tcounts0-4 & T nucleotide counter                      & taskew    & cumulative TA skew                          \\ \hline
196 | gcskew     & cumulative GC skew                        & taskew0-3 & cumulative TA skew per codon position       \\ \hline
197 | gcskew0-3  & cumulative GC skew per codon position     & taskewNG  & cumulative TA skew for non-coding regions   \\ \hline
198 | gcskewNG   & cumulative GC skew for non-coding regions &           &                                             \\ \hline
199 | \end{tabular}
200 | \caption{Fields of skplot.csv}
201 | \label{tab:skplot}
202 | \end{table}
203 | 
204 | results.csv meanwhile contains the details of the fits. In table \ref{tab:skew-table}, every cell marked with an X corresponds to a field that is present in the file. The actual fields
205 | are called alpha1gc, alpha2gc, gcRMS, alpha1ta, alpha2ta etc. The DNA sequence shift and div are also specified, and they are derived from the GC skew fit.
206 | The rows gc0-2 and ta0-2 refer to the codon positions, while gcng and tang refer to the non-coding region skews. Finally, sb denotes the strand bias.
207 | 
208 | \begin{table}[H]
209 |   \centering
210 | \begin{tabular}{|l|l|l|l|ll}
211 | \hline
212 |      & alpha1 & alpha2 & rms & \multicolumn{1}{l|}{div} & \multicolumn{1}{l|}{shift} \\ \hline
213 | gc   & X      &  X     &  X  & \multicolumn{1}{l|}{X}   & \multicolumn{1}{l|}{X}     \\ \hline
214 | ta   & X      &  X     &  X  &                          &                            \\ \cline{1-4}
215 | gc0  & X      &  X     &  X  &                          &                            \\ \cline{1-4}
216 | gc1  & X      &  X     &  X  &                          &                            \\ \cline{1-4}
217 | gc2  & X      &  X     &  X  &                          &                            \\ \cline{1-4}
218 | ta0  & X      &  X     &  X  &                          &                            \\ \cline{1-4}
219 | ta1  & X      &  X     &  X  &                          &                            \\ \cline{1-4}
220 | ta2  & X      &  X     &  X  &                          &                            \\ \cline{1-4}
221 | gcng & X      &  X     &  X  &                          &                            \\ \cline{1-4}
222 | tang & X      &  X     &  X  &                          &                            \\ \cline{1-4}
223 | sb   & X      &  X     &  X  &                          &                            \\ \cline{1-4}
224 | \end{tabular}
225 | \caption{Skew metrics}
226 | \label{tab:skew-table}
227 | \end{table}
228 | 
229 | Table \ref{tab:codongc-table} documents the codon bias data found in codongc.csv, which is also split out by leading and lagging strand.
230 | \begin{table}[H]
231 | \begin{tabular}{|l|l|}
232 | \hline
233 | afrac, cfrac, gfrac, tfrac                 & Fraction of coding nucleotides that are A, C, G or T                \\ \hline
234 | leadafrac, leadcfrac, leadgfrac, leadtfrac & Fraction of leading strand coding nucleotides that are A, C, G or T \\ \hline
235 | lagafrac, lagcfrac, laggfrac, lagtfrac     & Fraction of lagging strand coding nucleotides that are A, C, G or T \\ \hline
236 | ggcfrac, cgcfrac                           & The G and C fraction of GC coding nucleotides respectively          \\ \hline
237 | atafrac, ttafrac                           & The A and T fraction of AT coding nucleotides respectively          \\ \hline
238 | \end{tabular}
239 | \caption{Fields in codongc.csv}
240 | \label{tab:codongc-table}
241 | \end{table}
242 | 
243 | Table \ref{tab:genomes-table} documents the fields found in genomes.csv:
244 | 
245 | \begin{table}[H]
246 | \begin{tabular}{|l|l|}
247 |   \hline
248 | fullname                 & The full chromosome name as found in the FASTA file                \\ \hline
249 | acount, ccount, gcount, tcount                 & Count of A, C, G or T nucleotides                \\ \hline
250 | plasmid & Set to 1 in case this sequence is a plasmid \\ \hline
251 | realm1-5     & NCBI sourced taxonomic data \\ \hline
252 | protgenecount                           & Number of protein coding genes processed          \\ \hline
253 | stopTAG, TAA, TGA & Number of TAG, TAA and TGA stop codons respectively          \\ \hline
254 | stopXXX & Number of anomalous stop codons  \\ \hline
255 | startATG, GTG, TTG & Number of ATG, GTG and TTG start codons respectively          \\ \hline
256 | startXXX & Number of unusual start codons  \\ \hline
257 | dnaApos & position of DnaA gene (not DnaA box!) in the DNA sequence. -1 if not found.  \\ \hline
258 | \end{tabular}
259 | \caption{Fields in genomes.csv}
260 | \label{tab:genomes-table}
261 | \end{table}
262 | 
263 | 
264 | Finally, the individual \_fit.csv files contain fields called ``Xskew'' and ``predXskew'', where X stands for gc, ta, etc. These denote the observed skew and the skew predicted from the parameters found in results.csv, respectively.
265 | \section*{Technical Validation}
266 | This database models the skews of many chromosomes and plasmids. Validation consists of evaluating the goodness-of-fit compared to the directly available numbers.
267 | 
268 | The \emph{Skew}DB fits skews to a relatively simple model of only four parameters. This prevents overfitting, and this model has proven to be robust in practice. Yet, when doing automated analysis of tens of thousands of chromosomes, mistakes will be made. Also, not all organisms show coherent GC skew.
269 | 
270 | \begin{figure}[tbhp]
271 | \centering
272 | \includegraphics[width=.5\linewidth]{rms-samples.pdf}
273 | \caption{\emph{Skew}DB fits for 16 equal sized quality categories of bacterial chromosomes}
274 | \label{fig:rms-samples}
275 | \end{figure}
276 | 
277 | Figure \ref{fig:rms-samples} shows 16 equal sized quality categories, where it is visually clear that the best 88\% of fits are excellent. It is therefore reasonable to filter the database on $RMS_{gc}<0.16$. Conversely, it could be said that interesting anomalous chromosomes can be found above this limit.
278 | 
279 | The DoriC database \cite{luo_doric_2019} contains precise details of the location of the origin of replication. A total of 2267 sequences appear both in DoriC and in the \emph{Skew}DB. The DoriC origin of replication should roughly be matched by the ``shift'' metric in the \emph{Skew}DB (but see Usage Notes). For 90\% of sequences appearing in both databases, there is less than 5\% relative chromosome distance between these two independent metrics. This is encouraging since these two numbers do not quite measure the same thing.
280 | 
281 | On a similar note, the DnaA gene is typically (but not necessarily) located near the origin of replication. For over 80\% of chromosomes, DnaA is found within 5\% of the \emph{Skew}DB ``shift'' metric. This too is an encouraging independent confirmation of the accuracy of the data.
282 | 
283 | Finally, during processing, counts are kept of the start and stop codons encountered in all protein coding genes on all chromosomes and plasmids. These counts are interesting in themselves (because they correlate with GC content, for example), but they also match published frequencies, and show limited numbers of non-canonical start codons, and around 0.005\% anomalous stop codons. This too confirms that the analyses are based on correct (annotation) assumptions.
284 | 
285 | 
286 | \section*{Usage Notes}
287 | The existential limitation of any database like the \emph{Skew}DB is that it does not represent the distribution of organisms found in nature. The sequence and annotation databases are dominated by easily culturable microbes. And even within that selection, specific (model) organisms are heavily oversampled because of their scientific, economic or medical relevance.
288 | 
289 | Because of this, care should be taken to interpret numbers in a way that takes such over- and undersampling into account. This leaves enough room, however, for finding correlations. Some metrics are sampled so heavily that it would be a miracle if the unculturable organisms were collectively conspiring to skew the statistics away from the average. In addition, the database is a very suitable way to test or generate hypotheses, or to find anomalous organisms.
290 | 
291 | Finally, it should be noted that the \emph{Skew}DB tries to precisely measure the skew parameters, but it makes no effort to pin down the origin of replication exactly. For such uses, please refer to the DoriC database \cite{luo_doric_2019}. In future work, the \emph{Skew}DB will attempt to use OriC motifs to improve fitting of this metric.
292 | 
293 | On https://skewdb.org an explanatory Jupyter \cite{Kluyver:2016aa} notebook can be found that uses Matplotlib \cite{Hunter:2007} and Pandas \cite{jeff_reback_2021_5203279} to create all the graphs from this article, and many more. In addition, this notebook reproduces all numerical claims made in this work. The \emph{Skew}DB website also provides links to informal articles that further explain GC skew, and how it could be used for research.
294 | 
295 | \section*{Code availability}
296 | The \emph{Skew}DB is produced using the Antonie DNA processing software (https://github.com/berthubert/antonie2), which is open source. In addition, the pipeline is fully automated and reproducible, including the retrieval of sequences, annotations and taxonomic data from the NCBI website. The software has also been deposited with Zenodo \cite{https://doi.org/10.5281/zenodo.5516524}.
297 | 
298 | A GitHub repository is available for this article on https://github.com/berthubert/skewdb-articles, which includes this reproducible pipeline, plus a script that regenerates all the graphs and numerical claims from this paper. 
299 | 
300 | \begin{thebibliography}{10}
301 | \urlstyle{rm}
302 | \expandafter\ifx\csname url\endcsname\relax
303 |   \def\url#1{\texttt{#1}}\fi
304 | \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi
305 | \expandafter\ifx\csname doiprefix\endcsname\relax\def\doiprefix{DOI: }\fi
306 | \providecommand{\bibinfo}[2]{#2}
307 | \providecommand{\eprint}[2][]{\url{#2}}
308 | 
309 | \bibitem{pmid:10570985}
310 | \bibinfo{author}{Frank, A.~C.} \& \bibinfo{author}{Lobry, J.~R.}
311 | \newblock \bibinfo{journal}{\bibinfo{title}{Asymmetric substitution patterns: a
312 |   review of possible underlying mutational or selective mechanisms.}}
313 | \newblock {\emph{\JournalTitle{Gene}}} \textbf{\bibinfo{volume}{238}},
314 |   \bibinfo{pages}{65--77} (\bibinfo{year}{1999}).
315 | 
316 | \bibitem{Marn2008}
317 | \bibinfo{author}{Mar{\'{\i}}n, A.} \& \bibinfo{author}{Xia, X.}
318 | \newblock \bibinfo{journal}{\bibinfo{title}{{GC} skew in protein-coding genes
319 |   between the leading and lagging strands in bacterial genomes: New
320 |   substitution models incorporating strand bias}}.
321 | \newblock {\emph{\JournalTitle{Journal of Theoretical Biology}}}
322 |   \textbf{\bibinfo{volume}{253}}, \bibinfo{pages}{508--513},
323 |   https://doi.org/10.1016/j.jtbi.2008.04.004 (\bibinfo{year}{2008}).
324 | 
325 | \bibitem{Quan2019}
326 | \bibinfo{author}{Quan, C.-L.} \& \bibinfo{author}{Gao, F.}
327 | \newblock \bibinfo{journal}{\bibinfo{title}{Quantitative analysis and
328 |   assessment of base composition asymmetry and gene orientation bias in
329 |   bacterial genomes}}.
330 | \newblock {\emph{\JournalTitle{{FEBS} Letters}}}
331 |   \textbf{\bibinfo{volume}{593}}, \bibinfo{pages}{918--925},
332 |   https://doi.org/10.1002/1873-3468.13374 (\bibinfo{year}{2019}).
333 | 
334 | \bibitem{lu_skewit_2020}
335 | \bibinfo{author}{Lu, J.} \& \bibinfo{author}{Salzberg, S.~L.}
336 | \newblock \bibinfo{journal}{\bibinfo{title}{{SkewIT}: {The} {Skew} {Index}
337 |   {Test} for large-scale {GC} {Skew} analysis of bacterial genomes}}.
338 | \newblock {\emph{\JournalTitle{PLOS Computational Biology}}}
339 |   \textbf{\bibinfo{volume}{16}}, \bibinfo{pages}{e1008439},
340 |   https://doi.org/10.1371/journal.pcbi.1008439 (\bibinfo{year}{2020}).
341 | 
342 | \bibitem{luo_doric_2019}
343 | \bibinfo{author}{Luo, H.} \& \bibinfo{author}{Gao, F.}
344 | \newblock \bibinfo{journal}{\bibinfo{title}{{DoriC} 10.0: an updated database
345 |   of replication origins in prokaryotic genomes including chromosomes and
346 |   plasmids}}.
347 | \newblock {\emph{\JournalTitle{Nucleic Acids Research}}}
348 |   \textbf{\bibinfo{volume}{47}}, \bibinfo{pages}{D74--D77},
349 |   https://doi.org/10.1093/nar/gky1014 (\bibinfo{year}{2019}).
350 | 
351 | \bibitem{ODonnell2013}
352 | \bibinfo{author}{O{\textquotesingle}Donnell, M.}, \bibinfo{author}{Langston,
353 |   L.} \& \bibinfo{author}{Stillman, B.}
354 | \newblock \bibinfo{journal}{\bibinfo{title}{Principles and concepts of {DNA}
355 |   replication in bacteria, archaea, and eukarya}}.
356 | \newblock {\emph{\JournalTitle{Cold Spring Harbor Perspectives in Biology}}}
357 |   \textbf{\bibinfo{volume}{5}}, \bibinfo{pages}{a010108--a010108},
358 |   https://doi.org/10.1101/cshperspect.a010108 (\bibinfo{year}{2013}).
359 | 
360 | \bibitem{Lilly2015}
361 | \bibinfo{author}{Lilly, J.} \& \bibinfo{author}{Camps, M.}
362 | \newblock \bibinfo{journal}{\bibinfo{title}{Mechanisms of theta plasmid
363 |   replication}}.
364 | \newblock {\emph{\JournalTitle{Microbiology Spectrum}}}
365 |   \textbf{\bibinfo{volume}{3}}, https://doi.org/10.1128/microbiolspec.plas-0029-2014
366 |   (\bibinfo{year}{2015}).
367 | 
368 | \bibitem{rudner_separation_1968}
369 | \bibinfo{author}{Rudner, R.}, \bibinfo{author}{Karkas, J.~D.} \&
370 |   \bibinfo{author}{Chargaff, E.}
371 | \newblock \bibinfo{journal}{\bibinfo{title}{Separation of {B}. subtilis {DNA}
372 |   into complementary strands. 3. {Direct} analysis.}}
373 | \newblock {\emph{\JournalTitle{Proceedings of the National Academy of
374 |   Sciences}}} \textbf{\bibinfo{volume}{60}}, \bibinfo{pages}{921--922},
375 |   https://doi.org/10.1073/pnas.60.3.921 (\bibinfo{year}{1968}).
376 | 
377 | \bibitem{fariselli_dna_2020}
378 | \bibinfo{author}{Fariselli, P.}, \bibinfo{author}{Taccioli, C.},
379 |   \bibinfo{author}{Pagani, L.} \& \bibinfo{author}{Maritan, A.}
380 | \newblock \bibinfo{journal}{\bibinfo{title}{{DNA} sequence symmetries from
381 |   randomness: the origin of the {Chargaff}’s second parity rule}}.
382 | \newblock {\emph{\JournalTitle{Briefings in Bioinformatics}}}
383 |   \bibinfo{pages}{bbaa041}, https://doi.org/10.1093/bib/bbaa041 (\bibinfo{year}{2020}).
384 | 
385 | \bibitem{tillier_contributions_2000}
386 | \bibinfo{author}{Tillier, E.~R.} \& \bibinfo{author}{Collins, R.~A.}
387 | \newblock \bibinfo{journal}{\bibinfo{title}{The {Contributions} of
388 |   {Replication} {Orientation}, {Gene} {Direction}, and {Signal} {Sequences} to
389 |   {Base}-{Composition} {Asymmetries} in {Bacterial} {Genomes}}}.
390 | \newblock {\emph{\JournalTitle{Journal of Molecular Evolution}}}
391 |   \textbf{\bibinfo{volume}{50}}, \bibinfo{pages}{249--257},
392 |   https://doi.org/10.1007/s002399910029 (\bibinfo{year}{2000}).
393 | 
394 | \bibitem{zhang_brief_2014}
395 | \bibinfo{author}{Zhang, R.} \& \bibinfo{author}{Zhang, C.-T.}
396 | \newblock \bibinfo{journal}{\bibinfo{title}{A {Brief} {Review}: {The} {Z}-curve
397 |   {Theory} and its {Application} in {Genome} {Analysis}}}.
398 | \newblock {\emph{\JournalTitle{Current genomics}}}
399 |   \textbf{\bibinfo{volume}{15}}, \bibinfo{pages}{78--94},
400 |   https://doi.org/10.2174/1389202915999140328162433 (\bibinfo{year}{2014}).
401 | \newblock \bibinfo{note}{Publisher: Bentham Science Publishers}.
402 | 
403 | \bibitem{charneski_atypical_2011}
404 | \bibinfo{author}{Charneski, C.~A.}, \bibinfo{author}{Honti, F.},
405 |   \bibinfo{author}{Bryant, J.~M.}, \bibinfo{author}{Hurst, L.~D.} \&
406 |   \bibinfo{author}{Feil, E.~J.}
407 | \newblock \bibinfo{journal}{\bibinfo{title}{Atypical {AT} {Skew} in {Firmicute}
408 |   {Genomes} {Results} from {Selection} and {Not} from {Mutation}}}.
409 | \newblock {\emph{\JournalTitle{PLOS Genetics}}} \textbf{\bibinfo{volume}{7}},
410 |   \bibinfo{pages}{e1002283}, https://doi.org/10.1371/journal.pgen.1002283
411 |   (\bibinfo{year}{2011}).
412 | 
413 | \bibitem{grigoriev_analyzing_1998}
414 | \bibinfo{author}{Grigoriev, A.}
415 | \newblock \bibinfo{journal}{\bibinfo{title}{Analyzing genomes with cumulative
416 |   skew diagrams}}.
417 | \newblock {\emph{\JournalTitle{Nucleic Acids Research}}}
418 |   \textbf{\bibinfo{volume}{26}}, \bibinfo{pages}{2286--2290},
419 |   https://doi.org/10.1093/nar/26.10.2286 (\bibinfo{year}{1998}).
420 | 
421 | \bibitem{Roten2002}
422 | \bibinfo{author}{Roten, C.-A.~H.}
423 | \newblock \bibinfo{journal}{\bibinfo{title}{Comparative genometrics ({CG}): a
424 |   database dedicated to biometric comparisons of whole genomes}}.
425 | \newblock {\emph{\JournalTitle{Nucleic Acids Research}}}
426 |   \textbf{\bibinfo{volume}{30}}, \bibinfo{pages}{142--144},
427 |   https://doi.org/10.1093/nar/30.1.142 (\bibinfo{year}{2002}).
428 | 
429 | \bibitem{Zhang2003}
430 | \bibinfo{author}{Zhang, C.-T.}, \bibinfo{author}{Zhang, R.} \&
431 |   \bibinfo{author}{Ou, H.-Y.}
432 | \newblock \bibinfo{journal}{\bibinfo{title}{The z curve database: a graphic
433 |   representation of genome sequences}}.
434 | \newblock {\emph{\JournalTitle{Bioinformatics}}} \textbf{\bibinfo{volume}{19}},
435 |   \bibinfo{pages}{593--599}, https://doi.org/10.1093/bioinformatics/btg041
436 |   (\bibinfo{year}{2003}).
437 | 
438 | \bibitem{Thomas2007}
439 | \bibinfo{author}{Thomas, J.~M.}, \bibinfo{author}{Horspool, D.},
440 |   \bibinfo{author}{Brown, G.}, \bibinfo{author}{Tcherepanov, V.} \&
441 |   \bibinfo{author}{Upton, C.}
442 | \newblock \bibinfo{journal}{\bibinfo{title}{{GraphDNA}: a java program for
443 |   graphical display of {DNA} composition analyses}}.
444 | \newblock {\emph{\JournalTitle{{BMC} Bioinformatics}}}
445 |   \textbf{\bibinfo{volume}{8}}, https://doi.org/10.1186/1471-2105-8-21
446 |   (\bibinfo{year}{2007}).
447 | 
448 | \bibitem{Grigoriev1998}
449 | \bibinfo{author}{Grigoriev, A.}
450 | \newblock \bibinfo{journal}{\bibinfo{title}{Analyzing genomes with cumulative
451 |   skew diagrams}}.
452 | \newblock {\emph{\JournalTitle{Nucleic Acids Research}}}
453 |   \textbf{\bibinfo{volume}{26}}, \bibinfo{pages}{2286--2290},
454 |   https://doi.org/10.1093/nar/26.10.2286 (\bibinfo{year}{1998}).
455 | 
456 | \bibitem{Nelder1965}
457 | \bibinfo{author}{Nelder, J.~A.} \& \bibinfo{author}{Mead, R.}
458 | \newblock \bibinfo{journal}{\bibinfo{title}{A simplex method for function
459 |   minimization}}.
460 | \newblock {\emph{\JournalTitle{The Computer Journal}}}
461 |   \textbf{\bibinfo{volume}{7}}, \bibinfo{pages}{308--313},
462 |   https://doi.org/10.1093/comjnl/7.4.308 (\bibinfo{year}{1965}).
463 | 
464 | \bibitem{https://doi.org/10.5061/dryad.g4f4qrfr6}
465 | \bibinfo{author}{Hubert, B.}
466 | \newblock \bibinfo{title}{{SkewDB}: A comprehensive database of {GC} and 10 other
467 |   skews for over 28,000 chromosomes and plasmids},
468 |   https://doi.org/10.5061/DRYAD.G4F4QRFR6 (\bibinfo{year}{2021}).
469 | 
470 | \bibitem{Kluyver:2016aa}
471 | \bibinfo{author}{Kluyver, T.} \emph{et~al.}
472 | \newblock \bibinfo{title}{Jupyter notebooks -- a publishing format for
473 |   reproducible computational workflows}.
474 | \newblock In \bibinfo{editor}{Loizides, F.} \& \bibinfo{editor}{Schmidt, B.}
475 |   (eds.) \emph{\bibinfo{booktitle}{Positioning and Power in Academic
476 |   Publishing: Players, Agents and Agendas}}, \bibinfo{pages}{87 -- 90}
477 |   (\bibinfo{organization}{IOS Press}, \bibinfo{year}{2016}).
478 | 
479 | \bibitem{Hunter:2007}
480 | \bibinfo{author}{Hunter, J.~D.}
481 | \newblock \bibinfo{journal}{\bibinfo{title}{Matplotlib: A 2d graphics
482 |   environment}}.
483 | \newblock {\emph{\JournalTitle{Computing in Science \& Engineering}}}
484 |   \textbf{\bibinfo{volume}{9}}, \bibinfo{pages}{90--95},
485 |   https://doi.org/10.1109/MCSE.2007.55 (\bibinfo{year}{2007}).
486 | 
487 | \bibitem{jeff_reback_2021_5203279}
488 | \bibinfo{author}{Reback, J.} \emph{et~al.}
489 | \newblock \bibinfo{title}{pandas-dev/pandas: Pandas 1.3.2},
490 |   https://doi.org/10.5281/zenodo.5203279 (\bibinfo{year}{2021}).
491 | 
492 | \bibitem{https://doi.org/10.5281/zenodo.5516524}
493 | \bibinfo{author}{Hubert, B.} \& \bibinfo{author}{{Beaumont Lab}}.
494 | \newblock \bibinfo{title}{berthubert/antonie2: Skewversion 1.0},
495 |   https://doi.org/10.5281/ZENODO.5516524 (\bibinfo{year}{2021}).
496 | 
497 | \bibitem{hol_density-dependent_2016}
498 | \bibinfo{author}{Hol, F. J.~H.}, \bibinfo{author}{Hubert, B.},
499 |   \bibinfo{author}{Dekker, C.} \& \bibinfo{author}{Keymer, J.~E.}
500 | \newblock \bibinfo{journal}{\bibinfo{title}{Density-dependent adaptive
501 |   resistance allows swimming bacteria to colonize an antibiotic gradient}}.
502 | \newblock {\emph{\JournalTitle{The ISME Journal}}}
503 |   \textbf{\bibinfo{volume}{10}}, \bibinfo{pages}{30--38},
504 |   https://doi.org/10.1038/ismej.2015.107 (\bibinfo{year}{2016}).
505 | 
506 | \end{thebibliography}
507 | 
508 | 
509 | \section*{Acknowledgements} 
510 | 
511 | I would like to thank Bertus Beaumont for helping me to think like a biologist, and Jason Piper for regularly pointing me to the relevant literature. In addition, I am grateful that Felix Hol kindly allowed me to field test my software on his DNA sequences \cite{hol_density-dependent_2016}. Twitter users $@$halvorz and $@$Suddenly\_a\_goat also provided valuable feedback.
512 | 
513 | \section*{Author contributions statement}
514 | 
515 | B.H. did all the work.
516 | 
517 | \section*{Competing interests}
518 | The author declares no competing interests.
519 | 
520 | \section*{Figures \& Tables}
521 | \newcommand{\comment}[1]{}
522 | 
523 | Figures:
524 | \begin{enumerate}
525 | \item Sample graph showing \emph{Skew}DB data for \emph{Lactiplantibacillus plantarum} strain LZ95 chromosome
526 | \item Scatter graph of 25,000 chromosomes by superphylum, GC skew versus TA skew
527 | \item Predicted versus actual GC/TA skew for 4093 Firmicutes
528 | \item Scatter graph of codon/strand bias versus GC/TA skew for \emph{C. difficile}
529 | \item Chromosomes with asymmetric skews
530 | \item Chromosomes with differing strand lengths
531 | \item GC and TA skew for \emph{Salmonella enterica subsp. enterica serovar Concord} strain AR-0407
532 | \item \emph{Skew}DB fits for 16 equal sized quality categories of bacterial chromosomes
533 | \end{enumerate}
534 | 
535 | Tables:
536 | \begin{enumerate}
537 | \item Fields of skplot.csv
538 | \item  Skew metrics
539 | \item Fields in codongc.csv
540 | \item  Fields in genomes.csv
541 | \end{enumerate}
542 | \comment{
543 | }
544 | 
545 | \end{document}
546 | 


--------------------------------------------------------------------------------