├── .gitignore
├── Bio3D
    ├── count.R
    ├── distance.R
    └── parse_pdb.R
├── BioJava
    ├── README.md
    ├── pom.xml
    └── src
    │   ├── main
    │       └── java
    │       │   └── com
    │       │       └── jgreener
    │       │           └── pdb
    │       │               ├── parse_mmcif.java
    │       │               ├── parse_mmtf.java
    │       │               └── parse_pdb.java
    │   └── test
    │       └── java
    │           └── com
    │               └── jgreener
    │                   └── pdb
    │                       └── AppTest.java
├── BioPerl
    ├── count.pl
    ├── distance.pl
    └── parse_pdb.pl
├── BioRuby
    ├── count.rb
    ├── distance.rb
    └── parse_pdb.rb
├── BioStructures
    ├── count.jl
    ├── distance.jl
    ├── parse_mmcif.jl
    ├── parse_mmtf.jl
    ├── parse_pdb.jl
    └── ramachandran.jl
├── Biopython
    ├── count.py
    ├── distance.py
    ├── parse_mmcif.py
    ├── parse_mmtf.py
    ├── parse_pdb.py
    └── ramachandran.py
├── CITATION.bib
├── ESBTL
    ├── CMakeLists.txt
    ├── README.md
    └── parse_pdb.cc
├── GEMMI
    ├── Makefile
    ├── README.md
    ├── count.cc
    ├── distance.cc
    ├── parse_mmcif.cc
    └── parse_pdb.cc
├── LICENSE
├── MDAnalysis
    ├── count.py
    ├── distance.py
    ├── parse_pdb.py
    └── ramachandran.py
├── MIToS
    ├── count.jl
    ├── distance.jl
    └── parse_pdb.jl
├── ProDy
    ├── count.py
    ├── distance.py
    ├── parse_pdb.py
    └── ramachandran.py
├── README.md
├── Rpdb
    ├── count.R
    ├── distance.R
    ├── parse_pdb.R
    └── ramachandran.R
├── Victor
    ├── Makefile
    ├── README.md
    └── parse_pdb.cc
├── atomium
    ├── parse_mmcif.py
    ├── parse_mmtf.py
    └── parse_pdb.py
├── benchmarks.csv
├── biotite
    ├── parse_mmcif.py
    ├── parse_mmtf.py
    └── parse_pdb.py
├── checkwholepdb
    ├── checknewpdb.jl
    ├── checkwholepdb.jl
    └── checkwholepdb.py
├── chemfiles
    ├── Makefile
    ├── README.md
    ├── count.cpp
    ├── count.py
    ├── distance.cpp
    ├── distance.py
    ├── parse_mmcif.cpp
    ├── parse_mmcif.py
    ├── parse_mmtf.cpp
    ├── parse_mmtf.py
    ├── parse_pdb.cpp
    ├── parse_pdb.py
    ├── ramachandran.cpp
    └── ramachandran.py
├── plot
    ├── plot.jl
    └── plot.png
└── tools
    ├── download_data.jl
    ├── mean.py
    ├── run_benchmarks.sh
    └── table.jl


/.gitignore:
--------------------------------------------------------------------------------
 1 | data/
 2 | *.dat
 3 | .Rhistory
 4 | BioJava/target
 5 | BioJava/dependency-reduced-pom.xml
 6 | GEMMI/gemmi
 7 | GEMMI/parse_pdb
 8 | GEMMI/parse_mmcif
 9 | GEMMI/count
10 | GEMMI/distance
11 | Victor/*.o
12 | Victor/parse_pdb
13 | ESBTL/CMakeFiles
14 | ESBTL/CMakeCache.txt
15 | ESBTL/Makefile
16 | ESBTL/cmake_install.cmake
17 | ESBTL/parse_pdb
18 | chemfiles/parse_pdb
19 | chemfiles/parse_mmcif
20 | chemfiles/parse_mmtf
21 | chemfiles/count
22 | chemfiles/distance
23 | chemfiles/ramachandran
24 | *~
25 | *.swp
26 | todo.txt
27 | 


--------------------------------------------------------------------------------
/Bio3D/count.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | library(bio3d)
 4 | library(microbenchmark)
 5 | 
 6 | pdb_filepath <- "data/1AKE.pdb"
 7 | struc <- read.pdb(pdb_filepath, multi=TRUE)
 8 | 
 9 | # Count the distinct alanine residues in the globally loaded `struc`.
10 | # The atom table holds one row per atom, so alanine rows are collapsed to one
11 | # key per residue. The key includes the insertion code so that residues which
12 | # share a number within a chain (e.g. 52 and 52A) are counted separately.
13 | count <- function() {
14 |     atoms <- struc$atom
15 |     # which() drops rows whose residue name is NA instead of indexing with NA
16 |     ala_rows <- which(atoms$resid == "ALA")
17 |     keys <- paste(atoms$resno[ala_rows], atoms$chain[ala_rows],
18 |                   atoms$insert[ala_rows], sep=":")
19 |     length(unique(keys))
20 | }
21 | 
22 | bench <- microbenchmark(count(), times=1)
23 | 
24 | # bench$time is divided by 10^9 to print seconds
25 | cat(bench$time / 10^9, "\n", sep="")
26 | 


--------------------------------------------------------------------------------
/Bio3D/distance.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | library(bio3d)
 6 | library(microbenchmark)
 7 | 
 8 | pdb_filepath <- "data/1AKE.pdb"
 9 | struc <- read.pdb(pdb_filepath)
10 | 
11 | # Smallest atom-atom distance between residues 50 and 60 of chain A, computed
12 | # from the globally loaded `struc`
13 | distance <- function() {
14 |     # Reshape the flat coordinate vector into one x, y, z row per atom
15 |     xyz <- matrix(struc$xyz, nrow=length(struc$xyz) / 3, ncol=3, byrow=TRUE)
16 |     atoms <- struc$atom
17 |     rows_50 <- which(atoms$resno == 50 & atoms$chain == "A")
18 |     rows_60 <- which(atoms$resno == 60 & atoms$chain == "A")
19 |     pair_dists <- dist.xyz(xyz[rows_50,], xyz[rows_60,])
20 |     return(min(pair_dists))
21 | }
22 | 
23 | elapsed_ns <- microbenchmark(distance(), times=1)$time
24 | 
25 | cat(elapsed_ns / 10^9, "\n", sep="")
26 | 


--------------------------------------------------------------------------------
/Bio3D/parse_pdb.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | library(bio3d)
 4 | library(microbenchmark)
 5 | 
 6 | args <- commandArgs(trailingOnly=TRUE)
 7 | # Without this check a missing argument becomes NA and is passed on to
 8 | # read.pdb() as if it were a file path
 9 | if (length(args) < 1) {
10 |     stop("Usage: Rscript parse_pdb.R <pdb_filepath>", call.=FALSE)
11 | }
12 | pdb_filepath <- args[1]
13 | 
14 | bench <- microbenchmark(read.pdb(pdb_filepath, multi=TRUE), times=1)
15 | 
16 | # bench$time is divided by 10^9 to print seconds
17 | cat(bench$time / 10^9, "\n", sep="")
18 | 


--------------------------------------------------------------------------------
/BioJava/README.md:
--------------------------------------------------------------------------------
1 | Build with `mvn package`.
2 | 


--------------------------------------------------------------------------------
/BioJava/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <groupId>com.jgreener.pdb</groupId>
 5 |   <artifactId>pdb-benchmarks</artifactId>
 6 |   <packaging>jar</packaging>
 7 |   <version>1.0-SNAPSHOT</version>
 8 |   <name>pdb-benchmarks</name>
 9 |   <url>http://maven.apache.org</url>
10 |   <dependencies>
11 |     <dependency>
12 |       <groupId>org.biojava</groupId>
13 |       <artifactId>biojava-structure</artifactId>
14 |       <version>5.3.0</version>
15 |     </dependency>
16 |     <dependency>
17 |       <groupId>junit</groupId>
18 |       <artifactId>junit</artifactId>
19 |       <version>4.13.1</version>
20 |       <scope>test</scope>
21 |     </dependency>
22 |   </dependencies>
23 |   <build>
24 |     <plugins>
25 |       <plugin>
26 |         <groupId>org.apache.maven.plugins</groupId>
27 |         <artifactId>maven-shade-plugin</artifactId>
28 |         <version>3.2.2</version>
29 |         <executions>
30 |           <execution>
31 |             <phase>package</phase>
32 |             <goals>
33 |               <goal>shade</goal>
34 |             </goals>
35 |           </execution>
36 |         </executions>
37 |       </plugin>
38 |     </plugins>
39 |   </build>
40 | </project>
41 | 


--------------------------------------------------------------------------------
/BioJava/src/main/java/com/jgreener/pdb/parse_mmcif.java:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | package com.jgreener.pdb;
 4 | 
 5 | import org.biojava.nbio.structure.Structure;
 6 | import org.biojava.nbio.structure.io.StructureIOFile;
 7 | import org.biojava.nbio.structure.io.MMCIFFileReader;
 8 | 
 9 | /**
10 |  * Times one parse of the mmCIF file named by the first command line argument
11 |  * and prints the elapsed time in seconds.
12 |  */
13 | public class parse_mmcif
14 | {
15 |     public static void main( String[] args )
16 |     {
17 |         if (args.length < 1) {
18 |             System.err.println("Usage: parse_mmcif <mmcif_filepath>");
19 |             System.exit(1);
20 |         }
21 |         String mmcif_filepath = args[0];
22 |         try {
23 |             // Run once to trigger illegal reflective access warning
24 |             StructureIOFile reader1 = new MMCIFFileReader();
25 |             reader1.getStructure(mmcif_filepath);
26 | 
27 |             long startTime = System.nanoTime();
28 |             StructureIOFile reader2 = new MMCIFFileReader();
29 |             Structure structure = reader2.getStructure(mmcif_filepath);
30 |             long endTime = System.nanoTime();
31 |             System.out.println((endTime - startTime) / 1000000000.0);
32 |         } catch (Exception e) {
33 |             // A failed parse has no meaningful duration, so report the error
34 |             // and exit non-zero instead of printing a time
35 |             e.printStackTrace();
36 |             System.exit(1);
37 |         }
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/BioJava/src/main/java/com/jgreener/pdb/parse_mmtf.java:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a MMTF file given as an argument
 2 | 
 3 | package com.jgreener.pdb;
 4 | 
 5 | import org.biojava.nbio.structure.Structure;
 6 | import org.biojava.nbio.structure.io.mmtf.MmtfActions;
 7 | 
 8 | import java.nio.file.Path;
 9 | import java.nio.file.Paths;
10 | 
11 | /**
12 |  * Times one parse of the MMTF file named by the first command line argument
13 |  * and prints the elapsed time in seconds.
14 |  */
15 | public class parse_mmtf
16 | {
17 |     public static void main( String[] args )
18 |     {
19 |         if (args.length < 1) {
20 |             System.err.println("Usage: parse_mmtf <mmtf_filepath>");
21 |             System.exit(1);
22 |         }
23 |         String mmtf_filepath = args[0];
24 |         try {
25 |             // Run once to trigger illegal reflective access warning
26 |             MmtfActions.readFromFile(Paths.get(mmtf_filepath));
27 | 
28 |             long startTime = System.nanoTime();
29 |             Path mmtf_path = Paths.get(mmtf_filepath);
30 |             Structure structure = MmtfActions.readFromFile(mmtf_path);
31 |             long endTime = System.nanoTime();
32 |             System.out.println((endTime - startTime) / 1000000000.0);
33 |         } catch (Exception e) {
34 |             // A failed parse has no meaningful duration, so report the error
35 |             // and exit non-zero instead of printing a time
36 |             e.printStackTrace();
37 |             System.exit(1);
38 |         }
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/BioJava/src/main/java/com/jgreener/pdb/parse_pdb.java:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | package com.jgreener.pdb;
 4 | 
 5 | import org.biojava.nbio.structure.Structure;
 6 | import org.biojava.nbio.structure.io.PDBFileReader;
 7 | 
 8 | /**
 9 |  * Times one parse of the PDB file named by the first command line argument and
10 |  * prints the elapsed time in seconds.
11 |  */
12 | public class parse_pdb
13 | {
14 |     public static void main( String[] args )
15 |     {
16 |         if (args.length < 1) {
17 |             System.err.println("Usage: parse_pdb <pdb_filepath>");
18 |             System.exit(1);
19 |         }
20 |         String pdb_filepath = args[0];
21 |         try {
22 |             // Run once to trigger illegal reflective access warning
23 |             PDBFileReader pdbreader1 = new PDBFileReader();
24 |             pdbreader1.getStructure(pdb_filepath);
25 | 
26 |             long startTime = System.nanoTime();
27 |             PDBFileReader pdbreader2 = new PDBFileReader();
28 |             Structure structure = pdbreader2.getStructure(pdb_filepath);
29 |             long endTime = System.nanoTime();
30 |             System.out.println((endTime - startTime) / 1000000000.0);
31 |         } catch (Exception e) {
32 |             // A failed parse has no meaningful duration, so report the error
33 |             // and exit non-zero instead of printing a time
34 |             e.printStackTrace();
35 |             System.exit(1);
36 |         }
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/BioJava/src/test/java/com/jgreener/pdb/AppTest.java:
--------------------------------------------------------------------------------
 1 | package com.jgreener.pdb;
 2 | 
 3 | import junit.framework.Test;
 4 | import junit.framework.TestCase;
 5 | import junit.framework.TestSuite;
 6 | 
 7 | /**
 8 |  * Placeholder JUnit test case for the benchmark project.
 9 |  */
10 | public class AppTest extends TestCase {
11 | 
12 |     /**
13 |      * Creates the test case.
14 |      *
15 |      * @param testName name of the test case
16 |      */
17 |     public AppTest(String testName) {
18 |         super(testName);
19 |     }
20 | 
21 |     /**
22 |      * @return a suite built from the tests declared in this class
23 |      */
24 |     public static Test suite() {
25 |         return new TestSuite(AppTest.class);
26 |     }
27 | 
28 |     /**
29 |      * Trivial test that always passes.
30 |      */
31 |     public void testApp() {
32 |         assertTrue(true);
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/BioPerl/count.pl:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | use Bio::Structure::IO;
 4 | use Time::HiRes qw(time);
 5 | use strict;
 6 | 
 7 | my $pdb_filepath = "data/1AKE.pdb";
 8 | my $structio = Bio::Structure::IO->new(-file => $pdb_filepath);
 9 | my $struc = $structio->next_structure;
10 | 
11 | # Count the residues of the loaded structure whose id starts with "ALA"
12 | sub count {
13 |     my $total = 0;
14 |     foreach my $chain ($struc->get_chains) {
15 |         # grep in scalar context gives the number of matching residues
16 |         $total += grep { substr($_->id, 0, 3) eq "ALA" } $struc->get_residues($chain);
17 |     }
18 |     return $total;
19 | }
20 | 
21 | my $start = time();
22 | count();
23 | my $elapsed = time() - $start;
24 | 
25 | print $elapsed, "\n";
26 | 


--------------------------------------------------------------------------------
/BioPerl/distance.pl:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | use Bio::Structure::IO;
 6 | use Time::HiRes qw(time);
 7 | use strict;
 8 | 
 9 | my $pdb_filepath = "data/1AKE.pdb";
10 | my $structio = Bio::Structure::IO->new(-file => $pdb_filepath);
11 | my $struc = $structio->next_structure;
12 | 
13 | # Closest distance between any atom of the chain A residue whose id ends in
14 | # "-50" and any atom of the one whose id ends in "-60"
15 | sub distance {
16 |     my (@coords_50, @coords_60);
17 |     foreach my $chain (grep { $_->id eq "A" } $struc->get_chains) {
18 |         foreach my $res ($struc->get_residues($chain)) {
19 |             my $suffix = substr($res->id, -3, 3);
20 |             my $target;
21 |             if ($suffix eq "-50") {
22 |                 $target = \@coords_50;
23 |             } elsif ($suffix eq "-60") {
24 |                 $target = \@coords_60;
25 |             } else {
26 |                 next;
27 |             }
28 |             push @$target, map { [$_->xyz] } $struc->get_atoms($res);
29 |         }
30 |     }
31 |     # Squared distances are compared; the square root is taken once at the end
32 |     my $min_sq_dist = "Infinity";
33 |     foreach my $p (@coords_50) {
34 |         foreach my $q (@coords_60) {
35 |             my $sq_dist = ($p->[0]-$q->[0]) ** 2 + ($p->[1]-$q->[1]) ** 2 + ($p->[2]-$q->[2]) ** 2;
36 |             $min_sq_dist = $sq_dist if $sq_dist < $min_sq_dist;
37 |         }
38 |     }
39 |     return sqrt($min_sq_dist);
40 | }
41 | 
42 | my $start = time();
43 | distance();
44 | my $elapsed = time() - $start;
45 | 
46 | print $elapsed, "\n";
47 | 


--------------------------------------------------------------------------------
/BioPerl/parse_pdb.pl:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | use Bio::Structure::IO;
 4 | use Time::HiRes qw(time);
 5 | use strict;
 6 | 
 7 | my $pdb_filepath = $ARGV[0];
 8 | # Stop early rather than handing an undefined path to the parser
 9 | defined $pdb_filepath or die "Usage: perl parse_pdb.pl <pdb_filepath>\n";
10 | 
11 | # Open the file and read its first structure
12 | sub parse {
13 |     my $structio = Bio::Structure::IO->new(-file => $pdb_filepath);
14 |     return $structio->next_structure;
15 | }
16 | 
17 | my $start = time();
18 | parse();
19 | my $end = time();
20 | 
21 | print $end - $start, "\n";
22 | 


--------------------------------------------------------------------------------
/BioRuby/count.rb:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | require "bio"
 4 | require "benchmark"
 5 | 
 6 | pdb_filepath = "data/1AKE.pdb"
 7 | pdb = Bio::PDB.new(File.read(pdb_filepath))
 8 | 
 9 | elapsed = Benchmark.realtime {
10 |     pdb.find_residue { |res| res.resName == "ALA" }.length
11 | }
12 | 
13 | print elapsed, "\n"
14 | 


--------------------------------------------------------------------------------
/BioRuby/distance.rb:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | require "bio"
 6 | require "benchmark"
 7 | include Bio::PDB::Utils
 8 | 
 9 | pdb_filepath = "data/1AKE.pdb"
10 | pdb = Bio::PDB.new(File.read(pdb_filepath))
11 | 
12 | elapsed = Benchmark.realtime {
13 |     res_50 = pdb.find_residue { |res| res.resSeq == 50 and res.chain.id == "A" }[0]
14 |     res_60 = pdb.find_residue { |res| res.resSeq == 60 and res.chain.id == "A" }[0]
15 |     min_dist = Float::INFINITY
16 |     res_50.each_atom do |atom_50|
17 |         res_60.each_atom do |atom_60|
18 |             if distance(atom_50, atom_60) < min_dist
19 |                 min_dist = distance(atom_50, atom_60)
20 |             end
21 |         end
22 |     end
23 |     min_dist
24 | }
25 | 
26 | print elapsed, "\n"
27 | 


--------------------------------------------------------------------------------
/BioRuby/parse_pdb.rb:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | require "bio"
 4 | require "benchmark"
 5 | 
 6 | pdb_filepath = ARGV[0]
 7 | 
 8 | elapsed = Benchmark.realtime {
 9 |     Bio::PDB.new(File.read(pdb_filepath))
10 | }
11 | 
12 | print elapsed, "\n"
13 | 


--------------------------------------------------------------------------------
/BioStructures/count.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | using BioStructures
 4 | 
 5 | pdb_filepath = "data/1AKE.pdb"
 6 | struc = read(pdb_filepath, PDB)
 7 | 
 8 | # Count the residues named ALA in the globally loaded `struc`
 9 | function counter()
10 |     isalanine = res -> resnameselector(res, ["ALA"])
11 |     return countresidues(struc, isalanine)
12 | end
13 | 
14 | # Warm-up call so that JIT compilation is not part of the timing
15 | counter()
16 | 
17 | println(@elapsed counter())
18 | 


--------------------------------------------------------------------------------
/BioStructures/distance.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | using BioStructures
 6 | 
 7 | pdb_filepath = "data/1AKE.pdb"
 8 | struc = read(pdb_filepath, PDB)
 9 | 
10 | # Warm-up call so that JIT compilation is not part of the timing
11 | distance(struc['A'][50], struc['A'][60])
12 | 
13 | # The residue lookups stay inside the timed expression
14 | println(@elapsed distance(struc['A'][50], struc['A'][60]))
15 | 


--------------------------------------------------------------------------------
/BioStructures/parse_mmcif.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | using BioStructures
 4 | 
 5 | mmcif_filepath = ARGS[1]
 6 | 
 7 | # Warm-up parse so that JIT compilation is not part of the timing
 8 | read(mmcif_filepath, MMCIF)
 9 | 
10 | # Time a second parse of the same file and print the seconds taken
11 | println(@elapsed read(mmcif_filepath, MMCIF))
12 | 


--------------------------------------------------------------------------------
/BioStructures/parse_mmtf.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a MMTF file given as an argument
 2 | 
 3 | using BioStructures
 4 | 
 5 | mmtf_filepath = ARGS[1]
 6 | 
 7 | # Warm-up parse so that JIT compilation is not part of the timing
 8 | read(mmtf_filepath, MMTF)
 9 | 
10 | # Time a second parse of the same file and print the seconds taken
11 | println(@elapsed read(mmtf_filepath, MMTF))
12 | 


--------------------------------------------------------------------------------
/BioStructures/parse_pdb.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | using BioStructures
 4 | 
 5 | pdb_filepath = ARGS[1]
 6 | 
 7 | # Warm-up parse so that JIT compilation is not part of the timing
 8 | read(pdb_filepath, PDB)
 9 | 
10 | # Time a second parse of the same file and print the seconds taken
11 | println(@elapsed read(pdb_filepath, PDB))
12 | 


--------------------------------------------------------------------------------
/BioStructures/ramachandran.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | 
 3 | using BioStructures
 4 | 
 5 | pdb_filepath = "data/1AKE.pdb"
 6 | struc = read(pdb_filepath, PDB)
 7 | 
 8 | # Warm-up call so that JIT compilation is not part of the timing
 9 | ramachandranangles(struc, standardselector)
10 | 
11 | # Time a second calculation on the same structure and print the seconds taken
12 | println(@elapsed ramachandranangles(struc, standardselector))
13 | 


--------------------------------------------------------------------------------
/Biopython/count.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | import time
 4 | from Bio.PDB import PDBParser
 5 | 
 6 | pdb_filepath = "data/1AKE.pdb"
 7 | parser = PDBParser()
 8 | struc = parser.get_structure("", pdb_filepath)
 9 | 
10 | def count():
11 |     count = 0
12 |     for res in struc.get_residues():
13 |         if res.get_resname() == "ALA":
14 |             count += 1
15 |     return count
16 | 
17 | start = time.time()
18 | count()
19 | end = time.time()
20 | 
21 | print(end - start)
22 | 


--------------------------------------------------------------------------------
/Biopython/distance.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | import time
 6 | from Bio.PDB import PDBParser
 7 | 
 8 | pdb_filepath = "data/1AKE.pdb"
 9 | parser = PDBParser()
10 | struc = parser.get_structure("", pdb_filepath)
11 | 
12 | def distance():
13 |     min_dist = float("inf")
14 |     for atom_a in struc[0]['A'][50]:
15 |         for atom_b in struc[0]['A'][60]:
16 |             if atom_a - atom_b < min_dist:
17 |                 min_dist = atom_a - atom_b
18 |     return min_dist
19 | 
20 | start = time.time()
21 | distance()
22 | end = time.time()
23 | 
24 | print(end - start)
25 | 


--------------------------------------------------------------------------------
/Biopython/parse_mmcif.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from Bio.PDB import MMCIFParser
 6 | 
 7 | mmcif_filepath = sys.argv[1]
 8 | parser = MMCIFParser()
 9 | 
10 | start = time.time()
11 | parser.get_structure("", mmcif_filepath)
12 | end = time.time()
13 | 
14 | print(end - start)
15 | 


--------------------------------------------------------------------------------
/Biopython/parse_mmtf.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a MMTF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from Bio.PDB.mmtf import MMTFParser
 6 | 
 7 | mmtf_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | MMTFParser.get_structure(mmtf_filepath)
11 | end = time.time()
12 | 
13 | print(end - start)
14 | 


--------------------------------------------------------------------------------
/Biopython/parse_pdb.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from Bio.PDB import PDBParser
 6 | 
 7 | pdb_filepath = sys.argv[1]
 8 | parser = PDBParser()
 9 | 
10 | start = time.time()
11 | parser.get_structure("", pdb_filepath)
12 | end = time.time()
13 | 
14 | print(end - start)
15 | 


--------------------------------------------------------------------------------
/Biopython/ramachandran.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | 
 3 | import time
 4 | from Bio.PDB import PDBParser
 5 | from Bio.PDB.vectors import calc_dihedral
 6 | 
 7 | pdb_filepath = "data/1AKE.pdb"
 8 | parser = PDBParser()
 9 | struc = parser.get_structure("", pdb_filepath)
10 | 
11 | def ramachandran():
12 |     phi_angles = []
13 |     psi_angles = []
14 |     residues = list(struc.get_residues())
15 |     for i in range(1, len(residues) - 1):
16 |         res = residues[i]
17 |         res_prev = residues[i - 1]
18 |         res_next = residues[i + 1]
19 |         # Check residues have sequential residue numbers
20 |         if res.get_id()[1] == res_prev.get_id()[1] + 1 and res_next.get_id()[1] == res.get_id()[1] + 1:
21 |             try:
22 |                 phi_angle = calc_dihedral(res_prev["C"].get_vector(), res["N"].get_vector(), res["CA"].get_vector(), res["C"].get_vector())
23 |                 psi_angle = calc_dihedral(res["N"].get_vector(), res["CA"].get_vector(), res["C"].get_vector(), res_next["N"].get_vector())
24 |                 phi_angles.append(phi_angle)
25 |                 psi_angles.append(psi_angle)
26 |             except:
27 |                 pass
28 |     return phi_angles, psi_angles
29 | 
30 | start = time.time()
31 | ramachandran()
32 | end = time.time()
33 | 
34 | print(end - start)
35 | 


--------------------------------------------------------------------------------
/CITATION.bib:
--------------------------------------------------------------------------------
 1 | @article{BioStructures.jl-2020,
 2 |     author="Greener, J G and Selvaraj, J and Ward, B J",
 3 |     title="{BioStructures.jl: read, write and manipulate macromolecular structures in Julia}",
 4 |     journal="Bioinformatics",
 5 |     year="2020",
 6 |     volume="36",
 7 |     number="14",
 8 |     pages="4206--4207",
 9 |     doi="10.1093/bioinformatics/btaa502",
10 | }
11 | 


--------------------------------------------------------------------------------
/ESBTL/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # cmake file for ESBTL benchmarks
2 | # ESBTL_DIR needs to be set as the ESBTL root directory
3 | project( benchmarks )
4 | CMAKE_MINIMUM_REQUIRED(VERSION 2.4.5)
5 | include_directories($ESBTL_DIR/include/)
6 | add_executable(parse_pdb parse_pdb.cc)
7 | 


--------------------------------------------------------------------------------
/ESBTL/README.md:
--------------------------------------------------------------------------------
1 | ESBTL_DIR needs to be set as the ESBTL root directory.
2 | CPLUS_INCLUDE_PATH needs to include the ESBTL library path.
3 | Compile with `cmake .` then `make`.
4 | 


--------------------------------------------------------------------------------
/ESBTL/parse_pdb.cc:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | #include <ESBTL/default.h>
 4 | #include <cstdio>
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <time.h>
 9 | 
10 | // Parse the PDB file named by argv[1] once and print the seconds taken
11 | int main( int argc, char* argv[] ) {
12 |     // argv[1] is a null pointer when no argument is given, which must not be
13 |     // used to construct a std::string
14 |     if (argc < 2) {
15 |         fprintf(stderr, "Usage: parse_pdb <pdb_filepath>\n");
16 |         return 1;
17 |     }
18 |     std::string pdb_filepath = argv[1];
19 |     // Put std::cout in a failed state so nothing is written to it during the
20 |     // parse; it is cleared again before the result is printed
21 |     std::cout.setstate(std::ios_base::failbit);
22 |     struct timespec tstart, tend;
23 |     // The monotonic clock is not affected by system clock adjustments
24 |     clock_gettime(CLOCK_MONOTONIC, &tstart);
25 |     ESBTL::PDB_line_selector sel;
26 |     std::vector<ESBTL::Default_system> systems;
27 |     ESBTL::All_atom_system_builder<ESBTL::Default_system> builder(systems, sel.max_nb_systems());
28 |     ESBTL::read_a_pdb_file(pdb_filepath, sel, builder, ESBTL::Accept_none_occupancy_policy<ESBTL::PDB::Line_format<> >());
29 |     clock_gettime(CLOCK_MONOTONIC, &tend);
30 |     std::cout.clear();
31 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec)/1E9);
32 |     return 0;
33 | }
34 | 


--------------------------------------------------------------------------------
/GEMMI/Makefile:
--------------------------------------------------------------------------------
1 | TARGETS=parse_pdb parse_mmcif count distance
2 | all: ${TARGETS}
3 | 
4 | %: %.cc
5 | 	${CXX} -std=c++11 -lrt -O2 -Igemmi/include $< -o $@
6 | 
7 | clean:
8 | 	rm -rf ${TARGETS}
9 | 


--------------------------------------------------------------------------------
/GEMMI/README.md:
--------------------------------------------------------------------------------
 1 | Compile with:
 2 | ```
 3 | git clone https://github.com/project-gemmi/gemmi.git
 4 | make
 5 | ```
 6 | or
 7 | ```
 8 | c++ -std=c++11 -Igemmi/include -O2 parse_pdb.cc   -o parse_pdb
 9 | c++ -std=c++11 -Igemmi/include -O2 parse_mmcif.cc -o parse_mmcif
10 | # etc.
11 | ```
12 | 


--------------------------------------------------------------------------------
/GEMMI/count.cc:
--------------------------------------------------------------------------------
 1 | // Benchmark the counting of alanine residues in a PDB file
 2 | #include <assert.h>
 3 | #include <stdio.h>
 4 | #include <time.h>
 5 | #include <string>
 6 | #include <gemmi/pdb.hpp>
 7 | 
 8 | // Count the residues named ALA in every chain of the first model
 9 | static int count(const gemmi::Structure& st) {
10 |     int counter = 0;
11 |     const std::string resname = "ALA";
12 |     for (const gemmi::Chain& chain : st.first_model().chains)
13 |         for (const gemmi::Residue& residue : chain.residues)
14 |             if (residue.name == resname)
15 |                 ++counter;
16 |     return counter;
17 | }
18 | 
19 | int main() {
20 |     std::string pdb_filepath = "data/1AKE.pdb";
21 |     gemmi::Structure st = gemmi::read_pdb_file(pdb_filepath);
22 |     timespec tstart, tend;
23 |     // The monotonic clock is not affected by system clock adjustments
24 |     clock_gettime(CLOCK_MONOTONIC, &tstart);
25 |     int n = count(st);
26 |     clock_gettime(CLOCK_MONOTONIC, &tend);
27 |     // Checked explicitly rather than with assert() so the result is still
28 |     // used and verified when NDEBUG is defined
29 |     if (n != 38) {
30 |         fprintf(stderr, "Expected 38 alanine residues, found %d\n", n);
31 |         return 1;
32 |     }
33 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
34 |     return 0;
35 | }
36 | 


--------------------------------------------------------------------------------
/GEMMI/distance.cc:
--------------------------------------------------------------------------------
 1 | // Benchmark the calculation of a distance in a PDB file
 2 | // The distance is the closest distance between any atoms of residues 50 and 60
 3 | //   of chain A in 1AKE
 4 | #include <assert.h>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <time.h>
 8 | #include <cmath>
 9 | #include <string>
10 | #include <gemmi/model.hpp>
11 | #include <gemmi/pdb.hpp>
12 | 
13 | // Closest atom-atom distance between residues 50 and 60 of chain A in the
14 | // first model. Exits with status 1 if the model has no chain A.
15 | static double distance(gemmi::Structure& st) {
16 |     gemmi::Chain* chain_a = st.first_model().find_chain("A");
17 |     if (chain_a == nullptr) {
18 |         fprintf(stderr, "Chain A not found\n");
19 |         exit(1);
20 |     }
21 |     gemmi::Residue& r50 = chain_a->find_residue_group(gemmi::SeqId(50,' '))[0];
22 |     gemmi::Residue& r60 = chain_a->find_residue_group(gemmi::SeqId(60,' '))[0];
23 |     // Squared distances are compared; the square root is taken once at the end
24 |     double min_dist_sq = INFINITY;
25 |     for (const gemmi::Atom& atom_50: r50.atoms)
26 |         for (const gemmi::Atom& atom_60: r60.atoms) {
27 |             double d2 = atom_50.pos.dist_sq(atom_60.pos);
28 |             if (d2 < min_dist_sq)
29 |                 min_dist_sq = d2;
30 |         }
31 |     return std::sqrt(min_dist_sq);
32 | }
33 | 
34 | int main() {
35 |     std::string pdb_filepath = "data/1AKE.pdb";
36 |     gemmi::Structure st = gemmi::read_pdb_file(pdb_filepath);
37 |     timespec tstart, tend;
38 |     // The monotonic clock is not affected by system clock adjustments
39 |     clock_gettime(CLOCK_MONOTONIC, &tstart);
40 |     double d = distance(st);
41 |     clock_gettime(CLOCK_MONOTONIC, &tend);
42 |     // Checked explicitly rather than with assert() so the result is still
43 |     // used and verified when NDEBUG is defined
44 |     if (!(std::fabs(d - 9.57605) < 1e-5)) {
45 |         fprintf(stderr, "Expected a distance of 9.57605, found %.5f\n", d);
46 |         return 1;
47 |     }
48 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
49 |     return 0;
50 | }
51 | 


--------------------------------------------------------------------------------
/GEMMI/parse_mmcif.cc:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | #include <gemmi/mmread.hpp>
 4 | #include <cstdio>
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <time.h>
 8 | 
 9 | // Usage: parse_mmcif <mmcif_filepath>
10 | // Prints the elapsed time, in seconds, taken by gemmi::read_structure_file().
11 | // Returns 1 without timing anything when no file path is given.
12 | int main( int argc, char* argv[] ) {
13 |     // Without this check a missing argument would construct a std::string
14 |     // from a null pointer
15 |     if (argc < 2) {
16 |         fprintf(stderr, "Usage: parse_mmcif <mmcif_filepath>\n");
17 |         return 1;
18 |     }
19 |     std::string mmcif_filepath = argv[1];
20 |     struct timespec tstart, tend;
21 |     clock_gettime(CLOCK_REALTIME, &tstart);
22 |     gemmi::Structure st = gemmi::read_structure_file(mmcif_filepath);
23 |     clock_gettime(CLOCK_REALTIME, &tend);
24 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec)/1E9);
25 |     return 0;
26 | }
27 | 


--------------------------------------------------------------------------------
/GEMMI/parse_pdb.cc:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | #include <gemmi/mmread.hpp>
 4 | #include <cstdio>
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <time.h>
 8 | 
 9 | // Usage: parse_pdb <pdb_filepath>
10 | // Prints the elapsed time, in seconds, taken by gemmi::read_structure_file().
11 | // Returns 1 without timing anything when no file path is given.
12 | int main( int argc, char* argv[] ) {
13 |     // Without this check a missing argument would construct a std::string
14 |     // from a null pointer
15 |     if (argc < 2) {
16 |         fprintf(stderr, "Usage: parse_pdb <pdb_filepath>\n");
17 |         return 1;
18 |     }
19 |     std::string pdb_filepath = argv[1];
20 |     struct timespec tstart, tend;
21 |     clock_gettime(CLOCK_REALTIME, &tstart);
22 |     gemmi::Structure st = gemmi::read_structure_file(pdb_filepath);
23 |     clock_gettime(CLOCK_REALTIME, &tend);
24 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec)/1E9);
25 |     return 0;
26 | }
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Joe Greener
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MDAnalysis/count.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | import time
 4 | import MDAnalysis as mda
 5 | 
 6 | pdb_filepath = "data/1AKE.pdb"
 7 | u = mda.Universe(pdb_filepath)
 8 | 
 9 | def count():
10 |     return (u.residues.resnames == "ALA").sum()
11 | 
12 | start = time.time()
13 | count()
14 | end = time.time()
15 | 
16 | print(end - start)
17 | 


--------------------------------------------------------------------------------
/MDAnalysis/distance.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | import time
 6 | import MDAnalysis as mda
 7 | from MDAnalysis.lib.distances import distance_array
 8 | 
 9 | pdb_filepath = "data/1AKE.pdb"
10 | u = mda.Universe(pdb_filepath)
11 | 
12 | def distance():
13 |     segA = u.segments[0]
14 |     r50 = segA.atoms.select_atoms("resid 50")
15 |     r60 = segA.atoms.select_atoms("resid 60")
16 |     da = distance_array(r50.positions, r60.positions)
17 |     return da.min()
18 | 
19 | start = time.time()
20 | distance()
21 | end = time.time()
22 | 
23 | print(end - start)
24 | 


--------------------------------------------------------------------------------
/MDAnalysis/parse_pdb.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import MDAnalysis as mda
 6 | 
 7 | pdb_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | mda.coordinates.PDB.PDBReader(pdb_filepath)
11 | end = time.time()
12 | 
13 | print(end - start)
14 | 


--------------------------------------------------------------------------------
/MDAnalysis/ramachandran.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | 
 3 | import time
 4 | import MDAnalysis as mda
 5 | 
 6 | pdb_filepath = "data/1AKE.pdb"
 7 | u = mda.Universe(pdb_filepath)
 8 | 
 9 | def ramachandran():
10 |     phi_angles = []
11 |     psi_angles = []
12 |     for res in u.residues:
13 |         try:
14 |             phi = res.phi_selection()
15 |         except:
16 |             pass
17 |         else:
18 |             if not phi is None:
19 |                 phi_angles.append(phi.dihedral.value())
20 |         try:
21 |             psi = res.psi_selection()
22 |         except:
23 |             pass
24 |         else:
25 |             if not psi is None:
26 |                 psi_angles.append(psi.dihedral.value())
27 |     return phi_angles, psi_angles
28 | 
29 | start = time.time()
30 | ramachandran()
31 | end = time.time()
32 | 
33 | print(end - start)
34 | 


--------------------------------------------------------------------------------
/MIToS/count.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | using MIToS.PDB
 4 | 
 5 | pdb_filepath = "data/1AKE.pdb"
 6 | struc = read(pdb_filepath, PDBFile)
 7 | 
 8 | # Number of elements of `struc` whose residue name is "ALA"
 9 | function counter()
10 |     return count(struc) do res
11 |         res.id.name == "ALA"
12 |     end
13 | end
14 | 
15 | # Run to JIT compile
16 | counter()
17 | 
18 | # Time a second call, which no longer includes compilation
19 | println(@elapsed counter())
20 | 


--------------------------------------------------------------------------------
/MIToS/distance.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | using MIToS.PDB
 6 | 
 7 | pdb_filepath = "data/1AKE.pdb"
 8 | # Only ATOM records of chain A in model 1 are read
 9 | struc = read(pdb_filepath, PDBFile, model="1", chain="A", group="ATOM")
10 | 
11 | # NOTE(review): struc[50] and struc[60] index by position in `struc`; this
12 | #   presumably matches residue numbers 50 and 60 only if chain A is numbered
13 | #   from 1 without gaps - confirm
14 | 
15 | # Run to JIT compile
16 | distance(struc[50], struc[60])
17 | 
18 | # Time a second call, which no longer includes compilation
19 | println(@elapsed distance(struc[50], struc[60]))
20 | 


--------------------------------------------------------------------------------
/MIToS/parse_pdb.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | using MIToS.PDB
 4 | 
 5 | # Path of the PDB file to parse, taken from the first command-line argument
 6 | pdb_filepath = ARGS[1]
 7 | 
 8 | # Run to JIT compile
 9 | read(pdb_filepath, PDBFile)
10 | 
11 | # Time a second read, which no longer includes compilation
12 | seconds = @elapsed begin
13 |     struc = read(pdb_filepath, PDBFile)
14 | end
15 | 
16 | println(seconds)
17 | 


--------------------------------------------------------------------------------
/ProDy/count.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | import time
 4 | from prody import *
 5 | 
 6 | pdb_filepath = "data/1AKE.pdb"
 7 | struc = parsePDB(pdb_filepath)
 8 | 
 9 | def count():
10 |     count = 0
11 |     for res in struc.getHierView().iterResidues():
12 |         if res.getResname() == "ALA":
13 |             count += 1
14 |     return count
15 | 
16 | start = time.time()
17 | count()
18 | end = time.time()
19 | 
20 | print(end - start)
21 | 


--------------------------------------------------------------------------------
/ProDy/distance.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | import time
 6 | from prody import *
 7 | 
 8 | pdb_filepath = "data/1AKE.pdb"
 9 | struc = parsePDB(pdb_filepath)
10 | 
11 | def distance():
12 |     min_dist = float("inf")
13 |     for atom_a in struc['A', 50]:
14 |         for atom_b in struc['A', 60]:
15 |             if calcDistance(atom_a, atom_b) < min_dist:
16 |                 min_dist = calcDistance(atom_a, atom_b)
17 |     return min_dist
18 | 
19 | start = time.time()
20 | distance()
21 | end = time.time()
22 | 
23 | print(end - start)
24 | 


--------------------------------------------------------------------------------
/ProDy/parse_pdb.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from prody import *
 6 | 
 7 | pdb_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | parsePDB(pdb_filepath)
11 | end = time.time()
12 | 
13 | print(end - start)
14 | 


--------------------------------------------------------------------------------
/ProDy/ramachandran.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | 
 3 | import time
 4 | from prody import *
 5 | 
 6 | pdb_filepath = "data/1AKE.pdb"
 7 | struc = parsePDB(pdb_filepath)
 8 | 
 9 | def ramachandran():
10 |     phi_angles = []
11 |     psi_angles = []
12 |     for res in struc.getHierView().iterResidues():
13 |         try:
14 |             phi_angle = calcPhi(res)
15 |             psi_angle = calcPsi(res)
16 |             phi_angles.append(phi_angle)
17 |             psi_angles.append(psi_angle)
18 |         except:
19 |             pass
20 |     return phi_angles, psi_angles
21 | 
22 | start = time.time()
23 | ramachandran()
24 | end = time.time()
25 | 
26 | print(end - start)
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PDB benchmarks
  2 | 
  3 | Open source software packages to parse files in various formats from the [Protein Data Bank](http://www.rcsb.org/pdb/home/home.do) (PDB) and manipulate protein structures exist in many languages, often as part of Bio* projects.
  4 | 
  5 | This repository aims to collate benchmarks for common tasks across various languages and packages. The collection of scripts may also be useful to get an idea how each package works.
  6 | 
  7 | Please feel free to contribute scripts from other packages, or submit improvements to the scripts already present - I'm looking for the fastest implementation for each software that makes use of the provided API.
  8 | 
  9 | Disclosure: I contributed the BioStructures.jl package to BioJulia and have made contributions to Biopython.
 10 | 
 11 | ## Tests
 12 | 
 13 | * Parsing 2 PDB entries, taken from the benchmarking in [1], in the PDB, mmCIF and MMTF formats:
 14 |   * [1CRN](http://www.rcsb.org/pdb/explore/explore.do?structureId=1crn) - hydrophobic protein (327 atoms).
 15 |   * [1HTQ](http://www.rcsb.org/pdb/explore/explore.do?structureId=1htq) - multicopy glutamine synthetase (10 models of 97,872 atoms).
 16 | * Counting the number of alanine residues in adenylate kinase ([1AKE](http://www.rcsb.org/pdb/explore/explore.do?structureId=1ake)).
 17 | * Calculating the distance between residues 50 and 60 of chain A in adenylate kinase ([1AKE](http://www.rcsb.org/pdb/explore/explore.do?structureId=1ake)).
 18 | * Calculating the Ramachandran phi/psi angles in adenylate kinase ([1AKE](http://www.rcsb.org/pdb/explore/explore.do?structureId=1ake)).
 19 | 
 20 | [1] Gajda MJ, hPDB - Haskell library for processing atomic biomolecular structures in protein data bank format, *BMC Research Notes* 2013, **6**:483 - [link](http://bmcresnotes.biomedcentral.com/articles/10.1186/1756-0500-6-483)
 21 | 
 22 | The PDB files can be downloaded to directory `data` by running `julia tools/download_data.jl` from this directory. If you have all the software installed, and compiled where applicable, you can run `sh tools/run_benchmarks.sh` from this directory to run the benchmarks and store the results in `benchmarks.csv`. The mean over a number of runs is taken for each benchmark to obtain the values below.
 23 | 
 24 | Benchmarks were carried out on an Intel Xeon CPU E5-1620 v3 3.50GHz x 8 processor with 32 GB 2400 MHz DDR4 RAM. The operating system was CentOS v8.1. Time is the elapsed time.
 25 | 
 26 | ## Software
 27 | 
 28 | Currently, 16 packages across 7 programming languages are included in the benchmarks:
 29 | * [BioStructures](https://github.com/BioJulia/BioStructures.jl) v0.10.1 running on Julia v1.3.1; times measured after JIT compilation.
 30 | * [MIToS](https://github.com/diegozea/MIToS.jl) v2.4.0 running on Julia v1.3.1; times measured after JIT compilation.
 31 | * [Biopython](http://biopython.org/wiki/Biopython) v1.76 running on Python v3.7.6.
 32 | * [ProDy](http://prody.csb.pitt.edu) v1.10.11 running on Python v3.7.6.
 33 | * [MDAnalysis](http://www.mdanalysis.org) v0.20.1 running on Python v3.7.6.
 34 | * [biotite](https://www.biotite-python.org) v0.20.1 running on Python v3.7.6.
 35 | * [atomium](https://github.com/samirelanduk/atomium) v1.0.2 running on Python v3.7.6.
 36 | * [Bio3D](http://thegrantlab.org/bio3d/index.php) v2.4.1 running on R v3.6.2.
 37 | * [Rpdb](https://cran.r-project.org/web/packages/Rpdb/index.html) v2.3 running on R v3.6.2.
 38 | * [BioJava](https://biojava.org) v5.3.0 running on Java v1.8.0.
 39 | * [BioPerl](http://bioperl.org/index.html) v1.007002 running on Perl v5.26.3.
 40 | * [BioRuby](http://bioruby.org) v2.0.1 running on Ruby v2.5.5.
 41 | * [GEMMI](https://gemmi.readthedocs.io/en/latest/index.html) v0.3.6 compiled with gcc v8.3.1; there is also a Python interface but benchmarking was done in C++.
 42 | * [Victor](http://protein.bio.unipd.it/victor/index.php/Main_Page) v1.0 compiled with gcc v7.3.1.
 43 | * [ESBTL](http://esbtl.sourceforge.net/index.html) v1.0-beta01 compiled with gcc v7.3.1.
 44 | * [chemfiles](https://chemfiles.org) v0.9.3 compiled with gcc v7.3.0 (C++ version) or running on Python v3.7.6 (Python version).
 45 | 
 46 | ## Results
 47 | 
 48 | Note that direct comparison between these times should be treated with caution, as each package does something slightly different. For example, things that increase parsing time include:
 49 | 
 50 | * Parsing the header information.
 51 | * Accounting for disorder at both the atom and residue (point mutation) level.
 52 | * Forming a hierarchical model of the protein that makes access to specific residues, atoms etc. easier and faster after parsing.
 53 | * Allowing models in a file to have different atoms present.
 54 | * Checking that the file format is adhered to at various levels of strictness.
 55 | 
 56 | Each package supports these to varying degrees.
 57 | 
 58 | |                       | BioStructures    | MIToS            | Biopython        | ProDy            | MDAnalysis       | biotite          | atomium          | Bio3D            | Rpdb             | BioJava          | BioPerl          | BioRuby          | GEMMI            | Victor           | ESBTL            | chemfiles-python | chemfiles-cxx    |
 59 | | :-------------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- | :--------------- |
 60 | | Parse PDB 1CRN / ms   | 0.75             | 0.63             | 7.3              | 3.1              | 4.2              | 4.4              | 7.0              | 10.0             | 9.5              | 8.1              | 43.0             | 21.0             | 0.24             | 7.6              | 2.4              | 4.5              | 0.67             |
 61 | | Parse PDB 1HTQ / s    | 2.6              | 2.8              | 16.0             | 2.1              | 1.5              | 4.8              | 20.0             | 2.9              | 14.0             | 1.3              | 49.0             | 13.0             | 0.36             | 11.0             | -                | -                | -                |
 62 | | Parse mmCIF 1CRN / ms | 2.0              | -                | 16.0             | -                | -                | 4.8              | 13.0             | -                | -                | 40.0             | -                | -                | 0.97             | -                | -                | 3.8              | 0.99             |
 63 | | Parse mmCIF 1HTQ / s  | 8.0              | -                | 45.0             | -                | -                | 9.0              | 36.0             | -                | -                | 17.0             | -                | -                | 1.5              | -                | -                | 2.0              | 2.0              |
 64 | | Parse MMTF 1CRN / ms  | 1.1              | -                | 4.5              | -                | -                | 1.2              | 4.6              | -                | -                | 4.1              | -                | -                | -                | -                | -                | 3.2              | 0.44             |
 65 | | Parse MMTF 1HTQ / s   | 3.6              | -                | 16.0             | -                | -                | 0.16             | 43.0             | -                | -                | 0.74             | -                | -                | -                | -                | -                | -                | -                |
 66 | | Count / ms            | 0.17             | 0.017            | 0.21             | 8.8              | 0.068            | -                | -                | 0.16             | 0.2              | -                | 0.42             | 0.073            | 0.004            | -                | -                | 0.75             | 0.092            |
 67 | | Distance / ms         | 0.012            | 0.0044           | 0.25             | 50.0             | 0.62             | -                | -                | 19.0             | 1.3              | -                | 0.53             | 0.32             | 0.001            | -                | -                | 0.55             | 0.19             |
 68 | | Ramachandran / ms     | 1.4              | -                | 120.0            | 210.0            | 1200.0           | -                | -                | -                | -                | -                | -                | -                | -                | -                | -                | 7.4              | 2.1              |
 69 | | Language              | Julia            | Julia            | Python           | Python           | Python           | Python           | Python           | R                | R                | Java             | Perl             | Ruby             | C++/Python       | C++              | C++              | Python           | C++              |
 70 | | License               | MIT              | MIT              | Biopython        | MIT              | GPLv2            | BSD 3-Clause     | MIT              | GPLv2            | GPLv2/GPLv3      | LGPLv2.1         | GPL/Artistic     | Ruby             | MPLv2/LGPLv3     | GPLv3            | GPLv3            | BSD 3-Clause     | BSD 3-Clause     |
 71 | | Hierarchical parsing  | ✓                | ✗                | ✓                | ✓                | ✓                | ✗                | ✓                | ✗                | ✗                | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✗                | ✗                |
 72 | | Supports disorder     | ✓                | ✗                | ✓                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✓                | ✗                | ✓                | ✗                | ✗                |
 73 | | Writes PDBs           | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✗                | ✓                | ✓                | ✓                | ✓                | ✓                |
 74 | | Parses PDB header     | ✗                | ✗                | ✓                | ✓                | ✗                | ✗                | ✓                | ✓                | ✓                | ✓                | ✗                | ✓                | ✓                | ✓                | ✗                | ✗                | ✗                |
 75 | | Superimposition       | ✓                | ✓                | ✓                | ✓                | ✓                | ✓                | ✗                | ✓                | ✗                | ✓                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                |
 76 | | PCA                   | ✗                | ✗                | ✗                | ✓                | ✓                | ✗                | ✗                | ✓                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                | ✗                |
 77 | 
 78 | Benchmarks as a plot, sorted by increasing time to parse PDB 1CRN:
 79 | 
 80 | ![benchmarks](plot/plot.png "benchmarks")
 81 | 
 82 | ## Parsing the whole PDB
 83 | 
 84 | It is instructive to run parsers over the whole PDB to see where errors arise. This approach has led to me submitting corrections for small mistakes (e.g. duplicate atoms, residue number errors) in a few PDB structures. As of July 2018, the PDB entries that error with the Biopython (permissive mode) and BioJulia parsers are:
 85 | * 4UDF - mmCIF file errors in Biopython and BioJulia due to duplicate C and O atoms in Lys91 of chains B, F etc.
 86 | * 1EJG - mmCIF file errors in Biopython due to blank and non-blank alt loc IDs at residue Pro22/Ser22.
 87 | * 5O61 - mmCIF file errors in Biopython due to an incorrect residue number at line 165,223.
 88 | 
 89 | Running Biopython in non-permissive mode picks up more potential problems such as broken chains and mixed blank/non-blank alt loc IDs. For further discussion on errors in PDB files see the Biopython [documentation](http://biopython.org/DIST/docs/tutorial/Tutorial.pdf). The scripts to reproduce the whole PDB checking can be found in `checkwholepdb`. There is also a script to check recent PDB changes that can be run as a CRON job.
 90 | 
 91 | ## Opinions
 92 | 
 93 | * For most purposes, particularly work on small numbers of files, the speed of the programs will not hold you back. In this case use the language/package you are most familiar with.
 94 | * For fast parsing, use a binary format such as [MMTF](http://mmtf.rcsb.org) or [binaryCIF](https://github.com/dsehnal/BinaryCIF).
 95 | * Whilst mmCIF became the standard PDB archive format in 2014, and is a very flexible archive format, that does not mean that it is the best choice for all of bioinformatics. mmCIF files take up a lot of space on disk, are the slowest to read and do not yet work with many bioinformatics tools.
 96 | * If you are analysing ensembles of proteins then use packages with that functionality, such as ProDy or Bio3D, rather than writing the code yourself.
 97 | 
 98 | ## Citation
 99 | 
100 | If you use these benchmarks, please cite the [BioStructures.jl](https://github.com/BioJulia/BioStructures.jl) paper where they appear:
101 | 
102 | Greener JG, Selvaraj J and Ward BJ. BioStructures.jl: read, write and manipulate macromolecular structures in Julia, *Bioinformatics* 36(14):4206-4207 (2020) - [link](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btaa502/5837108?guestAccessKey=aec90643-1d43-4521-9883-4a4a669187da) - [PDF](https://github.com/BioJulia/BioStructures.jl/blob/master/paper.pdf)
103 | 
104 | ## Contributing
105 | 
106 | If you want to contribute benchmarks for a package, please make a pull request with the script(s) in a directory like the other packages. I will run the benchmarks again and change the README, thanks.
107 | 
108 | ## Resources
109 | 
110 | * Information on file formats for [PDB](http://www.wwpdb.org/documentation/file-format), [mmCIF](http://mmcif.wwpdb.org) and [MMTF](https://github.com/rcsb/mmtf).
111 | * Benchmarks for mmCIF parsing can be found [here](https://github.com/project-gemmi/mmcif-benchmark).
112 | * A list of PDB parsing packages, particularly in C/C++, can be found [here](http://bioinf.org.uk/software/bioplib/libraries).
113 | * The Biopython [documentation](http://biopython.org/DIST/docs/tutorial/Tutorial.pdf) has a useful discussion on disorder at the atom and residue level.
114 | * Sets of utility scripts exist including [pdbtools](https://github.com/harmslab/pdbtools), [pdb-tools](https://github.com/JoaoRodrigues/pdb-tools) and [PDBFixer](https://github.com/pandegroup/pdbfixer).
115 | 


--------------------------------------------------------------------------------
/Rpdb/count.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | library(Rpdb)
 4 | library(microbenchmark)
 5 | 
 6 | pdb_filepath <- "data/1AKE.pdb"
 7 | struc <- read.pdb(pdb_filepath)
 8 | 
 9 | # Count the distinct (residue number, chain ID) combinations found among the
10 | # atoms whose residue name is ALA
11 | count <- function() {
12 |     is_ala <- struc$atoms$resname == "ALA"
13 |     ala_keys <- paste0(struc$atoms$resid[is_ala], struc$atoms$chainid[is_ala])
14 |     length(unique(ala_keys))
15 | }
16 | 
17 | # A single timed evaluation of count()
18 | bench <- microbenchmark(count(), times=1)
19 | 
20 | # Scale the recorded time by 1e-9 before printing
21 | cat(bench$time / 1e9, "\n", sep="")
22 | 


--------------------------------------------------------------------------------
/Rpdb/distance.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | library(Rpdb)
 6 | library(microbenchmark)
 7 | 
 8 | pdb_filepath <- "data/1AKE.pdb"
 9 | struc <- read.pdb(pdb_filepath)
10 | 
11 | # Smallest norm among the distances() results between the atoms of residue 50
12 | # and the atoms of residue 60, both restricted to chain A
13 | distance <- function() {
14 |     in_chain_a <- struc$atoms$chainid == "A"
15 |     sel_50 <- struc$atoms$resid == 50 & in_chain_a
16 |     sel_60 <- struc$atoms$resid == 60 & in_chain_a
17 |     pair_vectors <- distances(struc, sel_50, sel_60)
18 |     min(norm(pair_vectors, type="xyz"))
19 | }
20 | 
21 | # A single timed evaluation of distance()
22 | bench <- microbenchmark(distance(), times=1)
23 | 
24 | # Scale the recorded time by 1e-9 before printing
25 | cat(bench$time / 1e9, "\n", sep="")
26 | 


--------------------------------------------------------------------------------
/Rpdb/parse_pdb.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | library(Rpdb)
 4 | library(microbenchmark)
 5 | 
 6 | # Path of the PDB file to parse: the first trailing command-line argument
 7 | cli_args <- commandArgs(trailingOnly=TRUE)
 8 | pdb_filepath <- cli_args[1]
 9 | 
10 | # A single timed evaluation of read.pdb() with MODEL=NULL
11 | bench <- microbenchmark(read.pdb(pdb_filepath, MODEL=NULL), times=1)
12 | 
13 | # Scale the recorded time by 1e-9 before printing
14 | cat(bench$time / 1e9, "\n", sep="")
15 | 


--------------------------------------------------------------------------------
/Rpdb/ramachandran.R:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | 
 3 | library(Rpdb)
 4 | library(microbenchmark)
 5 | 
 6 | pdb_filepath <- "data/1AKE.pdb"
 7 | struc <- read.pdb(pdb_filepath)
 8 | 
 9 | # Compute phi and psi for the interior backbone positions 2 .. (n - 1), where
10 | # n is the number of CA atoms, and return them as list(phi=..., psi=...).
11 | # Stops with an error when fewer than three CA atoms are present.
12 | # NOTE(review): N, CA and C atoms are paired purely by their order in the file,
13 | #   so chain breaks or residues lacking one of these atoms would misalign the
14 | #   angles - confirm this is acceptable for 1AKE.
15 | ramachandran <- function() {
16 |     is_n <- which(struc$atoms$elename=="N")
17 |     is_ca <- which(struc$atoms$elename=="CA")
18 |     is_c <- which(struc$atoms$elename=="C")
19 |     res_count <- length(is_ca)
20 |     stopifnot(res_count >= 3)
21 |     # `:` binds tighter than `-`, so 1:res_count-2 means (1:res_count)-2 and
22 |     # mixes negative with positive subscripts; build the index range explicitly
23 |     inner <- 2:(res_count - 1)
24 |     phi_angles <- dihedral(struc, is_c[inner - 1], is_n[inner], is_ca[inner], is_c[inner])
25 |     psi_angles <- dihedral(struc, is_n[inner], is_ca[inner], is_c[inner], is_n[inner + 1])
26 |     # return() accepts a single value, so both results are wrapped in a list
27 |     return(list(phi=phi_angles, psi=psi_angles))
28 | }
29 | 
30 | bench <- microbenchmark(ramachandran(), times=1)
31 | 
32 | cat(bench$time / 10^9, "\n", sep="")
33 | 


--------------------------------------------------------------------------------
/Victor/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for Victor benchmarks
 2 | # VICTOR_ROOT needs to be set as the Victor root directory
 3 | 
 4 | BINPATH = $(VICTOR_ROOT)/bin
 5 | 
 6 | LIBS = -lBiopool -ltools
 7 | LIB_PATH = -L. -L$(VICTOR_ROOT)/lib/
 8 | INC_PATH += -I. -I$(VICTOR_ROOT)/tools/ -I$(VICTOR_ROOT)/Biopool/Sources/
 9 | 
10 | CC=g++
11 | CFLAGS=-I. -ansi -pedantic -DNEXCEPTIONS -DLINUX -c -O3 -ffast-math -DNDEBUG -ftemplate-depth-36 -Wno-reorder  -Wno-uninitialized -Wno-write-strings -Wno-narrowing
12 | 
13 | # install and clean name actions rather than files
14 | .PHONY: install clean
15 | 
16 | # Default target: compile parse_pdb.cc and link it against the Victor libraries
17 | install:
18 | 	$(CC)   $(CFLAGS)   $(INC_PATH) -c parse_pdb.cc -o parse_pdb.o
19 | 	$(CC)   parse_pdb.o -o parse_pdb $(LIB_PATH) $(LIBS)
20 | 
21 | # -f lets `make clean` succeed when nothing has been built yet
22 | clean:
23 | 	rm -f parse_pdb.o parse_pdb
24 | 


--------------------------------------------------------------------------------
/Victor/README.md:
--------------------------------------------------------------------------------
1 | VICTOR_ROOT needs to be set as the Victor root directory.
2 | Compile with `make clean` then `make`.
3 | 


--------------------------------------------------------------------------------
/Victor/parse_pdb.cc:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | #include <PdbLoader.h>
 4 | #include <Protein.h>
 5 | #include <cstdio>
 6 | #include <fstream>
 7 | #include <iostream>
 8 | #include <time.h>
 9 | 
10 | using namespace Victor::Biopool;
11 | using namespace Victor;
12 | 
13 | // Usage: parse_pdb <pdb_filepath>
14 | // Prints the elapsed time, in seconds, taken to open the file and load it into
15 | // a Protein. Returns 1 when no path is given or the file cannot be opened.
16 | int main( int argc, char* argv[] ) {
17 |     // Without this check a missing argument would construct a string from a
18 |     // null pointer
19 |     if (argc < 2) {
20 |         fprintf(stderr, "Usage: parse_pdb <pdb_filepath>\n");
21 |         return 1;
22 |     }
23 |     string pdb_filepath = argv[1];
24 |     struct timespec tstart, tend;
25 |     clock_gettime(CLOCK_REALTIME, &tstart);
26 |     ifstream inFile( pdb_filepath.c_str() );
27 |     // Do not report a timing for a file that could not be opened
28 |     if (!inFile) {
29 |         fprintf(stderr, "Could not open %s\n", pdb_filepath.c_str());
30 |         return 1;
31 |     }
32 |     // See options at
33 |     // http://protein.bio.unipd.it/victor_doxygen/classVictor_1_1Biopool_1_1PdbLoader.html
34 |     PdbLoader pl(inFile, true, true, false, true, false, false, false, true);
35 |     Protein prot;
36 |     prot.load( pl );
37 |     clock_gettime(CLOCK_REALTIME, &tend);
38 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec)/1E9);
39 |     return 0;
40 | }
41 | 


--------------------------------------------------------------------------------
/atomium/parse_mmcif.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import atomium
 6 | 
 7 | mmcif_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | atomium.open(mmcif_filepath)
11 | end = time.time()
12 | 
13 | print(end - start)
14 | 


--------------------------------------------------------------------------------
/atomium/parse_mmtf.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a MMTF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import atomium
 6 | 
 7 | mmtf_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | atomium.open(mmtf_filepath)
11 | end = time.time()
12 | 
13 | print(end - start)
14 | 


--------------------------------------------------------------------------------
/atomium/parse_pdb.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import atomium
 6 | 
 7 | pdb_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | atomium.open(pdb_filepath)
11 | end = time.time()
12 | 
13 | print(end - start)
14 | 


--------------------------------------------------------------------------------
/benchmarks.csv:
--------------------------------------------------------------------------------
 1 | Package,Benchmark,Runtime
 2 | BioStructures,Parse PDB 1CRN,0.0007521262
 3 | BioStructures,Parse PDB 1HTQ,2.552547826
 4 | BioStructures,Parse mmCIF 1CRN,0.001960067
 5 | BioStructures,Parse mmCIF 1HTQ,7.958684753666667
 6 | BioStructures,Parse MMTF 1CRN,0.001124614
 7 | BioStructures,Parse MMTF 1HTQ,3.5960987396666666
 8 | BioStructures,Count,0.00017121870000000002
 9 | BioStructures,Distance,1.20023e-05
10 | BioStructures,Ramachandran,0.0014263943000000002
11 | MIToS,Parse PDB 1CRN,0.0006277443
12 | MIToS,Parse PDB 1HTQ,2.8049395730000004
13 | MIToS,Count,1.71974e-05
14 | MIToS,Distance,4.4442e-06
15 | Biopython,Parse PDB 1CRN,0.007256412506103515
16 | Biopython,Parse PDB 1HTQ,16.372705459594727
17 | Biopython,Parse mmCIF 1CRN,0.016134476661682128
18 | Biopython,Parse mmCIF 1HTQ,45.07611576716105
19 | Biopython,Parse MMTF 1CRN,0.0045304536819458004
20 | Biopython,Parse MMTF 1HTQ,16.356812477111816
21 | Biopython,Count,0.00020694732666015625
22 | Biopython,Distance,0.0002535343170166016
23 | Biopython,Ramachandran,0.12028734683990479
24 | ProDy,Parse PDB 1CRN,0.00309908390045166
25 | ProDy,Parse PDB 1HTQ,2.1210433642069497
26 | ProDy,Count,0.008846926689147949
27 | ProDy,Distance,0.049677252769470215
28 | ProDy,Ramachandran,0.21265184879302979
29 | MDAnalysis,Parse PDB 1CRN,0.00418851375579834
30 | MDAnalysis,Parse PDB 1HTQ,1.4514319896697998
31 | MDAnalysis,Count,6.7901611328125e-05
32 | MDAnalysis,Distance,0.0006227493286132812
33 | MDAnalysis,Ramachandran,1.2391331672668457
34 | biotite,Parse PDB 1CRN,0.004447317123413086
35 | biotite,Parse PDB 1HTQ,4.8055440584818525
36 | biotite,Parse mmCIF 1CRN,0.00476081371307373
37 | biotite,Parse mmCIF 1HTQ,8.978858550389608
38 | biotite,Parse MMTF 1CRN,0.0012470483779907227
39 | biotite,Parse MMTF 1HTQ,0.1640939712524414
40 | atomium,Parse PDB 1CRN,0.006968569755554199
41 | atomium,Parse PDB 1HTQ,20.193578879038494
42 | atomium,Parse mmCIF 1CRN,0.01340920925140381
43 | atomium,Parse mmCIF 1HTQ,35.97704792022705
44 | atomium,Parse MMTF 1CRN,0.004566097259521484
45 | atomium,Parse MMTF 1HTQ,43.482786417007446
46 | Bio3D,Parse PDB 1CRN,0.0101017309
47 | Bio3D,Parse PDB 1HTQ,2.893682
48 | Bio3D,Count,0.00016179009999999996
49 | Bio3D,Distance,0.018523388999999998
50 | Rpdb,Parse PDB 1CRN,0.0095353409
51 | Rpdb,Parse PDB 1HTQ,14.096083333333334
52 | Rpdb,Count,0.00019563180000000004
53 | Rpdb,Distance,0.0012988237
54 | BioJava,Parse PDB 1CRN,0.0080779367
55 | BioJava,Parse PDB 1HTQ,1.342112537
56 | BioJava,Parse mmCIF 1CRN,0.0399477593
57 | BioJava,Parse mmCIF 1HTQ,16.915583049333335
58 | BioJava,Parse MMTF 1CRN,0.004121529099999999
59 | BioJava,Parse MMTF 1HTQ,0.7408829063333334
60 | BioPerl,Parse PDB 1CRN,0.043307685852050776
61 | BioPerl,Parse PDB 1HTQ,48.674395720164
62 | BioPerl,Count,0.00042252540588378896
63 | BioPerl,Distance,0.0005311250686645508
64 | BioRuby,Parse PDB 1CRN,0.020962584391236307
65 | BioRuby,Parse PDB 1HTQ,12.833540361995498
66 | BioRuby,Count,7.327683269977569e-05
67 | BioRuby,Distance,0.0003178965300321579
68 | GEMMI,Parse PDB 1CRN,0.0002428
69 | GEMMI,Parse PDB 1HTQ,0.3553026666666667
70 | GEMMI,Parse mmCIF 1CRN,0.0009732999999999999
71 | GEMMI,Parse mmCIF 1HTQ,1.4752656666666668
72 | GEMMI,Count,4.000000000000001e-06
73 | GEMMI,Distance,1.0000000000000002e-06
74 | Victor,Parse PDB 1CRN,0.007647900000000001
75 | Victor,Parse PDB 1HTQ,10.691736999999998
76 | ESBTL,Parse PDB 1CRN,0.0024213000000000004
77 | chemfiles-python,Parse PDB 1CRN,0.004539108276367188
78 | chemfiles-python,Parse mmCIF 1CRN,0.003771042823791504
79 | chemfiles-python,Parse mmCIF 1HTQ,2.0077246030171714
80 | chemfiles-python,Parse MMTF 1CRN,0.0031836271286010743
81 | chemfiles-python,Count,0.0007463932037353515
82 | chemfiles-python,Distance,0.0005480289459228515
83 | chemfiles-python,Ramachandran,0.007416057586669922
84 | chemfiles-cxx,Parse PDB 1CRN,0.0006692
85 | chemfiles-cxx,Parse mmCIF 1CRN,0.000986
86 | chemfiles-cxx,Parse mmCIF 1HTQ,1.9856413333333334
87 | chemfiles-cxx,Parse MMTF 1CRN,0.000439
88 | chemfiles-cxx,Count,9.18e-05
89 | chemfiles-cxx,Distance,0.0001881
90 | chemfiles-cxx,Ramachandran,0.0020766
91 | 


--------------------------------------------------------------------------------
/biotite/parse_mmcif.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import biotite.structure.io.pdbx as pdbx
 6 | 
 7 | mmcif_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | file = pdbx.PDBxFile()
11 | file.read(mmcif_filepath)
12 | pdbx.get_structure(file)
13 | end = time.time()
14 | 
15 | print(end - start)
16 | 


--------------------------------------------------------------------------------
/biotite/parse_mmtf.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a MMTF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import biotite.structure.io.mmtf as mmtf
 6 | 
 7 | mmtf_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | file = mmtf.MMTFFile()
11 | file.read(mmtf_filepath)
12 | mmtf.get_structure(file)
13 | end = time.time()
14 | 
15 | print(end - start)
16 | 


--------------------------------------------------------------------------------
/biotite/parse_pdb.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | import biotite.structure.io.pdb as pdb
 6 | 
 7 | pdb_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | file = pdb.PDBFile()
11 | file.read(pdb_filepath)
12 | file.get_structure()
13 | end = time.time()
14 | 
15 | print(end - start)
16 | 


--------------------------------------------------------------------------------
/checkwholepdb/checknewpdb.jl:
--------------------------------------------------------------------------------
 1 | # Test which new/modified PDB entries error on PDB/mmCIF parsers
 2 | # Writes output to a file labelled with the current date (YYYYMMDD)
 3 | 
 4 | using BioStructures
 5 | 
 6 | start = now()
 7 | basedir = "."
 8 | ad, mo, ob = pdbrecentchanges()
 9 | 
10 | outstrs = ["Checking new/modified PDB entries at $(now())",
11 |         "Checking $(length(ad)) new and $(length(mo)) modified entries"]
12 | 
13 | for p in sort(collect(Set([ad..., mo...])))
14 |     try
15 |         downloadpdb(p, dir=basedir, format=PDB)
16 |     catch
17 |         # Not having a PDB file is acceptable, though a failure to download an
18 |         #   available file may hide an error in parsing
19 |         rm("$basedir/$p.pdb", force=true)
20 |     end
21 |     if isfile("$basedir/$p.pdb")
22 |         try
23 |             s = read("$basedir/$p.pdb", PDB)
24 |         catch
25 |             push!(outstrs, "$p - PDB parsing error")
26 |         end
27 |         rm("$basedir/$p.pdb")
28 |     end
29 |     try
30 |         downloadpdb(p, dir=basedir, format=MMCIF)
31 |     catch
32 |         rm("$basedir/$p.cif", force=true)
33 |         push!(outstrs, "$p - no mmCIF download")
34 |     end
35 |     if isfile("$basedir/$p.cif")
36 |         try
37 |             s = read("$basedir/$p.cif", MMCIF)
38 |         catch
39 |             push!(outstrs, "$p - mmCIF parsing error")
40 |         end
41 |         rm("$basedir/$p.cif")
42 |     end
43 | end
44 | 
45 | if length(outstrs) == 2
46 |     push!(outstrs, "All entries read fine")
47 | end
48 | 
49 | push!(outstrs, "Time taken - $(ceil(now() - start, Dates.Minute))")
50 | 
51 | datestr = replace(string(Date(now())), "-", "")
52 | # This overwrites any existing file
53 | open("$basedir/recentpdb_jl_$datestr.txt", "w") do f
54 |     for l in outstrs
55 |         println(f, l)
56 |     end
57 | end
58 | 


--------------------------------------------------------------------------------
/checkwholepdb/checkwholepdb.jl:
--------------------------------------------------------------------------------
 1 | # Test which PDB entries error on PDB/mmCIF parsers
 2 | # Writes output to a file labelled with the current date (YYYYMMDD)
 3 | 
 4 | using BioStructures
 5 | 
 6 | start = now()
 7 | basedir = "."
 8 | pdblist = pdbentrylist()
 9 | 
10 | outstrs = ["Checking all PDB entries at $(now())",
11 |         "Checking $(length(pdblist)) entries"]
12 | 
13 | for p in sort(pdblist)
14 |     try
15 |         downloadpdb(p, dir=basedir, format=PDB)
16 |     catch
17 |         # Not having a PDB file is acceptable, though a failure to download an
18 |         #   available file may hide an error in parsing
19 |         rm("$basedir/$p.pdb", force=true)
20 |     end
21 |     if isfile("$basedir/$p.pdb")
22 |         try
23 |             s = read("$basedir/$p.pdb", PDB)
24 |         catch
25 |             push!(outstrs, "$p - PDB parsing error")
26 |         end
27 |         rm("$basedir/$p.pdb")
28 |     end
29 |     try
30 |         downloadpdb(p, dir=basedir, format=MMCIF)
31 |     catch
32 |         rm("$basedir/$p.cif", force=true)
33 |         push!(outstrs, "$p - no mmCIF download")
34 |     end
35 |     if isfile("$basedir/$p.cif")
36 |         try
37 |             s = read("$basedir/$p.cif", MMCIF)
38 |         catch
39 |             push!(outstrs, "$p - mmCIF parsing error")
40 |         end
41 |         rm("$basedir/$p.cif")
42 |     end
43 | end
44 | 
45 | if length(outstrs) == 2
46 |     push!(outstrs, "All entries read fine")
47 | end
48 | 
49 | push!(outstrs, "Time taken - $(ceil(now() - start, Dates.Minute))")
50 | 
51 | datestr = replace(string(Date(now())), "-", "")
52 | # This overwrites any existing file
53 | open("$basedir/wholepdb_jl_$datestr.txt", "w") do f
54 |     for l in outstrs
55 |         println(f, l)
56 |     end
57 | end
58 | 


--------------------------------------------------------------------------------
/checkwholepdb/checkwholepdb.py:
--------------------------------------------------------------------------------
 1 | # Test which PDB entries error on PDB/mmCIF parsers
 2 | # Writes output to a file labelled with the week
 3 | 
 4 | import os
 5 | from datetime import datetime
 6 | from math import ceil
 7 | from Bio.PDB import PDBList
 8 | from Bio.PDB.PDBParser import PDBParser
 9 | from Bio.PDB.MMCIFParser import MMCIFParser
10 | 
11 | start = datetime.now()
12 | basedir = "."
13 | pdbl = PDBList()
14 | pdblist = pdbl.get_all_entries()
15 | 
16 | outstrs = ["Checking all PDB entries at {}".format(start.isoformat()),
17 |         "Checking {} entries".format(len(pdblist))]
18 | 
19 | pdb_parser = PDBParser()
20 | mmcif_parser = MMCIFParser()
21 | 
22 | for pu in sorted(pdblist):
23 |     p = pu.lower()
24 |     try:
25 |         pdbl.retrieve_pdb_file(p, pdir=basedir, file_format="pdb")
26 |     except:
27 |         # Not having a PDB file is acceptable, though a failure to download an
28 |         #   available file may hide an error in parsing
29 |         try:
30 |             os.remove("{}/pdb{}.ent".format(basedir, p))
31 |         except:
32 |             pass
33 |     if os.path.isfile("{}/pdb{}.ent".format(basedir, p)):
34 |         try:
35 |             s = pdb_parser.get_structure("", "{}/pdb{}.ent".format(basedir, p))
36 |         except:
37 |             outstrs.append("{} - PDB parsing error".format(pu))
38 |         os.remove("{}/pdb{}.ent".format(basedir, p))
39 |     try:
40 |         pdbl.retrieve_pdb_file(p, pdir=basedir, file_format="mmCif")
41 |     except:
42 |         try:
43 |             os.remove("{}/{}.cif".format(basedir, p))
44 |         except:
45 |             pass
46 |         outstrs.append("{} - no mmCIF download".format(pu))
47 |     if os.path.isfile("{}/{}.cif".format(basedir, p)):
48 |         try:
49 |             s = mmcif_parser.get_structure("", "{}/{}.cif".format(basedir, p))
50 |         except:
51 |             outstrs.append("{} - mmCIF parsing error".format(pu))
52 |         os.remove("{}/{}.cif".format(basedir, p))
53 | 
54 | if len(outstrs) == 2:
55 |     outstrs.append("All entries read fine")
56 | 
57 | end = datetime.now()
58 | outstrs.append("Time taken - {} minute(s)".format(int(ceil((end - start).seconds / 60))))
59 | 
60 | datestr = str(end.date()).replace("-", "")
61 | # This overwrites any existing file
62 | with open("{}/wholepdb_py_{}.txt".format(basedir, datestr), "w") as f:
63 |     for l in outstrs:
64 |         f.write(l + "\n")
65 | 


--------------------------------------------------------------------------------
/chemfiles/Makefile:
--------------------------------------------------------------------------------
 1 | # Build the chemfiles C++ benchmarks against the headers and library found
 2 | # under ${CONDA_PREFIX}
 3 | CPPFLAGS=-std=c++11 -O2
 4 | INCLUDE=-I${CONDA_PREFIX}/include
 5 | # -lrt is listed here rather than in CPPFLAGS because the linker resolves
 6 | # libraries in command-line order, so it must come after the source file
 7 | # that calls clock_gettime
 8 | LIBS=-L${CONDA_PREFIX}/lib -lchemfiles -lrt -Wl,-rpath,${CONDA_PREFIX}/lib
 9 | TARGETS=count distance parse_mmcif parse_mmtf parse_pdb ramachandran
10 | 
11 | # Neither of these targets names a real file
12 | .PHONY: all clean
13 | 
14 | all: ${TARGETS}
15 | 
16 | # Compile and link each benchmark from its single source file
17 | %: %.cpp
18 | 	${CXX} ${CPPFLAGS} ${INCLUDE} $< -o $@ ${LIBS}
19 | 
20 | clean:
21 | 	rm -rf ${TARGETS}
22 | 


--------------------------------------------------------------------------------
/chemfiles/README.md:
--------------------------------------------------------------------------------
1 | Compile C++ version with:
2 | ```
3 | conda install chemfiles-lib
4 | make
5 | ```
6 | 


--------------------------------------------------------------------------------
/chemfiles/count.cpp:
--------------------------------------------------------------------------------
 1 | // Benchmark the counting of alanine residues in a PDB file
 2 | #include <cstdio>
 3 | #include <string>
 4 | 
 5 | #include <time.h>
 6 | 
 7 | #include <chemfiles.hpp>
 8 | 
 9 | using namespace chemfiles;
10 | 
11 | static size_t count(Frame& frame) {
12 |     // Number of matches of the "resname ALA" selection in the frame
13 |     return Selection("resname ALA").list(frame).size();
14 | }
15 | 
16 | int main() {
17 |     auto pdb_filepath = "data/1AKE.pdb";
18 |     auto frame = Trajectory(pdb_filepath).read();
19 | 
20 |     timespec tstart, tend;
21 |     clock_gettime(CLOCK_REALTIME, &tstart);
22 |     count(frame);
23 |     clock_gettime(CLOCK_REALTIME, &tend);
24 | 
25 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
26 |     return 0;
27 | }
28 | 


--------------------------------------------------------------------------------
/chemfiles/count.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the counting of alanine residues in a PDB file
 2 | 
 3 | import time
 4 | from chemfiles import Trajectory, Selection
 5 | 
 6 | 
 7 | def count(frame):
 8 |     selection = Selection("resname ALA")
 9 |     return len(selection.evaluate(frame))
10 | 
11 | 
12 | pdb_filepath = "data/1AKE.pdb"
13 | frame = Trajectory(pdb_filepath).read()
14 | 
15 | start = time.time()
16 | count(frame)
17 | end = time.time()
18 | 
19 | print(end - start)
20 | 


--------------------------------------------------------------------------------
/chemfiles/distance.cpp:
--------------------------------------------------------------------------------
 1 | // Benchmark the calculation of a distance in a PDB file
 2 | // The distance is the closest distance between any atoms of residues 50 and 60
 3 | //   of chain A in 1AKE
 4 | #include <cstdio>
 5 | #include <cmath>
 6 | #include <string>
 7 | 
 8 | #include <time.h>
 9 | 
10 | #include <chemfiles.hpp>
11 | 
12 | using namespace chemfiles;
13 | 
14 | static double distance(Frame& frame) {
15 |     // FIXME: this should use Selection("resid 50 and [chainname] A") which will
16 |     // be available in chemfiles 0.10 (the next release)
17 |     const auto first = Selection("resid 50 and index < 1000").list(frame);
18 |     const auto second = Selection("resid 60 and index < 1000").list(frame);
19 | 
20 |     // Brute-force scan over every pair of matches, keeping the smallest
21 |     // separation seen so far
22 |     double closest = INFINITY;
23 |     for (auto a: first) {
24 |         for (auto b: second) {
25 |             const double d = frame.distance(a, b);
26 |             closest = (d < closest) ? d : closest;
27 |         }
28 |     }
29 | 
30 |     return closest;
31 | }
32 | 
33 | int main() {
34 |     auto pdb_filepath = "data/1AKE.pdb";
35 |     auto frame = Trajectory(pdb_filepath).read();
36 | 
37 |     timespec tstart, tend;
38 |     clock_gettime(CLOCK_REALTIME, &tstart);
39 |     distance(frame);
40 |     clock_gettime(CLOCK_REALTIME, &tend);
41 | 
42 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
43 |     return 0;
44 | }
45 | 


--------------------------------------------------------------------------------
/chemfiles/distance.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of a distance in a PDB file
 2 | # The distance is the closest distance between any atoms of residues 50 and 60
 3 | #   of chain A in 1AKE
 4 | 
 5 | import time
 6 | from chemfiles import Trajectory, Selection
 7 | 
 8 | 
 9 | def distance(frame):
10 |     # FIXME: this should use Selection("resid 50 and [chainname] A") which will
11 |     # be available in chemfiles 0.10 (the next release)
12 |     r50 = Selection("resid 50 and index < 1000").evaluate(frame)
13 |     r60 = Selection("resid 60 and index < 1000").evaluate(frame)
14 | 
15 |     min = float('inf')
16 |     for i in r50:
17 |         for j in r60:
18 |             r = frame.distance(i, j)
19 |             if r < min:
20 |                 min = r
21 | 
22 |     return min
23 | 
24 | 
25 | pdb_filepath = "data/1AKE.pdb"
26 | frame = Trajectory(pdb_filepath).read()
27 | 
28 | start = time.time()
29 | distance(frame)
30 | end = time.time()
31 | 
32 | print(end - start)
33 | 


--------------------------------------------------------------------------------
/chemfiles/parse_mmcif.cpp:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a mmCIF file given as an argument
 2 | #include <cstdio>
 3 | #include <string>
 4 | #include <time.h>
 5 | 
 6 | #include <chemfiles.hpp>
 7 | 
 8 | // Read every step of the mmCIF file named by the first argument and print
 9 | // the elapsed wall-clock time in seconds
10 | int main(int argc, char* argv[]) {
11 |     // Without an argument argv[1] is a null pointer, and building a
12 |     // std::string from it is undefined behaviour
13 |     if (argc < 2) {
14 |         fprintf(stderr, "Usage: parse_mmcif <mmcif_filepath>\n");
15 |         return 1;
16 |     }
17 |     std::string mmcif_filepath = argv[1];
18 | 
19 |     struct timespec tstart, tend;
20 |     clock_gettime(CLOCK_REALTIME, &tstart);
21 |     auto trajectory = chemfiles::Trajectory(mmcif_filepath, 'r', "mmCIF");
22 |     for (size_t step=0; step<trajectory.nsteps(); step++) {
23 |         trajectory.read();
24 |     }
25 |     clock_gettime(CLOCK_REALTIME, &tend);
26 | 
27 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
28 |     return 0;
29 | }
30 | 


--------------------------------------------------------------------------------
/chemfiles/parse_mmcif.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a mmCIF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from chemfiles import Trajectory
 6 | 
 7 | mmcif_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | trajectory = Trajectory(mmcif_filepath, 'r', 'mmCIF')
11 | for frame in trajectory:
12 |     pass
13 | end = time.time()
14 | 
15 | print(end - start)
16 | 


--------------------------------------------------------------------------------
/chemfiles/parse_mmtf.cpp:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a MMTF file given as an argument
 2 | #include <cstdio>
 3 | #include <string>
 4 | #include <time.h>
 5 | 
 6 | #include <chemfiles.hpp>
 7 | 
 8 | // Read every step of the MMTF file named by the first argument and print
 9 | // the elapsed wall-clock time in seconds
10 | int main(int argc, char* argv[]) {
11 |     // Without an argument argv[1] is a null pointer, and building a
12 |     // std::string from it is undefined behaviour
13 |     if (argc < 2) {
14 |         fprintf(stderr, "Usage: parse_mmtf <mmtf_filepath>\n");
15 |         return 1;
16 |     }
17 |     std::string mmtf_filepath = argv[1];
18 | 
19 |     struct timespec tstart, tend;
20 |     clock_gettime(CLOCK_REALTIME, &tstart);
21 |     auto trajectory = chemfiles::Trajectory(mmtf_filepath);
22 |     for (size_t step=0; step<trajectory.nsteps(); step++) {
23 |         trajectory.read();
24 |     }
25 |     clock_gettime(CLOCK_REALTIME, &tend);
26 | 
27 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
28 |     return 0;
29 | }
30 | 


--------------------------------------------------------------------------------
/chemfiles/parse_mmtf.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a MMTF file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from chemfiles import Trajectory
 6 | 
 7 | mmtf_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | trajectory = Trajectory(mmtf_filepath)
11 | for frame in trajectory:
12 |     pass
13 | end = time.time()
14 | 
15 | print(end - start)
16 | 


--------------------------------------------------------------------------------
/chemfiles/parse_pdb.cpp:
--------------------------------------------------------------------------------
 1 | // Benchmark the parsing of a PDB file given as an argument
 2 | #include <cstdio>
 3 | #include <string>
 4 | #include <time.h>
 5 | 
 6 | #include <chemfiles.hpp>
 7 | 
 8 | // Read every step of the PDB file named by the first argument and print
 9 | // the elapsed wall-clock time in seconds
10 | int main(int argc, char* argv[]) {
11 |     // Without an argument argv[1] is a null pointer, and building a
12 |     // std::string from it is undefined behaviour
13 |     if (argc < 2) {
14 |         fprintf(stderr, "Usage: parse_pdb <pdb_filepath>\n");
15 |         return 1;
16 |     }
17 |     std::string pdb_filepath = argv[1];
18 | 
19 |     struct timespec tstart, tend;
20 |     clock_gettime(CLOCK_REALTIME, &tstart);
21 |     auto trajectory = chemfiles::Trajectory(pdb_filepath);
22 |     for (size_t step=0; step<trajectory.nsteps(); step++) {
23 |         trajectory.read();
24 |     }
25 |     clock_gettime(CLOCK_REALTIME, &tend);
26 | 
27 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
28 |     return 0;
29 | }
30 | 


--------------------------------------------------------------------------------
/chemfiles/parse_pdb.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the parsing of a PDB file given as an argument
 2 | 
 3 | import sys
 4 | import time
 5 | from chemfiles import Trajectory
 6 | 
 7 | pdb_filepath = sys.argv[1]
 8 | 
 9 | start = time.time()
10 | trajectory = Trajectory(pdb_filepath)
11 | for frame in trajectory:
12 |     pass
13 | end = time.time()
14 | 
15 | print(end - start)
16 | 


--------------------------------------------------------------------------------
/chemfiles/ramachandran.cpp:
--------------------------------------------------------------------------------
 1 | // Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | #include <cstdio>
 3 | #include <cmath>
 4 | #include <vector>
 5 | #include <string>
 6 | 
 7 | #include <time.h>
 8 | 
 9 | #include <chemfiles.hpp>
10 | 
11 | using namespace chemfiles;
12 | // Ramachandran phi (first) and psi (second) angles of a frame, in degrees
13 | using ramachandran_t = std::pair<std::vector<double>, std::vector<double>>;
14 | static ramachandran_t ramachandran(Frame& frame) {
15 |     auto phi_selection = Selection("dihedrals: name(#1) C and name(#2) N and name(#3) CA and name(#4) C");
16 |     auto phi_list = phi_selection.evaluate(frame);
17 | 
18 |     auto phi_angles = std::vector<double>();
19 |     phi_angles.reserve(phi_list.size());
20 | 
21 |     for (const auto& phi: phi_list) {
22 |         // 57.29578 to convert from radians to degrees
23 |         phi_angles.push_back(frame.dihedral(phi[0], phi[1], phi[2], phi[3]) * 57.29578);
24 |     }
25 | 
26 |     // Same again for psi; these must go into psi_angles, not phi_angles
27 |     auto psi_selection = Selection("dihedrals: name(#1) N and name(#2) CA and name(#3) C and name(#4) N");
28 |     auto psi_list = psi_selection.evaluate(frame);
29 | 
30 |     auto psi_angles = std::vector<double>();
31 |     psi_angles.reserve(psi_list.size());
32 | 
33 |     for (const auto& psi: psi_list) {
34 |         // 57.29578 to convert from radians to degrees
35 |         psi_angles.push_back(frame.dihedral(psi[0], psi[1], psi[2], psi[3]) * 57.29578);
36 |     }
37 | 
38 |     // FIXME: the sign of the angles is inverted w.r.t. the MDAnalysis results
39 |     return {phi_angles, psi_angles};
40 | }
41 | 
42 | int main() {
43 |     auto pdb_filepath = "data/1AKE.pdb";
44 |     auto frame = Trajectory(pdb_filepath).read();
45 | 
46 |     timespec tstart, tend;
47 |     clock_gettime(CLOCK_REALTIME, &tstart);
48 |     ramachandran(frame);
49 |     clock_gettime(CLOCK_REALTIME, &tend);
50 | 
51 |     printf("%.6f\n", (tend.tv_sec - tstart.tv_sec) + (tend.tv_nsec - tstart.tv_nsec) / 1e9);
52 |     return 0;
53 | }
54 | 


--------------------------------------------------------------------------------
/chemfiles/ramachandran.py:
--------------------------------------------------------------------------------
 1 | # Benchmark the calculation of Ramachandran phi/psi angles from a PDB file
 2 | import time
 3 | from chemfiles import Trajectory, Selection
 4 | 
 5 | 
 6 | def ramachandran(frame):
 7 |     phi_selection = Selection("dihedrals: name(#1) C and name(#2) N and name(#3) CA and name(#4) C")
 8 |     phi_angles = []
 9 |     for (i, j, k, m) in phi_selection.evaluate(frame):
10 |         # 57.29578 to convert from radians to degrees
11 |         phi_angles.append(frame.dihedral(i, j, k, m) * 57.29578)
12 | 
13 |     psi_selection = Selection("dihedrals: name(#1) N and name(#2) CA and name(#3) C and name(#4) N")
14 |     psi_angles = []
15 |     for (i, j, k, m) in psi_selection.evaluate(frame):
16 |         psi_angles.append(frame.dihedral(i, j, k, m) * 57.29578)
17 | 
18 |     # FIXME: the sign of the angles is inverted w.r.t. the MDAnalysis results
19 |     return phi_angles, psi_angles
20 | 
21 | 
22 | pdb_filepath = "data/1AKE.pdb"
23 | frame = Trajectory(pdb_filepath).read()
24 | 
25 | start = time.time()
26 | ramachandran(frame)
27 | end = time.time()
28 | 
29 | print(end - start)
30 | 


--------------------------------------------------------------------------------
/plot/plot.jl:
--------------------------------------------------------------------------------
 1 | # Plot benchmark results
 2 | 
 3 | using CSV
 4 | using DataFrames
 5 | using Gadfly
 6 | using Cairo
 7 | using Fontconfig
 8 | 
 9 | df = CSV.read("benchmarks.csv")
10 | 
11 | benchmarks = [
12 |     "Parse PDB 1CRN",
13 |     "Parse PDB 1HTQ",
14 |     "Parse mmCIF 1CRN",
15 |     "Parse mmCIF 1HTQ",
16 |     "Parse MMTF 1CRN",
17 |     "Parse MMTF 1HTQ",
18 |     "Count",
19 |     "Distance",
20 |     "Ramachandran",
21 | ]
22 | benchind(b) = findfirst(x -> x == b, benchmarks)
23 | df_sorted = sort(df, (order(:Benchmark, by=benchind), :Runtime))
24 | 
25 | theme = Theme(
26 |     background_color="white",
27 |     panel_stroke="black",
28 |     major_label_color="black",
29 |     minor_label_color="black",
30 |     highlight_width=0mm,
31 | )
32 | 
33 | p = Gadfly.with_theme(theme) do
34 |     plot(df_sorted,
35 |         x=:Package,
36 |         y=:Runtime,
37 |         color=:Benchmark,
38 |         Scale.y_log10,
39 |         Guide.xlabel(nothing),
40 |         Guide.ylabel("Runtime / s"),
41 |         Geom.point,
42 |         Geom.line,
43 |     )
44 | end
45 | 
46 | draw(PNG("plot/plot.png", dpi=300), p)
47 | 


--------------------------------------------------------------------------------
/plot/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreener64/pdb-benchmarks/b8150b4501a6c385722ad1815d0ba54b87a01431/plot/plot.png


--------------------------------------------------------------------------------
/tools/download_data.jl:
--------------------------------------------------------------------------------
 1 | # Download the data files required for benchmarking into data directory
 2 | 
 3 | using BioStructures
 4 | 
 5 | out_dir = "data"
 6 | 
 7 | if !isdir(out_dir)
 8 |     mkdir(out_dir)
 9 | end
10 | 
11 | for pdbid in ("1CRN", "1HTQ")
12 |     for format in (PDB, MMCIF, MMTF)
13 |         downloadpdb(pdbid, format=format, dir=out_dir)
14 |     end
15 | end
16 | 
17 | downloadpdb("1AKE", dir=out_dir)
18 | 


--------------------------------------------------------------------------------
/tools/mean.py:
--------------------------------------------------------------------------------
 1 | # Calculate the mean value from a file of numbers
 2 | 
 3 | import sys
 4 | import numpy as np
 5 | 
 6 | with open(sys.argv[1]) as in_file:
 7 |     vals = [float(line) for line in in_file]
 8 | 
 9 | print(np.mean(vals))
10 | 


--------------------------------------------------------------------------------
/tools/run_benchmarks.sh:
--------------------------------------------------------------------------------
  1 | # Run all benchmarks, save the results and form a csv file for plotting
  2 | # Requires all packages to be installed and compiled where applicable
  3 | 
  4 | echo "Running benchmarks"
  5 | 
  6 | # Number of runs for each benchmark apart from 1HTQ parsing
  7 | nb=10
  8 | # Number of runs for 1HTQ parsing
  9 | ns=3
 10 | 
 11 | # Remove current data files
 12 | rm */*.dat
 13 | 
 14 | # Remove current plot
 15 | rm plot/plot.png
 16 | 
 17 | # Reset benchmarking results
 18 | echo "Package,Benchmark,Runtime" > benchmarks.csv
 19 | 
 20 | # Run a benchmark; arguments are number of runs, benchmark command, output
 21 | # data file, csv file columns. Expansions are quoted so paths with spaces work
 22 | function run_benchmark {
 23 |     for i in $(seq 1 "$1")
 24 |     do
 25 |         eval "$2" | tail -n1 >> "$3"
 26 |     done
 27 |     printf '%s' "$4" >> benchmarks.csv
 28 |     python tools/mean.py "$3" >> benchmarks.csv
 29 | }
 30 | 
 31 | # BioStructures (Julia) - run_benchmark args: repeat count ($nb, or $ns for 1HTQ), command, timings file, CSV row prefix
 32 | run_benchmark $nb "julia BioStructures/parse_pdb.jl data/1CRN.pdb"   "BioStructures/parse_pdb_1CRN.dat"   "BioStructures,Parse PDB 1CRN,"
 33 | run_benchmark $ns "julia BioStructures/parse_pdb.jl data/1HTQ.pdb"   "BioStructures/parse_pdb_1HTQ.dat"   "BioStructures,Parse PDB 1HTQ,"
 34 | run_benchmark $nb "julia BioStructures/parse_mmcif.jl data/1CRN.cif" "BioStructures/parse_mmcif_1CRN.dat" "BioStructures,Parse mmCIF 1CRN,"
 35 | run_benchmark $ns "julia BioStructures/parse_mmcif.jl data/1HTQ.cif" "BioStructures/parse_mmcif_1HTQ.dat" "BioStructures,Parse mmCIF 1HTQ,"
 36 | run_benchmark $nb "julia BioStructures/parse_mmtf.jl data/1CRN.mmtf" "BioStructures/parse_mmtf_1CRN.dat"  "BioStructures,Parse MMTF 1CRN,"
 37 | run_benchmark $ns "julia BioStructures/parse_mmtf.jl data/1HTQ.mmtf" "BioStructures/parse_mmtf_1HTQ.dat"  "BioStructures,Parse MMTF 1HTQ,"
 38 | run_benchmark $nb "julia BioStructures/count.jl"                     "BioStructures/count.dat"            "BioStructures,Count,"
 39 | run_benchmark $nb "julia BioStructures/distance.jl"                  "BioStructures/distance.dat"         "BioStructures,Distance,"
 40 | run_benchmark $nb "julia BioStructures/ramachandran.jl"              "BioStructures/ramachandran.dat"     "BioStructures,Ramachandran,"
 41 | echo "BioStructures benchmarks done"
 42 | 
 43 | # MIToS (Julia)
 44 | run_benchmark $nb "julia MIToS/parse_pdb.jl data/1CRN.pdb" "MIToS/parse_pdb_1CRN.dat" "MIToS,Parse PDB 1CRN,"
 45 | run_benchmark $ns "julia MIToS/parse_pdb.jl data/1HTQ.pdb" "MIToS/parse_pdb_1HTQ.dat" "MIToS,Parse PDB 1HTQ,"
 46 | run_benchmark $nb "julia MIToS/count.jl"                   "MIToS/count.dat"          "MIToS,Count,"
 47 | run_benchmark $nb "julia MIToS/distance.jl"                "MIToS/distance.dat"       "MIToS,Distance,"
 48 | echo "MIToS benchmarks done"
 49 | 
 50 | # Biopython (Python)
 51 | run_benchmark $nb "python Biopython/parse_pdb.py data/1CRN.pdb"   "Biopython/parse_pdb_1CRN.dat"   "Biopython,Parse PDB 1CRN,"
 52 | run_benchmark $ns "python Biopython/parse_pdb.py data/1HTQ.pdb"   "Biopython/parse_pdb_1HTQ.dat"   "Biopython,Parse PDB 1HTQ,"
 53 | run_benchmark $nb "python Biopython/parse_mmcif.py data/1CRN.cif" "Biopython/parse_mmcif_1CRN.dat" "Biopython,Parse mmCIF 1CRN,"
 54 | run_benchmark $ns "python Biopython/parse_mmcif.py data/1HTQ.cif" "Biopython/parse_mmcif_1HTQ.dat" "Biopython,Parse mmCIF 1HTQ,"
 55 | run_benchmark $nb "python Biopython/parse_mmtf.py data/1CRN.mmtf" "Biopython/parse_mmtf_1CRN.dat"  "Biopython,Parse MMTF 1CRN,"
 56 | run_benchmark $ns "python Biopython/parse_mmtf.py data/1HTQ.mmtf" "Biopython/parse_mmtf_1HTQ.dat"  "Biopython,Parse MMTF 1HTQ,"
 57 | run_benchmark $nb "python Biopython/count.py"                     "Biopython/count.dat"            "Biopython,Count,"
 58 | run_benchmark $nb "python Biopython/distance.py"                  "Biopython/distance.dat"         "Biopython,Distance,"
 59 | run_benchmark $nb "python Biopython/ramachandran.py"              "Biopython/ramachandran.dat"     "Biopython,Ramachandran,"
 60 | echo "Biopython benchmarks done"
 61 | 
 62 | # ProDy (Python)
 63 | run_benchmark $nb "python ProDy/parse_pdb.py data/1CRN.pdb" "ProDy/parse_pdb_1CRN.dat" "ProDy,Parse PDB 1CRN,"
 64 | run_benchmark $ns "python ProDy/parse_pdb.py data/1HTQ.pdb" "ProDy/parse_pdb_1HTQ.dat" "ProDy,Parse PDB 1HTQ,"
 65 | run_benchmark $nb "python ProDy/count.py"                   "ProDy/count.dat"          "ProDy,Count,"
 66 | run_benchmark $nb "python ProDy/distance.py"                "ProDy/distance.dat"       "ProDy,Distance,"
 67 | run_benchmark $nb "python ProDy/ramachandran.py"            "ProDy/ramachandran.dat"   "ProDy,Ramachandran,"
 68 | echo "ProDy benchmarks done"
 69 | 
 70 | # MDAnalysis (Python)
 71 | run_benchmark $nb "python MDAnalysis/parse_pdb.py data/1CRN.pdb" "MDAnalysis/parse_pdb_1CRN.dat" "MDAnalysis,Parse PDB 1CRN,"
 72 | run_benchmark $ns "python MDAnalysis/parse_pdb.py data/1HTQ.pdb" "MDAnalysis/parse_pdb_1HTQ.dat" "MDAnalysis,Parse PDB 1HTQ,"
 73 | run_benchmark $nb "python MDAnalysis/count.py"                   "MDAnalysis/count.dat"          "MDAnalysis,Count,"
 74 | run_benchmark $nb "python MDAnalysis/distance.py"                "MDAnalysis/distance.dat"       "MDAnalysis,Distance,"
 75 | run_benchmark $nb "python MDAnalysis/ramachandran.py"            "MDAnalysis/ramachandran.dat"   "MDAnalysis,Ramachandran,"
 76 | echo "MDAnalysis benchmarks done"
 77 | 
 78 | # biotite (Python)
 79 | run_benchmark $nb "python biotite/parse_pdb.py data/1CRN.pdb"   "biotite/parse_pdb_1CRN.dat"   "biotite,Parse PDB 1CRN,"
 80 | run_benchmark $ns "python biotite/parse_pdb.py data/1HTQ.pdb"   "biotite/parse_pdb_1HTQ.dat"   "biotite,Parse PDB 1HTQ,"
 81 | run_benchmark $nb "python biotite/parse_mmcif.py data/1CRN.cif" "biotite/parse_mmcif_1CRN.dat" "biotite,Parse mmCIF 1CRN,"
 82 | run_benchmark $ns "python biotite/parse_mmcif.py data/1HTQ.cif" "biotite/parse_mmcif_1HTQ.dat" "biotite,Parse mmCIF 1HTQ,"
 83 | run_benchmark $nb "python biotite/parse_mmtf.py data/1CRN.mmtf" "biotite/parse_mmtf_1CRN.dat"  "biotite,Parse MMTF 1CRN,"
 84 | run_benchmark $ns "python biotite/parse_mmtf.py data/1HTQ.mmtf" "biotite/parse_mmtf_1HTQ.dat"  "biotite,Parse MMTF 1HTQ,"
 85 | echo "biotite benchmarks done"
 86 | 
 87 | # atomium (Python)
 88 | run_benchmark $nb "python atomium/parse_pdb.py   data/1CRN.pdb" "atomium/parse_pdb_1CRN.dat"   "atomium,Parse PDB 1CRN,"
 89 | run_benchmark $ns "python atomium/parse_pdb.py   data/1HTQ.pdb" "atomium/parse_pdb_1HTQ.dat"   "atomium,Parse PDB 1HTQ,"
 90 | run_benchmark $nb "python atomium/parse_mmcif.py data/1CRN.cif" "atomium/parse_mmcif_1CRN.dat" "atomium,Parse mmCIF 1CRN,"
 91 | run_benchmark $ns "python atomium/parse_mmcif.py data/1HTQ.cif" "atomium/parse_mmcif_1HTQ.dat" "atomium,Parse mmCIF 1HTQ,"
 92 | run_benchmark $nb "python atomium/parse_mmtf.py data/1CRN.mmtf" "atomium/parse_mmtf_1CRN.dat"  "atomium,Parse MMTF 1CRN,"
 93 | run_benchmark $ns "python atomium/parse_mmtf.py data/1HTQ.mmtf" "atomium/parse_mmtf_1HTQ.dat"  "atomium,Parse MMTF 1HTQ,"
 94 | echo "atomium benchmarks done"
 95 | 
 96 | # Bio3D (R)
 97 | run_benchmark $nb "Rscript Bio3D/parse_pdb.R data/1CRN.pdb" "Bio3D/parse_pdb_1CRN.dat" "Bio3D,Parse PDB 1CRN,"
 98 | run_benchmark $ns "Rscript Bio3D/parse_pdb.R data/1HTQ.pdb" "Bio3D/parse_pdb_1HTQ.dat" "Bio3D,Parse PDB 1HTQ,"
 99 | run_benchmark $nb "Rscript Bio3D/count.R"                   "Bio3D/count.dat"          "Bio3D,Count,"
100 | run_benchmark $nb "Rscript Bio3D/distance.R"                "Bio3D/distance.dat"       "Bio3D,Distance,"
101 | echo "Bio3D benchmarks done"
102 | 
103 | # Rpdb (R)
104 | run_benchmark $nb "Rscript Rpdb/parse_pdb.R data/1CRN.pdb" "Rpdb/parse_pdb_1CRN.dat" "Rpdb,Parse PDB 1CRN,"
105 | run_benchmark $ns "Rscript Rpdb/parse_pdb.R data/1HTQ.pdb" "Rpdb/parse_pdb_1HTQ.dat" "Rpdb,Parse PDB 1HTQ,"
106 | run_benchmark $nb "Rscript Rpdb/count.R"                   "Rpdb/count.dat"          "Rpdb,Count,"
107 | run_benchmark $nb "Rscript Rpdb/distance.R"                "Rpdb/distance.dat"       "Rpdb,Distance,"
108 | echo "Rpdb benchmarks done"
109 | 
110 | # BioJava (Java, run from the jar under BioJava/target)
111 | run_benchmark $nb "java -cp BioJava/target/pdb-benchmarks-1.0-SNAPSHOT.jar com.jgreener.pdb.parse_pdb data/1CRN.pdb"   "BioJava/parse_pdb_1CRN.dat"   "BioJava,Parse PDB 1CRN,"
112 | run_benchmark $ns "java -cp BioJava/target/pdb-benchmarks-1.0-SNAPSHOT.jar com.jgreener.pdb.parse_pdb data/1HTQ.pdb"   "BioJava/parse_pdb_1HTQ.dat"   "BioJava,Parse PDB 1HTQ,"
113 | run_benchmark $nb "java -cp BioJava/target/pdb-benchmarks-1.0-SNAPSHOT.jar com.jgreener.pdb.parse_mmcif data/1CRN.cif" "BioJava/parse_mmcif_1CRN.dat" "BioJava,Parse mmCIF 1CRN,"
114 | run_benchmark $ns "java -cp BioJava/target/pdb-benchmarks-1.0-SNAPSHOT.jar com.jgreener.pdb.parse_mmcif data/1HTQ.cif" "BioJava/parse_mmcif_1HTQ.dat" "BioJava,Parse mmCIF 1HTQ,"
115 | run_benchmark $nb "java -cp BioJava/target/pdb-benchmarks-1.0-SNAPSHOT.jar com.jgreener.pdb.parse_mmtf data/1CRN.mmtf" "BioJava/parse_mmtf_1CRN.dat"  "BioJava,Parse MMTF 1CRN,"
116 | run_benchmark $ns "java -cp BioJava/target/pdb-benchmarks-1.0-SNAPSHOT.jar com.jgreener.pdb.parse_mmtf data/1HTQ.mmtf" "BioJava/parse_mmtf_1HTQ.dat"  "BioJava,Parse MMTF 1HTQ,"
117 | echo "BioJava benchmarks done"
118 | 
119 | # BioPerl (Perl)
120 | run_benchmark $nb "perl BioPerl/parse_pdb.pl data/1CRN.pdb" "BioPerl/parse_pdb_1CRN.dat" "BioPerl,Parse PDB 1CRN,"
121 | run_benchmark $ns "perl BioPerl/parse_pdb.pl data/1HTQ.pdb" "BioPerl/parse_pdb_1HTQ.dat" "BioPerl,Parse PDB 1HTQ,"
122 | run_benchmark $nb "perl BioPerl/count.pl"                   "BioPerl/count.dat"          "BioPerl,Count,"
123 | run_benchmark $nb "perl BioPerl/distance.pl"                "BioPerl/distance.dat"       "BioPerl,Distance,"
124 | echo "BioPerl benchmarks done"
125 | 
126 | # BioRuby (Ruby)
127 | run_benchmark $nb "ruby BioRuby/parse_pdb.rb data/1CRN.pdb" "BioRuby/parse_pdb_1CRN.dat" "BioRuby,Parse PDB 1CRN,"
128 | run_benchmark $ns "ruby BioRuby/parse_pdb.rb data/1HTQ.pdb" "BioRuby/parse_pdb_1HTQ.dat" "BioRuby,Parse PDB 1HTQ,"
129 | run_benchmark $nb "ruby BioRuby/count.rb"                   "BioRuby/count.dat"          "BioRuby,Count,"
130 | run_benchmark $nb "ruby BioRuby/distance.rb"                "BioRuby/distance.dat"       "BioRuby,Distance,"
131 | echo "BioRuby benchmarks done"
132 | 
133 | # GEMMI (executables in GEMMI/)
134 | run_benchmark $nb "GEMMI/parse_pdb data/1CRN.pdb"   "GEMMI/parse_pdb_1CRN.dat"   "GEMMI,Parse PDB 1CRN,"
135 | run_benchmark $ns "GEMMI/parse_pdb data/1HTQ.pdb"   "GEMMI/parse_pdb_1HTQ.dat"   "GEMMI,Parse PDB 1HTQ,"
136 | run_benchmark $nb "GEMMI/parse_mmcif data/1CRN.cif" "GEMMI/parse_mmcif_1CRN.dat" "GEMMI,Parse mmCIF 1CRN,"
137 | run_benchmark $ns "GEMMI/parse_mmcif data/1HTQ.cif" "GEMMI/parse_mmcif_1HTQ.dat" "GEMMI,Parse mmCIF 1HTQ,"
138 | run_benchmark $nb "GEMMI/count"                     "GEMMI/count.dat"            "GEMMI,Count,"
139 | run_benchmark $nb "GEMMI/distance"                  "GEMMI/distance.dat"         "GEMMI,Distance,"
140 | echo "GEMMI benchmarks done"
141 | 
142 | # Victor (executable in Victor/)
143 | run_benchmark $nb "Victor/parse_pdb data/1CRN.pdb" "Victor/parse_pdb_1CRN.dat" "Victor,Parse PDB 1CRN,"
144 | run_benchmark $ns "Victor/parse_pdb data/1HTQ.pdb" "Victor/parse_pdb_1HTQ.dat" "Victor,Parse PDB 1HTQ,"
145 | echo "Victor benchmarks done"
146 | 
147 | # ESBTL (executable in ESBTL/; only the 1CRN PDB benchmark is run)
148 | run_benchmark $nb "ESBTL/parse_pdb data/1CRN.pdb" "ESBTL/parse_pdb_1CRN.dat" "ESBTL,Parse PDB 1CRN,"
149 | echo "ESBTL benchmarks done"
150 | 
151 | # chemfiles - Python
152 | run_benchmark $nb "python chemfiles/parse_pdb.py data/1CRN.pdb"   "chemfiles/parse_pdb_1CRN_py.dat"   "chemfiles-python,Parse PDB 1CRN,"
153 | # FIXME: the disabled 1HTQ benchmark below uncovered a bug in chemfiles; the bugfix will be
154 | # available in chemfiles>=0.10 when released
155 | #run_benchmark $ns "python chemfiles/parse_pdb.py data/1HTQ.pdb"   "chemfiles/parse_pdb_1HTQ_py.dat"   "chemfiles-python,Parse PDB 1HTQ,"
156 | run_benchmark $nb "python chemfiles/parse_mmcif.py data/1CRN.cif" "chemfiles/parse_mmcif_1CRN_py.dat" "chemfiles-python,Parse mmCIF 1CRN,"
157 | run_benchmark $ns "python chemfiles/parse_mmcif.py data/1HTQ.cif" "chemfiles/parse_mmcif_1HTQ_py.dat" "chemfiles-python,Parse mmCIF 1HTQ,"
158 | run_benchmark $nb "python chemfiles/parse_mmtf.py data/1CRN.mmtf" "chemfiles/parse_mmtf_1CRN_py.dat"  "chemfiles-python,Parse MMTF 1CRN,"
159 | #run_benchmark $ns "python chemfiles/parse_mmtf.py data/1HTQ.mmtf" "chemfiles/parse_mmtf_1HTQ_py.dat"  "chemfiles-python,Parse MMTF 1HTQ,"
160 | run_benchmark $nb "python chemfiles/count.py"                     "chemfiles/count_py.dat"            "chemfiles-python,Count,"
161 | run_benchmark $nb "python chemfiles/distance.py"                  "chemfiles/distance_py.dat"         "chemfiles-python,Distance,"
162 | run_benchmark $nb "python chemfiles/ramachandran.py"              "chemfiles/ramachandran_py.dat"     "chemfiles-python,Ramachandran,"
163 | echo "chemfiles-python benchmarks done"
164 | 
165 | # chemfiles - C++ (the 1HTQ PDB and MMTF runs are commented out here as well)
166 | run_benchmark $nb "chemfiles/parse_pdb data/1CRN.pdb"   "chemfiles/parse_pdb_1CRN_cxx.dat"   "chemfiles-cxx,Parse PDB 1CRN,"
167 | #run_benchmark $ns "chemfiles/parse_pdb data/1HTQ.pdb"   "chemfiles/parse_pdb_1HTQ_cxx.dat"   "chemfiles-cxx,Parse PDB 1HTQ,"
168 | run_benchmark $nb "chemfiles/parse_mmcif data/1CRN.cif" "chemfiles/parse_mmcif_1CRN_cxx.dat" "chemfiles-cxx,Parse mmCIF 1CRN,"
169 | run_benchmark $ns "chemfiles/parse_mmcif data/1HTQ.cif" "chemfiles/parse_mmcif_1HTQ_cxx.dat" "chemfiles-cxx,Parse mmCIF 1HTQ,"
170 | run_benchmark $nb "chemfiles/parse_mmtf data/1CRN.mmtf" "chemfiles/parse_mmtf_1CRN_cxx.dat"  "chemfiles-cxx,Parse MMTF 1CRN,"
171 | #run_benchmark $ns "chemfiles/parse_mmtf data/1HTQ.mmtf" "chemfiles/parse_mmtf_1HTQ_cxx.dat"  "chemfiles-cxx,Parse MMTF 1HTQ,"
172 | run_benchmark $nb "chemfiles/count"                     "chemfiles/count_cxx.dat"            "chemfiles-cxx,Count,"
173 | run_benchmark $nb "chemfiles/distance"                  "chemfiles/distance_cxx.dat"         "chemfiles-cxx,Distance,"
174 | run_benchmark $nb "chemfiles/ramachandran"              "chemfiles/ramachandran_cxx.dat"     "chemfiles-cxx,Ramachandran,"
175 | echo "chemfiles-cxx benchmarks done"
176 | 
177 | # Plot results by running plot/plot.jl with Julia
178 | julia plot/plot.jl
179 | echo "Results plotted"
180 | 


--------------------------------------------------------------------------------
/tools/table.jl:
--------------------------------------------------------------------------------
 1 | # Print the benchmark results as a markdown table
 2 | #
 3 | # Reads `benchmarks.csv`, whose rows are `software,benchmark,runtime`, and prints
 4 | # one table row per benchmark with one column per software package. A package
 5 | # with no result for a benchmark is shown as "-".
 6 | 
 7 | # benchmark name => (software name => runtime as read from the CSV)
 8 | times = Dict{String, Dict{String, Float64}}()
 9 | 
10 | open("benchmarks.csv") do f
11 |     for line in eachline(f)
12 |         # Skip the header row, and blank lines (e.g. a trailing newline) that
13 |         # cannot be split into the three expected fields
14 |         if startswith(line, "Package") || isempty(strip(line))
15 |             continue
16 |         end
17 |         software, benchmark, runtime = split(line, ",")
18 |         # Create the per-benchmark dictionary on first use, then record the
19 |         # runtime (a later row for the same pair overwrites an earlier one)
20 |         results = get!(Dict{String, Float64}, times, String(benchmark))
21 |         results[String(software)] = parse(Float64, runtime)
22 |     end
23 | end
24 | 
25 | # Column order of the table
26 | softwares = ("BioStructures", "MIToS", "Biopython", "ProDy", "MDAnalysis", "biotite",
27 |              "atomium", "Bio3D", "Rpdb", "BioJava", "BioPerl", "BioRuby", "GEMMI",
28 |              "Victor", "ESBTL", "chemfiles-python", "chemfiles-cxx")
29 | 
30 | # Each entry is (benchmark name in the CSV, row label, whether to scale to ms)
31 | for (benchmark, label, millisecond) in (
32 |                             ("Parse PDB 1CRN"  , "Parse PDB 1CRN / ms"  , true ),
33 |                             ("Parse PDB 1HTQ"  , "Parse PDB 1HTQ / s"   , false),
34 |                             ("Parse mmCIF 1CRN", "Parse mmCIF 1CRN / ms", true ),
35 |                             ("Parse mmCIF 1HTQ", "Parse mmCIF 1HTQ / s" , false),
36 |                             ("Parse MMTF 1CRN" , "Parse MMTF 1CRN / ms" , true ),
37 |                             ("Parse MMTF 1HTQ" , "Parse MMTF 1HTQ / s"  , false),
38 |                             ("Count"           , "Count / ms"           , true ),
39 |                             ("Distance"        , "Distance / ms"        , true ),
40 |                             ("Ramachandran"    , "Ramachandran / ms"    , true ))
41 |     # A benchmark with no rows in the CSV (e.g. after a partial run) gets an
42 |     # empty result set, so every column prints "-" instead of raising a KeyError
43 |     results = get(times, benchmark, Dict{String, Float64}())
44 |     print("| $(rpad(label, 21)) |")
45 |     for software in softwares
46 |         if haskey(results, software)
47 |             # Runtimes are multiplied by 1000 for the "/ ms" rows
48 |             scale = millisecond ? 1000 : 1
49 |             val = string(round(scale * results[software], sigdigits=2))
50 |         else
51 |             val = "-"
52 |         end
53 |         print(" $(rpad(val, 16)) |")
54 |     end
55 |     println()
56 | end
57 | 


--------------------------------------------------------------------------------