The Sequence Read Archive (SRA) contains the raw DNA-sequencing data for many published datasets. We were interested in which sequencing platforms were prevalent, or rather, how prevalent Illumina Sequencing is. We were specifically interested in Human data, notably Whole Genome Sequencing (WGS).

# SETUP
library('scales')

To get at the metadata contained within the SRA, I did a search for all human samples - "homo sapiens"[Organism]. This returns 184586 results at the time of writing. The summaries of these search results can be downloaded as a .csv file, which is used in this analysis.

# Load in the data
data <- read.csv('sra_result_human.csv')

# Look at the column headers
colnames(data)
##  [1] "Experiment.Accession"   "Experiment.Title"      
##  [3] "Organism.Name"          "Instrument"            
##  [5] "Submitter"              "Study.Accession"       
##  [7] "Study.Title"            "Sample.Accession"      
##  [9] "Sample.Title"           "Total.Size..Mb"        
## [11] "Total.RUNs"             "Total.Spots"           
## [13] "Total.Bases"            "FTP.Path.to.Experiment"
## [15] "Library.Name"           "Library.Strategy"      
## [17] "Library.Source"         "Library.Selection"
# Split up by Illumina and Not Illumina
illumina_machines <- c('Illumina Genome Analyzer', 'Illumina Genome Analyzer II', 'Illumina Genome Analyzer IIx',
                       'Illumina HiScanSQ', 'Illumina HiSeq 1000', 'Illumina HiSeq 1500', 'Illumina HiSeq 2000',
                       'Illumina HiSeq 2500', 'Illumina MiSeq')
total_illumina <- data[(data$Instrument %in% illumina_machines),]
total_not_illumina <- data[!(data$Instrument %in% illumina_machines),]

Initial Overview

We’re most interested in the number of samples and the total number of base pairs sequenced - these give a feel of the dominancy of certain platforms.

# Count samples and base pairs
num_samples <- nrow(data)
num_samples.illumina <- nrow(total_illumina)
num_samples.not_illumina <- nrow(total_not_illumina)
num_bp <-sum(data$Total.Bases, na.rm=TRUE)
num_bp.illumina <- sum(total_illumina$Total.Bases, na.rm=TRUE)
num_bp.not_illumina <- sum(total_not_illumina$Total.Bases, na.rm=TRUE)

# What numbers are we talking about here?
num_samples
## [1] 184496
num_bp
## [1] 2.037e+15
# Plot the number of samples for Illumina / Not Illumina
par(mai=c(1,1,1,1))
pie(c(num_samples.illumina, num_samples.not_illumina), init.angle=90, 
    labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
             paste('Not Illumina -', percent(num_samples.not_illumina/num_samples))))
title("Number of Samples per Platform")

plot of chunk unnamed-chunk-4

# Plot the total base pairs sequenced for Illumina / Not Illumina
pie(c(num_bp.illumina, num_bp.not_illumina), init.angle=0,
    labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
             paste('Not Illumina -', percent(num_bp.not_illumina/num_bp))))
title("Number of Base Pairs per Platform")

plot of chunk unnamed-chunk-4

Hang on, what was that?

The above plot is really surprising! The vast majority of samples are sequences using Illumina machines, but when looking at the total number of base pairs this drops significantly.

Explaining the skew in percentage of base pairs

# Back to the original data - plot number of samples for all instruments
par(mai=c(1,3,1,1))
samples_by_instrument <- table(data$Instrument)
samples_by_instrument <- sort(samples_by_instrument)
barplot(samples_by_instrument, horiz=TRUE, las=1, main="Samples per Instrument", cex.names=0.7)

plot of chunk unnamed-chunk-5

# Plot the total base pairs for all instruments
bp_by_instrument <- rowsum(data$Total.Bases, data$Instrument, na.rm = TRUE)
bp_by_instrument <- bp_by_instrument[order(bp_by_instrument[,1]),] # sort by frequency
barplot(bp_by_instrument, horiz=TRUE, las=1, main="BP Per Instrument", cex.names=0.7)

plot of chunk unnamed-chunk-5

Ok, So it looks like this discrepancy is caused by a handful of samples sequenced using the Complete Genomics platform. What kinds of studies were these?

# Split this up for library types
# I should group the low frequency types into "other". For now, it's just
# sorted by frequency so ignore the stuff at the bottom of the legend.
par(mai=c(1,3,1,1))
samples_type_per_instrument <- table(data$Library.Strategy, data$Instrument)
samples_type_per_instrument <- samples_type_per_instrument[order(rowSums(samples_type_per_instrument),decreasing=TRUE),order(colSums(samples_type_per_instrument))]
library('RColorBrewer')
colours<-brewer.pal(11, 'Spectral')
cols<-colorRampPalette(colours)(nrow(samples_type_per_instrument))
barplot(samples_type_per_instrument, horiz=TRUE, las=1, main="Samples per Instrument", col=colours, cex.names=0.7)
legend("bottomright", rownames(samples_type_per_instrument), fill=colours, cex=0.7)

plot of chunk unnamed-chunk-6

# Base pairs per platform split by library types
# This is horrible code. If anyone has a better way of writing this I'd love to know it!
library('plyr')
library('reshape2')
bp_type_per_instrument <- aggregate(Total.Bases ~ Instrument + Library.Strategy, data = data, sum, simplify=TRUE)
# Unmelt these two variables to make a matrix
bp_type_per_instrument <- dcast(bp_type_per_instrument, Library.Strategy ~ Instrument, fill=0)
## Using Total.Bases as value column: use value.var to override.
# Massage into the correct format
rownames(bp_type_per_instrument) <- bp_type_per_instrument[,1]
bp_type_per_instrument <- as.matrix(bp_type_per_instrument[,-1])
# Sort that bad boy
bp_type_per_instrument <- bp_type_per_instrument[order(rowSums(bp_type_per_instrument),decreasing=TRUE),order(colSums(bp_type_per_instrument))]
cols<-colorRampPalette(colours)(nrow(bp_type_per_instrument))
barplot(bp_type_per_instrument, horiz=TRUE, las=1, main="Base Pairs per Instrument", col=colours, cex.names=0.7)
legend("bottomright", rownames(bp_type_per_instrument), fill=colours, cex=0.7)

plot of chunk unnamed-chunk-6

Ok, so that explains it - there are some Whole Genome Sequencing (WGS) and Whole Exome Sequencing (WXS) projects done by Complete Genomics which are massive and are accounting for a large proportion of the base pairs stored on the SRA.

Complete Genomics are a sequencing facility based in the USA. They have a very complicated library prep and bioinformatics pipline. As such, they don’t sell sequencing machines, but do #’ genome sequencing of samples sent in. They were recently bought by BGI.

Repeating the Summary Stats with Complete Genomics

# Repeat counts with Complete genomics
total_cg <- data[data$Instrument == 'Complete Genomics',]
num_samples.cg <- nrow(total_cg)
num_samples.not_illumina_cg <- num_samples.not_illumina - num_samples.cg
num_bp.cg <- sum(total_cg$Total.Bases, na.rm=TRUE)
num_bp.not_illumina_cg <- num_bp.not_illumina - num_bp.cg

# Summarise again with Complete genomics
par(mai=c(1,1,1,1))

pie(c(num_samples.illumina, num_samples.cg, num_samples.not_illumina_cg), init.angle=90,
    labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
             paste('Complete Genomics -', percent(num_samples.cg/num_samples)),
             paste('Not Illumina -', percent(num_samples.not_illumina_cg/num_samples))))
title("Number of Samples per Platform")

plot of chunk unnamed-chunk-7

pie(c(num_bp.illumina, num_bp.cg, num_bp.not_illumina_cg), init.angle=140,
    labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
             paste('Complete Genomics -', percent(num_bp.cg/num_bp)),
             paste('Not Illumina -', percent(num_bp.not_illumina_cg/num_bp))))
title("Number of Base Pairs per Platform")

plot of chunk unnamed-chunk-7

WGS Summary, without Complete Genomics

Finally, it would be nice to know the precentages for just Whole Genome Sequencing (WGS), ignoring the Complete Genomics platform.

# Find summary stats in WGS only, without Complete Genomics
total_illumina.WGS <- total_illumina[total_illumina$Library.Strategy=='WGS',]
total_not_illumina_cg <- data[!(data$Instrument %in% c(illumina_machines, 'Complete Genomics')),]
total_not_illumina_cg.WGS <- total_not_illumina_cg[total_not_illumina_cg$Library.Strategy=='WGS',]

num_samples.wgs.illumina <- nrow(total_illumina.WGS)
num_samples.wgs.not_illumina_cg <- nrow(total_not_illumina_cg.WGS)
num_samples.wgs.not_cg <- num_samples.wgs.illumina + num_samples.wgs.not_illumina_cg
num_bp.wgs.illumina <- sum(total_illumina.WGS$Total.Bases, na.rm=TRUE)
num_bp.wgs.not_illumina_cg <- sum(total_not_illumina_cg.WGS$Total.Bases, na.rm=TRUE)
num_bp.wgs.not_cg <- num_bp.wgs.illumina + num_bp.wgs.not_illumina_cg

pie(c(num_samples.wgs.illumina, num_samples.wgs.not_illumina_cg), init.angle=90,
    labels=c(paste('Illumina -', percent(num_samples.wgs.illumina/num_samples.wgs.not_cg)),
             paste('Not Illumina or CG -', percent(num_samples.wgs.not_illumina_cg/num_samples.wgs.not_cg))))
title("Number of Samples per Platform - WGS, no CG")

plot of chunk unnamed-chunk-8

pie(c(num_bp.wgs.illumina, num_bp.wgs.not_illumina_cg), init.angle=90,
    labels=c(paste('Illumina -', percent(num_bp.wgs.illumina/num_bp.wgs.not_cg)),
             paste('Not Illumina or CG -', percent(num_bp.wgs.not_illumina_cg/num_bp.wgs.not_cg))))
title("Number of Base Pairs per Platform - WGS, no CG")

plot of chunk unnamed-chunk-8