The time complexity of the algorithm depends on the number of intervals (candidate changepoints stored). Here we compute the mean number of intervals for real Mono27ac data, and synthetic count data which are always increasing.
data(Mono27ac, package="PeakSegDisk", envir=environment())
library(data.table)
loss.list <- list()
N.data.vec <- 10^seq(1, 3)
for(penalty in c(0, 1e2, 1e4, 1e6)){
for(N.data in N.data.vec){
some.cov <- Mono27ac$coverage[1:N.data]
some.inc <- data.table(some.cov)
some.inc[, count := 1:.N]
data.list <- list(
real=some.cov,
synthetic=some.inc)
for(short.type in names(data.list)){
df <- data.list[[short.type]]
fit <- PeakSegDisk::PeakSegFPOP_df(df, penalty)
loss.list[[paste(penalty, N.data, short.type)]] <- data.table(
N.data,
short.type,
fit$loss)
}
}
}
(loss <- do.call(rbind, loss.list))[, .(
penalty, short.type, N.data,
mean.intervals, max.intervals,
megabytes, seconds)][order(penalty, short.type, N.data)]
#> penalty short.type N.data mean.intervals max.intervals megabytes seconds
#> <int> <char> <num> <num> <int> <num> <num>
#> 1: 0 real 10 0.9500 1 0.0008850098 0.010
#> 2: 0 real 100 1.8250 4 0.0122909546 0.016
#> 3: 0 real 1000 2.8325 5 0.1614456177 0.090
#> 4: 0 synthetic 10 2.3000 3 0.0013999939 0.001
#> 5: 0 synthetic 100 2.8200 5 0.0160865784 0.018
#> 6: 0 synthetic 1000 2.9920 5 0.1675300598 0.081
#> 7: 100 real 10 1.0500 3 0.0009231567 0.001
#> 8: 100 real 100 3.1350 8 0.0172882080 0.008
#> 9: 100 real 1000 10.9590 30 0.4714469910 0.112
#> 10: 100 synthetic 10 3.3000 7 0.0017814636 0.001
#> 11: 100 synthetic 100 8.5050 17 0.0377731323 0.023
#> 12: 100 synthetic 1000 54.5855 108 2.1356658936 0.283
#> 13: 10000 real 10 3.1000 5 0.0017051697 0.002
#> 14: 10000 real 100 7.5750 12 0.0342254639 0.014
#> 15: 10000 real 1000 11.9290 31 0.5084495544 0.152
#> 16: 10000 synthetic 10 3.8000 6 0.0019721985 0.002
#> 17: 10000 synthetic 100 20.7450 39 0.0844650269 0.021
#> 18: 10000 synthetic 1000 151.7575 315 5.8424835205 0.637
#> 19: 1000000 real 10 1.8500 4 0.0012283325 0.001
#> 20: 1000000 real 100 4.5000 11 0.0224952698 0.007
#> 21: 1000000 real 1000 13.2370 19 0.5583457947 0.084
#> 22: 1000000 synthetic 10 2.0000 4 0.0012855530 0.002
#> 23: 1000000 synthetic 100 21.4600 29 0.0871925354 0.028
#> 24: 1000000 synthetic 1000 192.2360 289 7.3866157532 0.551
#> penalty short.type N.data mean.intervals max.intervals megabytes seconds
Theoretically the most intervals that could be stored is \(O(i)\) for
each data point \(i\in\{1, ..., N\}\). Therefore the largest total
number of intervals is sum(1:N)
, which can also be computed by
N*(N+1)/2
. The largest mean is mean(1:N)
, which can be computed
via sum(1:N)/N
= (N+1)/2
.
(worst.dt <- data.table(
N.data=N.data.vec,
mean.intervals=(N.data.vec+1)/2,
short.type="theoretical"))
#> N.data mean.intervals short.type
#> <num> <num> <char>
#> 1: 10 5.5 theoretical
#> 2: 100 50.5 theoretical
#> 3: 1000 500.5 theoretical
The plot below shows that the algorithm achieves the theoretical worst case time complexity for the synthetic increasing data, when the penalty is large. But the number of intervals is always much smaller for real Mono27ac ChIP-seq data.
one <- function(short.type, data.type, color){
data.table(short.type, data.type, color)
}
type.dt <- rbind(
one("theoretical", "Theoretical\nworst case", "grey"),
one("synthetic", "Synthetic\nIncreasing", "red"),
one("real", "Real ChIP-seq", "black"))
loss.types <- type.dt[loss, on=list(short.type)]
worst.types <- type.dt[worst.dt, on=list(short.type)]
(type.colors <- type.dt[, structure(color, names=data.type)])
#> Theoretical\nworst case Synthetic\nIncreasing Real ChIP-seq
#> "grey" "red" "black"
if(require(ggplot2)){
ggplot()+
guides(
color=guide_legend(keyheight=3)
)+
geom_blank(aes(
N.data, 1),
data=data.table(N.data=c(5, 2000)))+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_grid(. ~ penalty, labeller=label_both)+
geom_line(aes(
N.data, mean.intervals, color=data.type),
data=worst.types)+
scale_color_manual(
values=type.colors,
breaks=names(type.colors))+
geom_line(aes(
bedGraph.lines, mean.intervals, color=data.type),
data=loss.types)+
scale_x_log10("N data")+
scale_y_log10("Mean intervals (candidate changepoints)")
}