├── .Rbuildignore
├── .Rproj.user
    └── CD02ED0
    │   ├── cpp-definition-cache
    │   ├── pcs
    │       ├── debug-breakpoints.pper
    │       ├── files-pane.pper
    │       ├── source-pane.pper
    │       ├── windowlayoutstate.pper
    │       └── workbench-pane.pper
    │   ├── persistent-state
    │   ├── saved_source_markers
    │   └── sdb
    │       ├── prop
    │           ├── 23C73958
    │           ├── 2E63A419
    │           ├── 32E011F9
    │           ├── 68A9BA79
    │           ├── D9AC0466
    │           ├── DD5458E2
    │           ├── EBA575A7
    │           ├── F1C0BA22
    │           ├── F78FEF6A
    │           ├── F854CBB1
    │           └── INDEX
    │       └── s-2ED88DA7
    │           ├── 493EA3DC
    │           └── lock_file
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── reading_functions.R
    └── util_functions.R
├── README.md
├── example_peak_calling.R
├── man
    ├── combine.experiments.Rd
    ├── combined.analysis.Rd
    ├── data.frame.to.peakC.Rd
    ├── hello.Rd
    ├── pairwise.jaccard.Rd
    ├── plot_C.Rd
    ├── readMatrix.Rd
    ├── readMultiColumnFile.Rd
    ├── readMultiple.Rd
    ├── readMultipleWig.Rd
    ├── readqWig.Rd
    ├── rem.Rd
    ├── running.Rd
    └── single.analysis.Rd
├── peakC.Rproj
└── tutorial
    ├── data
        ├── Hba_cis.txt
        ├── Hbb_cis.txt
        ├── alpha-globin_ESC_1_cis.wig
        ├── alpha-globin_FL_1_cis.wig.gz
        ├── alpha-globin_FL_2_cis.wig.gz
        ├── alpha-globin_FL_3_cis.wig.gz
        ├── alpha-globin_forward_1_chr11_win_1_cis.wig.zip
        ├── alpha-globin_mESC_1_cis.wig.gz
        ├── alpha-globin_mESC_2_cis.wig.gz
        ├── alpha-globin_mESC_3_cis.wig.gz
        └── peakC_tutorial_Oacyl_6_reps_mappedReads.txt
        │   ├── set2_viewpoint_1_replicate_1.txt.gz
        │   ├── set2_viewpoint_1_replicate_2.txt.gz
        │   ├── set2_viewpoint_1_replicate_3.txt.gz
        │   ├── set2_viewpoint_1_replicate_4.txt.gz
        │   ├── set2_viewpoint_1_replicate_5.txt.gz
        │   └── set2_viewpoint_1_replicate_6.txt.gz
    ├── library.bib
    ├── peakC_tutorial.Rmd
    ├── peakC_tutorial.html
    ├── peakC_tutorial.pdf
    └── peakC_tutorial_conflict-20180412-115545.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/cpp-definition-cache:
--------------------------------------------------------------------------------
1 | [
2 | ]


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/pcs/debug-breakpoints.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "debugBreakpointsState" : {
3 |         "breakpoints" : [
4 |         ]
5 |     }
6 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/pcs/files-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "path" : "~/development/R/packages/peakC/R",
3 |     "sortOrder" : [
4 |         {
5 |             "ascending" : true,
6 |             "columnIndex" : 2
7 |         }
8 |     ]
9 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/pcs/source-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "activeTab" : 0
3 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/pcs/windowlayoutstate.pper:
--------------------------------------------------------------------------------
 1 | {
 2 |     "left" : {
 3 |         "panelheight" : 1007,
 4 |         "splitterpos" : 419,
 5 |         "topwindowstate" : "NORMAL",
 6 |         "windowheight" : 1045
 7 |     },
 8 |     "right" : {
 9 |         "panelheight" : 1007,
10 |         "splitterpos" : 629,
11 |         "topwindowstate" : "NORMAL",
12 |         "windowheight" : 1045
13 |     }
14 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/pcs/workbench-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "TabSet1" : 2,
3 |     "TabSet2" : 0
4 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/persistent-state:
--------------------------------------------------------------------------------
 1 | build-last-errors="[]"
 2 | build-last-errors-base-dir="~/development/R/packages/peakC/"
 3 | build-last-outputs="[{\"output\":\"==> R CMD INSTALL --no-multiarch --with-keep.source peakC\\n\\n\",\"type\":0},{\"output\":\"* installing to library ‘/home/elzo/R/x86_64-pc-linux-gnu-library/3.1’\\n\",\"type\":1},{\"output\":\"* installing *source* package ‘peakC’ ...\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** R\\n\",\"type\":1},{\"output\":\"** preparing package for lazy loading\\n\",\"type\":1},{\"output\":\"** help\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** building package indices\\n\",\"type\":1},{\"output\":\"** testing if installed package can be loaded\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* DONE (peakC)\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]"
 4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}"
 5 | console_procs="[]"
 6 | files.monitored-path=""
 7 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}"
 8 | imageDirtyState="1"
 9 | saveActionState="-1"
10 | 


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/saved_source_markers:
--------------------------------------------------------------------------------
1 | {"active_set":"","sets":[]}


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/23C73958:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/2E63A419:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/32E011F9:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/68A9BA79:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/D9AC0466:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/DD5458E2:
--------------------------------------------------------------------------------
1 | {
2 |     "tempName" : "Untitled2"
3 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/EBA575A7:
--------------------------------------------------------------------------------
1 | {
2 |     "tempName" : "Untitled1"
3 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/F1C0BA22:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/F78FEF6A:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/F854CBB1:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/prop/INDEX:
--------------------------------------------------------------------------------
 1 | %2Fdata%2F4C%2Fhigh_res_methods%2Fnew_analysis%2FpeakC_new_allwig.R="F78FEF6A"
 2 | %2Fdata%2F4C%2Fhigh_res_methods%2Fnew_analysis%2Futil_functions.R="F1C0BA22"
 3 | ~%2Fdevelopment%2FR%2Fpackages%2Fbackup%2FpeakC%2FR%2Fcombined_experiment_analysis.R="2E63A419"
 4 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2FDESCRIPTION="F854CBB1"
 5 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2FNAMESPACE="D9AC0466"
 6 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2FR%2Fexample_peak_calling.R="EBA575A7"
 7 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2FR%2Fhello.R="68A9BA79"
 8 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2FR%2Freading_functions.R="DD5458E2"
 9 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2FR%2Futil_functions.R="23C73958"
10 | ~%2Fdevelopment%2FR%2Fpackages%2FpeakC%2Fexample_peak_calling.R="32E011F9"
11 | 


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/s-2ED88DA7/493EA3DC:
--------------------------------------------------------------------------------
 1 | {
 2 |     "contents" : "\n################################################\n#4C specific functions\n################################################\n\n\nsignificant.fragments <- function( p.value, pos, window = 21, FDR = 0.01 ){\n  #correct the nominal p-value for multiple hypothesis testing\n  p.combined <- p.adjust(p.value, method=\"fdr\")\n  #determine the significant windows and select the fragments therein\n  sig.i <- which(p.combined < FDR)\n  if(length(sig.i)>0) {\n\t\tsig.i.start <- sig.i-floor(window/2); sig.i.end <- sig.i+floor(window/2)\n\t\tsig.i <- unique(multi.seq(sig.i.start,sig.i.end))\n\t\tsig.i <- sig.i[sig.i >= 1 & sig.i <= length(pos)]\n\t\tsigFrags <- pos[sig.i]\n\t\treturn(sigFrags)\n  } else {\n    return(NULL)\n  }\n}\n\nrighttailgamma = function(r,k,n) 1 - pgamma(-log(r/(n+1)^k),k,scale=1)\n\nrank.product.p <- function( data, num.exp,method=\"diff\"){\n  if(method==\"diff\") {\n  stats <- data[,2:(num.exp+1)]-data[,(2:(num.exp+1))+num.exp]\n  } else {\n  stats <- data[,2:(num.exp+1)]/data[,(2:(num.exp+1))+num.exp]\n  }\n  rp <- nrow(data)-apply(stats,2,rank)+1\n  rp <- apply(rp,1,prod)\n  p <- righttailgamma(rp,num.exp,length(rp))\n}\n\n\ngetWindowedFrags <- function(x,frags,wSize=21) {\n\n\toutFrags <- ((match(x,frags)-floor(wSize/2))):((match(x,frags)+floor(wSize/2)))\n\toutFrags <- outFrags[outFrags>=1&outFrags<=length(frags)]\n\n\treturn(frags[outFrags])\n\n}\n\n#set a dynamic threshold for the residuals\ngetThreshold <- function(resids,qW=5) {\n\tq75 <- quantile(resids,probs=0.75) #75% quantile of the residuals\n\tqd50 <- diff(quantile(resids,probs=c(0.25,0.75))) #the range between the 25% and 75% quantiles\n\tthreshold <- q75 + qW*qd50\n\treturn(threshold)\n}\n\n\n#these are not \"significant\" frags just above an arbitrary threshold\nthresholdFrags <- function(resids,frags,wSize=21,qW=5) {\n\n\tqMax <- getThreshold(resids=resids,qW=qW)\n\n\tsel.i <- which(resids > qMax)\n  if(length(sel.i)>0) {\n\t\tsel.i.start <- 
sel.i-floor(wSize/2); sel.i.end <- sel.i+floor(wSize/2)\n\t\tsel.i <- unique(multi.seq(sel.i.start,sel.i.end))\n\t\tsel.i <- sel.i[sel.i >= 1 & sel.i <= length(frags)]\n\t\tselFrags <- frags[sel.i]\n\t\treturn(selFrags)\n\t}else{\n\t\treturn(NULL)\n\t}\n\n\n}\n\n#' Single experiment 4C/Capture C analysis\n#'\n#' @param data list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments\n#' @param vp.pos viewpoint position, this can be a single value or a two values to analyse a viewpoint region\n#' @param wSize number of fragments in a window\n#' @param alphaFDR false-discovery rate threshold\n#' @param qWd threshold for difference from the background\n#' @param qWr threshold for ratio over the background\n#' @param minDist minimal region around the viewpoint to exclude for the significance analysis\n#'\n#' @description Function for identifying interaction peaks above a background distribution. A list containing a 4C/Capture-C dataset is required as input. 
The viewpoint position is given in the vp.pos argument.\n#'\n#' @return a list containing a matrix with the data and the background model and a vector with the significant fragments\n#' @export\n#'\n#' @examples\n#' data <- readMultiple(f[1:3], vp.pos = 65923803)\n#' res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)\n#'\n#'\n#'\nsingle.analysis <- function(data, vp.pos, wSize = 21, qWd = 1.5, qWr = 1, minDist = 15e3) {\n\n  #create two element vector containing the viewpoint position\n  #if only one viewpoint is given\n  if(length(vp.pos) == 1){\n    vp.pos <- c(vp.pos,vp.pos)\n  }\n  vp.pos <- sort(vp.pos)\n\n\n  db <- get.single.background(data=data[[1]], num.exp = 1, vp.pos=vp.pos)\n  #running mean over the data\n  db[,2] <- caTools::runmean(x=db[,2],k=wSize,endrule=\"mean\")\n  #running mean over the isotonic regression line\n  db[,3] <- caTools::runmean(x=db[,3],k=5,endrule=\"mean\")\n\n  #add a pseudocount to improve the calculations\n  pseudoCount <- non.zero.quantile(x=db[,2],probs=0.05)\n  ratios <- cbind(db[, 1], (db[, 2] + pseudoCount)/(db[,3] + pseudoCount))\n  deltas <- cbind(db[, 1], db[,2]-db[,3])\n\n  #frags <- db[, 1]\n  #distFrags <- frags[abs(frags-vp.pos)>=minDist]\n\n  #select the fragments that are more than minDist from the viewpoint\n  sel.frag <- db[which( (db[,1] < vp.pos[1] & vp.pos[1]-db[,1] > minDist) | (db[,1] > vp.pos[2] & db[,1]-vp.pos[2] > minDist) ),1]\n\n  rTFrags <- thresholdFrags(resids=ratios[,2],frags=ratios[,1],wSize=wSize,qW=qWr)\n  dTFrags <- thresholdFrags(resids=deltas[,2],frags=deltas[,1],wSize=wSize,qW=qWd)\n  peakFrags <- intersect(intersect(rTFrags,dTFrags), sel.frag)\n\n  return(list(dbR=db,ratios=ratios,deltas=deltas,peak=peakFrags, num.exp=1))\n\n}\n\n#' Combined 4C/Capture C analysis\n#'\n#' @param data list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments\n#' @param num.exp number of experiments, used in conjuction with \"data\" and 
\"multi\" in type, default is 0 which means the number in the data list is used, a different number overwrites the default number\n#' @param vp.pos viewpoint position, this can be a single value or a two values to analyse a viewpoint region\n#' @param wSize number of fragments in a window\n#' @param alphaFDR false-discovery rate threshold\n#' @param qW threshold for absolute difference\n#' @param minDist minimal region around the viewpoint to exclude for the significance analysis\n#'\n#' @description Function for identifying interaction peaks above a background distribution. A list of 4C/Capture-C datasets are required as input. The viewpoint position is given in the vp.pos argument.\n#'\n#' @return a list containing a matrix with the data and the background model and a vector with the significant fragments\n#' @export\n#'\n#' @examples\n#' data <- readMultiple(f[1:3], vp.pos = 65923803)\n#' res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)\n#'\n#'\n#'\ncombined.analysis <- function( data, num.exp = 0, vp.pos, wSize = 21, alphaFDR = 0.1, qWr = 1, minDist = 15e3 ){\n  #set the number of experiments\n  if(num.exp == 0){\n    num.exp = data$num.exp\n\n  }\n\n  #create two element vector containing the viewpoint position\n  #if only one viewpoint is given\n  if(length(vp.pos) == 1){\n    vp.pos <- c(vp.pos,vp.pos)\n  }\n  vp.pos <- sort(vp.pos)\n\n\n\tdb <- combine.experiments(data,num.exp, vp.pos)\n\n\t# make a data.frame where a running mean is already applied to apply all statistics to those data -> stronger (positive) dependency between statistics but less variance\n\n\tdbR <- db\n\t#running mean over the data\n\tdbR[,2:(num.exp+1)] <- apply(db[,2:(num.exp+1)],2,caTools::runmean,k=wSize,endrule=\"mean\")\n\t#running mean over the isotonic regression line (window is 5)\n\tdbR[,2:(num.exp+1)+num.exp] <- apply(db[,2:(num.exp+1)+num.exp],2,caTools::runmean,k=5,endrule=\"mean\")\n\n\t# for the ratios add a small pseudocount to avoid dividing by 0. 
Calculate the ratios and deltas (diff) on the runmean data\n\n\t#add a small pseudo count so that there will be no divide/0 errors\n\tpseudoCount <- apply(db[,2:(num.exp+1)], 2, non.zero.quantile, probs=0.05)\n\tprint(pseudoCount)\n\tpseudoCount <- sum(pseudoCount)/num.exp\n\t#calculate the ratio of the data with the regression line\n\tratio <- cbind(db[,1],(dbR[,2:(num.exp+1)]+pseudoCount)/(dbR[,(2:(num.exp+1))+num.exp]+pseudoCount))\n\t#calculate the differene between the data and the regression line\n\tdelta <- cbind(db[,1],dbR[,2:(num.exp+1)]-dbR[,(2:(num.exp+1))+num.exp])\n\n\t#determine the per-window p-value using rank products based on the ratio\n\tp.val <- rank.product.p(data = dbR, num.exp = num.exp,method=\"diff\")\n\t#select the significant fragments\n\tsfr <- significant.fragments(p.value = p.val, pos = db[, 1], window = wSize, FDR = alphaFDR)\n\n\t#select the fragments that are more than minDist from the viewpoint\n\tsel.frag <- db[which( (db[,1] < vp.pos[1] & vp.pos[1]-db[,1] > minDist) | (db[,1] > vp.pos[2] & db[,1]-vp.pos[2] > minDist) ),1]\n\tidx <- delta[,1]%in%sel.frag\n\n\t#set a threshold on the minimal delta threshold, this threshold is defined empirically\n\ttfr <- thresholdFrags(resids=apply(ratio[idx,2:(num.exp+1)],1,mean),frags=ratio[idx,1],wSize=wSize,qW=qWr)\n\n\tsfr <- intersect(sfr,tfr)\n\tlist(dbR=dbR, peak=sfr, num.exp = num.exp, p.value=p.val, ratio = apply(ratio[,2:(num.exp+1)],1,mean), sel=sel.frag )\n}\n\n#' Take a result from the combined.analysis function and generate a chromosomal map of the result\n#'\n#' @param data list containing the output of combined.analysis (i.e. 
4C data and significant fragments)\n#' @param num.exp number of experiments\n#' @param y.min bottom limit of the plot\n#' @param y.max top limit of the plot\n#'\n#' @return Nothing, a plot is drawn\n#' @export\n#'\n#' @examples\nplot_C <- function(data, num.exp = 0, y.min=0, y.max=3000, ...){\n  if(num.exp == 0){\n    num.exp = data$num.exp\n  }\n  pos <- data$dbR[,1]\n  if(num.exp == 1){\n    y.ave <- data$dbR[,2]\n  }else{\n    y.ave <- apply(data$dbR[,2:(num.exp+1)], 1, median)\n  }\n  plot(pos, y.ave, type='h', col=ifelse(pos%in%data$peak, \"red\", \"grey\"), axes=F, xlab=\"chromosomal position\", ylab=\"4C signal\", ylim=c(y.min,y.max), ... )\n  axis(2, at=c(0,y.max), las=2)\n  at <- seq(200e3*floor(min(pos)/200e3), ceiling(max(pos)/200e3)*200e3, by=200e3)\n  axis(1, at=seq(0,1e9,by=200e3), lab=sprintf(\"%.1f\", seq(0,1e9,by=200e3)/1e6), cex.axis=1.5)\n}\n\n\n#' Combine list of experiments into a matrix with the background model\n#'\n#' @param data list containing the 4C/CapC data in two column format\n#' @param num.exp number of experiments, default is 0, which means the the number in the data list is taken, other values allow for choosing a subset of the experiments\n#' @param vp.pos position of the viewpoint\n#'\n#' @return merged matrix containing: 1. 
the position of the fragments, 2:(n+1) the data, (n+1):(n+n+1) background models corresponding to the respective datasets\n#' @export\n#'\n#' @examples\ncombine.experiments <- function( data, num.exp = 0, vp.pos ){\n  if(num.exp == 0){\n    num.exp = data$num.exp\n  }\n  #create two element vector containing the viewpoint position\n  #if only one viewpoint is given\n  if(length(vp.pos) == 1){\n    vp.pos <- c(vp.pos,vp.pos)\n  }\n  vp.pos <- sort(vp.pos)\n  data.m <- data[[1]]\n  for( i in 2:num.exp ){\n    data.m <- merge(data.m, data[[i]], by=1)\n  }\n  #create the background model for the upstream regions\n  data.bg <- data.m\n  for( i in 1:num.exp ){\n    data.bg[data.m[,1] < vp.pos[1],i+1] <- get.background(data.m[data.m[,1] < vp.pos[1],c(1,i+1)], vp.pos[1] )\n  }\n  #and for the downstream regions\n  for( i in 1:num.exp ){\n    data.bg[data.m[,1] > vp.pos[2],i+1] <- get.background(data.m[data.m[,1] > vp.pos[2],c(1,i+1)], vp.pos[2] )\n  }\n  #if two viewpoint fragments are given set the intervening fragments\n  #to zero\n  #set background to 1 to prevent NaN in the ratio\n  if(vp.pos[1] != vp.pos[2]){\n    for( i in 1:num.exp){\n      data.m[data.m[,1] >= vp.pos[1] & data.m[,1] <= vp.pos[2],i+1] <- 0\n      data.bg[data.m[,1] >= vp.pos[1] & data.m[,1] <= vp.pos[2],i] <- 1\n    }\n  }\n  cbind(data.m, data.bg[,-1])\n}\n\n#perform pava regression and return the background regression line\nget.background <- function( data, vp.pos, weight.factor=0, fractile=F){\n  require(isotone)\n  switched = FALSE\n  weights <- (1:nrow(data))**weight.factor\n  if(data[1,1] > vp.pos){\n    data[,1] <- -data[,1] #reverse the sign to make the trend increasing\n    switched = TRUE\n    weights <- rev(weights)\n  }\n  #create the isotonic regression\n  if(fractile){\n    lm <- gpava(data[,1], data[,2], solver=weighted.fractile, weights=NULL, p=0.75)\n  }else{\n    lm <- gpava(data[,1], data[,2], solver=weighted.mean)\n  }\n\n  if(switched)\n    pred.data <- data.frame( -lm$z, lm$x 
)\n  else\n    pred.data <- data.frame( lm$z, lm$x )\n\n  pred.data[order(pred.data[,1]),2]\n}\n\nget.single.background <- function(data, num.exp = 1, vp.pos) {\n\n  if (length(vp.pos) == 1) {\n    vp.pos <- c(vp.pos, vp.pos)\n  }\n  vp.pos <- sort(vp.pos)\n\n  data.bg <- data\n  data.bg[data[, 1] < vp.pos[1], 2] <- get.background(data[data[, 1] < vp.pos[1], c(1, 2)], vp.pos[1])\n  data.bg[data[, 1] > vp.pos[2], 2] <- get.background(data[data[, 1] > vp.pos[2], c(1, 2)], vp.pos[2])\n\n  return(cbind(data, data.bg[, -1]))\n\n}\n\n\n\n\n##################################################################\n#General functions\n##################################################################\n\n#running mean function\n#' Title\n#'\n#' @param x numeric vector\n#' @param n window size for the running window\n#'\n#' @return a numeric vector with the windowed means\n#' @export\n#'\n#' @examples\nrunning<-function(x,n=20){\n  cumsum(x)->sum.v\n  sum.v<-c(0,sum.v)\n  #(sum.v[(n+1):length(x)]-sum.v[1:(length(x)-n)])/n\n  diff(sum.v,n)/n\n}\n\n#running sum\nrunsum<-function(x,n=20){\n  cumsum(x)->sum.v\n  sum.v<-c(0,sum.v)\n  diff(sum.v,n)\n}\n\n#make running function compatible vector\n#' Remove leading and trailing values\n#'\n#' @param a vector\n#' @param n window size of the corresponding running function\n#' @description remove n/2 elements from the front and the end\n#' @return vector, shortened by (n - 1) elements\n#' @export\n#'\n#' @examples\nrem <- function(a, n ){\n  half.window <- floor(n/2)\n  head(tail(a, -half.window),-half.window)\n}\n\n#quick way of generating a vector with the required indexes\nmulti.seq <- function( start, end ){\n  x <- rep(start, end-start+1)->x\n  df <- diff(x)\n  df <- df + 1\n  low <- which(df > 1)\n  df[low] <- -diff(c(0,low))+1\n  add <- c(0,cumsum(df))\n  x + add\n}\n\n#wrapper function to calculate the quantile distribution of all\n#non-zero values\nnon.zero.quantile <- function( x, probs ){\n  quantile(x[x > 0], probs)\n}\n\n\n",
 3 |     "created" : 1489585485107.000,
 4 |     "dirty" : false,
 5 |     "encoding" : "UTF-8",
 6 |     "folds" : "",
 7 |     "hash" : "3233609055",
 8 |     "id" : "493EA3DC",
 9 |     "lastKnownWriteTime" : 1508158242,
10 |     "path" : "~/development/R/packages/peakC/R/util_functions.R",
11 |     "project_path" : "R/util_functions.R",
12 |     "properties" : {
13 |     },
14 |     "relative_order" : 2,
15 |     "source_on_save" : false,
16 |     "type" : "r_source"
17 | }


--------------------------------------------------------------------------------
/.Rproj.user/CD02ED0/sdb/s-2ED88DA7/lock_file:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/.Rproj.user/CD02ED0/sdb/s-2ED88DA7/lock_file


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: peakC
 2 | Type: Package
 3 | Title: Perform peak calling for 4C/Capture-C experiments
 4 | Version: 0.2
 5 | Date: 2017-03-01
 6 | Author: Elzo de Wit, Geert Geeven
 7 | Maintainer: Elzo de Wit <e.d.wit@nki.nl>
 8 | Description: 4C/Capture-C data show a non-uniform distribution. By employing monotonic regression, an empirical background model is estimated and increases above this background model are called as peaks (interactions).
 9 | License: GPL
10 | LazyData: TRUE
11 | RoxygenNote: 6.0.1
12 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(combine.experiments)
 4 | export(combined.analysis)
 5 | export(data.frame.to.peakC)
 6 | export(pairwise.jaccard)
 7 | export(plot_C)
 8 | export(readMatrix)
 9 | export(readMultiColumnFile)
10 | export(readMultiple)
11 | export(readMultipleWig)
12 | export(readqWig)
13 | export(rem)
14 | export(running)
15 | export(single.analysis)
16 | 


--------------------------------------------------------------------------------
/R/reading_functions.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Read in a list of wig files
  4 | #'
  5 | #' @param files vector containg paths to wig files
  6 | #' @param vp.pos position of the viewpoint
  7 | #' @param window flanking sequence to be considered in the analysis
  8 | #'
  9 | #' @return A list containing two column matrices with the position and the normalized coverage score
 10 | #' @export
 11 | #'
 12 | #' @examples
 13 | readMultipleWig <- function( files, vp.pos, window = 700e3 ){
 14 |   data.list <- list()
 15 |   quality <- list()
 16 |   i <- 1
 17 |   for(f in files){
 18 |     d <- readqWig(f, vp.pos=vp.pos, window=window)
 19 |     if(i == 1){
 20 |       quality <- d$quality
 21 |     }else{
 22 |       quality$percentage.capture.100kb[i] <- d$quality$percentage.capture.100kb
 23 |       quality$percentage.capture.1Mb[i] <- d$quality$percentage.capture.1Mb
 24 |       quality$percentage.capture.cis[i] <- d$quality$percentage.capture.cis
 25 |       quality$total.read.cis[i] <- d$quality$total.read.cis
 26 |     }
 27 |     data.list[[i]] <- d$data
 28 |     i <- i+1
 29 |   }
 30 |   data.list$num.exp = length(data.list)
 31 |   data.list$quality = quality
 32 |   data.list
 33 | }
 34 | 
 35 | #' Read in a list of matrix files
 36 | #'
 37 | #' @param files vector containg paths to two column matrix files
 38 | #' @param vp.pos position of the viewpoint
 39 | #' @param window flanking sequence to be considered in the analysis
 40 | #'
 41 | #' @return A list containing two column matrices with the position and the normalized coverage score
 42 | #' @export
 43 | #'
 44 | #' @examples
 45 | readMultiple <- function( files, vp.pos, window = 700e3, normalize=T ){
 46 |   data.list <- list()
 47 |   quality <- list()
 48 |   i <- 1
 49 |   for(f in files){
 50 |     d <- readMatrix(f, vp.pos=vp.pos, window=window, normalize=normalize)
 51 |     data.list[[i]] <- d$data
 52 |     if(i == 1){
 53 |         quality <- d$quality
 54 |     }else{
 55 |       quality$percentage.capture.100kb[i] <- d$quality$percentage.capture.100kb
 56 |       quality$percentage.capture.1Mb[i] <- d$quality$percentage.capture.1Mb
 57 |       quality$percentage.capture.cis[i] <- d$quality$percentage.capture.cis
 58 |       quality$total.read.cis[i] <- d$quality$total.read.cis
 59 |     }
 60 |     i <- i+1
 61 |   }
 62 |   data.list$num.exp = length(data.list)
 63 |   data.list$quality = quality
 64 |   data.list
 65 | }
 66 | 
 67 | #' Read in a multiple experiment file
 68 | #'
 69 | #' @param file path to the experiment file
 70 | #' @param vp.pos position of the viewpoint
 71 | #' @param window flanking sequence to be considered in the analysis
 72 | #' @param num.exp Number of experiments
 73 | #'
 74 | #' @return A list containing two column matrices with the position and the normalized coverage score
 75 | #' @export
 76 | #'
 77 | #' @examples
 78 | readMultiColumnFile <- function(file, vp.pos, window=700e3, num.exp=4){
 79 |   d <- read.delim(file, h=F, stringsAsFactors=F)
 80 |   d <- d[,1:(num.exp+1)] #in case there are weird empty columns
 81 | 
 82 |   quality <- list()
 83 | 
 84 |   #normalize to 1M reads
 85 |   for( i in 1:num.exp){
 86 |     #calculate and store the quality characteristics
 87 |     q.temp <- quality.metrics(d[,c(1,i+1)], vp.pos)
 88 |     quality$percentage.capture.100kb[i] <- q.temp$percentage.capture.100kb
 89 |     quality$percentage.capture.1Mb[i] <- q.temp$percentage.capture.1Mb
 90 |     quality$percentage.capture.cis[i] <- q.temp$percentage.capture.cis
 91 |     quality$total.read.cis[i] <- q.temp$total.read.cis
 92 |     num.reads <- sum(d[,i+1], na.rm=T)
 93 |     d[,i+1] <- 1e6*d[,i+1]/num.reads
 94 |   }
 95 |   d <- d[d[,1] > vp.pos-window & d[,1] < vp.pos+window,]
 96 |   #make a list out of the matrix to make it compatible with the peak caller
 97 |   data.list <- list()
 98 |   for( i in 2:(num.exp+1)){
 99 |     d.sub <- d[,c(1,i)]
100 |     colnames(d.sub) <- c("frag_pos", "frag_score")
101 |     data.list[[i-1]] <- d.sub
102 |   }
103 |   data.list$num.exp = length(data.list)
104 |   data.list$quality = quality
105 |   data.list
106 | }
107 | 
#' Transform a data frame to a list that can be used as input for peakC
#'
#' @description The first column of \code{df} holds the fragment position, the
#'   next \code{num.exp} columns hold the scores of the individual experiments;
#'   any further columns are ignored. Every experiment is scaled to a total of
#'   1 million reads before the fragments within \code{window} of the viewpoint
#'   are selected.
#'
#' @param df data.frame containing the experimental data
#' @param vp.pos position of the viewpoint
#' @param window flanking sequence to be considered in the analysis
#' @param num.exp Number of experiments
#'
#' @return A list with one two column table (\code{frag_pos},
#'   \code{frag_score}) per experiment, followed by the elements \code{num.exp}
#'   and \code{quality} (the quality metrics, one value per experiment)
#' @export
data.frame.to.peakC <- function(df, vp.pos, window, num.exp) {
  if (num.exp < 1) {
    stop("num.exp should be at least 1")
  }
  if (ncol(df) < num.exp + 1) {
    stop("df contains ", ncol(df), " columns, but ", num.exp + 1,
         " are required for ", num.exp, " experiments")
  }
  df <- df[, seq_len(num.exp + 1), drop = FALSE] #in case there are weird empty columns

  quality <- list()

  #normalize to 1M reads
  for (i in seq_len(num.exp)) {
    score.col <- i + 1
    #calculate and store the quality characteristics on the raw scores
    q.temp <- quality.metrics(df[, c(1, score.col)], vp.pos)
    quality$percentage.capture.100kb[i] <- q.temp$percentage.capture.100kb
    quality$percentage.capture.1Mb[i] <- q.temp$percentage.capture.1Mb
    quality$percentage.capture.cis[i] <- q.temp$percentage.capture.cis
    quality$total.read.cis[i] <- q.temp$total.read.cis
    num.reads <- sum(df[, score.col], na.rm = TRUE)
    if (num.reads == 0) {
      #dividing by zero below turns the whole column into NaN
      warning("Experiment ", i, " does not contain any data")
    }
    df[, score.col] <- 1e6 * df[, score.col] / num.reads
  }
  #drop = FALSE keeps matrix input two-dimensional when a single row is left
  in.window <- df[, 1] > vp.pos - window & df[, 1] < vp.pos + window
  df <- df[in.window, , drop = FALSE]
  #make a list out of the matrix to make it compatible with the peak caller
  data.list <- list()
  for (i in seq_len(num.exp)) {
    d.sub <- df[, c(1, i + 1), drop = FALSE]
    colnames(d.sub) <- c("frag_pos", "frag_score")
    data.list[[i]] <- d.sub
  }
  data.list$num.exp <- length(data.list)
  data.list$quality <- quality
  data.list
}
147 | 
148 | 
149 | 
#qwigly read a wig file (with only one chromosome)
#' Quickly read and normalize a wig formatted file
#'
#' @param file path to a wiggle file
#' @param window genomic window around the viewpoint to read in. When
#'   \code{window} is not positive the region is chosen from the data instead:
#'   it spans the running windows of 2001 fragments in which more than 20
#'   percent of the fragments have a non-zero score, and falls back to 100 kb
#'   on either side of the viewpoint when no such window exists.
#' @param vp.pos postion of the viewpoint in the genome
#'
#' @return a list with two elements: \code{data}, a matrix with the columns
#'   \code{frag_pos} and \code{frag_score}, and \code{quality}, the result of
#'   \code{quality.metrics} for the experiment
#' @export
#' @description Wrapper function for reading files that are formatted as wig
#'   files. The first two lines of the file are skipped and the remaining
#'   values are read as position/score pairs. The fragment with the highest
#'   score and the viewpoint fragment are removed. The data is also normalized
#'   to 1 million sequencing reads.
#' @examples
#' data <- readqWig(file="alpha.wig", window = 700e3, vp.pos = 32224333 )
readqWig <- function(file, window, vp.pos) {
  wig <- scan(file, skip = 2, quiet = TRUE)
  d <- matrix(wig, ncol = 2, byrow = TRUE)
  #remove the fragment with the highest score; an empty which.max() result
  #would otherwise select zero rows instead of removing none
  top <- which.max(d[, 2])
  if (length(top) > 0) {
    d <- d[-top, , drop = FALSE]
  }
  #drop = FALSE keeps d a matrix when only a single fragment is left
  d <- d[d[, 1] != vp.pos, , drop = FALSE]

  #calculate the quality metrics for the experiment
  quality <- quality.metrics(d, vp.pos)

  #check if the file contains any data
  total <- sum(d[, 2])
  if (total > 0) {
    d[, 2] <- 1e6 * d[, 2] / total
  } else {
    stop("Data file does not contain any data")
  }

  if (window > 0) {
    d <- d[d[, 1] > vp.pos - window & d[, 1] < vp.pos + window, , drop = FALSE]
  } else {
    #select a genomic region around the viewpoint with a given amount of coverage
    covered <- which(running(d[, 2] > 0, 2001) > 0.2)
    if (length(covered) == 0) {
      #no sufficiently covered window, fall back to a fixed window
      window <- 100e3
      d <- d[d[, 1] > vp.pos - window & d[, 1] < vp.pos + window, , drop = FALSE]
    } else {
      #shift from the start of the running window to its centre
      i <- range(covered) + 1000
      d <- d[i[1]:i[2], , drop = FALSE]
    }
  }
  colnames(d) <- c("frag_pos", "frag_score")
  list(data = d, quality = quality)
}
196 | 
#quickly read a 4C file (with only one chromosome)
#' Quickly read and normalize a matrix formatted file
#'
#' @param file path to a two column file
#' @param window genomic window around the viewpoint to read in; when this is
#'   not larger than 0 the region is selected based on the coverage instead
#' @param vp.pos position of the viewpoint in the genome
#' @param normalize should the scores be normalized to 1 million reads
#'
#' @return a list with two elements: \code{data}, a matrix with two columns
#'   (frag_pos and frag_score), and \code{quality}, the quality metrics of the
#'   experiment
#' @export
#' @description Wrapper function for reading files that are formatted as two column (position, score) matrices. Unless \code{normalize} is FALSE the data is also normalized to 1 million sequencing reads.
#' @examples
#'
readMatrix <- function( file, window, vp.pos, normalize = T ){
  #the file is read as alternating position/score values
  vec <- scan(file, quiet = TRUE)
  d <- matrix(vec, ncol=2, byrow=TRUE)
  #remove the single highest scoring fragment and any fragment located
  #exactly at the viewpoint; drop=FALSE keeps d a matrix even if only
  #one row remains
  d <- d[-which.max(d[,2]), , drop=FALSE]
  d <- d[d[,1]!=vp.pos, , drop=FALSE]

  #calculate the quality metrics for the experiment
  quality <- quality.metrics(d, vp.pos)

  if(normalize){
    if(sum(d[,2]) > 0){
      d[,2] <- 1e6*d[,2]/sum(d[,2])
    }else{
      stop("Data file does not contain any data")
    }
  }
  if(window > 0){
    d <- d[d[,1] > vp.pos - window & d[,1] < vp.pos + window, , drop=FALSE]
  }else{
    #select a genomic region around the viewpoint with a given amount of coverage:
    #windows of 2001 fragments in which more than 20% of the fragments are covered
    covered <- which( running(d[,2]>0,2001) > 0.2)
    if(length(covered) == 0){
      #no sufficiently covered region, fall back to a fixed window
      window <- 100e3
      d <- d[d[,1] > vp.pos - window & d[,1] < vp.pos + window, , drop=FALSE]
    }else{
      #shift by half a window so the indexes refer to the window centres
      i <- range(covered)+1000
      d <- d[i[1]:i[2], , drop=FALSE]
    }
  }
  #add names to the columns
  colnames(d) <- c("frag_pos", "frag_score")
  list(data=d, quality=quality)
}
244 | 
245 | ###############################################
246 | #Quality metrics function
247 | ###############################################
248 | 
#internal helper that summarises the quality of a single experiment
#data: two column object, position in column 1 and score in column 2
#vp.pos: position of the viewpoint
#returns a list with:
# percentage.capture.100kb: % of fragments with a score > 0 within 100kb of the viewpoint
# percentage.capture.1Mb:   % of fragments with a score > 0 within 1Mb of the viewpoint
# percentage.capture.cis:   % of fragments with a score > 0 over all fragments
# total.read.cis:           sum of all scores
quality.metrics <- function(data, vp.pos){
  #percentage of covered fragments strictly within `flank` of the viewpoint
  captured.within <- function(flank){
    near.vp <- data[,1] > vp.pos-flank & data[,1] < vp.pos+flank
    100*mean( data[near.vp,2] > 0)
  }
  list(
    percentage.capture.100kb = captured.within(100e3),
    percentage.capture.1Mb   = captured.within(1e6),
    percentage.capture.cis   = 100*mean(data[,2] > 0),
    total.read.cis           = sum(data[,2])
  )
}
264 | 


--------------------------------------------------------------------------------
/R/util_functions.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ################################################
  3 | #4C specific functions
  4 | ################################################
  5 | 
  6 | 
  7 | significant.fragments <- function( p.value, pos, window = 21, FDR = 0.01 ){
  8 |   #correct the nominal p-value for multiple hypothesis testing
  9 |   p.combined <- p.adjust(p.value, method="fdr")
 10 |   #determine the significant windows and select the fragments therein
 11 |   sig.i <- which(p.combined < FDR)
 12 |   if(length(sig.i)>0) {
 13 | 		sig.i.start <- sig.i-floor(window/2); sig.i.end <- sig.i+floor(window/2)
 14 | 		sig.i <- unique(multi.seq(sig.i.start,sig.i.end))
 15 | 		sig.i <- sig.i[sig.i >= 1 & sig.i <= length(pos)]
 16 | 		sigFrags <- pos[sig.i]
 17 | 		return(sigFrags)
 18 |   } else {
 19 |     return(NULL)
 20 |   }
 21 | }
 22 | 
 23 | righttailgamma = function(r,k,n) 1 - pgamma(-log(r/(n+1)^k),k,scale=1)
 24 | 
 25 | rank.product.p <- function( data, num.exp,method="diff"){
 26 |   if(method=="diff") {
 27 |   stats <- data[,2:(num.exp+1)]-data[,(2:(num.exp+1))+num.exp]
 28 |   } else {
 29 |   stats <- data[,2:(num.exp+1)]/data[,(2:(num.exp+1))+num.exp]
 30 |   }
 31 |   rp <- nrow(data)-apply(stats,2,rank)+1
 32 |   rp <- apply(rp,1,prod)
 33 |   p <- righttailgamma(rp,num.exp,length(rp))
 34 | }
 35 | 
 36 | 
 37 | getWindowedFrags <- function(x,frags,wSize=21) {
 38 | 
 39 | 	outFrags <- ((match(x,frags)-floor(wSize/2))):((match(x,frags)+floor(wSize/2)))
 40 | 	outFrags <- outFrags[outFrags>=1&outFrags<=length(frags)]
 41 | 
 42 | 	return(frags[outFrags])
 43 | 
 44 | }
 45 | 
 46 | #set a dynamic threshold for the residuals
 47 | getThreshold <- function(resids,qW=5) {
 48 | 	q75 <- quantile(resids,probs=0.75) #75% quantile of the residuals
 49 | 	qd50 <- diff(quantile(resids,probs=c(0.25,0.75))) #the range between the 25% and 75% quantiles
 50 | 	threshold <- q75 + qW*qd50
 51 | 	return(threshold)
 52 | }
 53 | 
 54 | 
 55 | #these are not "significant" frags just above an arbitrary threshold
 56 | thresholdFrags <- function(resids,frags,wSize=21,qW=5) {
 57 | 
 58 | 	qMax <- getThreshold(resids=resids,qW=qW)
 59 | 
 60 | 	sel.i <- which(resids > qMax)
 61 |   if(length(sel.i)>0) {
 62 | 		sel.i.start <- sel.i-floor(wSize/2); sel.i.end <- sel.i+floor(wSize/2)
 63 | 		sel.i <- unique(multi.seq(sel.i.start,sel.i.end))
 64 | 		sel.i <- sel.i[sel.i >= 1 & sel.i <= length(frags)]
 65 | 		selFrags <- frags[sel.i]
 66 | 		return(selFrags)
 67 | 	}else{
 68 | 		return(NULL)
 69 | 	}
 70 | 
 71 | 
 72 | }
 73 | 
 74 | #' Single experiment 4C/Capture C analysis
 75 | #'
 76 | #' @param data list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments
 77 | #' @param vp.pos viewpoint position, this can be a single value or a two values to analyse a viewpoint region
 78 | #' @param wSize number of fragments in a window
 79 | #' @param alphaFDR false-discovery rate threshold
 80 | #' @param qWd threshold for difference from the background
 81 | #' @param qWr threshold for ratio over the background
 82 | #' @param minDist minimal region around the viewpoint to exclude for the significance analysis
 83 | #'
 84 | #' @description Function for identifying interaction peaks above a background distribution. A list containing a 4C/Capture-C dataset is required as input. The viewpoint position is given in the vp.pos argument.
 85 | #'
 86 | #' @return a list containing a matrix with the data and the background model and a vector with the significant fragments
 87 | #' @export
 88 | #'
 89 | #' @examples
 90 | #' data <- readMultiple(f[1:3], vp.pos = 65923803)
 91 | #' res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)
 92 | #'
 93 | #'
 94 | #'
 95 | single.analysis <- function(data, vp.pos, wSize = 21, qWd = 1.5, qWr = 1, minDist = 15e3) {
 96 | 
 97 |   #create two element vector containing the viewpoint position
 98 |   #if only one viewpoint is given
 99 |   if(length(vp.pos) == 1){
100 |     vp.pos <- c(vp.pos,vp.pos)
101 |   }
102 |   vp.pos <- sort(vp.pos)
103 | 
104 | 
105 |   db <- get.single.background(data=data, num.exp = 1, vp.pos=vp.pos)
106 |   #running mean over the data
107 |   db[,2] <- caTools::runmean(x=db[,2],k=wSize,endrule="mean")
108 |   #running mean over the isotonic regression line
109 |   db[,3] <- caTools::runmean(x=db[,3],k=5,endrule="mean")
110 | 
111 |   #add a pseudocount to improve the calculations
112 |   pseudoCount <- non.zero.quantile(x=db[,2],probs=0.05)
113 |   ratios <- cbind(db[, 1], (db[, 2] + pseudoCount)/(db[,3] + pseudoCount))
114 |   deltas <- cbind(db[, 1], db[,2]-db[,3])
115 | 
116 |   #frags <- db[, 1]
117 |   #distFrags <- frags[abs(frags-vp.pos)>=minDist]
118 | 
119 |   #select the fragments that are more than minDist from the viewpoint
120 |   sel.frag <- db[which( (db[,1] < vp.pos[1] & vp.pos[1]-db[,1] > minDist) | (db[,1] > vp.pos[2] & db[,1]-vp.pos[2] > minDist) ),1]
121 | 
122 |   rTFrags <- thresholdFrags(resids=ratios[,2],frags=ratios[,1],wSize=wSize,qW=qWr)
123 |   dTFrags <- thresholdFrags(resids=deltas[,2],frags=deltas[,1],wSize=wSize,qW=qWd)
124 |   peakFrags <- intersect(intersect(rTFrags,dTFrags), sel.frag)
125 | 
126 |   return(list(dbR=db,ratios=ratios,deltas=deltas,peak=peakFrags, num.exp=1))
127 | 
128 | }
129 | 
#' Combined 4C/Capture C analysis
#'
#' @param data list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments
#' @param num.exp number of experiments, default is 0 which means the number in the data list is used, a different number overwrites the default number
#' @param vp.pos viewpoint position, this can be a single value or two values to analyse a viewpoint region
#' @param wSize number of fragments in a window
#' @param alphaFDR false-discovery rate threshold
#' @param qWr threshold for the ratio over the background
#' @param minDist minimal region around the viewpoint to exclude for the significance analysis
#'
#' @description Function for identifying interaction peaks above a background distribution. A list of 4C/Capture-C datasets is required as input. The viewpoint position is given in the vp.pos argument.
#'
#' @return a list containing the smoothed data with the background models (dbR), the significant fragments (peak), the number of experiments (num.exp), the per-fragment p-values (p.value), the mean ratio over the background (ratio) and the fragments further than minDist from the viewpoint (sel)
#' @export
#'
#' @examples
#' data <- readMultiple(f[1:3], vp.pos = 65923803)
#' res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)
#'
#'
#'
combined.analysis <- function( data, num.exp = 0, vp.pos, wSize = 21, alphaFDR = 0.1, qWr = 1, minDist = 15e3 ){
  #by default use the number of experiments stored in the data list
  if(num.exp == 0){
    num.exp <- data$num.exp
  }

  #a single viewpoint is handled as a viewpoint region of width zero
  if(length(vp.pos) == 1){
    vp.pos <- rep(vp.pos, 2)
  }
  vp.pos <- sort(vp.pos)

  db <- combine.experiments(data,num.exp, vp.pos)

  #column indexes of the data and of the matching background models
  data.cols <- 2:(num.exp+1)
  bg.cols <- data.cols+num.exp

  #all statistics are calculated on running means: this gives a stronger
  #(positive) dependency between the statistics but less variance
  dbR <- db
  #running mean over the data
  dbR[,data.cols] <- apply(db[,data.cols],2,caTools::runmean,k=wSize,endrule="mean")
  #running mean over the isotonic regression line (window is 5)
  dbR[,bg.cols] <- apply(db[,bg.cols],2,caTools::runmean,k=5,endrule="mean")

  #small pseudocount (average over the experiments) so that the ratio does
  #not divide by 0
  pseudoCount <- apply(db[,data.cols], 2, non.zero.quantile, probs=0.05)
  pseudoCount <- sum(pseudoCount)/num.exp
  #ratio of the smoothed data over the smoothed regression line
  ratio <- cbind(db[,1],(dbR[,data.cols]+pseudoCount)/(dbR[,bg.cols]+pseudoCount))
  #difference between the smoothed data and the smoothed regression line
  delta <- cbind(db[,1],dbR[,data.cols]-dbR[,bg.cols])

  #per-window p-value using rank products, ranked on the difference
  p.val <- rank.product.p(data = dbR, num.exp = num.exp,method="diff")
  #select the significant fragments
  sfr <- significant.fragments(p.value = p.val, pos = db[, 1], window = wSize, FDR = alphaFDR)

  #fragments that are more than minDist away from the viewpoint (region)
  upstream <- db[,1] < vp.pos[1] & vp.pos[1]-db[,1] > minDist
  downstream <- db[,1] > vp.pos[2] & db[,1]-vp.pos[2] > minDist
  sel.frag <- db[which( upstream | downstream ),1]
  idx <- delta[,1]%in%sel.frag

  #empirical threshold on the mean ratio, evaluated on the selected fragments only
  tfr <- thresholdFrags(resids=apply(ratio[idx,data.cols],1,mean),frags=ratio[idx,1],wSize=wSize,qW=qWr)

  #a peak has to be significant and above the ratio threshold
  sfr <- intersect(sfr,tfr)
  list(dbR=dbR, peak=sfr, num.exp = num.exp, p.value=p.val, ratio = apply(ratio[,data.cols],1,mean), sel=sel.frag )
}
201 | 
#' Take a result from the combined.analysis function and generate a chromosomal map of the result
#'
#' @param data list containing the output of combined.analysis (i.e. 4C data and significant fragments)
#' @param num.exp number of experiments, default is 0 which means the number stored in the data list is used
#' @param y.min bottom limit of the plot
#' @param y.max top limit of the plot
#' @param ... further arguments that are passed on to plot
#'
#' @return Nothing, a plot is drawn
#' @export
#'
#' @examples
#' res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)
#' plot_C(res, y.max = 1000)
plot_C <- function(data, num.exp = 0, y.min=0, y.max=3000, ...){
  if(num.exp == 0){
    num.exp <- data$num.exp
  }
  pos <- data$dbR[,1]
  #for multiple experiments the median over the experiments is plotted
  if(num.exp == 1){
    y.ave <- data$dbR[,2]
  }else{
    y.ave <- apply(data$dbR[,2:(num.exp+1)], 1, median)
  }
  #peak fragments are drawn in red, all other fragments in grey
  frag.col <- ifelse(pos%in%data$peak, "red", "grey")
  plot(pos, y.ave, type='h', col=frag.col, axes=FALSE, xlab="chromosomal position", ylab="4C signal", ylim=c(y.min,y.max), ... )
  axis(2, at=c(0,y.max), las=2)
  #ticks every 200kb, labelled in Mb; the full range is used so that the
  #ticks also cover a user supplied xlim
  ticks <- seq(0,1e9,by=200e3)
  axis(1, at=ticks, labels=sprintf("%.1f", ticks/1e6), cex.axis=1.5)
}
229 | 
#' Combine list of experiments into a matrix with the background model
#'
#' @param data list containing the 4C/CapC data in two column format
#' @param num.exp number of experiments, default is 0, which means the number in the data list is taken, other values allow for choosing a subset of the experiments
#' @param vp.pos position of the viewpoint, this can be a single value or two values to describe a viewpoint region
#'
#' @return merged matrix containing: 1. the position of the fragments, 2:(n+1) the data, (n+2):(2n+1) background models corresponding to the respective datasets
#' @export
#'
#' @examples
#' data <- readMultiple(f[1:3], vp.pos = 65923803)
#' db <- combine.experiments(data, num.exp=3, vp.pos = 65923803)
combine.experiments <- function( data, num.exp = 0, vp.pos ){
  if(num.exp == 0){
    num.exp <- data$num.exp
  }
  #create two element vector containing the viewpoint position
  #if only one viewpoint is given
  if(length(vp.pos) == 1){
    vp.pos <- c(vp.pos,vp.pos)
  }
  vp.pos <- sort(vp.pos)

  #merge the experiments on the fragment position (first column);
  #seq_len()[-1] is empty for a single experiment, 2:num.exp would not be
  data.m <- data[[1]]
  for( i in seq_len(num.exp)[-1] ){
    data.m <- merge(data.m, data[[i]], by=1)
  }

  #create the background model separately for the upstream and the
  #downstream regions; a side without fragments is skipped
  data.bg <- data.m
  upstream <- data.m[,1] < vp.pos[1]
  downstream <- data.m[,1] > vp.pos[2]
  for( i in seq_len(num.exp) ){
    if(any(upstream)){
      data.bg[upstream,i+1] <- get.background(data.m[upstream,c(1,i+1)], vp.pos[1] )
    }
    if(any(downstream)){
      data.bg[downstream,i+1] <- get.background(data.m[downstream,c(1,i+1)], vp.pos[2] )
    }
  }

  #if two viewpoint fragments are given set the intervening fragments
  #to zero
  #set background to 1 to prevent NaN in the ratio
  if(vp.pos[1] != vp.pos[2]){
    in.vp <- data.m[,1] >= vp.pos[1] & data.m[,1] <= vp.pos[2]
    #experiment i is stored in column i+1, column 1 holds the positions
    exp.cols <- seq_len(num.exp)+1
    data.m[in.vp,exp.cols] <- 0
    data.bg[in.vp,exp.cols] <- 1
  }
  cbind(data.m, data.bg[,-1])
}
274 | 
#perform pava regression and return the background regression line
#data: two column object with the position (column 1) and the score (column 2)
#      of the fragments on one side of the viewpoint
#vp.pos: position of the viewpoint
#weight.factor: currently not used in the regression
#fractile: if TRUE the 75% weighted fractile is used as solver, otherwise
#          the weighted mean
#returns the fitted background values ordered by genomic position
get.background <- function( data, vp.pos, weight.factor=0, fractile=F){
  if(!requireNamespace("isotone", quietly = TRUE)){
    stop("package 'isotone' is required to calculate the background model", call. = FALSE)
  }
  pos <- data[,1]
  #for fragments beyond the viewpoint the sign of the position is reversed
  #to make the trend increasing
  switched <- data[1,1] > vp.pos
  if(switched){
    pos <- -pos
  }
  #create the isotonic regression
  if(fractile){
    fit <- isotone::gpava(pos, data[,2], solver=isotone::weighted.fractile, weights=NULL, p=0.75)
  }else{
    fit <- isotone::gpava(pos, data[,2], solver=weighted.mean)
  }

  #undo the sign reversal before ordering on the genomic position
  if(switched){
    pred.data <- data.frame( -fit$z, fit$x )
  }else{
    pred.data <- data.frame( fit$z, fit$x )
  }

  pred.data[order(pred.data[,1]),2]
}
299 | 
#background model for a single experiment
#data: two column object with the position (column 1) and the score (column 2)
#num.exp: not used, a single experiment is assumed
#vp.pos: viewpoint position, a single value or two values for a viewpoint region
#returns data with the background model added as a third column; fragments
#inside the viewpoint region keep their original score as background
get.single.background <- function(data, num.exp = 1, vp.pos) {

  if (length(vp.pos) == 1) {
    vp.pos <- c(vp.pos, vp.pos)
  }
  vp.pos <- sort(vp.pos)

  upstream <- data[, 1] < vp.pos[1]
  downstream <- data[, 1] > vp.pos[2]

  #the regression is fitted separately on both sides of the viewpoint; a side
  #without fragments is skipped and drop = FALSE keeps a single fragment in
  #two column format
  data.bg <- data
  if (any(upstream)) {
    data.bg[upstream, 2] <- get.background(data[upstream, c(1, 2), drop = FALSE], vp.pos[1])
  }
  if (any(downstream)) {
    data.bg[downstream, 2] <- get.background(data[downstream, c(1, 2), drop = FALSE], vp.pos[2])
  }

  cbind(data, data.bg[, -1])

}
314 | 
#' Pairwise Jaccard similarity of single experiment peak calls
#'
#' @param data list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments
#' @param vp.pos viewpoint position, this can be a single value or two values to analyse a viewpoint region
#' @param wSize number of fragments in a window
#' @param qWd threshold for difference from the background
#' @param qWr threshold for ratio over the background
#' @param minDist minimal region around the viewpoint to exclude for the significance analysis
#' @description Peaks are called for every experiment separately with single.analysis, after which the Jaccard similarity of the peak fragments is calculated for every pair of experiments.
#'
#' @return a symmetric num.exp x num.exp matrix containing the pairwise Jaccard similarity scores, with 1 on the diagonal
#' @export
#'
#' @examples
#' data <- readMultiple(f[1:3], vp.pos = 65923803)
#' js <- pairwise.jaccard(data, vp.pos = 65923803)
#'
#'
#'
pairwise.jaccard <- function(data, vp.pos, wSize = 21, qWd = 1.5, qWr = 1, minDist = 15e3) {
  num.exp <- data$num.exp
  js.mat <- matrix(0, nrow=num.exp, ncol=num.exp)
  diag(js.mat) <- 1
  #first calculate the single experiment peak calling
  res.list <- lapply(seq_len(num.exp), function(i){
    single.analysis(data[[i]], vp.pos = vp.pos, wSize = wSize, qWd = qWd, qWr = qWr, minDist = minDist)
  })
  #then compare every pair of experiments; seq_len() makes sure that no
  #pair is visited when there are fewer than two experiments
  for(i in seq_len(max(num.exp-1, 0))){
    for(j in (i+1):num.exp){
      js <- jaccardSim(res.list[[i]]$peak, res.list[[j]]$peak)
      js.mat[i,j] <- js
      js.mat[j,i] <- js
    }
  }
  js.mat
}
352 | 
#calculate the jaccard index for two vectors
#both vectors are treated as sets, i.e. duplicated values are counted once
#the result is NaN when both vectors are empty
jaccardSim <- function( p1, p2 ){
  n.shared <- length(intersect(p1, p2))
  n.total <- length(unique(c(p1,p2)))
  n.shared/n.total
}
359 | 
360 | ##################################################################
361 | #General functions
362 | ##################################################################
363 | 
#running mean function
#' Running mean
#'
#' @param x numeric vector
#' @param n window size for the running window
#'
#' @return a numeric vector with the windowed means, it contains one value for every complete window of n elements
#' @export
#'
#' @examples
#' running(1:10, n = 3)
running<-function(x,n=20){
  #the difference of the cumulative sum at lag n is the sum of a window
  cumulative <- c(0, cumsum(x))
  diff(cumulative, n)/n
}
380 | 
#running sum over windows of n elements, one value for every complete window
runsum<-function(x,n=20){
  #the difference of the cumulative sum at lag n is the sum of a window
  cumulative <- c(0, cumsum(x))
  diff(cumulative, n)
}
387 | 
#make running function compatible vector
#' Remove leading and trailing values
#'
#' @param a vector
#' @param n window size of the corresponding running function
#' @description remove floor(n/2) elements from the front and from the end
#' @return vector, shortened by 2*floor(n/2) elements
#' @export
#'
#' @examples
#' rem(1:10, n = 3)
rem <- function(a, n ){
  half.window <- floor(n/2)
  #nothing to remove; head/tail with -0 would return an empty vector
  if(half.window < 1){
    return(a)
  }
  head(tail(a, -half.window),-half.window)
}
402 | 
#quick way of generating a vector with the required indexes
#start, end: vectors of the same length with the first and the last value of
#            every run
#returns the concatenation of start[1]:end[1], start[2]:end[2], ...
#the runs are generated independently, so the start values do not have to be
#increasing
multi.seq <- function( start, end ){
  run.length <- end-start+1
  #repeat every start for the length of its run and add the offset within the run
  rep(start, run.length) + (sequence(run.length)-1)
}
413 | 
#quantile(s) of the values in x that are larger than zero
#x: numeric vector
#probs: probabilities that are passed on to quantile
non.zero.quantile <- function( x, probs ){
  positive <- x > 0
  quantile(x[positive], probs)
}
419 | 
420 | 
421 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # peakC
 2 | 
 3 | Methods for studying the spatial organization of the genome, such as 4C or Capture-C, have become more common. Here we provide an R package that enables non-parametric peak calling for one-vs-all 'C' methods called peakC.
 4 | 
 5 | ## Getting Started
 6 | 
 7 | These instructions will allow you to install peakC on your local machine.
 8 | 
 9 | ### Prerequisites
10 | 
You need the following software and packages to install and run peakC:
12 | 
13 | ```
14 | R (any recent version)
15 | isotone package (from CRAN)
16 | devtools (for installation from github, not required for running the package)
17 | ```
18 | 
19 | ### Installing
20 | 
21 | By far the easiest is to install peakC directly from GitHub. The following command will enable installation using devtools.
22 | 
23 | ```
24 | library(devtools)
25 | install_github("deWitLab/peakC")
26 | ```
27 | 


--------------------------------------------------------------------------------
/example_peak_calling.R:
--------------------------------------------------------------------------------
1 | 
#example: single experiment peak calling and plotting for three viewpoints
#NOTE(review): `vp` is not defined in this script - presumably a vector with
#the viewpoint positions that has to exist before the loop is run; confirm
for(i in c(3,1,2)){
  #all files below the data directory whose name matches "pos<i>"
  f <- list.files(
    path = "/data/4C/high_res_methods/new_analysis/reciprocal/peakC/",
    pattern = paste0("pos",i),
    recursive = TRUE,
    full.names = TRUE
  )
  #read the first file only, within 1.5Mb of the viewpoint
  p <- readMultiple(f[1], vp.pos = vp[i], window = 1.5e6)
  r <- single.analysis(p, vp.pos = vp[i], qWd = 2.5)
  plot_C(r, xlim = c(64e6,67e6))
}
8 | 


--------------------------------------------------------------------------------
/man/combine.experiments.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{combine.experiments}
 4 | \alias{combine.experiments}
 5 | \title{Combine list of experiments into a matrix with the background model}
 6 | \usage{
 7 | combine.experiments(data, num.exp = 0, vp.pos)
 8 | }
 9 | \arguments{
10 | \item{data}{list containing the 4C/CapC data in two column format}
11 | 
\item{num.exp}{number of experiments, default is 0, which means the number in the data list is taken, other values allow for choosing a subset of the experiments}
13 | 
14 | \item{vp.pos}{position of the viewpoint}
15 | }
16 | \value{
merged matrix containing: 1. the position of the fragments, 2:(n+1) the data, (n+2):(2n+1) background models corresponding to the respective datasets
18 | }
19 | \description{
20 | Combine list of experiments into a matrix with the background model
21 | }
22 | 


--------------------------------------------------------------------------------
/man/combined.analysis.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{combined.analysis}
 4 | \alias{combined.analysis}
 5 | \title{Combined 4C/Capture C analysis}
 6 | \usage{
 7 | combined.analysis(data, num.exp = 0, vp.pos, wSize = 21, alphaFDR = 0.1,
 8 |   qWr = 1, minDist = 15000)
 9 | }
10 | \arguments{
11 | \item{data}{list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments}
12 | 
\item{num.exp}{number of experiments, used in conjunction with "data" and "multi" in type, default is 0 which means the number in the data list is used, a different number overwrites the default number}
14 | 
15 | \item{vp.pos}{viewpoint position, this can be a single value or a two values to analyse a viewpoint region}
16 | 
17 | \item{wSize}{number of fragments in a window}
18 | 
19 | \item{alphaFDR}{false-discovery rate threshold}
20 | 
21 | \item{minDist}{minimal region around the viewpoint to exclude for the significance analysis}
22 | 
23 | \item{qW}{threshold for absolute difference}
24 | }
25 | \value{
26 | a list containing a matrix with the data and the background model and a vector with the significant fragments
27 | }
28 | \description{
29 | Function for identifying interaction peaks above a background distribution. A list of 4C/Capture-C datasets are required as input. The viewpoint position is given in the vp.pos argument.
30 | }
31 | \examples{
32 | data <- readMultiple(f[1:3], vp.pos = 65923803)
33 | res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)
34 | 
35 | 
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/man/data.frame.to.peakC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reading_functions.R
 3 | \name{data.frame.to.peakC}
 4 | \alias{data.frame.to.peakC}
 5 | \title{Transform a data frame to a list that can be used as input for peakC}
 6 | \usage{
 7 | data.frame.to.peakC(df, vp.pos, window, num.exp)
 8 | }
 9 | \arguments{
10 | \item{df}{data.frame containing the experimental data}
11 | 
12 | \item{vp.pos}{position of the viewpoint}
13 | 
14 | \item{window}{flanking sequence to be considered in the analysis}
15 | 
16 | \item{num.exp}{Number of experiments}
17 | }
18 | \value{
19 | A list containing two column matrices with the position and the normalized coverage score
20 | }
21 | \description{
22 | Transform a data frame to a list that can be used as input for peakC
23 | }
24 | 


--------------------------------------------------------------------------------
/man/hello.Rd:
--------------------------------------------------------------------------------
 1 | \name{hello}
 2 | \alias{hello}
 3 | \title{Hello, World!}
 4 | \usage{
 5 | hello()
 6 | }
 7 | \description{
 8 | Prints 'Hello, world!'.
 9 | }
10 | \examples{
11 | hello()
12 | }
13 | 


--------------------------------------------------------------------------------
/man/pairwise.jaccard.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{pairwise.jaccard}
 4 | \alias{pairwise.jaccard}
 5 | \title{Single experiment Jaccard similarity calculation}
 6 | \usage{
 7 | pairwise.jaccard(data, vp.pos, wSize = 21, qWd = 1.5, qWr = 1,
 8 |   minDist = 15000)
 9 | }
10 | \arguments{
11 | \item{data}{list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments}
12 | 
13 | \item{vp.pos}{viewpoint position, this can be a single value or a two values to analyse a viewpoint region}
14 | 
15 | \item{wSize}{number of fragments in a window}
16 | 
17 | \item{qWd}{threshold for difference from the background}
18 | 
19 | \item{qWr}{threshold for ratio over the background}
20 | 
21 | \item{minDist}{minimal region around the viewpoint to exclude for the significance analysis}
22 | 
23 | \item{alphaFDR}{false-discovery rate threshold}
24 | }
25 | \value{
26 | a matrix containing the pairwise Jaccard similarity scores
27 | }
28 | \description{
29 | Function for identifying interaction peaks above a background distribution. A list of 4C/Capture-C datasets are required as input. The viewpoint position is given in the vp.pos argument.
30 | }
31 | \examples{
32 | data <- readMultiple(f[1:3], vp.pos = 65923803)
33 | res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)
34 | 
35 | 
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/man/plot_C.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{plot_C}
 4 | \alias{plot_C}
 5 | \title{Take a result from the combined.analysis function and generate a chromosomal map of the result}
 6 | \usage{
 7 | plot_C(data, num.exp = 0, y.min = 0, y.max = 3000, ...)
 8 | }
 9 | \arguments{
10 | \item{data}{list containing the output of combined.analysis (i.e. 4C data and significant fragments)}
11 | 
12 | \item{num.exp}{number of experiments}
13 | 
14 | \item{y.min}{bottom limit of the plot}
15 | 
16 | \item{y.max}{top limit of the plot}
17 | }
18 | \value{
19 | Nothing, a plot is drawn
20 | }
21 | \description{
22 | Take a result from the combined.analysis function and generate a chromosomal map of the result
23 | }
24 | 


--------------------------------------------------------------------------------
/man/readMatrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reading_functions.R
 3 | \name{readMatrix}
 4 | \alias{readMatrix}
 5 | \title{Quickly read and normalize a matrix formatted file}
 6 | \usage{
 7 | readMatrix(file, window, vp.pos, normalize = T)
 8 | }
 9 | \arguments{
10 | \item{file}{path to a two column file}
11 | 
12 | \item{window}{genomic window around the viewpoint to read in}
13 | 
\item{vp.pos}{position of the viewpoint in the genome}
15 | }
16 | \value{
17 | a matrix with two columns, position and score
18 | }
19 | \description{
Wrapper function for reading files that are formatted as two column (position, score) matrices. Unless normalize is FALSE, the data is also normalized to 1 million sequencing reads.
21 | }
22 | \examples{
23 | 
24 | }
25 | 


--------------------------------------------------------------------------------
/man/readMultiColumnFile.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reading_functions.R
 3 | \name{readMultiColumnFile}
 4 | \alias{readMultiColumnFile}
 5 | \title{Read in a multiple experiment file}
 6 | \usage{
 7 | readMultiColumnFile(file, vp.pos, window = 7e+05, num.exp = 4)
 8 | }
 9 | \arguments{
10 | \item{file}{path to the experiment file}
11 | 
12 | \item{vp.pos}{position of the viewpoint}
13 | 
14 | \item{window}{flanking sequence to be considered in the analysis}
15 | 
16 | \item{num.exp}{Number of experiments}
17 | }
18 | \value{
19 | A list containing two column matrices with the position and the normalized coverage score
20 | }
21 | \description{
22 | Read in a multiple experiment file
23 | }
24 | 


--------------------------------------------------------------------------------
/man/readMultiple.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reading_functions.R
 3 | \name{readMultiple}
 4 | \alias{readMultiple}
 5 | \title{Read in a list of matrix files}
 6 | \usage{
 7 | readMultiple(files, vp.pos, window = 7e+05, normalize = T)
 8 | }
 9 | \arguments{
10 | \item{files}{vector containing paths to two column matrix files}
11 | 
12 | \item{vp.pos}{position of the viewpoint}
13 | 
14 | \item{window}{flanking sequence to be considered in the analysis}
15 | }
16 | \value{
17 | A list containing two column matrices with the position and the normalized coverage score
18 | }
19 | \description{
20 | Read in a list of matrix files
21 | }
22 | 


--------------------------------------------------------------------------------
/man/readMultipleWig.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reading_functions.R
 3 | \name{readMultipleWig}
 4 | \alias{readMultipleWig}
 5 | \title{Read in a list of wig files}
 6 | \usage{
 7 | readMultipleWig(files, vp.pos, window = 7e+05)
 8 | }
 9 | \arguments{
10 | \item{files}{vector containing paths to wig files}
11 | 
12 | \item{vp.pos}{position of the viewpoint}
13 | 
14 | \item{window}{flanking sequence to be considered in the analysis}
15 | }
16 | \value{
17 | A list containing two column matrices with the position and the normalized coverage score
18 | }
19 | \description{
20 | Read in a list of wig files
21 | }
22 | 


--------------------------------------------------------------------------------
/man/readqWig.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reading_functions.R
 3 | \name{readqWig}
 4 | \alias{readqWig}
 5 | \title{Quickly read and normalize a wig formatted file}
 6 | \usage{
 7 | readqWig(file, window, vp.pos)
 8 | }
 9 | \arguments{
10 | \item{file}{path to a wiggle file}
11 | 
12 | \item{window}{genomic window around the viewpoint to read in}
13 | 
14 | \item{vp.pos}{position of the viewpoint in the genome}
15 | }
16 | \value{
17 | a matrix with two columns, position and score
18 | }
19 | \description{
20 | Wrapper function for reading files that are formatted as wig files. The data is also normalized to 1 million sequencing reads.
21 | }
22 | \examples{
23 | data <- readqWig(file="alpha.wig", window = 700e3, vp.pos = 32224333 )
24 | }
25 | 


--------------------------------------------------------------------------------
/man/rem.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{rem}
 4 | \alias{rem}
 5 | \title{Remove leading and trailing values}
 6 | \usage{
 7 | rem(a, n)
 8 | }
 9 | \arguments{
10 | \item{a}{vector}
11 | 
12 | \item{n}{window size of the corresponding running function}
13 | }
14 | \value{
15 | vector, shortened by (n - 1) elements
16 | }
17 | \description{
18 | remove n/2 elements from the front and the end
19 | }
20 | 


--------------------------------------------------------------------------------
/man/running.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{running}
 4 | \alias{running}
 5 | \title{Running mean over a sliding window}
 6 | \usage{
 7 | running(x, n = 20)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector}
11 | 
12 | \item{n}{window size for the running window}
13 | }
14 | \value{
15 | a numeric vector with the windowed means
16 | }
17 | \description{
18 | Calculate the mean of a numeric vector within a running window of n elements.
19 | }
20 | 


--------------------------------------------------------------------------------
/man/single.analysis.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util_functions.R
 3 | \name{single.analysis}
 4 | \alias{single.analysis}
 5 | \title{Single experiment 4C/Capture C analysis}
 6 | \usage{
 7 | single.analysis(data, vp.pos, wSize = 21, qWd = 1.5, qWr = 1,
 8 |   minDist = 15000)
 9 | }
10 | \arguments{
11 | \item{data}{list containing the 4C/CapC data in two column format, an additional element num.exp describes the number of experiments}
12 | 
13 | \item{vp.pos}{viewpoint position, this can be a single value or two values to analyse a viewpoint region}
14 | 
15 | \item{wSize}{number of fragments in a window}
16 | 
17 | \item{qWd}{threshold for difference from the background}
18 | 
19 | \item{qWr}{threshold for ratio over the background}
20 | 
21 | \item{minDist}{minimal region around the viewpoint to exclude for the significance analysis}
22 | 
23 | \item{alphaFDR}{false-discovery rate threshold}
24 | }
25 | \value{
26 | a list containing a matrix with the data and the background model and a vector with the significant fragments
27 | }
28 | \description{
29 | Function for identifying interaction peaks above a background distribution. A list containing a 4C/Capture-C dataset is required as input. The viewpoint position is given in the vp.pos argument.
30 | }
31 | \examples{
32 | data <- readMultiple(f[1:3], vp.pos = 65923803)
33 | res <- combined.analysis(data, num.exp=3, vp.pos = 65923803)
34 | 
35 | 
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/peakC.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_FL_1_cis.wig.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_FL_1_cis.wig.gz


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_FL_2_cis.wig.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_FL_2_cis.wig.gz


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_FL_3_cis.wig.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_FL_3_cis.wig.gz


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_forward_1_chr11_win_1_cis.wig.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_forward_1_chr11_win_1_cis.wig.zip


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_mESC_1_cis.wig.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_mESC_1_cis.wig.gz


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_mESC_2_cis.wig.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_mESC_2_cis.wig.gz


--------------------------------------------------------------------------------
/tutorial/data/alpha-globin_mESC_3_cis.wig.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/alpha-globin_mESC_3_cis.wig.gz


--------------------------------------------------------------------------------
/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_1.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_1.txt.gz


--------------------------------------------------------------------------------
/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_2.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_2.txt.gz


--------------------------------------------------------------------------------
/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_3.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_3.txt.gz


--------------------------------------------------------------------------------
/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_4.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_4.txt.gz


--------------------------------------------------------------------------------
/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_5.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_5.txt.gz


--------------------------------------------------------------------------------
/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_6.txt.gz


--------------------------------------------------------------------------------
/tutorial/peakC_tutorial.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "peakC tutorial"
  3 | author: "Geert Geeven & Elzo de Wit"
  4 | date: "April 4, 2018"
  5 | output:
  6 |   html_document:
  7 |     df_print: paged
  8 |   pdf_document: default
  9 | bibliography: library.bib
 10 | ---
 11 | 
 12 | It is becoming increasingly clear that the 3D organization of the genome plays an important role in the regulation of genes. To measure the 3D conformation of the genome various 3C-based methods have been developed (4C/5C/Hi-C/Capture-C/Promoter-Capture-Hi-C). 4C, Capture-C and PCHiC measure the contact frequency of a single region ("viewpoint" or "bait") with the rest of the genome. 
 13 | 
 14 | The main goal of these experiments is to identify regions (which we will call "peaks") whose contact frequencies with the viewpoint region are enriched over the background. Because of the non-uniform distribution of background contact frequencies this is not a trivial exercise. In order to model the background, we assume that the contact frequency only decreases with an increase in distance from the viewpoint (i.e. monotonically decreases). peakC achieves this by performing monotonic regression using the `isotone` package. For a more detailed discussion of the theoretical background and performance of the package we refer the reader to our paper: (reference pending). The current tutorial focuses on how to  perform 4C/Capture-C/PCHiC data analysis using peakC.
 15 | 
 16 | In order to load the package into the R environment run the following command:
 17 | ```{r}
 18 | library(peakC)
 19 | ```
 20 | 
 21 | #### Data structure
 22 | 
 23 | The basic data structure for a (single) 4C/Capture-C/PCHiC experiment is a two-column matrix, with one column for the position of the fragment (end) and one column with the number of reads (coverage, 4C signal). We can use one of the peakC functions to read in the 4C/Capture-C/PCHiC data. As an example we use the `readqWig` function, which can read in *variableStep* wiggle files, which are compatible with the UCSC genome browser. The data we read in is a 4C experiment for one of the hemoglobin genes (alpha-globin) in fetal livers of mice.
 24 | 
 25 | ```{r}
 26 | data <- readqWig("data/alpha-globin_FL_1_cis.wig.gz", vp.pos=32224333, window=700e3)
 27 | head(data$data)
 28 | ```
 29 | 
 30 | The data structure contains an entry for the 4C data `data$data` and the quality metrics of the experiment (`data$quality`). The quality metrics will be discussed at the end of the tutorial. The `readqWig` function has selected the fragments in the 700kb genomic region downstream and upstream of the viewpoint (position set by `vp.pos`). The flanking window size can be set with `window`.  The coverage score is automatically normalized to 1 million intrachromosomal reads (reads in cis). 
 31 | 
 32 | **Nota bene**
 33 | As you can see in the data structure the 1<sup>st</sup>, 4<sup>th</sup> and 6<sup>th</sup> row have zeroes in the data. For peakC to function properly it is necessary to leave the zeroes (i.e. uncovered regions) in the dataset. A lack of captured fragments is also information that is required for the proper calculation of the background model (see below).
 34 | 
 35 | Alternative functions for reading in data are:
 36 | 
 37 | * `readMatrix`: reads a simple two column file (wig file without the header)
 38 | * `readMultiple`: reads multiple matrix files at once, returns a list
 39 | * `readMultipleWig`: like read multiple, but for wig files
 40 | * `readMultiColumnFile`: reads a multiple column file, 1st column is fragment position, the rest of the columns are 4C/Capture-C/PCHiC data columns
 41 | 
 42 | 
 43 | A plot of the raw 4C data looks like this:
 44 | 
 45 | ```{r echo=FALSE}
 46 | plot(data$data, type='h', xlab="chromosomal position", ylab="4C signal")
 47 | ```
 48 | 
 49 | The expression of the alpha-globin gene is extremely high in this tissue and is regulated by enhancers that flank this gene. We can already see the promoter-enhancer contacts appearing in the raw data. But peakC can be used to detect the "peak" regions that contact the viewpoint regions significantly above the background frequency.
 50 | 
 51 | 
 52 | ---------------
 53 | 
 54 | ## Identifying contacted regions ("peaks")
 55 | 
 56 | ##### 4C analysis with replicate experiments
 57 | In order to identify peaks we *strongly* advise using replicates. For this tutorial we have analyzed data from a 4C experiment of the alpha-globin gene which was done in triplicate. The peak calling consists of two steps. 1) Reading the data (shown above). 2) Statistical analysis using the `combined.analysis` function. A third optional step is to plot the 4C data and the peaks that were identified.
 58 | 
 59 | Below we show the simplest way to run the peakC peak caller.  Note that as default it runs with a 700kb flanking region (`window`), a running window size of 21 and an FDR threshold of 0.1. For details on the required and optional parameters see the man page. We refer the reader to our paper for guidance on how to set appropriate significance thresholds.
 60 | 
 61 | ```{r}
 62 | viewpoint <- 32224333
 63 | files <- dir(path="data/", pattern="alpha-globin_FL_[123]_cis.wig.gz", full=T)
 64 | data <- readMultipleWig(files, vp.pos = viewpoint)
 65 | res <- combined.analysis(data, vp.pos = viewpoint)
 66 | head(res$peak)
 67 | ```
 68 | 
 69 | `combined.analysis` outputs a list with the results of the peak calling (stored here in `res`). The contents of the results list will be discussed in detail below. What is important for now is that the `peak` vector in the list contains the fragments that represent significant interactions with the viewpoint. We can use the output of `combined.analysis` to create a plot by calling the `plot_C` function. The significant fragment windows are shown in red. 
 70 | ```{r}
 71 | plot_C(res)
 72 | ```
 73 | 
 74 | ##### Single experiment 4C analysis
 75 | Below an explanation will be given for performing single experiment analysis. Please note that in a single experiment analysis no formal statistical analysis can be performed and the selection of peak fragments relies on thresholds for the differences and ratios of fragments with respect to the background model fitted 4C coverage. In general it is hard to control for spurious peaks which are not reproducible between 4C experiments. This is discussed in more detail in the companion paper.
 76 | 
 77 | First read in the data:
 78 | 
 79 | ```{r}
 80 | data <- readqWig("data/alpha-globin_FL_1_cis.wig.gz", vp.pos=32224333, window=700e3)
 81 | ```
 82 | 
 83 | and then run the command for identifying the peaks
 84 | 
 85 | ```{r}
 86 | res <- single.analysis(data$data, vp.pos=32224333, qWd = 2.5)
 87 | ```
 88 | 
 89 | Both `qWd` (difference with respect to background) and `qWr` (ratio) can be varied to increase or lower the stringency of the peak calling. To plot the significant fragments, we again use the `plot_C` function.
 90 | 
 91 | ```{r}
 92 | plot_C(res)
 93 | ```
 94 | 
 95 | The single experiment peak caller calls most of the peaks that were called by the replicate-based peak caller (`combined.analysis`). However, an additional set of peaks is called, some of which are likely false positive peaks.
 96 | 
 97 | ##### Multi-column Capture-C experiment
 98 | In addition to 4C experiments peakC can also analyse Capture-C data. Below an example is shown how to identify significant fragments from a Capture-C experiment with 4 replicates. This data was taken from [@Davies2015]. A file with multiple columns can be read as follows, which returns a list of data.frames.
 99 | 
100 | ```{r}
101 | data <- readMultiColumnFile("data/Hba_cis.txt", vp.pos = 32189804, window=300e3)
102 | head(data[[1]])
103 | head(data[[2]])
104 | ```
105 | 
106 | The list can be passed to the `combined.analysis` function as follows:
107 | 
108 | ```{r}
109 | vp.pos <- c(32182969,32196638)
110 | res <- combined.analysis(data=data, num.exp = 2, vp.pos = vp.pos, wSize = 5)
111 | plot_C(res, y.max=6000, y.min=-600)
112 | ```
113 | 
114 | Note that in these Capture-C experiments two baits were used. We would like to exclude the sequence in between the baits, which is why we use two values for the position of the viewpoint. The fragments between the baits are excluded from the analysis.
115 | 
116 | Due to the higher resolution of the Capture-C experiment we have changed the window size, i.e. the number of fragments over which the averaging is performed, `wSize` to 5. Even though we read in 4 experiments, when we set `num.exp` to 2 only the first two experiments are used for the peak calling.
117 | 
118 | In the function `plot_C` we can play with the limits of the vertical axis by varying `y.min` and `y.max`. By setting the y.min to a negative value it is possible to insert annotations such as gene tracks or ChIPseq data. 
119 | 
120 | Note that there is a difference between the 4C and the CapC data because there is a 43kb difference in the position of the bait or viewpoint.
121 | 
122 | ----------------------------
123 | 
124 | ##### Output details
125 | 
126 | Let's look into the resulting data structure in a bit more detail. For the two replicate Capture-C experiment the result is a list with 6 elements. Let's go over them:
127 | `res$dbR` contains the original data (1st column fragment position, columns 2 until n+1 contain the n experiments). Columns n+2 until 2n+1 contain the estimated background model.
128 | ```{r}
129 | plot(res$dbR[,1], res$dbR[,2], type='h', ylim=c(0,6000), xlab="chromosomal position", ylab="4C signal")
130 | lines(res$dbR[,1], res$dbR[,4], lwd=2, col='blue')
131 | ```
132 | 
133 | First we plot the raw data in the second column, next we add the background model shown by the blue line.
134 | 
135 | `res$peak` contains the fragments that have been called as significant peaks.
136 | 
137 | `res$num.exp` stores the number of experiments that are stored in `res`.
138 | 
139 | `res$p.value` contains for every fragment a rank product probability score whether the fragment is a peak fragment.
140 | 
141 | `res$sel` contains the fragments that are used in the analysis; fragments too close to the viewpoint are not used in the analysis. This distance can be set using the `minDist` parameter in `combined.analysis`.
142 | 
143 | ------------------------------
144 | 
145 | ##### Quality control
146 | 
147 | To determine the similarity between experiments we have written a function that performs a pairwise similarity analysis. We use the Jaccard index as a similarity metric. For the Jaccard index the overlap or intersection of the peak fragments is divided by the union of the fragments. Please read our companion paper for a more detailed analysis.
148 | 
149 | First we will read in a dataset with six replicates of the Oacyl viewpoint in mouse embryonic stem cells.
150 | ```{r}
151 | inFiles <- paste0("data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_",
152 |                   1:6,".txt.gz")
153 | 
154 | # the genomic position of the Oacyl viewpoint
155 | vp.pos <- 65923816
156 | 
157 | # read from 2 column matrix files
158 | data <- readMultiple(files=inFiles,vp.pos=vp.pos,window=1.2e6)
159 | ```
160 | 
161 | After reading in the 6 replicates in the data structure we can take a look at the quality characteristics of the dataset. peakC reports a number of quality metrics: the percentages of captured/covered fragments in the 100kb and 1Mb flanking the viewpoint and of the total covered fragments in cis (i.e. the whole chromosome of the viewpoint). The total number of reads in cis is also reported. Note that reads on non-viewpoint chromosomes are left out of this analysis. For the dataset that we have just read in the quality metrics are the following:
162 | 
163 | ```{r}
164 | data$quality
165 | ```
166 | 
167 | To compare the pairwise reproducibility we run the dataset through the `pairwise.jaccard` function. Note that this function has the same input parameters as the `single.analysis` function. See below what precise parameters we have chosen for the current analysis.
168 | 
169 | ```{r}
170 | # set threshold parameters for peakC in single experiments
171 | wSize <- 21; qWd <- 2.5; qWr <- 1.0; minDist <- 15e3
172 | jac.mat <- pairwise.jaccard(data = data, vp.pos = vp.pos, wSize = wSize, qWd = qWd, qWr = qWr, minDist = minDist )
173 | ```
174 | 
175 | The result is a symmetrical matrix `jac.mat`, in which the Jaccard indices for all pairwise comparisons are stored.
176 | 
177 | ```{r}
178 | jac.mat
179 | ```
180 | 
181 | Note that by changing the parameters `qWd`, `qWr` and `wSize` it is possible to increase or decrease the Jaccard similarity, making the analysis more or less stringent.
182 | 
183 | # References
184 | 


--------------------------------------------------------------------------------
/tutorial/peakC_tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deWitLab/peakC/a17c1115b41d00b9953d6199965acb30a7ac0ea8/tutorial/peakC_tutorial.pdf


--------------------------------------------------------------------------------
/tutorial/peakC_tutorial_conflict-20180412-115545.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "peakC tutorial"
  3 | author: "Geert Geeven & Elzo de Wit"
  4 | date: "April 4, 2018"
  5 | output:
  6 |   pdf_document: default
  7 |   html_document:
  8 |     df_print: paged
  9 | bibliography: library.bib
 10 | ---
 11 | 
 12 | It is becoming increasingly clear that the 3D organization of the genome plays an important role in the regulation of genes. To measure the 3D conformation of the genome various 3C-based methods have been developed (4C/5C/Hi-C/Capture-C/Promoter-Capture-Hi-C). 4C, Capture-C and PCHiC measure the contact frequency of a single region ("viewpoint" or "bait") with the rest of the genome. 
 13 | 
 14 | The main goal of these experiments is to identify regions (which we will call "peaks") whose contact frequencies with the viewpoint region are enriched over the background ("peaks"). Because of the non-uniform distribution of background contact frequencies this is not a trivial exercise. In order to model the background, we assume that the contact frequency only decreases with an increase in distance from the viewpoint (i.e. monotonically decreases). peakC achieves this by performing monotonic regression using the `isotone` package. For a more detailed discussion of the theoretical background and performance of the package we refer the reader to our paper: (reference pending). The current tutorial focuses on how to  perform 4C/Capture-C/PCHiC data analysis using peakC.
 15 | 
 16 | In order to load the package into the R environment run the following command:
 17 | ```{r}
 18 | library(peakC)
 19 | ```
 20 | 
 21 | #### Data structure
 22 | 
 23 | The basic data structure for a (single) 4C/Capture-C/PCHiC experiment is a two-column matrix, with one column for the position of the fragment (end) and one column with the number of reads (coverage, 4C signal). We can use one of the peakC functions to read in the 4C/Capture-C/PCHiC data. As an example we use the `readqWig` function, which can read in *variableStep* wiggle files, which are compatible with the UCSC genome browser. The data we read in is a 4C experiment for one of the hemoglobin genes (alpha-globin) in fetal livers of mice.
 24 | 
 25 | ```{r}
 26 | data <- readqWig("data/alpha-globin_FL_1_cis.wig.gz", vp.pos=32224333, window=700e3)
 27 | head(data)
 28 | ```
 29 | 
 30 | This is what the data structure looks like. The `readqWig` function has selected the fragments in the 700kb genomic region downstream and upstream of the viewpoint (position set by `vp.pos`). The flanking window size can be set with `window`.  The coverage score is automatically normalized to 1 million intrachromosomal reads (reads in cis). 
 31 | 
 32 | **Nota bene**
 33 | As you can see in the data structure the 1<sup>st</sup>, 4<sup>th</sup> and 6<sup>th</sup> row have zeroes in the data. For peakC to function properly it is necessary to leave the zeroes (i.e. uncovered regions) in the dataset. A lack of captured fragments is also information that is required for the proper calculation of the background model (see below).
 34 | 
 35 | Alternative functions for reading in data are:
 36 | 
 37 | * `readMatrix`: reads a simple two column file (wig file without the header)
 38 | * `readMultiple`: reads multiple matrix files at once, returns a list
 39 | * `readMultipleWig`: like read multiple, but for wig files
 40 | * `readMultiColumnFile`: reads a multiple column file, 1st column is fragment position, the rest of the columns are 4C/Capture-C/PCHiC data columns
 41 | 
 42 | 
 43 | A plot of the raw 4C data looks like this:
 44 | 
 45 | ```{r echo=FALSE}
 46 | plot(data, type='h', xlab="chromosomal position", ylab="4C signal")
 47 | ```
 48 | 
 49 | The expression of the alpha-globin gene is extremely high in this tissue and is regulated by enhancers that flank this gene. We can already see the promoter-enhancer contacts appearing in the raw data. But peakC can be used to detect the "peak" regions that contact the viewpoint regions significantly above the background frequency.
 50 | 
 51 | 
 52 | ---------------
 53 | 
 54 | ## Identifying contacted regions ("peaks")
 55 | 
 56 | ##### 4C analysis with replicate experiments
 57 | In order to identify peaks we *strongly* advise using replicates. For this tutorial we have analyzed data from a 4C experiment of the alpha-globin gene which was done in triplicate. The peak calling consists of two steps. 1) Reading the data (shown above). 2) Statistical analysis using the `combined.analysis` function. A third optional step is to plot the 4C data and the peaks that were identified.
 58 | 
 59 | Below we show the simplest way to run the peakC peak caller.  Note that as default it runs with a 700kb flanking region (`window`), a running window size of 21 (`wSize`) and an FDR threshold of 0.1 (`alphaFDR`). For details on the required and optional parameters see the man page. We refer the reader to our paper for guidance on how to set appropriate significance thresholds.
 60 | 
 61 | ```{r}
 62 | viewpoint <- 32224333
 63 | files <- dir(path="data/", pattern="alpha-globin_FL_[123]_cis.wig.gz", full=T)
 64 | data <- readMultipleWig(files, vp.pos = viewpoint)
 65 | res <- combined.analysis(data, vp.pos = viewpoint)
 66 | head(res$peak)
 67 | ```
 68 | 
 69 | `combined.analysis` outputs a list with the results of the peak calling (stored here in `res`). The contents of the results list will be discussed in detail below. What is important for now is that the `peak` vector in the list contains the fragments that represent significant interactions with the viewpoint. We can use the output of `combined.analysis` to create a plot by calling the `plot_C` function. The significant fragment windows are shown in red. 
 70 | ```{r}
 71 | plot_C(res)
 72 | ```
 73 | 
 74 | ##### Single experiment 4C analysis
 75 | Below an explanation will be given for performing single experiment analysis. Please note that in a single experiment analysis no formal statistical analysis can be performed and the selection of peak fragments relies on thresholds for the differences and ratios of fragments with respect to the background model fitted 4C coverage. In general it is hard to control for spurious peaks which are not reproducible between 4C experiments. This is discussed in more detail in the companion paper.
 76 | 
 77 | First read in the data:
 78 | 
 79 | ```{r}
 80 | data <- readqWig("data/alpha-globin_FL_1_cis.wig.gz", vp.pos=32224333, window=700e3)
 81 | ```
 82 | 
 83 | and then run the command for identifying the peaks
 84 | 
 85 | ```{r}
 86 | res <- single.analysis(data, vp.pos=32224333, qWd = 2.5)
 87 | ```
 88 | 
 89 | Both `qWd` (difference with respect to background) and `qWr` (ratio) can be varied to increase or lower the stringency of the peak calling. To plot the significant fragments, we again use the `plot_C` function.
 90 | 
 91 | ```{r}
 92 | plot_C(res)
 93 | ```
 94 | 
 95 | The single experiment peak caller calls most of the peaks that were called by the combined (multi-replicate) peak caller. However, an additional set of peaks is called, some of which are likely false positive peaks.
 96 | 
 97 | ##### Multi-column Capture-C experiment
 98 | In addition to 4C experiments peakC can also analyse Capture-C data. Below an example is shown of how to identify significant fragments from a Capture-C experiment with 4 replicates. This data was taken from [@Davies2015]. A file with multiple columns can be read as follows; the call returns a list of data.frames.
 99 | 
100 | ```{r}
101 | data <- readMultiColumnFile("data/Hba_cis.txt", vp.pos = 32189804, window=300e3)
102 | head(data[[1]])
103 | head(data[[2]])
104 | ```
105 | 
106 | The list can be passed to the `combined.analysis` function as follows:
107 | 
108 | ```{r}
109 | vp.pos <- c(32182969,32196638)
110 | res <- combined.analysis(data=data, num.exp = 2, vp.pos = vp.pos, wSize = 5)
111 | plot_C(res, y.max=6000, y.min=-600)
112 | ```
113 | 
114 | Note that in these Capture-C experiments two baits were used. We would like to exclude the sequence in between the baits, which is why we use two values for the position of the viewpoint. The fragments between the baits are excluded from the analysis.
115 | 
116 | Due to the higher resolution of the Capture-C experiment we have changed the window size `wSize`, i.e. the number of fragments over which the averaging is performed, to 5. Even though we read in 4 experiments, when we set `num.exp` to 2 only the first two experiments are used for the peak calling.
117 | 
118 | In the function `plot_C` we can play with the limits of the vertical axis by varying `y.min` and `y.max`. By setting the y.min to a negative value it is possible to insert annotations such as gene tracks or ChIPseq data. 
119 | 
120 | Note that there is a difference between the 4C and the CapC data because there is a 43kb difference in the position of the bait or viewpoint.
121 | 
122 | ----------------------------
123 | 
124 | ##### Output details
125 | 
126 | Let's look into the resulting data structure in a bit more detail. For the two replicate Capture-C experiment the result is a list with 6 elements. Let's go over them:
127 | `res$dbR` contains the original data (1st column fragment position, columns 2 until n+1 contain the n experiments). Columns n+2 until 2n+1 contain the estimated background model.
128 | ```{r}
129 | plot(res$dbR[,1], res$dbR[,2], type='h', ylim=c(0,6000), xlab="chromosomal position", ylab="4C signal")
130 | lines(res$dbR[,1], res$dbR[,4], lwd=2, col='blue')
131 | ```
132 | 
133 | First we plot the raw data in the second column, next we add the background model shown by the blue line.
134 | 
135 | `res$peak` contains the fragments that have been called as significant peaks.
136 | 
137 | `res$num.exp` stores the number of experiments that are stored in `res`.
138 | 
139 | `res$p.value` contains, for every fragment, a rank product probability score indicating whether the fragment is a peak fragment.
140 | 
141 | `res$sel` contains the fragments that are used in the analysis; fragments too close to the viewpoint are not used in the analysis. This distance can be set using the `minDist` parameter in `combined.analysis`.
142 | 
143 | ------------------------------
144 | 
145 | ##### Quality control
146 | 
147 | To determine the similarity between experiments we have written a function that performs a pairwise similarity analysis. We use the Jaccard index as a similarity metric. For the Jaccard index the overlap or intersection of the peak fragments is divided by the union of the fragments. Please read our companion paper for a more detailed analysis.
148 | 
149 | First we will read in a dataset with six replicates of the Oacyl viewpoint in mouse embryonic stem cells.
150 | ```{r}
151 | inFiles <- paste0("data/peakC_tutorial_Oacyl_6_reps_mappedReads.txt/set2_viewpoint_1_replicate_",
152 |                   1:6,".txt.gz")
153 | 
154 | # the genomic position of the Oacyl viewpoint
155 | vp.pos <- 65923816
156 | 
157 | # read from 2 column matrix files
158 | data <- readMultiple(files=inFiles,vp.pos=vp.pos,window=1.2e6)
159 | ```
160 | 
161 | After reading the 6 replicates into the data structure we can run the dataset through the `pairwise.jaccard` function. Note that this function has the same input parameters as the `single.analysis` function. See below what precise parameters we have chosen for the current analysis.
162 | 
163 | ```{r}
164 | # set threshold parameters for peakC in single experiments
165 | wSize <- 21; qWd <- 2.5; qWr <- 1.0; minDist <- 15e3
166 | jac.mat <- pairwise.jaccard(data = data, vp.pos = vp.pos, wSize = wSize, qWd = qWd, qWr = qWr, minDist = minDist )
167 | ```
168 | 
169 | The result is a symmetrical matrix `jac.mat`, in which the Jaccard indices for all pairwise comparisons are stored.
170 | 
171 | ```{r}
172 | jac.mat
173 | ```
174 | 
175 | Note that by changing the parameters `qWd`, `qWr` and `wSize` it is possible to increase or decrease the Jaccard similarity, making the analysis more or less stringent.
176 | 
177 | # References
178 | 


--------------------------------------------------------------------------------