Libraries
library(Matrix)
library(ggplot2)
library(plyr)
library(Biobase)
library(locfit)
library(geneplotter)
library(DESeq2)
library(edgeR)
library(genefilter)
library(smoothmest)
library(st)
library(Gviz)
library(biomaRt)
library(GenomicFeatures)
library(Rsamtools)
library(tools)
library(rtracklayer)
library(GenomicRanges)
library(IRanges)
library(colorspace)
library(BSgenome.Hsapiens.UCSC.hg38)
library(LSD)
library(pheatmap)
library(data.table)
library(limma)
library(ggrepel)
library(dplyr)
library(plotly)
library(kableExtra)
library(reshape2)
library(glmGamPoi)
library(RColorBrewer)
library(forcats)
library(scuttle)
library(VennDiagram)
library(beeswarm)
library(BSgenome.Ptroglodytes.UCSC.panTro6)
library(Rsubread)
library(rBLAST)
library(ggVennDiagram)
library(Biostrings)
library(readr)
library(ggpubr)
library(fdrtool)
options(scipen=20)
Functions to import BED/narrowPeak files, extract signal around peaks and plot volcano plots
readNarrowPeak2getSummit = function(filePath, chroms, scoreCol){
# Import a narrowPeak file, keep the selected chromosomes (adding the 'chr' prefix) and return
# 1-bp GRanges at the peak summits (column 10 of a narrowPeak file is the 0-based summit offset from the peak start)
# filePath='/Volumes/T7/CTCF/ATAC_ES_2i_nov/peaks/ATACseq40_ES_2i_IAA_1_MM.narrowPeak'
x= read.delim(filePath,as.is=T,header=F)
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]) + as.numeric(x[,10]+1) ,
end=as.numeric(x[,2]) + as.numeric(x[,10]+1),
names=rownames(x)),
score=x[,scoreCol]) ) }
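A minimal usage sketch (the narrowPeak path is a placeholder); it assumes the file stores chromosome names without the 'chr' prefix, since the function prepends it, and uses narrowPeak column 7 (signalValue) as the score:
atac_summits = readNarrowPeak2getSummit("path/to/sample.narrowPeak", chroms = paste0("chr", c(1:22, "X")), scoreCol = 7)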
readBed_filterChroms = function(filePath, chroms, scoreCol){
# Import a BED file, keep the selected chromosomes and convert to 1-based GRanges with a score column
# filePath='/Volumes/T7/CTCF/CTCF_HUMAN.H11MO.0.A.bed'
x = read.delim(filePath,as.is=T,header=F)
# x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]+1),
end=as.numeric(x[,3]),
names=x[,4]),
score=x[,scoreCol]) ) }
getSignalInBins = function( bins, modification, featureColumn ){
# Mean of `featureColumn` of `modification` over each bin, reshaped into a matrix with one row per peak
# and one column per bin (the number of bins per peak is counted as sum(bins$peak == 1))
PsDFvsX = findOverlaps(bins,modification)
PsDFvsX_score = split( as.numeric(elementMetadata(modification)[,featureColumn][subjectHits(PsDFvsX)]), queryHits(PsDFvsX) )
PsDFvsX_score = unlist(lapply(PsDFvsX_score,function(x){mean(x,na.rm=T)}))
bm = rep( 0, length(bins) )
bm[as.numeric(names(PsDFvsX_score))]=PsDFvsX_score
bm = matrix(bm,nrow=length(bins)/sum(bins$peak==1), ncol=sum(bins$peak==1), byrow = T )
rownames(bm) = names(bins)[seq(1,length(bins),by=sum(bins$peak==1))]
return(bm)
}
plot_volcano = function(deseq_result_object){
# Volcano plot of a DESeq2 results table (log2 fold change vs. -log10 adjusted p-value);
# points with padj < 0.1 are highlighted in steelblue and a horizontal line marks that cutoff
par(mar=c(5,5,5,5))#, cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
topT <- as.data.frame(deseq_result_object)
#Adjusted P values (FDR Q values)
with(topT, plot(log2FoldChange, -log10(padj),
pch=20, cex=1.0,
xlab=bquote(~Log[2]~fold~change),
ylab=bquote(~-log[10]~Q~value),
xlim=c(-10,10),
ylim=c(0,20)))
with(subset(topT, padj<0.1 & abs(log2FoldChange)>0),
points(log2FoldChange, -log10(padj), pch=20, col="steelblue", cex=0.5))
#with(subset(topT, padj<0.05 & abs(log2FoldChange)>2), text(log2FoldChange, -log10(padj), labels=subset(rownames(topT), topT$padj<0.05 & abs(topT$log2FoldChange)>2),cex=0.8, pos=3))
#Add lines for absolute FC>1.5 and P-value cut-off at FDR Q<0.01
abline(v=0, col="black", lty=3, lwd=1.0)
#abline(v=-1, col="black", lty=4, lwd=2.0)
#abline(v=1, col="black", lty=4, lwd=2.0)
abline(h=-log10(max(topT$padj[topT$padj<0.1], na.rm=TRUE)), col="black", lty=4, lwd=2.0)}
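An illustrative call, assuming dds (placeholder name) is a DESeqDataSet already processed with DESeq(); plot_volcano() expects a DESeq2 results table with log2FoldChange and padj columns:
res_treatment = results(dds, contrast = c("condition", "treated", "control")) # contrast levels are placeholders
plot_volcano(res_treatment)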
AppendScoreFromBW = function( peakList, bw ){
# Replace each peak's score (initially the summit offset from the peak start) with the bigWig value at the summit
getSummitRanges = function(GR){
# GR = hs_atac
start(GR) = start(GR)+GR$score
end(GR) = start(GR)
return(GR) }
# any(duplicated(subjectHits(findOverlaps(hs_atac_track,getSummitRanges(hs_atac))))); table(diff(subjectHits(findOverlaps(hs_atac_track,getSummitRanges(hs_atac)))))
peakList$score[subjectHits(findOverlaps( bw,getSummitRanges(peakList)))] = bw$score[ queryHits(findOverlaps( bw,getSummitRanges(peakList))) ]
return( peakList) }
GetAPRangesForGenomicRangesObject = function( gro ){
# For each region, build 200 consecutive 10-bp bins spanning +/- 1 kb around an anchor point
# 250 bp downstream of the region start; the `peak` metadata column stores the region index
# gro = hs_atac_human_spe_enhancers_wo_K27_in_NHP
tp =do.call('rbind', lapply( as.list(seq(1:length(gro))), function(i){
g = gro[i]
tss = start(g) + 250
chr = as.character( chrom(g) )
peak=names(g)
return( data.frame( chr = rep( (chr), 200),
starts = seq( tss-1000, tss+990, length.out=200),
ends = seq( tss-1000, tss+990, length.out=200)+9,
peak = rep( peak, 200),
id = rep(i,200 )) ) } ) )
all_ranges = GRanges(seqnames = Rle(tp$chr),
ranges = IRanges(as.numeric(tp$starts),
end = as.numeric(tp$ends),
names = tp$peak ),
strand = Rle(rep("*", nrow(tp))),
peak = tp$id )
seqlevelsStyle(all_ranges) = 'ucsc'
return(all_ranges) }
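A usage sketch combining GetAPRangesForGenomicRangesObject() with getSignalInBins() defined above; the peak GRanges and the bigWig path are placeholders, and the track is imported with rtracklayer:
ap_bins = GetAPRangesForGenomicRangesObject(hs_atac_human_spe_enhancers_wo_K27_in_NHP) # peak GRanges (placeholder)
h3k27ac_track = import("path/to/H3K27ac.bw", format = "BigWig") # placeholder path
signal_matrix = getSignalInBins(bins = ap_bins, modification = h3k27ac_track, featureColumn = "score")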
getGR = function(X,Y,Z){
GRanges(seqnames = Rle(X),
ranges = IRanges(as.numeric(Y),
end = as.numeric(Z),
names = seq(1, length(X))),
strand = Rle(rep("*", length(X)))) }
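For illustration, getGR() builds an unstranded GRanges from parallel vectors of chromosome, start and end (arbitrary coordinates below):
example_gr = getGR(X = c("chr1", "chr2"), Y = c(100, 5000), Z = c(600, 5500))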
plot_volcano <- function(deseq_result_object){
# Redefinition of plot_volcano with a stricter cutoff (padj < 0.01) and significant points drawn in red;
# this version overrides the one above when the script is run top to bottom
par(mar=c(5,5,5,5))#, cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
topT <- as.data.frame(deseq_result_object)
#Adjusted P values (FDR Q values)
with(topT, plot(log2FoldChange, -log10(padj),
pch=20, cex=1.0,
xlab=bquote(~Log[2]~fold~change),
ylab=bquote(~-log[10]~Q~value),
xlim=c(-10,10),
ylim=c(0,20)))
with(subset(topT, padj<0.01 & abs(log2FoldChange)>0),
points(log2FoldChange, -log10(padj), pch=20, col="red", cex=0.5))
#with(subset(topT, padj<0.05 & abs(log2FoldChange)>2), text(log2FoldChange, -log10(padj), labels=subset(rownames(topT), topT$padj<0.05 & abs(topT$log2FoldChange)>2),cex=0.8, pos=3))
#Add lines for absolute FC>1.5 and P-value cut-off at FDR Q<0.01
abline(v=0, col="black", lty=3, lwd=1.0)
#abline(v=-1, col="black", lty=4, lwd=2.0)
#abline(v=1, col="black", lty=4, lwd=2.0)
abline(h=-log10(max(topT$padj[topT$padj<0.01], na.rm=TRUE)), col="black", lty=4, lwd=2.0)
}
GetTPM = function( m,cols2take,gn ){
# TPM: convert the selected count columns to reads per kilobase (using the Length column of m),
# then scale each library so that its RPK values sum to one million
# m=PL_Dat; cols2take=c(7,8,14,15,9,10,11,12,13)
mb = m[,cols2take]
f1 = do.call( "cbind", lapply(1:ncol(mb), function(x){
rpk = 1000*(mb[,x]/m$Length)
SF = sum(rpk)/1000000
return( rpk/SF ) } ) )
colnames(f1) = colnames( mb )
rownames(f1) = gn
return(f1)
}
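A hedged example, assuming fc_counts (placeholder) is a featureCounts-style table with a Length column and count columns starting at column 7:
tpm = GetTPM(m = fc_counts, cols2take = 7:ncol(fc_counts), gn = fc_counts$Geneid)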
3D (Hi-C) data
readTADs = function( domain.path ){
# Read TopDom domain calls and return a list with the domain GRanges ("TADs") and reduced
# boundary windows ("boundaries": +/- 25 kb around domain edges and called boundaries)
# domain.path="/Volumes/T7/iAstrocyte_evolution/Species_genomes/hic/25kb_domains/pt_mandy_krnorm.all.25kb.topdom.bedpe"
tp = read.delim( domain.path, as.is=TRUE )
tp = tp[-1,]
tp = tp[!is.na(tp$score),]
domains = getGR(tp[tp$tag=="domain",1],
tp[tp$tag=="domain",2],
tp[tp$tag=="domain",6])
names(domains) = tp[tp$tag=="domain",7]
boundaries = c( getGR(tp[tp$tag=="domain",1],
ifelse( tp[tp$tag=="domain",2]>25000, tp[tp$tag=="domain",2]-25000,
tp[tp$tag=="domain",2]),
tp[tp$tag=="domain",2]+25000),
getGR(tp[tp$tag=="domain",1],
tp[tp$tag=="domain",6],
tp[tp$tag=="domain",6]+25000),
getGR(tp[tp$tag=="boundary",1],
ifelse( tp[tp$tag=="boundary",2]>25000,
tp[tp$tag=="boundary",2]-25000,
tp[tp$tag=="boundary",2]),
tp[tp$tag=="boundary",2]+25000) )
boundaries = reduce(boundaries)
names(boundaries) = paste("boundaires_reduced",1:length(boundaries),sep="_")
boundaries$mid = rowMeans(cbind(start(boundaries),end(boundaries)))
res = list(TADs = domains,
boundaries = boundaries)
return(res)
}
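A usage sketch (the TopDom output path is a placeholder); the returned list holds the domain GRanges and the reduced boundary windows:
hs_domains = readTADs("path/to/hs.25kb.topdom.bedpe")
length(hs_domains$TADs); length(hs_domains$boundaries)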
liftOverBoundaries = function( tdo, chain, WSize ){
# Lift boundaries (resized to WSize around their centre) to the other species, keep liftovers that land on the
# matching chromosome (treating chr2A/chr2B as chr2) and are longer than WSize/2, then resize the kept intervals to 50 kb
# tdo = man_domains; chain = chain_PtHs; WSize=500
tp = tdo$boundaries
tp = reduce(tp)
names(tp) = paste( "boundary", seq(1,length(tp)),sep="_" )
tp = GenomicRanges::resize(tp,WSize,fix="center")
res = vector("list",2)
res[[1]] = tp
unprocessed_liftover = unlist( liftOver( tp, chain ) )
# unprocessed_liftover = GenomicRanges::resize(unprocessed_liftover,width(unprocessed_liftover)+50,fix="center")
print("processing and filtering the liftovers")
processed_liftover = do.call("c",lapply(as.list(unique(names(unprocessed_liftover))),function(b){
# additional filtering: the liftover has to land on the same chromosome as in the other species and be longer than WSize/2
# b="boundary_580"
print(b)
theseB = unprocessed_liftover[which(names(unprocessed_liftover) == b) ]
theseB = reduce(theseB)
theseB = theseB[which(width(theseB)>WSize/2)]
theseB = theseB[which.max(width(theseB))]
otherSpeciesB = tp[which(names(tp)==b)]
if(length(theseB)>0){
chromosomes_match = 0
if( unique(chrom(theseB))==unique(chrom(otherSpeciesB)) ){ chromosomes_match = c( 1 ) }
if( unique(chrom(theseB)) %in% c( "chr2A","chr2B" )
& unique(chrom(otherSpeciesB))=="chr2" ){ chromosomes_match = 1 }
if( unique(chrom(otherSpeciesB)) %in% c( "chr2A","chr2B" )
& unique(chrom(theseB))=="chr2" ){ chromosomes_match = 1 }
if( chromosomes_match==1 ){
resized=GenomicRanges::resize(theseB,50000,fix="center")
names(resized) = unique(b)
return(resized) } } }
))
res[[2]] = processed_liftover
# res[[2]]$mid = start( res[[2]] )
# start(res[[2]] ) = ifelse( start(res[[2]])>25000, start(res[[2]])-25000, start(res[[2]]) )
# end( res[[2]] ) = start(res[[2]]) + 50000
names(res) = c("original","lifted_over")
return(res)
}
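A sketch following the commented-out object names inside the function; the chain file path is a placeholder and the chain is loaded with rtracklayer::import.chain():
chain_PtHs = import.chain("path/to/panTro6ToHg38.over.chain") # placeholder path
man_boundaries_in_hg38 = liftOverBoundaries(tdo = man_domains, chain = chain_PtHs, WSize = 500)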
getAllBoundaries = function( rep1, rep2 ){
# Merge boundary calls from two replicates into one set labelled "Rep1", "Rep2" or "common"
tp = findOverlaps(rep1,rep2)
rone = rep1[-queryHits(tp)]
rone$type="Rep1"
rtwo = rep2[-subjectHits(tp)]
rtwo$type = "Rep2"
common = rep1[queryHits(tp)]
common$type = "common"
TP = c( rone, common, rtwo)
names(TP) = paste0("boundaries_",1:length(TP))
return(TP)
}
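For example, assuming rep1_domains and rep2_domains (placeholder names) are readTADs() outputs from two Hi-C replicates:
all_boundaries = getAllBoundaries(rep1 = rep1_domains$boundaries, rep2 = rep2_domains$boundaries)
table(all_boundaries$type)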
binAnno = function( chrSizes, chroms, binSize ){
# Tile each chromosome into consecutive bins of binSize bp (the trailing partial bin is padded to full size)
do.call('rbind', lapply(chroms, function(chr){
chr.length = seqlengths(chrSizes[chr])
dangling = chr.length %% binSize
if( dangling > 0 ) chr.length = chr.length + (binSize-dangling) + 1
data.frame( chr=chr,
start=seq(1, chr.length-binSize, by=binSize),
end = seq(binSize, chr.length, by=binSize),
stringsAsFactors=FALSE )
}))
}
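A usage sketch, assuming chrSizes is a Seqinfo object (seqlengths() is called on a subset of it); here 10-kb bins for the hg38 autosomes and chrX, with the bin size as a placeholder:
ga = binAnno(chrSizes = seqinfo(BSgenome.Hsapiens.UCSC.hg38), chroms = paste0("chr", c(1:22, "X")), binSize = 10000)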
transformToSparseMatrix=function( df,row.annotation,col.annotation ){
# Place dumped contact counts (0-based bin start coordinates in columns 1-2, counts in column 3)
# into a sparse matrix whose rows and columns follow the bin annotation
# row.annotation=rowAnnotation; col.annotation=colAnnotation
sm = Matrix( 0, nrow=nrow(row.annotation),
ncol=nrow(col.annotation) )
tp = cbind( row=match(as.numeric(df[,1]+1),row.annotation[,2]),
col=match(as.numeric(df[,2]+1),col.annotation[,2]) )
sm[tp] = df[,3]
return( sm )
}
read.hic_files = function( directory, prefix, suffix, binAnnotation, chroms ){
# Read per-chromosome dumped contact lists (three columns: bin1 start, bin2 start, count)
# and convert each into a sparse intra-chromosomal matrix
chrlist = as.list(chroms)
names(chrlist)= chroms
# prefix="ES1_10kb/ES1_30_"; suffix=".txt"; directory=dumped_dir; binAnnotation=ga
# prefix=""; suffix=".matrix.txt"; directory=dumped_directory_ele; binAnnotation=ga;chrlist= paste0("chr",c(1:22,"X"))
lapply( as.list(chrlist), function(chromosome){
# left = unlist( strsplit( chromosome, ' ' ) )[1]
# right = unlist( strsplit( chromosome, ' ' ) )[2]
# newChromosome = paste(left,right,sep='_')
# chromosome = "chr1"
df = read.delim( paste0(directory, prefix, chromosome, suffix ), as.is=T, header=F )
df = df[!is.na(df$V3),]
df[,1] = as.numeric(df[,1])
rowAnnotation = binAnnotation[binAnnotation$chr==chromosome,]
colAnnotation = binAnnotation[binAnnotation$chr==chromosome,]
print(chromosome)
# print(chromosome)
return( transformToSparseMatrix(df,
row.annotation = rowAnnotation,
col.annotation = colAnnotation ) )
} )
}
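A hedged example (directory, prefix and suffix are placeholders for per-chromosome dumped count files), reusing the bin annotation ga from the sketch above:
hic_raw = read.hic_files(directory = "path/to/dumped/", prefix = "ES1_30_", suffix = ".txt",
binAnnotation = ga, chroms = paste0("chr", c(1:22, "X")))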
getGWmatrix_intra=function( list.of.sm, ga, which.object ){
# Assemble the per-chromosome matrices (raw 'LFM' or IPF-'balanced') into one genome-wide sparse matrix
# of intra-chromosomal contacts, using the bin annotation ga to offset each chromosome's bins
if( which.object == 'LFM' ) {i=1}
if( which.object == 'balanced' ) {i=2}
nummSM = Matrix(0, nrow=nrow(ga), ncol=nrow(ga) )
ids = do.call('rbind', lapply( as.list(names(list.of.sm)), function(chrom){
print( paste0('including ', chrom) )
if( chrom == 'chr1' ) {start.row = 0; start.col = 0}
if( chrom != 'chr1') {start.row = min(which( ga$chr==chrom ))-1
start.col = min(which( ga$chr==chrom ))-1 }
tp = as.data.frame(summary(list.of.sm[[chrom]][[i]]),stringsAsFactors=F )
tp$i = tp$i + start.row
tp$j = tp$j + start.col
return(tp)
}))
print( 'assembling the matrix, depending on the resolution this might take a while' )
nummSM[ cbind(ids[,1],ids[,2])]=ids[,3]
return( nummSM )
}
loss.function = function(mat){
# L1 deviation of the non-zero row and column sums from 1 (convergence measure for the IPF balancing)
tmp = rowSums(mat)
tmp = tmp[tmp > 0]
rs = abs(tmp - 1)
tmp = colSums(mat)
tmp = tmp[tmp > 0]
cs = abs(tmp - 1)
return( 0.5*sum(c(rs,cs)))
}
Create_complete_matrix = function(x){
# Rebuild a full symmetric sparse matrix from the stored triangle of contact entries
t.sum = as.data.frame(summary(x), stringsAsFactors=FALSE)
test = sparseMatrix(i = t.sum$i,
j = t.sum$j,
x = t.sum$x,
dims = dim(x),
symmetric = TRUE)
rownames(test) = rownames(x)
colnames(test) = colnames(x)
return( test )
}
IPF_alg = function(lfm, numberOfIterations){
# Iterative proportional fitting: alternately divide rows and columns by their sums until the matrix is
# approximately doubly stochastic; the per-iteration row and column factors are stored in x and y
d = dim(lfm)[2]
x = matrix(1, nrow = d, ncol = numberOfIterations)
y = matrix(1, nrow = d, ncol = numberOfIterations)
lf = numeric(0) # loss recorded after each iteration (see loss.function above)
counter = 1
while( counter <= numberOfIterations)
{
s.temp = rowSums(lfm)
x[, counter] = ifelse(s.temp > 0,s.temp,1)
lfm = lfm / x[,counter]
s.temp = colSums(lfm)
y[, counter] = ifelse(s.temp > 0,s.temp,1)
lfm = t(t(lfm) / y[,counter])
lf = c(lf, loss.function(lfm))
print(paste("loss function: ", lf[length(lf)], "counter: ", counter))
counter = counter + 1
}
return(list(lfm = lfm, x = x, y=y, lf = lf))
}
normalize_LFM_iteratively = function(x, numberOfIterations){
# Combine the per-iteration IPF factors into total per-bin biases, rescale them with a robust (Huber)
# estimate of the row/column factor ratio, and divide the matrix by the biases on both sides
cm = Create_complete_matrix(x)
tp = IPF_alg(cm, numberOfIterations = numberOfIterations)
x = tp$x
y = tp$y
tp.x = data.frame(x)
test.x = exp(Reduce('+',log(tp.x)))
tp.y = data.frame(y)
test.y = exp(Reduce('+',log(tp.y)))
fac = smhuber(test.x/test.y)$mu
van = rowSums(cm) == 0
test.x[van] = fac
test.x = test.x/sqrt(fac)
tp = cm / test.x
tp = t(t(tp) / test.x)
return( list( lfm = tp, x=x, y=y ))
}
IPF = function(x, numberOfIterations){
# Wrapper: returns the raw matrix ("LFM"), the iteratively balanced matrix ("balanced") and the per-bin biases ("IC_biases")
twm = normalize_LFM_iteratively(x, numberOfIterations=numberOfIterations)
biases = twm[2:3]
corrected = twm[[1]]
res = vector("list", 3)
res[[1]] = x
res[[2]] = corrected
res[[3]] = biases
names(res) = c("LFM", "balanced", "IC_biases")
return(res)
}
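A sketch of the balancing step for the matrices read above (the iteration count is an arbitrary placeholder); the lapply() result has the per-chromosome structure expected by getGWmatrix_intra() and InsulationScore():
hic_balanced = lapply(hic_raw, IPF, numberOfIterations = 50) # hic_raw from the read.hic_files() sketch
gw_balanced = getGWmatrix_intra(hic_balanced, ga, which.object = "balanced")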
getSUMMEDsignal4bins = function( B, D, A, M ){
# For each bin b in B, sum the contact matrix M over the (2*A+1) x (2*A+1) block of contacts between
# the window centred D bins to the left of b and the window centred D bins to its right
# B=bins-HOWFAR; D=distance; A=Area; M=thism
rows = unlist(lapply(as.list(B), function(b){
rep( ( (b-D) + seq(-A,A)) , (1+2*A))}))
cols = unlist(lapply(as.list(B), function(b){
rep( ( (b+D) + seq(-A,A) ), each=(1+2*A))}))
ids = cbind(rows, cols)
m = M[ids]
IDS = rep( names(B), each = (1+2*A)^2 )
# print(c(min(ids),max(ids)))
matrices = split( m, IDS )
return( unlist(lapply(matrices,function(m){sum(m, na.rm=T)})) )}
InsulationScore = function( bin, mat, GAGR, distance, Area, HOWFAR ){
# Insulation-type score: for each region in `bin`, sum balanced contacts across the bin ("middle") and at control
# positions HOWFAR bins to its left and right, using windows of +/- Area bins at an off-diagonal distance of `distance` bins
do.call("rbind",lapply( as.list(unique(as.character(chrom(bin)))),function(chr){
print(paste0("processing ",chr))
thism = mat[[chr]]$balanced
theseb = GenomicRanges::resize( bin[which(chrom(bin)==chr)], 1, fix="center")
bins = GAGR$binid[queryHits(findOverlaps(GAGR,theseb))]
names(bins) = names(theseb[subjectHits(findOverlaps(GAGR,theseb))])
bins = bins[bins-distance-HOWFAR-Area>0 & bins+distance+HOWFAR+Area<(length(GAGR[which(chrom(GAGR)==chr)]))]
return(data.frame( left=getSUMMEDsignal4bins(B=bins-HOWFAR, D=distance, A=Area, M=thism ),
middle=getSUMMEDsignal4bins(B=bins, D=distance, A=Area, M=thism ),
right=getSUMMEDsignal4bins(B=bins+HOWFAR, D=distance, A=Area, M=thism ) )) } )) }
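A usage sketch for the insulation score, reusing objects from the sketches above; GAGR is the bin annotation as GRanges with a per-chromosome bin index (binid), and the distance/Area/HOWFAR values (in bins) are placeholders that should match the Hi-C resolution:
GAGR = getGR(ga$chr, ga$start, ga$end)
GAGR$binid = unlist(lapply(unique(ga$chr), function(ch) seq_len(sum(ga$chr == ch)))) # bin index within each chromosome
ins = InsulationScore(bin = hs_domains$boundaries, mat = hic_balanced, GAGR = GAGR,
distance = 10, Area = 2, HOWFAR = 5)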
Sequence analysis
Figure_out_mismatching_sequences = function( region_ref,
ref_sequences,
liftover_to_the_other_species,
other_species_sequence){
# Align each reference enhancer sequence with its lifted-over counterpart in the other species (forward or
# reverse complement, whichever alignment scores better) and return a data.frame of mismatches ("MM"),
# insertions and deletions in reference-genome coordinates
# 7072,3169
# enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_39067";
# enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_1026"
# enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_4338";
# enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_601"; region_ref=enhancers_to_assess[which(names(enhancers_to_assess)==enh)];ref_sequences=hs_enhancers_sequences[which(names(hs_enhancers_sequences)==enh)];liftover_to_the_other_species=hs_atac_mapped_in_chimp_mod[which(names(hs_atac_mapped_in_chimp_mod)==enh)];other_species_sequence=hs_enhancers_Pt_sequences[which(names(hs_enhancers_Pt_sequences)==enh)]
# ref_sequence=enhancers_linked_with_activation_seq_Hs["ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_2521"]
# sub_sequence = enhancers_linked_with_activation_seq_Pt["ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_2521"]
# chr1:1354454-1354599
# attention: liftOver often returns only a piece of our sequence; we need to account for that,
# and mismatches should also fall within the comparable part of the sequence
res = data.frame()
for( i in 1:length(region_ref)){
ref_gr = region_ref[i]
peak_in_the_other_species = liftover_to_the_other_species[i]
ref_sequence = ref_sequences[i]
sub_sequence = other_species_sequence[i]
sub_mat <- nucleotideSubstitutionMatrix(match = 2, mismatch = -3, baseOnly = TRUE)
alignmR = pairwiseAlignment(ref_sequence,reverseComplement(sub_sequence),substitutionMatrix = sub_mat, gapOpening = 5, gapExtension = 2 )
alignmF = pairwiseAlignment(ref_sequence,(sub_sequence), substitutionMatrix = sub_mat, gapOpening = 5, gapExtension = 2 )
if( score(alignmR) > score(alignmF)) { alignm=alignmR
LD = nedit( alignm ) }
if( score(alignmR) <= score(alignmF)) { alignm=alignmF
LD = nedit(alignm) }
ps = unlist(strsplit(as.character(pattern(alignm)),""))
ss = unlist(strsplit(as.character(subject(alignm)),""))
DEL = which( ps =="-" )
if( length(DEL)>0){
x=reduce(IRanges(start=DEL,end=DEL))
if( length(x)==1 ){ DELP=start(x) + start(ref_gr)-1 }
if( length(x)>1 ) { DELP=c( start(x) - cumsum(c(0,width(x)) )[1:(length(x))] ) + min(start(ref_gr))-1}
deletions = data.frame( seqnames = Rle( rep( as.character( chrom(ref_gr) ),length(DELP) ) ),
start = as.numeric(DELP),
end=DELP,
names = rep( names(ref_gr), length(DELP) ),
type = rep( "Deletion", length(DELP) ) )
res = rbind(res,deletions)}
ss = ss[ps!="-"]
ps = ps[ps!="-"]
mm = which( ps!=ss & ss != "-" )
insertions = which( ss == "-" )
if( length(mm)>0 ){
positions = c( start(ref_gr)-1 + mm )
positions_GR = data.frame( seqnames = Rle( rep( as.character( chrom(ref_gr) ),length(positions) ) ),
start = as.numeric(positions),
end=positions,
names = rep( names(ref_gr), length(positions) ),
type = rep( "MM", length(positions) ) )
res=rbind(res,positions_GR)}
if( length(insertions)>0 ){
x=reduce(IRanges(start=insertions,end=insertions))
insertion_GR = data.frame( seqnames = Rle( rep( as.character( chrom(ref_gr) ), length(x) ) ),
start = start(ref_gr)-1+as.numeric(start(x)),
end = start(ref_gr)-1+as.numeric(end(x)),
names = unique(names(ref_gr)),
type="insertion" )
res=rbind(res,insertion_GR)}
}
return(res) }
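An illustrative call mirroring the commented-out object names above; enh is a placeholder peak name and all four inputs are single-element subsets for that enhancer:
enh = "ATAC_peak_name" # placeholder
variant_table = Figure_out_mismatching_sequences(
region_ref = enhancers_to_assess[enh],
ref_sequences = hs_enhancers_sequences[enh],
liftover_to_the_other_species = hs_atac_mapped_in_chimp_mod[enh],
other_species_sequence = hs_enhancers_Pt_sequences[enh])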
readBedtools_res = function(filePath, chroms, extra_column1,extra_column2){
# Import all bedtools output files from a directory, add the 'chr' prefix to chromosome names, keep entries
# with an overlap (V8 > 0) on the selected chromosomes, and return GRanges with peak and TF-motif metadata
# filePath='/Users/apekowska/Desktop/Ciuba_et_al_SM/data/outputs/gainedE_TFBS/'
filesInDir = paste0(filePath, list.files(filePath))
x = do.call("rbind", lapply(filesInDir, function(x){ read.delim(x,as.is=T,header=F) }) )
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
x = x[x$V8>0,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]+1) ,
end=as.numeric(x[,3]),
names=rownames(x)),
peak = x[,extra_column1],
TFmotif = x[,extra_column2]) ) }
readBedtools_UCSC = function(filePath, chroms, extra_column1,extra_column2){
# Same as readBedtools_res, but for files whose chromosome names already carry the UCSC 'chr' prefix
# filePath='/Users/apekowska/Desktop/Ciuba_et_al_SM/data/outputs/gainedE_TFBS/'
filesInDir = paste0(filePath, list.files(filePath))
x = do.call("rbind", lapply(filesInDir, function(x){ read.delim(x,as.is=T,header=F) }) )
# x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
x = x[x$V8>0,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]+1) ,
end=as.numeric(x[,3]),
names=rownames(x)),
peak = x[,extra_column1],
TFmotif = x[,extra_column2]) ) }
processTFBSresult = function( tfbsres, tfanno, nameColumn ){
# tfbsres=lostE_TFBSchange; peaks=lostE; tfanno=TFsEnsemblG
# map each motif file name to its (fixed) TF name from the annotation table
tfbs = unlist( lapply( strsplit(tfbsres$TFmotif,"/"),function(x){x[length(x)]}) )
tfbsres$TF = tfanno$Fixed[ match(tfbs, tfanno[,nameColumn]) ]
return( tfbsres ) }
makeMatrixTFBS4peaks = function( tfmut, theTFs, allPeaks ){
# Build a binary peaks x TFs matrix: 1 if a motif of that TF was reported for that peak in tfmut
# theTFs=unique(TFsEnsemblG$Fixed);allPeaks=gainedE;tfmut=gainedE_TFBSchange
res = matrix(0L,
nrow=length(allPeaks),
ncol=length(unique(theTFs)) )
res = as.data.frame(res)
rownames(res) = names(allPeaks)
colnames(res) = unique(theTFs)
tp = split( tfmut$peak, tfmut$TF )
for( tf in unique(theTFs)) {
# tf="ZNF713"
thisC = which( colnames(res)==tf )
theseRows = which( rownames(res) %in% tp[[tf]])
coordinates = cbind(row=theseRows,
col=rep(thisC,length(theseRows)))
res[ coordinates ] = 1
}
return(res)
}
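A sketch of the TFBS workflow, mirroring the commented-out object names inside the functions; the directory path, the column indices passed as extra_column1/extra_column2 and the TF annotation column name are placeholders:
gainedE_TFBSchange = readBedtools_res("path/to/gainedE_TFBS/", chroms = paste0("chr", c(1:22, "X")),
extra_column1 = 4, extra_column2 = 7) # column indices are placeholders
gainedE_TFBSchange = processTFBSresult(gainedE_TFBSchange, tfanno = TFsEnsemblG, nameColumn = "Model") # "Model" is a placeholder column name
tfbs_matrix = makeMatrixTFBS4peaks(tfmut = gainedE_TFBSchange, theTFs = unique(TFsEnsemblG$Fixed), allPeaks = gainedE)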