Libraries
library(Matrix)
library(ggplot2)
library(plyr)
library(Biobase)
library(locfit)
library(geneplotter)
library(DESeq2)
library(edgeR)
library(genefilter)
library(smoothmest)
library(st)
library(Gviz)
library(biomaRt)
library(GenomicFeatures)
library(Rsamtools)
library(tools)
library(rtracklayer)
library(GenomicRanges)
library(IRanges)
library(colorspace)
library(BSgenome.Hsapiens.UCSC.hg38)
library(LSD)
library(pheatmap)
library(data.table)
library(limma)
library(ggrepel)
library(dplyr)
library(plotly)
library(kableExtra)
library(biomaRt)
library(reshape2)
library(DESeq2)
library(glmGamPoi)
library(LSD)
library(geneplotter)
library(RColorBrewer)
library(forcats)
library(scuttle)
library(VennDiagram)
library(rtracklayer)
library(beeswarm)
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome.Ptroglodytes.UCSC.panTro6)
library(pheatmap)
library(rtracklayer)
library(GenomicRanges)
library(DESeq2)
library(Rsubread)
library(rBLAST)
library(ggVennDiagram)
library(Biostrings)
library(readr)
library(ggpubr)
library(fdrtool)
options(scipen=20)
 
Functions to import bed files, plot volcanos
readNarrowPeak2getSummit = function(filePath, chroms, scoreCol){
  # filePath='/Volumes/T7/CTCF/ATAC_ES_2i_nov/peaks/ATACseq40_ES_2i_IAA_1_MM.narrowPeak'
  x= read.delim(filePath,as.is=T,header=F)
  x[,1] = paste0('chr',x[,1])
  x = x[x[,1] %in% chroms,]
  return( GRanges( seqnames = Rle(x[,1]),
                   ranges = IRanges(as.numeric(x[,2]) + as.numeric(x[,10]+1) ,
                                    end=as.numeric(x[,2]) + as.numeric(x[,10]+1),
                                    names=rownames(x)),
                   score=x[,scoreCol]) ) }
readBed_filterChroms = function(filePath, chroms, scoreCol){
  # filePath='/Volumes/T7/CTCF/CTCF_HUMAN.H11MO.0.A.bed'
  x = read.delim(filePath,as.is=T,header=F)
  # x[,1] = paste0('chr',x[,1])
  x = x[x[,1] %in% chroms,]
  return( GRanges( seqnames = Rle(x[,1]),
                   ranges = IRanges(as.numeric(x[,2]+1),
                                    end=as.numeric(x[,3]),
                                    names=x[,4]),
                   score=x[,scoreCol]) ) } 
getSignalInBins = function( bins, modification, featureColumn ){
  PsDFvsX = findOverlaps(bins,modification)
  PsDFvsX_score = split( as.numeric(elementMetadata(modification)[,featureColumn][subjectHits(PsDFvsX)]), queryHits(PsDFvsX) )
  PsDFvsX_score = unlist(lapply(PsDFvsX_score,function(x){mean(x,na.rm=T)}))
  
  bm = rep( 0, length(bins) )
  bm[as.numeric(names(PsDFvsX_score))]=PsDFvsX_score
  bm = matrix(bm,nrow=length(bins)/sum(bins$peak==1), ncol=sum(bins$peak==1), byrow = T )
  rownames(bm) = names(bins)[seq(1,length(bins),by=sum(bins$peak==1))]
  return(bm)
}
plot_volcano = function(deseq_result_object){
  par(mar=c(5,5,5,5))#, cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
  topT <- as.data.frame(deseq_result_object)
  #Adjusted P values (FDR Q values)
  with(topT, plot(log2FoldChange, -log10(padj), 
                  pch=20, cex=1.0, 
                  xlab=bquote(~Log[2]~fold~change), 
                  ylab=bquote(~-log[10]~Q~value), 
                  xlim=c(-10,10),
                  ylim=c(0,20)))
  with(subset(topT, padj<0.1 & abs(log2FoldChange)>0), 
       points(log2FoldChange, -log10(padj), pch=20, col="steelblue", cex=0.5))
  #with(subset(topT, padj<0.05 & abs(log2FoldChange)>2), text(log2FoldChange, -log10(padj), labels=subset(rownames(topT), topT$padj<0.05 & abs(topT$log2FoldChange)>2),cex=0.8, pos=3))
  #Add lines for absolute FC>1.5 and P-value cut-off at FDR Q<0.01
  abline(v=0, col="black", lty=3, lwd=1.0)
  #abline(v=-1, col="black", lty=4, lwd=2.0)
  #abline(v=1, col="black", lty=4, lwd=2.0)
  abline(h=-log10(max(topT$padj[topT$padj<0.1], na.rm=TRUE)), col="black", lty=4, lwd=2.0)}
AppendScoreFromBW = function( peakList, bw ){
  getSummitRanges = function(GR){
    # GR = hs_atac
    start(GR) = start(GR)+GR$score
    end(GR) = start(GR)
    return(GR) }
  # any(duplicated(subjectHits(findOverlaps(hs_atac_track,getSummitRanges(hs_atac))))); table(diff(subjectHits(findOverlaps(hs_atac_track,getSummitRanges(hs_atac)))))
  peakList$score[subjectHits(findOverlaps( bw,getSummitRanges(peakList)))] = bw$score[ queryHits(findOverlaps( bw,getSummitRanges(peakList))) ]
  return( peakList) }
GetAPRangesForGenomicRangesObject = function( gro ){
  # gro = hs_atac_human_spe_enhancers_wo_K27_in_NHP
  tp =do.call('rbind', lapply( as.list(seq(1:length(gro))), function(i){
    g = gro[i]
    tss = start(g) + 250
    chr = as.character( chrom(g) )
    peak=names(g)
    return( data.frame( chr = rep( (chr), 200),
                        starts = seq( tss-1000, tss+990, length.out=200),
                        ends =  seq( tss-1000, tss+990, length.out=200)+9,
                        peak = rep( peak, 200),
                        id = rep(i,200 )) ) } ) )
  all_ranges = GRanges(seqnames = Rle(tp$chr),
                       ranges = IRanges(as.numeric(tp$starts),
                                        end = as.numeric(tp$ends),
                                        names = tp$peak ),
                       strand = Rle(rep("*", nrow(tp))),
                       peak = tp$id )
  seqlevelsStyle(all_ranges) = 'ucsc'
  return(all_ranges) }
getGR = function(X,Y,Z){
  GRanges(seqnames = Rle(X),
          ranges = IRanges(as.numeric(Y),
                           end = as.numeric(Z),
                           names = seq(1, length(X))),
          strand = Rle(rep("*", length(X)))) }
plot_volcano <- function(deseq_result_object){
  par(mar=c(5,5,5,5))#, cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
  topT <- as.data.frame(deseq_result_object)
  #Adjusted P values (FDR Q values)
  with(topT, plot(log2FoldChange, -log10(padj), 
                  pch=20, cex=1.0, 
                  xlab=bquote(~Log[2]~fold~change), 
                  ylab=bquote(~-log[10]~Q~value), 
                  xlim=c(-10,10),
                  ylim=c(0,20)))
  with(subset(topT, padj<0.01 & abs(log2FoldChange)>0), 
       points(log2FoldChange, -log10(padj), pch=20, col="red", cex=0.5))
  #with(subset(topT, padj<0.05 & abs(log2FoldChange)>2), text(log2FoldChange, -log10(padj), labels=subset(rownames(topT), topT$padj<0.05 & abs(topT$log2FoldChange)>2),cex=0.8, pos=3))
  #Add lines for absolute FC>1.5 and P-value cut-off at FDR Q<0.01
  abline(v=0, col="black", lty=3, lwd=1.0)
  #abline(v=-1, col="black", lty=4, lwd=2.0)
  #abline(v=1, col="black", lty=4, lwd=2.0)
  abline(h=-log10(max(topT$padj[topT$padj<0.01], na.rm=TRUE)), col="black", lty=4, lwd=2.0)
}
GetTPM = function( m,cols2take,gn ){
  # m=PL_Dat; cols2take=c(7,8,14,15,9,10,11,12,13)
  mb = m[,cols2take]
  cs = colSums( mb )
  f1 = do.call( "cbind", lapply(1:ncol(mb), function(x){
    rpk = 1000*(mb[,x]/m$Length)
    SF = sum(rpk)/1000000
    return( rpk/SF ) } )  )
  colnames(f1) = colnames( mb )
  rownames(f1) = gn
  return(f1)
}
 
3D data
readTADs = function( domain.path ){
  # domain.path="/Volumes/T7/iAstrocyte_evolution/Species_genomes/hic/25kb_domains/pt_mandy_krnorm.all.25kb.topdom.bedpe"
  tp = read.delim( domain.path, as.is=TRUE )
  tp = tp[-1,]
  tp = tp[!is.na(tp$score),]
  domains = getGR(tp[tp$tag=="domain",1],
                  tp[tp$tag=="domain",2],
                  tp[tp$tag=="domain",6])
  names(domains) = tp[tp$tag=="domain",7]
  boundaries = c( getGR(tp[tp$tag=="domain",1],
                        ifelse( tp[tp$tag=="domain",2]>25000, tp[tp$tag=="domain",2]-25000,
                                tp[tp$tag=="domain",2]),
                        tp[tp$tag=="domain",2]+25000),
                  getGR(tp[tp$tag=="domain",1],
                        tp[tp$tag=="domain",6],
                        tp[tp$tag=="domain",6]+25000),
                  getGR(tp[tp$tag=="boundary",1],
                        ifelse( tp[tp$tag=="boundary",2]>25000,
                                tp[tp$tag=="boundary",2]-25000,
                                tp[tp$tag=="boundary",2]),
                        tp[tp$tag=="boundary",2]+25000) )
  boundaries = reduce(boundaries)
  names(boundaries) = paste("boundaires_reduced",1:length(boundaries),sep="_")
  boundaries$mid = rowMeans(cbind(start(boundaries),end(boundaries)))
  res = list(TADs = domains,
             boundaries = boundaries)
  return(res)
}
liftOverBoundaries = function( tdo, chain, WSize ){
  # tdo = man_domains; chain = chain_PtHs; WSize=500
  tp = tdo$boundaries
  tp = reduce(tp)
  names(tp) = paste( "boundary", seq(1,length(tp)),sep="_" )
  tp = GenomicRanges::resize(tp,WSize,fix="center")
  res = vector("list",2)
  
  res[[1]] = tp
  unprocessed_liftover = unlist( liftOver( tp, chain ) )
  # unprocessed_liftover = GenomicRanges::resize(unprocessed_liftover,width(unprocessed_liftover)+50,fix="center") 
  
  print("processing and filtering the liftovers")
  
  processed_liftover = do.call("c",lapply(as.list(unique(names(unprocessed_liftover))),function(b){
    # include some more filtering. First the same chromosome as the other species and then the liftover needs to be at leats 500 bp long
    # b="boundary_580"
    print(b)
    theseB = unprocessed_liftover[which(names(unprocessed_liftover) == b) ]
    theseB = reduce(theseB)
    theseB = theseB[which(width(theseB)>WSize/2)]
    theseB = theseB[which.max(width(theseB))]
    otherSpeciesB = tp[which(names(tp)==b)]
    if(length(theseB)>0){
      chromosomes_match = 0
      if( unique(chrom(theseB))==unique(chrom(otherSpeciesB)) ){ chromosomes_match = c( 1 ) }
      if( unique(chrom(theseB)) %in% c( "chr2A","chr2B" ) 
          & unique(chrom(otherSpeciesB))=="chr2" ){ chromosomes_match = 1 }
      if( unique(chrom(otherSpeciesB)) %in% c( "chr2A","chr2B" ) 
          & unique(chrom(theseB))=="chr2" ){ chromosomes_match = 1 }
      if( chromosomes_match==1 ){ 
        resized=GenomicRanges::resize(theseB,50000,fix="center")
        names(resized) = unique(b)
        return(resized) }  } } 
  ))
  
  res[[2]] = processed_liftover
  # res[[2]]$mid = start( res[[2]] )
  # start(res[[2]] ) = ifelse( start(res[[2]])>25000, start(res[[2]])-25000, start(res[[2]]) )
  # end( res[[2]] ) = start(res[[2]]) + 50000
  names(res) = c("original","lifted_over")
  return(res)
}
getAllBoundaries = function( rep1, rep2 ){
  tp = findOverlaps(rep1,rep2)
  rone = rep1[-queryHits(tp)]
  rone$type="Rep1"
  rtwo = rep2[-subjectHits(tp)]
  rtwo$type = "Rep2"
  cmmon = rep1[queryHits(tp)]
  cmmon$type="common"
  TP = c( rone, cmmon, rtwo)
  names(TP) = paste0("boundaries_",1:length(TP))
  return(TP)
}
binAnno = function( chrSizes, chroms, binSize ){
  
  do.call('rbind', lapply(chroms, function(chr){
    chr.length = seqlengths(chrSizes[chr])
    dongling = chr.length %% binSize
    if( dongling > 0 ) chr.length = chr.length + (binSize-dongling) + 1 
    data.frame( chr=chr,
                start=seq(1, chr.length-binSize, by=binSize),
                end = seq(binSize, chr.length, by=binSize),
                stringsAsFactors=FALSE )
  }))
  
}
transformToSparseMatrix=function( df,row.annotation,col.annotation ){
  # row.annotation=rowAnnotation; col.annotation=colAnnotation
  sm = Matrix( 0, nrow=nrow(row.annotation), 
               ncol=nrow(col.annotation) )
  tp = cbind( row=match(as.numeric(df[,1]+1),row.annotation[,2]),
              col=match(as.numeric(df[,2]+1),col.annotation[,2]) )
  sm[tp] = df[,3]
  return( sm )
  
}  
read.hic_files = function( directory, prefix, suffix, binAnnotation, chroms ){
  
  chrlist = as.list(chroms)
  names(chrlist)= chroms
  
  # prefix="ES1_10kb/ES1_30_"; suffix=".txt"; directory=dumped_dir; binAnnotation=ga
  # prefix=""; suffix=".matrix.txt"; directory=dumped_directory_ele; binAnnotation=ga;chrlist= paste0("chr",c(1:22,"X"))
  lapply( as.list(chrlist), function(chromosome){
    # left = unlist( strsplit( chromosome, ' ' ) )[1] 
    # right = unlist( strsplit( chromosome, ' ' ) )[2]
    # newChromosome = paste(left,right,sep='_')
    # chromosome = "chr1"
    df = read.delim( paste0(directory, prefix, chromosome, suffix ), as.is=T, header=F )
    df = df[!is.na(df$V3),]
    df[,1] = as.numeric(df[,1])
    rowAnnotation = binAnnotation[binAnnotation$chr==chromosome,]
    colAnnotation = binAnnotation[binAnnotation$chr==chromosome,]
    print(chromosome)
    # print(chromosome)
    return( transformToSparseMatrix(df, 
                                    row.annotation = rowAnnotation,
                                    col.annotation = colAnnotation ) )
  } )
}
getGWmatrix_intra=function( list.of.sm, ga, which.object ){
  if( which.object == 'LFM' ) {i=1}
  if( which.object == 'balanced' ) {i=2}
  
  nummSM = Matrix(0, nrow=nrow(ga), ncol=nrow(ga) )
  
  ids = do.call('rbind', lapply( as.list(names(list.of.sm)), function(chrom){
    print( paste0('including ', chrom) )
    if( chrom == 'chr1' ) {start.row = 0; start.col = 0}
    if( chrom != 'chr1')  {start.row = min(which( ga$chr==chrom ))-1
    start.col = min(which( ga$chr==chrom ))-1 }
    tp = as.data.frame(summary(list.of.sm[[chrom]][[i]]),stringsAsFactors=F )
    tp$i = tp$i + start.row
    tp$j = tp$j + start.col
    return(tp)
  }))
  print( 'assembling the matrix, depending on the resolution this might take a while' )
  nummSM[ cbind(ids[,1],ids[,2])]=ids[,3]
  return( nummSM )
  
}
loss.function = function(mat)
{tmp = rowSums(mat)
tmp = tmp[tmp > 0]
rs = abs(tmp - 1)
tmp = colSums(mat)
tmp = tmp[tmp > 0]
cs = abs(tmp - 1)
return( 0.5*sum(c(rs,cs)))  
}
Create_complete_matrix = function(x){
  
  t.sum = as.data.frame(summary(x), stringsAsFactors=FALSE)
  test = sparseMatrix(i = t.sum$i,
                      j = t.sum$j,
                      x = t.sum$x,
                      dims = dim(x),
                      symmetric = TRUE)
  rownames(test) = rownames(x)
  colnames(test) = colnames(x)
  return( test )
  
}
IPF_alg = function(lfm, numberOfIterations){
  
  d =  dim(lfm)[2]
  
  x = matrix(1, nrow = d, ncol = numberOfIterations) 
  y =  matrix(1, nrow = d, ncol = numberOfIterations)
  lf = numeric(numberOfIterations)
  counter = 1  
  
  while( counter <= numberOfIterations) 
  {
    s.temp = rowSums(lfm)
    x[, counter] = ifelse(s.temp > 0,s.temp,1)
    lfm = lfm / x[,counter]
    
    s.temp = colSums(lfm)
    y[, counter] = ifelse(s.temp > 0,s.temp,1)
    
    lfm = t(t(lfm) / y[,counter])
    
    
    lf = c(lf, loss.function(lfm))
    print(paste("loss function: ", lf[length(lf)], "counter: ",  counter))
    counter = counter + 1   
  }
  
  return(list(lfm = lfm, x = x, y=y, lf = lf))
}
normalize_LFM_iteratively = function(x, numberOfIterations){
  
  cm = Create_complete_matrix(x)
  tp = IPF_alg(cm, numberOfIterations = numberOfIterations)
  
  x = tp$x
  y = tp$y
  tp.x = data.frame(x)
  test.x = exp(Reduce('+',log(tp.x)))
  
  tp.y = data.frame(y)
  test.y = exp(Reduce('+',log(tp.y)))
  
  fac = smhuber(test.x/test.y)$mu
  
  van = rowSums(cm) == 0 
  test.x[van] = fac
  
  test.x = test.x/sqrt(fac)
  
  tp = cm / test.x
  tp = t(t(tp) / test.x)
  
  return( list( lfm = tp, x=x, y=y ))
  
}
IPF = function(x, numberOfIterations){
  
  twm = normalize_LFM_iteratively(x, numberOfIterations=numberOfIterations)
  biases = twm[2:3]
  corrected = twm[[1]]
  res = vector("list", 3)
  res[[1]] = x
  res[[2]] = corrected
  res[[3]] = biases
  names(res) = c("LFM", "balanced", "IC_biases")
  return(res)
  
  
}
getSUMMEDsignal4bins = function( B, D, A, M ){
  # B=bins-HOWFAR; D=distance; A=Area; M=thism
  rows = unlist(lapply(as.list(B), function(b){
    rep( ( (b-D) + seq(-A,A)) , (1+2*A))}))
  
  cols = unlist(lapply(as.list(B), function(b){
    rep( ( (b+D) + seq(-A,A) ), each=(1+2*A))}))
  ids = cbind(rows, cols)
  m = M[ids]
  IDS = rep( names(B), each = (1+2*A)^2 )
  # print(c(min(ids),max(ids)))
  matrices = split( m, IDS )
  return( unlist(lapply(matrices,function(m){sum(m, na.rm=T)})) )}
InsulationScore = function( bin, mat, GAGR, distance, Area, HOWFAR ){
  do.call("rbind",lapply( as.list(unique(as.character(chrom(bin)))),function(chr){
    print(paste0("processing ",chr))
    thism = mat[[chr]]$balanced
    theseb = GenomicRanges::resize( bin[which(chrom(bin)==chr)], 1, fix="center")
    
    bins = GAGR$binid[queryHits(findOverlaps(GAGR,theseb))]
    names(bins) = names(theseb[subjectHits(findOverlaps(GAGR,theseb))])
    
    bins = bins[bins-distance-HOWFAR-Area>0 & bins+distance+HOWFAR+Area<(length(GAGR[which(chrom(GAGR)==chr)]))]
    
    return(data.frame( left=getSUMMEDsignal4bins(B=bins-HOWFAR, D=distance, A=Area, M=thism ),
                       middle=getSUMMEDsignal4bins(B=bins, D=distance, A=Area, M=thism ),
                       right=getSUMMEDsignal4bins(B=bins+HOWFAR, D=distance, A=Area, M=thism ) )) } )) }
 
Sequence analysis
Figure_out_mismatching_sequences = function( region_ref, 
                                             ref_sequences, 
                                             liftover_to_the_other_species,
                                             other_species_sequence){
  # 7072,3169
  # enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_39067";
  # enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_1026"
  # enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_4338";
  # enh="ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_601"; region_ref=enhancers_to_assess[which(names(enhancers_to_assess)==enh)];ref_sequences=hs_enhancers_sequences[which(names(hs_enhancers_sequences)==enh)];liftover_to_the_other_species=hs_atac_mapped_in_chimp_mod[which(names(hs_atac_mapped_in_chimp_mod)==enh)];other_species_sequence=hs_enhancers_Pt_sequences[which(names(hs_enhancers_Pt_sequences)==enh)]
  # ref_sequence=enhancers_linked_with_activation_seq_Hs["ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_2521"]
  # sub_sequence = enhancers_linked_with_activation_seq_Pt["ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peak_2521"]
  # chr1:1354454-1354599
  # attention - lift over gives us often only a piece of our sequence
  # we need to account for that
  # mismatches should also fall within the comparable sequence
  
  res = data.frame()
  for( i in 1:length(region_ref)){
    ref_gr = region_ref
    peak_in_the_other_species = liftover_to_the_other_species
    ref_sequence = ref_sequences
    sub_sequence = other_species_sequence
    sub_mat <- nucleotideSubstitutionMatrix(match = 2, mismatch = -3, baseOnly = TRUE)
    
    alignmR = pairwiseAlignment(ref_sequence,reverseComplement(sub_sequence),substitutionMatrix = sub_mat, gapOpening = 5, gapExtension = 2 )
    alignmF = pairwiseAlignment(ref_sequence,(sub_sequence), substitutionMatrix = sub_mat, gapOpening = 5, gapExtension = 2 )
    
    
    if( score(alignmR) > score(alignmF)) { alignm=alignmR
    LD = nedit( alignm ) }
    if( score(alignmR) <= score(alignmF)) { alignm=alignmF
    LD = nedit(alignm) }
    
    
    ps = unlist(strsplit(as.character(pattern(alignm)),""))
    ss = unlist(strsplit(as.character(subject(alignm)),""))
    DEL = which( ps =="-" )
    
    # if( length(DEL)>0){
    #   x=reduce(IRanges(start=DEL,end=DEL))
    #   if( length(x)==1 ){ DELP=start(x) + start(ref_gr)-1 }
    #   if( length(x)>1 ) { DELP=c(start(x)[1], 
    #                              start(x)[2:length(x)]-cumsum(width(x))[1:(length(x)-1)] ) + min(start(ref_gr))-1}
    #   deletions = data.frame( seqnames = Rle( rep( as.character( chrom(ref_gr) ),length(DELP) ) ),
    #                               start = as.numeric(DELP),
    #                               end=DELP,
    #                               names = rep( names(ref_gr), length(DELP) ),
    #                               type = rep( "Deletion", length(DELP) ) )
    #   res = rbind(res,deletions)}
    
    if( length(DEL)>0){
      x=reduce(IRanges(start=DEL,end=DEL))
      if( length(x)==1 ){ DELP=start(x) + start(ref_gr)-1 }
      if( length(x)>1 ) { DELP=c( start(x) - cumsum(c(0,width(x)) )[1:(length(x))] ) + min(start(ref_gr))-1}
      deletions = data.frame( seqnames = Rle( rep( as.character( chrom(ref_gr) ),length(DELP) ) ),
                              start = as.numeric(DELP),
                              end=DELP,
                              names = rep( names(ref_gr), length(DELP) ),
                              type = rep( "Deletion", length(DELP) ) )
      res = rbind(res,deletions)}
    
    ss = ss[ps!="-"]
    ps = ps[ps!="-"]
    
    mm = which( ps!=ss & ss != "-" )
    insertions = which( ss == "-" )
    
    
    if( length(mm)>0  ){
      positions = c( start(ref_gr)-1 + mm )
      positions_GR = data.frame( seqnames = Rle( rep( as.character( chrom(ref_gr) ),length(positions) ) ),
                                 start = as.numeric(positions),
                                 end=positions,
                                 names = rep( names(ref_gr), length(positions) ),
                                 type = rep( "MM", length(positions) ) ) 
      res=rbind(res,positions_GR)}
    
    if( length(insertions)>0   ){ 
      x=reduce(IRanges(start=insertions,end=insertions))
      insertion_GR = data.frame( seqnames=Rle( rep( chrom(ref_gr),length(x) ) ),
                                 start = start(ref_gr)-1+as.numeric(start(x)),
                                 end = start(ref_gr)-1+as.numeric(end(x)),
                                 names = unique(names(ref_gr)),
                                 type="insertion" ) 
      res=rbind(res,insertion_GR)} 
    
    
  }
  return(res) }
readBedtools_res = function(filePath, chroms, extra_column1,extra_column2){
  # filePath='/Users/apekowska/Desktop/Ciuba_et_al_SM/data/outputs/gainedE_TFBS/'
  filesInDir = paste0(filePath, list.files(filePath))
  x = do.call("rbind", lapply(filesInDir, function(x){ read.delim(x,as.is=T,header=F) }) )
  x[,1] = paste0('chr',x[,1])
  x = x[x[,1] %in% chroms,]
  x = x[x$V8>0,]
  return( GRanges( seqnames = Rle(x[,1]),
                   ranges = IRanges(as.numeric(x[,2]+1) ,
                                    end=as.numeric(x[,3]),
                                    names=rownames(x)),
                   peak = x[,extra_column1],
                   TFmotif = x[,extra_column2]) ) }
readBedtools_UCSC = function(filePath, chroms, extra_column1,extra_column2){
  # filePath='/Users/apekowska/Desktop/Ciuba_et_al_SM/data/outputs/gainedE_TFBS/'
  filesInDir = paste0(filePath, list.files(filePath))
  x = do.call("rbind", lapply(filesInDir, function(x){ read.delim(x,as.is=T,header=F) }) )
  # x[,1] = paste0('chr',x[,1])
  x = x[x[,1] %in% chroms,]
  x = x[x$V8>0,]
  return( GRanges( seqnames = Rle(x[,1]),
                   ranges = IRanges(as.numeric(x[,2]+1) ,
                                    end=as.numeric(x[,3]),
                                    names=rownames(x)),
                   peak = x[,extra_column1],
                   TFmotif = x[,extra_column2]) ) }
processTFBSresult = function( tfbsres, tfanno, nameColumn ){
  # tfbsres=lostE_TFBSchange; peaks=lostE; tfanno=TFsEnsemblG
  # add peak name
  tfbs = unlist( lapply( strsplit(tfbsres$TFmotif,"/"),function(x){x[length(x)]}) )
  tfbsres$TF = tfanno$Fixed[ match(tfbs, tfanno[,nameColumn]) ]
  return( tfbsres ) }
makeMatrixTFBS4peaks = function( tfmut, theTFs, allPeaks ){
  # theTFs=unique(TFsEnsemblG$Fixed);allPeaks=gainedE;tfmut=gainedE_TFBSchange
  res = matrix(0L,
               nrow=length(allPeaks),
               ncol=length(unique(theTFs)) )
  res = as.data.frame(res)
  rownames(res) = names(allPeaks)
  colnames(res) = unique(theTFs)
  tp = split( tfmut$peak, tfmut$TF )
  for( tf in unique(theTFs)) {
    # tf="ZNF713"
    thisC = which( colnames(res)==tf )
    theseRows = which( rownames(res) %in% tp[[tf]])
    coordineates = cbind(row=theseRows,
                         col=rep(thisC,length(theseRows)))
    res[ coordineates ] = 1
  }
  return(res)
}