# Gene mapping as of February 2022.
# Query Ensembl BioMart (human gene dataset, "useast" mirror) for gene- and
# transcript-level annotation of every gene ID listed in human_count$Geneid.
ensembl_hg38 = useEnsembl(biomart="ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", mirror="useast")
genemap = getBM( attributes = c("ensembl_gene_id","chromosome_name","start_position",
                                "end_position","transcript_start","transcript_end",
                                "transcript_length","strand","percentage_gene_gc_content",
                                "transcription_start_site","external_gene_name",
                                "go_id","gene_biotype","hgnc_symbol","arrayexpress"),
                 filters = "ensembl_gene_id",
                 values = human_count$Geneid,
                 mart = ensembl_hg38)
# Save the table under the name `genemap`: that is the object created above and
# the one the rest of the script reads after load()-ing this .RData file.
# (No object called `ensembl_hg38_genemap` is ever created.)
save(genemap, file=paste0(objects_directory,"ensembl_hg38_genemap.RData"))
# Import the hg38 Ensembl GTF and flatten the fields needed for TSS calling
# into a plain data frame (one row per GTF record).
gtf = import(paste0(outputs_directory,'hg38_ensembl.gtf'))
promoters_ap = data.frame( chr=as.character(chrom(gtf)),
                           start=as.numeric(start(gtf)),
                           end=as.numeric(end(gtf)),
                           strand=as.character(strand(gtf)),
                           transcript_id=as.character(gtf$transcript_id),
                           gene_id=as.character(gtf$gene_id),
                           gene_name = as.character(gtf$gene_name),
                           gene_biotype = as.character(gtf$gene_biotype),
                           type = gtf$type,
                           stringsAsFactors = FALSE )
length(unique(promoters_ap$gene_id))
# Keep only the transcript-level records and group them by transcript ID.
promoters_ap = promoters_ap[promoters_ap$type == "transcript",]
promoters_sp = split(promoters_ap,promoters_ap$transcript_id)
## for each transcript find the TSS
promoters_tss = do.call('rbind', lapply( promoters_sp, function(x){
    # The strand is one value per transcript, so a scalar if/else is used
    # (instead of the vectorised ifelse()) and the strand is tested only once.
    if( as.character(unique(x$strand))=="+" ){
      # "+" strand: the TSS is the smallest start coordinate
      tp = x[which.min(x$start),]
      tp$tss = tp$start
    } else {
      # any other strand: the TSS is the largest end coordinate
      tp = x[which.max(x$end),]
      tp$tss = tp$end
    }
    tp
  }))
## 
# Build a window from TSS-500 to TSS+500 around every transcript TSS, named by
# transcript ID, with gene-level annotation carried as metadata columns.
promoters_tss_gr = GRanges(seqnames = promoters_tss$chr,
                           ranges = IRanges(as.numeric(promoters_tss$tss)-500 ,
                                            end=as.numeric(promoters_tss$tss) + 500,
                                            names=promoters_tss$transcript_id),
                           strand = promoters_tss$strand,
                           gene_id = promoters_tss$gene_id,
                           gene_name = promoters_tss$gene_name,
                           gene_biotype = promoters_tss$gene_biotype,
                           tss = promoters_tss$tss)
seqlevelsStyle(promoters_tss_gr) = 'ucsc'
# For each transcript window store the index of an overlapping hs_me3 range
# (0 = no overlap). The overlap is computed once and reused for both the query
# and the subject indices. When a window overlaps several hs_me3 ranges, the
# last hit listed is the one that remains after assignment.
# NOTE(review): hs_me3 is defined outside this section - presumably a GRanges
# of human me3 peaks; confirm.
promoters_tss$me3_peak = 0
me3_hits = findOverlaps(promoters_tss_gr,hs_me3)
promoters_tss$me3_peak[queryHits(me3_hits)]=subjectHits(me3_hits)
length(unique(promoters_tss$gene_id))
# Sanity check: the GRanges and the data frame are in the same transcript order.
all(names(promoters_tss_gr)==promoters_tss$transcript_id)
# Group the transcript-level TSS table by gene and print the number of genes.
promoters_tss_split = split(promoters_tss,promoters_tss$gene_id)
length(promoters_tss_split)
# Keep exactly one TSS row per gene.
#  * If any transcript of the gene overlaps an me3 peak, choose among those
#    transcripts only: largest TSS when the strand is '-', smallest otherwise.
#  * If none overlaps, choose among all transcripts: smallest TSS when the
#    strand is '+', largest otherwise.
promoters_filtered = do.call('rbind', lapply(promoters_tss_split,function(p){
  if( sum(p$me3_peak)>0 ){
    candidates = p[p$me3_peak>0,]
    take_largest = unique(candidates$strand)=='-'
  } else {
    candidates = p
    take_largest = unique(p$strand)!='+'
  }
  chosen = if( take_largest ) which.max(candidates$tss) else which.min(candidates$tss)
  candidates[chosen,]
  } ))
length(unique(promoters_filtered$gene_id))
# Window from TSS-500 to TSS+500 around the single TSS kept per gene, named by
# gene ID.
promoters_filtered_gr = GRanges(seqnames = promoters_filtered$chr,
                        ranges = IRanges(as.numeric(promoters_filtered$tss)-500 ,
                                         end=as.numeric(promoters_filtered$tss) + 500,
                                         names=promoters_filtered$gene_id),
                        strand = promoters_filtered$strand,
                        gene_id = promoters_filtered$gene_id,
                        gene_name = promoters_filtered$gene_name,
                        gene_biotype = promoters_filtered$gene_biotype,
                        tss = promoters_filtered$tss)
seqlevelsStyle(promoters_filtered_gr)='ucsc'
save( promoters_filtered, promoters_filtered_gr, promoters_tss,promoters_tss_gr, 
      file=paste0(objects_directory,'tss_objects.RData') )
# Load previously saved objects. (Each statement below is on its own line;
# they had been fused onto the end of the preceding statement.)
load(paste0(objects_directory,"Zhang_DataBundle.RData"))
load(paste0(objects_directory,"GTF_Annotation.RData"))
load(paste0(objects_directory,"ensembl_hg38_genemap.RData"))
# One row per Ensembl gene ID (first occurrence kept).
genemapu = genemap[!duplicated(genemap$ensembl_gene_id),]
# Plot colours keyed by species code.
species.colors = c( 'MF' = '#66CCFF', 'HS' = '#000000', 'PT' = '#FF3300', 'MM' = '#0033FF')
countdata_zhang = read.table(file = paste0(outputs_directory, "Zhang_gene_counts_redownloaded.txt"), header = TRUE)
# Pool the Zhang counts into one `fetal` and one `adult` column, keeping gene
# Length, as input for TPM normalisation. The columns summed into `fetal` are
# picked by position; those summed into `adult` by column-name pattern.
zhang_countdata_4tpm = data.frame(
  fetal=rowSums(countdata_zhang[,c(36,28,29,23,24,11)]),
  adult=rowSums(countdata_zhang[,colnames(countdata_zhang) %like% "YO_ATL_Astro|YO_HPC_Astro"]),
  Length=countdata_zhang$Length,
  row.names = countdata_zhang$Geneid,
  stringsAsFactors = FALSE)
# NOTE(review): GetTPM() is defined outside this section; presumably it
# converts columns 1:2 to TPM using the Length column - confirm.
zhang_countdata_tpm = as.data.frame(GetTPM(zhang_countdata_4tpm,1:2,
                                           rownames(zhang_countdata_4tpm)))
# Expression classes: "expressed" = above 1 in fetal or adult,
# "not expressed" = below 0.1 in both.
expressed = zhang_countdata_tpm[zhang_countdata_tpm$fetal>1 | zhang_countdata_tpm$adult>1, ]
not_expressed = zhang_countdata_tpm[zhang_countdata_tpm$fetal<0.1 & zhang_countdata_tpm$adult<0.1, ]
expressed_fetal = zhang_countdata_tpm[zhang_countdata_tpm$fetal>1, ]
expressed_adult = zhang_countdata_tpm[zhang_countdata_tpm$adult>1, ]
# Gene IDs exclusive to one stage, and those shared by both. Row names of a
# data frame are unique, so setdiff()/intersect() return the same IDs in the
# same order as the equivalent %in% filters.
expressed_only_fetal = setdiff(rownames(expressed_fetal), rownames(expressed_adult))
expressed_only_adult = setdiff(rownames(expressed_adult), rownames(expressed_fetal))
expressed_fetal_adult = intersect(rownames(expressed_fetal), rownames(expressed_adult))
# Gene symbols of the marker sets (Fetal_Markers / Adult_Markers hold Ensembl IDs).
fetal_markers_geneName = unique(genemap$hgnc_symbol[genemap$ensembl_gene_id %in% Fetal_Markers])
adult_markers_geneName = unique(genemap$hgnc_symbol[genemap$ensembl_gene_id %in% Adult_Markers])
# Based on data quality and previous analyses, the selected samples are
# processed and filtered. We perform a differential expression analysis of
# genes between 4 species - Humans, Chimps, Rhesus Macaques and Crab Eating
# Macaques - based upon their expression profile on the Consensus Genome. This
# sheet details the steps of the differential analysis, with relevant graphs
# for an overview of the data, and finally lists the significant hits based on
# the canonical workflow using DESeq2.
# NOTE(review): the sample metadata defined further down contains only the
# species codes HS, PT and MM - confirm the species count stated above.
# Read the two count tables: the main multi-species table and the TCW
# iAstrocyte table (the latter also provides the Geneid and Length columns
# used below).
countdata = read.table(file = paste0(outputs_directory,"featureCounts_Counts_MO_All.tsv"), header = TRUE)
countdata_tcw_iAstrocytes = read.table(file = paste0(outputs_directory,'tcw_latest_gene_counts.txt'), header = TRUE)
## prep the tables
# Give the sample columns (7 onwards) of the TCW table readable names.
colnames(countdata_tcw_iAstrocytes)[7:ncol(countdata_tcw_iAstrocytes)]=c('tcw_3651_Astros','tcw_3651_NPCs','tcw_9319_Astros',
                                                                         'tcw_9429_Astros', 'tcw_9429_NPCs','tcw_BJ_Astros',
                                                                         'Cerebral_Cortex_pAstros','Midbrain_pAstros')
# Both tables must list the genes in the same order before their columns are
# combined.
all(rownames(countdata) == countdata_tcw_iAstrocytes$Geneid)
## [1] TRUE
# One column per sample; counts from several runs of the same sample are
# summed, and the three TCW astrocyte columns come from the TCW table.
countdata = data.frame( PrimaryFetal_F = countdata$PrimaryFetal_F,
                        PrimaryFetal_M = countdata$PrimaryFetal_M,
                        PrimaryFetal_1 = countdata$PrimaryFetal_1,
                        HSapiens_ELE10 = countdata$HSapiens_ELE10_1 + countdata$HSapiens_ELE10_2,
                        HSapiens_ELE30 = countdata$HSapiens_ELE30_1 + countdata$HSapiens_ELE30_2,
                        HSapiens_TCW_F1 = countdata_tcw_iAstrocytes[,'tcw_3651_Astros'],
                        HSapiens_TCW_F3 = countdata_tcw_iAstrocytes[,'tcw_9319_Astros'],
                        HSapiens_TCW_F4 = countdata_tcw_iAstrocytes[,'tcw_9429_Astros'],
                        Chimp_SandraA = countdata$Chimp_Sandra_BD1 + countdata$Chimp_Sandra_BD2 + countdata$Chimp_Sandra_nwNPC,
                        Chimp_Mandy6 = countdata$Chimp_Mandy6 + countdata$Chimp_Mandy6_New,
                        Chimp_Mandy4 = countdata$Chimp_Mandy4_New,
                        RhMacaque_Becky = countdata$RhMacaque_Becky_BD1 + countdata$RhMacaque_Becky_BD2,
                        row.names = rownames(countdata))
sample_names = c("PrimaryFetal_F",
                 "PrimaryFetal_M",
                 "PrimaryFetal_1",
                 "HSapiens_ELE10",
                 "HSapiens_ELE30",
                 "HSapiens_TCW_F1",
                 "HSapiens_TCW_F3",
                 "HSapiens_TCW_F4",
                 "Chimp_SandraA",
                 "Chimp_Mandy6",
                 "Chimp_Mandy4",
                 "RhMacaque_Becky" )
# Setting up metadata for included samples
# (all vectors follow the column order of `countdata` / `sample_names`)
species = c(rep("HS",8),rep("PT",3),"MM")
sources = c(rep("Fetal",3),rep("iPSC",9))
sub_class = c("PF","PF","PF","ELE10","ELE30","TCW_F1","TCW_F3","TCW_F4",
              "SandraA","Mandy6","Mandy4",
              "Becky")
metadata = data.frame(species=as.factor(species),
                      sources=as.factor(sources),
                      class=as.factor(sub_class),
                      row.names = sample_names,
                      gender=c('F','M','F',rep('F',9)),
                      lab=c('other','other',rep("PL",3),rep("other",3),rep("PL",4)))
metadata$lp = 1:nrow(metadata)
# Sanity check: metadata rows line up with the count-matrix columns.
all(colnames(countdata)==rownames(metadata))
## [1] TRUE
# Creating a TPM normalized table for the read counts for all genes
# Gene lengths, taken from the TCW table and aligned to the rows of `countdata`.
Length=countdata_tcw_iAstrocytes$Length[match(rownames(countdata),countdata_tcw_iAstrocytes$Geneid)]
# NOTE(review): GetTPM() is defined outside this section; presumably it
# converts the listed sample columns to TPM using the Length column - confirm.
tpm_norm_count_table = GetTPM(data.frame(cbind(countdata,Length=Length)),
                                         seq_len(ncol(countdata)),
                                         rownames(countdata))
# Creating a TPM normalized table for the read counts for all genes
# (Zhang TPMs and this study's TPMs side by side; rows must be in the same order)
all(rownames(tpm_norm_count_table)==rownames(zhang_countdata_tpm))
## [1] TRUE
tpm_norm_count_all = cbind( zhang_countdata_tpm,tpm_norm_count_table )
all(rownames(zhang_countdata_tpm) == rownames(tpm_norm_count_table))
## [1] TRUE
all(rownames(tpm_norm_count_table)==rownames(zhang_countdata_tpm))
## [1] TRUE
# filtering out expressed fetal and adult genes
# (stricter cut-off of 5 on the Zhang TPM values)
expressed_fetal_str = zhang_countdata_tpm[zhang_countdata_tpm$fetal>5, ]
expressed_adult_str = zhang_countdata_tpm[zhang_countdata_tpm$adult>5, ]
expressed_only_fetal_str = rownames(expressed_fetal_str)[! rownames(expressed_fetal_str) %in% rownames(expressed_adult_str) ]
expressed_only_adult_str = rownames(expressed_adult_str)[! rownames(expressed_adult_str) %in% rownames(expressed_fetal_str) ]
# Per-sample Zhang counts (same column selection as used for the pooled
# fetal/adult table), indexed by gene ID.
zhang_countdata_DS = data.frame(countdata_zhang[,c(36,28,29,23,24,11)],
                                countdata_zhang[,colnames(countdata_zhang) %like% "YO_ATL_Astro|YO_HPC_Astro"],
                                row.names=countdata_zhang$Geneid)
# Differentiation score per sample: log of (summed counts of Adult_Markers
# genes / summed counts of Fetal_Markers genes), for the Zhang samples and for
# this study's samples.
# NOTE(review): log() is the natural logarithm, while the y-axis labels below
# read "Log[2]" - confirm which one is intended.
zhang_countdata_DS = log( colSums(zhang_countdata_DS[rownames(zhang_countdata_DS) %in% Adult_Markers,])/
                          colSums(zhang_countdata_DS[rownames(zhang_countdata_DS) %in% Fetal_Markers,]) )
countdata_DS = log( colSums(countdata[rownames(countdata) %in% Adult_Markers,])/colSums(countdata[rownames(countdata) %in% Fetal_Markers,]) )
ds = c(zhang_countdata_DS,countdata_DS)
# Group labels in the order of `ds`: Zhang columns first, then this study's
# 3 cultured fetal and 9 iPSC-derived samples.
sampleType = c(rep('acute_fetal',6),rep('acute_adult',15),
               rep('fetal_cultured',3),rep('iAstrocytes',9))
dsd=split(ds,sampleType)
par(mfrow=c(1,1),mar=c(5,4,1,1))
beeswarm(ds ~ sampleType, pch = 19, 
         col = c( 'blue4', 'turquoise3', 'purple3', 'pink4'), 
         method = "swarm", ylim=c(-2,5), ylab="Log[2] Differentiation score" )
axis(2,lwd=2)
box(col="black",lwd=2)
# Same score for the 9 iPSC-derived samples only (elements 4 onwards of
# countdata_DS), grouped by species.
sampleType = factor( c(rep('human',5),rep('chimpanzee',3),rep('rhesus',1)),
                     levels=c('human','chimpanzee','rhesus') )
beeswarm(countdata_DS[4:length(countdata_DS)] ~ sampleType, pch = 19, 
         col = c( 'black', 'red', 'blue'), 
         method = "swarm", ylim=c(-2,5), ylab="Log[2] Differentiation score" )
axis(2,lwd=2)
box(col="black",lwd=2)
# The same plot is drawn a second time in the source document.
beeswarm(countdata_DS[4:length(countdata_DS)] ~ sampleType, pch = 19, 
         col = c( 'black', 'red', 'blue'), 
         method = "swarm", ylim=c(-2,5), ylab="Log[2] Differentiation score" )
axis(2,lwd=2)
box(col="black",lwd=2)
# Gene symbols annotated with GO term GO:0048708 in the BioMart table.
astro_genes = unlist(unique( genemap[genemap$go_id == 'GO:0048708','hgnc_symbol']))
# Extend the GO-derived symbol list with a hand-curated set of gene symbols.
# NOTE(review): matching against genemap$hgnc_symbol below is case-sensitive,
# so entries that are not exact HGNC symbols (e.g. 'GPR37l1' with a lower-case
# "l") cannot match - confirm the intended symbols.
astro_genes = unique( c(astro_genes,
                        'ABL1','ABL2', 'ARP3','ADORA2A', 'AGER', 'AGT',
                        'APP', 'ATF5', 'BIN','BMP2', 'C1QA', 'C5AR1',
                        'CNTF','CNTN2','DAB1','DLL1','DLL3','DRD1',
                        'EIF2B5','EPHA4','F2','FGFR3','GCM1','GFAP',
                        'GM5849','GPR37l1','GRN','HES1','HES5','HMGA2',
                        'ID2','ID4','IFNG','IFNGR1','IL1B','IL6ST',
                        'KDM4A','LAMB2','LDLR','MAG','MAP2K1','MAPK3',
                        'MBD1','MECP2','MT3','MYCN','NF1','NFIX',
                        'NKX2-2','NOG','NOTCH1','NR1D1','NR2E1','NTRK3',
                        'PLP1','PLPP3','POU3F2','PRPF19','PSEN1','PTPN11',
                        'ROR2','S100A8','S100A9','SERPINE2','SHH','SMO',
                        'SOX6','SOX8','SOX9','STAT3','TAL1','TLR4',
                        'TREM2','TSPAN2','TTC21B','VIM', 'SLC1A3'))
# Map the symbols to Ensembl gene IDs (unique ID/symbol pairs).
astro_genes = data.frame(unique(genemap[which(genemap$hgnc_symbol %in% astro_genes),c('ensembl_gene_id','hgnc_symbol')]))
# TPM rows for those genes, relabelled with their gene symbol.
tmp_count_table = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% astro_genes$ensembl_gene_id, ]
rownames(tmp_count_table) = astro_genes$hgnc_symbol[match(rownames(tmp_count_table), astro_genes$ensembl_gene_id)]
count_frame = as.data.frame(log10(tmp_count_table))
# log10(0) is -Inf; set those cells to 0. The comparison is made against the
# numeric -Inf instead of relying on coercion to the string '-Inf'.
count_frame[count_frame == -Inf] = 0
# Order genes by their median value across samples, highest first.
count_frame = count_frame[order(apply(count_frame, 1, median), decreasing = TRUE),]
pheatmap(count_frame, cellheight = 10,
         treeheight_row = 0, 
         cluster_cols = FALSE, 
         cluster_rows = FALSE, 
         scale = "none",
         angle_col = '315')
# We consider comparisons between human and chimpanzee and between human and
# macaque samples separately.
# Differential expression with DESeq2, restricted to female iPSC-derived
# samples. Lines starting with "##" are console output recorded when the
# document was rendered; each statement is on its own line again (they had
# been fused onto the end of the output lines, i.e. commented out).
ids=seq_len(nrow(countdata))
## DEGs in the comparison between humans and chimps
sel_HSvPT = which(metadata$species %in% c("HS","PT") & metadata$sources=="iPSC" & metadata$gender=='F' )
res_HSvPT <- DESeqDataSetFromMatrix(
 countData = countdata[ids,sel_HSvPT],
 colData = metadata[sel_HSvPT,],
 design = ~ 0 + species )
## factor levels were dropped which had no samples
res_HSvPT$species = relevel(res_HSvPT$species, "HS")
res_HSvPT <- DESeq(res_HSvPT,fitType="local")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(res_HSvPT)
## [1] "speciesHS" "speciesPT"
# Shrunken log2 fold changes (ashr) are kept separately; `res_HSvPT` is then
# overwritten with the unshrunken results table for the same contrast.
res_HSvPT_sh <- lfcShrink(res_HSvPT, contrast = c("species","HS","PT"),type='ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res_HSvPT <- results(res_HSvPT, contrast = c("species","HS","PT") )
summary(res_HSvPT)
## 
## out of 46457 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3881, 8.4%
## LFC < 0 (down)     : 3890, 8.4%
## outliers [1]       : 307, 0.66%
## low counts [2]     : 13293, 29%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## DEGs in the comparison between humans and macaque (same procedure)
sel_HSvMM = metadata$species %in% c("HS","MM") & metadata$sources=="iPSC" & metadata$gender=='F'
res_HSvMM <- DESeqDataSetFromMatrix(
 countData = countdata[ids,sel_HSvMM],
 colData = metadata[sel_HSvMM,],
 design = ~ 0 + species
)
## factor levels were dropped which had no samples
res_HSvMM$species = relevel(res_HSvMM$species, "HS")
res_HSvMM <- DESeq(res_HSvMM,fitType="local")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_HSvMM_sh <- lfcShrink(res_HSvMM, contrast = c("species","HS","MM"), type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res_HSvMM <- results(res_HSvMM, contrast = c("species","HS","MM") )
summary(res_HSvMM)
## 
## out of 45191 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 5517, 12%
## LFC < 0 (down)     : 4670, 10%
## outliers [1]       : 133, 0.29%
## low counts [2]     : 15485, 34%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
HSvPT_DEG <- as.data.frame(res_HSvPT)
# Significant genes per comparison, from the unshrunken (`*_DEG_0.1`) and the
# shrunken (`*_DEG_sh`) result tables.
# NOTE: despite the "_0.1" suffix, every table here is filtered at padj <= 0.01.
HSvPT_DEG_0.1 <- HSvPT_DEG %>% filter(padj <= 0.01 )
res_HSvPT_sh = as.data.frame(res_HSvPT_sh)
HSvPT_DEG_sh <- res_HSvPT_sh %>% filter(padj <= 0.01 )
# setDT() converts in place and moves the row names (gene IDs) into column `rn`.
setDT(HSvPT_DEG_0.1, keep.rownames = TRUE)
HSvMM_DEG <- as.data.frame(res_HSvMM)
HSvMM_DEG_0.1 <- HSvMM_DEG %>% filter(padj <= 0.01 )
setDT(HSvMM_DEG_0.1, keep.rownames = TRUE)
res_HSvMM_sh = as.data.frame(res_HSvMM_sh)
HSvMM_DEG_sh <- res_HSvMM_sh %>% filter(padj <= 0.01 )
# Overlap between the two significant gene sets.
sum( HSvMM_DEG_0.1$rn %in% HSvPT_DEG_0.1$rn )
## [1] 1271
sum(! HSvMM_DEG_0.1$rn %in% HSvPT_DEG_0.1$rn )
## [1] 3885
# The shrunken fold changes are copied by position, so both tables of a pair
# must list the same genes in the same order. The `*_DEG_sh` tables are plain
# data frames that keep the gene IDs in their row names and have no `rn`
# column; comparing `$rn` on them compares NULL and is TRUE for any input.
# Compare the row names instead, for both comparisons, and stop on a mismatch.
stopifnot( identical(rownames(HSvMM_DEG_sh), HSvMM_DEG_0.1$rn),
           identical(rownames(HSvPT_DEG_sh), HSvPT_DEG_0.1$rn) )
HSvMM_DEG_0.1$lfc_sh = HSvMM_DEG_sh$log2FoldChange
HSvPT_DEG_0.1$lfc_sh = HSvPT_DEG_sh$log2FoldChange
# Human versus chimpanzee - volcano
# Human versus macaque - volcano
# Largely congruent changes in gene expression
# tpm_norm_count_table_thresholded_top  = apply(tpm_norm_count_table,2,function(x){x>(quantile(x[x>0])[3])})
# Logical matrix: TRUE where a gene's TPM exceeds 1 in a sample.
tpm_norm_count_table_thresholded_top  = tpm_norm_count_table>1
## ------------------------------
# Gene annotation (columns 1, 7 and 12 of the GTF annotation table, later
# named ensembl_id, hgnc_symbol and gene_biotype) joined with the fold changes
# of both comparisons. merge() keeps only genes significant in both.
log_fold_dat <- gtf_annotation_table[,c(1,7,12)]
# Collapse biotype sub-classes. Assigning into the column (rather than into a
# row subset) also works when no row matches the pattern.
log_fold_dat$gene_biotype[log_fold_dat$gene_biotype %like% "pseudogene"] = "pseudogene"
log_fold_dat$gene_biotype[log_fold_dat$gene_biotype %like% "TR_"] = "TR_genes"
log_fold_dat$rn = log_fold_dat$ensembl_gene_id
# Columns 1, 3, 8 of the DEG tables are rn, log2FoldChange and lfc_sh.
log_fold_dat = merge(log_fold_dat,HSvPT_DEG_0.1[,c(1,3,8)], by='rn')
colnames(log_fold_dat)[5] ="HSvPT_lfc"
colnames(log_fold_dat)[6] ="HSvPT_lfc_shrunk"
log_fold_dat = merge(log_fold_dat,HSvMM_DEG_0.1[,c(1,3,8)], by='rn')
colnames(log_fold_dat)[7] ="HSvMM_lfc"
colnames(log_fold_dat)[8] ="HSvMM_lfc_shrunk"
## ------------------------------
all(colnames(tpm_norm_count_table) == rownames(metadata))
## [1] TRUE
# Mean TPM per species over the female iPSC-derived samples (a single sample
# for macaque).
tpm_norm_count_table_df = data.frame( human = rowMeans(tpm_norm_count_table[,which(metadata$species =="HS" & metadata$sources=="iPSC" & metadata$gender=='F')]),
                                      chimp = rowMeans(tpm_norm_count_table[,which(metadata$species =="PT" & metadata$sources=="iPSC" & metadata$gender=='F')]),
                                      macaque=tpm_norm_count_table[,which(metadata$species =="MM" & metadata$sources=="iPSC" & metadata$gender=='F')],
                                      rn=rownames(tpm_norm_count_table) )
log_fold_dat = merge(log_fold_dat, tpm_norm_count_table_df, by='rn')
tpm_norm_count_table_df$rn=NULL
log_fold_dat$rn=NULL
# Scatter plot of the two comparisons for four biotypes, coloured by biotype.
log_fold_dat_biotype=log_fold_dat[log_fold_dat$gene_biotype %in% c('protein_coding',
                                                    'pseudogene',
                                                    'lncRNA','miRNA'),]
log_fold_dat_biotype$col = rep('steelblue',nrow(log_fold_dat_biotype))
log_fold_dat_biotype$col[log_fold_dat_biotype$gene_biotype=='pseudogene']='thistle3'
log_fold_dat_biotype$col[log_fold_dat_biotype$gene_biotype=='lncRNA']='red3'
log_fold_dat_biotype$col[log_fold_dat_biotype$gene_biotype=='miRNA']='black'
log_fold_dat_biotype$DEX = 0
# log_fold_dat_biotype$DEX[log_fold_dat_biotype$ensembl_gene_id %in% eid] = 1
par(mfrow=c(1,1),mar=c(5,5,5,5),cex.lab=2,pty='s')
# x carries the human-vs-chimpanzee values and y the human-vs-macaque values;
# the axis labels now say so (they were the other way round).
plot(x=log_fold_dat_biotype$HSvPT_lfc_shrunk,
     y=log_fold_dat_biotype$HSvMM_lfc_shrunk,
     xlim=c(-10,10),ylim=c(-10,10),
     col=log_fold_dat_biotype$col,pch=19,cex=0.5,
     xlab='Hs vs. Pt',ylab='Hs vs. Mm',axes=FALSE)
axis(1,lwd=2,cex.axis=2)
axis(2,lwd=2,cex.axis=2)
abline(a=0,b=1)
abline(h=0,v=0,lwd=2,col='gray')
box(col='black',lwd=2)
# NOTE(review): the correlation uses the unshrunken fold changes, whereas the
# plot above shows the shrunken ones - confirm this is intended.
cor.test(log_fold_dat_biotype$HSvPT_lfc,log_fold_dat_biotype$HSvMM_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  log_fold_dat_biotype$HSvPT_lfc and log_fold_dat_biotype$HSvMM_lfc
## t = 42.744, df = 1243, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7479379 0.7930088
## sample estimates:
##       cor 
## 0.7714392
colnames(log_fold_dat)[1:3] = c("ensembl_id","hgnc_symbol","gene_biotype")
# EAGs: for how many of these genes is expression detected in human astrocytes?
# Genes changed in the same direction in both comparisons.
# Lines starting with "##" are console output recorded at render time.
HS_UP_Genes <- log_fold_dat %>% filter(HSvPT_lfc > 0 & HSvMM_lfc > 0)
HS_DN_Genes <- log_fold_dat %>% filter(HSvPT_lfc < (-0) & HSvMM_lfc < (-0) )
all(HS_UP_Genes$ensembl_id %in% HSvMM_DEG_0.1$rn )
## [1] TRUE
all(HS_UP_Genes$ensembl_id %in% HSvPT_DEG_0.1$rn )
## [1] TRUE
hits_up = as.data.frame(HS_UP_Genes)
hits_dn = as.data.frame(HS_DN_Genes)
dim(hits_up) # 677
## [1] 677  10
dim(hits_dn) # 486
## [1] 486  10
# Fraction of hits that are in the Zhang "expressed" set.
sum(hits_up$ensembl_id %in% rownames(expressed))/nrow(hits_up)
## [1] 0.8227474
sum(hits_dn$ensembl_id %in% rownames(expressed))/nrow(hits_dn)
## [1] 0.8395062
# Keep up hits with TPM > 1 in more than 3 of the female human iPSC samples,
# and down hits with TPM > 1 in more than 2 of the chimpanzee/macaque samples.
hits_up = hits_up[hits_up$ensembl_id %in% rownames(tpm_norm_count_table_thresholded_top[rowSums(tpm_norm_count_table_thresholded_top[,which(metadata$species =="HS" & metadata$sources=="iPSC" & metadata$gender=='F')])>3,]),]
hits_dn = hits_dn[hits_dn$ensembl_id %in% rownames(tpm_norm_count_table_thresholded_top[rowSums(tpm_norm_count_table_thresholded_top[,which(metadata$species %in% c("PT","MM"))])>2,]),]
sum(hits_up$ensembl_id %in% rownames(expressed))/nrow(hits_up)
## [1] 0.8464052
sum(hits_dn$ensembl_id %in% rownames(expressed))/nrow(hits_dn)
## [1] 0.8883929
## boxplot expression in zhang
# Number of up (u) and down (d) hits per Zhang expression class.
u = cbind(not_expressed = sum(hits_up$ensembl_id %in% rownames(not_expressed )),
          fetal=sum(hits_up$ensembl_id %in% expressed_only_fetal ),
               adult=sum(hits_up$ensembl_id %in% expressed_only_adult ),
               both=sum(hits_up$ensembl_id %in% expressed_fetal_adult ))
d = cbind(not_expressed = sum(hits_dn$ensembl_id %in% rownames(not_expressed )),
          fetal=sum(hits_dn$ensembl_id %in% expressed_only_fetal ),
               adult=sum(hits_dn$ensembl_id %in% expressed_only_adult ),
               both=sum(hits_dn$ensembl_id %in% expressed_fetal_adult ))
m=rbind(u,d)
m
##      not_expressed fetal adult both
## [1,]            33   106    61  351
## [2,]            26    29    26  343
par(lwd=2, cex.axis=1.5,mar=c(5,5,1,1),pty='m')
# The y axis is labelled "%", so the row-wise fractions are scaled to 0-100.
barplot(100*t(m/rowSums(m)),col=c('red4','white','black','gray'),
        ylab="%",names=c("Up","Down"),xlab="EAGs",
        cex.names=2.5,cex.lab=2)
axis(2,lwd=3)
par(mfrow=c(2,2))
# Fold-change distributions per biotype: up hits (top row of the 2x2 layout)
# and down hits (bottom row), for the chimpanzee and the macaque comparison.
# Lines starting with "##" are console output recorded at render time.
hits_up_split_pt = split(hits_up$HSvPT_lfc,hits_up$gene_biotype)
boxplot( hits_up_split_pt[c('protein_coding','lncRNA','pseudogene')],ylim=c(0,15),col='white',border=c('steelblue','red3','thistle3'))
hits_up_split_mm = split(hits_up$HSvMM_lfc,hits_up$gene_biotype)
boxplot( hits_up_split_mm[c('protein_coding','lncRNA','pseudogene')],ylim=c(0,15),col='white',border=c('steelblue','red3','thistle3'))
hits_dn_split_pt = split(hits_dn$HSvPT_lfc,hits_dn$gene_biotype)
boxplot( hits_dn_split_pt[c('protein_coding','lncRNA','pseudogene')],ylim=c(-15,0),col='white',border=c('steelblue','red3','thistle3'))
hits_dn_split_mm = split(hits_dn$HSvMM_lfc,hits_dn$gene_biotype)
boxplot( hits_dn_split_mm[c('protein_coding','lncRNA','pseudogene')],
         ylim=c(-15,0),col='white',border=c('steelblue','red3','thistle3'))
# Keep only hits that are in the Zhang "expressed" set.
hits_up = hits_up[hits_up$ensembl_id %in% rownames(expressed),]
dim(hits_up)
## [1] 518  10
hits_dn = hits_dn[hits_dn$ensembl_id %in% rownames(expressed),]
dim(hits_dn)
## [1] 398  10
sum(hits_up$ensembl_id %in% Fetal_Markers)
## [1] 25
sum(hits_dn$ensembl_id %in% Fetal_Markers)
## [1] 26
sum(hits_up$ensembl_id %in% Adult_Markers)
## [1] 20
sum(hits_dn$ensembl_id %in% Adult_Markers)
## [1] 25
# Drop fetal marker genes from the up hits and adult marker genes from the
# down hits.
hits_up = hits_up[! hits_up$ensembl_id %in% Fetal_Markers,]
hits_dn = hits_dn[! hits_dn$ensembl_id %in% Adult_Markers,]
dim(hits_up)
## [1] 493  10
dim(hits_dn)
## [1] 373  10
par(lwd=2, cex.axis=1.5,mar=c(5,5,3,1),mfrow=c(1,1))
barplot( c(up=nrow(hits_up),
           down=nrow(hits_dn)), 
         col=c("green4","wheat3"),
         ylim=c(0,500),ylab="EAGs",cex.axis = 1.5, cex.lab=2)
axis(2,lwd=2)
x=hits_up$ensembl_id
# Write the final up (x) and down (y) gene ID lists, one ID per line.
# Lines starting with "##" are console output recorded at render time.
y=hits_dn$ensembl_id
write.table(y,file=paste0(outputs_directory,'dn_engs.txt'),quote=FALSE, row.names=FALSE,col.names=FALSE,sep='\n')
write.table(x,file=paste0(outputs_directory,'up_engs.txt'),quote=FALSE, row.names=FALSE,col.names=FALSE,sep='\n')
## are genes affected by evolution frequently totally on or off??
# Up hits with TPM below 0.1 in both chimpanzee and macaque, by biotype.
table(hits_up[hits_up$chimp<0.1 & hits_up$macaque<0.1,'gene_biotype'])
## 
##         lncRNA protein_coding     pseudogene            TEC 
##              7              1              6              1
nrow(hits_up[hits_up$chimp<0.1 & hits_up$macaque<0.1,])
## [1] 15
# Down hits with TPM below 0.1 in human, by biotype.
table(hits_dn[hits_dn$human<0.1,'gene_biotype'])
## 
##         lncRNA        Mt_tRNA protein_coding     pseudogene 
##              3              1             11              2
nrow(hits_dn[hits_dn$human<0.1,])
## [1] 17
all_Deseqs = merge( HSvPT_DEG,HSvMM_DEG,by=0,all=TRUE ) # data frame of merged results
save( hits_dn, hits_up, HS_DN_Genes, HS_UP_Genes, all_Deseqs,log_fold_dat,tpm_norm_count_table,
      file=paste0(objects_directory,"DEseq2_RNA.RData"))
load(paste0(objects_directory,"bda_final.RData"))
# Disease association of the up and down hits, using the bda_final table
# (columns used here: ensid, Gene.symbol, Disease).
# Lines starting with "##" are console output recorded at render time.
bda_final = bda_final[bda_final$ensid %in% rownames(countdata),]
tot_n_EAG = nrow(hits_up) + nrow(hits_dn)
# Disease entries whose gene matches an up hit (x) or a down hit (y), by
# Ensembl ID or by gene symbol; one row per gene/disease pair.
x = bda_final[bda_final$ensid %in% hits_up$ensembl_id | bda_final$Gene.symbol %in% hits_up$hgnc_symbol,]
x$or = paste(x$ensid,x$Disease,sep='-')
x = x[!duplicated(x$or),]
y = bda_final[bda_final$ensid %in% hits_dn$ensembl_id | bda_final$Gene.symbol %in% hits_dn$hgnc_symbol,]
y$or = paste(y$ensid,y$Disease,sep='-')
y = y[!duplicated(y$or),]
length(unique(c(x$Disease,y$Disease)))
## [1] 23
# Per-disease counts for up (X) and down (Y) hits, laid out over all diseases
# ordered by their frequency in bda_final.
All_diseases = table(bda_final$Disease)
X = table(x$Disease)
Y = table(y$Disease)
All_diseases = All_diseases[order(All_diseases,decreasing=TRUE)]
nulv = rep(0, length(All_diseases))
names(nulv) = names(All_diseases)
nulv1=nulv
nulv1[match(names(X),names(nulv1))]=X
nulv2=nulv
nulv2[match(names(Y),names(nulv2))]=Y
m=rbind(nulv1,nulv2)
# Keep only diseases with at least one hit.
mcut1=m[,colSums(m)>0]
dim(mcut1)
## [1]  2 23
par(mar=c(12,4,1,1),mfrow=c(1,1))
barplot(mcut1, beside=TRUE, col=c('green4','wheat3'),las=2,
        ylim=c(0,25),axes=FALSE,ylab="EAG")
axis(2,lwd=3)
# 2x2 contingency table: rows = disease-linked / not disease-linked genes,
# columns = up / down hits. The second row is the complement (all hits minus
# the disease-linked ones) so that the four cells do not overlap, which
# prop.test() and fisher.test() require.
n_up_disease = length(unique(x$Gene.symbol))
n_dn_disease = length(unique(y$Gene.symbol))
M = matrix( c( n_up_disease,
               length(unique(hits_up$ensembl_id)) - n_up_disease,
               n_dn_disease,
               length(unique(hits_dn$ensembl_id)) - n_dn_disease ),
            ncol=2,nrow=2,
            dimnames=list(c("disease","no_disease"),c("up","down")) )
# NOTE: every "##" output kept below in this section was recorded with the
# earlier table layout, whose second row held all hits (493 / 373) instead of
# the non-disease hits; re-run to refresh it.
prop.test( M )
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  M
## X-squared = 20.933, df = 1, p-value = 0.000004757
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.3106748 -0.1256033
## sample estimates:
##    prop 1    prop 2 
## 0.3511450 0.5692841
fisher.test( M )
## 
##  Fisher's Exact Test for Count Data
## 
## data:  M
## p-value = 0.000003401
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2727455 0.6096183
## sample estimates:
## odds ratio 
##  0.4098171
M
##      [,1] [,2]
## [1,]   46   85
## [2,]  493  373
# Same table restricted to genes linked to "Intellectual Disability".
n_up_id = length(unique(x$Gene.symbol[x$Disease=="Intellectual Disability"]))
n_dn_id = length(unique(y$Gene.symbol[y$Disease=="Intellectual Disability"]))
M = matrix( c( n_up_id,
               length(unique(hits_up$ensembl_id)) - n_up_id,
               n_dn_id,
               length(unique(hits_dn$ensembl_id)) - n_dn_id ),
            ncol=2,nrow=2,
            dimnames=list(c("disease","no_disease"),c("up","down")) )
chisq.test( M )
## Warning in chisq.test(M): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  M
## X-squared = 8.2604, df = 1, p-value = 0.004052
fisher.test( M )
## 
##  Fisher's Exact Test for Count Data
## 
## data:  M
## p-value = 0.001251
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.0000000 0.4471496
## sample estimates:
## odds ratio 
##          0
unique(bda_final$Disease)
##   [1] "Autism Spectrum Disorder"                                         
##   [2] "Alzheimer's Disease"                                              
##   [3] "Amyotrophic Lateral Sclerosis"                                    
##   [4] "Multiple Sclerosis"                                               
##   [5] "Epilepsy"                                                         
##   [6] "Intracranial Aneurysm"                                            
##   [7] "Neuroblastoma"                                                    
##   [8] "Parkinson's Disease"                                              
##   [9] "Restless legs Syndrome"                                           
##  [10] "Meningioma"                                                       
##  [11] "Narcolepsy"                                                       
##  [12] "Glioma"                                                           
##  [13] "Prader-Willi Syndrome"                                            
##  [14] "Progressive Supranuclear Plasy"                                   
##  [15] "Restless Legs Syndrome"                                           
##  [16] "Rett Syndrome"                                                    
##  [17] "Rolandic Epilepsy with Speech impairment"                         
##  [18] "Shy Drager Syndrome"                                              
##  [19] "Spasmodic Dysphonia"                                              
##  [20] "Stroke"                                                           
##  [21] "Tay-Sachs Disease"                                                
##  [22] "Tourette Syndrome"                                                
##  [23] "Tuberous Sclerosis"                                               
##  [24] "Von Hippel-Lindau Syndrome"                                       
##  [25] "X-linked Hydrocephalus"                                           
##  [26] "Agenesis Corpus Callosum"                                         
##  [27] "Alopecia with Mental Retardation"                                 
##  [28] "Alpha-Thalassemia X-Linked Intellectual Disability Syndrome"      
##  [29] "Alternating Hemiplegia of Childhood"                              
##  [30] "Aphasia"                                                          
##  [31] "Attention Deficit Hyperactivity Disorder"                         
##  [32] "Autosomal Dominant Nocturnal Frontal Lobe Epilepsy"               
##  [33] "Autosomal Dominant Partial Epilepsy with Auditory Features"       
##  [34] "Autosomal Recessive Cerebellar Ataxia Type 1"                     
##  [35] "Batten Disease"                                                   
##  [36] "Benign Familial Neonatal Seizures"                                
##  [37] "Benign Hereditary Chorea"                                         
##  [38] "Cerebral Aneurysm"                                                
##  [39] "Cerebellar Ataxia, Mental Retardation and Disequilibrium Syndrome"
##  [40] "Cerebral Palsy"                                                   
##  [41] "Cerebro-Oculo-Facio-Skeletal Syndrome"                            
##  [42] "Cerebrocostomandibular Syndrome"                                  
##  [43] "Charcot-Marie-Tooth Disease"                                      
##  [44] "Chiari Malformation"                                              
##  [45] "Chronic Inflammatory Demyelinating Polyneuropathy"                
##  [46] "Coma"                                                             
##  [47] "Creutzfeldt Jakob Disease"                                        
##  [48] "Dementia (Non Alzheimer)"                                         
##  [49] "Down Syndrome"                                                    
##  [50] "Dysautonomia"                                                     
##  [51] "Dyslexia"                                                         
##  [52] "Dyspraxia"                                                        
##  [53] "Dystonia"                                                         
##  [54] "Encephalitis"                                                     
##  [55] "Essential Tremor"                                                 
##  [56] "Familial Focal Epilepsy with Variable Foci"                       
##  [57] "Ferro-Cerebro-Cutaneous Syndrome"                                 
##  [58] "Friedreich Ataxia"                                                
##  [59] "Gaucher Disease"                                                  
##  [60] "Generalized Epilepsy with Febrile Seizures Plus"                  
##  [61] "Huntington's Disease"                                             
##  [62] "Hydrocephalus"                                                    
##  [63] "Intellectual Disability"                                          
##  [64] "Meningitis"                                                       
##  [65] "Motor Neurone Disease"                                            
##  [66] "Muscular Dystrophy"                                               
##  [67] "Neurodegenerative Disease"                                        
##  [68] "Paraganglioma"                                                    
##  [69] "Schizophrenia"                                                    
##  [70] "Pontocerebellar Hypoplasia"                                       
##  [71] "Depression Disorder"                                              
##  [72] "Neurofibromatosis"                                                
##  [73] "Major Depression Disorder"                                        
##  [74] "Ischemic Stroke"                                                  
##  [75] "Ataxia Telangiectasia"                                            
##  [76] "Spinocerebellar Ataxia"                                           
##  [77] "Smith-Magenis Syndrome"                                           
##  [78] "Anorexia Nervosa"                                                 
##  [79] "Bipolar Disorder"                                                 
##  [80] "Frontotemporal Lobar Degeneration"                                
##  [81] "Neurodevelopmental Disability"                                    
##  [82] "Panic Disorder"                                                   
##  [83] "Post-traumatic Stress Disorder"                                   
##  [84] "Amyotrophic lateral Sclerosis"                                    
##  [85] "Angelman Syndrome"                                                
##  [86] "Cerebral infarction"                                              
##  [87] "Cognitive Functions and Neuronal plasticity"                      
##  [88] "Fragile X Syndrome"                                               
##  [89] "Neurological Disorder"                                            
##  [90] "Non-functioning Pituitary Adenoma"                                
##  [91] "Pituitary Adenoma"                                                
##  [92] "Plexiform Neurofibroma"                                           
##  [93] "Prader-willi Syndrome and Angelman Syndrome"                      
##  [94] "Psychiatric Disease"                                              
##  [95] "West Syndrome"                                                    
##  [96] "Non-functioning Pituitary Neoplasms"                              
##  [97] "Pituitary Neoplasms"                                              
##  [98] "Forebrain Ischemia"                                               
##  [99] "Status Epilepticus"                                               
## [100] "Acute Cerebral Infarction"                                        
## [101] "Acute Cerebral Ischemia"                                          
## [102] "Brain Neoplasms"                                                  
## [103] "Cerebellum Cancer"                                                
## [104] "Cerebral Cavernous Malformation"                                  
## [105] "Cerebral Ischemia"                                                
## [106] "Cerebral Malaria"                                                 
## [107] "Encephalomyelitis"                                                
## [108] "Intracerebral Hemorrhage"                                         
## [109] "Mild Cognitive Impairment"                                        
## [110] "Neurilemmoma"                                                     
## [111] "Neuroendocrine Tumor"                                             
## [112] "Neuroepithelial Tumor"                                            
## [113] "Neuroma"                                                          
## [114] "Neuronal Apoptosis-Related Disease"                               
## [115] "Frontotemporal Dementia"                                          
## [116] "Anxiety Disorder"                                                 
## [117] "Acute Ischemic Stroke"                                            
## [118] "Aneurysmal Subarachnoid Hemorrhage"                               
## [119] "Central Nervous System Embryonal Tumor"id_DAVID_all = read.delim(paste0(outputs_directory,"uniprotkb_keyword_KW_0991_2023_09_01.tsv"),
                          header=TRUE)                                
# Flatten the UniProt keyword table (file uniprotkb_keyword_KW_0991_*) into one
# character vector of gene symbols: Gene.Names is split per Entry and then on
# single spaces.
id_DAVID_all = unlist(lapply(split(id_DAVID_all$Gene.Names,id_DAVID_all$Entry),
                             function(x){strsplit(x," ")}))
# Translate the symbols to Ensembl gene IDs through the de-duplicated gene map.
id_DAVID_all_ensg = unique( genemapu$ensembl_gene_id[genemapu$external_gene_name %in% id_DAVID_all ] )
# Fisher's exact test on a 2x2 matrix filled column-wise:
#   column 1 = number of down / up hits that belong to the keyword gene set,
#   column 2 = total number of down / up hits.
# NOTE(review): column 2 holds group totals, not the counts of hits outside the
# gene set (nrow(hits_*) - members). A standard contingency table would use the
# non-member counts; confirm this is intended before reusing the odds ratio.
fisher.test(matrix(c(sum(hits_dn$ensembl_id %in% id_DAVID_all_ensg),
                     sum(hits_up$ensembl_id %in% id_DAVID_all_ensg),
                     nrow(hits_dn),
                     nrow(hits_up)),2,2))## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(sum(hits_dn$ensembl_id %in% id_DAVID_all_ensg), sum(hits_up$ensembl_id %in% id_DAVID_all_ensg), nrow(hits_dn), nrow(hits_up)), 2, 2)
## p-value = 0.000000000003697
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##     7.914837 1916.514987
## sample estimates:
## odds ratio 
##   47.39888hits_dn$hgnc_symbol[hits_dn$hgnc_symbol %in% id_DAVID_all ]##  [1] "KMT2E"   "SYN1"    "CDH1"    "NUP133"  "ATP2B1"  "OPHN1"   "KAT6A"  
##  [8] "ZC3H14"  "CTCF"    "SMC3"    "FBXW7"   "KMT5B"   "ARL6"    "CEP104" 
## [15] "ZMYM2"   "FGF13"   "DPP6"    "ATP8A2"  "CDK8"    "DPF2"    "STXBP1" 
## [22] "FBXO11"  "ASXL2"   "PHIP"    "TLK2"    "RBMX"    "AFF2"    "PHF6"   
## [29] "DYRK1A"  "ZNF148"  "HNRNPH1" "SOX11"   "DCC"     "USP7"    "ZNF292" 
## [36] "PGAP1"hits_up$hgnc_symbol[hits_up$hgnc_symbol %in% id_DAVID_all ]## [1] "ZNHIT3"Heatmap of ID related genes
# Build the heatmap input for down-regulated hits that belong to the keyword
# gene set: log10(TPM + 0.1), one row per gene.
idgenesensembl = hits_dn$ensembl_id[ hits_dn$ensembl_id %in% id_DAVID_all_ensg ]
# Rows are selected with %in%, so they come back in the row order of
# tpm_norm_count_table (not in the order of idgenesensembl). Columns 1-2 are
# skipped -- presumably non-sample annotation columns; confirm.
tpm_norm_count_table_ID = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% idgenesensembl,
                                               3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_id = as.data.frame(log10(0.1+tpm_norm_count_table_ID))
# Label every row from its OWN Ensembl ID. Matching on idgenesensembl instead
# would attach gene names in a different order than the rows (and fail outright
# when a hit is absent from the TPM table, because the lengths would differ).
rownames(tpm_norm_count_table_id) = genemapu$external_gene_name[match(rownames(tpm_norm_count_table_id),
                                                                      genemapu$ensembl_gene_id)]
# Open a PNG device for the ID-gene heatmap; the device is closed by the
# dev.off() call that follows this block.
png(
  paste0(plots_directory, "/ID_heatmap.png"),
  width = 5000,
  height = 10000,
  res = 1200
)
# Row-scaled heatmap: genes are clustered (dendrogram hidden via
# treeheight_row = 0), samples keep their column order.
pheatmap(
  tpm_norm_count_table_id,
  color = colorRampPalette(c("blue", "white", "red"))(100),
  scale = "row",
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  treeheight_row = 0,
  cellheight = 10,
  angle_col = "315"
)
dev.off()## quartz_off_screen 
##                 3up_fa = read.delim(paste0(outputs_directory,"hits_up_DAVID_KEGG.txt"))
# Genes DAVID assigned to the "extracellular exosome" GO term among the
# up-regulated hits; the Genes column is a ", "-separated list of Ensembl IDs.
exosomal_genes = unique( unlist(strsplit(up_fa$Genes[up_fa$Term=="GO:0070062~extracellular exosome"],", ")) )
# Drop ENSG00000285762 (presumably because it has no usable HGNC symbol for the
# row labels below -- confirm). A logical mask is used instead of -which():
# x[-which(cond)] returns an EMPTY vector when nothing matches, which would
# silently wipe the whole gene list if the ID ever disappears from the input.
exosomal_genes = exosomal_genes[!(exosomal_genes %in% "ENSG00000285762")]
# log10(TPM + 0.1) for those genes; columns 1-2 of the TPM table are skipped.
tpm_norm_count_table_EX = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% exosomal_genes,
                                               3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_ex = as.data.frame(log10(0.1+tpm_norm_count_table_EX))
# Relabel rows with the HGNC symbol of the matching up-regulated hit.
rownames(tpm_norm_count_table_ex) = hits_up$hgnc_symbol[match(rownames(tpm_norm_count_table_ex),hits_up$ensembl_id)]
  
# Open a PNG device for the exosome-gene heatmap; the device is closed by the
# dev.off() call that follows this block.
png(
  paste0(plots_directory, "/Exosome_heatmap_GeneNames.png"),
  width = 5000,
  height = 10000,
  res = 1200
)
# Row-scaled heatmap: genes are clustered (dendrogram hidden via
# treeheight_row = 0), samples keep their column order.
pheatmap(
  tpm_norm_count_table_ex,
  color = colorRampPalette(c("blue", "white", "red"))(100),
  scale = "row",
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  treeheight_row = 0,
  cellheight = 10,
  angle_col = "315"
)
dev.off()## quartz_off_screen 
##                 3go_dn = read.delim( paste0(outputs_directory,'dn_engs_DAVID_KEGG.txt' ))
# Sort by decreasing Benjamini-adjusted p-value: horizontal bars are drawn
# bottom-up, so the smallest p-values end up at the top of the plot.
go_dn <- go_dn[order(go_dn$Benjamini, decreasing = TRUE), ]
# Keep terms with adjusted p-value below 0.05.
go_dn <- go_dn[go_dn$Benjamini < 0.05, ]
# anyGo = part of the term label before the first ":"; only rows whose prefix
# is "GO" or "hsa01100" are retained.
go_dn$anyGo <- unlist(lapply(
  strsplit(go_dn$Term, ":"),
  function(parts) {
    parts[[1]]
  }
))
go_dn <- go_dn[go_dn$anyGo %in% c("GO", "hsa01100"), ]
# Wide left margin (30 lines) leaves room for the long term labels.
par(mfrow = c(1, 1), mar = c(5, 30, 1, 1))
barplot(
  -log10(go_dn$Benjamini),
  names.arg = go_dn$Term,
  horiz = TRUE,
  las = 2,
  xlim = c(0, 20),
  xlab = "-Log[10]B-H adj. P-val"
)
axis(1,lwd=2,las=2)Genes related to nucleus
length(unique(unlist(strsplit(go_dn$Genes,", "))))## [1] 249go_up = read.delim( paste0(outputs_directory,'hits_up_DAVID_KEGG.txt' ))
# Sort by decreasing Benjamini-adjusted p-value: horizontal bars are drawn
# bottom-up, so the smallest p-values end up at the top of the plot.
go_up <- go_up[order(go_up$Benjamini, decreasing = TRUE), ]
# Keep terms with adjusted p-value below 0.01 (stricter than the 0.05 used for
# the down-regulated set).
go_up <- go_up[go_up$Benjamini < 0.01, ]
# anyGo = part of the term label before the first ":"; only rows whose prefix
# is "GO" or "hsa01100" are retained.
go_up$anyGo <- unlist(lapply(
  strsplit(go_up$Term, ":"),
  function(parts) {
    parts[[1]]
  }
))
go_up <- go_up[go_up$anyGo %in% c("GO", "hsa01100"), ]
# Wide left margin (20 lines) leaves room for the term labels.
par(mfrow = c(1, 1), mar = c(5, 20, 1, 1))
barplot(
  -log10(go_up$Benjamini),
  names.arg = go_up$Term,
  horiz = TRUE,
  las = 2,
  xlim = c(0, 3),
  xlab = "-Log[10]B-H adj. P-val"
)
axis(1,lwd=2,las=2)pluripotencyGenes = read.delim(paste0(outputs_directory,'Conserved_Pluripotency_genes.txt'),header=FALSE,as.is=TRUE)
# Reduce a BAM-derived column name to the bare sample label by removing
# everything up to and including the alignment-path prefix and then the BAM
# suffix. Names that do not contain the patterns are returned unchanged, so a
# single odd column can no longer shift the labels of the other columns (which
# the previous strsplit()/even-index approach would do silently).
shortSampleNames = function(x){
  x = sub("^.*analyses.star.RNA_Seq_02.22_PanTro_iPSC_WT_", "", x)
  sub("_Rep_1_Aligned.sortedByCoord.out.bam", "", x)
}

# Chimpanzee iPSC gene counts; the first line of the file is skipped.
df = read.delim( paste0(outputs_directory,'gene_counts_Mandy.txt'),skip=1, as.is=TRUE)
# Columns 7-11 are the five samples; column 6 is appended for GetTPM()
# (presumably the gene length -- confirm against the count file header).
countTable = df[,c(7,8,9,10,11,6)]
rownames(countTable) = df$Geneid
chimp_tpm=GetTPM(countTable,1:5,rownames(countTable))
colnames(chimp_tpm) = shortSampleNames(colnames(chimp_tpm))
# Raw counts only (no extra column) for DESeq2. The names are derived from
# countTable itself rather than being sized by ncol(chimp_tpm).
countTable = df[,c(7,8,9,10,11)]
rownames(countTable) = df$Geneid
colnames(countTable) = shortSampleNames(colnames(countTable))
# NOTE(review): the condition labels are positional -- the first four columns
# are treated as "Mandy" and the fifth as "SandraA"; verify the column order.
coldata = data.frame(condition=c(rep("Mandy",4),"SandraA"))
rownames(coldata) = colnames(countTable)
dds <- DESeqDataSetFromMatrix(
 countData = countTable,
 colData = coldata,
 design = ~ condition )## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factorsdds <- DESeq(dds)## estimating size factors## estimating dispersions## gene-wise dispersion estimates## mean-dispersion relationship## final dispersion estimates## fitting model and testingresultsNames(dds)## [1] "Intercept"                  "condition_SandraA_vs_Mandy"res = results(dds)
# Rank the differential-expression table by adjusted p-value.
res <- res[order(res$padj), ]
# Variance-stabilised expression matrix (design-aware, blind = FALSE).
vsdat <- vst(dds, blind = FALSE)
matvsdat <- assay(vsdat)
# Size-factor-normalised counts; genes whose summed normalised count across
# samples is 20 or less are dropped.
mat <- counts(dds, normalized = TRUE)
mat <- mat[rowSums(mat) > 20, ]
# 2 x 2 grid of square panels for pairwise sample comparisons on
# log2(normalised count + 0.1).
par(mfrow = c(2, 2), mar = c(5, 5, 5, 5), pty = "s", bty = "O")
heatscatter(
  log2(0.1 + mat[, "Mandy4"]),
  log2(0.1 + mat[, "SandraA"]),
  colpal = "crazyblue", pch = 19, cex = 0.5,
  xlab = "Mandy4 [log2(counts)]",
  ylab = "SandraA [log2(counts)]"
)
box(col = "black")
heatscatter(
  log2(0.1 + mat[, "Mandy6"]),
  log2(0.1 + mat[, "SandraA"]),
  colpal = "crazyblue", pch = 19, cex = 0.5,
  xlab = "Mandy6 [log2(counts)]",
  ylab = "SandraA [log2(counts)]"
)
box(col = "black")
heatscatter(
  log2(0.1 + mat[, "Mandy4"]),
  log2(0.1 + mat[, "Mandy6"]),
  colpal = "crazyblue", pch = 19, cex = 0.5,
  xlab = "Mandy4 [log2(counts)]",
  ylab = "Mandy6 [log2(counts)]"
)
box(col="black")par(mfrow=c(1,1),mar=c(7,5,5,1),bty='n')
boxplot( chimp_tpm[, 'Mandy6'],
         chimp_tpm[rownames(chimp_tpm) %in% pluripotencyGenes$V1, 'Mandy6'],
         chimp_tpm[, 'Mandy4'],
         chimp_tpm[rownames(chimp_tpm) %in% pluripotencyGenes$V1, 'Mandy4'],
         chimp_tpm[, 'SandraA'],
         chimp_tpm[rownames(chimp_tpm) %in% pluripotencyGenes$V1, 'SandraA'],
         border=rep(c('gray','red'),3), main='',col='white',
         ylab=expression('TPM'), outline=FALSE, 
         ylim=c(0,60), bty='n',notch=FALSE,lwd=2,
         names=rep(c("all genes","Pluripotency"),3),las=2 )id_DAVID_all = read.delim(paste0(outputs_directory,"uniprotkb_keyword_KW_0991_2023_09_01.tsv"),
                          header=TRUE)                                
# Flatten the keyword table into a vector of gene symbols (Gene.Names is split
# per Entry and then on single spaces) and map the symbols to Ensembl IDs.
id_DAVID_all = unlist(lapply(split(id_DAVID_all$Gene.Names,id_DAVID_all$Entry),
                             function(x){strsplit(x," ")}))
id_DAVID_all_ensg = unique( genemapu$ensembl_gene_id[genemapu$external_gene_name %in% id_DAVID_all ] )
# Keyword-set genes that are also down-regulated hits.
id = id_DAVID_all_ensg[id_DAVID_all_ensg %in% hits_dn$ensembl_id]
# Genes DAVID assigned to the "extracellular exosome" GO term among the
# up-regulated hits; the Genes column is a ", "-separated list of Ensembl IDs.
up_fa = read.delim(paste0(outputs_directory,"hits_up_DAVID_KEGG.txt"))
exosomal_genes = unique( unlist(strsplit(up_fa$Genes[up_fa$Term=="GO:0070062~extracellular exosome"],", ")) )
# Exclude ENSG00000285762 with a logical mask: x[-which(cond)] would return an
# EMPTY vector if the ID were ever absent from the list.
exosome = exosomal_genes[!(exosomal_genes %in% "ENSG00000285762")]
# Subset with the filtered vector so the excluded gene really is left out of
# the expression table, matching the heatmap chunk earlier in this report.
# NOTE(review): the original subset used the unfiltered exosomal_genes.
tpm_norm_count_table_EX = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% exosome,
                                               3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_ex = as.data.frame(log10(0.1+tpm_norm_count_table_EX))
rownames(tpm_norm_count_table_ex) = hits_up$hgnc_symbol[match(rownames(tpm_norm_count_table_ex),hits_up$ensembl_id)]human_count = read.delim(paste0(outputs_directory,'gene_counts_human_dec2022.txt'),skip=1)
macaque_count = read.delim(paste0(outputs_directory,'gene_counts_macaque.txt'),skip=1)
all(human_count$Geneid==macaque_count$Geneid)## [1] TRUEAPlab_count = data.frame(Hs_CTX_WT_Brain_S3A1_M = human_count[,7],
                         Hs_CTX_WT_Brain_S7A1_M = human_count[,8],
                         Hs_CTX_WT_Brain_S2A1_M = human_count[,9],
                         Hs_CTX_WT_Brain_S1A1_M = human_count[,10],
                         Hs_CTX_WT_Brain_S6A1_F = human_count[,11],
                         Mm_CTX_WT_Brain_10506_M = macaque_count[,7],
                         Mm_CTX_WT_Brain_10521_F = macaque_count[,8], 
                         row.names = human_count$Geneid)
# Sample sheet for the in-house cortex libraries: five human (HS) followed by
# two macaque (MM) samples, in the column order of APlab_count.
brain_met <- data.frame(
  species = factor(rep(c("HS", "MM"), times = c(5, 2)), levels = c("HS", "MM")),
  sample = "WholeCortex",
  sex = c(rep("M", 4), "F", "M", "F"),
  row.names = colnames(APlab_count)
)
# DESeq2 data set with species as the only design factor.
brain_bulk <- DESeqDataSetFromMatrix(
  countData = APlab_count,
  colData = brain_met,
  design = ~species
)
# Size factors are estimated up front; the later DESeq() call reuses them.
brain_bulk <- estimateSizeFactors(brain_bulk)
brain_bulk <- DESeq(brain_bulk)## using pre-existing size factors## estimating dispersions## gene-wise dispersion estimates## mean-dispersion relationship## final dispersion estimates## fitting model and testingvst_data = vst(brain_bulk, blind=TRUE)
# Regularised-log transform, blind to the design.
log_data <- rlog(brain_bulk, blind = TRUE)
# Size-factor-normalised counts.
normalized_counts <- counts(brain_bulk, normalized = TRUE)
# Human vs macaque contrast: HS is the numerator, so a positive log2 fold
# change means higher expression in human.
brain_bulk_PL_res <- results(brain_bulk, contrast = c("species", "HS", "MM"))
# Significant genes: adjusted p-value present and below 0.01 (which() drops the
# rows whose padj is NA).
brain_bulk_PL_sig <- brain_bulk_PL_res[which(brain_bulk_PL_res$padj < 0.01), ]
# Significant genes with a negative fold change (lower in human).
brain_bulk_PL_sig_down <- brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange < 0, ]
brain_bulk_PL_sig_up = brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange>(0),]brain_tpm = read.delim(paste0(outputs_directory,'Ext_RNASeq_TPMCOUNTS.tsv'),header=TRUE, as.is=TRUE)
# External RNA-seq resource: sample metadata and raw counts.
# Note that brain_met is re-assigned here and from now on refers to the
# external metadata table, indexed by sample name.
brain_met <- read.delim(
  paste0(outputs_directory, "Ext_RNASeq_METADATA.tsv"),
  header = TRUE, as.is = TRUE
)
brain_counts <- read.delim(
  paste0(outputs_directory, "Ext_RNASeq_COUNTDATA.tsv"),
  header = TRUE, as.is = TRUE
)
rownames(brain_met) <- brain_met$sample_names
brain_counts = brain_counts[,match(rownames(brain_met),colnames(brain_counts))]load(paste0(objects_directory,"ensembl_hg38_genemap.RData"))
load(paste0(objects_directory,"GTF_Annotation.RData"))
genemapu = genemap[!duplicated(genemap$ensembl_gene_id),]Load objects from other vignettes
# Objects produced by other vignettes of this analysis.
load(paste0(objects_directory,"bda_final.RData"))
load(paste0(objects_directory,"DEseq2_RNA.RData"))
# Keyword gene table: flatten Gene.Names (split per Entry, then on single
# spaces) into a vector of symbols and map the symbols to Ensembl IDs.
id_DAVID_all = read.delim(paste0(outputs_directory,"uniprotkb_keyword_KW_0991_2023_09_01.tsv"),
                          header=TRUE)
id_DAVID_all = unlist(lapply(split(id_DAVID_all$Gene.Names,id_DAVID_all$Entry),
                             function(x){strsplit(x," ")}))
id_DAVID_all_ensg = unique( genemapu$ensembl_gene_id[genemapu$external_gene_name %in% id_DAVID_all ] )
# Keyword-set genes that are also down-regulated hits.
id = id_DAVID_all_ensg[id_DAVID_all_ensg %in% hits_dn$ensembl_id]
# Genes DAVID assigned to the "extracellular exosome" GO term among the
# up-regulated hits; the Genes column is a ", "-separated list of Ensembl IDs.
up_fa = read.delim(paste0(outputs_directory,"hits_up_DAVID_KEGG.txt"))
exosomal_genes = unique( unlist(strsplit(up_fa$Genes[up_fa$Term=="GO:0070062~extracellular exosome"],", ")) )
# Exclude ENSG00000285762 with a logical mask: x[-which(cond)] would return an
# EMPTY vector if the ID were ever absent from the list.
exosome = exosomal_genes[!(exosomal_genes %in% "ENSG00000285762")]
# Subset with the filtered vector so the excluded gene really is left out of
# the expression table, matching the heatmap chunk earlier in this report.
# NOTE(review): the original subset used the unfiltered exosomal_genes.
tpm_norm_count_table_EX = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% exosome,
                                               3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_ex = as.data.frame(log10(0.1+tpm_norm_count_table_EX))
rownames(tpm_norm_count_table_ex) = hits_up$hgnc_symbol[match(rownames(tpm_norm_count_table_ex),hits_up$ensembl_id)]human_count = read.delim(paste0(outputs_directory,'gene_counts_human_dec2022.txt'),skip=1)
macaque_count = read.delim(paste0(outputs_directory,'gene_counts_macaque.txt'),skip=1)
all(human_count$Geneid==macaque_count$Geneid)## [1] TRUEAPlab_count = data.frame(Hs_CTX_WT_Brain_S3A1_M = human_count[,7],
                         Hs_CTX_WT_Brain_S7A1_M = human_count[,8],
                         Hs_CTX_WT_Brain_S2A1_M = human_count[,9],
                         Hs_CTX_WT_Brain_S1A1_M = human_count[,10],
                         Hs_CTX_WT_Brain_S6A1_F = human_count[,11],
                         Mm_CTX_WT_Brain_10506_M = macaque_count[,7],
                         Mm_CTX_WT_Brain_10521_F = macaque_count[,8], 
                         row.names = human_count$Geneid)
# Sample sheet for the in-house cortex libraries: five human (HS) followed by
# two macaque (MM) samples, in the column order of APlab_count.
brain_met <- data.frame(
  species = factor(rep(c("HS", "MM"), times = c(5, 2)), levels = c("HS", "MM")),
  sample = "WholeCortex",
  sex = c(rep("M", 4), "F", "M", "F"),
  row.names = colnames(APlab_count)
)
# DESeq2 data set with species as the only design factor.
brain_bulk <- DESeqDataSetFromMatrix(
  countData = APlab_count,
  colData = brain_met,
  design = ~species
)
# Size factors are estimated up front; the later DESeq() call reuses them.
brain_bulk <- estimateSizeFactors(brain_bulk)
brain_bulk <- DESeq(brain_bulk)## using pre-existing size factors## estimating dispersions## gene-wise dispersion estimates## mean-dispersion relationship## final dispersion estimates## fitting model and testingvst_data = vst(brain_bulk, blind=TRUE)
# Regularised-log transform, blind to the design.
log_data <- rlog(brain_bulk, blind = TRUE)
# Size-factor-normalised counts.
normalized_counts <- counts(brain_bulk, normalized = TRUE)
# Human vs macaque contrast: HS is the numerator, so a positive log2 fold
# change means higher expression in human.
brain_bulk_PL_res <- results(brain_bulk, contrast = c("species", "HS", "MM"))
# Significant genes: adjusted p-value present and below 0.01 (which() drops the
# rows whose padj is NA).
brain_bulk_PL_sig <- brain_bulk_PL_res[which(brain_bulk_PL_res$padj < 0.01), ]
# Significant genes with a negative fold change (lower in human).
brain_bulk_PL_sig_down <- brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange < 0, ]
brain_bulk_PL_sig_up = brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange>(0),]brain_tpm = read.delim(paste0(outputs_directory,'Ext_RNASeq_TPMCOUNTS.tsv'),header=TRUE, as.is=TRUE)
# External RNA-seq resource: sample metadata and raw counts.
# Note that brain_met is re-assigned here and from now on refers to the
# external metadata table, indexed by sample name.
brain_met <- read.delim(
  paste0(outputs_directory, "Ext_RNASeq_METADATA.tsv"),
  header = TRUE, as.is = TRUE
)
brain_counts <- read.delim(
  paste0(outputs_directory, "Ext_RNASeq_COUNTDATA.tsv"),
  header = TRUE, as.is = TRUE
)
rownames(brain_met) <- brain_met$sample_names
brain_counts = brain_counts[,match(rownames(brain_met),colnames(brain_counts))]Retain normal Cortex and Female samples and perform DESeq2 based normalisation
# Select the Khaitovich Lab samples that are normal, female and whose source
# matches 'Cortex'. The same mask picks the count columns and the metadata
# rows, which relies on brain_counts columns being ordered like brain_met rows.
kl_cortex_female <- brain_met$lab == "Khaitovich Lab" &
  brain_met$sources %like% 'Cortex' &
  brain_met$condition == "Normal" &
  brain_met$sex == "F"
klrna <- brain_counts[, kl_cortex_female]
brain_met_kl <- brain_met[kl_cortex_female, ]
brain_bulk_kl = DESeqDataSetFromMatrix(
 countData = klrna,
 colData = brain_met_kl,
 design = ~ species )## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factorsbrain_bulk_kl = estimateSizeFactors(brain_bulk_kl)
brain_bulk_kl_normalized = counts(brain_bulk_kl, normalized=TRUE )
brain_bulk_kl = DESeqDataSetFromMatrix(
 countData = klrna,
 colData = brain_met_kl,
 design = ~ species )## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factorsbrain_bulk_kl = estimateSizeFactors(brain_bulk_kl)
brain_bulk_kl = DESeq(brain_bulk_kl)## using pre-existing size factors## estimating dispersions## gene-wise dispersion estimates## mean-dispersion relationship## final dispersion estimates## fitting model and testingbrain_bulk_kl_normalized = counts(brain_bulk_kl, normalized=TRUE )
brain_bulk_KL_res = results(brain_bulk_kl, contrast = c("species","HS","MM") )
brain_bulk_KL_res2 = results(brain_bulk_kl, contrast = c("species","HS","PT") )
brain_bulk_KL_res3 = results(brain_bulk_kl, contrast = c("species","HS","PP") )All the other diseases - do they show such a trend? Fix the gene names! https://www.mirbase.org/ftp.shtml
# "choroby" (Polish: diseases): number of annotated genes per disease, keeping
# only diseases with more than 30 entries in bda_final.
choroby <- table(bda_final$Disease)
choroby <- choroby[choroby > 30]

# Ensembl IDs grouped by disease, restricted to the retained diseases.
in_frequent_disease <- bda_final$Disease %in% names(choroby)
genes_by_disease <- split(
  bda_final$ensid[in_frequent_disease],
  bda_final$Disease[in_frequent_disease]
)

# For one DESeq2 result table, return a list (one element per disease) of the
# non-missing log2 fold changes of that disease's genes, named by gene ID.
collectDiseaseLFC <- function(de_results) {
  lapply(genes_by_disease, function(genes) {
    hits <- de_results[rownames(de_results) %in% genes, ]
    hits <- hits[!is.na(hits$log2FoldChange), ]
    setNames(hits$log2FoldChange, rownames(hits))
  })
}

# In-house cortex data, human vs macaque.
LFC_disease <- collectDiseaseLFC(brain_bulk_PL_res)
# External (Khaitovich Lab) data, HS vs MM contrast.
LFC_disease2 <- collectDiseaseLFC(brain_bulk_KL_res)
# External (Khaitovich Lab) data, HS vs PT contrast (brain_bulk_KL_res2).
LFC_disease3 <- collectDiseaseLFC(brain_bulk_KL_res2)
### --------------------
# Per-disease p-value from a one-sample t-test of the log2 fold changes
# (t.test default: mean equal to zero).
ChosenFunction <- function(x) {
  t.test(x)$p.value
}
LFC_disease_pv <- unlist(lapply(LFC_disease, ChosenFunction))
LFC_disease2_pv <- unlist(lapply(LFC_disease2, ChosenFunction))
LFC_disease3_pv <- unlist(lapply(LFC_disease3, ChosenFunction))

# Per-disease effect size: the median log2 fold change. ChosenFunction is
# re-pointed to median and stays that way after this block.
ChosenFunction <- median
LFC_disease_fc <- unlist(lapply(LFC_disease, ChosenFunction))
LFC_disease2_fc <- unlist(lapply(LFC_disease2, ChosenFunction))
LFC_disease3_fc <- unlist(lapply(LFC_disease3, ChosenFunction))

# Palette with one colour per disease (not used by the barplot below).
cols <- colorRampPalette(c("orange3", "white", "aquamarine3"))(length(LFC_disease_fc))

# Bars sorted by increasing median fold change; diseases with p < 0.05 are
# highlighted. The y axis is suppressed here and drawn by the axis() call that
# follows this block.
par(mfrow = c(1, 1), mar = c(15, 4, 5, 1))
fc_order <- order(LFC_disease_fc, decreasing = FALSE)
barplot(
  LFC_disease_fc[fc_order],
  col = ifelse(LFC_disease_pv[fc_order] < 0.05, "aquamarine3", "gray80"),
  las = 2,
  axes = FALSE,
  ylim = c(-1, 1),
  ylab = "log[2]FC (Human/NHP)"
)
axis(2,lwd=2,las=2,cex.lab=1.5)par(mfrow=c(1,1),mar=c(15,4,5,1))
# Same display for the second result set: bars sorted by increasing median
# fold change, diseases with p < 0.05 highlighted, y axis drawn afterwards.
fc_order2 <- order(LFC_disease2_fc, decreasing = FALSE)
barplot(
  LFC_disease2_fc[fc_order2],
  col = ifelse(LFC_disease2_pv[fc_order2] < 0.05, "aquamarine3", "gray80"),
  las = 2,
  axes = FALSE,
  ylab = "log[2]FC (Human/NHP)"
)
axis(2,lwd=2,las=2,cex.lab=1.5)par(mfrow=c(1,1),mar=c(15,4,1,1))
# Same display for the third result set: bars sorted by increasing median fold
# change, diseases with p < 0.05 highlighted, y axis drawn afterwards.
fc_order3 <- order(LFC_disease3_fc, decreasing = FALSE)
barplot(
  LFC_disease3_fc[fc_order3],
  col = ifelse(LFC_disease3_pv[fc_order3] < 0.05, "aquamarine3", "gray80"),
  las = 2,
  axes = FALSE,
  ylim = c(-0.4, 0.4),
  ylab = "log[2]FC (Human/NHP)"
)
axis(2,lwd=2,las=2,cex.lab=1.5)library(beeswarm)
library(ggpubr)
library(dplyr)
# Size-factor-normalised counts of the in-house (PL) and external (KL) sets.
normalized_counts_PL <- counts(brain_bulk, normalized = TRUE)
normalized_counts_KL <- counts(brain_bulk_kl, normalized = TRUE)

# Metadata rows of the external samples used above (Khaitovich Lab, source
# matching 'Cortex', normal, female).
samples_KL <- brain_met[
  brain_met$lab == "Khaitovich Lab" &
    brain_met$sources %like% 'Cortex' &
    brain_met$condition == "Normal" &
    brain_met$sex == "F",
]

# Expression of ENSG00000007866 per sample with its species label.
# NOTE(review): the in-house species labels are positional (5 HS, then 2 MM).
tead3_pl <- data.frame(
  expression = normalized_counts_PL['ENSG00000007866', ],
  species = rep(c("HS", "MM"), times = c(5, 2))
)
kl_sample_row <- match(colnames(normalized_counts_KL), samples_KL$sample_names)
tead3_kl <- data.frame(
  expression = normalized_counts_KL['ENSG00000007866', ],
  species = samples_KL$species[kl_sample_row]
)
# Keep only HS, PT and MM samples and fix the plotting order of the species.
tead3_kl <- tead3_kl[tead3_kl$species %in% c("HS", "PT", "MM"), ]
tead3_kl$species <- factor(tead3_kl$species, levels = c("HS", "PT", "MM"))

# Standard error of the mean.
se <- function(x) {
  sd(x) / sqrt(length(x))
}
my_dat <- summarise(group_by(tead3_pl, species), mean=mean(expression),se=se(expression))
ggplot(my_dat, aes(x=species, y=mean, fill=species)) + 
   geom_bar(stat="identity", position=position_dodge()) +
   geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.2,
                 position=position_dodge(.9)) + theme_classic() + ylim(c(0,50)) + scale_fill_manual(values=c('gray','blue')) se <- function(x){sd(x)/sqrt(length(x))}
my_dat <- summarise(group_by(tead3_kl, species), 
                    mean=mean(expression),se=se(expression))
ggplot(my_dat, aes(x=species, y=mean, fill=species)) + 
   geom_bar(stat="identity", position=position_dodge()) +
   geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.2,
                 position=position_dodge(.9)) + theme_classic() + ylim(c(0,90)) + scale_fill_manual(values=c('gray','red','blue')) P-values
brain_bulk_KL_res['ENSG00000007866',]## log2 fold change (MLE): species HS vs MM 
## Wald test p-value: species HS vs MM 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000007866   39.6105        1.75034  0.441979   3.96023 0.000074879
##                      padj
##                 <numeric>
## ENSG00000007866 0.0004964brain_bulk_KL_res3['ENSG00000007866',]## log2 fold change (MLE): species HS vs PP 
## Wald test p-value: species HS vs PP 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000007866   39.6105        1.16246  0.350057   3.32078 0.000897662
##                       padj
##                  <numeric>
## ENSG00000007866 0.00614593brain_bulk_PL_sig['ENSG00000007866',]## log2 fold change (MLE): species HS vs MM 
## Wald test p-value: species HS vs MM 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat     pvalue
##                 <numeric>      <numeric> <numeric> <numeric>  <numeric>
## ENSG00000007866   30.2115        1.78403  0.581936   3.06568 0.00217175
##                       padj
##                  <numeric>
## ENSG00000007866 0.00825217Domains were identified using TOPDOM. We read them in here. We consider boundaries that have support in two replicates. First chunk lifts over the boundary coordinates between human and chimpanzee assemblies.
# 25 kb domain calls: two human samples (ele, pf) and two chimpanzee samples
# (mandy, sandra).
ele_domains <- readTADs(paste0(outputs_directory, "25kb_domains/hs_ele_krnorm.all.25kb.topdom.bedpe"))
fas_domains <- readTADs(paste0(outputs_directory, "25kb_domains/hs_pf_krnorm.all.25kb.topdom.bedpe"))
man_domains <- readTADs(paste0(outputs_directory, "25kb_domains/pt_mandy_krnorm.all.25kb.topdom.bedpe"))
saa_domains <- readTADs(paste0(outputs_directory, "25kb_domains/pt_sandra_krnorm.all.25kb.topdom.bedpe"))

# Human boundaries lifted to the chimpanzee assembly; results are exported as
# BED and saved as R objects.
ele_domains_lift_over_Pt6 <- liftOverBoundaries(ele_domains, chain_HsPt, WSize = 500)
fas_domains_lift_over_Pt6 <- liftOverBoundaries(fas_domains, chain_HsPt, WSize = 500)
bed_file <- c(
  ele_domains_lift_over_Pt6$lifted_over,
  fas_domains_lift_over_Pt6$lifted_over
)
export.bed(bed_file, con = paste0(outputs_directory, "ele_fas_boundaries_lift_Pt6.bed"))
save(
  ele_domains_lift_over_Pt6, fas_domains_lift_over_Pt6,
  file = paste0(objects_directory, "ele_fas_boundaries_lift_Pt6.RData")
)

# Chimpanzee boundaries lifted to the human assembly (bed_file is reused).
man_domains_lift_over_hg38 <- liftOverBoundaries(man_domains, chain_PtHs, WSize = 500)
saa_domains_lift_over_hg38 <- liftOverBoundaries(saa_domains, chain_PtHs, WSize = 500)
bed_file <- c(
  man_domains_lift_over_hg38$lifted_over,
  saa_domains_lift_over_hg38$lifted_over
)
export.bed(bed_file, con = paste0(outputs_directory, "man_saa_boundaries_lift_Hg38.bed"))
save( man_domains_lift_over_hg38, saa_domains_lift_over_hg38, file=paste0(objects_directory, "man_saa_domains_lift_Hg38.RData") )We display the reproducibility
# Re-read the domain calls and reload the lifted-over boundaries saved by the
# previous chunk.
ele_domains <- readTADs(paste0(outputs_directory, "25kb_domains/hs_ele_krnorm.all.25kb.topdom.bedpe"))
fas_domains <- readTADs(paste0(outputs_directory, "25kb_domains/hs_pf_krnorm.all.25kb.topdom.bedpe"))
man_domains <- readTADs(paste0(outputs_directory, "25kb_domains/pt_mandy_krnorm.all.25kb.topdom.bedpe"))
saa_domains <- readTADs(paste0(outputs_directory, "25kb_domains/pt_sandra_krnorm.all.25kb.topdom.bedpe"))
load(paste0(objects_directory, "man_saa_domains_lift_Hg38.RData"))
load(paste0(objects_directory, "ele_fas_boundaries_lift_Pt6.RData"))

# Combined boundary sets per species, built from the two replicates.
all_human_boundaires <- getAllBoundaries(ele_domains$boundaries, fas_domains$boundaries)
all_chimp_boundaires <- getAllBoundaries(man_domains$boundaries, saa_domains$boundaries)

# For the Venn diagram: indices of combined human boundaries that overlap a
# boundary of each replicate.
peak_list <- list(
  ELE30 = unique(queryHits(findOverlaps(all_human_boundaires, ele_domains$boundaries))),
  PF = unique(queryHits(findOverlaps(all_human_boundaires, fas_domains$boundaries)))
)
ggVennDiagram(peak_list,label_alpha=0) + scale_fill_distiller( direction = 1)peak_list = list(Sandra = unique(queryHits(findOverlaps(all_chimp_boundaires,saa_domains$boundaries))),
                 Mandy = unique(queryHits(findOverlaps(all_chimp_boundaires,man_domains$boundaries))) )
ggVennDiagram(peak_list,label_alpha=0) + scale_fill_distiller( direction = 1)m = matrix(c(6100,
             1021,
             769,
             6000,
             593,
             1131),
           ncol = 2, nrow=3,
           byrow = FALSE)
barplot(m,col=c("green4","steelblue3","blue4"), ylim=c(0,8000),ylab="Loops",names=c("Human", "Chimpanzee"))
axis(2,lwd=2)export.bed(all_human_boundaires,con=paste0(outputs_directory,"all_human_boundaires_input.bed"))
export.bed(all_chimp_boundaires,con=paste0(outputs_directory,"all_chimp_boundaires.bed") )Check the evolutionary conservation of the reproducible boundaries.
# Reproducible human boundaries: boundaries of the first replicate that overlap
# at least one boundary of the second. unique() keeps each boundary once even
# when it overlaps several partners; without it such boundaries are duplicated
# in the set and in every file derived from it.
human_boundaires_reproducible = ele_domains$boundaries[unique(queryHits(findOverlaps(ele_domains$boundaries,
                                                                                      fas_domains$boundaries)))]
# Same for chimpanzee, wrapped in a list with a $boundaries element because
# liftOverBoundaries() is called with that structure elsewhere in this report.
chimp_domains = list(
  boundaries = man_domains$boundaries[unique(queryHits(findOverlaps(man_domains$boundaries,
                                                                    saa_domains$boundaries)))]
)
# Lift the reproducible chimpanzee boundaries to the human assembly, then
# export them as BED and save the full lift-over result.
chimp_domains_lift_over_hg38 = liftOverBoundaries( chimp_domains, chain_PtHs, WSize = 500 )
export.bed(chimp_domains_lift_over_hg38$lifted_over,con=paste0(outputs_directory,"chimp_domains_lift_over_hg38.bed"))
save( chimp_domains_lift_over_hg38, file=paste0(objects_directory,"chimp_domains_lift_over_hg38.RData"))
save(human_boundaires_reproducible,file=paste0(objects_directory,"human_boundaires_reproducible.RData"))Display the result
# Reload the reproducible boundary sets saved by the previous chunk.
load(paste0(objects_directory, "chimp_domains_lift_over_hg38.RData"))
load(paste0(objects_directory, "human_boundaires_reproducible.RData"))

# Combined set of human boundaries and lifted-over chimpanzee boundaries.
allBound <- getAllBoundaries(
  human_boundaires_reproducible,
  chimp_domains_lift_over_hg38$lifted_over
)
# For the Venn diagram: indices of combined boundaries supported by each
# species.
peak_list <- list(
  Human = unique(queryHits(findOverlaps(allBound, human_boundaires_reproducible))),
  Chimp = unique(queryHits(findOverlaps(allBound, chimp_domains_lift_over_hg38$lifted_over)))
)
ggVennDiagram(peak_list,label_alpha=0) + scale_fill_distiller( direction = 1)all_evol_shared_boundaries = human_boundaires_reproducible[unique(queryHits(findOverlaps(human_boundaires_reproducible,chimp_domains_lift_over_hg38$lifted_over)))]
all_evol_shared_boundaries_Pt = chimp_domains_lift_over_hg38$original[ which(names(chimp_domains_lift_over_hg38$original) %in% names(chimp_domains_lift_over_hg38$lifted_over[queryHits(findOverlaps(chimp_domains_lift_over_hg38$lifted_over,all_evol_shared_boundaries))]) ) ]Sometimes the human boundaries in the chimp have no reads or are in the regions with an overtly low mappability and vice versa. We want to gent rid of those instances. Boundaries called in human should not be in the vicinity of low coverage regions in the human and in the chimp. Boundaries called in the chimp should not be in the vicinity of low coverage bins in chrim and in human.
## Bins with little long-range signal in one Hi-C data set.
##
## hic          named list with one element per chromosome; each element
##              carries a sparse contact matrix in `$LFM`.
## bins         genomic bins. The bins of chromosome x are indexed by the row
##              sums of hic[[x]]$LFM, so they are assumed to be parallel to
##              the matrix rows -- TODO confirm.
## minDistance  contacts are counted only when j > i + minDistance (in bins).
## minCoverage  a bin is reported when its summed long-range signal is below
##              this value.
##
## Returns the low-coverage bins of all chromosomes, concatenated.
getLowCoverageBins = function(hic, bins, minDistance = 200, minCoverage = 100) {
  do.call("c", lapply(as.list(names(hic)), function(x) {
    print(x)
    thischr = bins[which(chrom(bins) == x)]
    m = hic[[x]]
    # Triplet (i, j, x) view of the sparse matrix, restricted to distant pairs.
    tp = as.data.frame(summary(m$LFM))
    tp = tp[tp$j > (tp$i + minDistance), ]
    # Rebuild a symmetric matrix that holds only the long-range contacts.
    M = Matrix::Matrix(0, nrow = nrow(m$LFM), ncol = ncol(m$LFM), sparse = TRUE)
    M[cbind(tp$i, tp$j)] = tp$x
    M = M + t(M)
    thischr[which(rowSums(M) < minCoverage)]
  }))
}

lowCoverageBinsHG38 = getLowCoverageBins(ele, gagr)
export.bed(lowCoverageBinsHG38, con = paste0(outputs_directory, "lowCoverageBinsHG38.bed"))

lowCoverageBinsPT6 = getLowCoverageBins(mandy, gagr_pt)
export.bed(lowCoverageBinsPT6, con = paste0(outputs_directory, "lowCoverageBinsPaT6.bed"))

lowCoverageBinsHG38_2 = getLowCoverageBins(fa, gagr)
export.bed(lowCoverageBinsHG38_2, con = paste0(outputs_directory, "lowCoverageBinsHG38_2.bed"))

lowCoverageBinsPT6_2 = getLowCoverageBins(sa, gagr_pt)
export.bed(lowCoverageBinsPT6_2, con = paste0(outputs_directory, "lowCoverageBinsPaT6_2.bed"))

save(lowCoverageBinsHG38, lowCoverageBinsPT6,
     file = paste0(objects_directory, "lowCoverageBins.RData"))
save(lowCoverageBinsHG38_2, lowCoverageBinsPT6_2,
     file = paste0(objects_directory, "lowCoverageBins2.RData"))

# Lift over these intervals of low coverage. Then lift over the boundaries for
# the next steps of the analyses.
load(paste0(objects_directory, "lowCoverageBins.RData"))
load(paste0(objects_directory, "lowCoverageBins2.RData"))

# Widen every low-coverage bin to a 50 kb interval around its centre.
lowCoverageBinsHG38 = GenomicRanges::resize(lowCoverageBinsHG38, 50000, fix = "center")
lowCoverageBinsHG38_2 = GenomicRanges::resize(lowCoverageBinsHG38_2, 50000, fix = "center")
lowCoverageBinsPT6 = GenomicRanges::resize(lowCoverageBinsPT6, 50000, fix = "center")
lowCoverageBinsPT6_2 = GenomicRanges::resize(lowCoverageBinsPT6_2, 50000, fix = "center")

# Keep the intervals of the first data set that overlap one of the second.
lowCoverageBinsHG38 = lowCoverageBinsHG38[queryHits(findOverlaps(lowCoverageBinsHG38, lowCoverageBinsHG38_2))]
lowCoverageBinsPT6 = lowCoverageBinsPT6[queryHits(findOverlaps(lowCoverageBinsPT6, lowCoverageBinsPT6_2))]

# Lift over the 500 bp core of each interval, then widen the result to 50 kb.
# used to be 10000
lowCoverageBinsHG38_Pt = GenomicRanges::resize(
  unlist(liftOver(GenomicRanges::resize(lowCoverageBinsHG38, 500, fix = "center"),
                  chain = chain_HsPt)),
  50000, fix = "center")
lowCoverageBinsPt_Hg38 = GenomicRanges::resize(
  unlist(liftOver(GenomicRanges::resize(lowCoverageBinsPT6, 500, fix = "center"),
                  chain = chain_PtHs)),
  50000, fix = "center")
export.bed(lowCoverageBinsPt_Hg38, con = paste0(outputs_directory, "lowCoverageBinsPt_Hg38.bed"))

# Both lifted-over sets are stored. The original call listed
# lowCoverageBinsPt_Hg38 twice, so lowCoverageBinsHG38_Pt was never written
# although the filtering steps that follow read it.
save(lowCoverageBinsHG38_Pt, lowCoverageBinsPt_Hg38,
     file = paste0(objects_directory, "low_coverage_bins_lifted_over.RData"))

# Get to the list of human and chimp specific boundaries. Consider boundaries
# observed in both replicates. Remove boundaries within regions with poor
# mappability in both species (50kb intervals centered on the lifted over
# region).
load(paste0(objects_directory, "ele_fas_boundaries_lift_Pt6.RData"))
load(paste0(objects_directory, "man_saa_domains_lift_Hg38.RData"))
### -----------
# Boundaries of both replicates per species, in native coordinates, plus the
# chimpanzee boundaries after lift-over.
all_human_boundaires_input = getAllBoundaries(ele_domains$boundaries,
                                              fas_domains$boundaries)
all_chimp_boundaires_input = getAllBoundaries(man_domains$boundaries,
                                              saa_domains$boundaries)
all_chimp_boundaires_Hg38 = getAllBoundaries(man_domains_lift_over_hg38$lifted_over,
                                             saa_domains_lift_over_hg38$lifted_over)
export.bed(all_chimp_boundaires_Hg38, con = paste0(outputs_directory, "all_chimp_boundaires_Hg38_tp.bed"))
### --------------------
### remove boundaries that intersect poorly mappable regions in the two species
# A logical mask is used instead of x[-queryHits(...)]: negative indexing with
# an empty hit set would drop every boundary instead of none.
all_human_boundaires = all_human_boundaires_input[
  !overlapsAny(all_human_boundaires_input, c(lowCoverageBinsHG38, lowCoverageBinsPt_Hg38))
]
all_chimp_boundaires = all_chimp_boundaires_input[
  !overlapsAny(all_chimp_boundaires_input, c(lowCoverageBinsPT6, lowCoverageBinsHG38_Pt))
]
export.bed(all_human_boundaires, con = paste0(outputs_directory, "all_human_boundaires_Hg38.bed"))
export.bed(all_chimp_boundaires, con = paste0(outputs_directory, "all_chimp_boundaires_Pt6.bed"))

# Liftovers:
#   - we pick the longest one
#   - lift over needs to be on the same chromosome.
#
# Identify species specific boundaries. To call a boundary species specific it
# needs to be:
#   - found in both replicates of this species
#   - not in a poorly mappable region in either of the two species
#   - never found in the other species
#   - amenable to liftOver.
# Throughout this block, logical masks from overlapsAny() replace
# x[-queryHits(findOverlaps(...))]: with an empty hit set the negative index
# would return an empty object instead of leaving x untouched.

# Reproducible human boundaries never seen among the lifted-over chimpanzee
# boundaries, and outside the poorly covered regions of either species.
human_specific_boundaries = human_boundaires_reproducible[
  !overlapsAny(human_boundaires_reproducible, all_chimp_boundaires_Hg38)
]
human_specific_boundaries = human_specific_boundaries[
  !overlapsAny(human_specific_boundaries,
               reduce(c(lowCoverageBinsHG38, lowCoverageBinsPt_Hg38)))
]

# Mirror image for the chimpanzee.
chimp_specific_boundaries = chimp_domains$boundaries[
  !overlapsAny(chimp_domains$boundaries,
               reduce(c(ele_domains_lift_over_Pt6$lifted_over,
                        fas_domains_lift_over_Pt6$lifted_over)))
]
chimp_specific_boundaries = chimp_specific_boundaries[
  !overlapsAny(chimp_specific_boundaries,
               reduce(c(lowCoverageBinsPT6, lowCoverageBinsHG38_Pt)))
]

## Boundaries need to be able to be lifted over! Otherwise we do not know if the boundary is lost because it is not lifted over or it is lost because it was not called
chimp_specific_boundaries_Hg38 = liftOverBoundaries(list(boundaries = chimp_specific_boundaries), chain_PtHs, WSize = 500)
human_specific_boundaries_Pt = liftOverBoundaries(list(boundaries = human_specific_boundaries), chain_HsPt, WSize = 500)

## double filtering for lift overs, any chimp boundary lifted over to hg38 should not be observed in human boundaries
# Keep only boundaries that were lifted over, as 50 kb intervals, in both the
# native and the lifted-over coordinates. The all() lines print whether the
# two versions are still in the same order.
chimp_specific_boundaries = GenomicRanges::resize(
  chimp_specific_boundaries_Hg38$original[
    which(names(chimp_specific_boundaries_Hg38$original) %in%
            names(chimp_specific_boundaries_Hg38$lifted_over))
  ],
  50000, fix = "center")
chimp_specific_boundaries_Hg38 = GenomicRanges::resize(chimp_specific_boundaries_Hg38$lifted_over, 50000, fix = "center")
all(names(chimp_specific_boundaries)==names(chimp_specific_boundaries_Hg38))

human_specific_boundaries = GenomicRanges::resize(
  human_specific_boundaries_Pt$original[
    which(names(human_specific_boundaries_Pt$original) %in%
            names(human_specific_boundaries_Pt$lifted_over))
  ],
  50000, fix = "center")
human_specific_boundaries_Pt = GenomicRanges::resize(human_specific_boundaries_Pt$lifted_over, 50000, fix = "center")
all(names(human_specific_boundaries)==names(human_specific_boundaries_Pt))

# One mask per species, computed on the lifted-over version and applied to
# both versions so the paired objects stay parallel.
chimp_not_in_human = !overlapsAny(chimp_specific_boundaries_Hg38, all_human_boundaires_input)
human_not_in_chimp = !overlapsAny(human_specific_boundaries_Pt, all_chimp_boundaires_input)
chimp_specific_boundaries = chimp_specific_boundaries[chimp_not_in_human]
human_specific_boundaries = human_specific_boundaries[human_not_in_chimp]
chimp_specific_boundaries_Hg38 = chimp_specific_boundaries_Hg38[chimp_not_in_human]
human_specific_boundaries_Pt = human_specific_boundaries_Pt[human_not_in_chimp]

export.bed( chimp_specific_boundaries_Hg38,
            con=paste0(outputs_directory,"/chimp_specific_boundaries_Hg38.bed" ))
export.bed( human_specific_boundaries_Pt,
            con=paste0(outputs_directory,"human_specific_boundaries_Pt.bed" ) )
export.bed( human_specific_boundaries,
            con=paste0(outputs_directory,"human_specific_boundaries.bed" ) )
export.bed( chimp_specific_boundaries,
            con=paste0(outputs_directory,"chimp_specific_boundaries.bed" ) )
##########################
species_specific_boundaries = c( human_specific_boundaries, chimp_specific_boundaries_Hg38 )
export.bed( species_specific_boundaries,
            con=paste0(outputs_directory,"species_specific_boundaries.bed" ) )
save( human_specific_boundaries, chimp_specific_boundaries_Hg38,human_specific_boundaries_Pt,chimp_specific_boundaries,
      file=paste0(objects_directory,"species_specific_boundaries.RData"))
##########################
# We have the species specific boundaries
load(paste0(objects_directory,"species_specific_boundaries.RData"))
load(paste0(objects_directory,"low_coverage_bins_lifted_over.RData"))
species_specific_boundaries = c( human_specific_boundaries, chimp_specific_boundaries_Hg38 )

# ---- Human Hi-C matrices ----
load(paste0(objects_directory,'si.RData'))
chroms_combs_hs = data.frame( V1=paste0("chr",c(1:22,"X")),
                              V2=paste0("chr",c(1:22,"X")),stringsAsFactors = FALSE )
itn=20
chroms=paste0("chr",c(1:22,"X"))
# Read the dumped per-chromosome matrices and pass each through IPF() with
# `itn` iterations.
# NOTE(review): IPF is presumably iterative proportional fitting -- confirm.
ele_lfm_5kb = read.hic_files( paste0(dumped_directory_ele), "",".matrix.txt", ga, paste0("chr",c(1:22,"X") ) )
ele = lapply( ele_lfm_5kb, function(m){ IPF( m, numberOfIterations=itn ) } )
save( ele, file=paste0(objects_directory,"Ele30_hic.RData" ))
fa_lfm_5kb = read.hic_files( paste0(dumped_directory_fa), "",".matrix.txt", ga, paste0("chr",c(1:22,"X") ) )
fa = lapply( fa_lfm_5kb, function(m){ IPF( m, numberOfIterations=itn ) } )
save( fa, file=paste0(objects_directory,"FetalAstrocytes_hic.RData" ))

# ---- Chimpanzee Hi-C matrices (chromosome 2 is split into 2A/2B) ----
load(paste0(objects_directory,'si_pt.RData'))
chroms_combs_pt = data.frame( V1=paste0("chr",c(c(1,"2A","2B",3:22),"X")),
                              V2=paste0("chr",c(c(1,"2A","2B",3:22),"X")),
                              stringsAsFactors = FALSE)
itn=20
chroms=paste0("chr",c(c(1,"2A","2B",3:22),"X"))
## ---------------
mandy_lfm_5kb = read.hic_files( paste0(dumped_directory_mandy), "",".matrix.txt", ga_pt, chroms=paste0("chr",c(c(1,"2A","2B",3:22),"X")) )
mandy = lapply( mandy_lfm_5kb, function(m){ IPF( m, numberOfIterations=itn ) } )
save( mandy, file=paste0(objects_directory,"Mandy_hic.RData" ) )
## ---------------
sa_lfm_5kb = read.hic_files( paste0(dumped_directory_sandra), "",".matrix.txt", ga_pt, chroms=paste0("chr",c(c(1,"2A","2B",3:22),"X")) )
sa = lapply( sa_lfm_5kb, function(m){ IPF( m, numberOfIterations=itn ) } )
save( sa, file=paste0(objects_directory,"SandraA_hic.RData" ) )

# ---- Insulation scores at the species-specific boundaries ----
# NOTE(review): the meaning of the trailing arguments 5, 3, 10 is defined by
# InsulationScore(), which is not visible here -- confirm.
human_spe_bound_IS_ele = InsulationScore( human_specific_boundaries,
                                          ele, gagr, 5, 3, 10 )
save(human_spe_bound_IS_ele,file=paste0(objects_directory,"human_spe_bound_IS_ele.RData"))
# Insulation scores for every (boundary set, Hi-C data set) pair; each result
# is saved under the name of the variable that holds it.
# Boundaries given in human coordinates are scored in the `ele` and `fa` data.
human_spe_bound_IS_fas = InsulationScore( human_specific_boundaries,
                                          fa, gagr, 5, 3, 10 )
save(human_spe_bound_IS_fas,file=paste0(objects_directory,"human_spe_bound_IS_fas.RData"))

chimp_spe_bound_IS_ele = InsulationScore( chimp_specific_boundaries_Hg38,
                                          ele, gagr, 5, 3, 10 )
save(chimp_spe_bound_IS_ele,file=paste0(objects_directory,"chimp_spe_bound_IS_ele.RData"))
chimp_spe_bound_IS_fas = InsulationScore( chimp_specific_boundaries_Hg38,
                                          fa, gagr, 5, 3, 10 )
save(chimp_spe_bound_IS_fas,file=paste0(objects_directory,"chimp_spe_bound_IS_fas.RData"))

shared_bound_IS_ele = InsulationScore( all_evol_shared_boundaries,
                                       ele, gagr, 5, 3, 10 )
save(shared_bound_IS_ele,file=paste0(objects_directory,"shared_bound_IS_ele.RData"))
shared_bound_IS_fas = InsulationScore( all_evol_shared_boundaries,
                                       fa, gagr, 5, 3, 10 )
save(shared_bound_IS_fas,file=paste0(objects_directory,"shared_bound_IS_fas.RData"))

## values for chimpanzee samples
# Boundaries given in chimpanzee coordinates are scored in the `mandy` and
# `sa` data.
human_spe_bound_IS_mandy = InsulationScore( human_specific_boundaries_Pt,
                                            mandy, gagr_pt, 5, 3, 10 )
save(human_spe_bound_IS_mandy,file=paste0(objects_directory,"human_spe_bound_IS_mandy.RData"))
human_spe_bound_IS_sandraA = InsulationScore( human_specific_boundaries_Pt,
                                              sa, gagr_pt, 5, 3, 10 )
save(human_spe_bound_IS_sandraA,file=paste0(objects_directory,"human_spe_bound_IS_sandraA.RData"))

chimp_spe_bound_IS_mandy = InsulationScore( chimp_specific_boundaries,
                                            mandy, gagr_pt, 5, 3, 10 )
save(chimp_spe_bound_IS_mandy,file=paste0(objects_directory,"chimp_spe_bound_IS_mandy.RData"))
chimp_spe_bound_IS_sandraA = InsulationScore( chimp_specific_boundaries,
                                              sa, gagr_pt, 5, 3, 10 )
save(chimp_spe_bound_IS_sandraA,file=paste0(objects_directory,"chimp_spe_bound_IS_sandraA.RData"))

shared_bound_IS_mandy = InsulationScore( all_evol_shared_boundaries_Pt,
                                         mandy, gagr_pt, 5, 3, 10 )
save(shared_bound_IS_mandy,file=paste0(objects_directory,"shared_bound_IS_mandy.RData"))
shared_bound_IS_sandraA = InsulationScore( all_evol_shared_boundaries_Pt,
                                           sa, gagr_pt, 5, 3, 10 )
save(shared_bound_IS_sandraA,file=paste0(objects_directory,"shared_bound_IS_sandraA.RData"))

# Species specific boundaries, insulation change
load(paste0(objects_directory,"human_spe_bound_IS_mandy.RData"))
load(paste0(objects_directory,"human_spe_bound_IS_fas.RData"))
load(paste0(objects_directory,"human_spe_bound_IS_ele.RData"))
load(paste0(objects_directory,"human_spe_bound_IS_sandraA.RData"))
load(paste0(objects_directory,"chimp_spe_bound_IS_mandy.RData"))
load(paste0(objects_directory,"chimp_spe_bound_IS_ele.RData"))
load(paste0(objects_directory,"chimp_spe_bound_IS_fas.RData"))
load(paste0(objects_directory,"chimp_spe_bound_IS_sandraA.RData"))
load(paste0(objects_directory,"shared_bound_IS_ele.RData"))
load(paste0(objects_directory,"shared_bound_IS_fas.RData"))
load(paste0(objects_directory,"shared_bound_IS_mandy.RData"))
load(paste0(objects_directory,"shared_bound_IS_sandraA.RData"))

## log2 of (mean of columns 1 and 3) over column 2, one value per row of a
## three-column insulation matrix. Replaces twelve copies of the expression.
insulationLog2Ratio = function(IS) {
  log2(rowMeans(IS[, c(1, 3)]) / IS[, 2])
}

human_spe_bound_IS_ele = insulationLog2Ratio(human_spe_bound_IS_ele)
human_spe_bound_IS_mandy = insulationLog2Ratio(human_spe_bound_IS_mandy)
human_spe_bound_IS_fas = insulationLog2Ratio(human_spe_bound_IS_fas)
human_spe_bound_IS_sandraA = insulationLog2Ratio(human_spe_bound_IS_sandraA)
chimp_spe_bound_IS_ele = insulationLog2Ratio(chimp_spe_bound_IS_ele)
chimp_spe_bound_IS_mandy = insulationLog2Ratio(chimp_spe_bound_IS_mandy)
chimp_spe_bound_IS_fas = insulationLog2Ratio(chimp_spe_bound_IS_fas)
chimp_spe_bound_IS_sandraA = insulationLog2Ratio(chimp_spe_bound_IS_sandraA)
shared_bound_IS_mandy = insulationLog2Ratio(shared_bound_IS_mandy)
shared_bound_IS_sandraA = insulationLog2Ratio(shared_bound_IS_sandraA)
shared_bound_IS_ele = insulationLog2Ratio(shared_bound_IS_ele)
shared_bound_IS_fas = insulationLog2Ratio(shared_bound_IS_fas)

# Shared boundaries: pool the values of both data sets per species and drop
# non-finite values.
is_hs = c(shared_bound_IS_ele, shared_bound_IS_fas)
is_pt = c(shared_bound_IS_mandy, shared_bound_IS_sandraA)
is_hs = is_hs[is.finite(is_hs)]
is_pt = is_pt[is.finite(is_pt)]
## -----
# Human-specific boundaries: per boundary, the larger of the two values per
# species, aligned by name before taking the human - chimp difference.
human_spe_bound_human = rowMax(cbind(human_spe_bound_IS_ele,human_spe_bound_IS_fas))
names(human_spe_bound_human) = names(human_spe_bound_IS_ele)
human_spe_bound_chimp = rowMax(cbind(human_spe_bound_IS_mandy,human_spe_bound_IS_sandraA))
names(human_spe_bound_chimp) = names(human_spe_bound_IS_mandy)
human_spe_bound_human = human_spe_bound_human[match(names(human_spe_bound_chimp),names(human_spe_bound_human))]
human_spe_boundaries_evol = human_spe_bound_human-human_spe_bound_chimp
## -----
# Chimpanzee-specific boundaries, same procedure.
chimp_spe_bound_human = rowMax(cbind(chimp_spe_bound_IS_ele,chimp_spe_bound_IS_fas))
names(chimp_spe_bound_human) = names(chimp_spe_bound_IS_ele)
chimp_spe_bound_chimp = rowMax(cbind(chimp_spe_bound_IS_mandy,chimp_spe_bound_IS_sandraA))
names(chimp_spe_bound_chimp) = names(chimp_spe_bound_IS_mandy)
chimp_spe_bound_chimp = chimp_spe_bound_chimp[match(names(chimp_spe_bound_human),names(chimp_spe_bound_chimp))]
chimp_spe_boundaries_evol = chimp_spe_bound_human-chimp_spe_bound_chimp

# `ylim` replaces the original `tlim`, which is not a graphical parameter and
# therefore never restricted the y axis.
boxplot( human_spe_boundaries_evol, chimp_spe_boundaries_evol,
         outline=FALSE, col="white",border=c("black","red"),
         ylim=c(-0.6,0.6),ylab="Insulation change Human/Chimp [log2]",
         names=c("Human","Chimp"),xlab="Species specificity of boundary")

t.test(human_spe_boundaries_evol,chimp_spe_boundaries_evol)
##
##  Welch Two Sample t-test
##
## data:  human_spe_boundaries_evol and chimp_spe_boundaries_evol
## t = 6.6461, df = 274.65, p-value = 0.0000000001614
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2357400 0.4341763
## sample estimates:
##  mean of x  mean of y
##  0.1432925 -0.1916657

# Prefix every bin name with its chromosome.
names(gagr) = paste(chrom(gagr),names(gagr),sep="_")
## Turn a three-column insulation matrix into a per-bin score track.
##
## IS    matrix with row names; the score of a row is
##       log2( mean(col 1, col 3) / col 2 ), with 0.001 added to every entry.
## GAGR  named bin object; rows of IS are matched to it by name.
##
## Returns GAGR without its `binid` field and with a `score` field that is 0
## everywhere except at the bins matched by rownames(IS).
processIS = function( IS, GAGR ){
  pseudo = 0.001
  flanks = rowMeans(pseudo + IS[, c(1, 3)])
  centre = pseudo + IS[, 2]

  track = GAGR
  track$binid = NULL
  track$score = 0
  hit = match(rownames(IS), names(track))
  track$score[hit] = log2(flanks / centre)
  track
}
## --------
# Genome-wide insulation: score every bin outside chrY in each Hi-C data set,
# save the raw matrix, convert it to a score track with processIS() and export
# that track as bedGraph.
genome_wide_IS_ele = InsulationScore( gagr[which(chrom(gagr)!="chrY")], ele, gagr, 5, 3, 10 )
save(genome_wide_IS_ele,file=paste0(objects_directory,"genome_wide_IS_ele.RData"))
genome_wide_IS_ele_gr = processIS(genome_wide_IS_ele,gagr)
export.bedGraph( genome_wide_IS_ele_gr, con=paste0(outputs_directory,"genome_wide_IS_ele_gagr.bedGraph"))

genome_wide_IS_fas = InsulationScore(  gagr[which(chrom(gagr)!="chrY")], fa, gagr, 5, 3, 10 )
save(genome_wide_IS_fas,file=paste0(objects_directory,"genome_wide_IS_fas.RData"))
genome_wide_IS_fas_gr = processIS(genome_wide_IS_fas,gagr)
export.bedGraph( genome_wide_IS_fas_gr, con=paste0(outputs_directory,"genome_wide_IS_fas_gr.bedGraph"))

genome_wide_IS_mandy = InsulationScore( gagr_pt[which(chrom(gagr_pt)!="chrY")], mandy, gagr_pt, 5, 3, 10 )
save(genome_wide_IS_mandy,file=paste0(objects_directory,"genome_wide_IS_mandy.RData"))
genome_wide_IS_mandy_gr = processIS(genome_wide_IS_mandy,gagr_pt)
export.bedGraph( genome_wide_IS_mandy_gr, con=paste0(outputs_directory,"genome_wide_IS_mandy_gr.bedGraph"))

genome_wide_IS_sandraA = InsulationScore(  gagr_pt[which(chrom(gagr_pt)!="chrY")], sa, gagr_pt, 5, 3, 10 )
save(genome_wide_IS_sandraA,file=paste0(objects_directory,"genome_wide_IS_sandraA.RData"))
genome_wide_IS_sandraA_gr = processIS(genome_wide_IS_sandraA,gagr_pt)
export.bedGraph( genome_wide_IS_sandraA_gr, con=paste0(outputs_directory,"genome_wide_IS_sandraA_gr.bedGraph"))

# Final plots
# Display the insulation scores for all the bins, species specific as well as
# shared boundaries.
# Re-import the genome-wide score tracks.
# NOTE(review): the Sandra track is read into `genome_wide_IS_sandraA`
# (no `_gr` suffix), which replaces the raw matrix of the same name.
genome_wide_IS_ele_gr=import.bedGraph(paste0(outputs_directory,"genome_wide_IS_ele_gagr.bedGraph"))
genome_wide_IS_fas_gr=import.bedGraph(paste0(outputs_directory,"genome_wide_IS_fas_gr.bedGraph"))
genome_wide_IS_sandraA=import.bedGraph(paste0(outputs_directory,"genome_wide_IS_sandraA_gr.bedGraph"))
genome_wide_IS_mandy_gr=import.bedGraph(paste0(outputs_directory,"genome_wide_IS_mandy_gr.bedGraph"))
# Per-bin mean over the two data sets of each species.
genome_wide_IS_human = rowMeans(cbind(genome_wide_IS_ele_gr$score,genome_wide_IS_fas_gr$score))
genome_wide_IS_chimp = rowMeans(cbind(genome_wide_IS_sandraA$score,genome_wide_IS_mandy_gr$score))

# Eight boxes: genome-wide, shared and species-specific boundaries, each in
# human (black border) and chimpanzee (red border) data.
boxplot( genome_wide_IS_human, genome_wide_IS_chimp,
         is_hs, human_spe_bound_human,human_spe_bound_chimp,
         is_pt,chimp_spe_bound_chimp,
         chimp_spe_bound_human, outline=FALSE,
         col="white",border=c("black","red","black","black","red","red","red","black"),
         ylim=c(-1,1),ylab=expression("Insulation (log"[2]*")"),
         names=c("GW Hs","GW Pt","Bound Hs","Hs-sp Hs","Hs-spe Pt","Bound Pt","Pt-spe Pt","Pt-spe Hs"),las=2)
axis(1,lwd=2,at=1:8,c("GW Hs","GW Pt","Bound Hs","Hs-sp Hs","Hs-spe Pt","Bound Pt","Pt-spe Pt","Pt-spe Hs"),las=2)
axis(2,lwd=2,las=2)
box(col="black",lwd=2)
abline(h=0,lwd=2,lty=2,col="gray")

t.test(human_spe_bound_human,human_spe_bound_chimp)
##
##  Welch Two Sample t-test
##
## data:  human_spe_bound_human and human_spe_bound_chimp
## t = 5.3102, df = 510.25, p-value = 0.0000001638
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.09027806 0.19630691
## sample estimates:
## mean of x mean of y
## 0.3546137 0.2113212

t.test(chimp_spe_bound_human,chimp_spe_bound_chimp)
##
##  Welch Two Sample t-test
##
## data:  chimp_spe_bound_human and chimp_spe_bound_chimp
## t = -3.7563, df = 315.76, p-value = 0.0002053
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.30303371 -0.09470476
## sample estimates:
## mean of x mean of y
## 0.3448980 0.5437673

# ChIP-seq peaks: human H3K4me3.
hs_me3 = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K4me3_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)
# Remaining ChIP-seq peak sets (H3K27ac and H3K4me3) for human, chimpanzee and
# macaque. The chromosome vector follows the naming used in each peak file.
hs_k27ac = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K27ac_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)
pt_me3 = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K4me3_12-22_PanTro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                                chroms=paste0('chr',c(1,'2A','2B', 3:22,'X')),4)
pt_k27ac = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K27ac_12-22_PanTro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                                chroms=paste0('chr',c(1,'2A','2B', 3:22,'X')),4)
mm_me3 = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K4me3_03-22_MacMul_i-Astro_WT_Becky_Rep_1_RheMac10_peaks.narrowPeak'),
                                chroms=paste0(c(1:22,'X')),4)
mm_k27ac = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K27ac_03-22_MacMul_i-Astro_WT_Becky_Rep_1_RheMac10_peaks.narrowPeak'),
                                chroms=paste0(c(1:22,'X')),4)

# Use UCSC-style sequence names ("chr1", ...) in every peak set.
seqlevelsStyle(hs_k27ac) = "ucsc"
seqlevelsStyle(pt_k27ac) = "ucsc"
seqlevelsStyle(mm_k27ac) = "ucsc"
seqlevelsStyle(hs_me3) = "ucsc"
seqlevelsStyle(pt_me3) = "ucsc"
seqlevelsStyle(mm_me3) = "ucsc"

# Load the files from the other vignettes
load(paste0(objects_directory,'tss_objects.RData'))
load(paste0(objects_directory,'DEseq2_RNA.RData'))
species_specific_boundaries = import.bed(paste0(outputs_directory,"species_specific_boundaries.bed"))

# ATAC-seq peaks of the three species.
hs_atac = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)
pt_atac = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_Pantro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                               paste0("chr",c(1,"2A","2B", 3:22,'X')),10)
mm_atac = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_MacMul_i-Astro_Becky_merged_RheMac10_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)
seqlevelsStyle(hs_atac) = 'ucsc'
seqlevelsStyle(mm_atac) = 'ucsc'

# Collapse every peak to a two-base interval starting at start + score.
# NOTE(review): `score` presumably holds the narrowPeak summit offset
# (column 10 was requested from readBed_filterChroms) -- confirm.
start(hs_atac) = start(hs_atac) + hs_atac$score
end(hs_atac) = start(hs_atac) + 1
start(pt_atac) = start(pt_atac) + pt_atac$score
end(pt_atac) = start(pt_atac) + 1
start(mm_atac) = start(mm_atac) + mm_atac$score
end(mm_atac) = start(mm_atac) + 1
export.bed( hs_atac, con=paste0(outputs_directory,'ATAC.Hs_clean_summit.narrowPeak'))
export.bed( pt_atac, con=paste0(outputs_directory,'ATAC.Pt_clean_summit.narrowPeak'))
export.bed( mm_atac, con=paste0(outputs_directory,'ATAC.Mm_clean_summit.narrowPeak'))

# The generic liftOver command:
#   $liftOver -minMatch=0.5 -bedPlus=6 -tab
cd ~/Documents/Tools/

# Root of the data package; inputs, chain files and outputs all live below it.
DATA=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data

# run_liftover SRC CHAIN TAG
#   SRC    species tag of the input summit file (Hs, Pt or Mm)
#   CHAIN  chain file name inside $DATA/chain_files
#   TAG    stem of the two output files inside $DATA/outputs/sequence_analysis
run_liftover() {
  ./liftOver -minMatch=0.5 -bedPlus=6 -tab \
    "$DATA/outputs/ATAC.${1}_clean_summit.narrowPeak" \
    "$DATA/chain_files/${2}" \
    "$DATA/outputs/sequence_analysis/ATAC.${3}_summit.narrowPeak" \
    "$DATA/outputs/sequence_analysis/ATAC.${3}_summit.unmapped.file"
}

## human to chimp
run_liftover Hs hg38ToPanTro6.over.chain Hs_clean_peaks_on_PT6
## human to macaque
run_liftover Hs hg38ToRheMac10.over.chain Hs_clean_peaks_on_RM10
## chimp to human
run_liftover Pt panTro6ToHg38.over.chain Pt_clean_peaks_on_Hg38
## macaque to human
run_liftover Mm rheMac10ToHg38.over.chain Mm_clean_peaks_on_Hg38
## chimp to macaque
run_liftover Pt panTro6ToRheMac10.over.chain Pt_clean_peaks_on_RheMac10
## macaque to chimp
run_liftover Mm rheMac10ToPanTro6.over.chain Mm_clean_peaks_on_PanTro6

# We define a unique set of intervals of peaks found in at least one species
# and that are alignable.
# Read the six lifted-over summit files; `chroms` names the chromosomes of the
# target assembly of each lift-over.

# human peaks on the chimpanzee and macaque assemblies
hs_atac_mapped_in_chimp = readBed_filterChroms(
  paste0(liftOverPeaks, "ATAC.Hs_clean_peaks_on_PT6_summit.narrowPeak"),
  chroms = paste0("chr", c(1, "2A", "2B", 3:22, "X")), 4)
hs_atac_mapped_in_rhesus = readBed_filterChroms(
  paste0(liftOverPeaks, "ATAC.Hs_clean_peaks_on_RM10_summit.narrowPeak"),
  chroms = paste0("chr", c(1:22, "X")), 4)

# chimpanzee and macaque peaks on the human assembly
chimp_mapped_in_humans = readBed_filterChroms(
  paste0(liftOverPeaks, "ATAC.Pt_clean_peaks_on_Hg38_summit.narrowPeak"),
  chroms = paste0("chr", c(1:22, "X")), 4)
macaque_mapped_in_humans = readBed_filterChroms(
  paste0(liftOverPeaks, "ATAC.Mm_clean_peaks_on_Hg38_summit.narrowPeak"),
  chroms = paste0("chr", c(1:22, "X")), 4)

# macaque peaks on the chimpanzee assembly and vice versa
macaque_in_chimps = readBed_filterChroms(
  paste0(liftOverPeaks, "ATAC.Mm_clean_peaks_on_PanTro6_summit.narrowPeak"),
  chroms = paste0("chr", c(1, "2A", "2B", 3:22, "X")), 4)
chimps_in_macaque = readBed_filterChroms(
  paste0(liftOverPeaks, "ATAC.Pt_clean_peaks_on_RheMac10_summit.narrowPeak"),
  chroms = paste0("chr", c(1:22, "X")), 4)
## Keep the ranges of `gro` that are exactly two bases wide and start beyond
## position 500, then resize each kept range to `finalSize` around its centre.
CleanAndResize = function( gro, finalSize ){
  is_two_wide = width(gro) == 2
  starts_past_500 = start(gro) > 500
  kept = gro[is_two_wide & starts_past_500]
  GenomicRanges::resize(kept, finalSize, fix = "center")
}
# Turn every summit set, lifted-over and native, into clean 500 bp intervals.
hs_atac_mapped_in_chimp = CleanAndResize( hs_atac_mapped_in_chimp, 500 )
hs_atac_mapped_in_rhesus = CleanAndResize( hs_atac_mapped_in_rhesus, 500 )
chimp_mapped_in_humans = CleanAndResize( chimp_mapped_in_humans, 500 )
macaque_mapped_in_humans = CleanAndResize( macaque_mapped_in_humans, 500 )
macaque_in_chimps = CleanAndResize( macaque_in_chimps, 500 )
chimps_in_macaque = CleanAndResize( chimps_in_macaque, 500 )
hs_atac = CleanAndResize( hs_atac, 500 )
pt_atac = CleanAndResize( pt_atac, 500 )
mm_atac = CleanAndResize( mm_atac, 500 )

# Now, we have all the lifted over combinations.
# human peaks aligned in all the species
hs_pt_mm_liftover = names(hs_atac_mapped_in_chimp)[names(hs_atac_mapped_in_chimp) %in% names(hs_atac_mapped_in_rhesus) ]
human_peaks_aligned_Pt_Mm_coordinates_hs = hs_atac[ which(names(hs_atac) %in% hs_pt_mm_liftover ) ]
human_peaks_aligned_Pt_Mm_coordinates_hs = human_peaks_aligned_Pt_Mm_coordinates_hs[-subjectHits(findOverlaps(human_peaks_aligned_Pt_Mm_coordinates_hs,drop.self=TRUE,drop.redundant=TRUE))]
length(human_peaks_aligned_Pt_Mm_coordinates_hs)## [1] 141484human_peaks_aligned_Pt_Mm_coordinates_pt = hs_atac_mapped_in_chimp[ match(names(human_peaks_aligned_Pt_Mm_coordinates_hs),names(hs_atac_mapped_in_chimp)) ]
human_peaks_aligned_Pt_Mm_coordinates_mm = hs_atac_mapped_in_rhesus[ match(names(human_peaks_aligned_Pt_Mm_coordinates_hs),names(hs_atac_mapped_in_rhesus)) ]
all(names(human_peaks_aligned_Pt_Mm_coordinates_hs)==names(human_peaks_aligned_Pt_Mm_coordinates_pt))## [1] TRUEall(names(human_peaks_aligned_Pt_Mm_coordinates_hs)==names(human_peaks_aligned_Pt_Mm_coordinates_mm))## [1] TRUE# peaks found in chimp and macaque (aligned to the human genome) but not detected in human 
# Chimp peaks (human coordinates) that overlap a macaque peak (human coordinates).
# A chimp peak is selected once per overlapping macaque peak; the repeated
# copies overlap each other, so the self-overlap filter below removes them.
nhp_peaks = chimp_mapped_in_humans[ queryHits(findOverlaps(chimp_mapped_in_humans,macaque_mapped_in_humans))]
nhp_peaks = nhp_peaks[ which(names(nhp_peaks) %in% names(chimps_in_macaque))]
# Keep only those that do NOT overlap a human peak. subsetByOverlaps(invert=TRUE)
# replaces x[-queryHits(...)], which returned an empty object when nothing overlapped.
nhp_peaks_coordinates_hs = subsetByOverlaps(nhp_peaks, hs_atac, invert = TRUE)
# Remove the subject of every self-overlap hit (safe when there are no hits).
nhp_peaks_coordinates_hs = nhp_peaks_coordinates_hs[
  setdiff( seq_along(nhp_peaks_coordinates_hs),
           subjectHits(findOverlaps(nhp_peaks_coordinates_hs,drop.self=TRUE,drop.redundant=TRUE)) ) ]
# Same peaks, in the same order, in chimp and macaque coordinates.
nhp_peaks_coordinates_pt = pt_atac[ match(names(nhp_peaks_coordinates_hs),names(pt_atac)) ]
nhp_peaks_coordinates_mm = chimps_in_macaque[ match(names(nhp_peaks_coordinates_pt),names(chimps_in_macaque)) ]
all(names(nhp_peaks_coordinates_hs)==names(nhp_peaks_coordinates_mm))
## [1] TRUE
all(names(nhp_peaks_coordinates_hs)==names(nhp_peaks_coordinates_pt))
## [1] TRUE
# chimp peaks aligned both in humans and macaques but not detected as peaks in humans and macaques 
# Chimp peaks (human coordinates) overlapping neither the shared NHP peaks nor
# any human peak. subsetByOverlaps(invert=TRUE) replaces x[-queryHits(...)],
# which returned an empty object when nothing overlapped.
chimp_peaks = subsetByOverlaps(chimp_mapped_in_humans, c(nhp_peaks_coordinates_hs, hs_atac), invert = TRUE)
# Require that the peak also exists in the chimp-to-macaque liftover and in the chimp peak set.
chimp_peaks = chimp_peaks[ which(names(chimp_peaks) %in% names(chimps_in_macaque)) ]
chimp_peaks = chimp_peaks[ which(names(chimp_peaks) %in% names(pt_atac)) ]
chimp_uniquely_peaks_coordinates_hs = chimp_mapped_in_humans[ match( names(chimp_peaks), names(chimp_mapped_in_humans) )]
# Remove the subject of every self-overlap hit (safe when there are no hits).
chimp_uniquely_peaks_coordinates_hs = chimp_uniquely_peaks_coordinates_hs[
  setdiff( seq_along(chimp_uniquely_peaks_coordinates_hs),
           subjectHits(findOverlaps(chimp_uniquely_peaks_coordinates_hs,
                                    drop.self=TRUE,drop.redundant=TRUE)) ) ]
# Same peaks, in the same order, in chimp and macaque coordinates.
chimp_uniquely_peaks_coordinates_pt = pt_atac[ match( names(chimp_uniquely_peaks_coordinates_hs), names(pt_atac) )]
chimp_uniquely_peaks_coordinates_mm = chimps_in_macaque[ match( names(chimp_uniquely_peaks_coordinates_pt), names(chimps_in_macaque) )]
all(names(chimp_uniquely_peaks_coordinates_hs)==names(chimp_uniquely_peaks_coordinates_pt))
## [1] TRUE
all(names(chimp_uniquely_peaks_coordinates_hs)==names(chimp_uniquely_peaks_coordinates_mm))
## [1] TRUE
length(chimp_uniquely_peaks_coordinates_hs)
## [1] 31740
# macaque peaks aligned both in humans and chimps but not detected as peaks in humans and chimps
# Macaque peaks (human coordinates) overlapping neither the shared NHP peaks
# nor any human peak. subsetByOverlaps(invert=TRUE) replaces x[-queryHits(...)],
# which returned an empty object when nothing overlapped.
macaque_peaks = subsetByOverlaps(macaque_mapped_in_humans, c(nhp_peaks_coordinates_hs, hs_atac), invert = TRUE)
# Require that the peak also exists in the macaque-to-chimp liftover and in the macaque peak set.
macaque_peaks = macaque_peaks[ which(names(macaque_peaks) %in% names(macaque_in_chimps)) ]
macaque_peaks = macaque_peaks[ which(names(macaque_peaks) %in% names(mm_atac)) ]
macaque_uniquely_peaks_coordinates_hs = macaque_mapped_in_humans[ match( names(macaque_peaks), names(macaque_mapped_in_humans) )]
# Remove the subject of every self-overlap hit (safe when there are no hits).
macaque_uniquely_peaks_coordinates_hs = macaque_uniquely_peaks_coordinates_hs[
  setdiff( seq_along(macaque_uniquely_peaks_coordinates_hs),
           subjectHits(findOverlaps(macaque_uniquely_peaks_coordinates_hs,drop.self=TRUE,drop.redundant=TRUE)) ) ]
# Same peaks, in the same order, in chimp and macaque coordinates.
macaque_uniquely_peaks_coordinates_pt = macaque_in_chimps[ match( names(macaque_uniquely_peaks_coordinates_hs), names(macaque_in_chimps) )]
macaque_uniquely_peaks_coordinates_mm = mm_atac[ match( names(macaque_uniquely_peaks_coordinates_hs), names(mm_atac) )]
all(names(macaque_uniquely_peaks_coordinates_hs)==names(macaque_uniquely_peaks_coordinates_pt))
## [1] TRUE
all(names(macaque_uniquely_peaks_coordinates_hs)==names(macaque_uniquely_peaks_coordinates_mm))
## [1] TRUE
length(macaque_uniquely_peaks_coordinates_hs)
## [1] 43319
### we pool all together and remove duplicated peaks --> 225,059
# Pool the four peak classes (shared human, shared NHP, chimp-only,
# macaque-only) in the same order for each of the three genomes.
all_human_intervals <- c(human_peaks_aligned_Pt_Mm_coordinates_hs,
                         nhp_peaks_coordinates_hs,
                         chimp_uniquely_peaks_coordinates_hs,
                         macaque_uniquely_peaks_coordinates_hs)
all_chimp_intervals <- c(human_peaks_aligned_Pt_Mm_coordinates_pt,
                         nhp_peaks_coordinates_pt,
                         chimp_uniquely_peaks_coordinates_pt,
                         macaque_uniquely_peaks_coordinates_pt)
all_macaque_intervals <- c(human_peaks_aligned_Pt_Mm_coordinates_mm,
                           nhp_peaks_coordinates_mm,
                           chimp_uniquely_peaks_coordinates_mm,
                           macaque_uniquely_peaks_coordinates_mm)
all(names(all_human_intervals) == names(all_chimp_intervals))
## [1] TRUE
all(names(all_human_intervals) == names(all_macaque_intervals))
## [1] TRUE
# Any peak name that occurs more than once in the pooled human set is removed
# completely (every copy), not merely de-duplicated.
peak_name_counts <- table(names(all_human_intervals))
duplicated_peak_names <- names(peak_name_counts[peak_name_counts > 1])
all_human_intervals <- all_human_intervals[!(names(all_human_intervals) %in% duplicated_peak_names)]
all_chimp_intervals <- all_chimp_intervals[!(names(all_chimp_intervals) %in% duplicated_peak_names)]
all_macaque_intervals <- all_macaque_intervals[!(names(all_macaque_intervals) %in% duplicated_peak_names)]
# Re-read the merged narrowPeak calls of each species.
# NOTE(review): the third argument (10) presumably selects narrowPeak column 10
# (summit offset) as the score -- confirm against readBed_filterChroms().
hs_atac2 <- readBed_filterChroms(
  paste0(outputs_directory,'ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
  paste0("",c(1:22,'X')), 10)
pt_atac2 <- readBed_filterChroms(
  paste0(outputs_directory,'ATAC_Seq_12-22_Pantro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
  paste0("chr",c(1,"2A","2B", 3:22,'X')), 10)
mm_atac2 <- readBed_filterChroms(
  paste0(outputs_directory,'ATAC_Seq_12-22_MacMul_i-Astro_Becky_merged_RheMac10_peaks.narrowPeak'),
  paste0("",c(1:22,'X')), 10)
# Human and macaque chromosomes were read without a "chr" prefix; switch to UCSC names.
seqlevelsStyle(hs_atac2) <- 'ucsc'
seqlevelsStyle(mm_atac2) <- 'ucsc'
# For each species: move the start to start + score, collapse the range onto
# that position, then grow it to a width of 500 around its centre.
start(hs_atac2) <- start(hs_atac2) + hs_atac2$score
end(hs_atac2) <- start(hs_atac2)
hs_atac2 <- GenomicRanges::resize(hs_atac2, 500, fix="center")

start(pt_atac2) <- start(pt_atac2) + pt_atac2$score
end(pt_atac2) <- start(pt_atac2)
pt_atac2 <- GenomicRanges::resize(pt_atac2, 500, fix="center")

start(mm_atac2) <- start(mm_atac2) + mm_atac2$score
end(mm_atac2) <- start(mm_atac2)
mm_atac2 <- GenomicRanges::resize(mm_atac2, 500, fix="center")
# Write the windows out as input for the liftOver commands.
export.bed(hs_atac2, con = paste0(outputs_directory,'ATAC.Hs_clean_500_summit.narrowPeak'))
export.bed(pt_atac2, con = paste0(outputs_directory,'ATAC.Pt_clean_500_summit.narrowPeak'))
export.bed(mm_atac2, con = paste0(outputs_directory,'ATAC.Mm_clean_500_summit.narrowPeak'))
cd ~/Documents/Tools/
## human to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Hs_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_PT6_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_PT6_500_summit.unmapped.file
## human to macaque
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Hs_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToRheMac10.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_RM10_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_RM10_500_summit.unmapped.file
## chimp to human
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Pt_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/panTro6ToHg38.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_Hg38_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_Hg38_500_summit.unmapped.file
## macaque to human
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Mm_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/rheMac10ToHg38.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_Hg38_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_Hg38_500_summit.unmapped.file
## chimp to macaque
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Pt_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/panTro6ToRheMac10.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_RheMac10_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_RheMac10_500_summit.unmapped.file
## macaque to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Mm_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/rheMac10ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_PanTro6_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_PanTro6_500_summit.unmapped.file
Read in the liftOver results to retrieve the names of the peaks that we wish to use.
# Human peaks lifted over to chimp (PT6) and to macaque (RM10).
# NOTE(review): the meaning of the third argument (5) is defined by
# readBed_filterChroms(), which is not visible here -- confirm.
peaks_hs_Pt =  readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Hs_clean_peaks_on_PT6_500_summit.narrowPeak"), 
                                    paste0("chr",c(1,"2A","2B", 3:22,'X')),5 )
peaks_hs_Mm =  readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Hs_clean_peaks_on_RM10_500_summit.narrowPeak"), 
                                    paste0("chr",c(1:22,'X')),5 )
# Names of human peaks present in both lifted-over files.
peaks_hs = names(peaks_hs_Pt)[names(peaks_hs_Pt) %in% names(peaks_hs_Mm) ]
# Chimp peaks lifted over to human (Hg38) and to macaque (RheMac10).
peaks_Pt_Hs = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Pt_clean_peaks_on_Hg38_500_summit.narrowPeak"), 
                                   paste0("chr",c(1:22,'X')),5 ) 
peaks_Pt_Mm = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Pt_clean_peaks_on_RheMac10_500_summit.narrowPeak"), 
                                   paste0("chr",c(1:22,'X')),5 ) 
  
# Names of chimp peaks present in both lifted-over files.
peaks_pt = names(peaks_Pt_Hs)[names(peaks_Pt_Hs) %in% names(peaks_Pt_Mm) ]
# Macaque peaks lifted over to human (Hg38) and to chimp (PanTro6).
peaks_Mm_Hs = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Mm_clean_peaks_on_Hg38_500_summit.narrowPeak"), 
                                   paste0("chr",c(1:22,'X')),5 )
peaks_Mm_Pt = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Mm_clean_peaks_on_PanTro6_500_summit.narrowPeak"), 
                                   paste0("chr",c(1,"2A","2B", 3:22,'X')), 5 ) 
  
# Names of macaque peaks present in both lifted-over files.
peaks_mm = names(peaks_Mm_Hs)[names(peaks_Mm_Hs) %in% names(peaks_Mm_Pt) ]
# Union of the peak names that lifted over to both other genomes.
peaks = unique( c( peaks_hs, peaks_pt, peaks_mm))
The final ranges
length(all_human_intervals) == length(all_chimp_intervals)
## [1] TRUE
length(all_human_intervals) == length(all_macaque_intervals)
## [1] TRUE
length(all_macaque_intervals) # 225,059
## [1] 225059
# Restrict all three interval sets to the peak names collected in `peaks`.
all_human_intervals <- all_human_intervals[names(all_human_intervals) %in% peaks]
all_chimp_intervals <- all_chimp_intervals[names(all_chimp_intervals) %in% peaks]
all_macaque_intervals <- all_macaque_intervals[names(all_macaque_intervals) %in% peaks]
length(all_human_intervals) == length(all_chimp_intervals)
## [1] TRUE
length(all_human_intervals) == length(all_macaque_intervals)
## [1] TRUE
length(all_macaque_intervals) # 224,411
## [1] 224411
all(names(all_human_intervals) == names(all_chimp_intervals))
## [1] TRUE
all(names(all_human_intervals) == names(all_macaque_intervals))
## [1] TRUE
# One annotation file per genome; these are the annot.ext inputs of featureCounts().
export.gff(all_human_intervals,
           con = paste0(outputs_directory,"hs_atac_for_Deseq2.gtf"))
export.gff(all_chimp_intervals,
           con = paste0(outputs_directory,"pt_atac_for_Deseq2.gtf"))
export.gff(all_macaque_intervals,
           con = paste0(outputs_directory,"mm_atac_for_Deseq2.gtf"))
# One "<seqlevel>,<seqlevel without chr>" line per chromosome; these files are
# passed to featureCounts() as chrAliases.
writeLines(paste0(seqlevels(all_human_intervals), ",",
                  gsub("chr",'',seqlevels(all_human_intervals))),
           paste0(outputs_directory,'hs_atac_for_Deseq2.txt'))
writeLines(paste0(seqlevels(all_chimp_intervals), ",",
                  gsub("chr",'',seqlevels(all_chimp_intervals))),
           paste0(outputs_directory,'pt_atac_for_Deseq2.txt'))
writeLines(paste0(seqlevels(all_macaque_intervals), ",",
                  gsub("chr",'',seqlevels(all_macaque_intervals))),
           paste0(outputs_directory,'mm_atac_for_Deseq2.txt'))
# The BAM file names used by featureCounts() are relative to this directory.
setwd('/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/bam_files/')
## -------------------------
# Count read pairs of one BAM file over the shared peak set.
#   bam            : BAM file name, resolved against the current working directory
#   species_prefix : 'hs', 'pt' or 'mm'; selects <prefix>_atac_for_Deseq2.gtf
#                    (annotation) and <prefix>_atac_for_Deseq2.txt (chromosome aliases)
#   annotation_dir : directory that holds those two files
# Returns the featureCounts() result unchanged.
count_atac_peaks = function( bam, species_prefix,
                             annotation_dir = '/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/' ) {
  featureCounts( bam,
                 annot.ext = paste0(annotation_dir, species_prefix, '_atac_for_Deseq2.gtf'),
                 isGTFAnnotationFile = TRUE,
                 chrAliases = paste0(annotation_dir, species_prefix, '_atac_for_Deseq2.txt'),
                 GTF.featureType = 'sequence_feature', GTF.attrType = 'ID', isPairedEnd=TRUE )
}
ele10 = count_atac_peaks( 'ATAC_Seq_12-21_HomSap_i-Astro_WT_ELE10_merged_hg38.bam', 'hs' )
ele30 = count_atac_peaks( 'ATAC_Seq_05-22_HomSap_i-Astro_WT_ELE30_2_Rep_1_hg38.bam', 'hs' )
## -------------------------
sandraa = count_atac_peaks( 'ATAC_Seq_12-22_Pantro_i-Astro_Sandra_merged_PanTro6.bam', 'pt' )
Mandy04 = count_atac_peaks( 'ATAC_Seq_05-22_PanTro_i-Astro_WT_Mandy4_Rep_1_PanTro6.bam', 'pt' )
Mandy06 = count_atac_peaks( 'ATAC_Seq_05-22_PanTro_i-Astro_WT_Mandy6_Rep_1_PanTro6.bam', 'pt' )
## -------------------------
becky = count_atac_peaks( 'ATAC_Seq_12-22_MacMul_i-Astro_Becky_merged_RheMac10.bam', 'mm' )
## -------------------------
save( ele10, ele30, sandraa, Mandy04, Mandy06, becky,
      file=paste0(outputs_directory,'counts_ATAC_refined.RData' ) )
## -------------------------
all( ele10$annotation$GeneID == ele30$annotation$GeneID )
all( names(all_human_intervals)== ele10$annotation$GeneID)
all(start(all_human_intervals)==ele10$annotation$Start)
all( names(all_human_intervals)== becky$annotation$GeneID)
# The count columns below are bound together purely by row position, so every
# sample (including the chimp ones, which were not checked before) must report
# the peaks in the same order as ele10. Stop instead of building a misaligned table.
peak_ids_match = vapply( list(ele30, sandraa, Mandy04, Mandy06, becky),
                         function(fc) identical( as.character(fc$annotation$GeneID),
                                                 as.character(ele10$annotation$GeneID) ),
                         logical(1) )
stopifnot( all(peak_ids_match) )
ATAC_count = data.frame( ELE10 = ele10$counts[,1], 
                         ELE30 = ele30$counts[,1], 
                         SandraA = sandraa$counts[,1], 
                         Mandy04 = Mandy04$counts[,1],
                         Mandy06 = Mandy06$counts[,1],
                         Becky = becky$counts[,1] )
save( ATAC_count,
      file = paste0(outputs_directory,'ATAC_count.RData' ) )
||              Paired-end : yes                                              ||
||        Count read pairs : yes                                              ||
||              Annotation : hs_atac_for_Deseq2.gtf (GTF)                     ||
||      Dir for temp files : .                                                ||
||   Chromosome alias file : hs_atac_for_Deseq2.txt                           ||
||                 Threads : 1                                                ||
||                   Level : meta-feature level                               ||
||      Multimapping reads : counted                                          ||
|| Multi-overlapping reads : not counted                                      ||
||   Min overlapping bases : 1
Here, for the quantitative analysis, we consider only peaks that have at least 50% liftover between all the species.
# Reload the peak-by-sample count table saved after featureCounts().
load( paste0(outputs_directory,'ATAC_count.RData') )
# Sample sheet: one row per column of ATAC_count, in the same order
# (2 human, 3 chimp, 1 macaque).
metadata = data.frame(species=c('HS','HS','PT','PT','PT','MM'),
                      human_or_not = c("HS","HS","NHP","NHP","NHP","NHP"),
                      assay='ATAC',
                      row.names=colnames(ATAC_count))
# No-intercept design: one coefficient per species.
data <- DESeqDataSetFromMatrix( countData=ATAC_count,
                                colData = metadata,
                                design = ~ 0 + species )
# Make human the reference level.
# NOTE(review): "HS" is presumably already the first level (alphabetical factor
# conversion), in which case this is a no-op -- confirm.
data$species = relevel(data$species, "HS")
data = DESeq(data,fitType = 'local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Two blind transformations of the counts; log_data is the one plotted in the PCA.
vst_data = vst(data, blind=TRUE)
log_data = rlog(data, blind=TRUE)
PCA Plot for all Samples.
# Colour per species used in the PCA plot.
species.colors <- c('HS' = '#000000', 'PT' = '#FF3300', 'MM' = '#0033FF')
# Peaks ordered by decreasing variance of the normalized counts, and a PCA of
# the samples on those counts.
# NOTE(review): `ord` and `pca` are not used by the plotPCA() call below, which
# works on log_data -- confirm whether they are needed further down.
ord = order(rowVars(counts(data, normalized = TRUE)), decreasing = TRUE)
pca = prcomp(t(counts(data, normalized = TRUE)[ord,]))
# PCA of the rlog-transformed data, coloured by species and labelled by sample name.
plotPCA(log_data, intgroup="species") + 
    geom_label_repel(aes(label = name),fill = alpha(c("white"),0.2),
                   show.legend = FALSE, size = 3.25, label.size=0.5,
                   fontface = 'bold') + 
  scale_color_manual(values = species.colors) + theme_bw() + labs(color = "Species") +
  theme(aspect.ratio = 1, axis.text = element_text(face = 'bold', size = 11),
        axis.title = element_text(face = 'bold', size = 13),
        legend.text = element_text(face = 'bold'), legend.title = element_text(face = 'bold', size = 12)) +
  ggtitle("PCA Plot")
Next, we will identify human specific ATAC peaks in comparison with chimpanzee and macaque.
# Human intervals that have a row in the count table. The score is set to 1
# before the BED export, and the object is also stored as RData.
hs_atac_for_Deseq2 <- all_human_intervals[names(all_human_intervals) %in% rownames(ATAC_count)]
score(hs_atac_for_Deseq2) <- 1
export.bed(hs_atac_for_Deseq2, con = paste0(outputs_directory,"hs_atac_for_Deseq2.bed"))
save(hs_atac_for_Deseq2, file = paste0(objects_directory,"hs_atac_for_Deseq2.RData"))

# Chimp coordinates of the same peaks (matched by name).
pt_atac_for_Deseq2 <- all_chimp_intervals[names(all_chimp_intervals) %in% names(hs_atac_for_Deseq2)]
score(pt_atac_for_Deseq2) <- 1
export.bed(pt_atac_for_Deseq2, con = paste0(outputs_directory,"pt_atac_for_Deseq2.bed"))
save(pt_atac_for_Deseq2, file = paste0(objects_directory,"pt_atac_for_Deseq2.RData"))

# Macaque coordinates of the same peaks (matched by name).
mm_atac_for_Deseq2 <- all_macaque_intervals[names(all_macaque_intervals) %in% names(hs_atac_for_Deseq2)]
score(mm_atac_for_Deseq2) <- 1
export.bed(mm_atac_for_Deseq2, con = paste0(outputs_directory,"mm_atac_for_Deseq2.bed"))
save(mm_atac_for_Deseq2, file = paste0(objects_directory,"mm_atac_for_Deseq2.RData"))
Individual comparisons and a table of these
# Human vs chimp: DESeq2 fit on the HS and PT samples only.
HS_PT = DESeqDataSetFromMatrix( countData = ATAC_count[ ,colnames(ATAC_count) %in% rownames(metadata[metadata$species %in% c("HS","PT"),])],
                                colData = metadata[metadata$species %in% c("HS","PT"),],
                                design = ~ 0 + species )
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
HS_PT = DESeq(HS_PT,fitType = 'local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
HS_PT$species = relevel(HS_PT$species, "HS")
# Positive log2 fold change = more accessible in human than in chimp.
res_HS_PT = results(HS_PT, contrast = c("species","HS","PT"))
# Human vs macaque: DESeq2 fit on the HS and MM samples only.
HS_MM = DESeqDataSetFromMatrix( countData=ATAC_count[,metadata$species %in% c("HS","MM")],
                                colData = metadata[metadata$species %in% c("HS","MM"),],
                                design = ~ 0 + species )
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
HS_MM = DESeq(HS_MM,fitType = 'local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
HS_MM$species = relevel(HS_MM$species, "HS")
# Positive log2 fold change = more accessible in human than in macaque.
res_HS_MM = results(HS_MM, contrast = c("species","HS","MM"))
# The two result tables and the human ranges are combined by row position
# below. These checks were previously commented out; they are now enforced so
# that a misaligned table cannot be built silently.
stopifnot( length(rownames(res_HS_PT)) == length(rownames(res_HS_MM)),
           all(rownames(res_HS_PT) == rownames(res_HS_MM)),
           length(rownames(res_HS_PT)) == length(hs_atac_for_Deseq2),
           all(rownames(res_HS_PT) == names(hs_atac_for_Deseq2)) )
# One row per peak: both fold changes and adjusted p-values plus human coordinates.
res_HS_NHP = data.frame( hs_pt_LFC = res_HS_PT$log2FoldChange,
                         hs_pt_Padj = res_HS_PT$padj,
                         hs_mm_LFC = res_HS_MM$log2FoldChange,
                         hs_mm_Padj = res_HS_MM$padj,
                         row.names = rownames(res_HS_MM),
                         chrom_hs = chrom(hs_atac_for_Deseq2),
                         start= start(hs_atac_for_Deseq2),
                         end = end(hs_atac_for_Deseq2) )
hs_atac_for_Deseq2$score=0
hs_atac_for_Deseq2$padj_HSPT = res_HS_PT$padj
hs_atac_for_Deseq2$padj_HSMM = res_HS_MM$padj
# NOTE(review): this writes to the same hs_atac_for_Deseq2.gtf path that
# featureCounts() read as its annotation -- confirm that overwriting it is intended.
export.gff(hs_atac_for_Deseq2,con=paste0(outputs_directory,"hs_atac_for_Deseq2.gtf"))
# Number of comparisons (human-chimp, human-macaque) with adjusted p < 0.1 per peak.
# Rows with an NA adjusted p-value give NA and are not tabulated.
table(rowSums(res_HS_NHP[,c("hs_pt_Padj","hs_mm_Padj")]<0.1))
## 
##      0      1      2 
## 122226  71847  17203
# NOTE(review): `all_Deseqs` is not defined anywhere in the visible code, and
# this line mixes a raw p-value column with an adjusted one -- confirm it is still wanted.
table(rowSums(all_Deseqs[,c("pvalue.x","padj.y")]<0.1))
## 
##     0     1     2 
## 13083 11086  5137
# Human vs chimp: significant with negative, then positive, log2 fold change.
table( res_HS_NHP$hs_pt_Padj<0.1 & res_HS_NHP$hs_pt_LFC<0 )
## 
##  FALSE   TRUE 
## 205878  15321
table( res_HS_NHP$hs_pt_Padj<0.1 & res_HS_NHP$hs_pt_LFC>0 )
## 
##  FALSE   TRUE 
## 202928  20257
# Human vs macaque: significant with negative, then positive, log2 fold change.
table( res_HS_NHP$hs_mm_Padj<0.1 & res_HS_NHP$hs_mm_LFC<0 )
## 
##  FALSE   TRUE 
## 184934  35243
table( res_HS_NHP$hs_mm_Padj<0.1 & res_HS_NHP$hs_mm_LFC>0 )
## 
##  FALSE   TRUE 
## 175875  43810
## export locations of the altered peaks
# Keep only peaks with a defined adjusted p-value in BOTH comparisons.
hs_atac_for_Deseq2_Hs_vs_NHP_filt = hs_atac_for_Deseq2[ which(!is.na(res_HS_NHP$hs_pt_Padj) & ! is.na(res_HS_NHP$hs_mm_Padj)) ]
res_HS_NHP_filt = res_HS_NHP[ which(!is.na(res_HS_NHP$hs_pt_Padj) & ! is.na(res_HS_NHP$hs_mm_Padj)), ]
all(names(hs_atac_for_Deseq2_Hs_vs_NHP_filt)==rownames(res_HS_NHP_filt))
## [1] TRUE
sum( rowSums(cbind(res_HS_NHP_filt$hs_pt_Padj<0.1,res_HS_NHP_filt$hs_mm_Padj<0.1 ))>0 )
## [1] 89050
## -----------------------------
pvalthr=0.1
# Gained in human: higher in human than in chimp AND macaque, both significant.
# The row index is computed once and shared by the ranges and the table. The
# table filter previously read `hs_pt_Padjpvalthr` (missing `<`), which is a
# non-existent column, so gained_ATAC always came out empty.
gained_idx = which( res_HS_NHP_filt$hs_pt_LFC>0 & res_HS_NHP_filt$hs_pt_Padj<pvalthr & res_HS_NHP_filt$hs_mm_LFC>0 & res_HS_NHP_filt$hs_mm_Padj<pvalthr )
gained_ATAC_gr = hs_atac_for_Deseq2_Hs_vs_NHP_filt[ gained_idx ]
gained_ATAC = res_HS_NHP_filt[ gained_idx, ]
gained_ATAC_gr$score=0
export.bed(gained_ATAC_gr,con=paste0(outputs_directory,"gained_ATAC_gr.bed"))
# Lost in human: lower in human than in chimp AND macaque, both significant.
lost_idx = which( res_HS_NHP_filt$hs_pt_LFC<0 & res_HS_NHP_filt$hs_pt_Padj<pvalthr & res_HS_NHP_filt$hs_mm_LFC<0 & res_HS_NHP_filt$hs_mm_Padj<pvalthr )
lost_ATAC_gr = hs_atac_for_Deseq2_Hs_vs_NHP_filt[ lost_idx ]
lost_ATAC = res_HS_NHP_filt[ lost_idx, ]
lost_ATAC_gr$score=0
export.bed(lost_ATAC_gr,con=paste0(outputs_directory,"lost_ATAC_gr.bed"))
# Volcano plot of the human-vs-chimp comparison; peaks significant in both
# comparisons are highlighted in blue.
par(mar=c(5,5,5,5),mfrow=c(1,1))#, cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
topT <- as.data.frame(res_HS_NHP)
# which() drops rows whose adjusted p-value is NA. Indexing the data frame with
# the raw logical vector returned one all-NA row (named "NA", "NA.1", ...) per
# NA, which inflated the count below.
topTsig = rownames(topT)[ which(topT$hs_pt_Padj <=pvalthr & topT$hs_mm_Padj<=pvalthr) ]
# NOTE: the recorded value below was produced before the NA rows were excluded.
length(topTsig)
## [1] 25677
# col is now an argument of plot(); it used to sit outside the plot() call, as
# an extra argument of with(), and was silently ignored.
with(topT, plot(hs_pt_LFC, -log10(hs_pt_Padj), 
                  pch=20, cex=1.0, 
                  xlab=bquote(~Log[2]~fold~change), 
                  ylab=bquote(~-log[10]~Q~value), 
                  xlim=c(-10,10),
                  ylim=c(0,20),
                  col="gray60"))
with(subset(topT, rownames(topT) %in% topTsig), 
       points(hs_pt_LFC, -log10(hs_pt_Padj), pch=20, col="steelblue", cex=0.5))
axis(2,lwd=2)
axis(1,lwd=2)
box(col="black",lwd=2)
First we will consider only regions that do not overlap promoters nor H3K4me3 peaks. Then, we identify the human specific ATAC-seq peaks and remove the peaks that overlap H3K27ac in NHPs. Create a big annotation table for the DORegions between Hs, Pt and Mm.
# All "does not overlap" selections use subsetByOverlaps(invert = TRUE). The
# previous x[-queryHits(findOverlaps(x, y))] form returned an EMPTY object
# whenever there was no overlap at all, because x[-integer(0)] selects nothing.
# Gained peaks overlapping neither an H3K4me3 peak nor a promoter.
human_spe_enhancers = subsetByOverlaps(gained_ATAC_gr, c(hs_me3,promoters_tss_gr), invert = TRUE)
# Gained peaks at promoters with / without a human H3K4me3 peak. A gained peak
# is listed once per overlapping promoter.
human_spe_active_promoters = gained_ATAC_gr[ queryHits(findOverlaps(gained_ATAC_gr, promoters_tss_gr[queryHits(findOverlaps(promoters_tss_gr,hs_me3))]))]
human_spe_inactive_promoters = gained_ATAC_gr[ queryHits(findOverlaps(gained_ATAC_gr, subsetByOverlaps(promoters_tss_gr, hs_me3, invert = TRUE)))]
# Names of peaks without an H3K27ac peak in chimp / macaque (own-genome coordinates).
pt_atac_for_Deseq2_not_H3K27ac = names(subsetByOverlaps(pt_atac_for_Deseq2, pt_k27ac, invert = TRUE))
mm_atac_for_Deseq2_not_H3K27ac = names(subsetByOverlaps(mm_atac_for_Deseq2, mm_k27ac, invert = TRUE))
## HUMAN SPECIFIC ENHANCERS
# Keep candidates lacking H3K27ac in both chimp and macaque.
human_spe_enhancers = human_spe_enhancers[which(names(human_spe_enhancers) %in% pt_atac_for_Deseq2_not_H3K27ac[pt_atac_for_Deseq2_not_H3K27ac %in% mm_atac_for_Deseq2_not_H3K27ac])]
# Split by presence of a human H3K27ac peak (a peak is listed once per overlapping H3K27ac peak).
human_spe_enhancers_with_K27_peak = human_spe_enhancers[queryHits(findOverlaps(human_spe_enhancers,hs_k27ac))]
human_spe_enhancers_without_K27_peak = subsetByOverlaps(human_spe_enhancers, hs_k27ac, invert = TRUE)
length(human_spe_enhancers)
## [1] 9356
# Names of the gained peaks, and the same peaks in chimp / macaque coordinates.
gained_atac_peaks_hs = names(gained_ATAC_gr)
gained_ATAC_gr_pt = pt_atac_for_Deseq2[which(names(pt_atac_for_Deseq2) %in% gained_atac_peaks_hs)]
gained_ATAC_gr_mm = mm_atac_for_Deseq2[which(names(mm_atac_for_Deseq2) %in% gained_atac_peaks_hs)]
# gained_atac_peaks_hs is already a character vector of peak names, so it is
# compared directly. names() of it is NULL, which made the previous check
# all(logical(0)) == TRUE regardless of the actual order.
all(gained_atac_peaks_hs == names(gained_ATAC_gr_pt))
## [1] TRUE
all(gained_atac_peaks_hs == names(gained_ATAC_gr_mm))
## [1] TRUE
# Gained peaks that overlap no promoter. countOverlaps() == 0 stays correct when
# nothing overlaps, unlike the previous negative index on queryHits().
intergenic_gained = gained_atac_peaks_hs[ countOverlaps(gained_ATAC_gr,promoters_tss_gr) == 0 ]
# Per gained peak: number of overlapping ATAC / H3K4me3 / H3K27ac peaks in each
# species (each counted in that species' own coordinates) and of promoters.
gained_ATAC_functional_annotation = data.frame(atac_Hs = countOverlaps(gained_ATAC_gr,hs_atac),
                                               atac_Pt = countOverlaps(gained_ATAC_gr_pt,pt_atac),
                                               atac_Mm = countOverlaps(gained_ATAC_gr_mm,mm_atac),
                                               me3_Hs = countOverlaps(gained_ATAC_gr,hs_me3),
                                               me3_Pt = countOverlaps(gained_ATAC_gr_pt,pt_me3),
                                               me3_Mm = countOverlaps(gained_ATAC_gr_mm,mm_me3),
                                               k27_Hs = countOverlaps(gained_ATAC_gr,hs_k27ac),
                                               k27_Pt = countOverlaps(gained_ATAC_gr_pt,pt_k27ac),
                                               k27_Mm = countOverlaps(gained_ATAC_gr_mm,mm_k27ac),
                                               promoter = countOverlaps(gained_ATAC_gr,promoters_tss_gr),
                                               is_intergenic = gained_atac_peaks_hs %in% intergenic_gained,
                                               row.names = gained_atac_peaks_hs )
colSums(gained_ATAC_functional_annotation>0)
##       atac_Hs       atac_Pt       atac_Mm        me3_Hs        me3_Pt 
##         13108          4253          1792           411           329 
##        me3_Mm        k27_Hs        k27_Pt        k27_Mm      promoter 
##           301          7382          2888          1424           908 
## is_intergenic 
##         12268
par(mfrow=c(1,1))
# Presence/absence matrix of the gained-peak annotation, with rows sorted by
# column 1, then column 2, ... up to column 11.
x <- gained_ATAC_functional_annotation > 0
x <- x[do.call(order, lapply(seq_len(11), function(j) x[, j])), ]
# Recode presence in column k as the value k (absence stays 0), so that each
# column picks its own entry of the colour vector passed to image().
x <- sweep(x, 2, as.numeric(seq_len(ncol(x))), "*")
par(mfrow = c(1, 1), mar = c(7, 1, 1, 1))
heat_cols <- c("white",
               rep("gray80", 3),
               rep('forestgreen', 3),
               rep("coral3", 3),
               "black", "blue4")
image(t(x), col = heat_cols, axes = FALSE, las = 2)
box(col = "black", lwd = 2)
axis(1, at = seq(0, 1, length.out = 11),
     c("atac_human", "atac_chimp", "atac_macaque",
       "me3_human", "me3_chimp", "me3_macaque",
       "k27_human", "k27_chimp", "k27_macaque",
       "promoter", "intergenic"),
     las = 2)
# Vertical separators after the ATAC, H3K4me3, H3K27ac and promoter columns.
abline(v = seq(0, 1, length.out = 11)[c(3, 6, 9, 10)] + 0.05, lwd = 2)
# Strict definition of a gained promoter: at a promoter, ATAC and H3K4me3 in
# human, no ATAC and no H3K4me3 in chimp or macaque, and no macaque H3K27ac.
gained_promoters <- gained_ATAC_functional_annotation[
  with(gained_ATAC_functional_annotation,
       me3_Pt == 0 & me3_Mm == 0 & me3_Hs > 0 &
         atac_Hs > 0 & atac_Pt == 0 & atac_Mm == 0 &
         k27_Mm == 0 & promoter > 0), ]
# Genomic ranges of the gained promoters selected above.
gained_promoters_gr = gained_ATAC_gr[which(names(gained_ATAC_gr) %in% rownames(gained_promoters))]
# Names of the lost peaks, and the same peaks in chimp / macaque coordinates.
lost_atac_peaks_hs = names(lost_ATAC_gr)
lost_ATAC_gr_pt = pt_atac_for_Deseq2[which(names(pt_atac_for_Deseq2) %in% lost_atac_peaks_hs)]
lost_ATAC_gr_mm = mm_atac_for_Deseq2[which(names(mm_atac_for_Deseq2) %in% lost_atac_peaks_hs)]
# lost_atac_peaks_hs is already a character vector of peak names, so it is
# compared directly. names() of it is NULL, which made the previous check
# all(logical(0)) == TRUE regardless of the actual order.
all(lost_atac_peaks_hs == names(lost_ATAC_gr_pt))
## [1] TRUE
all(lost_atac_peaks_hs == names(lost_ATAC_gr_mm))
## [1] TRUE
# Lost peaks that overlap no promoter. countOverlaps() == 0 stays correct when
# nothing overlaps, unlike the previous negative index on queryHits().
intergenic_lost = lost_atac_peaks_hs[ countOverlaps(lost_ATAC_gr,promoters_tss_gr) == 0 ]
# Per lost peak: number of overlapping ATAC / H3K4me3 / H3K27ac peaks in each
# species (each counted in that species' own coordinates) and of promoters.
lost_ATAC_functional_annotation = data.frame(atac_Hs = countOverlaps(lost_ATAC_gr,hs_atac),
                                             atac_Pt = countOverlaps(lost_ATAC_gr_pt,pt_atac),
                                             atac_Mm = countOverlaps(lost_ATAC_gr_mm,mm_atac),
                                             me3_Hs = countOverlaps(lost_ATAC_gr,hs_me3),
                                             me3_Pt = countOverlaps(lost_ATAC_gr_pt,pt_me3),
                                             me3_Mm = countOverlaps(lost_ATAC_gr_mm,mm_me3),
                                             k27_Hs = countOverlaps(lost_ATAC_gr,hs_k27ac),
                                             k27_Pt = countOverlaps(lost_ATAC_gr_pt,pt_k27ac),
                                             k27_Mm = countOverlaps(lost_ATAC_gr_mm,mm_k27ac),
                                             promoter = countOverlaps(lost_ATAC_gr,promoters_tss_gr),
                                             is_intergenic = lost_atac_peaks_hs %in% intergenic_lost,
                                             row.names = lost_atac_peaks_hs )
colSums(lost_ATAC_functional_annotation>0)
##       atac_Hs       atac_Pt       atac_Mm        me3_Hs        me3_Pt 
##           479          2441          3172           145           446 
##        me3_Mm        k27_Hs        k27_Pt        k27_Mm      promoter 
##           505           325          1538          1234           540 
## is_intergenic 
##          2780
We observe a 3 fold over representation of lost than gained promoters in evolution
# Strict definitions: lost promoters carry H3K4me3 and H3K27ac in both chimp and
# macaque; gained promoters carry human H3K4me3 and none of those four marks.
# NOTE(review): both objects are reassigned two lines below, so these strict
# definitions do not affect anything that follows in the visible code.
lost_promoters = lost_ATAC_functional_annotation[lost_ATAC_functional_annotation$promoter>0 & rowSums( lost_ATAC_functional_annotation[,c("me3_Pt","me3_Mm","k27_Pt","k27_Mm")]>0)==4,]
gained_promoters = gained_ATAC_functional_annotation[gained_ATAC_functional_annotation$me3_Hs>0  & gained_ATAC_functional_annotation$promoter>0 & rowSums(gained_ATAC_functional_annotation[,c("me3_Pt","me3_Mm","k27_Pt","k27_Mm")]==0)==4,]
# Definitions used below: every lost / gained peak that overlaps a promoter.
lost_promoters = lost_ATAC_functional_annotation[lost_ATAC_functional_annotation$promoter>0 ,]
gained_promoters = gained_ATAC_functional_annotation[gained_ATAC_functional_annotation$promoter>0 ,]
# Unique gene ids of the promoters overlapped by those lost / gained peaks.
lost_promoters_ensid = unique( promoters_tss_gr[queryHits(findOverlaps(promoters_tss_gr, lost_ATAC_gr[which(names(lost_ATAC_gr) %in% rownames(lost_promoters))] ))]$gene_id )
gained_promoters_ensid = unique( promoters_tss_gr[queryHits(findOverlaps(promoters_tss_gr, gained_ATAC_gr[which(names(gained_ATAC_gr) %in% rownames(gained_promoters) )]))]$gene_id )
length(gained_promoters_ensid)
## [1] 951
length(lost_promoters_ensid)
## [1] 613
any(lost_promoters_ensid %in% gained_promoters_ensid)
## [1] TRUE
# 2x2 table, filled column-wise: column 1 = promoter-overlapping peaks
# (lost, gained); column 2 = all peaks (lost, gained).
# NOTE(review): column 2 holds the totals, which include the promoter peaks of
# column 1 -- confirm that totals rather than non-promoter counts are intended.
m=matrix(c(nrow(lost_promoters),
           nrow(gained_promoters),
           length(lost_ATAC_gr),
           length(gained_ATAC_gr)),2,2)
fisher.test(m)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  m
## p-value < 0.00000000000000022
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  2.103521 2.646321
## sample estimates:
## odds ratio 
##   2.360091
# Store the gained and lost peak ranges for later use.
save(gained_ATAC_gr,file=paste0(objects_directory,"gained_ATAC_gr.RData"))
save(lost_ATAC_gr,file=paste0(objects_directory,"lost_ATAC_gr.RData"))
## ----------------
## Lost putative enhancers: intergenic peaks that do not overlap a TSS, have no
## me3 overlap in any species and no human k27 overlap.
geniune_lost_enhancers = with( lost_ATAC_functional_annotation,
    lost_ATAC_functional_annotation[ is_intergenic>0 & promoter==0 &
        rowSums( cbind(me3_Hs,me3_Pt,me3_Mm) )==0 & k27_Hs==0, ] )
## "Active" subset: k27 overlap present in both Pt and Mm.
geniune_lost_active_enhancers = with( lost_ATAC_functional_annotation,
    lost_ATAC_functional_annotation[ is_intergenic>0 & promoter==0 &
        rowSums( cbind(me3_Hs,me3_Pt,me3_Mm) )==0 & k27_Hs==0 &
        k27_Pt>0 & k27_Mm>0, ] )
## "Poised" subset: no k27 overlap in Pt or Mm either.
geniune_lost_poised_enhancers = with( lost_ATAC_functional_annotation,
    lost_ATAC_functional_annotation[ is_intergenic>0 & promoter==0 &
        rowSums( cbind(me3_Hs,me3_Pt,me3_Mm) )==0 & k27_Hs==0 &
        k27_Pt==0 & k27_Mm==0, ] )
## Matching genomic ranges, selected by peak name.
genuine_lost_enhancers_gr = lost_ATAC_gr[ names(lost_ATAC_gr) %in% rownames(geniune_lost_enhancers) ]
geniune_lost_poised_enhancers_gr = lost_ATAC_gr[ names(lost_ATAC_gr) %in% rownames(geniune_lost_poised_enhancers) ]
geniune_lost_active_enhancers_gr = lost_ATAC_gr[ names(lost_ATAC_gr) %in% rownames(geniune_lost_active_enhancers) ]
save( genuine_lost_enhancers_gr, file=paste0(objects_directory,"genuine_lost_enhancers_gr.RData"))
## ----------------
## Gained putative enhancers: intergenic peaks that do not overlap a TSS, have
## no me3 overlap in any species and no k27 overlap in Pt or Mm.
genuine_gained_enhancers = with( gained_ATAC_functional_annotation,
    gained_ATAC_functional_annotation[ is_intergenic>0 & promoter==0 &
        rowSums( cbind(me3_Hs,me3_Pt,me3_Mm) )==0 &
        k27_Pt==0 & k27_Mm==0, ] )
## "Active" subset: human k27 overlap present.
genuine_gained_active_enhancers = with( gained_ATAC_functional_annotation,
    gained_ATAC_functional_annotation[ is_intergenic>0 & promoter==0 &
        rowSums( cbind(me3_Hs,me3_Pt,me3_Mm) )==0 &
        k27_Pt==0 & k27_Mm==0 & k27_Hs>0, ] )
## "Poised" subset: no human k27 overlap.
genuine_gained_poised_enhancers = with( gained_ATAC_functional_annotation,
    gained_ATAC_functional_annotation[ is_intergenic>0 & promoter==0 &
        rowSums( cbind(me3_Hs,me3_Pt,me3_Mm) )==0 &
        k27_Pt==0 & k27_Mm==0 & k27_Hs==0, ] )
## Matching genomic ranges, selected by peak name.
genuine_gained_enhancers_gr = gained_ATAC_gr[ names(gained_ATAC_gr) %in% rownames(genuine_gained_enhancers) ]
genuine_gained_active_enhancers_gr = gained_ATAC_gr[ names(gained_ATAC_gr) %in% rownames(genuine_gained_active_enhancers) ]
genuine_gained_poised_enhancers_gr = gained_ATAC_gr[ names(gained_ATAC_gr) %in% rownames(genuine_gained_poised_enhancers) ]
## Export with NCBI-style sequence names, then switch the object back to UCSC style.
seqlevelsStyle(genuine_gained_enhancers_gr) = "ncbi"
export.bed( genuine_gained_enhancers_gr,
            con=paste0(outputs_directory,"genuine_gained_enhancers.bed") )
seqlevelsStyle(genuine_gained_enhancers_gr) = "ucsc"
export.bed( genuine_gained_enhancers_gr, con=paste0(outputs_directory,"genuine_gained_enhancers_ucsc.bed"))up_set = HS_UP_Genes$ensembl_id
dn_set = HS_DN_Genes$ensembl_id
promoters_HITS_UP = promoters_filtered_gr[ which( promoters_filtered_gr$gene_id %in% up_set ) ]
promoters_HITS_DN = promoters_filtered_gr[ which( promoters_filtered_gr$gene_id %in% dn_set ) ]length(genuine_gained_enhancers_gr)## [1] 9343length(genuine_lost_enhancers_gr)## [1] 2351promoters_HITS_UP_500 = GenomicRanges::resize(promoters_HITS_UP,1000000,fix="center")
sum(countOverlaps(promoters_HITS_UP_500,genuine_gained_enhancers_gr)>0)## [1] 586sum(countOverlaps(promoters_HITS_UP_500,genuine_gained_active_enhancers_gr)>0)## [1] 460## ----------------
## Promoters of the genes listed in up_set / dn_set.
promoters_HITS_UP = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in% up_set ]
promoters_HITS_DN = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in% dn_set ]
save( promoters_HITS_UP, promoters_HITS_DN,
      file=paste0(objects_directory,"promoters_up_down.RData") )
## Promoters of genes with padj < 0.01 in a single comparison, split by the sign
## of the log2 fold change.
## NOTE(review): the variable names suggest the .x columns are human vs. Pt and
## the .y columns human vs. Mm - confirm against the construction of all_Deseqs.
prom_up_hs_pt = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.x>0 & all_Deseqs$padj.x<0.01 ] ]
prom_dn_hs_pt = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.x<0 & all_Deseqs$padj.x<0.01 ] ]
prom_up_hs_mm = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.y>0 & all_Deseqs$padj.y<0.01 ] ]
prom_dn_hs_mm = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.y<0 & all_Deseqs$padj.y<0.01 ] ]
## ----------------
## Classify enhancers by the distance to the nearest promoter of interest.
## distanceToNearest() returns a Hits object with one row per query range that
## HAS a nearest subject; queries without one are omitted. Row positions in the
## Hits object are therefore not query indices, so the distances are mapped back
## to the enhancers through queryHits() before subsetting.
## Enhancers without any nearest promoter are left out of every group.
genuine_gained_enhancers_that_do_something = local({
    d = distanceToNearest(genuine_gained_enhancers_gr,promoters_HITS_UP)
    genuine_gained_enhancers_gr[ queryHits(d)[ which(elementMetadata(d)[,1]<500000) ] ]
})
genuine_lost_enhancers_that_do_something = local({
    d = distanceToNearest(genuine_lost_enhancers_gr,promoters_HITS_DN)
    genuine_lost_enhancers_gr[ queryHits(d)[ which(elementMetadata(d)[,1]<500000) ] ]
})
## "Do nothing": further than 500 kb from every promoter of a gene that changes
## in any comparison.
genuine_lost_enhancers_that_do_nothing = local({
    d = distanceToNearest(genuine_lost_enhancers_gr,
                          c( prom_up_hs_pt,prom_dn_hs_pt,prom_up_hs_mm,prom_dn_hs_mm,promoters_HITS_UP,promoters_HITS_DN ))
    genuine_lost_enhancers_gr[ queryHits(d)[ which(elementMetadata(d)[,1]>500000) ] ]
})
## verify on elements identified without TAD filtering
enhancers_linked_with_activation = local({
    d = distanceToNearest(genuine_gained_enhancers_gr, promoters_HITS_UP)
    genuine_gained_enhancers_gr[ queryHits(d)[ which(elementMetadata(d)[,1]<500000) ] ]
})
enhancers_not_linked_with_activation = local({
    d = distanceToNearest(genuine_gained_enhancers_gr,
                          c( prom_up_hs_pt,prom_dn_hs_pt,prom_up_hs_mm,prom_dn_hs_mm,promoters_HITS_UP,promoters_HITS_DN ))
    genuine_gained_enhancers_gr[ queryHits(d)[ which(elementMetadata(d)[,1]>500000) ] ]
})
save( enhancers_linked_with_activation, enhancers_not_linked_with_activation,
      file=paste0(objects_directory,"enhancers_functional_groups.RData"))
export.bed(enhancers_linked_with_activation,con=paste0(outputs_directory,"enhancers_linked_with_activation.bed"))
export.bed(enhancers_not_linked_with_activation,con=paste0(outputs_directory,"enhancers_not_linked_with_activation.bed"))
save( genuine_lost_enhancers_that_do_something, genuine_lost_enhancers_that_do_nothing,
      file=paste0(objects_directory,"lost_enhancers_functional_groups.RData"))
export.bed(genuine_lost_enhancers_that_do_something,con=paste0(outputs_directory,"lost_enhancers_linked_with_activation.bed"))
export.bed(genuine_lost_enhancers_that_do_nothing,con=paste0(outputs_directory,"lost_enhancers_not_linked_with_activation.bed") )
## all enhancers
## Human ATAC peaks that overlap neither an me3 peak nor a TSS ...
## (setdiff() on positions instead of a negative index: x[-integer(0)] would
## return an EMPTY object when nothing overlaps.)
enhancers_HS = hs_atac[ setdiff( seq_along(hs_atac),
                                 queryHits(findOverlaps(hs_atac,c(hs_me3,promoters_tss_gr))) ) ]
## ... and that overlap at least one human H3K27ac peak. unique() keeps each
## enhancer once even when it overlaps several H3K27ac peaks.
enhancers_HS = enhancers_HS[ unique(queryHits(findOverlaps(enhancers_HS, hs_k27ac))) ]
## conserved enhancers - very not a lot and do not change significantly and do not overlap any promoter
## i.e. |log2 fold change| < log2(1.5) and adjusted p-value > 0.1 in both comparisons
conserved_enhancers = res_HS_NHP[ abs(res_HS_NHP$hs_pt_LFC)<log2(1.5) & abs(res_HS_NHP$hs_mm_LFC)<log2(1.5) &
                                  res_HS_NHP$hs_pt_Padj>0.1 & res_HS_NHP$hs_mm_Padj>0.1, ]
conserved_enhancers = enhancers_HS[ which(names(enhancers_HS) %in% rownames(conserved_enhancers)) ]
save( conserved_enhancers,
      file=paste0(objects_directory,"conserved_enhancers.RData" ))
export.bed(conserved_enhancers,con=paste0(outputs_directory,"conserved_enhancers.bed"))
Bar graph showing how many enhancers do something
## Promoters changing in a single comparison whose names are not among the
## promoters in promoters_HITS_UP.
up_dn_sep = c(prom_up_hs_pt,prom_dn_hs_pt,prom_up_hs_mm,prom_dn_hs_mm)
up_dn_sep = up_dn_sep[ !( names(up_dn_sep) %in% names(promoters_HITS_UP) ) ]
## EAG: gained enhancers closer than 500 kb to a promoter in promoters_HITS_UP.
## any_DEG: enhancers further than 500 kb from those promoters, minus the
## enhancers not linked with activation.
m = local({
    dist_to_up = elementMetadata(distanceToNearest(genuine_gained_enhancers_gr,promoters_HITS_UP))[,1]
    c( any_DEG = sum( dist_to_up > 500000 ) - length(enhancers_not_linked_with_activation),
       EAG = sum( dist_to_up < 500000 ) )
})
par(mar=c(4,4,4,4),mfrow=c(1,1))
barplot( as.matrix(m), beside = FALSE,
         col=c("#0B6623","steelblue3"),
         ylim=c(0,10000), ylab="Enhancers" )
axis(2,lwd=2)m ## any_DEG     EAG 
##    5219    1443genuine_gained_enhancers_gr = import.bed(paste0(outputs_directory,"genuine_gained_enhancers_ucsc.bed"))
## Per-TAD counts of promoters in promoters_HITS_UP, gained enhancers, and
## promoters changing in any single comparison.
enhancers_TAD_annotation = data.frame( up_genes = countOverlaps(ele_domains$TADs,promoters_HITS_UP),
                                       up_enhancers = countOverlaps(ele_domains$TADs,genuine_gained_enhancers_gr),
                                       genes_Hs_NHP = countOverlaps(ele_domains$TADs,c(prom_up_hs_pt,prom_dn_hs_pt,prom_up_hs_mm,prom_dn_hs_mm)))
## Enhancers in TADs that contain both a promoter from promoters_HITS_UP and a gained enhancer.
enhancers_linked_with_activation_TAD = genuine_gained_enhancers_gr[unique(queryHits(findOverlaps(genuine_gained_enhancers_gr,ele_domains$TADs[which(enhancers_TAD_annotation$up_genes>0 & enhancers_TAD_annotation$up_enhancers>0)])))]
## Enhancers in TADs that contain a gained enhancer but none of the counted promoters.
enhancers_not_linked_with_activation_TAD = genuine_gained_enhancers_gr[unique(queryHits(findOverlaps(genuine_gained_enhancers_gr,ele_domains$TADs[which(enhancers_TAD_annotation$up_genes==0 & enhancers_TAD_annotation$up_enhancers>0 & enhancers_TAD_annotation$genes_Hs_NHP==0)])))]
## Make the two groups mutually exclusive (an enhancer can overlap TADs of both kinds).
enhancers_linked_with_activation_TADs = enhancers_linked_with_activation_TAD[which(!enhancers_linked_with_activation_TAD$name %in% enhancers_not_linked_with_activation_TAD$name)]
enhancers_not_linked_with_activation_TADs = enhancers_not_linked_with_activation_TAD[which(!enhancers_not_linked_with_activation_TAD$name %in% enhancers_linked_with_activation_TAD$name)]
names(enhancers_linked_with_activation_TADs) = enhancers_linked_with_activation_TADs$name
## The original named only the unfiltered *_TAD object; the filtered *_TADs
## object is the one exported, so it is named as well. The *_TAD assignment is
## kept in case later code relies on it.
names(enhancers_not_linked_with_activation_TAD) = enhancers_not_linked_with_activation_TAD$name
names(enhancers_not_linked_with_activation_TADs) = enhancers_not_linked_with_activation_TADs$name
export.bed(enhancers_linked_with_activation_TADs,con=paste0(outputs_directory,"enhancers_linked_with_activation_TADs.bed"))
export.bed(enhancers_not_linked_with_activation_TADs,con=paste0(outputs_directory,"enhancers_not_linked_with_activation_TADs.bed"))
We read in the annotation of TADs from TopDom. We identify DEGs in single comparisons and assess how frequently we see up- and down-regulated genes per TAD.
## Promoters of genes with padj < 0.01 in a single comparison, split by the sign
## of the log2 fold change (.x and .y columns of all_Deseqs).
prom_up_hs_pt = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.x>0 & all_Deseqs$padj.x<0.01 ] ]
prom_dn_hs_pt = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.x<0 & all_Deseqs$padj.x<0.01 ] ]
prom_up_hs_mm = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.y>0 & all_Deseqs$padj.y<0.01 ] ]
prom_dn_hs_mm = promoters_filtered_gr[ promoters_filtered_gr$gene_id %in%
    all_Deseqs$Row.names[ all_Deseqs$log2FoldChange.y<0 & all_Deseqs$padj.y<0.01 ] ]
## One row per TAD: number of overlapping features of each class, plus TAD width.
ele_domains_anno = local({
    tads = ele_domains$TADs
    n_in_tad = function(features) countOverlaps(tads, features)
    data.frame( up_prom = n_in_tad(promoters_HITS_UP),
                dn_prom = n_in_tad(promoters_HITS_DN),
                up_vsPt = n_in_tad(prom_up_hs_pt),
                dn_vsPt = n_in_tad(prom_dn_hs_pt),
                up_vsMm = n_in_tad(prom_up_hs_mm),
                dn_vsMm = n_in_tad(prom_dn_hs_mm),
                number_of_enh_hs = n_in_tad(enhancers_HS),
                genuine_gained_enhancers = n_in_tad(genuine_gained_enhancers_gr),
                genuine_gained_active_enhancers = n_in_tad(genuine_gained_active_enhancers_gr),
                genuine_gained_poised_enhancers = n_in_tad(genuine_gained_poised_enhancers_gr),
                genuine_lost_enhancers = n_in_tad(genuine_lost_enhancers_gr),
                geniune_lost_active_enhancers = n_in_tad(geniune_lost_active_enhancers_gr),
                geniune_lost_poised_enhancers = n_in_tad(geniune_lost_poised_enhancers_gr),
                gained_enhancers_that_do_sth = n_in_tad(genuine_gained_enhancers_that_do_something),
                lost_enhancers_that_do_sth = n_in_tad(genuine_lost_enhancers_that_do_something),
                prom_number = n_in_tad(promoters_filtered_gr),
                me3_number = n_in_tad(hs_me3),
                size = width(tads) )
})
length(unique(queryHits(findOverlaps(promoters_HITS_UP,ele_domains$TADs))))## [1] 555length(unique(queryHits(findOverlaps(promoters_HITS_DN,ele_domains$TADs))))## [1] 447tads_with_up_gene = ele_domains_anno[ele_domains_anno$up_prom>0 & ele_domains_anno$dn_prom==0,]
tads_with_dn_gene = ele_domains_anno[ele_domains_anno$dn_prom>0 & ele_domains_anno$up_prom==0,]
colSums(tads_with_up_gene>0)##                         up_prom                         dn_prom 
##                             382                               0 
##                         up_vsPt                         dn_vsPt 
##                             382                              72 
##                         up_vsMm                         dn_vsMm 
##                             382                              83 
##                number_of_enh_hs        genuine_gained_enhancers 
##                             349                             231 
## genuine_gained_active_enhancers genuine_gained_poised_enhancers 
##                             154                             176 
##          genuine_lost_enhancers   geniune_lost_active_enhancers 
##                              92                              19 
##   geniune_lost_poised_enhancers    gained_enhancers_that_do_sth 
##                              55                             228 
##      lost_enhancers_that_do_sth                     prom_number 
##                               8                             382 
##                      me3_number                            size 
##                             366                             382colSums(tads_with_dn_gene>0)##                         up_prom                         dn_prom 
##                               0                             365 
##                         up_vsPt                         dn_vsPt 
##                              38                             365 
##                         up_vsMm                         dn_vsMm 
##                              83                             365 
##                number_of_enh_hs        genuine_gained_enhancers 
##                             335                             183 
## genuine_gained_active_enhancers genuine_gained_poised_enhancers 
##                              99                             133 
##          genuine_lost_enhancers   geniune_lost_active_enhancers 
##                              98                              20 
##   geniune_lost_poised_enhancers    gained_enhancers_that_do_sth 
##                              50                              12 
##      lost_enhancers_that_do_sth                     prom_number 
##                              93                             365 
##                      me3_number                            size 
##                             353                             365sum( rowSums(ele_domains_anno[,c("up_prom","dn_prom")]>0)==2 )## [1] 33sum( rowSums(ele_domains_anno[,c("up_prom","dn_prom")]>0)==1 & rowSums(ele_domains_anno[,c("up_prom","dn_prom")])>1)## [1] 124## compute co-occurence of up and down regulated genes in TADs
## Count rows of `x` by the presence (> 0) or absence (== 0) of two columns.
##   x    : data.frame or matrix of counts
##   col1 : name or index of the first column
##   col2 : name or index of the second column
## Returns a named vector c(FirstOnly, SecondOnly, Both) with the number of rows
## where only col1, only col2, or both columns are positive.
getStats = function(x,col1,col2){
  first  = x[,col1]
  second = x[,col2]
  c( FirstOnly  = sum( first>0 & second==0 ),
     SecondOnly = sum( first==0 & second>0 ),
     Both       = sum( first>0 & second>0 ) )
}
par(mfrow=c(1,1),mar=c(8,5,1,1))
## One row per comparison: number of TADs containing only up-regulated, only
## down-regulated, or both kinds of promoters.
m = do.call( rbind, lapply(
        list( c("up_prom","dn_prom"),
              c("up_vsPt","dn_vsPt"),
              c("up_vsMm","dn_vsMm") ),
        function(col_pair) getStats( ele_domains_anno, col_pair[1], col_pair[2] ) ) )
## Stacked bars: one bar per comparison, one segment per category.
barplot( t(m), beside=FALSE,
         col=c("green4","wheat3","gray60"),
         names=c("Hs vs NHP","Hs vs. Pt","Hs vs. Mm"),
         las=2, ylab="EAG" )
axis(2,lwd=2,las=2)
Figure showing how many genuine gained enhancers there are per domain. First of all, there are many domains that feature only gained enhancers and no upregulated EAG. There are few domains where I do not see a gained enhancer despite the presence of an upregulated EAG. We see both an upregulated EAG and a gained DOR in 253 TADs.
getStats(ele_domains_anno,
         which(colnames(ele_domains_anno)=="genuine_gained_enhancers"),
         which(colnames(ele_domains_anno)=="up_prom"))##  FirstOnly SecondOnly       Both 
##       3420        162        253hist(ele_domains_anno[ele_domains_anno$up_prom>0,"genuine_gained_enhancers"],n=14,
     main="",col="green4",xlab="Number of gained putative enhancers",ylim=c(0,300))
axis(1,lwd=2)
axis(2,lwd=2)
The majority of TADs with an upregulated EAG also contain a gained enhancer (253 of 415)!
sum(ele_domains_anno$genuine_gained_enhancers>0)## [1] 3673sum(ele_domains_anno[ele_domains_anno$up_prom>0,"genuine_gained_enhancers"]>0)## [1] 253sum(ele_domains_anno$up_prom>0)## [1] 415gained_enhancers_in_any_comp = hs_atac_for_Deseq2_Hs_vs_NHP_filt[ which( res_HS_NHP_filt$hs_pt_LFC>0 & res_HS_NHP_filt$hs_pt_Padj<pvalthr | res_HS_NHP_filt$hs_mm_LFC>0 & res_HS_NHP_filt$hs_mm_Padj<pvalthr) ]
## Drop peaks overlapping an me3 peak or a TSS. setdiff() on positions is used
## instead of a negative index: x[-integer(0)] would return an EMPTY object when
## nothing overlaps.
gained_enhancers_in_any_comp = gained_enhancers_in_any_comp[ setdiff( seq_along(gained_enhancers_in_any_comp),
    queryHits(findOverlaps(gained_enhancers_in_any_comp,c(hs_me3,promoters_tss_gr))) ) ]
## Keep peaks whose names appear in both the Pt and the Mm "not_H3K27ac" name sets.
gained_enhancers_in_any_comp = gained_enhancers_in_any_comp[which(names(gained_enhancers_in_any_comp) %in% pt_atac_for_Deseq2_not_H3K27ac[pt_atac_for_Deseq2_not_H3K27ac %in% mm_atac_for_Deseq2_not_H3K27ac])]
sum( countOverlaps(ele_domains$TADs,gained_enhancers_in_any_comp)>0 & ele_domains_anno$up_prom>0  )## [1] 358sum( countOverlaps(ele_domains$TADs,gained_enhancers_in_any_comp)>0 & rowSums( ele_domains_anno[,c("up_vsPt", "up_vsMm")]>0)>0  )## [1] 1607Overall number of human gained enhancers as compared to chimps and macaques - is it explaining the fact that the log fold change in the human lineage is more pronounced when compared to macaques?
all(names(hs_atac_for_Deseq2)==rownames(res_HS_NHP))## [1] TRUEproms_in_tads_wo_DORs = promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom>0 & ele_domains_anno$genuine_gained_enhancers==0)]))]
## Up-regulated promoters grouped by the number of gained enhancers in their TAD.
## The bins match the plot labels and the cut() breaks below: 1, 2-3, >3.
proms_in_tads_with_1_DORs = promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom>0 & ele_domains_anno$genuine_gained_enhancers==1)]))]
proms_in_tads_with_many_DORs = promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom>0 & ele_domains_anno$genuine_gained_enhancers>1 & ele_domains_anno$genuine_gained_enhancers<4)]))]
## ">3" bin: the previous threshold (>4) dropped TADs with exactly 4 enhancers.
proms_in_tads_with_very_many_DORs = promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom>0 & ele_domains_anno$genuine_gained_enhancers>3)]))]
## Peaks with positive fold change and padj < 0.1 in each single comparison ...
DORs_gained_Hs_Pt = hs_atac_for_Deseq2[which(res_HS_NHP$hs_pt_LFC>0 & res_HS_NHP$hs_pt_Padj<0.1)]
DORs_gained_Hs_Mm = hs_atac_for_Deseq2[which(res_HS_NHP$hs_mm_LFC>0 & res_HS_NHP$hs_mm_Padj<0.1)]
## ... excluding those overlapping a TSS, an me3 peak or a gained ATAC peak.
## setdiff() on positions instead of a negative index: x[-integer(0)] would
## return an EMPTY object when nothing overlaps.
DORs_gained_Hs_Pt = DORs_gained_Hs_Pt[ setdiff( seq_along(DORs_gained_Hs_Pt),
    queryHits(findOverlaps(DORs_gained_Hs_Pt,c(promoters_tss_gr,hs_me3,gained_ATAC_gr))) ) ]
DORs_gained_Hs_Mm = DORs_gained_Hs_Mm[ setdiff( seq_along(DORs_gained_Hs_Mm),
    queryHits(findOverlaps(DORs_gained_Hs_Mm,c(promoters_tss_gr,hs_me3,gained_ATAC_gr))) ) ]
## Per-TAD counts and the enhancer-number bin of every TAD (0, 1, 2-3, >3).
ele_domains_anno$DORs_gained_Hs_Pt = countOverlaps(ele_domains$TADs,DORs_gained_Hs_Pt)
ele_domains_anno$DORs_gained_Hs_Mm = countOverlaps(ele_domains$TADs,DORs_gained_Hs_Mm)
ele_domains_anno$DORs_anno = cut(ele_domains_anno$genuine_gained_enhancers,c(-Inf,0,1,3,1000))
par(mfrow=c(2,2),mar=c(3,2,1,2))
## Log2 fold changes of up-regulated genes, grouped by the number of gained
## enhancers in their TAD; one panel for the .x and one for the .y comparison.
local({
    prom_groups = list( proms_in_tads_wo_DORs,
                        proms_in_tads_with_1_DORs,
                        proms_in_tads_with_many_DORs,
                        proms_in_tads_with_very_many_DORs )
    for( lfc in list( all_Deseqs$log2FoldChange.x, all_Deseqs$log2FoldChange.y ) ){
        boxplot( lapply( prom_groups,
                         function(grp) lfc[ all_Deseqs$Row.names %in% names(grp) ] ),
                 notch=TRUE, ylim=c(0,12), col="white",
                 border=colorRampPalette(c("steelblue","green4"))(4),
                 names=c("0","1","2-3",">3") )
        axis(1,lwd=2)
        axis(2,lwd=2)
        box(col="black",lwd=2)
    }
})
boxplot(split(ele_domains_anno$DORs_gained_Hs_Pt,ele_domains_anno$DORs_anno),ylim=c(0,30),notch=TRUE, col="white",border=colorRampPalette(c("steelblue","green4"))(4),names=c("0","1","2-3",">3"))## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, : some
## notches went outside hinges ('box'): maybe set notch=FALSEboxplot(split(ele_domains_anno$DORs_gained_Hs_Mm,ele_domains_anno$DORs_anno),ylim=c(0,30),notch=TRUE, col="white",border=colorRampPalette(c("steelblue","green4"))(4),names=c("0","1","2-3",">3"))DORs_lost_Hs_Pt = hs_atac_for_Deseq2[which(res_HS_NHP$hs_pt_LFC<0 & res_HS_NHP$hs_pt_Padj<0.1)]
## Peaks with negative fold change and padj < 0.1 vs. Mm (the Pt counterpart is
## defined just above) ...
DORs_lost_Hs_Mm = hs_atac_for_Deseq2[which(res_HS_NHP$hs_mm_LFC<0 & res_HS_NHP$hs_mm_Padj<0.1)]
## ... excluding those overlapping a TSS, an me3 peak or a lost ATAC peak.
## setdiff() on positions instead of a negative index: x[-integer(0)] would
## return an EMPTY object when nothing overlaps.
DORs_lost_Hs_Pt = DORs_lost_Hs_Pt[ setdiff( seq_along(DORs_lost_Hs_Pt),
    queryHits(findOverlaps(DORs_lost_Hs_Pt,c(promoters_tss_gr,hs_me3,lost_ATAC_gr))) ) ]
DORs_lost_Hs_Mm = DORs_lost_Hs_Mm[ setdiff( seq_along(DORs_lost_Hs_Mm),
    queryHits(findOverlaps(DORs_lost_Hs_Mm,c(promoters_tss_gr,hs_me3,lost_ATAC_gr))) ) ]
ele_domains_anno$DORs_lost_Hs_Pt = countOverlaps(ele_domains$TADs,DORs_lost_Hs_Pt)
ele_domains_anno$DORs_lost_Hs_Mm = countOverlaps(ele_domains$TADs,DORs_lost_Hs_Mm)
## Re-bin the TADs by their number of LOST enhancers (0, 1, 2-3, >3); this
## overwrites the gained-enhancer bins stored in DORs_anno.
ele_domains_anno$DORs_anno = cut(ele_domains_anno$genuine_lost_enhancers,c(-Inf,0,1,3,1000))
### 
## Down-regulated promoters grouped by the number of lost enhancers in their TAD.
proms_in_tads_wo_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers==0)]))]
proms_in_tads_with_1_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers==1)]))]
## "2-3" bin: the upper bound must use the LOST enhancer count as well (it
## previously tested genuine_gained_enhancers).
proms_in_tads_with_many_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers>1 & ele_domains_anno$genuine_lost_enhancers<4)]))]
proms_in_tads_with_very_many_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers>3)]))]
par(mfrow=c(2,2),mar=c(3,2,1,1))
## Log2 fold changes of down-regulated genes, grouped by the number of lost
## enhancers in their TAD; one panel for the .x and one for the .y comparison.
local({
    prom_groups = list( proms_in_tads_wo_lost_DORs,
                        proms_in_tads_with_1_lost_DORs,
                        proms_in_tads_with_many_lost_DORs,
                        proms_in_tads_with_very_many_lost_DORs )
    for( lfc in list( all_Deseqs$log2FoldChange.x, all_Deseqs$log2FoldChange.y ) ){
        boxplot( lapply( prom_groups,
                         function(grp) lfc[ all_Deseqs$Row.names %in% names(grp) ] ),
                 notch=FALSE, ylim=c(-12,2), col="white",
                 border=colorRampPalette(c("black","red"))(4),
                 names=c("0","1","2-3",">3") )
    }
})
## Number of DORs lost vs. Pt per TAD, split by the TAD's lost-enhancer bin.
boxplot( split(ele_domains_anno$DORs_lost_Hs_Pt, ele_domains_anno$DORs_anno),
         ylim=c(0,30), notch=TRUE, col="white",
         border=colorRampPalette(c("black","red"))(4),
         names=c("0","1","2-3",">3") )
boxplot(split(ele_domains_anno$DORs_lost_Hs_Mm,ele_domains_anno$DORs_anno),ylim=c(0,30),notch=TRUE, 
        col="white",border=colorRampPalette(c("black","red"))(4),
        names=c("0","1","2-3",">3"))phastCons = readRDS(paste0(objects_directory,'phastCons30way_signal_in_5bp_bins_for_all_ATAC_peaks_500Kb_around_summit.Rds'))
par(mfrow=c(1,1),mar=c(5,5,1,1))
## Mean phastCons profile (column means over the rows whose names match each
## enhancer group), drawn on a 200-point axis from -500 to 500.
local({
    summit_offset = seq(-500,500,length.out=200)
    mean_profile = function(group_gr){
        colMeans( phastCons[ rownames(phastCons) %in% names(group_gr), ] )
    }
    plot( summit_offset, mean_profile(conserved_enhancers),
          type="l", col="black", lwd=3,
          ylim=c(0.0,0.4), xlim=c(-500,500),
          ylab="PhastCons",
          xlab="distance from ATAC-seq peak summit" )
    lines( summit_offset, mean_profile(enhancers_linked_with_activation),
           type="l", col="turquoise4", lwd=3 )
    lines( summit_offset, mean_profile(enhancers_not_linked_with_activation),
           type="l", col="gray80", lwd=3 )
    lines( summit_offset, mean_profile(genuine_lost_enhancers_gr),
           type="l", col="red", lwd=3 )
})
axis(1,lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)
abline(v=0,lwd=2,lty=2,col="gray")t.test(phastCons[rownames(phastCons) %in% names(conserved_enhancers),95],
       phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation),95])## 
##  Welch Two Sample t-test
## 
## data:  phastCons[rownames(phastCons) %in% names(conserved_enhancers), 95] and phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation), 95]
## t = 18.66, df = 3400.3, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1738891 0.2147229
## sample estimates:
## mean of x mean of y 
## 0.3625148 0.1682088t.test(phastCons[rownames(phastCons) %in% names(enhancers_not_linked_with_activation),95],
       phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation),95])## 
##  Welch Two Sample t-test
## 
## data:  phastCons[rownames(phastCons) %in% names(enhancers_not_linked_with_activation), 95] and phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation), 95]
## t = 4.7247, df = 3272.1, p-value = 0.000002401
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.02917625 0.07056904
## sample estimates:
## mean of x mean of y 
## 0.2180814 0.1682088DOR_Deseq2 = merge(as.data.frame(res_HS_PT),
                   as.data.frame(res_HS_MM),
                   by="row.names")
## all putative enhancers
seqlevelsStyle(hs_atac) = "UCSC"
## Human ATAC peaks overlapping neither a TSS nor an me3 peak. setdiff() on
## positions is used instead of a negative index: x[-integer(0)] would return an
## EMPTY object when nothing overlaps.
all_primate_enhancers = hs_atac[ setdiff( seq_along(hs_atac),
                                          queryHits(findOverlaps(hs_atac,c(promoters_tss_gr,hs_me3))) ) ]
## DESeq2 results (.x / .y = first / second table merged into DOR_Deseq2) for those peaks ...
changed_enhancers_ATAC_signal_change = DOR_Deseq2[DOR_Deseq2$Row.names %in% names(all_primate_enhancers),]
## ... with an adjusted p-value available in both comparisons ...
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[ !is.na(changed_enhancers_ATAC_signal_change$padj.x) &
                                                                             !is.na(changed_enhancers_ATAC_signal_change$padj.y),]
## ... and significant in at least one of them.
## NOTE(review): the cutoff is sqrt(0.1) (about 0.316), not 0.1 - confirm this is intended.
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[ changed_enhancers_ATAC_signal_change$padj.x<sqrt(0.1) | changed_enhancers_ATAC_signal_change$padj.y<sqrt(0.1), ]
## Drop rows whose two fold changes are exactly equal.
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[! changed_enhancers_ATAC_signal_change$log2FoldChange.x==changed_enhancers_ATAC_signal_change$log2FoldChange.y,]
par(mfrow=c(1,1),mar=c(4,4,1,1))
## Absolute log2 fold changes in the two comparisons, outliers hidden.
boxplot(abs(changed_enhancers_ATAC_signal_change$log2FoldChange.x),
        abs(changed_enhancers_ATAC_signal_change$log2FoldChange.y), 
        outline=FALSE, ylab=expression("Human/NHP [log"[2]*")]"),
        names=c("Hs vs. Pt","Hs vs. Mm"),
        col="white",border=c("red","blue"),lwd=2,ylim=c(0,7))
axis(1,lwd=2, at=c(1,2),labels=c("Hs vs. Pt","Hs vs. Mm"))
axis(2,lwd=2)
box(col="black",lwd=2)par(mfrow=c(1,1),pty='s')
## Density scatter of the two fold changes. DOR_Deseq2 merges res_HS_PT first
## and res_HS_MM second, so log2FoldChange.x (x axis) is the human/chimp
## comparison and log2FoldChange.y (y axis) the human/macaque comparison; the
## axis labels were previously swapped.
heatscatter( changed_enhancers_ATAC_signal_change$log2FoldChange.x,
             changed_enhancers_ATAC_signal_change$log2FoldChange.y,
             colpal="blues",cex=0.5, 
             xlab=expression("Human/chimp [log"[2]*")]"),
             ylab=expression("Human/macaque [log"[2]*")]"),
             ylim=c(-10,10),
             xlim=c(-10,10))
axis(1,lwd=2)
axis(2,lwd=2)Regulomes are less correlated than transcriptomes
cor.test(res_HS_NHP$hs_pt_LFC[res_HS_NHP$hs_pt_Padj<0.1 & res_HS_NHP$hs_mm_Padj<0.1],
         res_HS_NHP$hs_mm_LFC[res_HS_NHP$hs_pt_Padj<0.1 & res_HS_NHP$hs_mm_Padj<0.1] )## 
##  Pearson's product-moment correlation
## 
## data:  res_HS_NHP$hs_pt_LFC[res_HS_NHP$hs_pt_Padj < 0.1 & res_HS_NHP$hs_mm_Padj < 0.1] and res_HS_NHP$hs_mm_LFC[res_HS_NHP$hs_pt_Padj < 0.1 & res_HS_NHP$hs_mm_Padj < 0.1]
## t = 241.55, df = 17201, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8753683 0.8821743
## sample estimates:
##      cor 
## 0.878816cor.test(all_Deseqs$log2FoldChange.x[all_Deseqs$padj.x<0.01 & all_Deseqs$pvalue.y<0.01],
         all_Deseqs$log2FoldChange.y[all_Deseqs$padj.x<0.01 & all_Deseqs$pvalue.y<0.01] )## 
##  Pearson's product-moment correlation
## 
## data:  all_Deseqs$log2FoldChange.x[all_Deseqs$padj.x < 0.01 & all_Deseqs$pvalue.y < 0.01] and all_Deseqs$log2FoldChange.y[all_Deseqs$padj.x < 0.01 & all_Deseqs$pvalue.y < 0.01]
## t = 49.93, df = 1680, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7529456 0.7914822
## sample estimates:
##       cor 
## 0.7729257expressionRNA = cor.test(log_fold_dat$HSvPT_lfc_shrunk,log_fold_dat$HSvMM_lfc, conf.level = 0.99)
## Correlation of the two ATAC fold changes, with a 99% confidence interval.
regulomeATAC = cor.test(changed_enhancers_ATAC_signal_change$log2FoldChange.x,
                        changed_enhancers_ATAC_signal_change$log2FoldChange.y,
                        conf.level = 0.99)
par(pty="m")
## Bar per correlation estimate (expression, ATAC).
barplot( c(expressionRNA$estimate, regulomeATAC$estimate),
         names=c("gene expression","ATAC"),
         col=c('green4','steelblue'),
         ylim=c(0,1) )
## Confidence intervals drawn as vertical segments at x = 0.7 and x = 1.9.
invisible( Map( function(bar_x, ct) segments( bar_x, ct$conf.int[[1]], bar_x, ct$conf.int[[2]] ),
                c(0.7, 1.9),
                list(expressionRNA, regulomeATAC) ) )
axis(2,lwd=2)linked_500 = GenomicRanges::resize(enhancers_linked_with_activation,1000000,fix="center")
not_linked_500 = GenomicRanges::resize(enhancers_not_linked_with_activation,1000000,fix="center")
sum(countOverlaps(linked_500,promoters_filtered_gr)>0)
## [1] 1443
sum(countOverlaps(not_linked_500,promoters_filtered_gr)>0)
## [1] 2675
Enhancers that do something are more open
# Import the human ATAC-seq and H3K27ac bigWig tracks and switch their
# sequence names to UCSC style.
atac_hs_bw = import.bw(paste0(outputs_directory,"ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_RPGC.bw"))
k27_hs_bw = import.bw(paste0(outputs_directory,"ChIP_Seq_H3K27ac_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_RPGC.bw"))
seqlevelsStyle(atac_hs_bw) = "ucsc"
seqlevelsStyle(k27_hs_bw) = "ucsc"
# Read both enhancer sets and name each range by its BED name column.
linked_GR = import.bed(paste0(outputs_directory,"enhancers_linked_with_activation.bed"))
names(linked_GR) = linked_GR$name
not_linked_GR = import.bed(paste0(outputs_directory,"enhancers_not_linked_with_activation.bed"))
names(not_linked_GR) = not_linked_GR$name
# GetAPRangesForGenomicRangesObject() and getSignalInBins() are defined
# elsewhere in the project.
# NOTE(review): the plots below assume each signal matrix has 200 columns
# (one per x position) - confirm against getSignalInBins().
linked_GR_AP = GetAPRangesForGenomicRangesObject(linked_GR)
not_linked_GR_AP = GetAPRangesForGenomicRangesObject(not_linked_GR)
linked_atac_hs = getSignalInBins( linked_GR_AP, atac_hs_bw, 1 )
linked_k27_hs = getSignalInBins( linked_GR_AP, k27_hs_bw, 1 )
not_linked_atac_hs = getSignalInBins( not_linked_GR_AP, atac_hs_bw, 1 )
not_linked_k27_hs = getSignalInBins( not_linked_GR_AP, k27_hs_bw, 1 )
par(mfrow=c(1,2),mar=c(5,5,1,1),pty="m")
# x positions shared by all four mean-signal profiles
profile_x = seq(-1000,1000,length.out=200)
# Left panel: mean ATAC-seq signal, linked (turquoise) vs not linked (gray)
plot(profile_x,
     colMeans(linked_atac_hs),col="turquoise4",type='l',lwd=2,
     xlab="Distance from the DOR summit",ylab="ATAC-seq signal (RPGC)")
lines(profile_x,
      colMeans(not_linked_atac_hs),col="gray",lwd=2)
axis(1,lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)
# Right panel: mean H3K27ac signal for the same two groups
plot(profile_x,
     colMeans(linked_k27_hs),col="turquoise4",type='l',lwd=2,
     xlab="Distance from the DOR summit",ylab="H3K27ac ChIP-seq signal (RPGC)",ylim=c(0,6))
lines(profile_x,
      colMeans(not_linked_k27_hs),col="gray",lwd=2)
axis(1,lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)
Names of all the TFs in Hocomoco database.
# File names of all TF motif BED files: the base directory plus its
# "A-kopia", "A-kopia2" ... "A-kopia6" sibling directories.
AllTFs = c(list.files('~/human_beds/A'), 
           unlist( lapply( as.list(paste0("~/human_beds/A-kopia",c('',2,3,4,5,6))), function(x){list.files(x)} ) ) )
# Remove the literal ".bed" extension. The pattern is anchored and the dot is
# escaped: strsplit(x, ".bed") treats "." as "any character", so it would also
# split inside a name containing "<char>bed" and could return more pieces than
# there are files.
tf_ids = sub("\\.bed$", "", AllTFs)
TFs = unique( tf_ids )
TFsEnsemblG = read.delim( file=paste0(outputs_directory,'TFsymbol_fixed.txt'),as.is=TRUE )
# Ensembl gene id for each curated symbol (NA when the symbol is not in genemapu)
TFsEnsemblG$eig = genemapu$ensembl_gene_id[match(TFsEnsemblG$Fixed,genemapu$hgnc_symbol)]
# NOTE(review): the assignments below assume TFsymbol_fixed.txt has one row per
# entry of AllTFs, in the same order - confirm.
TFsEnsemblG$names = AllTFs
TFsEnsemblG$names2 = tf_ids
TFsEnsemblG$names3 = paste0( tf_ids, "_HG38.bed" )
save(TFsEnsemblG,file=paste0(objects_directory,"TFsEnsemblG.RData"))
Load objects
# Path builders for the two project directories used in this chunk.
in_outputs = function(file_name) paste0(outputs_directory,file_name)
in_objects = function(file_name) paste0(objects_directory,file_name)

# Tables and saved R objects
human_stripe_factors = read.delim(in_outputs("human_stripe_factors.txt"),header=FALSE,as.is=TRUE)
load(in_objects("TFsEnsemblG.RData"))
load(in_objects("enhancers_functional_groups.RData"))
load(in_objects("conserved_enhancers.RData"))
# conserved_enhancers is written out as BED right after being loaded
export.bed( conserved_enhancers, con=in_outputs("conserved_enhancers.bed"))
genuine_gained_enhancers_gr = import.bed(in_outputs("genuine_gained_enhancers_ucsc.bed"))
load(in_objects("lost_enhancers_functional_groups.RData"))

# Lost-enhancer groups
genuine_lost_enhancers_that_do_something = import.bed(in_outputs("lost_enhancers_linked_with_activation.bed"))
genuine_lost_enhancers_that_do_nothing = import.bed(in_outputs("lost_enhancers_not_linked_with_activation.bed"))

# TAD-based enhancer groups; the first one gets its ranges named by the BED name column
enhancers_linked_with_activation_TADs = import.bed(in_outputs("enhancers_linked_with_activation_TADs.bed"))
enhancers_not_linked_with_activation_TADs = import.bed(in_outputs("enhancers_not_linked_with_activation_TADs.bed"))
names(enhancers_linked_with_activation_TADs) = enhancers_linked_with_activation_TADs$name
names(enhancers_not_linked_with_activation_TADs) = enhancers_not_linked_with_activation_TADs$name
Align the chosen enhancer groups to chimp
cd ~/Documents/Tools/
## human to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_PanTro.unmapped.file
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_PanTro.unmapped.file
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_PanTro.unmapped.file
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_PanTro.unmapped.file
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_PanTro.unmapped.file
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_with_activation.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_with_activation_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_with_activation_PanTro.unmapped.file
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_with_activation.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_with_activation_PanTro.bed /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_with_activation_PanTro.unmapped.file
Get sequences for the enhancers
# Read the lifted-over (PanTro) BED files. Every call uses the same chromosome
# list and the same third positional argument (4); readBed_filterChroms() is
# defined elsewhere in the project.
pt_chroms = paste0('chr',c(1,'2A','2B', 3:22,'X'))
read_pt_bed = function(bed_file) {
  readBed_filterChroms(paste0(outputs_directory,bed_file), chroms=pt_chroms, 4)
}

enhancers_linked_with_activation_pt = read_pt_bed('enhancers_linked_with_activation_PanTro.bed')
enhancers_not_linked_with_activation_pt = read_pt_bed('enhancers_not_linked_with_activation_PanTro.bed')
enhancers_linked_with_activation_TADs_pt = read_pt_bed('enhancers_linked_with_activation_TADs_PanTro.bed')
enhancers_not_linked_with_activation_TADs_pt = read_pt_bed('enhancers_not_linked_with_activation_TADs_PanTro.bed')
conserved_enhancers_pt = read_pt_bed('conserved_enhancers_PanTro.bed')
lost_linked_pt = read_pt_bed('lost_enhancers_linked_with_activation_PanTro.bed')
lost_not_linked_pt = read_pt_bed('lost_enhancers_not_linked_with_activation_PanTro.bed')
## checks
# Put each human set into the order of its lifted-over chimp counterpart and
# confirm that the names line up. Lines starting with `##` are recorded output.
# NOTE(review): enhancers_not_linked_with_activation is only checked, not
# re-ordered, and conserved_enhancers is neither re-ordered nor checked against
# conserved_enhancers_pt - confirm that is intended.
enhancers_linked_with_activation = enhancers_linked_with_activation[match(names(enhancers_linked_with_activation_pt),names(enhancers_linked_with_activation))]
all(names(enhancers_linked_with_activation_pt)==names(enhancers_linked_with_activation))
## [1] TRUE
all(names(enhancers_not_linked_with_activation_pt)==names(enhancers_not_linked_with_activation))
## [1] TRUE
genuine_lost_enhancers_that_do_something = genuine_lost_enhancers_that_do_something[match(names(lost_linked_pt),genuine_lost_enhancers_that_do_something$name)]
genuine_lost_enhancers_that_do_nothing = genuine_lost_enhancers_that_do_nothing[match(names(lost_not_linked_pt),genuine_lost_enhancers_that_do_nothing$name)]
all(genuine_lost_enhancers_that_do_something$name==names(lost_linked_pt))
## [1] TRUE
all(genuine_lost_enhancers_that_do_nothing$name==names(lost_not_linked_pt))
## [1] TRUE
names(genuine_lost_enhancers_that_do_something) = genuine_lost_enhancers_that_do_something$name
names(genuine_lost_enhancers_that_do_nothing) = genuine_lost_enhancers_that_do_nothing$name
enhancers_linked_with_activation_TADs = enhancers_linked_with_activation_TADs[match(names(enhancers_linked_with_activation_TADs_pt),names(enhancers_linked_with_activation_TADs))]
enhancers_not_linked_with_activation_TADs = enhancers_not_linked_with_activation_TADs[match(names(enhancers_not_linked_with_activation_TADs_pt),names(enhancers_not_linked_with_activation_TADs))]
all(names(enhancers_linked_with_activation_TADs)==names(enhancers_linked_with_activation_TADs_pt))
## [1] TRUE
all(names(enhancers_not_linked_with_activation_TADs)==names(enhancers_not_linked_with_activation_TADs_pt))
## [1] TRUE
# Extract the human (hg38) and chimp (panTro6) sequences for every group.
enhancers_linked_with_activation_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,enhancers_linked_with_activation)
enhancers_linked_with_activation_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,enhancers_linked_with_activation_pt)
enhancers_not_linked_with_activation_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,enhancers_not_linked_with_activation)
enhancers_not_linked_with_activation_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,enhancers_not_linked_with_activation_pt)
enhancers_linked_with_activation_TADs_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,enhancers_linked_with_activation_TADs)
enhancers_linked_with_activation_TADs_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,enhancers_linked_with_activation_TADs_pt)
enhancers_not_linked_with_activation_TADs_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,enhancers_not_linked_with_activation_TADs)
enhancers_not_linked_with_activation_TADs_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,enhancers_not_linked_with_activation_TADs_pt)
conserved_enhancers_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,conserved_enhancers)
conserved_enhancers_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,conserved_enhancers_pt)
lost_enhancers_linked_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,genuine_lost_enhancers_that_do_something)
lost_enhancers_linked_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,lost_linked_pt)
lost_enhancers_not_linked_seq_Hs = Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,genuine_lost_enhancers_that_do_nothing)
lost_enhancers_not_linked_seq_Pt = Biostrings::getSeq(BSgenome.Ptroglodytes.UCSC.panTro6,lost_not_linked_pt)
Now let’s find the evolutionary mismatches between sequences. We compare human to chimp sequences.
# For each enhancer name, pass the human range, human sequence, chimp range and
# chimp sequence of that enhancer to Figure_out_mismatching_sequences()
# (defined elsewhere in the project). Runs on 4 cores.
test = mclapply( as.list(names(enhancers_linked_with_activation)), 
                 function(enh){
                   Figure_out_mismatching_sequences(
                     enhancers_linked_with_activation[which(names(enhancers_linked_with_activation)==enh)],
                     enhancers_linked_with_activation_seq_Hs[which(names(enhancers_linked_with_activation_seq_Hs)==enh)],
                     enhancers_linked_with_activation_pt[which(names(enhancers_linked_with_activation_pt)==enh)],
                     enhancers_linked_with_activation_seq_Pt[which(names(enhancers_linked_with_activation_seq_Pt)==enh)])
                 }, 
                 mc.cores = 4L )
# mclapply() does not stop when a worker fails: the affected elements come back
# as "try-error" objects. Refuse to bind and save a partially failed result.
test_failed = vapply(test, function(res) inherits(res, "try-error"), logical(1))
if (any(test_failed)) {
  stop(sum(test_failed), " enhancer(s) failed in Figure_out_mismatching_sequences()", call. = FALSE)
}
enhancers_linked_with_activation_hs_vs_Pt = do.call("rbind",test)
save(enhancers_linked_with_activation_hs_vs_Pt,
     file=paste0(objects_directory,"enhancers_linked_with_activation_hs_vs_Pt.RData") )
# Turn the combined table into a GRanges object; `type` is carried as `kind`.
enhancers_linked_with_activation_hs_vs_Pt_gr =  GRanges( seqnames=Rle(enhancers_linked_with_activation_hs_vs_Pt$seqnames),
                                                         ranges = IRanges( enhancers_linked_with_activation_hs_vs_Pt$start,
                                                                           end=enhancers_linked_with_activation_hs_vs_Pt$end ),
                                                         kind=enhancers_linked_with_activation_hs_vs_Pt$type)
names(enhancers_linked_with_activation_hs_vs_Pt_gr) = enhancers_linked_with_activation_hs_vs_Pt$names
# The "_ucsc" BED is written before the sequence names are switched to NCBI style.
export.bed( enhancers_linked_with_activation_hs_vs_Pt_gr, con=paste0(outputs_directory,"enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed"))
seqlevelsStyle(enhancers_linked_with_activation_hs_vs_Pt_gr) = "ncbi"
export.gff( enhancers_linked_with_activation_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_linked_with_activation_hs_vs_Pt.gtf"))
export.bed( enhancers_linked_with_activation_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_linked_with_activation_hs_vs_Pt.bed"))
Intersect with bedtools
cd ~/Documents/Tools/bedtools2/
## ------------
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation/hs_specieis_enhancers_A6.bed
Intersect with chimp TFBS lifted over to the Hg38 genome assembly
cd ~/Documents/Tools/bedtools2/
## ------------
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A6.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia7/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/hs_specieis_enhancers_A7.bed
# (The shell command above ends the previous chunk; the R code starts here.)
# Human-vs-chimp comparison for the enhancers not linked with activation:
# each enhancer's human range/sequence and chimp range/sequence are passed to
# Figure_out_mismatching_sequences() (defined elsewhere). Runs on 4 cores.
norole = mclapply( as.list(names(enhancers_not_linked_with_activation)), 
                 function(enh){
                   Figure_out_mismatching_sequences(
                     enhancers_not_linked_with_activation[which(names(enhancers_not_linked_with_activation)==enh)],
                     enhancers_not_linked_with_activation_seq_Hs[which(names(enhancers_not_linked_with_activation_seq_Hs)==enh)],
                     enhancers_not_linked_with_activation_pt[which(names(enhancers_not_linked_with_activation_pt)==enh)],
                     enhancers_not_linked_with_activation_seq_Pt[which(names(enhancers_not_linked_with_activation_seq_Pt)==enh)])
                 }, 
                 mc.cores = 4L )
# mclapply() returns "try-error" objects for failed elements instead of
# stopping; do not bind and save a partially failed result.
norole_failed = vapply(norole, function(res) inherits(res, "try-error"), logical(1))
if (any(norole_failed)) {
  stop(sum(norole_failed), " enhancer(s) failed in Figure_out_mismatching_sequences()", call. = FALSE)
}
enhancers_not_linked_with_activation_hs_vs_Pt = do.call("rbind",norole)
save(enhancers_not_linked_with_activation_hs_vs_Pt,
     file=paste0(objects_directory,"enhancers_not_linked_with_activation_hs_vs_Pt.RData"))
# Turn the combined table into a GRanges object; `type` is carried as `kind`.
enhancers_not_linked_with_activation_hs_vs_Pt_gr =  GRanges( seqnames=Rle(enhancers_not_linked_with_activation_hs_vs_Pt$seqnames),
                                                             ranges = IRanges( enhancers_not_linked_with_activation_hs_vs_Pt$start,
                                                                               end=enhancers_not_linked_with_activation_hs_vs_Pt$end ),
                                                             kind=enhancers_not_linked_with_activation_hs_vs_Pt$type)
names(enhancers_not_linked_with_activation_hs_vs_Pt_gr) = enhancers_not_linked_with_activation_hs_vs_Pt$names
seqlevelsStyle(enhancers_not_linked_with_activation_hs_vs_Pt_gr) = "ncbi"
export.gff( enhancers_not_linked_with_activation_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_not_linked_with_activation_hs_vs_Pt.gtf"))
export.bed( enhancers_not_linked_with_activation_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_not_linked_with_activation_hs_vs_Pt.bed"))
Intersect with bedtools
cd ~/Documents/Tools/bedtools2/
## ------------
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation/hs_specieis_enhancers_A6.bed
Now let’s find the evolutionary mismatches between sequences. We compare human to chimp sequences.
# Human-vs-chimp comparison for the TAD-based enhancers linked with activation:
# each enhancer's human range/sequence and chimp range/sequence are passed to
# Figure_out_mismatching_sequences() (defined elsewhere). Runs on 4 cores.
TADs_enh_linked = mclapply( as.list(names(enhancers_linked_with_activation_TADs)), 
                 function(enh){
                   Figure_out_mismatching_sequences(
                     enhancers_linked_with_activation_TADs[which(names(enhancers_linked_with_activation_TADs)==enh)],
                     enhancers_linked_with_activation_TADs_seq_Hs[which(names(enhancers_linked_with_activation_TADs_seq_Hs)==enh)],
                     enhancers_linked_with_activation_TADs_pt[which(names(enhancers_linked_with_activation_TADs_pt)==enh)],
                     enhancers_linked_with_activation_TADs_seq_Pt[which(names(enhancers_linked_with_activation_TADs_seq_Pt)==enh)])
                 }, 
                 mc.cores = 4L )
# mclapply() returns "try-error" objects for failed elements instead of
# stopping; do not bind and save a partially failed result.
TADs_enh_linked_failed = vapply(TADs_enh_linked, function(res) inherits(res, "try-error"), logical(1))
if (any(TADs_enh_linked_failed)) {
  stop(sum(TADs_enh_linked_failed), " enhancer(s) failed in Figure_out_mismatching_sequences()", call. = FALSE)
}
enhancers_linked_with_activation_TADs_hs_vs_Pt = do.call("rbind",TADs_enh_linked)
# NOTE(review): the file name starts with "eenhancers" (double "e"), unlike the
# sibling .RData files. Left unchanged because later code may load it under
# this exact name - confirm before renaming.
save(enhancers_linked_with_activation_TADs_hs_vs_Pt,
     file=paste0(objects_directory,"eenhancers_linked_with_activation_TADs_hs_vs_Pt.RData") )
# Turn the combined table into a GRanges object; `type` is carried as `kind`.
enhancers_linked_with_activation_TADs_hs_vs_Pt_gr =  GRanges( seqnames=Rle(enhancers_linked_with_activation_TADs_hs_vs_Pt$seqnames),
                                                              ranges = IRanges( enhancers_linked_with_activation_TADs_hs_vs_Pt$start,
                                                                                end=enhancers_linked_with_activation_TADs_hs_vs_Pt$end ),
                                                              kind=enhancers_linked_with_activation_TADs_hs_vs_Pt$type)
names(enhancers_linked_with_activation_TADs_hs_vs_Pt_gr) = enhancers_linked_with_activation_TADs_hs_vs_Pt$names
# The "_ucsc" BED is written before the sequence names are switched to NCBI style.
export.bed( enhancers_linked_with_activation_TADs_hs_vs_Pt_gr, con=paste0(outputs_directory,"enhancers_linked_with_activation_TADs_hs_vs_Pt_ucsc.bed"))
seqlevelsStyle(enhancers_linked_with_activation_TADs_hs_vs_Pt_gr) = "ncbi"
export.gff( enhancers_linked_with_activation_TADs_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_linked_with_activation_TADs_hs_vs_Pt.gtf"))
export.bed( enhancers_linked_with_activation_TADs_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_linked_with_activation_TADs_hs_vs_Pt.bed"))
Intersect with bedtools
cd ~/Documents/Tools/bedtools2/
## ------------
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs/hs_specieis_enhancers_A6.bed
# (The shell command above ends the previous chunk; the R code starts here.)
# Human-vs-chimp comparison for the TAD-based enhancers not linked with
# activation: each enhancer's human range/sequence and chimp range/sequence are
# passed to Figure_out_mismatching_sequences() (defined elsewhere). 4 cores.
TADs_enh__not_linked = mclapply( as.list(names(enhancers_not_linked_with_activation_TADs)), 
                 function(enh){
                   Figure_out_mismatching_sequences(
                     enhancers_not_linked_with_activation_TADs[which(names(enhancers_not_linked_with_activation_TADs)==enh)],
                     enhancers_not_linked_with_activation_TADs_seq_Hs[which(names(enhancers_not_linked_with_activation_TADs_seq_Hs)==enh)],
                     enhancers_not_linked_with_activation_TADs_pt[which(names(enhancers_not_linked_with_activation_TADs_pt)==enh)],
                     enhancers_not_linked_with_activation_TADs_seq_Pt[which(names(enhancers_not_linked_with_activation_TADs_seq_Pt)==enh)])
                 }, 
                 mc.cores = 4L )
# mclapply() returns "try-error" objects for failed elements instead of
# stopping; do not bind and save a partially failed result.
TADs_enh__not_linked_failed = vapply(TADs_enh__not_linked, function(res) inherits(res, "try-error"), logical(1))
if (any(TADs_enh__not_linked_failed)) {
  stop(sum(TADs_enh__not_linked_failed), " enhancer(s) failed in Figure_out_mismatching_sequences()", call. = FALSE)
}
enhancers_not_linked_with_activation_TADs_hs_vs_Pt = do.call("rbind",TADs_enh__not_linked)
save(enhancers_not_linked_with_activation_TADs_hs_vs_Pt,
     file=paste0(objects_directory,"enhancers_not_linked_with_activation_TADs_hs_vs_Pt.RData"))
# Turn the combined table into a GRanges object; `type` is carried as `kind`.
enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr =  GRanges( seqnames=Rle(enhancers_not_linked_with_activation_TADs_hs_vs_Pt$seqnames),
                                                                  ranges = IRanges( enhancers_not_linked_with_activation_TADs_hs_vs_Pt$start,
                                                                                    end=enhancers_not_linked_with_activation_TADs_hs_vs_Pt$end ),
                                                                  kind=enhancers_not_linked_with_activation_TADs_hs_vs_Pt$type)
names(enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr) = enhancers_not_linked_with_activation_TADs_hs_vs_Pt$names
seqlevelsStyle(enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr) = "ncbi"
export.gff( enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_not_linked_with_activation_TADs_hs_vs_Pt.gtf"))
export.bed( enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr, paste0(outputs_directory,"enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed"))
Intersect with bedtools.
# Run from the bedtools2 directory so that ./bin/intersectBed resolves.
cd ~/Documents/Tools/bedtools2/
## ------------
# Each command intersects the same -a BED file (the not-linked TAD enhancer
# export) with all BED files of one human TFBS sub-directory (A, A-kopia,
# A-kopia2, ...) and redirects the result to the matching
# hs_specieis_enhancers_A*.bed file. All paths are absolute and machine-specific.
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs/hs_specieis_enhancers_A6.bedcons = mclapply( as.list(names(conserved_enhancers)), 
                 function(enh){ Figure_out_mismatching_sequences( conserved_enhancers[which(names(conserved_enhancers)==enh)],
                                                                  conserved_enhancers_seq_Hs[which(names(conserved_enhancers_seq_Hs)==enh)],
                                                                  conserved_enhancers_pt[which(names(conserved_enhancers_pt)==enh)],
                                                                  conserved_enhancers_seq_Pt[which(names(conserved_enhancers_seq_Pt)==enh)]) }, 
                 mc.cores = 4L )
# Combine the per-enhancer results held in the list `cons` into one table.
conserved_enhancers_hs_vs_Pt = do.call("rbind",cons)
# Store the combined table under objects_directory.
save(conserved_enhancers_hs_vs_Pt,
     file=paste0(objects_directory,"conserved_enhancers_hs_vs_Pt.RData"))
# Build a GRanges from the table's seqnames/start/end columns; the table's
# `type` column is carried along as the metadata column `kind`.
conserved_enhancers_hs_vs_Pt_gr =  GRanges( seqnames=Rle(conserved_enhancers_hs_vs_Pt$seqnames),
                                            ranges = IRanges( conserved_enhancers_hs_vs_Pt$start,
                                                              end=conserved_enhancers_hs_vs_Pt$end ),
                                            kind=conserved_enhancers_hs_vs_Pt$type)
# Range names are taken from the table's `names` column.
names(conserved_enhancers_hs_vs_Pt_gr) = conserved_enhancers_hs_vs_Pt$names
# First export happens BEFORE the seqlevels style is changed; this "_ucsc" BED
# is the file the chimp-TFBS bedtools commands read.
export.bed( conserved_enhancers_hs_vs_Pt_gr, con=paste0(outputs_directory,"conserved_enhancers_hs_vs_Pt_ucsc.bed"))
# Switch the sequence names to NCBI style for the remaining exports.
seqlevelsStyle(conserved_enhancers_hs_vs_Pt_gr) = "ncbi"
# Written with export.gff; note the target file carries a .gtf extension.
export.gff( conserved_enhancers_hs_vs_Pt_gr, con=paste0(outputs_directory,"conserved_enhancers_hs_vs_Pt.gtf"))
export.bed( conserved_enhancers_hs_vs_Pt_gr, con=paste0(outputs_directory,"conserved_enhancers_hs_vs_Pt.bed"))
Intersect with bedtools.
# Run from the bedtools2 directory so that ./bin/intersectBed resolves.
cd ~/Documents/Tools/bedtools2/
## ------------
# Each command intersects the conserved-enhancer BED (NCBI-style export) with
# all BED files of one human TFBS sub-directory (A, A-kopia, A-kopia2, ...) and
# redirects the result into TFBS_analysis/conserved_enhancers_Hs/.
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed -b /Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs/hs_specieis_enhancers_A6.bed
Intersect with chimp TFBS lifted over to the hg38 genome assembly.
# Run from the bedtools2 directory so that ./bin/intersectBed resolves.
# NOTE(review): this series uses ~/Tools/bedtools2/ while the other bedtools
# series in this document use ~/Documents/Tools/bedtools2/ - confirm the path.
cd ~/Tools/bedtools2/
## ------------
# Each command intersects the "_ucsc" conserved-enhancer BED with all BED files
# of one chimp-motif (lifted over to human) sub-directory and redirects the
# result into TFBS_analysis/conserved_enhancers_Pt/.
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A6.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia7/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt/hs_specieis_enhancers_A7.bedlostE = mclapply( as.list(names(genuine_lost_enhancers_that_do_something)), 
                 function(enh){ Figure_out_mismatching_sequences( genuine_lost_enhancers_that_do_something[which(names(genuine_lost_enhancers_that_do_something)==enh)],
                                                                  lost_enhancers_linked_seq_Hs[which(names(lost_enhancers_linked_seq_Hs)==enh)],
                                                                  lost_linked_pt[which(names(lost_linked_pt)==enh)],
                                                                  lost_enhancers_linked_seq_Pt[which(names(lost_enhancers_linked_seq_Pt)==enh)]) }, 
                 mc.cores = 4L )
# Combine the per-enhancer results held in the list `lostE` into one table.
lost_enhancers_linked_hs_vs_Pt = do.call("rbind",lostE)
# Store the combined table under objects_directory.
save(lost_enhancers_linked_hs_vs_Pt,
     file=paste0(objects_directory,"lost_enhancers_linked_hs_vs_Pt.RData"))
# Build a GRanges from the table's seqnames/start/end columns; the table's
# `type` column is carried along as the metadata column `kind`.
lost_enhancers_linked_hs_vs_Pt_gr =  GRanges( seqnames=Rle(lost_enhancers_linked_hs_vs_Pt$seqnames),
                                            ranges = IRanges( lost_enhancers_linked_hs_vs_Pt$start,
                                                              end=lost_enhancers_linked_hs_vs_Pt$end ),
                                            kind=lost_enhancers_linked_hs_vs_Pt$type)
# Range names are taken from the table's `names` column.
names(lost_enhancers_linked_hs_vs_Pt_gr) = lost_enhancers_linked_hs_vs_Pt$names
# First export happens BEFORE the seqlevels style is changed. The file name is
# spelled with upper-case "UCSC" because the bedtools commands that consume it
# read "lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed" (as does the sibling
# not-linked export); a lower-case "_ucsc" name is not found on a
# case-sensitive filesystem.
export.bed( lost_enhancers_linked_hs_vs_Pt_gr, con=paste0(outputs_directory,"lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed"))
# Switch the sequence names to NCBI style for the remaining exports.
seqlevelsStyle(lost_enhancers_linked_hs_vs_Pt_gr) = "ncbi"
# Written with export.gff; note the target file carries a .gtf extension.
export.gff( lost_enhancers_linked_hs_vs_Pt_gr, con=paste0(outputs_directory,"lost_enhancers_linked_hs_vs_Pt_gr.gtf"))
export.bed( lost_enhancers_linked_hs_vs_Pt_gr, con=paste0(outputs_directory,"lost_enhancers_linked_hs_vs_Pt_gr.bed"))
Intersect the positions of mismatches with TFBS inferred for chimp and lifted over to human.
# Run from the bedtools2 directory so that ./bin/intersectBed resolves.
cd ~/Documents/Tools/bedtools2/
## ------------
# Each command intersects the "_UCSC" BED of lost linked enhancers with all BED
# files of one chimp-motif (lifted over to human) sub-directory and redirects
# the result into TFBS_analysis/lost_enhancers_linked_pt/.
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A6.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia7/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt/hs_specieis_enhancers_A7.bedlostI = mclapply( as.list(names(genuine_lost_enhancers_that_do_nothing)), 
                 function(enh){ Figure_out_mismatching_sequences( genuine_lost_enhancers_that_do_nothing[which(names(genuine_lost_enhancers_that_do_nothing)==enh)],
                                                                  lost_enhancers_not_linked_seq_Hs[which(names(lost_enhancers_not_linked_seq_Hs)==enh)],
                                                                  lost_not_linked_pt[which(names(lost_not_linked_pt)==enh)],
                                                                  lost_enhancers_not_linked_seq_Pt[which(names(lost_enhancers_not_linked_seq_Pt)==enh)]) }, 
                 mc.cores = 4L )
# Combine the per-enhancer results held in the list `lostI` into one table.
lost_enhancers_not_linked_hs_vs_Pt = do.call("rbind",lostI)
# NOTE(review): this .RData is written to outputs_directory, whereas the
# sibling tables (e.g. lost_enhancers_linked_hs_vs_Pt) are saved under
# objects_directory - confirm which location later load() calls expect.
save(lost_enhancers_not_linked_hs_vs_Pt,
     file=paste0(outputs_directory,"lost_enhancers_not_linked_hs_vs_Pt.RData"))
# Build a GRanges from the table's seqnames/start/end columns; the table's
# `type` column is carried along as the metadata column `kind`.
lost_enhancers_not_linked_hs_vs_Pt_gr =  GRanges( seqnames=Rle(lost_enhancers_not_linked_hs_vs_Pt$seqnames),
                                            ranges = IRanges( lost_enhancers_not_linked_hs_vs_Pt$start,
                                                              end=lost_enhancers_not_linked_hs_vs_Pt$end ),
                                            kind=lost_enhancers_not_linked_hs_vs_Pt$type)
# Range names are taken from the table's `names` column.
names(lost_enhancers_not_linked_hs_vs_Pt_gr) = lost_enhancers_not_linked_hs_vs_Pt$names
# First export happens BEFORE the seqlevels style is changed; this "_UCSC" BED
# is the file the bedtools commands read.
export.bed( lost_enhancers_not_linked_hs_vs_Pt_gr, con=paste0(outputs_directory,"lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed"))
# Switch the sequence names to NCBI style for the remaining exports.
seqlevelsStyle(lost_enhancers_not_linked_hs_vs_Pt_gr) = "ncbi"
# Written with export.gff; note the target file carries a .gtf extension.
export.gff( lost_enhancers_not_linked_hs_vs_Pt_gr, con=paste0(outputs_directory,"lost_enhancers_not_linked_hs_vs_Pt_gr.gtf"))
export.bed( lost_enhancers_not_linked_hs_vs_Pt_gr, con=paste0(outputs_directory,"lost_enhancers_not_linked_hs_vs_Pt_gr.bed"))
Again, intersect the positions of mismatches with TFBS inferred for chimp and lifted over to human.
# Run from the bedtools2 directory so that ./bin/intersectBed resolves.
cd ~/Documents/Tools/bedtools2/
## ------------
# Each command intersects the "_UCSC" BED of lost not-linked enhancers with all
# BED files of one chimp-motif (lifted over to human) sub-directory and
# redirects the result into TFBS_analysis/lost_enhancers_not_linked_pt/.
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A1.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia2/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A2.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia3/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A3.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia4/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A4.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia5/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A5.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia6/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A6.bed
./bin/intersectBed -a /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed -b /Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds/A-kopia7/*.bed -C -filenames > /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt/hs_specieis_enhancers_A7.bedconserved_enhancers_hs_vs_Pt = import.bed(paste0(outputs_directory,"conserved_enhancers_hs_vs_Pt_ucsc.bed"))
# Load the conserved enhancer set from its BED export.
conserved_enhancers = import.bed(paste0(outputs_directory,"conserved_enhancers.bed"))
# Use the BED `name` column as range names.
names(conserved_enhancers) = conserved_enhancers$name
# Sort ascending by width, then keep only the first range per name, i.e. the
# narrowest range is retained whenever a name occurs more than once.
conserved_enhancers = conserved_enhancers[order(width(conserved_enhancers))]
conserved_enhancers = conserved_enhancers[which(!duplicated(names(conserved_enhancers))) ]
# Read the bedtools intersect results of every enhancer class from its
# TFBS_analysis sub-directory, restricted to chr1-22, X and Y.
# readBedtools_res is used for the intersections against human TFBS,
# readBedtools_UCSC for those against chimp TFBS lifted over to human.
# NOTE(review): both helpers are defined elsewhere; the trailing positional
# arguments 4 and 7 are presumably column indices - confirm against the helper
# definitions.
linked_with_activation_TFBSchange = readBedtools_res( filePath=paste0(outputs_directory,"TFBS_analysis/enhancers_linked_with_activation/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
linked_with_activation_TFBSchange_chimp = readBedtools_UCSC( filePath=paste0(outputs_directory,"TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
not_linked_with_activation_TFBSchange = readBedtools_res( filePath=paste0(outputs_directory,"TFBS_analysis/enhancers_not_linked_with_activation/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
conserved_TFBSchange_Hs = readBedtools_res( filePath=paste0(outputs_directory,"TFBS_analysis/conserved_enhancers_Hs/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
conserved_TFBSchange_Pt = readBedtools_UCSC( filePath=paste0(outputs_directory,"TFBS_analysis/conserved_enhancers_Pt/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
lost_linked = readBedtools_UCSC( filePath=paste0(outputs_directory,"TFBS_analysis/lost_enhancers_linked_pt/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
lost_not_linked = readBedtools_UCSC( filePath=paste0(outputs_directory,"TFBS_analysis/lost_enhancers_not_linked_pt/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)
#### --------------------------------------------------------------
# Post-process every raw bedtools result with processTFBSresult, using
# TFsEnsemblG as the TF annotation. Results read with readBedtools_res use
# nameColumn "names"; results read with readBedtools_UCSC use "names3".
# The first three assignments overwrite the raw objects in place; the
# remaining ones store the processed result under a new name.
# NOTE(review): processTFBSresult is defined elsewhere; what nameColumn selects
# should be confirmed against its definition.
linked_with_activation_TFBSchange = processTFBSresult(linked_with_activation_TFBSchange,
                                                      tfanno=TFsEnsemblG,
                                                      nameColumn="names")
linked_with_activation_TFBSchange_chimp = processTFBSresult(linked_with_activation_TFBSchange_chimp,
                                                            tfanno=TFsEnsemblG,
                                                            nameColumn = "names3")
not_linked_with_activation_TFBSchange = processTFBSresult(not_linked_with_activation_TFBSchange,
                                                          tfanno=TFsEnsemblG,
                                                          nameColumn="names")
conserved_TFBSchange = processTFBSresult(conserved_TFBSchange_Hs,
                                         tfanno=TFsEnsemblG,
                                         nameColumn = "names")
conserved_TFBSchange_chimp = processTFBSresult(conserved_TFBSchange_Pt,
                                               tfanno=TFsEnsemblG,
                                               nameColumn = "names3")
lost_linked_TFBSchange = processTFBSresult(lost_linked,
                                           tfanno=TFsEnsemblG,
                                           nameColumn = "names3")
lost_not_linked_TFBSchange = processTFBSresult(lost_not_linked,
                                               tfanno=TFsEnsemblG,
                                               nameColumn = "names3")
# Store all seven processed TFBS-change objects together in one .RData file
# under objects_directory.
save( linked_with_activation_TFBSchange, not_linked_with_activation_TFBSchange,linked_with_activation_TFBSchange_chimp,
      conserved_TFBSchange,conserved_TFBSchange_chimp,
      lost_linked_TFBSchange,lost_not_linked_TFBSchange,
      file=paste0(objects_directory,"evolutionary_changes_in_TFBS.RData" ) )
# Additionally store the human "linked with activation" result on its own.
save(linked_with_activation_TFBSchange,file=paste0(objects_directory,"linked_with_activation_TFBSchange.RData"))
save(linked_with_activation_TFBSchange_chimp,file=paste0(objects_directory,"linked_with_activation_TFBSchange_chimp.Rdata"))load(paste0(objects_directory,"evolutionary_changes_in_TFBS.RData" ))
# Re-import the "_ucsc" BED of conserved-enhancer mismatch positions.
conserved_enhancers_hs_vs_Pt = import.bed(paste0(outputs_directory,"conserved_enhancers_hs_vs_Pt_ucsc.bed"))
# Re-import the conserved enhancer set and name the ranges by the BED `name`
# column.
conserved_enhancers = import.bed(paste0(outputs_directory,"conserved_enhancers.bed"))
names(conserved_enhancers) = conserved_enhancers$name
# Sort ascending by width so that the de-duplication on the following line
# keeps the narrowest range per name.
conserved_enhancers = conserved_enhancers[order(width(conserved_enhancers))]
conserved_enhancers = conserved_enhancers[which(!duplicated(names(conserved_enhancers))) ]
Overall conservation of TFBS - take conserved enhancers
# Number of TFBS changes per TF in conserved enhancers, for human and chimp.
conserved_TFBSchange_table = table( conserved_TFBSchange$TF )
conserved_TFBSchange_chimp_table = table(conserved_TFBSchange_chimp$TF)
# Reorder the human counts to follow the order of the chimp TF names; a TF
# present in the chimp table but absent from the human table yields an NA entry.
conserved_TFBSchange_human_table = conserved_TFBSchange_table[ match(names(conserved_TFBSchange_chimp_table),names(conserved_TFBSchange_table))]
all(names(conserved_TFBSchange_human_table)==names(conserved_TFBSchange_chimp_table))## [1] TRUElinked_TFBSchange_table = table( linked_with_activation_TFBSchange$TF )
# Number of TFBS changes per TF in chimp for enhancers linked with activation.
linked_with_activation_TFBSchange_chimp_table = table(linked_with_activation_TFBSchange_chimp$TF)
# Reorder the human counts to follow the order of the chimp TF names.
linked_with_activation_TFBSchange_human_table = linked_TFBSchange_table[ match(names(linked_with_activation_TFBSchange_chimp_table),names(linked_TFBSchange_table))]
# Sanity check that human and chimp tables are aligned TF by TF. The human
# names must be compared against the CHIMP names (as done for the conserved
# tables); comparing the human names with themselves is always TRUE.
table(names(linked_with_activation_TFBSchange_human_table)==names(linked_with_activation_TFBSchange_chimp_table))
# Output recorded from the original run:
## 
## TRUE 
##  674
# Per-TF log2(human/chimp) ratio of TFBS changes: conserved enhancers (green)
# versus enhancers linked with activation (turquoise). Only one ylab is
# passed: with both an empty and a descriptive ylab, R kept the empty one and
# warned that the descriptive label was disregarded.
boxplot( log2(conserved_TFBSchange_human_table/conserved_TFBSchange_chimp_table),
         log2(linked_with_activation_TFBSchange_human_table/linked_with_activation_TFBSchange_chimp_table),
         col="white",border=c("green4","turquoise4"),
         notch=TRUE, outline=FALSE, ylim=c(-1.5,1.5),
         ylab="Change in TFBS [log2(human/Chimp)]")
# Redraw both axes with thicker lines.
axis(1,at=c(1,2),lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)t.test( log2(conserved_TFBSchange_human_table/conserved_TFBSchange_chimp_table),
        log2(linked_with_activation_TFBSchange_human_table/linked_with_activation_TFBSchange_chimp_table) )## 
##  Welch Two Sample t-test
## 
## data:  log2(conserved_TFBSchange_human_table/conserved_TFBSchange_chimp_table) and log2(linked_with_activation_TFBSchange_human_table/linked_with_activation_TFBSchange_chimp_table)
## t = -8.9693, df = 1006.3, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2721183 -0.1744232
## sample estimates:
##  mean of x  mean of y 
## 0.06966084 0.29293161
Conserved and species-specific peaks frequently feature TFBS changes
# Count, for every peak, how many distinct start positions of TFBS changes it
# contains.
#   tfbsobj: object for which start() returns one position per change and
#            whose `peak` element assigns each change to a peak.
# Returns a named vector (names = peak ids) with the number of unique start
# positions per peak.
howManyChangesPerPeak = function(tfbsobj){
  starts_by_peak = split(start(tfbsobj), tfbsobj$peak)
  distinct_per_peak = lapply(starts_by_peak, function(pos) length(unique(pos)))
  unlist(distinct_per_peak)
}
# Distribution of the number of distinct TFBS change positions per element for
# linked, not linked and conserved enhancers (outliers are not drawn).
boxplot( howManyChangesPerPeak( linked_with_activation_TFBSchange ),
         howManyChangesPerPeak( not_linked_with_activation_TFBSchange ),
         howManyChangesPerPeak( conserved_TFBSchange ),
         col="white",outline=FALSE,
         names=c('linked','not linked','conserved'),las=2,
         border=c('turquoise4','gray80','green4'),
         ylab="changes in TFBS per element",lwd=2 )
# Redraw both axes with thicker lines; the x axis repeats the group labels
# already supplied through `names=` above.
axis(2,lwd=2,las=2)
axis(1,at=c(1,2,3),lwd=2,c('linked','not linked','conserved'),las=2)
box(col="black",lwd=2)t.test(  howManyChangesPerPeak( linked_with_activation_TFBSchange ),
         howManyChangesPerPeak( not_linked_with_activation_TFBSchange ) )## 
##  Welch Two Sample t-test
## 
## data:  howManyChangesPerPeak(linked_with_activation_TFBSchange) and howManyChangesPerPeak(not_linked_with_activation_TFBSchange)
## t = 7.0175, df = 1931.9, p-value = 0.000000000003114
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.7881839 1.3996136
## sample estimates:
## mean of x mean of y 
##  6.746499  5.652600t.test(  howManyChangesPerPeak( conserved_TFBSchange ),
         howManyChangesPerPeak( not_linked_with_activation_TFBSchange ) )## 
##  Welch Two Sample t-test
## 
## data:  howManyChangesPerPeak(conserved_TFBSchange) and howManyChangesPerPeak(not_linked_with_activation_TFBSchange)
## t = -6.2941, df = 6123.5, p-value = 0.0000000003306
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6712331 -0.3524108
## sample estimates:
## mean of x mean of y 
##  5.140778  5.652600A=table(cut(width(conserved_TFBSchange),c(0,1,1000)))/length(conserved_TFBSchange)
# Fraction of TFBS changes falling in the width classes (0,1] and (1,1000],
# for linked (B) and not-linked (C) enhancers; `A`, defined just before this
# block, holds the same fractions for conserved enhancers.
# The denominator is the total number of changes, so widths outside (0,1000]
# are not tabulated but still count towards it.
B=table(cut(width(linked_with_activation_TFBSchange),c(0,1,1000)))/length(linked_with_activation_TFBSchange)
C=table(cut(width(not_linked_with_activation_TFBSchange),c(0,1,1000)))/length(not_linked_with_activation_TFBSchange)
# One row per enhancer class, one column per width class.
ABC=rbind(A,B,C)
# Grouped bars in percent; the two width classes are labelled "MM" and
# "Changes>1bp".
barplot( 100*ABC, beside=TRUE,col=c("green4","turquoise4","gray80"),ylim=c(0,100),
         names=c("MM","Changes>1bp"),ylab="%" )
axis(2,lwd=2)
legend(x=5,y=90,c("Conserved","Linked","Not linked"),cex=1,
       pch=15,col=c("green4","turquoise4","gray80"),bty="n")
Any particular TFs? Preparations
## ------------------------------------------------------------
# matrix for each peak
# Build one matrix per enhancer class with makeMatrixTFBS4peaks, passing the
# processed TFBS changes (tfmut), the unique TF names from TFsEnsemblG$Fixed
# (theTFs) and the full peak set of that class (allPeaks).
# NOTE(review): makeMatrixTFBS4peaks is defined elsewhere; downstream code
# iterates over the matrix columns per TF and counts entries > 0, so columns
# are presumably TFs and rows peaks - confirm against the helper definition.
TFmat_linked_with_activation = makeMatrixTFBS4peaks( tfmut=linked_with_activation_TFBSchange, 
                                                     theTFs=unique(TFsEnsemblG$Fixed), 
                                                     allPeaks=enhancers_linked_with_activation )
TFmat_not_linked_with_activation = makeMatrixTFBS4peaks( tfmut=not_linked_with_activation_TFBSchange, 
                                                         theTFs=unique(TFsEnsemblG$Fixed), 
                                                         allPeaks=enhancers_not_linked_with_activation )
TFmat_linked_with_activation_chimp = makeMatrixTFBS4peaks( tfmut=linked_with_activation_TFBSchange_chimp, 
                                                           theTFs=unique(TFsEnsemblG$Fixed), 
                                                           allPeaks=enhancers_linked_with_activation )
TFmat_conserved = makeMatrixTFBS4peaks( tfmut=conserved_TFBSchange, 
                                        theTFs=unique(TFsEnsemblG$Fixed), 
                                        allPeaks=conserved_enhancers )
TFmat_conserved_chimp = makeMatrixTFBS4peaks( tfmut=conserved_TFBSchange_chimp, 
                                              theTFs=unique(TFsEnsemblG$Fixed), 
                                              allPeaks=conserved_enhancers )
TFmat_lost_linked = makeMatrixTFBS4peaks( tfmut=lost_linked_TFBSchange, 
                                          theTFs=unique(TFsEnsemblG$Fixed), 
                                          allPeaks=genuine_lost_enhancers_that_do_something )
TFmat_lost_not_linked = makeMatrixTFBS4peaks( tfmut=lost_not_linked_TFBSchange, 
                                              theTFs=unique(TFsEnsemblG$Fixed), 
                                              allPeaks=genuine_lost_enhancers_that_do_nothing )
save(TFmat_linked_with_activation,TFmat_not_linked_with_activation,TFmat_linked_with_activation_chimp,
     TFmat_conserved, TFmat_conserved_chimp,TFmat_lost_linked,TFmat_lost_not_linked,
     file=paste0(objects_directory,"TFmatrices_linked_not_linked.RData"))
Assess the significance of the observed differences in frequency
# Reload the TF matrices saved earlier.
load(paste0(objects_directory,"TFmatrices_linked_not_linked.RData"))
# For every column of the "linked" matrix count the entries > 0 (Motif) and
# the entries == 0 (noMotif); the per-column results are row-bound and looked
# up by TF name (row name) in the loop below.
RES = do.call("rbind",
              apply(TFmat_linked_with_activation,2,function(x){data.frame( Motif=sum(x>0),
                                                                 noMotif=sum(x==0) ) } ) )
# Same counts for the "not linked" matrix.
SER = do.call("rbind",
              apply(TFmat_not_linked_with_activation,2,function(x){data.frame( Motif=sum(x>0),
                                                                               noMotif=sum(x==0) ) } ) )
# One Fisher exact test per TF on the 2x2 table
# (linked / notLinked) x (Motif / noMotif).
TFs_FT = data.frame()
for( i in colnames(TFmat_linked_with_activation) ){
  m=rbind(linked=RES[rownames(RES)==i,],
          notLinked=SER[rownames(SER)==i,])
  tp = fisher.test(m)
  # Keep p-value, odds ratio, the raw Motif counts and the fraction of
  # elements carrying the motif in each class.
  tp = data.frame(p_val=tp$p.value,
                  odds=tp$estimate,
                  number_in_linked = RES[rownames(RES)==i,1],
                  number_in_not_linked = SER[rownames(SER)==i,1],
                  fraction_in_linked = RES[rownames(RES)==i,1]/rowSums(RES[rownames(RES)==i,]),
                  fraction_in_not_linked = SER[rownames(SER)==i,1]/rowSums(SER[rownames(SER)==i,]),
                  tf = i)
  # Each new row is PREPENDED, so TFs_FT ends up in reverse column order.
  TFs_FT=rbind(tp,TFs_FT) }
# Multiple-testing correction; no method is given, so p.adjust's default is used.
TFs_FT$p_adjust = p.adjust(TFs_FT$p_val)
# Bin -log10(adjusted p) into intervals.
TFs_FT$p_adjust_bin = cut(-log10(TFs_FT$p_adjust), c(-1,0,1, seq(2,10,length.out=252),45) )
# Square plotting region for the scatter plot.
par(pty="s")
# Fraction of elements with the motif: linked (x) versus not linked (y);
# TFs with adjusted p < 0.01 are drawn in blue, all others in wheat.
plot( x=TFs_FT$fraction_in_linked, 
      y=TFs_FT$fraction_in_not_linked,
      pch=19, cex=0.5, 
      xlab="Linked with activation",
      ylab="Not linked with activation",
      xlim=c(0,0.3), ylim=c(0,0.3),
      col=ifelse(TFs_FT$p_adjust<0.01,"blue3","wheat2"))
# Identity line: points below it are more frequent in linked enhancers.
abline(a=0,b=1,col='black')
axis(1,lwd=2)
axis(2,lwd=2)
box(col='black',lwd=2)
text(x=TFs_FT$fraction_in_linked[TFs_FT$p_adjust<0.01 & TFs_FT$fraction_in_linked>0.1]+0.005,
     y=TFs_FT$fraction_in_not_linked[TFs_FT$p_adjust<0.01 & TFs_FT$fraction_in_linked>0.1]+0.005,
     TFs_FT$tf[TFs_FT$p_adjust<0.01 & TFs_FT$fraction_in_linked>0.1],
     cex=1)
Odds of seeing that many stripe TFs
# TFs whose motif frequency differs significantly between linked and not
# linked enhancers (adjusted p < 0.01).
TFs_FT_filt = TFs_FT[TFs_FT$p_adjust<0.01 ,]
# Fraction of those TFs that are listed as stripe factors.
sum( TFs_FT_filt$tf %in% human_stripe_factors$V1 )/nrow(TFs_FT_filt)
## [1] 0.8651685
# 2x2 contingency table (affected / non_affected) x (stripe / non_stripe).
# The non_affected row is restricted to TFs with adjusted p >= 0.01 so that the
# two rows are disjoint; counting ALL TFs in that row would include the
# affected TFs a second time and give the Fisher test overlapping groups.
m = rbind( affected = c(stripe=sum( TFs_FT[TFs_FT$p_adjust<0.01,]$tf %in% human_stripe_factors$V1 ),
                        non_stripe = sum( ! TFs_FT[ TFs_FT$p_adjust<0.01,]$tf %in% human_stripe_factors$V1 )),
           non_affected = c(stripe=sum( TFs_FT[TFs_FT$p_adjust>=0.01,]$tf %in% human_stripe_factors$V1 ),
                            non_stripe = sum( ! TFs_FT[ TFs_FT$p_adjust>=0.01,]$tf %in% human_stripe_factors$V1)) )
# NOTE: the table and Fisher test output recorded below were produced with the
# earlier definition, in which the non_affected row counted all TFs; re-run to
# refresh them (the ylim of the odds-ratio barplot may need widening).
m
##              stripe non_stripe
## affected         77         12
## non_affected    199        476fisher.test(m)## 
##  Fisher's Exact Test for Count Data
## 
## data:  m
## p-value < 0.00000000000000022
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   8.046439 31.566055
## sample estimates:
## odds ratio 
##   15.28785par(pty="m",mar=c(5,3,3,1))
barplot( log2(fisher.test(m)$estimate), col="blue3", ylim=c(0,4))
axis(2,lwd=2)keyTFs = TFs_FT_filt$tf
keyTFs = keyTFs[keyTFs %in% human_stripe_factors$V1]l=(1+colSums(TFmat_lost_linked>0))/nrow(TFmat_lost_linked)
# Add-one-smoothed fraction of not-linked sequences with a lost motif, per TF.
L <- (1 + colSums(TFmat_lost_not_linked > 0)) / nrow(TFmat_lost_not_linked)

# Per-TF counts of sequences with / without a lost motif. Every count gets a
# +1 pseudo-count, so no cell of the 2x2 tables tested below is zero.
LOS <- do.call("rbind",
               apply(TFmat_lost_linked, 2, function(x) {
                 data.frame(Motif = 1 + sum(x > 0), noMotif = 1 + sum(x == 0))
               }))
SOL <- do.call("rbind",
               apply(TFmat_lost_not_linked, 2, function(x) {
                 data.frame(Motif = 1 + sum(x > 0), noMotif = 1 + sum(x == 0))
               }))

# One Fisher's exact test per TF: lost motifs in linked vs not-linked sequences.
# Iterate over the TFs of the matrix LOS was built from. The loop used to run
# over colnames(TFmat_linked_with_activation), a different matrix whose columns
# are not guaranteed to exist in LOS/SOL.
lost_TFs_FT_chimp <- data.frame()
for (i in colnames(TFmat_lost_linked)) {
  m <- rbind(linked = LOS[rownames(LOS) == i, ],
             notLinked = SOL[rownames(SOL) == i, ])
  tp <- fisher.test(m)
  tp <- data.frame(p_val = tp$p.value,
                   odds = tp$estimate,
                   number_in_linked = LOS[rownames(LOS) == i, 1],
                   number_in_not_linked = SOL[rownames(SOL) == i, 1],
                   fraction_in_linked = LOS[rownames(LOS) == i, 1] / rowSums(LOS[rownames(LOS) == i, ]),
                   fraction_in_not_linked = SOL[rownames(SOL) == i, 1] / rowSums(SOL[rownames(SOL) == i, ]),
                   tf = i)
  # Rows are prepended, so the table ends up in reverse column order.
  lost_TFs_FT_chimp <- rbind(tp, lost_TFs_FT_chimp)
}

# Odds ratios for three TF groups: the enriched stripe TFs (keyTFs), all stripe
# TFs, and all non-stripe TFs. The horizontal line marks an odds ratio of 1.
par(mfrow = c(1, 1), mar = c(10, 4, 4, 4), pty = "m")
boxplot(lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% keyTFs],
        lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1],
        lost_TFs_FT_chimp$odds[!lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1],
        col = "white", border = c("blue3", "steelblue", "coral3"),
        ylim = c(0, 3), ylab = "Odds",
        outline = FALSE, axes = FALSE)
abline(h = 1)
# Label the first box with the actual size of keyTFs instead of a fixed "77".
axis(1, lwd = 2, at = c(1, 2, 3),
     labels = c(paste(length(keyTFs), "stripe TFs"), "All stripe TFs", "non-stripe TFs"),
     las = 2)
axis(2,lwd=2)t.test(lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% keyTFs],
       lost_TFs_FT_chimp$odds[! lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1])## 
##  Welch Two Sample t-test
## 
## data:  lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% keyTFs] and lost_TFs_FT_chimp$odds[!lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1]
## t = 4.2022, df = 332.24, p-value = 0.00003402
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1255168 0.3464608
## sample estimates:
## mean of x mean of y 
##  1.374035  1.138047t.test(lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1],
       lost_TFs_FT_chimp$odds[! lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1])## 
##  Welch Two Sample t-test
## 
## data:  lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1] and lost_TFs_FT_chimp$odds[!lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1]
## t = 2.4189, df = 346.62, p-value = 0.01608
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03773178 0.36604516
## sample estimates:
## mean of x mean of y 
##  1.339935  1.138047par(mfrow=c(1,1), pty="m")
# Distribution of the number of TF motif changes per sequence, binned as
# 0, 1, 2, 3-4 and >4, for linked vs not-linked sequences.
# drop = FALSE keeps the column selection two-dimensional even when only one
# TF is selected, so rowSums() always works.
tf_count_breaks <- c(-Inf, 0, 1, 2, 4, Inf)
tf_count_labels <- c("0", "1", "2", "3-4", ">4")  # (2,4] holds 3 and 4, not just 4

m = rbind( linked = table(cut(rowSums(TFmat_linked_with_activation[, keyTFs, drop = FALSE]),
                              tf_count_breaks)) / nrow(TFmat_linked_with_activation),
           not_linked = table(cut(rowSums(TFmat_not_linked_with_activation[, keyTFs, drop = FALSE]),
                                  tf_count_breaks)) / nrow(TFmat_not_linked_with_activation) )
par(mfrow = c(2, 1), mar = c(4, 4, 1, 1), pty = "m")
barplot(m, beside = TRUE, col = c("turquoise4", "gray70"), ylim = c(0, 0.5),
        names.arg = tf_count_labels, ylab = "Fraction of sequences with TFs")
axis(2, lwd = 2)

# Same comparison restricted to a hand-picked list of TFs; entries absent from
# the TF matrix are dropped by the %in% filter.
# NOTE(review): this entry used to read "AFT3", which looks like a typo for the
# gene symbol ATF3 and would have been dropped silently -- confirm the intended
# symbol against the motif collection.
astroTFs = c("SOX9", "SOX2", "NFIA", "NFIB", "ATF3", "RUNX2", "NR1F2", "DBX2", "LHX2", "STAT3")
astroTFs = astroTFs[astroTFs %in% colnames(TFmat_linked_with_activation)]
# par(mfrow=c(1,1))
M = rbind( linked = table(cut(rowSums(TFmat_linked_with_activation[, astroTFs, drop = FALSE]),
                              tf_count_breaks)) / nrow(TFmat_linked_with_activation),
           not_linked = table(cut(rowSums(TFmat_not_linked_with_activation[, astroTFs, drop = FALSE]),
                                  tf_count_breaks)) / nrow(TFmat_not_linked_with_activation) )
barplot(M, beside = TRUE, col = c("turquoise4", "gray70"), ylim = c(0, 1),
        names.arg = tf_count_labels, ylab = "Fraction of sequences with TFs")
axis(2,lwd=2)table(cut(rowSums(TFmat_linked_with_activation[,keyTFs]), 
          c(-Inf,0,1,2,4,Inf) ) )## 
## (-Inf,0]    (0,1]    (1,2]    (2,4] (4, Inf] 
##      325      185      102      147      684
1443 - sum(rowSums(TFmat_linked_with_activation[,keyTFs])==0)
## [1] 1118
How many changes affect TFs that are not stripe factors?
not_stripeTFs_changes = TFmat_linked_with_activation[,!colnames(TFmat_linked_with_activation) %in% human_stripe_factors$V1]
table(rowSums(not_stripeTFs_changes>0))## 
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 25 27 51 64 67 68 67 66 77 59 82 73 71 65 47 61 60 42 37 35 33 25 29 23 24 19 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42 43 44 45 46 47 48 50 53 54 56 
## 20 19 16  9 11 15 10  7  3  8  2  3  2  2  1  2  1  2  3  1  1  1  1  1  1  1 
## 66 68 70 
##  1  1  1
Export these enhancers
# Split the linked enhancers into those with at least one change in a key
# stripe TF and those with none, and store both tables.
enhancers__stripe_factors = TFmat_linked_with_activation[,keyTFs]
enhancers_with_stripe_factors = TFmat_linked_with_activation[rowSums(enhancers__stripe_factors)>0,]
enhancers_wo_stripe_factors = TFmat_linked_with_activation[rowSums(enhancers__stripe_factors)==0,]
save(enhancers_with_stripe_factors,
     enhancers_wo_stripe_factors,
     file=paste0(objects_directory,"enhancers_stripeTFs_no_stripeTFs.RData"))

# Count, for every peak and every TF, how many records of `tfmut` refer to
# that (peak, TF) pair.
#
# tfmut    object with `$peak` and `$TF` entries of equal length (one record
#          per TFBS change).
# theTFs   TF names defining the columns; duplicates are collapsed.
# allPeaks named object; its names define the rows and its length the number
#          of rows.
#
# Returns an integer data.frame (peaks x TFs), zero where no record exists.
# Records whose peak or TF is not among the rows/columns are ignored.
#
# All pairs are tallied in one pass. The earlier per-TF loop rescanned every
# row name and did a data.frame matrix-index assignment for each TF, which
# gets very slow for large peak sets.
numberTFperPeak = function(tfmut, theTFs, allPeaks) {
  tfs <- unique(theTFs)
  n_peaks <- length(allPeaks)
  n_tfs <- length(tfs)

  # Unnamed peaks get the automatic row names "1", "2", ... of a data.frame.
  peak_labels <- names(allPeaks)
  if (is.null(peak_labels)) {
    peak_labels <- as.character(seq_len(n_peaks))
  }

  row_idx <- match(as.character(tfmut$peak), peak_labels)
  col_idx <- match(as.character(tfmut$TF), as.character(tfs))
  # Drop records that fall outside the table or have a missing TF.
  known <- !is.na(row_idx) & !is.na(col_idx) & !is.na(tfmut$TF)

  # Column-major cell index of each record, then one tally over all cells.
  cell <- row_idx[known] + (col_idx[known] - 1L) * n_peaks
  counts <- matrix(tabulate(cell, nbins = n_peaks * n_tfs),
                   nrow = n_peaks, ncol = n_tfs)

  res <- as.data.frame(counts)
  rownames(res) <- names(allPeaks)
  colnames(res) <- tfs
  res
}
## -----------------------
# Species-specific TFBS changes: keep the changes of one species that do not
# overlap any change of the other species.
# A logical mask is used instead of x[-queryHits(...)]: with zero overlaps the
# negative index is empty and x[-integer(0)] would return an EMPTY object
# rather than all of x.
linked_with_activation_TFBSchange_Hs_spe = linked_with_activation_TFBSchange[
  !seq_along(linked_with_activation_TFBSchange) %in%
    queryHits(findOverlaps(linked_with_activation_TFBSchange, linked_with_activation_TFBSchange_chimp)) ]
linked_with_activation_TFBSchange_Pt_spe = linked_with_activation_TFBSchange_chimp[
  !seq_along(linked_with_activation_TFBSchange_chimp) %in%
    queryHits(findOverlaps(linked_with_activation_TFBSchange_chimp, linked_with_activation_TFBSchange)) ]
# Per-enhancer, per-TF counts of the species-specific changes.
TFmat_linked_with_activation_Hs = numberTFperPeak( tfmut=linked_with_activation_TFBSchange_Hs_spe,
                                                   theTFs=unique(TFsEnsemblG$Fixed),
                                                   allPeaks=enhancers_linked_with_activation )
TFmat_linked_with_activation_Pt = numberTFperPeak( tfmut=linked_with_activation_TFBSchange_Pt_spe,
                                                   theTFs=unique(TFsEnsemblG$Fixed),
                                                   allPeaks=enhancers_linked_with_activation )
# Same for the conserved enhancers.
conserved_TFBSchange_Hs_spe = conserved_TFBSchange[
  !seq_along(conserved_TFBSchange) %in%
    queryHits(findOverlaps(conserved_TFBSchange, conserved_TFBSchange_chimp)) ]
conserved_TFBSchange_Pt_spe = conserved_TFBSchange_chimp[
  !seq_along(conserved_TFBSchange_chimp) %in%
    queryHits(findOverlaps(conserved_TFBSchange_chimp, conserved_TFBSchange)) ]
TFmat_conserved_Hs = numberTFperPeak( tfmut=conserved_TFBSchange_Hs_spe,
                                      theTFs=unique(TFsEnsemblG$Fixed),
                                      allPeaks=conserved_enhancers )
TFmat_conserved_Pt = numberTFperPeak( tfmut=conserved_TFBSchange_Pt_spe,
                                      theTFs=unique(TFsEnsemblG$Fixed),
                                      allPeaks=conserved_enhancers )
# Check that both conserved matrices list their rows in the same order.
all(rownames(TFmat_conserved_Hs) == rownames(TFmat_conserved_Pt))
## [1] TRUE
# Per (enhancer, TF) cell: "gain" where the human-specific count exceeds the
# chimp-specific count, "loss" where it is smaller.
net_TFBS_gain_conserved <- TFmat_conserved_Hs > TFmat_conserved_Pt
net_TFBS_loss_conserved <- TFmat_conserved_Hs < TFmat_conserved_Pt
net_TFBS_gain_linked <- TFmat_linked_with_activation_Hs > TFmat_linked_with_activation_Pt
net_TFBS_loss_linked <- TFmat_linked_with_activation_Hs < TFmat_linked_with_activation_Pt
## -----------------------
# Two square panels: per TF, number of enhancers with a net gain (x) against
# the number with a net loss (y); key TFs in blue, diagonal = equal counts.
par(mfrow = c(1, 2), pty = "s", mar = c(4, 4, 3, 3))
plot(
  colSums(net_TFBS_gain_linked), colSums(net_TFBS_loss_linked),
  main = "Linked", xlab = "Gain in TFBS", ylab = "TFBS loss",
  xlim = c(0, 60), ylim = c(0, 60),
  pch = 19, cex = 0.5,
  col = ifelse(colnames(net_TFBS_gain_linked) %in% keyTFs, "blue", "gray")
)
abline(a = 0, b = 1)
axis(1, lwd = 2)
axis(2, lwd = 2)
box(col = "black", lwd = 2)
plot(
  colSums(net_TFBS_gain_conserved), colSums(net_TFBS_loss_conserved),
  main = "Conserved", xlab = "Gain in TFBS", ylab = "TFBS loss",
  xlim = c(0, 60), ylim = c(0, 60),
  pch = 19, cex = 0.5,
  col = ifelse(colnames(net_TFBS_gain_conserved) %in% keyTFs, "blue", "gray")
)
abline(a = 0, b = 1)
axis(1, lwd = 2)
axis(2, lwd = 2)
box(col="black",lwd=2)up_set = HS_UP_Genes$ensembl_id
# Promoters of the genes in up_set, resized to 1,000,000 bp around their centre.
promoters_HITS_UP <- promoters_filtered_gr[which(promoters_filtered_gr$gene_id %in% up_set)]
promoters_HITS_UP_500 <- resize(promoters_HITS_UP, 1000000, fix = "center")

# Per window: number of overlapping linked enhancers with / without a change in
# a key stripe TF, and number of overlapping gained enhancers of any kind.
promoters_HITS_UP_500_counting <- data.frame(
  with_stripeTF = countOverlaps(
    promoters_HITS_UP_500,
    enhancers_linked_with_activation[which(names(enhancers_linked_with_activation) %in% rownames(enhancers_with_stripe_factors))]
  ),
  wo_stripeTF = countOverlaps(
    promoters_HITS_UP_500,
    enhancers_linked_with_activation[which(names(enhancers_linked_with_activation) %in% rownames(enhancers_wo_stripe_factors))]
  ),
  any = countOverlaps(promoters_HITS_UP_500, genuine_gained_enhancers_gr)
)

## stripe no stripe
# Row names of the windows in each of the four with/without combinations.
prom_with_with <- rownames(subset(promoters_HITS_UP_500_counting, with_stripeTF > 0 & wo_stripeTF > 0))
prom_with_wo <- rownames(subset(promoters_HITS_UP_500_counting, with_stripeTF > 0 & wo_stripeTF == 0))
prom_wo_with <- rownames(subset(promoters_HITS_UP_500_counting, with_stripeTF == 0 & wo_stripeTF > 0))
prom_wo_wo <- rownames(subset(promoters_HITS_UP_500_counting, with_stripeTF == 0 & wo_stripeTF == 0))

# Windows overlapping at least one linked enhancer, shown as a two-column
# presence/absence image sorted by (with_stripeTF, wo_stripeTF).
promoters_HITS_UP_500_counting_enh <- subset(promoters_HITS_UP_500_counting,
                                             with_stripeTF + wo_stripeTF > 0)
m <- promoters_HITS_UP_500_counting_enh > 0
m <- m[order(m[, 1], m[, 2]), 1:2]
par(mar = c(1, 1, 1, 1))
image(t(m), col = c("white", "coral2"), axes = FALSE)
box(col = "black", lwd = 2)
abline(v=0.5,lwd=2)
sum(m[,1]==0 & m[,2]>0)
## [1] 29
sum(m[,1]>0 & m[,2]>0)
## [1] 258
sum(m[,1]>0 & m[,2]==0)
## [1] 299
# Collect footprint scores from every "<prefix>_FootPrints" entry of TFdir.
#
# TFdir    directory containing one "<prefix>_FootPrints" sub-directory per
#          motif, each holding a headerless, tab-separated "<prefix>.bed".
# SPECIES  label stored in the `species` column of the result.
#
# Returns a data.frame with the columns score (5th BED column), TF (the part
# of the prefix before its first "_") and species, or NULL when TFdir has no
# matching entry.
#
# Only entries whose name ends in "_FootPrints" are used. The earlier
# strsplit() on every directory entry turned unrelated files, and any text
# following "_FootPrints", into bogus prefixes.
readFootprintAnalysis_bed = function(TFdir, SPECIES) {
  fp_entries <- list.files(TFdir, pattern = "_FootPrints$")
  prefixes <- sub("_FootPrints$", "", fp_entries)
  per_motif <- lapply(prefixes, function(prefix) {
    bed_path <- paste0(TFdir, "/", prefix, "_FootPrints/", prefix, ".bed")
    # A zero-byte BED file has no footprints; read.delim() would stop on it.
    if (isTRUE(file.size(bed_path) == 0)) {
      return(NULL)
    }
    bed <- read.delim(bed_path, sep = "\t", header = FALSE)
    data.frame(score = bed$V5,
               TF = unlist(strsplit(prefix, "_"))[1],
               species = SPECIES)
  })
  do.call("rbind", per_motif)
}
# Footprint scores for the human and chimpanzee runs.
footprintHg <- readFootprintAnalysis_bed(
  paste0(outputs_directory, "footprint_analysis/Stripe_TF_HG38_Footprints_10bp/"),
  "Human"
)
footprintPt <- readFootprintAnalysis_bed(
  paste0(outputs_directory, "footprint_analysis/Stripe_TF_PT06_Footprints_10bp/"),
  "Chimpanzee"
)
# Subsets restricted to the TFs whose `Fixed` name is in keyTFs (matched via
# the first column of TFsEnsemblG).
# NOTE(review): these two subsets are not used by the plot below, which draws
# the unfiltered tables -- confirm that this is intended.
footprintHg_keyTFs <- subset(footprintHg, TF %in% TFsEnsemblG[TFsEnsemblG$Fixed %in% keyTFs, 1])
footprintPt_keyTFs <- subset(footprintPt, TF %in% TFsEnsemblG[TFsEnsemblG$Fixed %in% keyTFs, 1])
# Stack both species; Human is the first factor level, Chimpanzee the second.
footprint_scores <- rbind(footprintHg, footprintPt)
footprint_scores$species <- factor(footprint_scores$species, levels = c("Human", "Chimpanzee"))
# Rotated box plot of score per TF, coloured by species, outliers hidden.
p1 <- ggboxplot(
  footprint_scores,
  x = "TF", y = "score", color = "species",
  palette = c("black", "red"), outlier.shape = NA, rotate = TRUE
)
ggpar(p1,ylim = c(0,500)) + rotate_x_text(90)## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.enhancers_linked_with_activation_TADs = import.bed(paste0(outputs_directory,"enhancers_linked_with_activation_TADs.bed"))
# Name the imported ranges by their BED `name` column.
names(enhancers_linked_with_activation_TADs) <- enhancers_linked_with_activation_TADs$name
enhancers_not_linked_with_activation_TADs <- import.bed(
  paste0(outputs_directory, "enhancers_not_linked_with_activation_TADs.bed")
)
names(enhancers_not_linked_with_activation_TADs) <- enhancers_not_linked_with_activation_TADs$name

# Read the per-chromosome TFBS results for both enhancer sets (chr1-22, X, Y).
linked_with_activation_TADs_TFBSchange <- readBedtools_res(
  filePath = paste0(outputs_directory, "/TFBS_analysis/enhancers_linked_with_activation_TADs/"),
  chroms = paste0("chr", c(1:22, 'X', 'Y')), 4, 7
)
not_linked_with_activation_TADs_TFBSchange <- readBedtools_res(
  filePath = paste0(outputs_directory, "/TFBS_analysis/enhancers_not_linked_with_activation_TADs/"),
  chroms = paste0("chr", c(1:22, 'X', 'Y')), 4, 7
)

# Post-process both result sets with the TF annotation table, then store them.
linked_with_activation_TADs_TFBSchange <- processTFBSresult(
  linked_with_activation_TADs_TFBSchange,
  tfanno = TFsEnsemblG,
  nameColumn = "names"
)
not_linked_with_activation_TADs_TFBSchange <- processTFBSresult(
  not_linked_with_activation_TADs_TFBSchange,
  tfanno = TFsEnsemblG,
  nameColumn = "names"
)
save(
  linked_with_activation_TADs_TFBSchange,
  not_linked_with_activation_TADs_TFBSchange,
  file = paste0(objects_directory, "linked_or_not_with_activation_TADs_TFBSchange_chimp.RData")
)

# Enhancer x TF matrices for both sets.
TFmat_linked_with_activation_TADs <- makeMatrixTFBS4peaks(
  tfmut = linked_with_activation_TADs_TFBSchange,
  theTFs = unique(TFsEnsemblG$Fixed),
  allPeaks = enhancers_linked_with_activation_TADs
)
TFmat_not_linked_with_activation_TADs <- makeMatrixTFBS4peaks(
  tfmut = not_linked_with_activation_TADs_TFBSchange,
  theTFs = unique(TFsEnsemblG$Fixed),
  allPeaks = enhancers_not_linked_with_activation_TADs
)
save(TFmat_linked_with_activation_TADs,TFmat_not_linked_with_activation_TADs,
     file=paste0(objects_directory,"TFmatrices_linked_not_linked_TADs.RData"))load(paste0(objects_directory,"TFmatrices_linked_not_linked_TADs.RData"))
# Per-TF counts of sequences with (x>0) and without (x==0) the motif, for the
# linked (TAL) and not-linked (LAT) TAD-restricted enhancer matrices.
TAL = do.call("rbind",
              apply(TFmat_linked_with_activation_TADs,2,function(x){data.frame( Motif=sum(x>0),
                                                                 noMotif=sum(x==0) ) } ) )
LAT = do.call("rbind",
              apply(TFmat_not_linked_with_activation_TADs,2,function(x){data.frame( Motif=sum(x>0),
                                                                               noMotif=sum(x==0) ) } ) )
# One Fisher's exact test per TF on the 2x2 table
# (linked / notLinked) x (Motif / noMotif).
TFs_TAD_FT = data.frame()
for( i in colnames(TFmat_linked_with_activation_TADs) ){
  m=rbind(linked=TAL[rownames(TAL)==i,],
          notLinked=LAT[rownames(LAT)==i,])
  tp = fisher.test(m)
  # Keep the p-value, the odds-ratio estimate, the raw motif counts and the
  # fraction of sequences carrying the motif in each set.
  tp = data.frame(p_val=tp$p.value,
                  odds=tp$estimate,
                  number_in_linked = TAL[rownames(TAL)==i,1],
                  number_in_not_linked = LAT[rownames(LAT)==i,1],
                  fraction_in_linked = TAL[rownames(TAL)==i,1]/rowSums(TAL[rownames(TAL)==i,]),
                  fraction_in_not_linked = LAT[rownames(LAT)==i,1]/rowSums(LAT[rownames(LAT)==i,]),
                  tf = i)
  # New rows are prepended, so the final table is in reverse column order.
  TFs_TAD_FT=rbind(tp,TFs_TAD_FT) }
# p.adjust() is called without `method`, i.e. with its default (Holm) correction.
TFs_TAD_FT$p_adjust = p.adjust(TFs_TAD_FT$p_val)
# Square scatter plot: motif fraction in linked (x) vs not-linked (y) sequences,
# TFs with adjusted p < 0.05 highlighted in blue; the diagonal marks equal fractions.
par(pty="s",mfrow=c(1,1))
plot( x=TFs_TAD_FT$fraction_in_linked, 
      y=TFs_TAD_FT$fraction_in_not_linked,
      pch=19, cex=0.5, 
      xlab="Linked with activation",
      ylab="Not linked with activation",
      xlim=c(0,0.3), ylim=c(0,0.3),
      col=ifelse(TFs_TAD_FT$p_adjust<0.05 ,"blue3","wheat2"))
abline(a=0,b=1,col='black')
axis(1,lwd=2)
axis(2,lwd=2)
box(col='black',lwd=2)
text(x=TFs_TAD_FT$fraction_in_linked[TFs_TAD_FT$p_adjust<0.05 ]+0.005,
     y=TFs_TAD_FT$fraction_in_not_linked[TFs_TAD_FT$p_adjust<0.05 ]+0.005,
     TFs_TAD_FT$tf[TFs_TAD_FT$p_adjust<0.05 ],
     cex=1)TFs_TAD_FT[TFs_TAD_FT$p_val<0.01 & TFs_TAD_FT$fraction_in_linked>0.1,]##                                p_val     odds number_in_linked
## odds ratio673 0.00150988966405420457 1.608891               64
## odds ratio669 0.00061743780893750864 1.683180               63
## odds ratio664 0.00000001553881099418 1.845451              143
## odds ratio645 0.00002249720073059971 1.586955              135
## odds ratio639 0.00547001523869697359 1.401620              100
## odds ratio636 0.00001844604778766271 1.738707               91
## odds ratio635 0.00000177955637888710 1.693353              133
## odds ratio626 0.00051003649052152309 1.607932               79
## odds ratio623 0.00004148693474305783 1.610882              117
## odds ratio616 0.00020763714135284235 1.702754               72
## odds ratio603 0.00000000100972795250 2.345061               86
## odds ratio587 0.00044599744467552140 1.518042              109
## odds ratio584 0.00005385690669902456 1.799449               72
## odds ratio582 0.00000037117379643772 1.768726              130
## odds ratio579 0.00003393160944478522 1.537540              153
## odds ratio566 0.00000000003216698365 2.880740               68
## odds ratio538 0.00000410606699485287 1.786143               98
## odds ratio537 0.00000000049265940835 2.234227              101
## odds ratio514 0.00000028707333831881 1.828129              117
## odds ratio513 0.00000196034299164705 1.876367               89
## odds ratio512 0.00000000000004617375 2.398890              131
## odds ratio511 0.00000000010021448003 2.256986              106
## odds ratio478 0.00003385372076594007 1.704218               93
## odds ratio473 0.00003225347827537949 1.725282               90
## odds ratio458 0.00000033506023894474 1.969773               91
## odds ratio457 0.00000010818835810204 2.112920               80
## odds ratio408 0.00000000256319916115 1.926115              141
## odds ratio308 0.00000138201327477164 1.652499              155
## odds ratio287 0.00000002345307931912 2.008273              105
## odds ratio286 0.00000025777319341452 2.094659               77
## odds ratio284 0.00000000811899658359 2.205526               87
## odds ratio283 0.00000000007428138142 2.293758              103
## odds ratio282 0.00001458947286974133 1.698132              104
## odds ratio96  0.00000578088929590625 1.670073              124
## odds ratio95  0.00000000624475659928 2.205966               88
## odds ratio57  0.00001776370364941432 1.818494               78
##               number_in_not_linked fraction_in_linked fraction_in_not_linked
## odds ratio673                  316             0.1024             0.06620574
## odds ratio669                  298             0.1008             0.06243453
## odds ratio664                  661             0.2288             0.13848732
## odds ratio645                  706             0.2160             0.14791536
## odds ratio639                  571             0.1600             0.11963126
## odds ratio636                  426             0.1456             0.08925204
## odds ratio635                  657             0.2128             0.13764928
## odds ratio626                  394             0.1264             0.08254766
## odds ratio623                  597             0.1872             0.12507857
## odds ratio616                  339             0.1152             0.07102451
## odds ratio603                  304             0.1376             0.06369160
## odds ratio587                  583             0.1744             0.12214540
## odds ratio584                  322             0.1152             0.06746281
## odds ratio582                  617             0.2080             0.12926880
## odds ratio579                  831             0.2448             0.17410434
## odds ratio566                  194             0.1088             0.04064530
## odds ratio538                  450             0.1568             0.09428033
## odds ratio537                  379             0.1616             0.07940499
## odds ratio514                  534             0.1872             0.11187932
## odds ratio513                  388             0.1424             0.08129059
## odds ratio512                  475             0.2096             0.09951812
## odds ratio511                  396             0.1696             0.08296669
## odds ratio478                  444             0.1488             0.09302326
## odds ratio473                  424             0.1440             0.08883302
## odds ratio458                  380             0.1456             0.07961450
## odds ratio457                  310             0.1280             0.06494867
## odds ratio408                  627             0.2256             0.13136392
## odds ratio308                  794             0.2480             0.16635240
## odds ratio287                  436             0.1680             0.09134716
## odds ratio286                  300             0.1232             0.06285355
## odds ratio284                  326             0.1392             0.06830086
## odds ratio283                  378             0.1648             0.07919547
## odds ratio282                  502             0.1664             0.10517494
## odds ratio96                   616             0.1984             0.12905929
## odds ratio95                   330             0.1408             0.06913891
## odds ratio57                   347             0.1248             0.07270061
##                    tf            p_adjust
## odds ratio673 ZSCAN22 0.92254258473711903
## odds ratio669   ZNF76 0.38281144154125535
## odds ratio664  ZNF770 0.00001034884812212
## odds ratio645  ZNF467 0.01444320286904501
## odds ratio639  ZNF394 1.00000000000000000
## odds ratio636  ZNF350 0.01189770082304245
## odds ratio635  ZNF341 0.00116738898454994
## odds ratio626  ZNF281 0.31724269710438735
## odds ratio623  ZNF263 0.02638569049658478
## odds ratio616  ZNF148 0.13060376191093784
## odds ratio603     ZFX 0.00000067651772818
## odds ratio587  ZBTB17 0.27830240547752533
## odds ratio584  ZNF324 0.03419913575388060
## odds ratio582     WT1 0.00024460353185246
## odds ratio579   VEZF1 0.02168229843521776
## odds ratio566   THAP1 0.00000002168054698
## odds ratio538   TBX15 0.00267715568064407
## odds ratio537    TBX1 0.00000033057446300
## odds ratio514     SP4 0.00018975547662873
## odds ratio513     SP3 0.00128402465952881
## odds ratio512     SP2 0.00000000003116728
## odds ratio511     SP1 0.00000006734413058
## odds ratio478    RXRA 0.02166638129020165
## odds ratio473   RREB1 0.02067447957451825
## odds ratio458    RARA 0.00022113975770353
## odds ratio457    PURA 0.00007183706977975
## odds ratio408   PATZ1 0.00000171478023881
## odds ratio308     MAZ 0.00090798272152497
## odds ratio287    KLF6 0.00001559629774721
## odds ratio286    KLF5 0.00017064585404041
## odds ratio284    KLF3 0.00000541537072126
## odds ratio283   KLF16 0.00000004999136969
## odds ratio282   KLF15 0.00943938894672264
## odds ratio96     EGR2 0.00376335893163497
## odds ratio95     EGR1 0.00000417149740832
## odds ratio57    NR2F1 0.01147535255752165ms=64000*1024^2 
# Raise the size limit for globals exported by the `future` framework to `ms` bytes.
options(future.globals.maxSize = ms)

# 10x raw_feature_bc_matrix directories of the human prenatal samples.
human1 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_667_S13_SingleCell/raw_feature_bc_matrix/')
human2 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_698_S15_SingleCell/raw_feature_bc_matrix/')
human3 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_700_S14_SingleCell/raw_feature_bc_matrix/')
human4 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_702_9C_SingleCell/raw_feature_bc_matrix/')
human5 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_667_60C_SingleCell/raw_feature_bc_matrix/')
human6 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_669_3C_SingleCell/raw_feature_bc_matrix/')
human7 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_671_64C_SingleCell/raw_feature_bc_matrix/')
human8 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_673_62C_SingleCell/raw_feature_bc_matrix/')
human9 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_675_23C_SingleCell/raw_feature_bc_matrix/')
human10 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_677_63C_SingleCell/raw_feature_bc_matrix/')
human11 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_679_30C_SingleCell/raw_feature_bc_matrix/')
human12 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_681_34C_SingleCell/raw_feature_bc_matrix/')
human13 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_686_56C_SingleCell/raw_feature_bc_matrix/')
human14 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_690_26C_SingleCell/raw_feature_bc_matrix/')
human15 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_F_688_11C_SingleCell/raw_feature_bc_matrix/')
human16 <- paste0(outputs_directory, '/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/HS_Prenatal_M_692_24C_SingleCell/raw_feature_bc_matrix/')

# 10x raw_feature_bc_matrix directories of the rhesus macaque samples.
macaque1 <- paste0(outputs_directory, '/scRNA_published_data/RhMacaque/scRNA_syn17093056_RMB683_DFC/multi/count/raw_feature_bc_matrix/')
macaque2 <- paste0(outputs_directory, '/scRNA_published_data/RhMacaque/scRNA_syn17093056_RMB691_DFC/multi/count/raw_feature_bc_matrix/')
macaque_Ch_78_1 <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/scRNA_SRR23687004_macaque/raw_feature_bc_matrix/')
macaque_Ch_110_DFC <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E110_SRR23687017_M_DFC_scRNA/raw_feature_bc_matrix/')
macaque_Ch_110_OFC <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E110_SRR23686999_M_OFC_scRNA/raw_feature_bc_matrix/')
macaque_Ch_93_DFC <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E93_SRR23687065_M_DFC_scRNA/raw_feature_bc_matrix/')
macaque_Ch_110_DFC_S2 <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E110_SRR23687057_M_DFC_scRNA/raw_feature_bc_matrix/')
macaque_Ch_110_VFC <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E110_SRR23687060_M_VFC_scRNA/raw_feature_bc_matrix/')
macaque_Ch_110_VFC_S2 <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E110_SRR23687012_M_VFC_scRNA/raw_feature_bc_matrix/')
macaque_Ch_77_Frontal <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E77_F_Frontal/raw_feature_bc_matrix/')
macaque_Ch_64_Frontal <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E64_F_Frontal/raw_feature_bc_matrix/')
macaque_Ch_62_Frontal <- paste0(outputs_directory, 'scRNA_published_data/RhMacaque/E62_F_frontal_scRNA/raw_feature_bc_matrix/')

# Published cell metadata (read into a table) and the path of the sample annotation file.
human_metadata <- read.csv(paste0(outputs_directory, '/scRNA_published_data/GSE217511_CorticalPlate_Seuratmetadata.csv'))
human_sample_anno <- paste0(outputs_directory, '/scRNA_published_data/MetaTable.txt')
human_metadata$UMI = unlist(lapply(strsplit(human_metadata$X,"_"),function(x){x[[1]]}))
Let’s consider the data from the foetal-like cells
human1_expression=Read10X( human1 )
human2_expression=Read10X( human2 )
human3_expression=Read10X( human3 )
human4_expression=Read10X( human4 )
human5_expression=Read10X( human5 )
human6_expression=Read10X( human6 )
human7_expression=Read10X( human7 )
human8_expression=Read10X( human8 )
human9_expression=Read10X( human9 )
human10_expression=Read10X( human10 )
human11_expression=Read10X( human11 )
human12_expression=Read10X( human12 )
human13_expression=Read10X( human13 )
human14_expression=Read10X( human14 )
human15_expression=Read10X( human15 )
human16_expression=Read10X( human16 )
macaque1_expression=Read10X( macaque1 )
macaque2_expression=Read10X( macaque2 )
macaque_CN_78_1_expression=Read10X( macaque_Ch_78_1 )
macaque_DFC_110_1_expression=Read10X( macaque_Ch_110_DFC )
macaque_OFC_110_1_expression=Read10X( macaque_Ch_110_OFC )
macaque_93_DFC_expression= Read10X( macaque_Ch_93_DFC )
macaque_110_DFC_S2_expression= Read10X( macaque_Ch_110_DFC_S2 )
macaque_110_VFC_expression=Read10X( macaque_Ch_110_VFC)
macaque_110_VFC_S2_expression=Read10X( macaque_Ch_110_VFC_S2)
macaque_77_Frontal_expression=Read10X( macaque_Ch_77_Frontal)
macaque_64_Frontal_expression=Read10X( macaque_Ch_64_Frontal)
macaque_62_Frontal_expression=Read10X( macaque_Ch_62_Frontal)
## ---------------------------
human1_expression = CreateSeuratObject(human1_expression, project = "AstroEvo_old", min.cells = 3, min.features = 200)
# Wrap each raw expression object in a Seurat object, overwriting the variable
# in place. Every sample uses the same settings: project "AstroEvo_old",
# features kept only if seen in at least 3 cells (min.cells), cells kept only
# if they have at least 200 features (min.features).
# human1_expression is not listed here: it is not converted in this run of
# statements (presumably handled on the line just above - confirm).
raw_expression_vars <- c(
  paste0("human", 2:16, "_expression"),
  "macaque1_expression", "macaque2_expression",
  "macaque_CN_78_1_expression", "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression", "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression", "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression", "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression", "macaque_62_Frontal_expression"
)
for (raw_var in raw_expression_vars) {
  assign(
    raw_var,
    CreateSeuratObject(
      get(raw_var),
      project = "AstroEvo_old",
      min.cells = 3,
      min.features = 200
    )
  )
}
## ----------------
# Attach a "percent.mt" metadata column to every sample: the percentage of
# counts coming from features whose names match "^MT-".
# NOTE(review): the same pattern is applied to the macaque samples; this
# assumes their gene symbols also carry the "MT-" prefix - confirm.
attach_percent_mt <- function(seurat_obj) {
  seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj, pattern = "^MT-")
  seurat_obj
}
percent_mt_vars <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression", "macaque2_expression",
  "macaque_CN_78_1_expression", "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression", "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression", "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression", "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression", "macaque_62_Frontal_expression"
)
for (mt_var in percent_mt_vars) {
  assign(mt_var, attach_percent_mt(get(mt_var)))
}
## --------------------------------------------
# Overwrite orig.ident in every sample with a per-sample label. The table maps
# each object's variable name to the label written into its metadata; these
# labels are what SplitObject(split.by = "orig.ident") later splits on.
orig_ident_labels <- c(
  setNames(paste0("human", 1:16), paste0("human", 1:16, "_expression")),
  macaque1_expression = "Macaque1",
  macaque2_expression = "Macaque2",
  macaque_CN_78_1_expression = "Macaque3_78_1",
  macaque_DFC_110_1_expression = "Macaque3_110_DFC",
  macaque_OFC_110_1_expression = "Macaque3_110_OFC",
  macaque_93_DFC_expression = "Macaque3_93_DFC",
  macaque_110_DFC_S2_expression = "Macaque3_110_DFC_S2",
  macaque_110_VFC_expression = "Macaque3_110_VFC",
  macaque_110_VFC_S2_expression = "Macaque3_110_VFC_S2",
  macaque_77_Frontal_expression = "Macaque3_77_Frontal",
  macaque_64_Frontal_expression = "Macaque3_64_Frontal",
  macaque_62_Frontal_expression = "Macaque3_62_Frontal"
)
label_orig_ident <- function(seurat_obj, label) {
  seurat_obj$orig.ident <- label
  seurat_obj
}
for (ident_var in names(orig_ident_labels)) {
  assign(
    ident_var,
    label_orig_ident(get(ident_var), orig_ident_labels[[ident_var]])
  )
}
## ------------------------------------------
# Feature names of the first human sample. Kept because later parts of the
# script may still refer to `all_genes`; it is no longer used for the
# per-sample filtering below.
all_genes <- rownames(human1_expression)

# Score one sample for cell-cycle phase with Seurat's bundled `cc.genes` lists.
#
# The S and G2/M gene lists are restricted to the genes present in *this*
# object. Previously every sample was filtered against human1's feature set,
# but each object has its own features after the min.cells filter applied at
# creation, so genes missing from human1 were dropped for all other samples
# and genes missing from a given sample were still passed in.
# set.ident = FALSE leaves the object's identities untouched.
score_cell_cycle <- function(seurat_obj) {
  genes_present <- rownames(seurat_obj)
  CellCycleScoring(
    seurat_obj,
    g2m.features = cc.genes$g2m.genes[cc.genes$g2m.genes %in% genes_present],
    s.features = cc.genes$s.genes[cc.genes$s.genes %in% genes_present],
    set.ident = FALSE
  )
}

cell_cycle_vars <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression", "macaque2_expression",
  "macaque_CN_78_1_expression", "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression", "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression", "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression", "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression", "macaque_62_Frontal_expression"
)
for (cc_var in cell_cycle_vars) {
  assign(cc_var, score_cell_cycle(get(cc_var)))
}
# Recompute the "percent.mt" metadata column (percentage of counts from
# features matching "^MT-") for every sample, right before the QC filter that
# reads it. The same column is also computed earlier in the script, before
# cell-cycle scoring; this pass overwrites it.
refresh_percent_mt <- function(seurat_obj) {
  seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj, pattern = "^MT-")
  seurat_obj
}
refresh_mt_vars <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression", "macaque2_expression",
  "macaque_CN_78_1_expression", "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression", "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression", "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression", "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression", "macaque_62_Frontal_expression"
)
for (refresh_var in refresh_mt_vars) {
  assign(refresh_var, refresh_percent_mt(get(refresh_var)))
}
# Cell-level QC, applied identically to every sample: keep cells with more
# than 200 and fewer than 10000 detected features (nFeature_RNA) and with
# percent.mt below 1. Each variable is overwritten with its filtered object.
qc_filter_vars <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression", "macaque2_expression",
  "macaque_CN_78_1_expression", "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression", "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression", "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression", "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression", "macaque_62_Frontal_expression"
)
for (qc_var in qc_filter_vars) {
  assign(
    qc_var,
    subset(
      get(qc_var),
      subset = nFeature_RNA > 200 & nFeature_RNA < 10000 & percent.mt < 1
    )
  )
}
# Merge the 28 filtered samples (16 human, 12 macaque) into one Seurat object.
# add.cell.ids supplies one prefix per sample, in the same order as the
# objects (x first, then y), to keep cell names distinct across samples.
# NOTE(review): the macaque prefixes follow several different naming schemes
# and do not match the orig.ident labels; they are reproduced unchanged here.
alldata <- merge(
  x = human1_expression,
  y = c(
    human2_expression, human3_expression, human4_expression,
    human5_expression, human6_expression, human7_expression,
    human8_expression, human9_expression, human10_expression,
    human11_expression, human12_expression, human13_expression,
    human14_expression, human15_expression, human16_expression,
    macaque1_expression, macaque2_expression,
    macaque_CN_78_1_expression,
    macaque_DFC_110_1_expression,
    macaque_OFC_110_1_expression,
    macaque_93_DFC_expression,
    macaque_110_DFC_S2_expression,
    macaque_110_VFC_expression,
    macaque_110_VFC_S2_expression,
    macaque_77_Frontal_expression,
    macaque_64_Frontal_expression,
    macaque_62_Frontal_expression
  ),
  add.cell.ids = c(
    paste0("human", 1:16),
    "macaque1", "macaque2", "macaque_78_1",
    "Macaque3_110_DFC", "Macaque3_110_OFC",
    "macaque_93_DFC_expression",
    "macaque_110_DFC_S2_expression",
    "macaque_110_VFC_expression",
    "macaque_110_VFC_S2_expression",
    "macaque_77_Frontal_expression",
    "macaque_64_Frontal_expression",
    "macaque_62_Frontal_expression"
  )
)
# Checkpoint the merged object to disk, then load it back from the same file.
# These were two calls fused on one line with no separator, which R cannot
# parse; they are now separate statements.
save(alldata, file = paste0(objects_directory, "scRNA_published_foetal_samples.RData"))
load(paste0(objects_directory, "scRNA_published_foetal_samples.RData"))
# Split the merged object back into one Seurat object per sample, keyed by the
# orig.ident label, and run the astrocyte-finding clustering on each human
# sample. Results are stored in variables human1 .. human16.
split_seurat <- SplitObject(alldata, split.by = "orig.ident")

# Samples are looked up by orig.ident name ("human1" .. "human16"), the same
# way the macaque samples are, instead of by list position. Positional
# indexing only works while the split happens to return samples in merge
# order, and would silently pick the wrong sample otherwise.
# The original run order is kept: quicker samples first, then the ones
# marked as long-running.
human_run_order <- c(
  1, 2, 3, 5, 6, 7, 9, 10, 13, 15, 16,
  4, 8, 11, 12, 14 # long
)
human_missing <- setdiff(paste0("human", human_run_order), names(split_seurat))
if (length(human_missing) > 0) {
  stop(
    "Samples missing from split_seurat: ",
    paste(human_missing, collapse = ", "),
    call. = FALSE
  )
}
for (human_index in human_run_order) {
  human_id <- paste0("human", human_index)
  assign(
    human_id,
    perform_clustering_to_find_astrocytes(split_seurat[[human_id]])
  )
}
# Write each clustered human sample to its own file,
# <objects_directory>human<N>_scRNA.RData, holding the object under its
# variable name (human1 .. human16).
for (saved_index in seq_len(16)) {
  saved_name <- paste0("human", saved_index)
  save(
    list = saved_name,
    file = paste0(objects_directory, saved_name, "_scRNA.RData")
  )
}
# NOTE(review): objects_directory is (re)assigned here to a hard-coded path,
# although it is already used by earlier save()/load() calls in this script.
objects_directory <- "~/Desktop/Ciuba_et_al_SM/data/objects/"

# Run the astrocyte-finding clustering on every macaque sample and save each
# result straight away. Samples are selected from split_seurat by orig.ident
# label; the i-th label below becomes variable macaque<i> and is written to
# <objects_directory>macaque<i>_scRNA.RData.
macaque_idents <- c(
  "Macaque1", "Macaque2", "Macaque3_78_1",
  "Macaque3_110_DFC", "Macaque3_110_OFC", "Macaque3_93_DFC",
  "Macaque3_110_DFC_S2", "Macaque3_110_VFC", "Macaque3_110_VFC_S2",
  "Macaque3_77_Frontal", "Macaque3_64_Frontal", "Macaque3_62_Frontal"
)
for (macaque_index in seq_along(macaque_idents)) {
  macaque_name <- paste0("macaque", macaque_index)
  assign(
    macaque_name,
    perform_clustering_to_find_astrocytes(
      split_seurat[[macaque_idents[[macaque_index]]]]
    )
  )
  save(
    list = macaque_name,
    file = paste0(objects_directory, macaque_name, "_scRNA.RData")
  )
}

# Reload the saved human1 clustering result. This call used to be fused onto
# the end of the last macaque save() line with no separator, which R cannot
# parse.
load(paste0(objects_directory,"human1_scRNA.RData"))
# Reload the saved clustering results for human2 .. human16 from
# <objects_directory>human<N>_scRNA.RData (human1 is loaded by the statement
# immediately before this loop).
for (loaded_index in 2:16) {
  load(paste0(objects_directory, "human", loaded_index, "_scRNA.RData"))
}
# For each clustered human sample, call findClusterCorrespondingToAstrocytes
# on the "RNA_snn_res.2" clustering with a fixed astrocyte marker panel, and
# store the result as human<N>_astrocyte_counts.
astrocyte_count_markers <- c(
  "SLC1A3", "SOX9", "GFAP", "AQP4", "ALDH1A1", "ID4", "APOE", "S100A9"
)
for (count_index in seq_len(16)) {
  assign(
    paste0("human", count_index, "_astrocyte_counts"),
    findClusterCorrespondingToAstrocytes(
      get(paste0("human", count_index)),
      chosenClusterSet = "RNA_snn_res.2",
      astrocyticMarkers = astrocyte_count_markers
    )
  )
}
save( human1_astrocyte_counts, human2_astrocyte_counts, human3_astrocyte_counts, human4_astrocyte_counts, human5_astrocyte_counts,
      human6_astrocyte_counts, human7_astrocyte_counts, human8_astrocyte_counts, human9_astrocyte_counts, human10_astrocyte_counts, human11_astrocyte_counts, human12_astrocyte_counts, human13_astrocyte_counts, human14_astrocyte_counts, human15_astrocyte_counts, human16_astrocyte_counts, file=paste0(objects_directory,"human_scRNA_pseudobulk_data.RData"))
# Run getAstrocytes() on every human sample with the same cluster set and
# marker panel; the results are stored as human1_astrocyte ... human16_astrocyte.
astrocyte_cluster_set = "RNA_snn_res.2"
astrocyte_markers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")
for (human_sample in paste0("human", 1:16)) {
  # NOTE(review): assumes getAstrocytes() only uses the value of its first
  # argument, not the expression it was called with - confirm
  assign(paste0(human_sample, "_astrocyte"),
         getAstrocytes(get(human_sample),
                       chosenClusterSet = astrocyte_cluster_set,
                       astrocyticMarkers = astrocyte_markers))
}
# Console output recorded for this step (one line is printed per call):
# [1] "astrocytes are in cluster 26 found 153 astrocytes"   # human1
# [1] "astrocytes are in cluster 9 found 276 astrocytes"    # human2
# [1] "astrocytes are in cluster 10 found 252 astrocytes"   # human3
# [1] "astrocytes are in cluster 22 found 280 astrocytes"   # human4
# [1] "astrocytes are in cluster 26 found 153 astrocytes"   # human5
# [1] "astrocytes are in cluster 20 found 85 astrocytes"    # human6
# [1] "astrocytes are in cluster 23 found 41 astrocytes"    # human7
# [1] "astrocytes are in cluster 21 found 205 astrocytes"   # human8
# [1] "astrocytes are in cluster 22 found 40 astrocytes"    # human9
# [1] "astrocytes are in cluster 21 found 82 astrocytes"    # human10
# [1] "astrocytes are in cluster 31 found 259 astrocytes"   # human11
# [1] "astrocytes are in cluster 27 found 96 astrocytes"    # human12
# [1] "astrocytes are in cluster 12 found 112 astrocytes"   # human13
# No output was recorded for human14-human16. The "# 153" notes that used to
# follow the human10-human16 calls were copy-pasted and disagree with the
# recorded output above, so they are not repeated here.
load(paste0(objects_directory,"human_scRNA_pseudobulk_data.RData"))
# Combine the per-sample human pseudobulk vectors into one table with one
# column per sample, build the matching sample annotation, and save both.
#
# The row names below are taken from human1_astrocyte_counts only, so every
# other vector must carry the same names in the same order; otherwise rows
# would be mislabelled without any error.
for (human_counts_name in paste0("human", 2:16, "_astrocyte_counts")) {
  if (!identical(names(get(human_counts_name)), names(human1_astrocyte_counts))) {
    stop(human_counts_name, " does not have the same names as human1_astrocyte_counts")
  }
}
human_astrocyte_counts = data.frame(human1=human1_astrocyte_counts,
                                    human2=human2_astrocyte_counts,
                                    human3=human3_astrocyte_counts,
                                    human4=human4_astrocyte_counts,
                                    human5=human5_astrocyte_counts,
                                    human6=human6_astrocyte_counts,
                                    human7=human7_astrocyte_counts,
                                    human8=human8_astrocyte_counts,
                                    human9=human9_astrocyte_counts,
                                    human10=human10_astrocyte_counts,
                                    human11=human11_astrocyte_counts,
                                    human12=human12_astrocyte_counts,
                                    human13=human13_astrocyte_counts,
                                    human14=human14_astrocyte_counts,
                                    human15=human15_astrocyte_counts,
                                    human16=human16_astrocyte_counts,
                                    row.names = names(human1_astrocyte_counts))
# one annotation row per column of the count table; all samples get the same labels
n_human_samples = ncol(human_astrocyte_counts)
human_astrocyte_counts_metadata = data.frame(Species=rep("Human",n_human_samples),
                                             Human_NHP=rep("Human",n_human_samples),
                                             study=rep("Mixed",n_human_samples),
                                             stage=rep("Foetal",n_human_samples),
                                             row.names=colnames(human_astrocyte_counts))
save( human_astrocyte_counts, human_astrocyte_counts_metadata,
      file=paste0(objects_directory,"human_astrocyte_counts.RData"))
# first of the macaque single-cell objects (this call was fused onto the end of save() above)
load(paste0(objects_directory,"macaque1_scRNA.RData"))
# Load the macaque single-cell objects 2-12 (macaque1 is loaded on the line
# directly above this block).
for (macaque_id in 2:12) {
  load(paste0(objects_directory,"macaque",macaque_id,"_scRNA.RData"))
}

# For every macaque sample compute the pseudobulk astrocyte counts
# (macaqueN_astrocyte_counts) and the astrocyte subset (macaqueN_astrocyte).
# The samples are processed in batches; after each batch the counts are
# checkpointed to disk and the large input objects are removed to free memory.
#
# Both extractions run BEFORE rm(): previously getAstrocytes() was called only
# after all macaqueN objects had been removed, which fails with
# "object 'macaque1' not found" when the script is run top to bottom.
#
# NOTE(review): the checkpoint files go to ~/Desktop rather than
# objects_directory; the paths are kept unchanged in case something reads them.
astrocyte_cluster_set = "RNA_snn_res.2"
astrocyte_markers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")
macaque_batches = list(
  list(ids = 1:3,   file = "~/Desktop/macaques123.RData"),
  list(ids = 4:5,   file = "~/Desktop/macaques45.RData"),
  list(ids = 6:7,   file = "~/Desktop/macaques67.RData"),
  list(ids = 8:9,   file = "~/Desktop/macaques89.RData"),
  list(ids = 10:11, file = "~/Desktop/macaques10_11.RData"),
  list(ids = 12,    file = "~/Desktop/macaques_12.RData")
)
for (macaque_batch in macaque_batches) {
  batch_objects = paste0("macaque", macaque_batch$ids)
  for (macaque_object in batch_objects) {
    # get() is called inline (no intermediate variable) so that no extra
    # reference keeps the object alive after rm() below
    assign(paste0(macaque_object, "_astrocyte_counts"),
           findClusterCorrespondingToAstrocytes(get(macaque_object),
                                                chosenClusterSet = astrocyte_cluster_set,
                                                astrocyticMarkers = astrocyte_markers))
    assign(paste0(macaque_object, "_astrocyte"),
           getAstrocytes(get(macaque_object),
                         chosenClusterSet = astrocyte_cluster_set,
                         astrocyticMarkers = astrocyte_markers))
  }
  save( list = paste0(batch_objects, "_astrocyte_counts"),
        file = macaque_batch$file)
  rm(list = batch_objects)
  invisible(gc())
}
# One column per macaque sample, labelled with the sample name used downstream;
# row names come from macaque1_astrocyte_counts.
# The trailing numbers are the per-sample notes kept from the original call
# (presumably astrocyte cell counts - TODO confirm).
macaque_astrocyte_counts = do.call(data.frame, list(
  macaque1_late = macaque1_astrocyte_counts,        # 807
  macaque2_late = macaque2_astrocyte_counts,        # 36
  Macaque3_78_1 = macaque3_astrocyte_counts,        # 687
  Macaque3_110_DFC = macaque4_astrocyte_counts,     # 783
  Macaque3_110_OFC = macaque5_astrocyte_counts,     # 114
  Macaque3_93_DFC = macaque6_astrocyte_counts,      # 613
  Macaque3_110_DFC_S2 = macaque7_astrocyte_counts,  # 37
  Macaque3_110_VFC = macaque8_astrocyte_counts,     # 713
  Macaque3_110_VFC_S2 = macaque9_astrocyte_counts,  # 265
  Macaque3_77_Frontal = macaque10_astrocyte_counts, # 441
  Macaque3_64_Frontal = macaque11_astrocyte_counts, # 314
  Macaque3_62_Frontal = macaque12_astrocyte_counts, # 246
  row.names = names(macaque1_astrocyte_counts)
))
# Sample annotation: one row per column of the count table, same labels for all
macaque_astrocyte_counts_metadata = local({
  sample_names = colnames(macaque_astrocyte_counts)
  n_samples = length(sample_names)
  data.frame(Species = rep("Macaque", n_samples),
             Human_NHP = rep("NHP", n_samples),
             study = rep("Mixed", n_samples),
             stage = rep("Foetal", n_samples),
             row.names = sample_names)
})
save( macaque_astrocyte_counts, macaque_astrocyte_counts_metadata,
      file=paste0(objects_directory,"macaque_astrocyte_counts.RData"))
In samples “human14”, “human7” and “human13” we find only a few astrocytes, so we remove these samples from the analysis.
# Differential expression between human and macaque foetal astrocyte
# pseudobulk samples with DESeq2. Lines starting with "##" are the console
# output recorded when this was run; each statement is now on its own line
# (several were fused behind an output line and therefore commented out).
load(paste0(objects_directory,"macaque_astrocyte_counts.RData"))
load(paste0(objects_directory,"human_astrocyte_counts.RData"))
# both tables must list the genes in the same order before they are put side by side
all(rownames(macaque_astrocyte_counts)==rownames(human_astrocyte_counts))
## [1] TRUE
stitched_counts = data.frame( macaque_astrocyte_counts, human_astrocyte_counts )
st_metadata = rbind( macaque_astrocyte_counts_metadata,human_astrocyte_counts_metadata )
# annotation rows must be in the same order as the count columns
all(colnames(stitched_counts)==rownames(st_metadata))
## [1] TRUE
# samples left out of the comparison, stated once for counts and annotation
excluded_samples = c("human14","human7","human13")
data = DESeqDataSetFromMatrix( countData = stitched_counts[,! colnames(stitched_counts) %in% excluded_samples],
                               colData = st_metadata[! rownames(st_metadata) %in% excluded_samples,],
                               design = ~ Human_NHP )
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
data =  estimateSizeFactors(data)
data = estimateDispersions(data, fitType = "local")
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# As the log below shows, DESeq() reuses the size factors but replaces the
# dispersions estimated just above.
data = DESeq(data, fitType = 'local')
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 89 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# contrast Human vs NHP: positive log2 fold change = higher in human
degs = results(data, contrast = c("Human_NHP","Human", "NHP") )
human_macaque_fetal_norm_counts = counts(data,normalized=TRUE)
human_macaque_fetal_unnorm_counts = counts(data,normalized=FALSE)
save(data,degs, 
     human_macaque_fetal_norm_counts,
     human_macaque_fetal_unnorm_counts,
     file=paste0(objects_directory,"pseudobulk_published_scRNA_Foetal.RData"))
# drop genes without an adjusted p-value, then keep those with padj < 0.1
degs = degs[! is.na(degs$padj), ]
degs_01 = degs[ degs$padj < 0.1, ]
sum( degs_01$log2FoldChange>0 )
## [1] 5212
sum( degs_01$log2FoldChange<0 )
## [1] 6219
We confirm 37% (87/238) of up-regulated genes and 35% (104/301) of down-regulated genes.
# Read the Ensembl ids of the up- and down-regulated hits, translate them to
# HGNC symbols via genemap, and keep the symbols that are present in degs.
# Lines starting with "##" are recorded console output; the statements that
# were fused behind them are restored to their own lines.
up.hits.ensids = read.delim(paste0(outputs_directory,"up_engs.txt"),as.is=TRUE, header=FALSE)
dn.hits.ensids = read.delim(paste0(outputs_directory,"dn_engs.txt"),as.is=TRUE, header=FALSE)
up.hits.geneN = unique(genemap$hgnc_symbol[genemap$ensembl_gene_id %in% up.hits.ensids$V1])
dn.hits.geneN = unique(genemap$hgnc_symbol[genemap$ensembl_gene_id %in% dn.hits.ensids$V1])
# only symbols that occur as row names of degs can be compared
up.hits.geneN_filt = up.hits.geneN[up.hits.geneN %in% rownames(degs)]
dn.hits.geneN_filt = dn.hits.geneN[dn.hits.geneN %in% rownames(degs)]
length(up.hits.geneN_filt)
## [1] 238
length(dn.hits.geneN_filt)
## [1] 301
# rows of the foetal comparison for our up / down hits
degs_us_up = degs[rownames(degs) %in% up.hits.geneN_filt,]
degs_us_dn = degs[rownames(degs) %in% dn.hits.geneN_filt,]
degs_us_up[degs_us_up$padj<0.1 & degs_us_up$log2FoldChange>0,]## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 87 rows and 6 columns
##           baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##          <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## SCNN1D     6.61796       5.252182  0.580160   9.05299 1.39105e-19 9.86892e-19
## CDK11A    35.76199       2.444782  0.184597  13.24392 4.89269e-40 9.54270e-39
## SLC35E2A  58.70908       2.730201  0.189547  14.40384 4.89453e-47 1.27041e-45
## H6PD      20.64759       1.287565  0.170613   7.54670 4.46417e-14 2.23169e-13
## DFFA      17.20583       0.636637  0.168038   3.78865 1.51468e-04 3.34767e-04
## ...            ...            ...       ...       ...         ...         ...
## ADA2       8.39128       2.439363  0.293316   8.31651 9.05931e-17 5.33378e-16
## LZTR1     25.92802       0.882482  0.168153   5.24808 1.53688e-07 4.68215e-07
## C1QTNF6    7.36232       1.673785  0.369422   4.53082 5.87541e-06 1.52153e-05
## MT-ATP8   24.80009       8.290334  0.666553  12.43761 1.63309e-35 2.54872e-34
## MT-ATP6   15.51741       7.188377  0.712925  10.08294 6.57311e-24 5.92171e-23
# down hits with padj < 0.1 and a negative fold change in the foetal comparison
# (this call was fused between two output lines and therefore commented out)
degs_us_dn[degs_us_dn$padj<0.1 & degs_us_dn$log2FoldChange<0,]
## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 104 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat               pvalue
##            <numeric>      <numeric> <numeric> <numeric>            <numeric>
## SRSF10      104.4759      -0.932483  0.197947  -4.71077 0.000002467874236896
## RCAN3        15.2877      -2.830897  0.421129  -6.72216 0.000000000017904770
## PDIK1L       14.7172      -1.528401  0.215272  -7.09987 0.000000000001248702
## SRSF4       109.6152      -0.319722  0.169547  -1.88574 0.059329607370328094
## PRPF38A      32.0105      -1.528840  0.200637  -7.61992 0.000000000000025384
## ...              ...            ...       ...       ...                  ...
## ZNF776      24.28439       -2.44103  0.202366 -12.06248          1.66682e-33
## OLIG2       23.58954       -2.73026  0.728066  -3.75002          1.76820e-04
## DONSON      10.98414       -1.23557  0.311389  -3.96792          7.25017e-05
## MT-CO3     138.76363       -4.75420  0.436041 -10.90310          1.11399e-27
## C1GALT1C1L   1.77877       -3.83977  0.630962  -6.08558          1.16069e-09
##                            padj
##                       <numeric>
## SRSF10     0.000006656184798295
## RCAN3      0.000000000075424177
## PDIK1L     0.000000000005675825
## SRSF4      0.089430349211630344
## PRPF38A    0.000000000000128673
## ...                         ...
## ZNF776              2.37697e-32
## OLIG2               3.88128e-04
## DONSON              1.66126e-04
## MT-CO3              1.20738e-26
## C1GALT1C1L          4.24191e-09
# Hits confirmed in the foetal comparison: padj < 0.1 and a fold change in the
# same direction as in our hit list. (The conf_up assignment was fused behind
# the output row above and therefore commented out.)
conf_up = rownames(degs_us_up[degs_us_up$padj<0.1 & degs_us_up$log2FoldChange>0,])
conf_down = rownames(degs_us_dn[degs_us_dn$padj<0.1 & degs_us_dn$log2FoldChange<0,])
# fraction of testable hits that is confirmed, separately for up and down
barplot( c(length(conf_up)/length(up.hits.geneN_filt),
           length(conf_down)/length(dn.hits.geneN_filt)),
         col=c("green4","wheat3"), ylim=c(0,0.5),ylab="Fraction",
         names=c("Up","Down"),xlab="EAGs")
axis(2,lwd=1)
Boxplots of chosen genes
sa=st_metadata[! rownames(st_metadata) %in% c("human14","human7","human13"),]
# Draw boxplots of one gene's values, one box for the human samples and one
# for the macaque samples.
#
# ct   : matrix with gene names as row names and one column per sample
# gene : row name of the gene to plot; also used as the plot title
# sa   : sample annotation with a Species column ("Human" / "Macaque");
#        row i of sa must describe column i of ct
# cols : border colours of the boxes, in the order Human, Macaque
#
# Returns the value of boxplot() (invisibly).
plotAGene = function( ct, gene, sa, cols ){
  # ct = human_macaque_fetal_norm_counts; gene="CTCF"
  # sa = st_metadata[! rownames(st_metadata) %in% c("human14","human7","human13"),]
  # cols = c("black","blue")
  # an unknown gene would otherwise end in an obscure plotting error
  if( !(gene %in% rownames(ct)) ){
    stop("gene '", gene, "' not found in rownames(ct)", call. = FALSE)
  }
  # split() recycles a grouping vector of the wrong length, which would
  # silently put samples into the wrong species
  if( ncol(ct) != nrow(sa) ){
    stop("ct has ", ncol(ct), " columns but sa has ", nrow(sa), " rows", call. = FALSE)
  }
  x = split( ct[rownames(ct)==gene,], sa$Species )[c("Human","Macaque")]
  boxplot(x,border=cols,main=gene,col="white")
}
# Look up and plot two chosen genes, then read the Kanton et al. human DE
# table. Lines starting with "##" are recorded console output; the statements
# that were fused behind them (and so commented out) are back on their own lines.
degs["CTCF",]
## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 1 row and 6 columns
##       baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##      <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## CTCF   73.2475      -0.458664  0.186933  -2.45363 0.0141422 0.0240199
# NOTE(review): the P value noted on the next line differs from the table
# above (pvalue 0.0141422, padj 0.0240199) - probably from an earlier run
plotAGene( human_macaque_fetal_norm_counts, "CTCF", sa, c("black","blue")) # P=0.0147813 
degs["TEAD3",]
## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 1 row and 6 columns
##        baseMean log2FoldChange     lfcSE      stat          pvalue
##       <numeric>      <numeric> <numeric> <numeric>       <numeric>
## TEAD3   11.8658        1.36589   0.24607   5.55085 0.0000000284291
##                  padj
##             <numeric>
## TEAD3 0.0000000923959
# NOTE(review): the P value noted on the next line differs from the table
# above (pvalue 2.84291e-08, padj 9.23959e-08) - probably from an earlier run
plotAGene( human_macaque_fetal_norm_counts, "TEAD3", sa, c("black","blue")) # P=3.20079e-08 
# Kanton table: a gene counts as "up" when its average expression is higher in
# human than in chimp, and as "down" when it is lower; only the symbols are kept
kanton_hits = read.delim(paste0(outputs_directory,"Supplementary_Table_15_human_DE.txt"))
kanton_hits_up = kanton_hits[kanton_hits$Average.expression..human.>kanton_hits$Average.expression..chimp.,]
kanton_hits_dn = kanton_hits[kanton_hits$Average.expression..human.<kanton_hits$Average.expression..chimp.,]
kanton_hits_up = kanton_hits_up$Symbol
# The next two statements were fused into one line, which R parses as a
# chained assignment to a column named "Symbolhs_pt".
kanton_hits_dn = kanton_hits_dn$Symbol
hs_pt = read.delim( paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_chimp_sig_genes.txt'),sep=",")
# Jorstad astrocyte tables: human vs gorilla (hs_pp), rhesus (hs_rm) and
# marmoset (hs_cj); the human-vs-chimp table hs_pt is read before this block.
hs_pp = read.delim( file=paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_gorilla_sig_genes.txt'), sep="," )
hs_rm = read.delim( file=paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_rhesus_sig_genes.txt'), sep="," )
hs_cj = read.delim( file=paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_marmoset_sig_genes.txt'), sep="," )

# keep only the rows that have an adjusted p-value
drop_missing_padj = function(tab) tab[!is.na(tab$padj),]
hs_pt = drop_missing_padj(hs_pt)
hs_pp = drop_missing_padj(hs_pp)
hs_rm = drop_missing_padj(hs_rm)
hs_cj = drop_missing_padj(hs_cj)

# identify downregulated and up regulated genes:
# padj < 0.1 and a log2 fold change beyond -thr (down) or thr (up)
thr=0
genes_down = function(tab) tab$gene[tab$log2FoldChange<(-1*thr) & tab$padj<0.1]
genes_up = function(tab) tab$gene[tab$log2FoldChange>thr & tab$padj<0.1]

# suffix 1 = chimp, 2 = gorilla, 3 = rhesus, 4 = marmoset
down1 = genes_down(hs_pt)
up1 = genes_up(hs_pt)
down2 = genes_down(hs_pp)
up2 = genes_up(hs_pp)
down3 = genes_down(hs_rm)
up3 = genes_up(hs_rm)
down4 = genes_down(hs_cj)
up4 = genes_up(hs_cj)

# elements of `a` that also occur in `b`, in the order (and multiplicity) of `a`
also_in = function(a, b) a[a %in% b]

# genes changed in the same direction in the first two, three or all four comparisons
down12 = also_in(down1, down2)
up12 = also_in(up1, up2)
down123 = also_in(down1, also_in(down2, down3))
down1234 = also_in(down1, also_in(down2, also_in(down3, down4)))
up1234 = also_in(up1, also_in(up2, also_in(up3, up4)))
up123=up1[up1 %in% up2[up2 %in% up3]]
These are the tables I obtained directly from Shaojie Ma.
# Loads the Wilcoxon DEG results (the code below uses the object deg_species)
# and keeps the astrocyte rows with adjusted p-value < 0.01.
load(paste0(outputs_directory,'/Wilcox_DEG_results_raw.Rdata'))
deg_species_filt = deg_species[deg_species$cluster=="Astro" & deg_species$p_val_adj<0.01,]

# rows comparing Human (species1) against Chimpanzee or Rhesus (species2)
human_vs_chimp_or_rhesus = function(tab){
  tab[tab$species1=="Human" & tab$species2 %in% c("Chimpanzee","Rhesus"),]
}
# gene names that occur in more than one row
# (presumably once per species comparison - TODO confirm)
genes_seen_more_than_once = function(genes){
  rows_per_gene = table(genes)
  names(rows_per_gene[rows_per_gene>1])
}

#############################
# up: log2(ratio_fc) > 0
deg_species_filt_astro_hs = human_vs_chimp_or_rhesus( deg_species_filt[log2(deg_species_filt$ratio_fc)>0,] )
up_genes = genes_seen_more_than_once(deg_species_filt_astro_hs$gene)
#############################
# down: log2(ratio_fc) < 0
deg_species_filt_astro_hs = human_vs_chimp_or_rhesus( deg_species_filt[log2(deg_species_filt$ratio_fc)<(0),] )
dn_genes = genes_seen_more_than_once(deg_species_filt_astro_hs$gene)
# Sanity checks on the Ma et al. gene lists, then an overlap table of all
# up-regulated gene lists. Lines starting with "##" are recorded console
# output; the statements fused behind them are back on their own lines.
any( up_genes %in% dn_genes )
## [1] FALSE
length(up_genes)
## [1] 1429
length(dn_genes)
## [1] 1123
# union of all up-regulated gene lists
all_up_all = unique( c(kanton_hits_up,up123,up_genes,up.hits.geneN,conf_up))
# one row per gene, one logical column per list stating whether the gene is in it
all_up_all = data.frame( Kanton = all_up_all %in% kanton_hits_up,
                         Jorstad = all_up_all %in% up123,
                         Ma = all_up_all %in% up_genes,
                         Foetal = all_up_all %in% conf_up,
                         Ciuba = all_up_all %in% up.hits.geneN,
                         row.names = all_up_all)
# keep genes of up.hits.geneN (column Ciuba) that occur in at least one of the
# other four lists; the columns are selected by name rather than by position
# so that reordering the columns above cannot change the result
all_up_all = all_up_all[all_up_all$Ciuba & rowSums(all_up_all[,c("Kanton","Jorstad","Ma","Foetal")])>0,]
all_up_all##               Kanton Jorstad    Ma Foetal Ciuba
## NBPF11          TRUE   FALSE FALSE   TRUE  TRUE
## NBPF14          TRUE   FALSE FALSE   TRUE  TRUE
## PABPC1L         TRUE   FALSE FALSE   TRUE  TRUE
## PALLD           TRUE    TRUE  TRUE   TRUE  TRUE
## PCAT6           TRUE   FALSE FALSE  FALSE  TRUE
## PIGZ            TRUE    TRUE FALSE   TRUE  TRUE
## SCNN1D          TRUE   FALSE FALSE   TRUE  TRUE
## SCRG1           TRUE    TRUE  TRUE  FALSE  TRUE
## THBS4           TRUE   FALSE  TRUE   TRUE  TRUE
## PAGR1          FALSE    TRUE FALSE   TRUE  TRUE
## STK33          FALSE    TRUE  TRUE   TRUE  TRUE
## VKORC1         FALSE    TRUE FALSE  FALSE  TRUE
## AQP1           FALSE    TRUE  TRUE  FALSE  TRUE
## MTCH1          FALSE    TRUE  TRUE  FALSE  TRUE
## PRDX6          FALSE    TRUE  TRUE  FALSE  TRUE
## RMDN1          FALSE    TRUE FALSE  FALSE  TRUE
## RANGRF         FALSE    TRUE FALSE  FALSE  TRUE
## GUK1           FALSE    TRUE  TRUE  FALSE  TRUE
## ATP6V1E2       FALSE    TRUE  TRUE   TRUE  TRUE
## S100A13        FALSE    TRUE FALSE  FALSE  TRUE
## FAM228B        FALSE    TRUE FALSE  FALSE  TRUE
## LIN7A          FALSE    TRUE FALSE   TRUE  TRUE
## ACACA          FALSE   FALSE  TRUE   TRUE  TRUE
## BAIAP3         FALSE   FALSE  TRUE   TRUE  TRUE
## C1orf54        FALSE   FALSE  TRUE   TRUE  TRUE
## C1QTNF6        FALSE   FALSE  TRUE   TRUE  TRUE
## C22orf46       FALSE   FALSE  TRUE  FALSE  TRUE
## CPS1           FALSE   FALSE  TRUE  FALSE  TRUE
## DGCR6L         FALSE   FALSE  TRUE  FALSE  TRUE
## EFHD1          FALSE   FALSE  TRUE   TRUE  TRUE
## GFPT2          FALSE   FALSE  TRUE   TRUE  TRUE
## GTF3C5         FALSE   FALSE  TRUE   TRUE  TRUE
## HSPB1          FALSE   FALSE  TRUE  FALSE  TRUE
## MLH1           FALSE   FALSE  TRUE   TRUE  TRUE
## MMP19          FALSE   FALSE  TRUE  FALSE  TRUE
## MOV10          FALSE   FALSE  TRUE   TRUE  TRUE
## NDUFV1         FALSE   FALSE  TRUE  FALSE  TRUE
## NR1H3          FALSE   FALSE  TRUE   TRUE  TRUE
## PDLIM7         FALSE   FALSE  TRUE  FALSE  TRUE
## RHOBTB3        FALSE   FALSE  TRUE  FALSE  TRUE
## SIRT3          FALSE   FALSE  TRUE   TRUE  TRUE
## STYXL1         FALSE   FALSE  TRUE   TRUE  TRUE
## TCF25          FALSE   FALSE  TRUE   TRUE  TRUE
## TCTN3          FALSE   FALSE  TRUE   TRUE  TRUE
## TMEM9B-AS1     FALSE   FALSE  TRUE  FALSE  TRUE
## TRIP6          FALSE   FALSE  TRUE  FALSE  TRUE
## TSR3           FALSE   FALSE  TRUE  FALSE  TRUE
## VIM            FALSE   FALSE  TRUE  FALSE  TRUE
## ZNF266         FALSE   FALSE  TRUE   TRUE  TRUE
## ZNHIT3         FALSE   FALSE  TRUE  FALSE  TRUE
## CDK11A         FALSE   FALSE FALSE   TRUE  TRUE
## H6PD           FALSE   FALSE FALSE   TRUE  TRUE
## DFFA           FALSE   FALSE FALSE   TRUE  TRUE
## PAQR7          FALSE   FALSE FALSE   TRUE  TRUE
## SRGAP2B        FALSE   FALSE FALSE   TRUE  TRUE
## HHLA3          FALSE   FALSE FALSE   TRUE  TRUE
## SLC35E2A       FALSE   FALSE FALSE   TRUE  TRUE
## NBPF1          FALSE   FALSE FALSE   TRUE  TRUE
## NBPF15         FALSE   FALSE FALSE   TRUE  TRUE
## NBPF9          FALSE   FALSE FALSE   TRUE  TRUE
## NBPF19         FALSE   FALSE FALSE   TRUE  TRUE
## NBPF26         FALSE   FALSE FALSE   TRUE  TRUE
## ACOX3          FALSE   FALSE FALSE   TRUE  TRUE
## CBR4           FALSE   FALSE FALSE   TRUE  TRUE
## TMEM129        FALSE   FALSE FALSE   TRUE  TRUE
## SULT1C4        FALSE   FALSE FALSE   TRUE  TRUE
## TEAD3          FALSE   FALSE FALSE   TRUE  TRUE
## MAN2B2         FALSE   FALSE FALSE   TRUE  TRUE
## RIPK1          FALSE   FALSE FALSE   TRUE  TRUE
## SRD5A1         FALSE   FALSE FALSE   TRUE  TRUE
## WDR27          FALSE   FALSE FALSE   TRUE  TRUE
## INSYN2B        FALSE   FALSE FALSE   TRUE  TRUE
## C1QTNF3-AMACR  FALSE   FALSE FALSE   TRUE  TRUE
## ABCB4          FALSE   FALSE FALSE   TRUE  TRUE
## CCND3          FALSE   FALSE FALSE   TRUE  TRUE
## MAPKAP1        FALSE   FALSE FALSE   TRUE  TRUE
## NUP43          FALSE   FALSE FALSE   TRUE  TRUE
## PDE1C          FALSE   FALSE FALSE   TRUE  TRUE
## POLR2J3        FALSE   FALSE FALSE   TRUE  TRUE
## ADAM9          FALSE   FALSE FALSE   TRUE  TRUE
## EPHB4          FALSE   FALSE FALSE   TRUE  TRUE
## COL27A1        FALSE   FALSE FALSE   TRUE  TRUE
## SPDYE3         FALSE   FALSE FALSE   TRUE  TRUE
## OSBPL5         FALSE   FALSE FALSE   TRUE  TRUE
## CUBN           FALSE   FALSE FALSE   TRUE  TRUE
## ELMOD1         FALSE   FALSE FALSE   TRUE  TRUE
## SHLD2          FALSE   FALSE FALSE   TRUE  TRUE
## EML3           FALSE   FALSE FALSE   TRUE  TRUE
## TIMM23B-AGAP6  FALSE   FALSE FALSE   TRUE  TRUE
## AGAP4          FALSE   FALSE FALSE   TRUE  TRUE
## FAM111B        FALSE   FALSE FALSE   TRUE  TRUE
## TIMM23B        FALSE   FALSE FALSE   TRUE  TRUE
## AGAP9          FALSE   FALSE FALSE   TRUE  TRUE
## DGKA           FALSE   FALSE FALSE   TRUE  TRUE
## DHRS12         FALSE   FALSE FALSE   TRUE  TRUE
## RFLNA          FALSE   FALSE FALSE   TRUE  TRUE
## LTB4R          FALSE   FALSE FALSE   TRUE  TRUE
## LPCAT2         FALSE   FALSE FALSE   TRUE  TRUE
## CNTNAP1        FALSE   FALSE FALSE   TRUE  TRUE
## ADCY9          FALSE   FALSE FALSE   TRUE  TRUE
## SLCO3A1        FALSE   FALSE FALSE   TRUE  TRUE
## NPIPA1         FALSE   FALSE FALSE   TRUE  TRUE
## ADA2           FALSE   FALSE FALSE   TRUE  TRUE
## MAN2B1         FALSE   FALSE FALSE   TRUE  TRUE
## CARD8          FALSE   FALSE FALSE   TRUE  TRUE
## SLC66A2        FALSE   FALSE FALSE   TRUE  TRUE
## ZNF486         FALSE   FALSE FALSE   TRUE  TRUE
## GYG2           FALSE   FALSE FALSE   TRUE  TRUE
## LZTR1          FALSE   FALSE FALSE   TRUE  TRUE
## MT-ATP6        FALSE   FALSE FALSE   TRUE  TRUE
## MT-ATP8        FALSE   FALSE FALSE   TRUE  TRUE

# Build a gene-by-source membership table for the five "dn" gene lists
# (presumably down-regulated hits; confirm against where the lists are built).
# Step 1: union of all five lists, de-duplicated, in order of first appearance.
all_dn_all = unique(c(kanton_hits_dn, down123, dn_genes, dn.hits.geneN, conf_down))

# Step 2: one logical column per source, one row per gene in the union.
# The name all_dn_all is reused: it is a character vector on the right-hand
# side and becomes the data frame once the assignment completes.
all_dn_all = data.frame(Kanton  = all_dn_all %in% kanton_hits_dn,
                        Jorstad = all_dn_all %in% down123,
                        Ma      = all_dn_all %in% dn_genes,
                        foetal  = all_dn_all %in% conf_down,
                        Ciuba   = all_dn_all %in% dn.hits.geneN,
                        row.names = all_dn_all)

# Step 3: keep genes flagged in Ciuba AND in at least one of the other four
# sources. Columns are selected by name rather than by position (1:4) so the
# filter stays correct if the column order above is ever changed.
other_sources = c("Kanton", "Jorstad", "Ma", "foetal")
all_dn_all = all_dn_all[all_dn_all$Ciuba & rowSums(all_dn_all[, other_sources]) > 0, ]

# Print the resulting overlap table.
all_dn_all
##            Kanton Jorstad    Ma foetal Ciuba
## CELF4        TRUE   FALSE FALSE  FALSE  TRUE
## FGF13        TRUE   FALSE FALSE  FALSE  TRUE
## SYN1         TRUE   FALSE  TRUE   TRUE  TRUE
## PDZRN4      FALSE    TRUE  TRUE   TRUE  TRUE
## PLCL2       FALSE    TRUE  TRUE  FALSE  TRUE
## SRSF4       FALSE    TRUE FALSE   TRUE  TRUE
## PBLD        FALSE    TRUE  TRUE   TRUE  TRUE
## GABPB1      FALSE    TRUE  TRUE  FALSE  TRUE
## UNC5D       FALSE    TRUE  TRUE   TRUE  TRUE
## RAPGEF5     FALSE    TRUE FALSE  FALSE  TRUE
## DCC         FALSE    TRUE  TRUE  FALSE  TRUE
## ATP8A2      FALSE    TRUE  TRUE  FALSE  TRUE
## PANK3       FALSE    TRUE FALSE   TRUE  TRUE
## RCAN3       FALSE    TRUE  TRUE   TRUE  TRUE
## MAP3K2      FALSE    TRUE FALSE  FALSE  TRUE
## NUDT4       FALSE    TRUE FALSE   TRUE  TRUE
## RND3        FALSE    TRUE  TRUE   TRUE  TRUE
## SPAST       FALSE    TRUE  TRUE  FALSE  TRUE
## FBXO11      FALSE    TRUE FALSE  FALSE  TRUE
## ACIN1       FALSE   FALSE  TRUE  FALSE  TRUE
## AHCTF1      FALSE   FALSE  TRUE  FALSE  TRUE
## ATAD2B      FALSE   FALSE  TRUE  FALSE  TRUE
## CECR2       FALSE   FALSE  TRUE  FALSE  TRUE
## CEP104      FALSE   FALSE  TRUE  FALSE  TRUE
## CREBRF      FALSE   FALSE  TRUE  FALSE  TRUE
## CSPP1       FALSE   FALSE  TRUE  FALSE  TRUE
## DONSON      FALSE   FALSE  TRUE   TRUE  TRUE
## DYRK2       FALSE   FALSE  TRUE   TRUE  TRUE
## EED         FALSE   FALSE  TRUE   TRUE  TRUE
## EFL1        FALSE   FALSE  TRUE   TRUE  TRUE
## ERCC6L2     FALSE   FALSE  TRUE  FALSE  TRUE
## FBXW7       FALSE   FALSE  TRUE  FALSE  TRUE
## GRK4        FALSE   FALSE  TRUE  FALSE  TRUE
## INA         FALSE   FALSE  TRUE   TRUE  TRUE
## INSR        FALSE   FALSE  TRUE  FALSE  TRUE
## KAT6A       FALSE   FALSE  TRUE  FALSE  TRUE
## KLHL24      FALSE   FALSE  TRUE  FALSE  TRUE
## MBTD1       FALSE   FALSE  TRUE  FALSE  TRUE
## MIB1        FALSE   FALSE  TRUE  FALSE  TRUE
## MLLT10      FALSE   FALSE  TRUE  FALSE  TRUE
## PGBD2       FALSE   FALSE  TRUE  FALSE  TRUE
## POLR1B      FALSE   FALSE  TRUE   TRUE  TRUE
## PPM1A       FALSE   FALSE  TRUE   TRUE  TRUE
## PPP4R3B     FALSE   FALSE  TRUE   TRUE  TRUE
## PTPN4       FALSE   FALSE  TRUE  FALSE  TRUE
## RAB3A       FALSE   FALSE  TRUE   TRUE  TRUE
## RPRD2       FALSE   FALSE  TRUE  FALSE  TRUE
## STRN3       FALSE   FALSE  TRUE  FALSE  TRUE
## STXBP1      FALSE   FALSE  TRUE  FALSE  TRUE
## SYT16       FALSE   FALSE  TRUE  FALSE  TRUE
## TERF1       FALSE   FALSE  TRUE   TRUE  TRUE
## TFDP2       FALSE   FALSE  TRUE  FALSE  TRUE
## TRIM2       FALSE   FALSE  TRUE   TRUE  TRUE
## TRIM23      FALSE   FALSE  TRUE   TRUE  TRUE
## TTC33       FALSE   FALSE  TRUE   TRUE  TRUE
## TUBB4A      FALSE   FALSE  TRUE   TRUE  TRUE
## UBN2        FALSE   FALSE  TRUE  FALSE  TRUE
## ZNF148      FALSE   FALSE  TRUE  FALSE  TRUE
## ZNF595      FALSE   FALSE  TRUE  FALSE  TRUE
## ZRANB3      FALSE   FALSE  TRUE  FALSE  TRUE
## NUP133      FALSE   FALSE FALSE   TRUE  TRUE
## RSBN1       FALSE   FALSE FALSE   TRUE  TRUE
## CCDC181     FALSE   FALSE FALSE   TRUE  TRUE
## PRPF38A     FALSE   FALSE FALSE   TRUE  TRUE
## ETAA1       FALSE   FALSE FALSE   TRUE  TRUE
## PDIK1L      FALSE   FALSE FALSE   TRUE  TRUE
## SOX11       FALSE   FALSE FALSE   TRUE  TRUE
## AIDA        FALSE   FALSE FALSE   TRUE  TRUE
## SRSF10      FALSE   FALSE FALSE   TRUE  TRUE
## GDAP2       FALSE   FALSE FALSE   TRUE  TRUE
## ARL6        FALSE   FALSE FALSE   TRUE  TRUE
## PHOSPHO2    FALSE   FALSE FALSE   TRUE  TRUE
## SMARCA5     FALSE   FALSE FALSE   TRUE  TRUE
## KCNH7       FALSE   FALSE FALSE   TRUE  TRUE
## C1GALT1C1L  FALSE   FALSE FALSE   TRUE  TRUE
## KIF2A       FALSE   FALSE FALSE   TRUE  TRUE
## CDC5L       FALSE   FALSE FALSE   TRUE  TRUE
## PRPF4B      FALSE   FALSE FALSE   TRUE  TRUE
## CLK4        FALSE   FALSE FALSE   TRUE  TRUE
## OARD1       FALSE   FALSE FALSE   TRUE  TRUE
## KIF3A       FALSE   FALSE FALSE   TRUE  TRUE
## CEP162      FALSE   FALSE FALSE   TRUE  TRUE
## EIF4E       FALSE   FALSE FALSE   TRUE  TRUE
## ZUP1        FALSE   FALSE FALSE   TRUE  TRUE
## ZCCHC10     FALSE   FALSE FALSE   TRUE  TRUE
## PGM2        FALSE   FALSE FALSE   TRUE  TRUE
## HDAC2       FALSE   FALSE FALSE   TRUE  TRUE
## ZKSCAN8     FALSE   FALSE FALSE   TRUE  TRUE
## BRD2        FALSE   FALSE FALSE   TRUE  TRUE
## CFAP69      FALSE   FALSE FALSE   TRUE  TRUE
## CBLL1       FALSE   FALSE FALSE   TRUE  TRUE
## RANBP6      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF92       FALSE   FALSE FALSE   TRUE  TRUE
## C9orf72     FALSE   FALSE FALSE   TRUE  TRUE
## TMEM196     FALSE   FALSE FALSE   TRUE  TRUE
## ZBTB10      FALSE   FALSE FALSE   TRUE  TRUE
## UBXN2B      FALSE   FALSE FALSE   TRUE  TRUE
## RPAP3       FALSE   FALSE FALSE   TRUE  TRUE
## FAM76B      FALSE   FALSE FALSE   TRUE  TRUE
## FOLH1       FALSE   FALSE FALSE   TRUE  TRUE
## IKZF5       FALSE   FALSE FALSE   TRUE  TRUE
## SMC3        FALSE   FALSE FALSE   TRUE  TRUE
## KMT5B       FALSE   FALSE FALSE   TRUE  TRUE
## DPF2        FALSE   FALSE FALSE   TRUE  TRUE
## LIN7C       FALSE   FALSE FALSE   TRUE  TRUE
## DCDC1       FALSE   FALSE FALSE   TRUE  TRUE
## GVQW3       FALSE   FALSE FALSE   TRUE  TRUE
## HSPA14      FALSE   FALSE FALSE   TRUE  TRUE
## C10orf143   FALSE   FALSE FALSE   TRUE  TRUE
## YAF2        FALSE   FALSE FALSE   TRUE  TRUE
## PKP2        FALSE   FALSE FALSE   TRUE  TRUE
## ATP2B1      FALSE   FALSE FALSE   TRUE  TRUE
## VCPKMT      FALSE   FALSE FALSE   TRUE  TRUE
## CAND1       FALSE   FALSE FALSE   TRUE  TRUE
## ZC2HC1C     FALSE   FALSE FALSE   TRUE  TRUE
## RBM26       FALSE   FALSE FALSE   TRUE  TRUE
## THTPA       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF200      FALSE   FALSE FALSE   TRUE  TRUE
## CTCF        FALSE   FALSE FALSE   TRUE  TRUE
## AKTIP       FALSE   FALSE FALSE   TRUE  TRUE
## NRG4        FALSE   FALSE FALSE   TRUE  TRUE
## ADAP2       FALSE   FALSE FALSE   TRUE  TRUE
## DLL3        FALSE   FALSE FALSE   TRUE  TRUE
## ZNF175      FALSE   FALSE FALSE   TRUE  TRUE
## APOE        FALSE   FALSE FALSE   TRUE  TRUE
## OSBPL2      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF304      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF776      FALSE   FALSE FALSE   TRUE  TRUE
## EID2B       FALSE   FALSE FALSE   TRUE  TRUE
## MEX3C       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF17       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF600      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF181      FALSE   FALSE FALSE   TRUE  TRUE
## PEG3        FALSE   FALSE FALSE   TRUE  TRUE
## OLIG2       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF134      FALSE   FALSE FALSE   TRUE  TRUE
## RBMX        FALSE   FALSE FALSE   TRUE  TRUE
## PHF6        FALSE   FALSE FALSE   TRUE  TRUE
## MT-CO3      FALSE   FALSE FALSE   TRUE  TRUE
## PGAM4       FALSE   FALSE FALSE   TRUE  TRUE
## RTL5        FALSE   FALSE FALSE   TRUE  TRUE

# Report the R version, platform, locale and attached/loaded package versions
# for this run, so the analysis environment is documented with the results.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] pl_PL.UTF-8/pl_PL.UTF-8/pl_PL.UTF-8/C/pl_PL.UTF-8/pl_PL.UTF-8
## 
## attached base packages:
##  [1] tools     grid      stats4    parallel  stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0                            
##  [2] readr_2.1.4                             
##  [3] SeuratObject_4.1.3                      
##  [4] Seurat_4.3.0                            
##  [5] ggVennDiagram_1.2.2                     
##  [6] rBLAST_0.99.2                           
##  [7] Rsubread_2.6.4                          
##  [8] BSgenome.Ptroglodytes.UCSC.panTro6_1.4.2
##  [9] beeswarm_0.4.0                          
## [10] VennDiagram_1.7.3                       
## [11] futile.logger_1.4.3                     
## [12] scuttle_1.2.1                           
## [13] SingleCellExperiment_1.14.1             
## [14] forcats_1.0.0                           
## [15] RColorBrewer_1.1-3                      
## [16] glmGamPoi_1.4.0                         
## [17] reshape2_1.4.4                          
## [18] kableExtra_1.3.4                        
## [19] plotly_4.10.1                           
## [20] dplyr_1.1.2                             
## [21] ggrepel_0.9.3                           
## [22] data.table_1.14.8                       
## [23] pheatmap_1.0.12                         
## [24] LSD_4.1-0                               
## [25] BSgenome.Hsapiens.UCSC.hg38_1.4.3       
## [26] BSgenome_1.60.0                         
## [27] colorspace_2.1-0                        
## [28] rtracklayer_1.52.1                      
## [29] Rsamtools_2.8.0                         
## [30] Biostrings_2.60.2                       
## [31] XVector_0.32.0                          
## [32] GenomicFeatures_1.44.2                  
## [33] biomaRt_2.48.3                          
## [34] Gviz_1.36.2                             
## [35] st_1.2.7                                
## [36] sda_1.3.8                               
## [37] fdrtool_1.2.17                          
## [38] corpcor_1.6.10                          
## [39] entropy_1.3.1                           
## [40] smoothmest_0.1-3                        
## [41] MASS_7.3-58.3                           
## [42] genefilter_1.74.1                       
## [43] edgeR_3.34.1                            
## [44] limma_3.48.3                            
## [45] DESeq2_1.32.0                           
## [46] SummarizedExperiment_1.22.0             
## [47] MatrixGenerics_1.4.3                    
## [48] matrixStats_0.63.0                      
## [49] GenomicRanges_1.44.0                    
## [50] GenomeInfoDb_1.28.4                     
## [51] geneplotter_1.70.0                      
## [52] annotate_1.70.0                         
## [53] XML_3.99-0.14                           
## [54] AnnotationDbi_1.54.1                    
## [55] IRanges_2.26.0                          
## [56] S4Vectors_0.30.2                        
## [57] lattice_0.21-8                          
## [58] locfit_1.5-9.7                          
## [59] Biobase_2.52.0                          
## [60] BiocGenerics_0.38.0                     
## [61] plyr_1.8.8                              
## [62] ggplot2_3.4.2                           
## [63] Matrix_1.5-4                            
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3            scattermore_0.8          
##   [3] tidyr_1.3.0               bit64_4.0.5              
##   [5] knitr_1.42                irlba_2.3.5.1            
##   [7] DelayedArray_0.18.0       rpart_4.1.19             
##   [9] KEGGREST_1.32.0           RCurl_1.98-1.12          
##  [11] AnnotationFilter_1.16.0   generics_0.1.3           
##  [13] cowplot_1.1.1             lambda.r_1.2.4           
##  [15] RSQLite_2.3.1             RANN_2.6.1               
##  [17] proxy_0.4-27              future_1.32.0            
##  [19] tzdb_0.3.0                bit_4.0.5                
##  [21] spatstat.data_3.0-1       webshot_0.5.4            
##  [23] xml2_1.3.3                httpuv_1.6.9             
##  [25] xfun_0.38                 hms_1.1.3                
##  [27] jquerylib_0.1.4           evaluate_0.20            
##  [29] promises_1.2.0.1          fansi_1.0.4              
##  [31] restfulr_0.0.15           progress_1.2.2           
##  [33] dbplyr_2.3.2              igraph_1.4.2             
##  [35] DBI_1.1.3                 htmlwidgets_1.6.2        
##  [37] spatstat.geom_3.1-0       purrr_1.0.1              
##  [39] ellipsis_0.3.2            backports_1.4.1          
##  [41] deldir_1.0-6              sparseMatrixStats_1.4.2  
##  [43] vctrs_0.6.1               ensembldb_2.16.4         
##  [45] ROCR_1.0-11               abind_1.4-5              
##  [47] cachem_1.0.7              withr_2.5.0              
##  [49] RVenn_1.1.0               progressr_0.13.0         
##  [51] checkmate_2.1.0           sctransform_0.3.5        
##  [53] GenomicAlignments_1.28.0  prettyunits_1.1.1        
##  [55] goftest_1.2-3             svglite_2.1.1            
##  [57] cluster_2.1.4             lazyeval_0.2.2           
##  [59] crayon_1.5.2              spatstat.explore_3.1-0   
##  [61] units_0.8-1               labeling_0.4.2           
##  [63] pkgconfig_2.0.3           nlme_3.1-162             
##  [65] ProtGenerics_1.24.0       nnet_7.3-18              
##  [67] rlang_1.1.0               globals_0.16.2           
##  [69] lifecycle_1.0.3           miniUI_0.1.1.1           
##  [71] filelock_1.0.2            BiocFileCache_2.0.0      
##  [73] dichromat_2.0-0.1         invgamma_1.1             
##  [75] polyclip_1.10-4           lmtest_0.9-40            
##  [77] ashr_2.2-54               carData_3.0-5            
##  [79] zoo_1.8-11                base64enc_0.1-3          
##  [81] ggridges_0.5.4            png_0.1-8                
##  [83] viridisLite_0.4.1         rjson_0.2.21             
##  [85] bitops_1.0-7              KernSmooth_2.23-20       
##  [87] blob_1.2.4                DelayedMatrixStats_1.14.3
##  [89] classInt_0.4-9            mixsqp_0.3-48            
##  [91] SQUAREM_2021.1            stringr_1.5.0            
##  [93] spatstat.random_3.1-4     parallelly_1.35.0        
##  [95] rstatix_0.7.2             jpeg_0.1-10              
##  [97] ggsignif_0.6.4            beachmat_2.8.1           
##  [99] scales_1.2.1              memoise_2.0.1            
## [101] magrittr_2.0.3            ica_1.0-3                
## [103] zlibbioc_1.38.0           compiler_4.1.0           
## [105] BiocIO_1.2.0              fitdistrplus_1.1-8       
## [107] cli_3.6.1                 listenv_0.9.0            
## [109] patchwork_1.1.2           pbapply_1.7-0            
## [111] htmlTable_2.4.1           formatR_1.14             
## [113] Formula_1.2-5             tidyselect_1.2.0         
## [115] stringi_1.7.12            highr_0.10               
## [117] yaml_2.3.7                latticeExtra_0.6-30      
## [119] sass_0.4.5                VariantAnnotation_1.38.0 
## [121] future.apply_1.10.0       rstudioapi_0.14          
## [123] foreign_0.8-84            gridExtra_2.3            
## [125] farver_2.1.1              Rtsne_0.16               
## [127] digest_0.6.31             shiny_1.7.4              
## [129] Rcpp_1.0.10               car_3.1-2                
## [131] broom_1.0.4               later_1.3.0              
## [133] RcppAnnoy_0.0.20          httr_1.4.5               
## [135] biovizBase_1.40.0         sf_1.0-12                
## [137] tensor_1.5                rvest_1.0.3              
## [139] reticulate_1.28           truncnorm_1.0-9          
## [141] splines_4.1.0             uwot_0.1.14              
## [143] spatstat.utils_3.0-2      sp_1.6-0                 
## [145] systemfonts_1.0.4         xtable_1.8-4             
## [147] jsonlite_1.8.4            futile.options_1.0.1     
## [149] R6_2.5.1                  Hmisc_5.0-1              
## [151] pillar_1.9.0              htmltools_0.5.5          
## [153] mime_0.12                 glue_1.6.2               
## [155] fastmap_1.1.1             BiocParallel_1.26.2      
## [157] class_7.3-21              codetools_0.2-19         
## [159] utf8_1.2.3                spatstat.sparse_3.0-1    
## [161] bslib_0.4.2               tibble_3.2.1             
## [163] curl_5.0.0                leiden_0.4.3             
## [165] interp_1.1-4              survival_3.5-5           
## [167] rmarkdown_2.21            munsell_0.5.0            
## [169] e1071_1.7-13              GenomeInfoDbData_1.2.6   
## [171] gtable_0.3.3