Libraries, functions, paths
Transcriptome - from validation to evolution
- Zhang data - expressed and not genes
Differential Gene Expression Analysis
Differentiation score
- Heatmap for figure 1 and differentiation score
Evolution of astrocyte transcriptomes
Regulome analysis - chromatin structure
Regulome analysis - differential openess
ATAC - preparations
DEseq analysis of ATAC seq peak openess
Sequence analysis of the enhancer classes
scRNA-seq data
Session Info

Libraries, functions, paths

Transcriptome - from validation to evolution

Gene mapping as of February 2022.

## Query the Ensembl BioMart human gene dataset for the annotation of every gene id
## in `human_count$Geneid`, then cache the result on disk.
ensembl_hg38 = useEnsembl(biomart="ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", mirror="useast")
genemap = getBM( attributes = c("ensembl_gene_id","chromosome_name","start_position",
                                "end_position","transcript_start","transcript_end",
                                "transcript_length","strand","percentage_gene_gc_content",
                                "transcription_start_site","external_gene_name",
                                "go_id","gene_biotype","hgnc_symbol","arrayexpress"),
                 filters = "ensembl_gene_id",
                 values = human_count$Geneid,
                 mart = ensembl_hg38)
## The annotation table is called `genemap` in this session, and `genemap` is the
## object used after this file is load()ed again, so it has to be saved under that name.
save(genemap, file=paste0(objects_directory,"ensembl_hg38_genemap.RData"))

## Read the GTF annotation and flatten the fields used downstream into a data frame
## (one row per GTF record).
gtf <- import(paste0(outputs_directory, 'hg38_ensembl.gtf'))
promoters_ap <- data.frame(
  chr           = as.character(chrom(gtf)),
  start         = as.numeric(start(gtf)),
  end           = as.numeric(end(gtf)),
  strand        = as.character(strand(gtf)),
  transcript_id = as.character(gtf$transcript_id),
  gene_id       = as.character(gtf$gene_id),
  gene_name     = as.character(gtf$gene_name),
  gene_biotype  = as.character(gtf$gene_biotype),
  type          = gtf$type,
  stringsAsFactors = FALSE
)
## number of distinct gene ids in the annotation
length(unique(promoters_ap$gene_id))
## keep the transcript records only and group them by transcript id
promoters_ap <- promoters_ap[promoters_ap$type == "transcript", ]
promoters_sp <- split(promoters_ap, promoters_ap$transcript_id)

## For each transcript keep the record that holds its TSS and store the TSS coordinate:
## the smallest start on the "+" strand, otherwise the largest end.
promoters_tss <- do.call('rbind', lapply(promoters_sp, function(tx) {
  on_plus_strand <- as.character(unique(tx$strand)) == "+"
  if (on_plus_strand) {
    tss_row <- tx[which.min(tx$start), ]
    tss_row$tss <- tss_row$start
  } else {
    tss_row <- tx[which.max(tx$end), ]
    tss_row$tss <- tss_row$end
  }
  tss_row
}))

## 
## Windows of TSS - 500 .. TSS + 500 around every transcript TSS, named by transcript id
## and carrying the gene annotation as metadata columns.
promoters_tss_gr <- GRanges(
  seqnames = promoters_tss$chr,
  ranges = IRanges(
    start = as.numeric(promoters_tss$tss) - 500,
    end   = as.numeric(promoters_tss$tss) + 500,
    names = promoters_tss$transcript_id
  ),
  strand       = promoters_tss$strand,
  gene_id      = promoters_tss$gene_id,
  gene_name    = promoters_tss$gene_name,
  gene_biotype = promoters_tss$gene_biotype,
  tss          = promoters_tss$tss
)
seqlevelsStyle(promoters_tss_gr) <- 'ucsc'

## Record, for every TSS window, the index of an overlapping range in `hs_me3`
## (0 = no overlap). When a window overlaps several ranges the last hit is kept.
promoters_tss$me3_peak <- 0
me3_overlaps <- findOverlaps(promoters_tss_gr, hs_me3)
promoters_tss$me3_peak[queryHits(me3_overlaps)] <- subjectHits(me3_overlaps)
length(unique(promoters_tss$gene_id))

## the GRanges object and the data frame must list the transcripts in the same order
all(names(promoters_tss_gr) == promoters_tss$transcript_id)
promoters_tss_split <- split(promoters_tss, promoters_tss$gene_id)
length(promoters_tss_split)

## One TSS per gene. If any TSS of the gene overlaps an `hs_me3` range, choose among
## those; otherwise choose among all TSSs of the gene. Within the candidate set the
## smallest coordinate is taken on "+" and the largest on "-".
promoters_filtered <- do.call('rbind', lapply(promoters_tss_split, function(gene_tss) {
  if (sum(gene_tss$me3_peak) > 0) {
    candidates <- gene_tss[gene_tss$me3_peak > 0, ]
    pick <- if (unique(candidates$strand) == '-') {
      which.max(candidates$tss)
    } else {
      which.min(candidates$tss)
    }
  } else {
    candidates <- gene_tss
    pick <- if (unique(candidates$strand) == '+') {
      which.min(candidates$tss)
    } else {
      which.max(candidates$tss)
    }
  }
  candidates[pick, ]
}))
length(unique(promoters_filtered$gene_id))

## Same TSS - 500 .. TSS + 500 windows for the single TSS selected per gene,
## named by gene id.
promoters_filtered_gr <- GRanges(
  seqnames = promoters_filtered$chr,
  ranges = IRanges(
    start = as.numeric(promoters_filtered$tss) - 500,
    end   = as.numeric(promoters_filtered$tss) + 500,
    names = promoters_filtered$gene_id
  ),
  strand       = promoters_filtered$strand,
  gene_id      = promoters_filtered$gene_id,
  gene_name    = promoters_filtered$gene_name,
  gene_biotype = promoters_filtered$gene_biotype,
  tss          = promoters_filtered$tss
)
seqlevelsStyle(promoters_filtered_gr) <- 'ucsc'

## cache the per-transcript and per-gene TSS tables together with their GRanges versions
save(promoters_filtered, promoters_filtered_gr,
     promoters_tss, promoters_tss_gr,
     file = paste0(objects_directory, 'tss_objects.RData'))

## Restore previously saved objects.
load(paste0(objects_directory, "Zhang_DataBundle.RData"))
load(paste0(objects_directory, "GTF_Annotation.RData"))
load(paste0(objects_directory, "ensembl_hg38_genemap.RData"))
## one annotation row per gene: the first row seen for each Ensembl gene id
genemapu <- genemap[match(unique(genemap$ensembl_gene_id), genemap$ensembl_gene_id), ]
## plotting colour per species code
species.colors <- c(MF = '#66CCFF', HS = '#000000', PT = '#FF3300', MM = '#0033FF')

Zhang data - expressed and not genes

countdata_zhang <- read.table(file = paste0(outputs_directory, "Zhang_gene_counts_redownloaded.txt"),
                              header = TRUE)

## Pool the counts into one "fetal" column (six columns selected by position) and one
## "adult" column (columns whose name matches YO_ATL_Astro or YO_HPC_Astro).
zhang_countdata_4tpm <- data.frame(
  fetal  = rowSums(countdata_zhang[, c(36,28,29,23,24,11)]),
  adult  = rowSums(countdata_zhang[, colnames(countdata_zhang) %like% "YO_ATL_Astro|YO_HPC_Astro"]),
  Length = countdata_zhang$Length,
  row.names = countdata_zhang$Geneid,
  stringsAsFactors = FALSE
)

zhang_countdata_tpm <- as.data.frame(
  GetTPM(zhang_countdata_4tpm, 1:2, rownames(zhang_countdata_4tpm))
)

## expressed: TPM > 1 in at least one stage; not expressed: TPM < 0.1 in both stages
expressed     <- zhang_countdata_tpm[with(zhang_countdata_tpm, fetal > 1 | adult > 1), ]
not_expressed <- zhang_countdata_tpm[with(zhang_countdata_tpm, fetal < 0.1 & adult < 0.1), ]

expressed_fetal <- zhang_countdata_tpm[zhang_countdata_tpm$fetal > 1, ]
expressed_adult <- zhang_countdata_tpm[zhang_countdata_tpm$adult > 1, ]

## stage-specific and shared gene ids
expressed_only_fetal  <- setdiff(rownames(expressed_fetal), rownames(expressed_adult))
expressed_only_adult  <- setdiff(rownames(expressed_adult), rownames(expressed_fetal))
expressed_fetal_adult <- intersect(rownames(expressed_fetal), rownames(expressed_adult))

## gene symbols of the marker sets
fetal_markers_geneName <- unique(genemap$hgnc_symbol[genemap$ensembl_gene_id %in% Fetal_Markers])
adult_markers_geneName <- unique(genemap$hgnc_symbol[genemap$ensembl_gene_id %in% Adult_Markers])

Differential Gene Expression Analysis

Based on data quality and previous analyses, the selected samples are processed and filtered. We perform a differential gene expression analysis between four species (humans, chimpanzees, rhesus macaques and crab-eating macaques), based on their expression profiles on the consensus genome. This sheet details the steps of the differential analysis, with relevant graphs giving an overview of the data, and finally lists the significant hits obtained with the canonical DESeq2 workflow.

## Raw count tables.
countdata <- read.table(file = paste0(outputs_directory, "featureCounts_Counts_MO_All.tsv"),
                        header = TRUE)
countdata_tcw_iAstrocytes <- read.table(file = paste0(outputs_directory, 'tcw_latest_gene_counts.txt'),
                                        header = TRUE)

## prep the tables: rename columns 7 onwards
## (presumably the per-sample count columns of the featureCounts output - confirm)
names(countdata_tcw_iAstrocytes)[seq(7, ncol(countdata_tcw_iAstrocytes))] <- c(
  'tcw_3651_Astros', 'tcw_3651_NPCs', 'tcw_9319_Astros',
  'tcw_9429_Astros', 'tcw_9429_NPCs', 'tcw_BJ_Astros',
  'Cerebral_Cortex_pAstros', 'Midbrain_pAstros'
)
## both tables must list the genes in the same order
all(rownames(countdata) == countdata_tcw_iAstrocytes$Geneid)

## [1] TRUE

## Assemble the analysis count matrix: one column per biological sample.
## Columns that belong to the same sample are summed; three columns are taken
## from the TCW table.
countdata <- data.frame(
  PrimaryFetal_F  = countdata$PrimaryFetal_F,
  PrimaryFetal_M  = countdata$PrimaryFetal_M,
  PrimaryFetal_1  = countdata$PrimaryFetal_1,
  HSapiens_ELE10  = countdata$HSapiens_ELE10_1 + countdata$HSapiens_ELE10_2,
  HSapiens_ELE30  = countdata$HSapiens_ELE30_1 + countdata$HSapiens_ELE30_2,
  HSapiens_TCW_F1 = countdata_tcw_iAstrocytes[, 'tcw_3651_Astros'],
  HSapiens_TCW_F3 = countdata_tcw_iAstrocytes[, 'tcw_9319_Astros'],
  HSapiens_TCW_F4 = countdata_tcw_iAstrocytes[, 'tcw_9429_Astros'],
  Chimp_SandraA   = countdata$Chimp_Sandra_BD1 +
                    countdata$Chimp_Sandra_BD2 +
                    countdata$Chimp_Sandra_nwNPC,
  Chimp_Mandy6    = countdata$Chimp_Mandy6 + countdata$Chimp_Mandy6_New,
  Chimp_Mandy4    = countdata$Chimp_Mandy4_New,
  RhMacaque_Becky = countdata$RhMacaque_Becky_BD1 + countdata$RhMacaque_Becky_BD2,
  row.names = rownames(countdata)
)

## Sample sheet for the twelve count columns (expected to follow the column order
## of `countdata`).
sample_names <- c("PrimaryFetal_F", "PrimaryFetal_M", "PrimaryFetal_1",
                  "HSapiens_ELE10", "HSapiens_ELE30",
                  "HSapiens_TCW_F1", "HSapiens_TCW_F3", "HSapiens_TCW_F4",
                  "Chimp_SandraA", "Chimp_Mandy6", "Chimp_Mandy4",
                  "RhMacaque_Becky")

# Setting up metadata for included samples
species   <- rep(c("HS", "PT", "MM"), times = c(8, 3, 1))
sources   <- rep(c("Fetal", "iPSC"), times = c(3, 9))
sub_class <- c(rep("PF", 3),
               "ELE10", "ELE30", "TCW_F1", "TCW_F3", "TCW_F4",
               "SandraA", "Mandy6", "Mandy4",
               "Becky")

metadata <- data.frame(species = factor(species),
                       sources = factor(sources),
                       class   = factor(sub_class),
                       gender  = c('F', 'M', rep('F', 10)),
                       lab     = rep(c('other', "PL", "other", "PL"), times = c(2, 3, 3, 4)),
                       row.names = sample_names)
## running sample index
metadata$lp <- seq_len(nrow(metadata))
all(colnames(countdata) == rownames(metadata))

## [1] TRUE

# Creating a TPM normalized table for the read counts for all genes.
# Gene lengths are looked up by gene id in the TCW count table.
Length <- countdata_tcw_iAstrocytes$Length[
  match(rownames(countdata), countdata_tcw_iAstrocytes$Geneid)
]
tpm_norm_count_table <- GetTPM(
  data.frame(cbind(countdata, Length = Length)),
  seq_len(ncol(countdata)),
  rownames(countdata)
)


# Sanity check: both TPM tables must list the same genes in the same order
all(rownames(tpm_norm_count_table) == rownames(zhang_countdata_tpm))

## [1] TRUE

# Zhang TPMs and in-house TPMs side by side
tpm_norm_count_all <- cbind(zhang_countdata_tpm, tpm_norm_count_table)
all(rownames(zhang_countdata_tpm) == rownames(tpm_norm_count_table))

## [1] TRUE

all(rownames(tpm_norm_count_table) == rownames(zhang_countdata_tpm))

## [1] TRUE

# filtering out expressed fetal and adult genes, using a stricter cut-off (TPM > 5)
expressed_fetal_str <- zhang_countdata_tpm[zhang_countdata_tpm$fetal > 5, ]
expressed_adult_str <- zhang_countdata_tpm[zhang_countdata_tpm$adult > 5, ]
expressed_only_fetal_str <- setdiff(rownames(expressed_fetal_str), rownames(expressed_adult_str))
expressed_only_adult_str <- setdiff(rownames(expressed_adult_str), rownames(expressed_fetal_str))

Differentiation score

## Differentiation score of each sample (column): natural log of the summed counts of
## the adult marker genes divided by the summed counts of the fetal marker genes.
marker_log_ratio <- function(counts) {
  adult_total <- colSums(counts[rownames(counts) %in% Adult_Markers, ])
  fetal_total <- colSums(counts[rownames(counts) %in% Fetal_Markers, ])
  log(adult_total / fetal_total)
}

## Zhang samples: the six columns picked by position first, then the YO_ATL/YO_HPC columns
zhang_countdata_DS <- marker_log_ratio(
  data.frame(countdata_zhang[, c(36,28,29,23,24,11)],
             countdata_zhang[, colnames(countdata_zhang) %like% "YO_ATL_Astro|YO_HPC_Astro"],
             row.names = countdata_zhang$Geneid)
)

## in-house samples
countdata_DS <- marker_log_ratio(countdata)

ds <- c(zhang_countdata_DS, countdata_DS)
## Group labels for the combined score vector `ds`: 6 + 15 Zhang samples followed by
## the 3 fetal and 9 iPSC samples of `countdata`.
sampleType = c(rep('acute_fetal',6),rep('acute_adult',15),
               rep('fetal_cultured',3),rep('iAstrocytes',9))
dsd=split(ds,sampleType)

## The score is computed with log(), i.e. the natural logarithm, so the axis is
## labelled accordingly (it used to claim a base-2 logarithm).
ds_ylab = "Differentiation score (natural log)"

par(mfrow=c(1,1),mar=c(5,4,1,1))
beeswarm(ds ~ sampleType, pch = 19, 
         col = c( 'blue4', 'turquoise3', 'purple3', 'pink4'), 
         method = "swarm", ylim=c(-2,5), ylab=ds_ylab )
axis(2,lwd=2)
box(col="black",lwd=2)

## Species labels for the nine iPSC-derived samples (elements 4..12 of `countdata_DS`).
sampleType = factor( c(rep('human',5),rep('chimpanzee',3),rep('rhesus',1)),
                     levels=c('human','chimpanzee','rhesus') )

beeswarm(countdata_DS[4:length(countdata_DS)] ~ sampleType, pch = 19, 
         col = c( 'black', 'red', 'blue'), 
         method = "swarm", ylim=c(-2,5), ylab=ds_ylab )
axis(2,lwd=2)
box(col="black",lwd=2)

## NOTE(review): this panel repeats the previous one unchanged - confirm it is needed.
beeswarm(countdata_DS[4:length(countdata_DS)] ~ sampleType, pch = 19, 
         col = c( 'black', 'red', 'blue'), 
         method = "swarm", ylim=c(-2,5), ylab=ds_ylab )
axis(2,lwd=2)
box(col="black",lwd=2)

Heatmap for figure 1 and differentiation score

## Gene set for the heatmap: symbols annotated with GO:0048708 in `genemap`,
## extended by a hand-curated list of symbols.
astro_genes = unlist(unique( genemap[genemap$go_id == 'GO:0048708','hgnc_symbol']))
## Symbols are matched case-sensitively against `genemap$hgnc_symbol`, so they must be
## written exactly as in HGNC ('GPR37L1', not 'GPR37l1').
## NOTE(review): 'ARP3', 'BIN' and 'GM5849' do not look like current HGNC symbols and
## probably never match - confirm the intended genes.
astro_genes = unique( c(astro_genes,
                        'ABL1','ABL2', 'ARP3','ADORA2A', 'AGER', 'AGT',
                        'APP', 'ATF5', 'BIN','BMP2', 'C1QA', 'C5AR1',
                        'CNTF','CNTN2','DAB1','DLL1','DLL3','DRD1',
                        'EIF2B5','EPHA4','F2','FGFR3','GCM1','GFAP',
                        'GM5849','GPR37L1','GRN','HES1','HES5','HMGA2',
                        'ID2','ID4','IFNG','IFNGR1','IL1B','IL6ST',
                        'KDM4A','LAMB2','LDLR','MAG','MAP2K1','MAPK3',
                        'MBD1','MECP2','MT3','MYCN','NF1','NFIX',
                        'NKX2-2','NOG','NOTCH1','NR1D1','NR2E1','NTRK3',
                        'PLP1','PLPP3','POU3F2','PRPF19','PSEN1','PTPN11',
                        'ROR2','S100A8','S100A9','SERPINE2','SHH','SMO',
                        'SOX6','SOX8','SOX9','STAT3','TAL1','TLR4',
                        'TREM2','TSPAN2','TTC21B','VIM', 'SLC1A3'))

## map the symbols back to Ensembl gene ids
astro_genes = data.frame(unique(genemap[which(genemap$hgnc_symbol %in% astro_genes),c('ensembl_gene_id','hgnc_symbol')]))

## TPM rows of the selected genes, relabelled with gene symbols
tmp_count_table = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% astro_genes$ensembl_gene_id, ]
rownames(tmp_count_table) = astro_genes$hgnc_symbol[match(rownames(tmp_count_table), astro_genes$ensembl_gene_id)]

## log10(TPM); a TPM of 0 gives -Inf, which is shown as 0
count_frame = as.data.frame(log10(tmp_count_table))
count_frame[count_frame == -Inf] = 0
## rows ordered by their median across samples, highest first
count_frame = count_frame[order(apply(count_frame, 1, median), decreasing = TRUE),]

pheatmap(count_frame, cellheight = 10,
         treeheight_row = 0, 
         cluster_cols = FALSE, 
         cluster_rows = FALSE, 
         scale = "none",
         angle_col = '315')

Evolution of astrocyte transcriptomes

We consider comparisons between human and chimpanzee and between human and macaque samples separately.

ids <- seq_len(nrow(countdata))

## DEGs in the comparison between humans and chimps:
## female iPSC-derived samples of both species
hs_pt_samples <- which(metadata$species %in% c("HS","PT") &
                         metadata$sources == "iPSC" &
                         metadata$gender == 'F')
res_HSvPT <- DESeqDataSetFromMatrix(
  countData = countdata[ids, hs_pt_samples],
  colData   = metadata[hs_pt_samples, ],
  design    = ~ 0 + species
)

## factor levels were dropped which had no samples

res_HSvPT$species <- relevel(res_HSvPT$species, "HS")
res_HSvPT <- DESeq(res_HSvPT, fitType = "local")

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

resultsNames(res_HSvPT)

## [1] "speciesHS" "speciesPT"

# shrunken fold changes must be computed before `res_HSvPT` is overwritten by the results table
res_HSvPT_sh <- lfcShrink(res_HSvPT, contrast = c("species","HS","PT"), type = 'ashr')

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

res_HSvPT <- results(res_HSvPT, contrast = c("species","HS","PT"))
summary(res_HSvPT)

## 
## out of 46457 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3881, 8.4%
## LFC < 0 (down)     : 3890, 8.4%
## outliers [1]       : 307, 0.66%
## low counts [2]     : 13293, 29%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

## 
# DEGs in the comparison between humans and macaque:
# female iPSC-derived samples of both species
hs_mm_samples <- metadata$species %in% c("HS","MM") &
  metadata$sources == "iPSC" &
  metadata$gender == 'F'
res_HSvMM <- DESeqDataSetFromMatrix(
  countData = countdata[ids, hs_mm_samples],
  colData   = metadata[hs_mm_samples, ],
  design    = ~ 0 + species
)

## factor levels were dropped which had no samples

res_HSvMM$species <- relevel(res_HSvMM$species, "HS")
res_HSvMM <- DESeq(res_HSvMM, fitType = "local")

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# shrunken fold changes must be computed before `res_HSvMM` is overwritten by the results table
res_HSvMM_sh <- lfcShrink(res_HSvMM, contrast = c("species","HS","MM"), type = "ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

res_HSvMM <- results(res_HSvMM, contrast = c("species","HS","MM"))
summary(res_HSvMM)

## 
## out of 45191 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 5517, 12%
## LFC < 0 (down)     : 4670, 10%
## outliers [1]       : 133, 0.29%
## low counts [2]     : 15485, 34%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

## Significant genes for both contrasts. Note that, despite the `_0.1` suffix of the
## object names, the cut-off applied is padj <= 0.01.
## setDT(..., keep.rownames = TRUE) moves the gene ids from the row names into column `rn`.
HSvPT_DEG <- as.data.frame(res_HSvPT)
HSvPT_DEG_0.1 <- HSvPT_DEG %>% filter(padj <= 0.01 )
res_HSvPT_sh = as.data.frame(res_HSvPT_sh)
HSvPT_DEG_sh <- res_HSvPT_sh %>% filter(padj <= 0.01 )
setDT(HSvPT_DEG_0.1, keep.rownames = TRUE)

HSvMM_DEG <- as.data.frame(res_HSvMM)
HSvMM_DEG_0.1 <- HSvMM_DEG %>% filter(padj <= 0.01 )
setDT(HSvMM_DEG_0.1, keep.rownames = TRUE)
res_HSvMM_sh = as.data.frame(res_HSvMM_sh)
HSvMM_DEG_sh <- res_HSvMM_sh %>% filter(padj <= 0.01 )


sum( HSvMM_DEG_0.1$rn %in% HSvPT_DEG_0.1$rn )

## [1] 1271

sum(! HSvMM_DEG_0.1$rn %in% HSvPT_DEG_0.1$rn )

## [1] 3885

## The shrunken tables are plain data frames: their gene ids live in the row names and
## they have no `rn` column, so `$rn` would be NULL and a comparison against it would
## be trivially TRUE. Compare the row names instead, for both contrasts.
identical(rownames(HSvMM_DEG_sh), HSvMM_DEG_0.1$rn) && identical(rownames(HSvPT_DEG_sh), HSvPT_DEG_0.1$rn)

## [1] TRUE

## Attach the shrunken fold changes by gene id rather than by position, so that a
## mismatch in row order or content cannot silently misassign values.
HSvMM_DEG_0.1$lfc_sh = HSvMM_DEG_sh[HSvMM_DEG_0.1$rn, "log2FoldChange"]
HSvPT_DEG_0.1$lfc_sh = HSvPT_DEG_sh[HSvPT_DEG_0.1$rn, "log2FoldChange"]

Volcano plots

Human versus chimpanzee - volcano

Human versus macaque - volcano

Largely congruent changes in gene expression

# tpm_norm_count_table_thresholded_top  = apply(tpm_norm_count_table,2,function(x){x>(quantile(x[x>0])[3])})
## TRUE where a gene exceeds 1 TPM in a sample
tpm_norm_count_table_thresholded_top <- tpm_norm_count_table > 1

## ------------------------------
## Annotation columns 1, 7 and 12 of the GTF table; all pseudogene biotypes and all
## TR_* biotypes are collapsed into one class each.
log_fold_dat <- gtf_annotation_table[, c(1,7,12)]
log_fold_dat[log_fold_dat$gene_biotype %like% "pseudogene",]$gene_biotype <- "pseudogene"
log_fold_dat[log_fold_dat$gene_biotype %like% "TR_",]$gene_biotype <- "TR_genes"
log_fold_dat$rn <- log_fold_dat$ensembl_gene_id

## add raw and shrunken log2 fold changes (columns 3 and 8 of the DEG tables) of both
## contrasts; only genes significant in both contrasts survive the two merges
log_fold_dat <- merge(log_fold_dat, HSvPT_DEG_0.1[,c(1,3,8)], by = 'rn')
colnames(log_fold_dat)[5:6] <- c("HSvPT_lfc", "HSvPT_lfc_shrunk")

log_fold_dat <- merge(log_fold_dat, HSvMM_DEG_0.1[,c(1,3,8)], by = 'rn')
colnames(log_fold_dat)[7:8] <- c("HSvMM_lfc", "HSvMM_lfc_shrunk")

## ------------------------------
all(colnames(tpm_norm_count_table) == rownames(metadata))

## [1] TRUE

## Per-species TPM over the female iPSC-derived samples: row means for human and
## chimpanzee, the selected column itself for macaque.
ipsc_female <- metadata$sources == "iPSC" & metadata$gender == 'F'
tpm_norm_count_table_df <- data.frame(
  human   = rowMeans(tpm_norm_count_table[, which(metadata$species == "HS" & ipsc_female)]),
  chimp   = rowMeans(tpm_norm_count_table[, which(metadata$species == "PT" & ipsc_female)]),
  macaque = tpm_norm_count_table[, which(metadata$species == "MM" & ipsc_female)],
  rn      = rownames(tpm_norm_count_table)
)

log_fold_dat <- merge(log_fold_dat, tpm_norm_count_table_df, by = 'rn')
tpm_norm_count_table_df$rn <- NULL
log_fold_dat$rn <- NULL

## restrict to four biotypes and give each its plotting colour
biotype_palette <- c(protein_coding = 'steelblue',
                     pseudogene     = 'thistle3',
                     lncRNA         = 'red3',
                     miRNA          = 'black')
log_fold_dat_biotype <- log_fold_dat[log_fold_dat$gene_biotype %in% names(biotype_palette), ]
log_fold_dat_biotype$col <- unname(biotype_palette[as.character(log_fold_dat_biotype$gene_biotype)])
log_fold_dat_biotype$DEX <- 0
# log_fold_dat_biotype$DEX[log_fold_dat_biotype$ensembl_gene_id %in% eid] = 1

## Shrunken log2 fold changes of the two contrasts against each other, coloured by
## biotype. The x axis carries the human-vs-chimpanzee values and the y axis the
## human-vs-macaque values, and the axis labels say so.
par(mfrow=c(1,1),mar=c(5,5,5,5),cex.lab=2,pty='s')
plot(x=log_fold_dat_biotype$HSvPT_lfc_shrunk,
     y=log_fold_dat_biotype$HSvMM_lfc_shrunk,
     xlim=c(-10,10),ylim=c(-10,10),
     col=log_fold_dat_biotype$col,pch=19,cex=0.5,
     xlab='Hs vs. Pt',ylab='Hs vs. Mm',axes=FALSE)
axis(1,lwd=2,cex.axis=2)
axis(2,lwd=2,cex.axis=2)
abline(a=0,b=1)
abline(h=0,v=0,lwd=2,col='gray')
box(col='black',lwd=2)

## NOTE(review): the correlation is computed on the unshrunken fold changes while the
## plot shows the shrunken ones - confirm this is intended.
cor.test(log_fold_dat_biotype$HSvPT_lfc,log_fold_dat_biotype$HSvMM_lfc)

## 
##  Pearson's product-moment correlation
## 
## data:  log_fold_dat_biotype$HSvPT_lfc and log_fold_dat_biotype$HSvMM_lfc
## t = 42.744, df = 1243, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7479379 0.7930088
## sample estimates:
##       cor 
## 0.7714392

## final names for the three annotation columns
names(log_fold_dat)[1:3] <- c("ensembl_id", "hgnc_symbol", "gene_biotype")

EAGs: for how many of these genes is expression detected in human astrocytes?

## Genes changed in the same direction in both contrasts
HS_UP_Genes <- filter(log_fold_dat, HSvPT_lfc > 0, HSvMM_lfc > 0)
HS_DN_Genes <- filter(log_fold_dat, HSvPT_lfc < 0, HSvMM_lfc < 0)

all(HS_UP_Genes$ensembl_id %in% HSvMM_DEG_0.1$rn)

## [1] TRUE

all(HS_UP_Genes$ensembl_id %in% HSvPT_DEG_0.1$rn)

## [1] TRUE

hits_up <- as.data.frame(HS_UP_Genes)
hits_dn <- as.data.frame(HS_DN_Genes)

dim(hits_up) # 677

## [1] 677  10

dim(hits_dn) # 486

## [1] 486  10

sum(hits_up$ensembl_id %in% rownames(expressed)) / nrow(hits_up)

## [1] 0.8227474

sum(hits_dn$ensembl_id %in% rownames(expressed)) / nrow(hits_dn)

## [1] 0.8395062

## Up genes must exceed 1 TPM in more than 3 of the female human iPSC samples;
## down genes must exceed 1 TPM in more than 2 of the chimpanzee/macaque samples.
hs_ipsc_female <- which(metadata$species == "HS" & metadata$sources == "iPSC" & metadata$gender == 'F')
ape_samples <- which(metadata$species %in% c("PT","MM"))
detected_in_human <- rownames(tpm_norm_count_table_thresholded_top)[
  rowSums(tpm_norm_count_table_thresholded_top[, hs_ipsc_female]) > 3
]
detected_in_apes <- rownames(tpm_norm_count_table_thresholded_top)[
  rowSums(tpm_norm_count_table_thresholded_top[, ape_samples]) > 2
]
hits_up <- hits_up[hits_up$ensembl_id %in% detected_in_human, ]
hits_dn <- hits_dn[hits_dn$ensembl_id %in% detected_in_apes, ]
sum(hits_up$ensembl_id %in% rownames(expressed)) / nrow(hits_up)

## [1] 0.8464052

sum(hits_dn$ensembl_id %in% rownames(expressed)) / nrow(hits_dn)

## [1] 0.8883929

## Number of up/down EAGs per expression class in the Zhang data
## (input of the stacked barplot)
count_zhang_classes <- function(gene_ids) {
  cbind(not_expressed = sum(gene_ids %in% rownames(not_expressed)),
        fetal         = sum(gene_ids %in% expressed_only_fetal),
        adult         = sum(gene_ids %in% expressed_only_adult),
        both          = sum(gene_ids %in% expressed_fetal_adult))
}

u <- count_zhang_classes(hits_up$ensembl_id)

d <- count_zhang_classes(hits_dn$ensembl_id)

m <- rbind(u, d)
m

##      not_expressed fetal adult both
## [1,]            33   106    61  351
## [2,]            26    29    26  343

## Composition of the up and down EAG sets by Zhang expression class.
## Row fractions are multiplied by 100 so the values agree with the "%" axis label.
par(lwd=2, cex.axis=1.5,mar=c(5,5,1,1),pty='m')
barplot(100 * t(m/rowSums(m)),col=c('red4','white','black','gray'),
        ylab="%",names=c("Up","Down"),xlab="EAGs",
        cex.names=2.5,cex.lab=2)
axis(2,lwd=3)

## Fold-change distributions per biotype: up genes in the top row, down genes in the
## bottom row; human-vs-chimpanzee on the left, human-vs-macaque on the right.
par(mfrow = c(2, 2))
biotypes_shown  <- c('protein_coding','lncRNA','pseudogene')
biotype_borders <- c('steelblue','red3','thistle3')

hits_up_split_pt <- split(hits_up$HSvPT_lfc, hits_up$gene_biotype)
boxplot(hits_up_split_pt[biotypes_shown], ylim = c(0,15), col = 'white', border = biotype_borders)
hits_up_split_mm <- split(hits_up$HSvMM_lfc, hits_up$gene_biotype)
boxplot(hits_up_split_mm[biotypes_shown], ylim = c(0,15), col = 'white', border = biotype_borders)

hits_dn_split_pt <- split(hits_dn$HSvPT_lfc, hits_dn$gene_biotype)
boxplot(hits_dn_split_pt[biotypes_shown], ylim = c(-15,0), col = 'white', border = biotype_borders)
hits_dn_split_mm <- split(hits_dn$HSvMM_lfc, hits_dn$gene_biotype)
boxplot(hits_dn_split_mm[biotypes_shown], ylim = c(-15,0), col = 'white', border = biotype_borders)

## keep only genes expressed in the Zhang data
hits_up <- hits_up[is.element(hits_up$ensembl_id, rownames(expressed)), ]
dim(hits_up)

## [1] 518  10

hits_dn <- hits_dn[is.element(hits_dn$ensembl_id, rownames(expressed)), ]
dim(hits_dn)

## [1] 398  10

sum(is.element(hits_up$ensembl_id, Fetal_Markers))

## [1] 25

sum(is.element(hits_dn$ensembl_id, Fetal_Markers))

## [1] 26

sum(is.element(hits_up$ensembl_id, Adult_Markers))

## [1] 20

sum(is.element(hits_dn$ensembl_id, Adult_Markers))

## [1] 25

## drop fetal markers from the up set and adult markers from the down set
hits_up <- hits_up[is.na(match(hits_up$ensembl_id, Fetal_Markers)), ]
hits_dn <- hits_dn[is.na(match(hits_dn$ensembl_id, Adult_Markers)), ]
dim(hits_up)

## [1] 493  10

dim(hits_dn)

## [1] 373  10

## number of EAGs left in each direction
par(lwd = 2, cex.axis = 1.5, mar = c(5,5,3,1), mfrow = c(1,1))
eag_counts <- c(up = nrow(hits_up), down = nrow(hits_dn))
barplot(eag_counts,
        col = c("green4","wheat3"),
        ylim = c(0,500), ylab = "EAGs", cex.axis = 1.5, cex.lab = 2)
axis(2, lwd = 2)

## gene ids of both sets, written one per line
x <- hits_up$ensembl_id
y <- hits_dn$ensembl_id


write.table(y, file = paste0(outputs_directory,'dn_engs.txt'),
            quote = FALSE, row.names = FALSE, col.names = FALSE, sep = '\n')
write.table(x, file = paste0(outputs_directory,'up_engs.txt'),
            quote = FALSE, row.names = FALSE, col.names = FALSE, sep = '\n')


## are genes affected by evolution frequently totally on or off??
## up genes below 0.1 TPM in both chimpanzee and macaque
up_off_in_apes <- hits_up[hits_up$chimp < 0.1 & hits_up$macaque < 0.1, ]
table(up_off_in_apes$gene_biotype)

## 
##         lncRNA protein_coding     pseudogene            TEC 
##              7              1              6              1

nrow(up_off_in_apes)

## [1] 15

## down genes below 0.1 TPM in human
dn_off_in_human <- hits_dn[hits_dn$human < 0.1, ]
table(dn_off_in_human$gene_biotype)

## 
##         lncRNA        Mt_tRNA protein_coding     pseudogene 
##              3              1             11              2

nrow(dn_off_in_human)

## [1] 17

Save objects for other vignettes

## full (unfiltered) results of both contrasts, joined on the gene id held in the row
## names; genes present in only one table are kept
all_Deseqs <- merge(HSvPT_DEG, HSvMM_DEG, by = "row.names", all = TRUE)

save(hits_dn, hits_up, HS_DN_Genes, HS_UP_Genes,
     all_Deseqs, log_fold_dat, tpm_norm_count_table,
     file = paste0(objects_directory, "DEseq2_RNA.RData"))

Functional annotation

load(paste0(objects_directory, "bda_final.RData"))
## restrict the disease table to genes present in the count matrix
bda_final <- bda_final[bda_final$ensid %in% rownames(countdata), ]

tot_n_EAG <- nrow(hits_up) + nrow(hits_dn)

## Disease records of a hit list, matched by Ensembl id or by gene symbol,
## with one row per gene/disease pair.
disease_records <- function(hits) {
  matched <- bda_final[bda_final$ensid %in% hits$ensembl_id |
                         bda_final$Gene.symbol %in% hits$hgnc_symbol, ]
  matched$or <- paste(matched$ensid, matched$Disease, sep = '-')
  matched[!duplicated(matched$or), ]
}

x <- disease_records(hits_up)

y <- disease_records(hits_dn)

## number of distinct diseases hit by either set
length(unique(c(x$Disease, y$Disease)))

## [1] 23

## records per disease: whole table (most frequent first), up set, down set
All_diseases <- sort(table(bda_final$Disease), decreasing = TRUE)
X <- table(x$Disease)
Y <- table(y$Disease)

## zero-filled template with one slot per disease
nulv <- setNames(rep(0, length(All_diseases)), names(All_diseases))

nulv1 <- nulv
nulv1[names(X)] <- X

nulv2 <- nulv
nulv2[names(Y)] <- Y

## rows: up / down; only diseases hit by at least one set are kept
m <- rbind(nulv1, nulv2)
mcut1 <- m[, colSums(m) > 0]

dim(mcut1)

## [1]  2 23

## per-disease record counts, up (green) next to down (wheat); wide bottom margin for
## the rotated disease names
par(mar = c(12,4,1,1), mfrow = c(1,1))
barplot(mcut1,
        beside = TRUE,
        col = c('green4','wheat3'),
        las = 2,
        ylim = c(0,25),
        axes = FALSE,
        ylab = "EAG")
axis(2, lwd = 3)

## 2 x 2 table: first column = up (disease-linked genes, all up EAGs),
## second column = down (disease-linked genes, all down EAGs).
## NOTE(review): prop.test() reads each ROW of a two-column matrix as
## (successes, failures), so the proportions reported below are up/(up + down) among
## the disease-linked genes and among all EAGs. The disease-linked genes are also part
## of the totals in the second row, so the rows are not disjoint groups - confirm that
## this is the intended comparison (the same table is passed to fisher.test()).
M <- cbind(c(length(unique(x$Gene.symbol)), length(unique(hits_up$ensembl_id))),
           c(length(unique(y$Gene.symbol)), length(unique(hits_dn$ensembl_id))))
prop.test(M)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  M
## X-squared = 20.933, df = 1, p-value = 0.000004757
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.3106748 -0.1256033
## sample estimates:
##    prop 1    prop 2 
## 0.3511450 0.5692841

fisher.test(M)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  M
## p-value = 0.000003401
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2727455 0.6096183
## sample estimates:
## odds ratio 
##  0.4098171

##      [,1] [,2]
## [1,]   46   85
## [2,]  493  373

## Same table layout, restricted to genes linked to "Intellectual Disability"
id_genes_up <- unique(x$Gene.symbol[x$Disease == "Intellectual Disability"])
id_genes_dn <- unique(y$Gene.symbol[y$Disease == "Intellectual Disability"])
M <- cbind(c(length(id_genes_up), length(unique(hits_up$ensembl_id))),
           c(length(id_genes_dn), length(unique(hits_dn$ensembl_id))))
chisq.test(M)

## Warning in chisq.test(M): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  M
## X-squared = 8.2604, df = 1, p-value = 0.004052

fisher.test(M)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  M
## p-value = 0.001251
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.0000000 0.4471496
## sample estimates:
## odds ratio 
##          0

## all disease labels present in the table
## NOTE(review): the list printed below contains both "Restless legs Syndrome" and
## "Restless Legs Syndrome"; the two spellings are counted as separate diseases -
## confirm whether they should be merged.
unique(bda_final$Disease)

##   [1] "Autism Spectrum Disorder"                                         
##   [2] "Alzheimer's Disease"                                              
##   [3] "Amyotrophic Lateral Sclerosis"                                    
##   [4] "Multiple Sclerosis"                                               
##   [5] "Epilepsy"                                                         
##   [6] "Intracranial Aneurysm"                                            
##   [7] "Neuroblastoma"                                                    
##   [8] "Parkinson's Disease"                                              
##   [9] "Restless legs Syndrome"                                           
##  [10] "Meningioma"                                                       
##  [11] "Narcolepsy"                                                       
##  [12] "Glioma"                                                           
##  [13] "Prader-Willi Syndrome"                                            
##  [14] "Progressive Supranuclear Plasy"                                   
##  [15] "Restless Legs Syndrome"                                           
##  [16] "Rett Syndrome"                                                    
##  [17] "Rolandic Epilepsy with Speech impairment"                         
##  [18] "Shy Drager Syndrome"                                              
##  [19] "Spasmodic Dysphonia"                                              
##  [20] "Stroke"                                                           
##  [21] "Tay-Sachs Disease"                                                
##  [22] "Tourette Syndrome"                                                
##  [23] "Tuberous Sclerosis"                                               
##  [24] "Von Hippel-Lindau Syndrome"                                       
##  [25] "X-linked Hydrocephalus"                                           
##  [26] "Agenesis Corpus Callosum"                                         
##  [27] "Alopecia with Mental Retardation"                                 
##  [28] "Alpha-Thalassemia X-Linked Intellectual Disability Syndrome"      
##  [29] "Alternating Hemiplegia of Childhood"                              
##  [30] "Aphasia"                                                          
##  [31] "Attention Deficit Hyperactivity Disorder"                         
##  [32] "Autosomal Dominant Nocturnal Frontal Lobe Epilepsy"               
##  [33] "Autosomal Dominant Partial Epilepsy with Auditory Features"       
##  [34] "Autosomal Recessive Cerebellar Ataxia Type 1"                     
##  [35] "Batten Disease"                                                   
##  [36] "Benign Familial Neonatal Seizures"                                
##  [37] "Benign Hereditary Chorea"                                         
##  [38] "Cerebral Aneurysm"                                                
##  [39] "Cerebellar Ataxia, Mental Retardation and Disequilibrium Syndrome"
##  [40] "Cerebral Palsy"                                                   
##  [41] "Cerebro-Oculo-Facio-Skeletal Syndrome"                            
##  [42] "Cerebrocostomandibular Syndrome"                                  
##  [43] "Charcot-Marie-Tooth Disease"                                      
##  [44] "Chiari Malformation"                                              
##  [45] "Chronic Inflammatory Demyelinating Polyneuropathy"                
##  [46] "Coma"                                                             
##  [47] "Creutzfeldt Jakob Disease"                                        
##  [48] "Dementia (Non Alzheimer)"                                         
##  [49] "Down Syndrome"                                                    
##  [50] "Dysautonomia"                                                     
##  [51] "Dyslexia"                                                         
##  [52] "Dyspraxia"                                                        
##  [53] "Dystonia"                                                         
##  [54] "Encephalitis"                                                     
##  [55] "Essential Tremor"                                                 
##  [56] "Familial Focal Epilepsy with Variable Foci"                       
##  [57] "Ferro-Cerebro-Cutaneous Syndrome"                                 
##  [58] "Friedreich Ataxia"                                                
##  [59] "Gaucher Disease"                                                  
##  [60] "Generalized Epilepsy with Febrile Seizures Plus"                  
##  [61] "Huntington's Disease"                                             
##  [62] "Hydrocephalus"                                                    
##  [63] "Intellectual Disability"                                          
##  [64] "Meningitis"                                                       
##  [65] "Motor Neurone Disease"                                            
##  [66] "Muscular Dystrophy"                                               
##  [67] "Neurodegenerative Disease"                                        
##  [68] "Paraganglioma"                                                    
##  [69] "Schizophrenia"                                                    
##  [70] "Pontocerebellar Hypoplasia"                                       
##  [71] "Depression Disorder"                                              
##  [72] "Neurofibromatosis"                                                
##  [73] "Major Depression Disorder"                                        
##  [74] "Ischemic Stroke"                                                  
##  [75] "Ataxia Telangiectasia"                                            
##  [76] "Spinocerebellar Ataxia"                                           
##  [77] "Smith-Magenis Syndrome"                                           
##  [78] "Anorexia Nervosa"                                                 
##  [79] "Bipolar Disorder"                                                 
##  [80] "Frontotemporal Lobar Degeneration"                                
##  [81] "Neurodevelopmental Disability"                                    
##  [82] "Panic Disorder"                                                   
##  [83] "Post-traumatic Stress Disorder"                                   
##  [84] "Amyotrophic lateral Sclerosis"                                    
##  [85] "Angelman Syndrome"                                                
##  [86] "Cerebral infarction"                                              
##  [87] "Cognitive Functions and Neuronal plasticity"                      
##  [88] "Fragile X Syndrome"                                               
##  [89] "Neurological Disorder"                                            
##  [90] "Non-functioning Pituitary Adenoma"                                
##  [91] "Pituitary Adenoma"                                                
##  [92] "Plexiform Neurofibroma"                                           
##  [93] "Prader-willi Syndrome and Angelman Syndrome"                      
##  [94] "Psychiatric Disease"                                              
##  [95] "West Syndrome"                                                    
##  [96] "Non-functioning Pituitary Neoplasms"                              
##  [97] "Pituitary Neoplasms"                                              
##  [98] "Forebrain Ischemia"                                               
##  [99] "Status Epilepticus"                                               
## [100] "Acute Cerebral Infarction"                                        
## [101] "Acute Cerebral Ischemia"                                          
## [102] "Brain Neoplasms"                                                  
## [103] "Cerebellum Cancer"                                                
## [104] "Cerebral Cavernous Malformation"                                  
## [105] "Cerebral Ischemia"                                                
## [106] "Cerebral Malaria"                                                 
## [107] "Encephalomyelitis"                                                
## [108] "Intracerebral Hemorrhage"                                         
## [109] "Mild Cognitive Impairment"                                        
## [110] "Neurilemmoma"                                                     
## [111] "Neuroendocrine Tumor"                                             
## [112] "Neuroepithelial Tumor"                                            
## [113] "Neuroma"                                                          
## [114] "Neuronal Apoptosis-Related Disease"                               
## [115] "Frontotemporal Dementia"                                          
## [116] "Anxiety Disorder"                                                 
## [117] "Acute Ischemic Stroke"                                            
## [118] "Aneurysmal Subarachnoid Hemorrhage"                               
## [119] "Central Nervous System Embryonal Tumor"

If genes are picked at random, how many would we expect to be related to diseases?

# Identifiers of all expressed genes (row names of `expressed`).
expressed_geneId <- rownames(expressed)
# Number of expressed genes that appear in the brain-disease annotation
# (`bda_final$ensid`); the count is printed below.
sum(match(expressed_geneId, bda_final$ensid, nomatch = 0L) > 0L)

## [1] 3216

# Background ("null") gene universe: genes whose summed thresholded TPM is > 3
# across the female human iPSC samples, or > 2 across the PT / MM samples.
# NOTE: `union()` is required here. `unique(a, b)` passes `b` as the
# `incomparables` argument, so the second gene set was silently ignored.
hs_ipsc_f_cols <- which(metadata$species == "HS" & metadata$sources == "iPSC" &
                          metadata$gender == "F")
nhp_cols <- which(metadata$species %in% c("PT", "MM"))
nullGenes = union(
  rownames(tpm_norm_count_table_thresholded_top)[
    rowSums(tpm_norm_count_table_thresholded_top[, hs_ipsc_f_cols, drop = FALSE]) > 3],
  rownames(tpm_norm_count_table_thresholded_top)[
    rowSums(tpm_norm_count_table_thresholded_top[, nhp_cols, drop = FALSE]) > 2])

# Gene names of the null universe.
# NOTE(review): the names are filtered against rownames(expressed), which are
# compared with `bda_final$ensid` above - confirm both use the same ID type.
nullGenes_geneNames = unique(genemapu$external_gene_name[genemapu$ensembl_gene_id %in% nullGenes])
nullGenes_geneNames = nullGenes_geneNames[nullGenes_geneNames %in% rownames(expressed)]

# Empirical null: draw as many genes as there are DE hits (up + down) and
# count how many of them are disease-annotated.
# Fixes relative to the previous loop:
#  * genes are drawn from `nullGenes` itself; before, indices sized by
#    length(nullGenes) were used to index a different vector (expressed_geneId);
#  * exactly 10000 draws are made (the `i < 10000` loop made 9999);
#  * the result vector is no longer grown inside the loop.
n_hits <- nrow(hits_up) + nrow(hits_dn)
n_draws <- 10000
randRes = vapply(seq_len(n_draws), function(draw) {
  tp <- sample(nullGenes, n_hits, replace = FALSE)
  sum(tp %in% bda_final$ensid)
}, integer(1))

# Histogram of the number of disease-related genes found in each random draw.
# The x axis carries the disease-gene count and the y axis the number of
# draws; previously the count label was attached to the y axis.
par(mfrow = c(1, 1), mar = c(5, 5, 1, 1))
hist(randRes, nclass = 100, xlim = c(50, 200),
     main = "Randomly picked gene sets",
     xlab = "Disease related genes", ylab = "Frequency")
# Red dot: combined number of unique IDs in `x` and `y` (both defined
# earlier in the analysis; presumably the disease-annotated hits - confirm).
points(x = sum(length(unique(x$ensid)),
               length(unique(y$ensid))),
       y = 0, col = "red3", cex = 2, pch = 19)
axis(1, lwd = 2)
axis(2, lwd = 2)

# Average number of disease-related genes per random draw.
mean(randRes)

## [1] 112.9413

ID and intelligence

# Gene list exported from UniProt (keyword export, see file name).
id_DAVID_all = read.delim(paste0(outputs_directory, "uniprotkb_keyword_KW_0991_2023_09_01.tsv"),
                          header = TRUE)
# `Gene.Names` holds space-separated names per entry; flatten to one vector.
id_DAVID_all = unlist(lapply(split(id_DAVID_all$Gene.Names, id_DAVID_all$Entry),
                             function(x){strsplit(x, " ")}))
# Translate the gene names to Ensembl gene IDs.
id_DAVID_all_ensg = unique(genemapu$ensembl_gene_id[genemapu$external_gene_name %in% id_DAVID_all])

# Fisher's exact test: are listed genes distributed differently between
# down- and up-regulated hits?
# The second column must hold the hits that are NOT in the list (total minus
# listed) so that the four cells are mutually exclusive; using the totals
# counted the listed genes twice.
n_dn_id <- sum(hits_dn$ensembl_id %in% id_DAVID_all_ensg)
n_up_id <- sum(hits_up$ensembl_id %in% id_DAVID_all_ensg)
fisher.test(matrix(c(n_dn_id,
                     n_up_id,
                     nrow(hits_dn) - n_dn_id,
                     nrow(hits_up) - n_up_id), 2, 2,
                   dimnames = list(direction = c("down", "up"),
                                   in_list = c("yes", "no"))))

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(sum(hits_dn$ensembl_id %in% id_DAVID_all_ensg), sum(hits_up$ensembl_id %in% id_DAVID_all_ensg), nrow(hits_dn), nrow(hits_up)), 2, 2)
## p-value = 0.000000000003697
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##     7.914837 1916.514987
## sample estimates:
## odds ratio 
##   47.39888

# Symbols of down-regulated hits that occur in the `id_DAVID_all` name list.
hits_dn$hgnc_symbol[hits_dn$hgnc_symbol %in% id_DAVID_all ]

##  [1] "KMT2E"   "SYN1"    "CDH1"    "NUP133"  "ATP2B1"  "OPHN1"   "KAT6A"  
##  [8] "ZC3H14"  "CTCF"    "SMC3"    "FBXW7"   "KMT5B"   "ARL6"    "CEP104" 
## [15] "ZMYM2"   "FGF13"   "DPP6"    "ATP8A2"  "CDK8"    "DPF2"    "STXBP1" 
## [22] "FBXO11"  "ASXL2"   "PHIP"    "TLK2"    "RBMX"    "AFF2"    "PHF6"   
## [29] "DYRK1A"  "ZNF148"  "HNRNPH1" "SOX11"   "DCC"     "USP7"    "ZNF292" 
## [36] "PGAP1"

# Symbols of up-regulated hits that occur in the `id_DAVID_all` name list.
hits_up$hgnc_symbol[hits_up$hgnc_symbol %in% id_DAVID_all ]

## [1] "ZNHIT3"

Heatmap of ID related genes

# Down-regulated hits that are in the ID gene list (Ensembl IDs).
idgenesensembl = hits_dn$ensembl_id[hits_dn$ensembl_id %in% id_DAVID_all_ensg]
# Expression of those genes; the first two columns of the table are dropped.
tpm_norm_count_table_ID = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% idgenesensembl,
                                               3:ncol(tpm_norm_count_table)]

# log10 transform with a 0.1 pseudocount.
tpm_norm_count_table_id = as.data.frame(log10(0.1 + tpm_norm_count_table_ID))
# Label rows with gene names. The lookup must follow the row order of the
# subsetted table: the `%in%` filter keeps table order, not the order of
# `idgenesensembl`, so matching on `idgenesensembl` attached names to the
# wrong rows (or failed when a hit was absent from the table).
rownames(tpm_norm_count_table_id) = genemapu$external_gene_name[
  match(rownames(tpm_norm_count_table_ID), genemapu$ensembl_gene_id)]

png(paste0(plots_directory, '/ID_heatmap.png'),
    width = 5000, height = 10000, res = 1200)
# Row-scaled heatmap; rows clustered, columns kept in their given order.
pheatmap(tpm_norm_count_table_id,
         cellheight = 10,
         treeheight_row = 0,
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         scale = "row", color = colorRampPalette(c("blue", "white", "red"))(100),
         angle_col = '315')
dev.off()

## quartz_off_screen 
##                 3

Extracellular exosome

# DAVID annotation table for the up-regulated hits.
up_fa = read.delim(paste0(outputs_directory, "hits_up_DAVID_KEGG.txt"))
# Genes annotated with the "extracellular exosome" GO term (comma-separated
# in the `Genes` column).
exosomal_genes = unique(unlist(strsplit(up_fa$Genes[up_fa$Term == "GO:0070062~extracellular exosome"], ", ")))
# Drop ENSG00000285762. A logical mask is used instead of `-which(...)`:
# when the ID is absent, `x[-integer(0)]` would return an EMPTY vector.
exosomal_genes = exosomal_genes[!(exosomal_genes %in% "ENSG00000285762")]

# Expression of the exosome genes (first two table columns dropped),
# log10 transformed with a 0.1 pseudocount.
tpm_norm_count_table_EX = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% exosomal_genes,
                                               3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_ex = as.data.frame(log10(0.1 + tpm_norm_count_table_EX))
# Label rows with HGNC symbols, looked up in row order.
rownames(tpm_norm_count_table_ex) = hits_up$hgnc_symbol[match(rownames(tpm_norm_count_table_ex), hits_up$ensembl_id)]

png(paste0(plots_directory, '/Exosome_heatmap_GeneNames.png'),
    width = 5000, height = 10000, res = 1200)
# Row-scaled heatmap; rows clustered, columns kept in their given order.
pheatmap(tpm_norm_count_table_ex,
         cellheight = 10,
         treeheight_row = 0,
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         scale = "row", color = colorRampPalette(c("blue", "white", "red"))(100),
         angle_col = '315')
dev.off()

## quartz_off_screen 
##                 3

DAVID analysis - downregulated genes

# DAVID enrichment table for the down-regulated genes.
go_dn <- read.delim(paste0(outputs_directory, "dn_engs_DAVID_KEGG.txt"))
# Sort by decreasing adjusted p-value, then keep terms with Benjamini < 0.05.
go_dn <- go_dn[order(go_dn$Benjamini, decreasing = TRUE), ]
go_dn <- go_dn[go_dn$Benjamini < 0.05, ]
# Prefix of each term (text before the first ":"), used to select term types.
go_dn$anyGo <- unlist(lapply(strsplit(go_dn$Term, ":"), `[[`, 1))
go_dn <- go_dn[go_dn$anyGo %in% c("GO", "hsa01100"), ]

# Horizontal bar plot of -log10 adjusted p-values; wide left margin for labels.
par(mfrow = c(1, 1), mar = c(5, 30, 1, 1))
barplot(-log10(go_dn$Benjamini),
        horiz = TRUE,
        names.arg = go_dn$Term,
        las = 2,
        xlim = c(0, 20),
        xlab = "-Log[10]B-H adj. P-val")
axis(1, lwd = 2, las = 2)

Genes related to nucleus

# Number of distinct genes listed across all retained `go_dn` terms.
length(unique(unlist(strsplit(go_dn$Genes,", "))))

## [1] 249

DAVID analysis - upregulated genes

# DAVID enrichment table for the up-regulated genes.
go_up <- read.delim(paste0(outputs_directory, "hits_up_DAVID_KEGG.txt"))
# Sort by decreasing adjusted p-value, then keep terms with Benjamini < 0.01.
go_up <- go_up[order(go_up$Benjamini, decreasing = TRUE), ]
go_up <- go_up[go_up$Benjamini < 0.01, ]
# Prefix of each term (text before the first ":"), used to select term types.
go_up$anyGo <- unlist(lapply(strsplit(go_up$Term, ":"), `[[`, 1))
go_up <- go_up[go_up$anyGo %in% c("GO", "hsa01100"), ]

# Horizontal bar plot of -log10 adjusted p-values; wide left margin for labels.
par(mfrow = c(1, 1), mar = c(5, 20, 1, 1))
barplot(-log10(go_up$Benjamini),
        horiz = TRUE,
        names.arg = go_up$Term,
        las = 2,
        xlim = c(0, 3),
        xlab = "-Log[10]B-H adj. P-val")
axis(1, lwd = 2, las = 2)

Mandy6 validation

# Pluripotency marker list (one gene ID per line, no header) and the raw
# gene-count table (first line of the file is skipped).
pluripotencyGenes = read.delim(paste0(outputs_directory, 'Conserved_Pluripotency_genes.txt'), header = FALSE, as.is = TRUE)
df = read.delim(paste0(outputs_directory, 'gene_counts_Mandy.txt'), skip = 1, as.is = TRUE)

# Reduce a BAM-path column name to the bare sample name by removing
# everything up to the common prefix and the common suffix. Working per name
# with sub() avoids the earlier positional strsplit()/seq() selection, which
# misaligned all names if a single column lacked the prefix.
strip_sample_name = function(x) {
  x = sub("^.*analyses.star.RNA_Seq_02.22_PanTro_iPSC_WT_", "", x)
  sub("_Rep_1_Aligned.sortedByCoord.out.bam$", "", x)
}

# TPM table: five sample columns plus column 6
# (presumably the gene length used by GetTPM - confirm).
countTable = df[, c(7, 8, 9, 10, 11, 6)]
rownames(countTable) = df$Geneid
chimp_tpm = GetTPM(countTable, 1:5, rownames(countTable))
colnames(chimp_tpm) = strip_sample_name(colnames(chimp_tpm))


# Raw counts of the same five samples for DESeq2; names are now derived from
# countTable itself rather than from ncol(chimp_tpm).
countTable = df[, c(7, 8, 9, 10, 11)]
rownames(countTable) = df$Geneid
colnames(countTable) = strip_sample_name(colnames(countTable))

# First four columns are labelled "Mandy", the fifth "SandraA".
coldata = data.frame(condition = c(rep("Mandy", 4), "SandraA"))
rownames(coldata) = colnames(countTable)
dds <- DESeqDataSetFromMatrix(
 countData = countTable,
 colData = coldata,
 design = ~ condition )

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Run the DESeq2 pipeline (size factors, dispersions, model fit and test).
dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# List the coefficients available from the fitted model.
resultsNames(dds)

## [1] "Intercept"                  "condition_SandraA_vs_Mandy"

# Differential expression results, ordered by adjusted p-value.
res <- results(dds)
res <- res[order(res$padj), ]

# Variance-stabilised values (design-aware, blind = FALSE).
vsdat <- vst(dds, blind = FALSE)
matvsdat <- assay(vsdat)

# Normalised counts (not variance-stabilised), restricted to genes whose
# summed normalised count over all samples exceeds 20.
mat <- counts(dds, normalized = TRUE)
mat <- mat[rowSums(mat) > 20, ]

# Pairwise density scatter plots of log2 normalised counts (pseudocount 0.1)
# for the three sample pairs, drawn on a 2 x 2 grid of square panels.
par(mfrow = c(2, 2), mar = c(5, 5, 5, 5), pty = "s", bty = "O")
sample_pairs <- list(c("Mandy4", "SandraA"),
                     c("Mandy6", "SandraA"),
                     c("Mandy4", "Mandy6"))
for (pair in sample_pairs) {
  heatscatter(log2(0.1 + mat[, pair[1]]), log2(0.1 + mat[, pair[2]]),
              colpal = "crazyblue", pch = 19, cex = 0.5,
              xlab = paste0(pair[1], " [log2(counts)]"),
              ylab = paste0(pair[2], " [log2(counts)]"))
  box(col = "black")
}

# TPM distribution of all genes versus the pluripotency genes, per sample
# (Mandy6, Mandy4, SandraA). Grey boxes: all genes; red boxes: pluripotency.
par(mfrow = c(1, 1), mar = c(7, 5, 5, 1), bty = "n")
is_pluripotency <- rownames(chimp_tpm) %in% pluripotencyGenes$V1
tpm_groups <- unlist(
  lapply(c("Mandy6", "Mandy4", "SandraA"), function(smp) {
    list(chimp_tpm[, smp], chimp_tpm[is_pluripotency, smp])
  }),
  recursive = FALSE
)
boxplot(tpm_groups,
        border = rep(c("gray", "red"), 3), main = "", col = "white",
        ylab = expression("TPM"), outline = FALSE,
        ylim = c(0, 60), bty = "n", notch = FALSE, lwd = 2,
        names = rep(c("all genes", "Pluripotency"), 3), las = 2)

ID

# Gene list exported from UniProt (keyword export, see file name),
# flattened from the space-separated `Gene.Names` field and mapped to
# Ensembl gene IDs.
id_DAVID_all = read.delim(paste0(outputs_directory, "uniprotkb_keyword_KW_0991_2023_09_01.tsv"),
                          header = TRUE)
id_DAVID_all = unlist(lapply(split(id_DAVID_all$Gene.Names, id_DAVID_all$Entry),
                             function(x){strsplit(x, " ")}))
id_DAVID_all_ensg = unique(genemapu$ensembl_gene_id[genemapu$external_gene_name %in% id_DAVID_all])
# `id`: listed genes that are also down-regulated hits.
id = id_DAVID_all_ensg[id_DAVID_all_ensg %in% hits_dn$ensembl_id]

# Genes annotated with the "extracellular exosome" GO term among the
# up-regulated hits (DAVID output).
up_fa = read.delim(paste0(outputs_directory, "hits_up_DAVID_KEGG.txt"))
exosomal_genes = unique(unlist(strsplit(up_fa$Genes[up_fa$Term == "GO:0070062~extracellular exosome"], ", ")))
# `exosome`: the same set without ENSG00000285762. A logical mask replaces
# `-which(...)`, which returns an EMPTY vector when the ID is not present.
exosome = exosomal_genes[!(exosomal_genes %in% "ENSG00000285762")]

# log10(TPM + 0.1) of the exosome genes (first two table columns dropped),
# with rows labelled by HGNC symbol.
tpm_norm_count_table_EX = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% exosomal_genes, 3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_ex = as.data.frame(log10(0.1 + tpm_norm_count_table_EX))
rownames(tpm_norm_count_table_ex) = hits_up$hgnc_symbol[match(rownames(tpm_norm_count_table_ex), hits_up$ensembl_id)]

DESeq2 analysis of gene expression in bulk cortex tissue - this paper

# Raw gene-count tables for the human and macaque cortex samples
# (first line of each file is skipped).
human_count   <- read.delim(paste0(outputs_directory, "gene_counts_human_dec2022.txt"), skip = 1)
macaque_count <- read.delim(paste0(outputs_directory, "gene_counts_macaque.txt"), skip = 1)
# Both tables must list the same genes in the same order (prints TRUE/FALSE).
all(human_count$Geneid == macaque_count$Geneid)

## [1] TRUE

# Combined count matrix: human count columns 7-11 followed by macaque
# count columns 7-8, with genes as row names.
APlab_count <- data.frame(human_count[, 7:11], macaque_count[, 7:8],
                          row.names = human_count$Geneid)
colnames(APlab_count) <- c("Hs_CTX_WT_Brain_S3A1_M",
                           "Hs_CTX_WT_Brain_S7A1_M",
                           "Hs_CTX_WT_Brain_S2A1_M",
                           "Hs_CTX_WT_Brain_S1A1_M",
                           "Hs_CTX_WT_Brain_S6A1_F",
                           "Mm_CTX_WT_Brain_10506_M",
                           "Mm_CTX_WT_Brain_10521_F")

# Sample sheet: five HS and two MM samples, with the sex of each sample.
brain_met <- data.frame(
  species = factor(rep(c("HS", "MM"), times = c(5, 2)), levels = c("HS", "MM")),
  sample = "WholeCortex",
  sex = c(rep("M", 4), "F", "M", "F"),
  row.names = colnames(APlab_count)
)

# DESeq2 model with species as the only covariate.
brain_bulk <- DESeqDataSetFromMatrix(countData = APlab_count,
                                     colData = brain_met,
                                     design = ~ species)

# Size factors are estimated first; DESeq() then reuses them.
brain_bulk <- estimateSizeFactors(brain_bulk)
brain_bulk <- DESeq(brain_bulk)

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Design-blind transformations of the bulk cortex data.
vst_data <- vst(brain_bulk, blind = TRUE)
log_data <- rlog(brain_bulk, blind = TRUE)

# Size-factor-normalised counts.
normalized_counts <- counts(brain_bulk, normalized = TRUE)

# HS versus MM contrast.
brain_bulk_PL_res <- results(brain_bulk, contrast = c("species", "HS", "MM"))

# Significant genes: padj available and below 0.01 ...
brain_bulk_PL_sig <- brain_bulk_PL_res[!is.na(brain_bulk_PL_res$padj), ]
brain_bulk_PL_sig <- brain_bulk_PL_sig[brain_bulk_PL_sig$padj < 0.01, ]
# ... split by the sign of the log2 fold change.
brain_bulk_PL_sig_down <- brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange < 0, ]
brain_bulk_PL_sig_up   <- brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange > 0, ]

Consider Khaitovich lab bulk RNA-seq data

# External bulk RNA-seq data: TPM values, sample metadata and raw counts.
# Note that `brain_met` is replaced by the external metadata table here.
brain_tpm    <- read.delim(paste0(outputs_directory, "Ext_RNASeq_TPMCOUNTS.tsv"),
                           header = TRUE, as.is = TRUE)
brain_met    <- read.delim(paste0(outputs_directory, "Ext_RNASeq_METADATA.tsv"),
                           header = TRUE, as.is = TRUE)
brain_counts <- read.delim(paste0(outputs_directory, "Ext_RNASeq_COUNTDATA.tsv"),
                           header = TRUE, as.is = TRUE)
# Index the metadata by sample name and put the count columns in that order.
rownames(brain_met) <- brain_met$sample_names
brain_counts <- brain_counts[, match(rownames(brain_met), colnames(brain_counts))]

# Saved gene annotation objects.
load(paste0(objects_directory, "ensembl_hg38_genemap.RData"))
load(paste0(objects_directory, "GTF_Annotation.RData"))
# One annotation row per Ensembl gene ID (first occurrence kept).
genemapu <- genemap[!duplicated(genemap$ensembl_gene_id), ]

Load objects from other vignettes

# Objects saved by other vignettes.
load(paste0(objects_directory, "bda_final.RData"))
load(paste0(objects_directory, "DEseq2_RNA.RData"))
# Gene list exported from UniProt (keyword export, see file name),
# flattened from the space-separated `Gene.Names` field and mapped to
# Ensembl gene IDs.
id_DAVID_all = read.delim(paste0(outputs_directory, "uniprotkb_keyword_KW_0991_2023_09_01.tsv"),
                          header = TRUE)
id_DAVID_all = unlist(lapply(split(id_DAVID_all$Gene.Names, id_DAVID_all$Entry),
                             function(x){strsplit(x, " ")}))
id_DAVID_all_ensg = unique(genemapu$ensembl_gene_id[genemapu$external_gene_name %in% id_DAVID_all])
# `id`: listed genes that are also down-regulated hits.
id = id_DAVID_all_ensg[id_DAVID_all_ensg %in% hits_dn$ensembl_id]

# Genes annotated with the "extracellular exosome" GO term among the
# up-regulated hits (DAVID output).
up_fa = read.delim(paste0(outputs_directory, "hits_up_DAVID_KEGG.txt"))
exosomal_genes = unique(unlist(strsplit(up_fa$Genes[up_fa$Term == "GO:0070062~extracellular exosome"], ", ")))
# `exosome`: the same set without ENSG00000285762. A logical mask replaces
# `-which(...)`, which returns an EMPTY vector when the ID is not present.
exosome = exosomal_genes[!(exosomal_genes %in% "ENSG00000285762")]

# log10(TPM + 0.1) of the exosome genes (first two table columns dropped),
# with rows labelled by HGNC symbol.
tpm_norm_count_table_EX = tpm_norm_count_table[rownames(tpm_norm_count_table) %in% exosomal_genes, 3:ncol(tpm_norm_count_table)]
tpm_norm_count_table_ex = as.data.frame(log10(0.1 + tpm_norm_count_table_EX))
rownames(tpm_norm_count_table_ex) = hits_up$hgnc_symbol[match(rownames(tpm_norm_count_table_ex), hits_up$ensembl_id)]

DESeq2 analysis of gene expression in bulk cortex tissue - this paper

# Raw gene-count tables for the human and macaque cortex samples
# (first line of each file is skipped).
human_count   <- read.delim(paste0(outputs_directory, "gene_counts_human_dec2022.txt"), skip = 1)
macaque_count <- read.delim(paste0(outputs_directory, "gene_counts_macaque.txt"), skip = 1)
# Both tables must list the same genes in the same order (prints TRUE/FALSE).
all(human_count$Geneid == macaque_count$Geneid)

## [1] TRUE

# Combined count matrix: human count columns 7-11 followed by macaque
# count columns 7-8, with genes as row names.
APlab_count <- data.frame(human_count[, 7:11], macaque_count[, 7:8],
                          row.names = human_count$Geneid)
colnames(APlab_count) <- c("Hs_CTX_WT_Brain_S3A1_M",
                           "Hs_CTX_WT_Brain_S7A1_M",
                           "Hs_CTX_WT_Brain_S2A1_M",
                           "Hs_CTX_WT_Brain_S1A1_M",
                           "Hs_CTX_WT_Brain_S6A1_F",
                           "Mm_CTX_WT_Brain_10506_M",
                           "Mm_CTX_WT_Brain_10521_F")

# Sample sheet: five HS and two MM samples, with the sex of each sample.
brain_met <- data.frame(
  species = factor(rep(c("HS", "MM"), times = c(5, 2)), levels = c("HS", "MM")),
  sample = "WholeCortex",
  sex = c(rep("M", 4), "F", "M", "F"),
  row.names = colnames(APlab_count)
)

# DESeq2 model with species as the only covariate.
brain_bulk <- DESeqDataSetFromMatrix(countData = APlab_count,
                                     colData = brain_met,
                                     design = ~ species)

# Size factors are estimated first; DESeq() then reuses them.
brain_bulk <- estimateSizeFactors(brain_bulk)
brain_bulk <- DESeq(brain_bulk)

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Design-blind transformations of the bulk cortex data.
vst_data <- vst(brain_bulk, blind = TRUE)
log_data <- rlog(brain_bulk, blind = TRUE)

# Size-factor-normalised counts.
normalized_counts <- counts(brain_bulk, normalized = TRUE)

# HS versus MM contrast.
brain_bulk_PL_res <- results(brain_bulk, contrast = c("species", "HS", "MM"))

# Significant genes: padj available and below 0.01 ...
brain_bulk_PL_sig <- brain_bulk_PL_res[!is.na(brain_bulk_PL_res$padj), ]
brain_bulk_PL_sig <- brain_bulk_PL_sig[brain_bulk_PL_sig$padj < 0.01, ]
# ... split by the sign of the log2 fold change.
brain_bulk_PL_sig_down <- brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange < 0, ]
brain_bulk_PL_sig_up   <- brain_bulk_PL_sig[brain_bulk_PL_sig$log2FoldChange > 0, ]

Consider Khaitovich lab bulk RNA-seq data

# External bulk RNA-seq data: TPM values, sample metadata and raw counts.
# Note that `brain_met` is replaced by the external metadata table here.
brain_tpm    <- read.delim(paste0(outputs_directory, "Ext_RNASeq_TPMCOUNTS.tsv"),
                           header = TRUE, as.is = TRUE)
brain_met    <- read.delim(paste0(outputs_directory, "Ext_RNASeq_METADATA.tsv"),
                           header = TRUE, as.is = TRUE)
brain_counts <- read.delim(paste0(outputs_directory, "Ext_RNASeq_COUNTDATA.tsv"),
                           header = TRUE, as.is = TRUE)
# Index the metadata by sample name and put the count columns in that order.
rownames(brain_met) <- brain_met$sample_names
brain_counts <- brain_counts[, match(rownames(brain_met), colnames(brain_counts))]

Retain normal Cortex and Female samples and perform DESeq2 based normalisation

# Select Khaitovich Lab samples whose source matches 'Cortex', with condition
# "Normal" and sex "F"; the same mask subsets counts (columns) and metadata
# (rows) so that they stay aligned.
keep_kl <- brain_met$lab == "Khaitovich Lab" &
  brain_met$sources %like% 'Cortex' &
  brain_met$condition == "Normal" &
  brain_met$sex == "F"
klrna <- brain_counts[, keep_kl]
brain_met_kl <- brain_met[keep_kl, ]

# DESeq2 data set with species as the only covariate.
brain_bulk_kl <- DESeqDataSetFromMatrix(countData = klrna,
                                        colData = brain_met_kl,
                                        design = ~ species)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Size-factor normalisation of the selected samples.
brain_bulk_kl <- estimateSizeFactors(brain_bulk_kl)
brain_bulk_kl_normalized <- counts(brain_bulk_kl, normalized = TRUE)

# Rebuild the data set from the same counts and metadata before the full
# differential expression run.
brain_bulk_kl <- DESeqDataSetFromMatrix(countData = klrna,
                                        colData = brain_met_kl,
                                        design = ~ species)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Estimate size factors, then fit the DESeq2 model (which reuses them).
brain_bulk_kl <- estimateSizeFactors(brain_bulk_kl)
brain_bulk_kl <- DESeq(brain_bulk_kl)

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Normalised counts from the fitted data set.
brain_bulk_kl_normalized <- counts(brain_bulk_kl, normalized = TRUE)

# Human versus each non-human species: MM, PT and PP.
brain_bulk_KL_res  <- results(brain_bulk_kl, contrast = c("species", "HS", "MM"))
brain_bulk_KL_res2 <- results(brain_bulk_kl, contrast = c("species", "HS", "PT"))
brain_bulk_KL_res3 <- results(brain_bulk_kl, contrast = c("species", "HS", "PP"))

Intellectual Disability-related genes

Plot showing the log fold change of expression of genes we identified in the analyses of iAstrocyte transcriptomes.

# HS-vs-MM bulk results restricted to the `id` gene set, with gene names.
id_PL_bulk = brain_bulk_PL_res[rownames(brain_bulk_PL_res) %in% id, ]
id_PL_bulk$Genename = genemapu$external_gene_name[match(rownames(id_PL_bulk), genemapu$ensembl_gene_id)]
# which() drops genes whose padj is NA (DESeq2 sets padj to NA for filtered
# genes); a logical subscript containing NA is not a valid row selector here.
# NOTE(review): this cutoff is 0.1 while the exosome set below uses 0.01 -
# confirm the difference is intended.
id_PL_bulk_sig = id_PL_bulk[which(id_PL_bulk$padj < 0.1), ]

# Same for the `exosome` gene set.
exosome_PL_bulk = brain_bulk_PL_res[rownames(brain_bulk_PL_res) %in% exosome, ]
exosome_PL_bulk_sig = exosome_PL_bulk[which(exosome_PL_bulk$padj < 0.01), ]

# Number of exosome-set genes present in the bulk results.
length(exosome_PL_bulk$log2FoldChange)

## [1] 49

# Number of ID-set genes present in the bulk results.
length(id_PL_bulk$log2FoldChange)

## [1] 36

# Distribution of HS-vs-MM log2 fold changes for the exosome and ID gene
# sets, with a dashed reference line at zero.
set_labels <- c("Exosome", "ID")
par(mfrow = c(1, 1), mar = c(5, 5, 1, 5))
boxplot(list(exosome_PL_bulk$log2FoldChange,
             id_PL_bulk$log2FoldChange),
        names = set_labels,
        col = "white",
        border = c("blue4", "red4"),
        las = 2,
        ylab = "log[2](FC) (Human/Macaque)")
abline(h = 0, lwd = 1, lty = 2)
# Redraw both axes and the frame with heavier lines.
axis(2, lwd = 2, las = 2)
axis(1, lwd = 2, at = c(1, 2), set_labels, las = 2)
box(col = "black", lwd = 2)

# One-sample t-test: is the mean exosome log2 fold change different from 0?
t.test(exosome_PL_bulk$log2FoldChange)

## 
##  One Sample t-test
## 
## data:  exosome_PL_bulk$log2FoldChange
## t = 4.1179, df = 48, p-value = 0.0001498
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.5752992 1.6731316
## sample estimates:
## mean of x 
##  1.124215

# One-sample t-test: is the mean ID-set log2 fold change different from 0?
t.test( id_PL_bulk$log2FoldChange )

## 
##  One Sample t-test
## 
## data:  id_PL_bulk$log2FoldChange
## t = -4.0094, df = 35, p-value = 0.0003038
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -1.0847568 -0.3555055
## sample estimates:
##  mean of x 
## -0.7201311

The same but based on published bulk RNA seq profiles

# Exosome gene set in the three published-data contrasts
# (res2: HS vs PT, res3: HS vs PP, res: HS vs MM).
exosome_KL_Pt <- brain_bulk_KL_res2[rownames(brain_bulk_KL_res2) %in% exosome, ]
exosome_KL_PP <- brain_bulk_KL_res3[rownames(brain_bulk_KL_res3) %in% exosome, ]
exosome_KL_Mm <- brain_bulk_KL_res[rownames(brain_bulk_KL_res) %in% exosome, ]

# ID gene set in the same three contrasts.
IntleDi_KL_Pt <- brain_bulk_KL_res2[rownames(brain_bulk_KL_res2) %in% id, ]
IntleDi_KL_PP <- brain_bulk_KL_res3[rownames(brain_bulk_KL_res3) %in% id, ]
IntleDi_KL_Mm <- brain_bulk_KL_res[rownames(brain_bulk_KL_res) %in% id, ]

# Box plot of the log2 fold changes per gene set and contrast; the same
# labels are used for the boxes and for the redrawn x axis.
kl_labels <- c("Exosome (Hs vs. Pt)",
               "Exosome (Hs vs. Pp)",
               "Exosome (Hs vs. Rm)",
               "ID (Hs vs. Pt)",
               "ID (Hs vs. Pp)",
               "ID (Hs vs. Rm)")
par(mfrow = c(1, 1), mar = c(12, 5, 1, 1), pty = "m")
boxplot(list(exosome_KL_Pt$log2FoldChange,
             exosome_KL_PP$log2FoldChange,
             exosome_KL_Mm$log2FoldChange,
             IntleDi_KL_Pt$log2FoldChange,
             IntleDi_KL_PP$log2FoldChange,
             IntleDi_KL_Mm$log2FoldChange),
        horizontal = FALSE,
        names = kl_labels,
        col = "white", outline = FALSE,
        border = rep(c("blue4", "red4"), each = 3), las = 2, lwd = 2,
        ylab = "log[2](FC) (Human/NHP)")
abline(h = 0, lwd = 2, lty = 2)
axis(2, lwd = 2, las = 2)
axis(1, lwd = 2, at = c(1:6), las = 2, labels = kl_labels)
box(col = "black", lwd = 2)

# One-sample t-test: exosome set, HS vs PT, mean log2 fold change against 0.
t.test(exosome_KL_Pt$log2FoldChange)

## 
##  One Sample t-test
## 
## data:  exosome_KL_Pt$log2FoldChange
## t = 4.4683, df = 48, p-value = 0.00004797
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.2864792 0.7552193
## sample estimates:
## mean of x 
## 0.5208493

# One-sample t-test: exosome set, HS vs PP, mean log2 fold change against 0.
t.test( exosome_KL_PP$log2FoldChange )

## 
##  One Sample t-test
## 
## data:  exosome_KL_PP$log2FoldChange
## t = 2.8454, df = 48, p-value = 0.006504
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.1072718 0.6240243
## sample estimates:
## mean of x 
## 0.3656481

# One-sample t-test: exosome set, HS vs MM, mean log2 fold change against 0.
t.test( exosome_KL_Mm$log2FoldChange )

## 
##  One Sample t-test
## 
## data:  exosome_KL_Mm$log2FoldChange
## t = 4.0998, df = 48, p-value = 0.0001587
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.3969305 1.1609365
## sample estimates:
## mean of x 
## 0.7789335

# One-sample t-test: ID set, HS vs PT, mean log2 fold change against 0.
t.test( IntleDi_KL_Pt$log2FoldChange )

## 
##  One Sample t-test
## 
## data:  IntleDi_KL_Pt$log2FoldChange
## t = -0.75818, df = 35, p-value = 0.4534
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.2247032  0.1025023
## sample estimates:
##   mean of x 
## -0.06110044

# One-sample t-test: ID set, HS vs PP, mean log2 fold change against 0.
t.test( IntleDi_KL_PP$log2FoldChange )

## 
##  One Sample t-test
## 
## data:  IntleDi_KL_PP$log2FoldChange
## t = -2.5948, df = 35, p-value = 0.01374
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.34656803 -0.04231215
## sample estimates:
##  mean of x 
## -0.1944401

# One-sample t-test: ID set, HS vs MM, mean log2 fold change against 0.
t.test( IntleDi_KL_Mm$log2FoldChange )

## 
##  One Sample t-test
## 
## data:  IntleDi_KL_Mm$log2FoldChange
## t = -2.7958, df = 35, p-value = 0.00835
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.50965182 -0.08086503
## sample estimates:
##  mean of x 
## -0.2952584

How many ID DEGs?

# Check that the PT and MM result tables list the ID genes in the same order.
all(rownames(IntleDi_KL_Pt)==rownames(IntleDi_KL_Mm))

## [1] TRUE

# Significant (padj < 0.01) ID genes with lower expression in human, summed
# over the three contrasts (PT, MM, PP); a gene significant in several
# contrasts is counted once per contrast. `na.rm = TRUE` keeps the total
# defined when DESeq2 reports NA for padj.
sum(IntleDi_KL_Pt$padj < 0.01 & IntleDi_KL_Pt$log2FoldChange < 0, na.rm = TRUE) +
  sum(IntleDi_KL_Mm$padj < 0.01 & IntleDi_KL_Mm$log2FoldChange < 0, na.rm = TRUE) +
  sum(IntleDi_KL_PP$padj < 0.01 & IntleDi_KL_PP$log2FoldChange < 0, na.rm = TRUE)

## [1] 18

# Significant (padj < 0.01) ID genes with higher expression in human, summed
# over the three contrasts (PT, MM, PP); a gene significant in several
# contrasts is counted once per contrast. `na.rm = TRUE` keeps the total
# defined when DESeq2 reports NA for padj.
sum(IntleDi_KL_Pt$padj < 0.01 & IntleDi_KL_Pt$log2FoldChange > 0, na.rm = TRUE) +
  sum(IntleDi_KL_Mm$padj < 0.01 & IntleDi_KL_Mm$log2FoldChange > 0, na.rm = TRUE) +
  sum(IntleDi_KL_PP$padj < 0.01 & IntleDi_KL_PP$log2FoldChange > 0, na.rm = TRUE)

## [1] 6

## Level of TBP is not changed in humans and macaques
"ENSG00000112592"

## [1] "ENSG00000112592"

# ENSG00000112592 in the published-data HS vs MM contrast.
brain_bulk_KL_res["ENSG00000112592",]

## log2 fold change (MLE): species HS vs MM 
## Wald test p-value: species HS vs MM 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000112592   141.195      -0.468244  0.267576  -1.74995 0.0801266
##                      padj
##                 <numeric>
## ENSG00000112592  0.156862

# ENSG00000112592 in this study's bulk cortex HS vs MM contrast.
brain_bulk_PL_res["ENSG00000112592",]

## log2 fold change (MLE): species HS vs MM 
## Wald test p-value: species HS vs MM 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000112592    185.88      -0.182397  0.298586  -0.61087  0.541285
##                      padj
##                 <numeric>
## ENSG00000112592  0.647498

Broader analysis of diseases

All the other diseases - do they show such a trend? Fix the gene names! https://www.mirbase.org/ftp.shtml

# Diseases with more than 30 entries in the annotation table.
choroby <- table(bda_final$Disease)
choroby <- choroby[choroby > 30]

# Gene IDs of those diseases, grouped by disease.
in_frequent_disease <- bda_final$Disease %in% names(choroby)
genes_by_disease <- split(bda_final$ensid[in_frequent_disease],
                          bda_final$Disease[in_frequent_disease])

# For one DESeq2 result table, return a list (one element per disease) of
# named log2 fold changes; genes with a missing log2 fold change are dropped.
lfc_by_disease <- function(deseq_res) {
  lapply(genes_by_disease, function(genes) {
    hit <- deseq_res[rownames(deseq_res) %in% genes, ]
    hit <- hit[!is.na(hit$log2FoldChange), ]
    setNames(hit$log2FoldChange, rownames(hit))
  })
}

# This study's bulk cortex (HS vs MM).
LFC_disease  <- lfc_by_disease(brain_bulk_PL_res)
# Published data, HS vs MM.
LFC_disease2 <- lfc_by_disease(brain_bulk_KL_res)
# Published data, HS vs PT.
LFC_disease3 <- lfc_by_disease(brain_bulk_KL_res2)


### --------------------
# One-sample t-test of a gene set's log2 fold changes against zero.
# Returns NA when fewer than two values are available, because t.test()
# stops with an error in that case and would abort the whole loop.
ChosenFunction = function(x){
  if( sum(!is.na(x)) < 2 ){ return(NA_real_) }
  t.test(x)$p.value }
LFC_disease_pv = vapply(LFC_disease,ChosenFunction,numeric(1))
LFC_disease2_pv = vapply(LFC_disease2,ChosenFunction,numeric(1))
LFC_disease3_pv = vapply(LFC_disease3,ChosenFunction,numeric(1))

# Median log2 fold change per disease gene set.
ChosenFunction = median
LFC_disease_fc = vapply(LFC_disease,ChosenFunction,numeric(1))
LFC_disease2_fc = vapply(LFC_disease2,ChosenFunction,numeric(1))
LFC_disease3_fc = vapply(LFC_disease3,ChosenFunction,numeric(1))

# NOTE(review): `cols` is not used by the plots in this chunk; kept in case it
# is referenced elsewhere.
cols=colorRampPalette(c("orange3","white","aquamarine3"))(length(LFC_disease_fc))

# Bar colour: highlighted when p < 0.05, gray otherwise (including p = NA).
significanceCols = function(pv){ ifelse(!is.na(pv) & pv<0.05,"aquamarine3","gray80") }

# Bars are ordered by increasing median fold change in all three plots.
par(mfrow=c(1,1),mar=c(15,4,5,1))
ord = order(LFC_disease_fc,decreasing=FALSE)
barplot(LFC_disease_fc[ord],
        col=significanceCols(LFC_disease_pv[ord]),
        las=2,axes=FALSE,ylim=c(-1,1),ylab="log[2]FC (Human/NHP)")
axis(2,lwd=2,las=2,cex.lab=1.5)

par(mfrow=c(1,1),mar=c(15,4,5,1))
ord = order(LFC_disease2_fc,decreasing=FALSE)
barplot(LFC_disease2_fc[ord],
        col=significanceCols(LFC_disease2_pv[ord]),
        las=2,axes=FALSE,ylab="log[2]FC (Human/NHP)")
axis(2,lwd=2,las=2,cex.lab=1.5)

par(mfrow=c(1,1),mar=c(15,4,1,1))

ord = order(LFC_disease3_fc,decreasing=FALSE)
barplot(LFC_disease3_fc[ord],
        col=significanceCols(LFC_disease3_pv[ord]),
        las=2,axes=FALSE,ylim=c(-0.4,0.4),ylab="log[2]FC (Human/NHP)")
axis(2,lwd=2,las=2,cex.lab=1.5)

TEADs - expression

library(beeswarm)
library(ggpubr)
library(dplyr)

# Size-factor normalised counts of the two bulk data sets.
normalized_counts_PL <- counts(brain_bulk, normalized = TRUE)
normalized_counts_KL <- counts(brain_bulk_kl, normalized = TRUE)

# Metadata rows: Khaitovich Lab, cortex sources, normal condition, female.
samples_KL <- brain_met[
  brain_met$lab == "Khaitovich Lab" &
    brain_met$sources %like% 'Cortex' &
    brain_met$condition == "Normal" &
    brain_met$sex == "F",
]

# Expression of ENSG00000007866 per sample.
# NOTE(review): the PL species labels are hard-coded (5 HS followed by 2 MM);
# confirm this matches the column order of normalized_counts_PL.
tead3_pl <- data.frame(
  expression = normalized_counts_PL['ENSG00000007866', ],
  species = c(rep("HS", 5), rep("MM", 2))
)

kl_species <- samples_KL$species[match(colnames(normalized_counts_KL), samples_KL$sample_names)]
tead3_kl <- data.frame(
  expression = normalized_counts_KL['ENSG00000007866', ],
  species = kl_species
)
tead3_kl <- tead3_kl[tead3_kl$species %in% c("HS", "PT", "MM"), ]
tead3_kl$species <- factor(tead3_kl$species, levels = c("HS", "PT", "MM"))

# Standard error of the mean.
se <- function(x) {
  sd(x) / sqrt(length(x))
}

# Mean +/- SE per species, first data set.
my_dat <- tead3_pl %>%
  group_by(species) %>%
  summarise(mean = mean(expression), se = se(expression))

ggplot(my_dat, aes(x = species, y = mean, fill = species)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_classic() +
  ylim(c(0, 50)) +
  scale_fill_manual(values = c('gray', 'blue'))

# Mean +/- SE per species, second data set.
my_dat <- tead3_kl %>%
  group_by(species) %>%
  summarise(mean = mean(expression), se = se(expression))

ggplot(my_dat, aes(x = species, y = mean, fill = species)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_classic() +
  ylim(c(0, 90)) +
  scale_fill_manual(values = c('gray', 'red', 'blue'))

P-values

brain_bulk_KL_res['ENSG00000007866',]

## log2 fold change (MLE): species HS vs MM 
## Wald test p-value: species HS vs MM 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000007866   39.6105        1.75034  0.441979   3.96023 0.000074879
##                      padj
##                 <numeric>
## ENSG00000007866 0.0004964

brain_bulk_KL_res3['ENSG00000007866',]

## log2 fold change (MLE): species HS vs PP 
## Wald test p-value: species HS vs PP 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000007866   39.6105        1.16246  0.350057   3.32078 0.000897662
##                       padj
##                  <numeric>
## ENSG00000007866 0.00614593

brain_bulk_PL_sig['ENSG00000007866',]

## log2 fold change (MLE): species HS vs MM 
## Wald test p-value: species HS vs MM 
## DataFrame with 1 row and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat     pvalue
##                 <numeric>      <numeric> <numeric> <numeric>  <numeric>
## ENSG00000007866   30.2115        1.78403  0.581936   3.06568 0.00217175
##                       padj
##                  <numeric>
## ENSG00000007866 0.00825217

Regulome analysis - chromatin structure

Analysis of domains

Domains were identified using TopDom. We read them in here. We consider boundaries that have support in two replicates. The first chunk lifts over the boundary coordinates between the human and chimpanzee assemblies.

# Domain calls per sample (two "hs_" files, two "pt_" files).
ele_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/hs_ele_krnorm.all.25kb.topdom.bedpe")
)
fas_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/hs_pf_krnorm.all.25kb.topdom.bedpe")
)
man_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/pt_mandy_krnorm.all.25kb.topdom.bedpe")
)
saa_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/pt_sandra_krnorm.all.25kb.topdom.bedpe")
)

# Boundaries of the "hs_" samples lifted over with chain_HsPt; exported as BED
# and stored for later chunks.
ele_domains_lift_over_Pt6 <- liftOverBoundaries(ele_domains, chain_HsPt, WSize = 500)
fas_domains_lift_over_Pt6 <- liftOverBoundaries(fas_domains, chain_HsPt, WSize = 500)
bed_file <- c(
  ele_domains_lift_over_Pt6$lifted_over,
  fas_domains_lift_over_Pt6$lifted_over
)
export.bed(bed_file, con = paste0(outputs_directory, "ele_fas_boundaries_lift_Pt6.bed"))
save(
  ele_domains_lift_over_Pt6,
  fas_domains_lift_over_Pt6,
  file = paste0(objects_directory, "ele_fas_boundaries_lift_Pt6.RData")
)

# Boundaries of the "pt_" samples lifted over with chain_PtHs; same outputs.
man_domains_lift_over_hg38 <- liftOverBoundaries(man_domains, chain_PtHs, WSize = 500)
saa_domains_lift_over_hg38 <- liftOverBoundaries(saa_domains, chain_PtHs, WSize = 500)
bed_file <- c(
  man_domains_lift_over_hg38$lifted_over,
  saa_domains_lift_over_hg38$lifted_over
)
export.bed(bed_file, con = paste0(outputs_directory, "man_saa_boundaries_lift_Hg38.bed"))
save(
  man_domains_lift_over_hg38,
  saa_domains_lift_over_hg38,
  file = paste0(objects_directory, "man_saa_domains_lift_Hg38.RData")
)

We display the reproducibility

# Domain calls per sample and the previously stored lift-over results.
ele_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/hs_ele_krnorm.all.25kb.topdom.bedpe")
)
fas_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/hs_pf_krnorm.all.25kb.topdom.bedpe")
)
man_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/pt_mandy_krnorm.all.25kb.topdom.bedpe")
)
saa_domains <- readTADs(
  paste0(outputs_directory, "25kb_domains/pt_sandra_krnorm.all.25kb.topdom.bedpe")
)

load(paste0(objects_directory, "man_saa_domains_lift_Hg38.RData"))
load(paste0(objects_directory, "ele_fas_boundaries_lift_Pt6.RData"))

# Combined boundary sets of the two replicates of each species.
all_human_boundaires <- getAllBoundaries(ele_domains$boundaries, fas_domains$boundaries)
all_chimp_boundaires <- getAllBoundaries(man_domains$boundaries, saa_domains$boundaries)

# Venn diagram: which combined boundaries are supported by each human replicate.
hits_ele <- findOverlaps(all_human_boundaires, ele_domains$boundaries)
hits_fas <- findOverlaps(all_human_boundaires, fas_domains$boundaries)
peak_list <- list(
  ELE30 = unique(queryHits(hits_ele)),
  PF = unique(queryHits(hits_fas))
)
ggVennDiagram(peak_list, label_alpha = 0) + scale_fill_distiller(direction = 1)

# Same for the chimpanzee replicates.
hits_saa <- findOverlaps(all_chimp_boundaires, saa_domains$boundaries)
hits_man <- findOverlaps(all_chimp_boundaires, man_domains$boundaries)
peak_list <- list(
  Sandra = unique(queryHits(hits_saa)),
  Mandy = unique(queryHits(hits_man))
)
ggVennDiagram(peak_list, label_alpha = 0) + scale_fill_distiller(direction = 1)

# Stacked barplot of hard-coded counts, one column per species.
# NOTE(review): the counts are typed in by hand and the y-axis says "Loops"
# although this chunk deals with boundaries - confirm both.
m <- cbind(
  c(6100, 1021, 769),
  c(6000, 593, 1131)
)

barplot(
  m,
  col = c("green4", "steelblue3", "blue4"),
  ylim = c(0, 8000),
  ylab = "Loops",
  names = c("Human", "Chimpanzee")
)
axis(2, lwd = 2)

export.bed(
  all_human_boundaires,
  con = paste0(outputs_directory, "all_human_boundaires_input.bed")
)
export.bed(
  all_chimp_boundaires,
  con = paste0(outputs_directory, "all_chimp_boundaires.bed")
)

Check the evolutionary conservation of the reproducible boundaries.

# Boundaries of the first replicate that overlap a boundary of the second
# replicate. unique() keeps every boundary once even when it overlaps several
# boundaries of the other replicate (plain queryHits() would repeat it).
human_boundaires_reproducible = ele_domains$boundaries[unique(queryHits(findOverlaps(ele_domains$boundaries,fas_domains$boundaries)))]

# liftOverBoundaries() is given a list with a `boundaries` element.
chimp_domains = list( boundaries = man_domains$boundaries[unique(queryHits(findOverlaps(man_domains$boundaries,saa_domains$boundaries)))] )

# Lift the reproducible chimpanzee boundaries over with chain_PtHs and store
# both sets for the next chunk.
chimp_domains_lift_over_hg38 = liftOverBoundaries( chimp_domains, chain_PtHs, WSize = 500 )
export.bed(chimp_domains_lift_over_hg38$lifted_over,con=paste0(outputs_directory,"chimp_domains_lift_over_hg38.bed"))
save( chimp_domains_lift_over_hg38, file=paste0(objects_directory,"chimp_domains_lift_over_hg38.RData"))
save(human_boundaires_reproducible,file=paste0(objects_directory,"human_boundaires_reproducible.RData"))

Display the result

load(paste0(objects_directory, "chimp_domains_lift_over_hg38.RData"))
load(paste0(objects_directory, "human_boundaires_reproducible.RData"))

chimp_lifted_hg38 <- chimp_domains_lift_over_hg38$lifted_over
chimp_original_pt <- chimp_domains_lift_over_hg38$original

# Venn diagram of the combined boundary set: supported by the reproducible
# human boundaries vs. by the lifted-over chimpanzee boundaries.
allBound <- getAllBoundaries(human_boundaires_reproducible, chimp_lifted_hg38)
peak_list <- list(
  Human = unique(queryHits(findOverlaps(allBound, human_boundaires_reproducible))),
  Chimp = unique(queryHits(findOverlaps(allBound, chimp_lifted_hg38)))
)
ggVennDiagram(peak_list, label_alpha = 0) + scale_fill_distiller(direction = 1)

# Human boundaries overlapping a lifted-over chimpanzee boundary ...
shared_idx <- unique(queryHits(findOverlaps(human_boundaires_reproducible, chimp_lifted_hg38)))
all_evol_shared_boundaries <- human_boundaires_reproducible[shared_idx]

# ... and the matching chimpanzee boundaries in their original coordinates,
# selected through the names shared by the lifted-over and original ranges.
shared_chimp_names <- names(
  chimp_lifted_hg38[queryHits(findOverlaps(chimp_lifted_hg38, all_evol_shared_boundaries))]
)
all_evol_shared_boundaries_Pt <- chimp_original_pt[
  which(names(chimp_original_pt) %in% shared_chimp_names)
]

Identification of human- and chimpanzee-specific domain boundaries

Sometimes a boundary called in human lies, in the chimpanzee data, in a region with no reads or with very low mappability, and vice versa. We want to get rid of those instances. Boundaries called in human should not be in the vicinity of low-coverage regions in either human or chimpanzee. Likewise, boundaries called in chimpanzee should not be in the vicinity of low-coverage bins in either chimpanzee or human.

# Bins with little long-range Hi-C signal.
#   hic          - named list (one element per chromosome); each element has a
#                  sparse matrix in $LFM
#   bins         - genomic bins; subset per chromosome with chrom()
#                  (assumes bins of a chromosome are in the row order of the
#                  matrix - TODO confirm)
#   min_distance - only entries whose column index exceeds the row index by
#                  more than this many bins are counted
#   min_contacts - bins whose summed counted signal is below this value are
#                  returned
# Returns the selected bins of all chromosomes combined with c().
findLowCoverageBins = function( hic, bins, min_distance = 200, min_contacts = 100 ){
  do.call("c", lapply( as.list(names(hic)), function(x){
    print(x)
    thischr = bins[which(chrom(bins)==x)]
    m = hic[[x]]
    # triplet (i, j, x) representation of the sparse matrix
    tp = as.data.frame(summary(m$LFM))
    tp = tp[tp$j > (tp$i+min_distance),]
    # rebuild a matrix holding only the retained entries and symmetrise it so
    # that row sums count the contacts of a bin in both directions
    M = Matrix::Matrix(0, nrow=nrow(m$LFM),ncol=ncol(m$LFM),sparse=TRUE )
    M[cbind(tp$i,tp$j)] = tp$x
    M = M + t(M)
    thischr[which(rowSums(M)<min_contacts)] } ) ) }

lowCoverageBinsHG38 = findLowCoverageBins( ele, gagr )
export.bed(lowCoverageBinsHG38,con=paste0(outputs_directory,"lowCoverageBinsHG38.bed"))

lowCoverageBinsPT6 = findLowCoverageBins( mandy, gagr_pt )
export.bed(lowCoverageBinsPT6,con=paste0(outputs_directory,"lowCoverageBinsPaT6.bed"))


lowCoverageBinsHG38_2 = findLowCoverageBins( fa, gagr )
export.bed(lowCoverageBinsHG38_2,con=paste0(outputs_directory,"lowCoverageBinsHG38_2.bed"))

lowCoverageBinsPT6_2 = findLowCoverageBins( sa, gagr_pt )
export.bed(lowCoverageBinsPT6_2,con=paste0(outputs_directory,"lowCoverageBinsPaT6_2.bed"))

save( lowCoverageBinsHG38, lowCoverageBinsPT6, file=paste0(objects_directory,"lowCoverageBins.RData") )
save( lowCoverageBinsHG38_2, lowCoverageBinsPT6_2, file=paste0(objects_directory,"lowCoverageBins2.RData") )

Lift over these intervals of low coverage. Then lift over the boundaries for the next steps of the analyses.

load(paste0(objects_directory,"lowCoverageBins.RData"))
load(paste0(objects_directory,"lowCoverageBins2.RData"))

# Widen every low-coverage bin to 50 kb around its centre.
lowCoverageBinsHG38 = GenomicRanges::resize(lowCoverageBinsHG38,50000,fix="center")
lowCoverageBinsHG38_2 = GenomicRanges::resize(lowCoverageBinsHG38_2,50000,fix="center")

lowCoverageBinsPT6 = GenomicRanges::resize(lowCoverageBinsPT6,50000,fix="center")
lowCoverageBinsPT6_2 = GenomicRanges::resize(lowCoverageBinsPT6_2,50000,fix="center")

# Keep the windows of the first replicate that overlap a window of the second
# replicate. unique() keeps each window once; without it a window is repeated
# for every overlapping window of the other replicate.
lowCoverageBinsHG38 = lowCoverageBinsHG38[unique(queryHits(findOverlaps(lowCoverageBinsHG38,lowCoverageBinsHG38_2)))]
lowCoverageBinsPT6 = lowCoverageBinsPT6[unique(queryHits(findOverlaps(lowCoverageBinsPT6,lowCoverageBinsPT6_2)))]

# Lift over the central 500 bp of each window, then widen back to 50 kb.
# used to be 10000
lowCoverageBinsHG38_Pt = GenomicRanges::resize( unlist(liftOver(GenomicRanges::resize(lowCoverageBinsHG38,500,fix="center"), 
                                                                chain = chain_HsPt)), 50000, fix="center")

lowCoverageBinsPt_Hg38 = GenomicRanges::resize( unlist(liftOver(GenomicRanges::resize(lowCoverageBinsPT6,500,fix="center"), 
                                                                chain = chain_PtHs)), 50000, fix="center")

export.bed(lowCoverageBinsPt_Hg38,con=paste0(outputs_directory,"lowCoverageBinsPt_Hg38.bed"))
# Store BOTH lifted-over sets (the second one was previously listed twice, so
# lowCoverageBinsHG38_Pt never reached the .RData file).
save( lowCoverageBinsHG38_Pt, lowCoverageBinsPt_Hg38,
      file=paste0(objects_directory,"low_coverage_bins_lifted_over.RData"))

Get to the list of human and chimp specific boundaries. Consider boundaries observed in both replicates. Remove boundaries within regions with poor mappability in both species (50kb intervals centered on the lifted over region).

load( paste0(objects_directory, "ele_fas_boundaries_lift_Pt6.RData") )
load( paste0(objects_directory, "man_saa_domains_lift_Hg38.RData") )

### ----------- 
# Combined boundary sets of the two replicates of each species, in the
# coordinates in which they were called.
all_human_boundaires_input = getAllBoundaries( ele_domains$boundaries, 
                                               fas_domains$boundaries )
all_chimp_boundaires_input = getAllBoundaries( man_domains$boundaries,
                                               saa_domains$boundaries )

# Combined chimpanzee boundaries after lift-over.
all_chimp_boundaires_Hg38 = getAllBoundaries( man_domains_lift_over_hg38$lifted_over,
                                              saa_domains_lift_over_hg38$lifted_over )
export.bed( all_chimp_boundaires_Hg38, con=paste0(outputs_directory,"all_chimp_boundaires_Hg38_tp.bed" ))

### --------------------
### remove boundaries that intersect poorly mappable regions in the two species
# A logical mask is used instead of x[-queryHits(...)]: with no overlaps the
# negative index is empty and would drop every boundary instead of none.
all_human_boundaires = all_human_boundaires_input[ !overlapsAny(all_human_boundaires_input,c( lowCoverageBinsHG38,lowCoverageBinsPt_Hg38) ) ]
all_chimp_boundaires = all_chimp_boundaires_input[ !overlapsAny(all_chimp_boundaires_input,c(lowCoverageBinsPT6,lowCoverageBinsHG38_Pt)) ]

export.bed( all_human_boundaires, con=paste0(outputs_directory,"all_human_boundaires_Hg38.bed" ) )
export.bed( all_chimp_boundaires, con=paste0(outputs_directory,"all_chimp_boundaires_Pt6.bed" ) )

Liftovers: - we pick the longest one - lift over needs to be on the same chromosome.

Identify species specific boundaries. To call a boundary species specific it needs to be: - found in both replicates of this species - not in a poorly mappable region in either of the two species - never found in the other species - be amenable for liftOver.

# All filters below use logical masks (!overlapsAny) instead of
# x[-queryHits(...)]: with no overlaps the negative index is empty and would
# remove every range instead of none.

# Reproducible boundaries without a counterpart among the lifted-over
# boundaries of the other species, and outside low-coverage windows.
human_specific_boundaries = human_boundaires_reproducible[ !overlapsAny(human_boundaires_reproducible,all_chimp_boundaires_Hg38 ) ]
human_specific_boundaries = human_specific_boundaries[ !overlapsAny(human_specific_boundaries,reduce(c(lowCoverageBinsHG38,lowCoverageBinsPt_Hg38)) ) ]

chimp_specific_boundaries = chimp_domains$boundaries[ !overlapsAny(chimp_domains$boundaries,reduce(c(ele_domains_lift_over_Pt6$lifted_over,
                                                                                                     fas_domains_lift_over_Pt6$lifted_over)) ) ]
chimp_specific_boundaries = chimp_specific_boundaries[ !overlapsAny(chimp_specific_boundaries,reduce(c(lowCoverageBinsPT6,lowCoverageBinsHG38_Pt)) ) ]

## Boundaries need to be able to be lifted over! Otherwise we do not know if the boundary is lost because it is not lifted over or it is lost because it was not called
chimp_specific_boundaries_Hg38 = liftOverBoundaries(list(boundaries=chimp_specific_boundaries), chain_PtHs, WSize = 500 )
human_specific_boundaries_Pt = liftOverBoundaries(list(boundaries=human_specific_boundaries), chain_HsPt, WSize = 500 )

## double filtering for lift overs, any chimp boundary lifted over to hg38 should not be observed in human boundaries
# Keep the original ranges whose names also occur among the lifted-over ranges
# and widen both versions to 50 kb around their centre.
chimp_specific_boundaries = GenomicRanges::resize(chimp_specific_boundaries_Hg38$original[which(names(chimp_specific_boundaries_Hg38$original) %in% names(chimp_specific_boundaries_Hg38$lifted_over))],50000,fix="center")
chimp_specific_boundaries_Hg38 = GenomicRanges::resize(chimp_specific_boundaries_Hg38$lifted_over,50000,fix="center")
# The filtering below indexes the original and lifted-over sets in parallel,
# so they must be aligned element by element; stop if they are not.
stopifnot( identical(names(chimp_specific_boundaries),names(chimp_specific_boundaries_Hg38)) )

human_specific_boundaries = GenomicRanges::resize(human_specific_boundaries_Pt$original[which(names(human_specific_boundaries_Pt$original) %in% names(human_specific_boundaries_Pt$lifted_over))],50000,fix="center")
human_specific_boundaries_Pt = GenomicRanges::resize(human_specific_boundaries_Pt$lifted_over,50000,fix="center")
stopifnot( identical(names(human_specific_boundaries),names(human_specific_boundaries_Pt)) )

# Drop boundaries whose lifted-over version overlaps any boundary called in the
# other species; the same mask is applied to both coordinate versions.
keep_chimp = !overlapsAny(chimp_specific_boundaries_Hg38,all_human_boundaires_input)
keep_human = !overlapsAny(human_specific_boundaries_Pt,all_chimp_boundaires_input)

chimp_specific_boundaries = chimp_specific_boundaries[keep_chimp]
human_specific_boundaries = human_specific_boundaries[keep_human]

chimp_specific_boundaries_Hg38 = chimp_specific_boundaries_Hg38[keep_chimp]
human_specific_boundaries_Pt = human_specific_boundaries_Pt[keep_human]

export.bed( chimp_specific_boundaries_Hg38,
            con=paste0(outputs_directory,"/chimp_specific_boundaries_Hg38.bed" ))

export.bed( human_specific_boundaries_Pt,
            con=paste0(outputs_directory,"human_specific_boundaries_Pt.bed" ) )

export.bed( human_specific_boundaries, 
            con=paste0(outputs_directory,"human_specific_boundaries.bed" ) )

export.bed( chimp_specific_boundaries, 
            con=paste0(outputs_directory,"chimp_specific_boundaries.bed" ) )

##########################
species_specific_boundaries = c( human_specific_boundaries, chimp_specific_boundaries_Hg38 )
export.bed( species_specific_boundaries, 
            con=paste0(outputs_directory,"species_specific_boundaries.bed" ) )
save( human_specific_boundaries, chimp_specific_boundaries_Hg38,human_specific_boundaries_Pt,chimp_specific_boundaries,
      file=paste0(objects_directory,"species_specific_boundaries.RData"))
##########################

We have the species specific boundaries

# Restore the stored boundary sets and low-coverage windows, then rebuild the
# combined set of species-specific boundaries.
load(paste0(objects_directory, "species_specific_boundaries.RData"))
load(paste0(objects_directory, "low_coverage_bins_lifted_over.RData"))
species_specific_boundaries <- c(
  human_specific_boundaries,
  chimp_specific_boundaries_Hg38
)

HiC data normalisation: human data

load(paste0(objects_directory, 'si.RData'))

# Chromosomes processed for the human samples and the number of IPF iterations.
chroms <- paste0("chr", c(1:22, "X"))
itn <- 20

# Same-chromosome pairs.
# NOTE(review): chroms_combs_hs is not used within this chunk.
chroms_combs_hs <- data.frame(
  V1 = paste0("chr", c(1:22, "X")),
  V2 = paste0("chr", c(1:22, "X")),
  stringsAsFactors = FALSE
)

# Read the dumped per-chromosome matrices, run IPF on each and store the result.
ele_lfm_5kb <- read.hic_files(
  paste0(dumped_directory_ele), "", ".matrix.txt", ga, chroms
)
ele <- lapply(ele_lfm_5kb, IPF, numberOfIterations = itn)
save(ele, file = paste0(objects_directory, "Ele30_hic.RData"))

fa_lfm_5kb <- read.hic_files(
  paste0(dumped_directory_fa), "", ".matrix.txt", ga, chroms
)
fa <- lapply(fa_lfm_5kb, IPF, numberOfIterations = itn)
save(fa, file = paste0(objects_directory, "FetalAstrocytes_hic.RData"))

HiC data normalisation: chimpanzee data

load(paste0(objects_directory, 'si_pt.RData'))

# Chromosomes processed for the chimpanzee samples (chr2 is split into 2A/2B)
# and the number of IPF iterations.
chroms <- paste0("chr", c(c(1, "2A", "2B", 3:22), "X"))
itn <- 20

# Same-chromosome pairs.
# NOTE(review): chroms_combs_pt is not used within this chunk.
chroms_combs_pt <- data.frame(
  V1 = paste0("chr", c(c(1, "2A", "2B", 3:22), "X")),
  V2 = paste0("chr", c(c(1, "2A", "2B", 3:22), "X")),
  stringsAsFactors = FALSE
)

## ---------------
# Read the dumped per-chromosome matrices, run IPF on each and store the result.
mandy_lfm_5kb <- read.hic_files(
  paste0(dumped_directory_mandy), "", ".matrix.txt", ga_pt,
  chroms = chroms
)
mandy <- lapply(mandy_lfm_5kb, IPF, numberOfIterations = itn)
save(mandy, file = paste0(objects_directory, "Mandy_hic.RData"))

## ---------------
sa_lfm_5kb <- read.hic_files(
  paste0(dumped_directory_sandra), "", ".matrix.txt", ga_pt,
  chroms = chroms
)
sa <- lapply(sa_lfm_5kb, IPF, numberOfIterations = itn)
save(sa, file = paste0(objects_directory, "SandraA_hic.RData"))

Boundary strength – boxplot of Insulation score

# Insulation scores at three boundary sets (human-specific, chimp-specific,
# shared) in each of the four Hi-C samples. Every result is saved under its own
# variable name because the next chunk restores the objects with load().
# Human samples (ele, fa) are scored with gagr bins and the hg38-coordinate
# version of each boundary set.
# NOTE(review): the meaning of the trailing arguments 5, 3, 10 is defined by
# InsulationScore(), which is not shown here - confirm before changing them.
human_spe_bound_IS_ele = InsulationScore( human_specific_boundaries, 
                                          ele, gagr, 5, 3, 10 )
save(human_spe_bound_IS_ele,file=paste0(objects_directory,"human_spe_bound_IS_ele.RData"))

human_spe_bound_IS_fas = InsulationScore( human_specific_boundaries, 
                                           fa, gagr, 5, 3, 10 )
save(human_spe_bound_IS_fas,file=paste0(objects_directory,"human_spe_bound_IS_fas.RData"))

chimp_spe_bound_IS_ele = InsulationScore( chimp_specific_boundaries_Hg38, 
                                          ele, gagr, 5, 3, 10 )
save(chimp_spe_bound_IS_ele,file=paste0(objects_directory,"chimp_spe_bound_IS_ele.RData"))

chimp_spe_bound_IS_fas = InsulationScore( chimp_specific_boundaries_Hg38, 
                                          fa, gagr, 5, 3, 10 )
save(chimp_spe_bound_IS_fas,file=paste0(objects_directory,"chimp_spe_bound_IS_fas.RData"))

shared_bound_IS_ele = InsulationScore( all_evol_shared_boundaries, 
                                       ele, gagr, 5, 3, 10 )
save(shared_bound_IS_ele,file=paste0(objects_directory,"shared_bound_IS_ele.RData"))

shared_bound_IS_fas = InsulationScore( all_evol_shared_boundaries, 
                                       fa, gagr, 5, 3, 10 )
save(shared_bound_IS_fas,file=paste0(objects_directory,"shared_bound_IS_fas.RData"))


## values for chimpanzee samples
# Chimpanzee samples (mandy, sa) are scored with gagr_pt bins and the
# chimpanzee-coordinate version of each boundary set.
human_spe_bound_IS_mandy = InsulationScore( human_specific_boundaries_Pt, 
                                            mandy, gagr_pt, 5, 3, 10 )
save(human_spe_bound_IS_mandy,file=paste0(objects_directory,"human_spe_bound_IS_mandy.RData"))

human_spe_bound_IS_sandraA = InsulationScore( human_specific_boundaries_Pt, 
                                              sa, gagr_pt, 5, 3, 10 )
save(human_spe_bound_IS_sandraA,file=paste0(objects_directory,"human_spe_bound_IS_sandraA.RData"))

chimp_spe_bound_IS_mandy = InsulationScore( chimp_specific_boundaries, 
                                            mandy, gagr_pt, 5, 3, 10 )
save(chimp_spe_bound_IS_mandy,file=paste0(objects_directory,"chimp_spe_bound_IS_mandy.RData"))

chimp_spe_bound_IS_sandraA = InsulationScore( chimp_specific_boundaries, 
                                              sa, gagr_pt, 5, 3, 10 )
save(chimp_spe_bound_IS_sandraA,file=paste0(objects_directory,"chimp_spe_bound_IS_sandraA.RData"))

shared_bound_IS_mandy = InsulationScore( all_evol_shared_boundaries_Pt, 
                                        mandy, gagr_pt, 5, 3, 10 )
save(shared_bound_IS_mandy,file=paste0(objects_directory,"shared_bound_IS_mandy.RData"))

shared_bound_IS_sandraA = InsulationScore( all_evol_shared_boundaries_Pt, 
                                       sa, gagr_pt, 5, 3, 10 )
save(shared_bound_IS_sandraA,file=paste0(objects_directory,"shared_bound_IS_sandraA.RData"))

Species specific boundaries, insulation change

# Restore the twelve insulation-score tables (3 boundary sets x 4 samples).
load(paste0(objects_directory, "human_spe_bound_IS_mandy.RData"))
load(paste0(objects_directory, "human_spe_bound_IS_fas.RData"))
load(paste0(objects_directory, "human_spe_bound_IS_ele.RData"))
load(paste0(objects_directory, "human_spe_bound_IS_sandraA.RData"))

load(paste0(objects_directory, "chimp_spe_bound_IS_mandy.RData"))
load(paste0(objects_directory, "chimp_spe_bound_IS_ele.RData"))
load(paste0(objects_directory, "chimp_spe_bound_IS_fas.RData"))
load(paste0(objects_directory, "chimp_spe_bound_IS_sandraA.RData"))

load(paste0(objects_directory, "shared_bound_IS_ele.RData"))
load(paste0(objects_directory, "shared_bound_IS_fas.RData"))
load(paste0(objects_directory, "shared_bound_IS_mandy.RData"))
load(paste0(objects_directory, "shared_bound_IS_sandraA.RData"))

# log2 of (mean of columns 1 and 3) over column 2 of an insulation-score table.
insulationLog2 <- function(IS) {
  log2(rowMeans(IS[, c(1, 3)]) / IS[, 2])
}

# Each table is replaced by its per-boundary log2 ratio.
human_spe_bound_IS_ele <- insulationLog2(human_spe_bound_IS_ele)
human_spe_bound_IS_mandy <- insulationLog2(human_spe_bound_IS_mandy)
human_spe_bound_IS_fas <- insulationLog2(human_spe_bound_IS_fas)
human_spe_bound_IS_sandraA <- insulationLog2(human_spe_bound_IS_sandraA)

chimp_spe_bound_IS_ele <- insulationLog2(chimp_spe_bound_IS_ele)
chimp_spe_bound_IS_mandy <- insulationLog2(chimp_spe_bound_IS_mandy)
chimp_spe_bound_IS_fas <- insulationLog2(chimp_spe_bound_IS_fas)
chimp_spe_bound_IS_sandraA <- insulationLog2(chimp_spe_bound_IS_sandraA)

shared_bound_IS_mandy <- insulationLog2(shared_bound_IS_mandy)
shared_bound_IS_sandraA <- insulationLog2(shared_bound_IS_sandraA)
shared_bound_IS_ele <- insulationLog2(shared_bound_IS_ele)
shared_bound_IS_fas <- insulationLog2(shared_bound_IS_fas)

# Shared boundaries: pool both replicates per species, finite values only.
is_hs <- c(shared_bound_IS_ele, shared_bound_IS_fas)
is_hs <- is_hs[is.finite(is_hs)]
is_pt <- c(shared_bound_IS_mandy, shared_bound_IS_sandraA)
is_pt <- is_pt[is.finite(is_pt)]

## -----
# Human-specific boundaries: per-boundary maximum over the two replicates of
# each species; the human vector is then aligned to the chimpanzee names.
human_spe_bound_human <- setNames(
  rowMax(cbind(human_spe_bound_IS_ele, human_spe_bound_IS_fas)),
  names(human_spe_bound_IS_ele)
)
human_spe_bound_chimp <- setNames(
  rowMax(cbind(human_spe_bound_IS_mandy, human_spe_bound_IS_sandraA)),
  names(human_spe_bound_IS_mandy)
)
human_spe_bound_human <- human_spe_bound_human[
  match(names(human_spe_bound_chimp), names(human_spe_bound_human))
]
human_spe_boundaries_evol <- human_spe_bound_human - human_spe_bound_chimp

## -----
# Chimp-specific boundaries: same, with the chimpanzee vector aligned to the
# human names.
chimp_spe_bound_human <- setNames(
  rowMax(cbind(chimp_spe_bound_IS_ele, chimp_spe_bound_IS_fas)),
  names(chimp_spe_bound_IS_ele)
)
chimp_spe_bound_chimp <- setNames(
  rowMax(cbind(chimp_spe_bound_IS_mandy, chimp_spe_bound_IS_sandraA)),
  names(chimp_spe_bound_IS_mandy)
)
chimp_spe_bound_chimp <- chimp_spe_bound_chimp[
  match(names(chimp_spe_bound_human), names(chimp_spe_bound_chimp))
]

chimp_spe_boundaries_evol <- chimp_spe_bound_human - chimp_spe_bound_chimp


# Human-minus-chimp insulation difference at human-specific vs. chimp-specific
# boundaries. The y-axis range is set with ylim (the former `tlim` is not a
# graphical parameter and was ignored with a warning).
boxplot( human_spe_boundaries_evol, chimp_spe_boundaries_evol,
         outline=FALSE, col="white",border=c("black","red"),
         ylim=c(-0.6,0.6),ylab="Insulation change Human/Chimp [log2]",
         names=c("Human","Chimp"),xlab="Species specificity of boundary")

t.test(human_spe_boundaries_evol,chimp_spe_boundaries_evol)

## 
##  Welch Two Sample t-test
## 
## data:  human_spe_boundaries_evol and chimp_spe_boundaries_evol
## t = 6.6461, df = 274.65, p-value = 0.0000000001614
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2357400 0.4341763
## sample estimates:
##  mean of x  mean of y 
##  0.1432925 -0.1916657

Genome-wide insulation score

# Prefix the bin names of gagr with their chromosome.
# NOTE(review): running this line twice prefixes the names twice.
names(gagr) <- paste(chrom(gagr), names(gagr), sep = "_")

# Turn an insulation-score table into a per-bin score track.
#   IS   - table with bin names as row names and at least three columns
#   GAGR - bins whose names are matched against rownames(IS)
# Returns a copy of GAGR without `binid` and with a `score` column: 0 by
# default, and for the bins named in IS the log2 ratio of the mean of columns
# 1 and 3 over column 2 (0.001 is added to every value first).
processIS <- function(IS, GAGR) {
  track <- GAGR
  track$binid <- NULL
  track$score <- 0
  bin_idx <- match(rownames(IS), names(track))
  outer_mean <- rowMeans(0.001 + IS[, c(1, 3)])
  middle <- 0.001 + IS[, 2]
  track$score[bin_idx] <- log2(outer_mean / middle)
  return(track)
}

## --------
# Insulation score for every bin outside chrY, per sample. Each table is saved,
# converted to a score track with processIS() and exported as bedGraph.
gagr_no_chrY <- gagr[which(chrom(gagr) != "chrY")]
gagr_pt_no_chrY <- gagr_pt[which(chrom(gagr_pt) != "chrY")]

genome_wide_IS_ele <- InsulationScore(gagr_no_chrY, ele, gagr, 5, 3, 10)
save(genome_wide_IS_ele, file = paste0(objects_directory, "genome_wide_IS_ele.RData"))
genome_wide_IS_ele_gr <- processIS(genome_wide_IS_ele, gagr)
export.bedGraph(
  genome_wide_IS_ele_gr,
  con = paste0(outputs_directory, "genome_wide_IS_ele_gagr.bedGraph")
)

genome_wide_IS_fas <- InsulationScore(gagr_no_chrY, fa, gagr, 5, 3, 10)
save(genome_wide_IS_fas, file = paste0(objects_directory, "genome_wide_IS_fas.RData"))
genome_wide_IS_fas_gr <- processIS(genome_wide_IS_fas, gagr)
export.bedGraph(
  genome_wide_IS_fas_gr,
  con = paste0(outputs_directory, "genome_wide_IS_fas_gr.bedGraph")
)

genome_wide_IS_mandy <- InsulationScore(gagr_pt_no_chrY, mandy, gagr_pt, 5, 3, 10)
save(genome_wide_IS_mandy, file = paste0(objects_directory, "genome_wide_IS_mandy.RData"))
genome_wide_IS_mandy_gr <- processIS(genome_wide_IS_mandy, gagr_pt)
export.bedGraph(
  genome_wide_IS_mandy_gr,
  con = paste0(outputs_directory, "genome_wide_IS_mandy_gr.bedGraph")
)

genome_wide_IS_sandraA <- InsulationScore(gagr_pt_no_chrY, sa, gagr_pt, 5, 3, 10)
save(genome_wide_IS_sandraA, file = paste0(objects_directory, "genome_wide_IS_sandraA.RData"))
genome_wide_IS_sandraA_gr <- processIS(genome_wide_IS_sandraA, gagr_pt)
export.bedGraph(
  genome_wide_IS_sandraA_gr,
  con = paste0(outputs_directory, "genome_wide_IS_sandraA_gr.bedGraph")
)

Final plots

Display the insulation scores for all the bins, species specific as well as shared boundaries.

# Re-import the exported genome-wide score tracks.
genome_wide_IS_ele_gr <- import.bedGraph(
  paste0(outputs_directory, "genome_wide_IS_ele_gagr.bedGraph")
)
genome_wide_IS_fas_gr <- import.bedGraph(
  paste0(outputs_directory, "genome_wide_IS_fas_gr.bedGraph")
)
genome_wide_IS_sandraA <- import.bedGraph(
  paste0(outputs_directory, "genome_wide_IS_sandraA_gr.bedGraph")
)
genome_wide_IS_mandy_gr <- import.bedGraph(
  paste0(outputs_directory, "genome_wide_IS_mandy_gr.bedGraph")
)

# Per-bin mean of the two replicates of each species.
genome_wide_IS_human <- rowMeans(
  cbind(genome_wide_IS_ele_gr$score, genome_wide_IS_fas_gr$score)
)
genome_wide_IS_chimp <- rowMeans(
  cbind(genome_wide_IS_sandraA$score, genome_wide_IS_mandy_gr$score)
)

# Groups, labels and border colours in plotting order.
# NOTE(review): "Hs-sp Hs" is spelled differently from the other "-spe" labels.
box_groups <- list(
  genome_wide_IS_human, genome_wide_IS_chimp,
  is_hs, human_spe_bound_human, human_spe_bound_chimp,
  is_pt, chimp_spe_bound_chimp,
  chimp_spe_bound_human
)
box_labels <- c(
  "GW Hs", "GW Pt", "Bound Hs", "Hs-sp Hs",
  "Hs-spe Pt", "Bound Pt", "Pt-spe Pt", "Pt-spe Hs"
)
box_borders <- c("black", "red", "black", "black", "red", "red", "red", "black")

boxplot(
  box_groups,
  outline = FALSE,
  col = "white",
  border = box_borders,
  ylim = c(-1, 1),
  ylab = expression("Insulation (log"[2]*")"),
  names = box_labels,
  las = 2
)
axis(1, lwd = 2, at = 1:8, box_labels, las = 2)
axis(2, lwd = 2, las = 2)
box(col = "black", lwd = 2)
abline(h = 0, lwd = 2, lty = 2, col = "gray")

t.test(human_spe_bound_human,human_spe_bound_chimp)

## 
##  Welch Two Sample t-test
## 
## data:  human_spe_bound_human and human_spe_bound_chimp
## t = 5.3102, df = 510.25, p-value = 0.0000001638
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.09027806 0.19630691
## sample estimates:
## mean of x mean of y 
## 0.3546137 0.2113212

t.test(chimp_spe_bound_human,chimp_spe_bound_chimp)

## 
##  Welch Two Sample t-test
## 
## data:  chimp_spe_bound_human and chimp_spe_bound_chimp
## t = -3.7563, df = 315.76, p-value = 0.0002053
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.30303371 -0.09470476
## sample estimates:
## mean of x mean of y 
## 0.3448980 0.5437673

Regulome analysis - differential openess

# Read the H3K4me3 and H3K27ac narrowPeak files of human (hg38), chimpanzee (PanTro6) and
# macaque (RheMac10), keeping only the listed chromosomes (chrY is never listed).
# NOTE(review): the last argument is 10 for the human files but 4 for chimp and macaque; its
# meaning is defined by readBed_filterChroms() (not visible here) - confirm the difference is intended.
hs_me3 = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K4me3_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)
hs_k27ac = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K27ac_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)

# Chimp chromosome names are given with a "chr" prefix and the split chromosome 2 (2A/2B).
pt_me3 = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K4me3_12-22_PanTro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                                chroms=paste0('chr',c(1,'2A','2B', 3:22,'X')),4)
pt_k27ac = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K27ac_12-22_PanTro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                                chroms=paste0('chr',c(1,'2A','2B', 3:22,'X')),4)

# Macaque chromosome names are given without prefix, like the human ones.
mm_me3 = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K4me3_03-22_MacMul_i-Astro_WT_Becky_Rep_1_RheMac10_peaks.narrowPeak'),
                                chroms=paste0(c(1:22,'X')),4)
mm_k27ac = readBed_filterChroms(paste0(outputs_directory,'ChIP_Seq_H3K27ac_03-22_MacMul_i-Astro_WT_Becky_Rep_1_RheMac10_peaks.narrowPeak'),
                                chroms=paste0(c(1:22,'X')),4)

# Put all six peak sets on UCSC-style sequence names.
seqlevelsStyle(hs_k27ac) = "ucsc"
seqlevelsStyle(pt_k27ac) = "ucsc"
seqlevelsStyle(mm_k27ac) = "ucsc"

seqlevelsStyle(hs_me3) = "ucsc"
seqlevelsStyle(pt_me3) = "ucsc"
seqlevelsStyle(mm_me3) = "ucsc"

Load the files from the other vignettes

# Restore objects saved by earlier parts of the analysis; load() places them directly in the workspace.
# NOTE(review): which objects these two files contain cannot be seen from here.
load(paste0(objects_directory,'tss_objects.RData'))
load(paste0(objects_directory,'DEseq2_RNA.RData'))
# Species-specific boundaries as ranges, read from BED.
species_specific_boundaries = import.bed(paste0(outputs_directory,"species_specific_boundaries.bed"))

ATAC - preparations

We consider only peaks that map to “standard” chromosomes.
We exclude chromosome Y as all our cells are female.
We center the peak at its summit and take an interval -250/+250 bp.

# Read the merged ATAC narrowPeak calls of each species, restricted to the listed chromosomes (no chrY).
# NOTE(review): the last argument (10) is interpreted by readBed_filterChroms(), defined elsewhere; the
# code below adds the resulting `score` column to the peak start - confirm it holds the summit offset.
hs_atac = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)

pt_atac = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_Pantro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                               paste0("chr",c(1,"2A","2B", 3:22,'X')),10)

mm_atac = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_MacMul_i-Astro_Becky_merged_RheMac10_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)

# Only the human and macaque sets are converted; the chimp set was requested with "chr"-prefixed names above.
seqlevelsStyle(hs_atac) = 'ucsc'
seqlevelsStyle(mm_atac) = 'ucsc'

# Collapse every peak to a 2 bp interval: move the start by `score`, then set end = start + 1.
start(hs_atac) = start(hs_atac) + hs_atac$score
end(hs_atac) = start(hs_atac) + 1

start(pt_atac) = start(pt_atac) + pt_atac$score
end(pt_atac) = start(pt_atac) + 1

start(mm_atac) = start(mm_atac) + mm_atac$score
end(mm_atac) = start(mm_atac) + 1

# Write the 2 bp intervals; these files are the inputs of the liftOver commands.
export.bed( hs_atac, con=paste0(outputs_directory,'ATAC.Hs_clean_summit.narrowPeak'))
export.bed( pt_atac, con=paste0(outputs_directory,'ATAC.Pt_clean_summit.narrowPeak'))
export.bed( mm_atac, con=paste0(outputs_directory,'ATAC.Mm_clean_summit.narrowPeak'))

Identification of peaks to work with

We identify peaks that display at least 50% homology in their DNA sequence.
We define which peaks are not homologous (<50% homology).
We consider peaks identified in human, chimpanzee and macaque files separately.

The generic liftOver command is: liftOver -minMatch=0.5 -bedPlus=6 -tab <input peaks> <chain file, e.g. hg38ToPanTro6.over.chain.gz from UCSC GoldenPath> <lifted output> <unmapped output>. We use human as the common reference.

# Lift the 2 bp summit intervals between the three assemblies, once per ordered species pair.
# Argument order of every call: <input peaks> <chain file> <lifted output> <unmapped output>.
# -minMatch=0.5 : minimum ratio of bases that must remap
# -bedPlus=6    : input is BED6 plus additional columns
# -tab          : fields are tab separated
# NOTE(review): all paths are absolute and specific to one machine.
cd ~/Documents/Tools/

## human to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Hs_clean_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_PT6_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_PT6_summit.unmapped.file

## human to macaque
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Hs_clean_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToRheMac10.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_RM10_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_RM10_summit.unmapped.file

## chimp to human
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Pt_clean_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/panTro6ToHg38.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_Hg38_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_Hg38_summit.unmapped.file

## macaque to human
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Mm_clean_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/rheMac10ToHg38.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_Hg38_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_Hg38_summit.unmapped.file


## chimp to macaque
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Pt_clean_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/panTro6ToRheMac10.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_RheMac10_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_RheMac10_summit.unmapped.file

## macaque to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Mm_clean_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/rheMac10ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_PanTro6_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_PanTro6_summit.unmapped.file

Preparation of count table for ATAC regions

Reading of peaks and liftovers.

We define a unique set of intervals for peaks that are found in at least one species and that are alignable.

# Read the six liftOver outputs (summit intervals of one species placed on another assembly),
# each restricted to the chromosomes listed for the TARGET assembly.
# liftOverPeaks is a directory prefix defined elsewhere in the file.
# NOTE(review): the last argument is 4 here but 10 when the original peak files are read - its
# meaning is set by readBed_filterChroms(); confirm.

# human summits on the chimp assembly
hs_atac_mapped_in_chimp = readBed_filterChroms(paste0(liftOverPeaks,'ATAC.Hs_clean_peaks_on_PT6_summit.narrowPeak'),
                                               chroms=paste0('chr',c(1,'2A','2B', 3:22,'X')),4)

# human summits on the macaque assembly
hs_atac_mapped_in_rhesus = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Hs_clean_peaks_on_RM10_summit.narrowPeak"),
                                               chroms=paste0("chr",c(1:22,'X')),4)

# chimp summits on the human assembly
chimp_mapped_in_humans = readBed_filterChroms(paste0(liftOverPeaks,'ATAC.Pt_clean_peaks_on_Hg38_summit.narrowPeak'),
                                               chroms=paste0('chr',c(1:22,'X')),4)

# macaque summits on the human assembly
macaque_mapped_in_humans = readBed_filterChroms(paste0(liftOverPeaks,'ATAC.Mm_clean_peaks_on_Hg38_summit.narrowPeak'),
                                               chroms=paste0('chr',c(1:22,'X')),4)

# macaque summits on the chimp assembly
macaque_in_chimps = readBed_filterChroms(paste0(liftOverPeaks,'ATAC.Mm_clean_peaks_on_PanTro6_summit.narrowPeak'),
                                               chroms=paste0('chr',c(1,'2A','2B', 3:22,'X')),4)

# chimp summits on the macaque assembly
chimps_in_macaque = readBed_filterChroms(paste0(liftOverPeaks,'ATAC.Pt_clean_peaks_on_RheMac10_summit.narrowPeak'),
                                               chroms=paste0('chr',c(1:22,'X')),4)


# Keep only summit-sized ranges that start beyond a minimum coordinate and grow
# them symmetrically around their centre to a fixed width.
#
# Arguments:
#   gro          ranges object supporting width(), start(), `[` and GenomicRanges::resize()
#   finalSize    width of the returned ranges
#   summitWidth  width a range must have to be kept; the default 2 matches the summit
#                intervals built in this file with end = start + 1, so ranges whose
#                width is anything else are dropped
#   minStart     a range is kept only if its start is strictly greater than this
#                value (default 500)
# Value:
#   the retained ranges, resized to finalSize with fix="center"
CleanAndResize = function( gro, finalSize, summitWidth = 2, minStart = 500 ){
  is_summit_sized = width(gro) == summitWidth
  starts_after_minimum = start(gro) > minStart
  GenomicRanges::resize( gro[ is_summit_sized & starts_after_minimum ], finalSize, fix="center" )
}

# Lifted-over summits: keep intervals that still have width 2 and turn them into 500 bp windows.
hs_atac_mapped_in_chimp = CleanAndResize( hs_atac_mapped_in_chimp, 500 )
hs_atac_mapped_in_rhesus = CleanAndResize( hs_atac_mapped_in_rhesus, 500 )
chimp_mapped_in_humans = CleanAndResize( chimp_mapped_in_humans, 500 )
macaque_mapped_in_humans = CleanAndResize( macaque_mapped_in_humans, 500 )
macaque_in_chimps = CleanAndResize( macaque_in_chimps, 500 )
chimps_in_macaque = CleanAndResize( chimps_in_macaque, 500 )

# Original summits in their own assembly get the same treatment (this overwrites the 2 bp versions).
hs_atac = CleanAndResize( hs_atac, 500 )
pt_atac = CleanAndResize( pt_atac, 500 )
mm_atac = CleanAndResize( mm_atac, 500 )

Now, we have all the lifted over combinations.

# human peaks aligned in all the species
# Names of human peaks that were lifted to both the chimp and the macaque assembly.
hs_pt_mm_liftover = names(hs_atac_mapped_in_chimp)[names(hs_atac_mapped_in_chimp) %in% names(hs_atac_mapped_in_rhesus) ]
human_peaks_aligned_Pt_Mm_coordinates_hs = hs_atac[ which(names(hs_atac) %in% hs_pt_mm_liftover ) ]

# Remove the second member of every pair of mutually overlapping windows.
# The rows to keep are selected positively: with negative indexing (x[-hits]) an empty
# hit vector would select nothing and silently empty the whole peak set.
hs_self_overlap_hits = subjectHits(findOverlaps(human_peaks_aligned_Pt_Mm_coordinates_hs,drop.self=TRUE,drop.redundant=TRUE))
human_peaks_aligned_Pt_Mm_coordinates_hs = human_peaks_aligned_Pt_Mm_coordinates_hs[
  setdiff( seq_len(length(human_peaks_aligned_Pt_Mm_coordinates_hs)), hs_self_overlap_hits ) ]
length(human_peaks_aligned_Pt_Mm_coordinates_hs)

## [1] 141484

# Fetch, by peak name and in the order of the human set, the chimp and macaque windows of the same peaks.
human_peaks_aligned_Pt_Mm_coordinates_pt = hs_atac_mapped_in_chimp[ match(names(human_peaks_aligned_Pt_Mm_coordinates_hs),names(hs_atac_mapped_in_chimp)) ]
human_peaks_aligned_Pt_Mm_coordinates_mm = hs_atac_mapped_in_rhesus[ match(names(human_peaks_aligned_Pt_Mm_coordinates_hs),names(hs_atac_mapped_in_rhesus)) ]
# Sanity check: the name order of the human and chimp sets is identical.
all(names(human_peaks_aligned_Pt_Mm_coordinates_hs)==names(human_peaks_aligned_Pt_Mm_coordinates_pt))

## [1] TRUE

all(names(human_peaks_aligned_Pt_Mm_coordinates_hs)==names(human_peaks_aligned_Pt_Mm_coordinates_mm))

## [1] TRUE

# peaks found in chimp and macaque (aligned to the human genome) but not detected in human 
# Chimp windows (human coordinates) that overlap a macaque window (human coordinates) ...
nhp_peaks = chimp_mapped_in_humans[ queryHits(findOverlaps(chimp_mapped_in_humans,macaque_mapped_in_humans))]
# ... and whose chimp peak could also be lifted to the macaque assembly.
nhp_peaks = nhp_peaks[ which(names(nhp_peaks) %in% names(chimps_in_macaque))]

# Exclude windows overlapping a human peak. Rows are selected positively because
# x[-hits] returns an EMPTY object when `hits` has length 0.
nhp_human_hits = queryHits(findOverlaps(nhp_peaks,hs_atac))
nhp_peaks_coordinates_hs = nhp_peaks[ setdiff( seq_len(length(nhp_peaks)), nhp_human_hits ) ]

# Remove the second member of every pair of mutually overlapping windows (same safe selection).
nhp_self_overlap_hits = subjectHits(findOverlaps(nhp_peaks_coordinates_hs,drop.self=TRUE,drop.redundant=TRUE))
nhp_peaks_coordinates_hs = nhp_peaks_coordinates_hs[ setdiff( seq_len(length(nhp_peaks_coordinates_hs)), nhp_self_overlap_hits ) ]

# Coordinates of the same peaks, by name, on the chimp and macaque assemblies.
nhp_peaks_coordinates_pt = pt_atac[ match(names(nhp_peaks_coordinates_hs),names(pt_atac)) ]
nhp_peaks_coordinates_mm = chimps_in_macaque[ match(names(nhp_peaks_coordinates_pt),names(chimps_in_macaque)) ]
all(names(nhp_peaks_coordinates_hs)==names(nhp_peaks_coordinates_mm))

## [1] TRUE

all(names(nhp_peaks_coordinates_hs)==names(nhp_peaks_coordinates_pt))

## [1] TRUE

# chimp peaks aligned both in humans and macaques but not detected as peaks in humans and macaques 
# Drop chimp windows (human coordinates) that overlap a human peak or a peak of the shared
# chimp/macaque set. Rows are selected positively because x[-hits] returns an EMPTY object
# when `hits` has length 0.
chimp_known_hits = queryHits(findOverlaps(chimp_mapped_in_humans,c(nhp_peaks_coordinates_hs, hs_atac)))
chimp_peaks = chimp_mapped_in_humans[ setdiff( seq_len(length(chimp_mapped_in_humans)), chimp_known_hits ) ]
# Keep peaks that were also lifted to macaque and that are present in the cleaned chimp set.
chimp_peaks = chimp_peaks[ which(names(chimp_peaks) %in% names(chimps_in_macaque)) ]
chimp_peaks = chimp_peaks[ which(names(chimp_peaks) %in% names(pt_atac)) ]
chimp_uniquely_peaks_coordinates_hs = chimp_mapped_in_humans[ match( names(chimp_peaks), names(chimp_mapped_in_humans) )]

# Remove the second member of every pair of mutually overlapping windows (same safe selection).
chimp_self_overlap_hits = subjectHits(findOverlaps(chimp_uniquely_peaks_coordinates_hs,
                                                   drop.self=TRUE,drop.redundant=TRUE))
chimp_uniquely_peaks_coordinates_hs = chimp_uniquely_peaks_coordinates_hs[
  setdiff( seq_len(length(chimp_uniquely_peaks_coordinates_hs)), chimp_self_overlap_hits ) ]

# Coordinates of the same peaks, by name, on the chimp and macaque assemblies.
chimp_uniquely_peaks_coordinates_pt = pt_atac[ match( names(chimp_uniquely_peaks_coordinates_hs), names(pt_atac) )]
chimp_uniquely_peaks_coordinates_mm = chimps_in_macaque[ match( names(chimp_uniquely_peaks_coordinates_pt), names(chimps_in_macaque) )]
all(names(chimp_uniquely_peaks_coordinates_hs)==names(chimp_uniquely_peaks_coordinates_pt))

## [1] TRUE

all(names(chimp_uniquely_peaks_coordinates_hs)==names(chimp_uniquely_peaks_coordinates_mm))

## [1] TRUE

length(chimp_uniquely_peaks_coordinates_hs)

## [1] 31740

# macaque peaks aligned both in humans and chimps but not detected as peaks in humans and chimps
# Drop macaque windows (human coordinates) that overlap a human peak or a peak of the shared
# chimp/macaque set. Rows are selected positively because x[-hits] returns an EMPTY object
# when `hits` has length 0.
macaque_known_hits = queryHits(findOverlaps(macaque_mapped_in_humans,c(nhp_peaks_coordinates_hs, hs_atac)))
macaque_peaks = macaque_mapped_in_humans[ setdiff( seq_len(length(macaque_mapped_in_humans)), macaque_known_hits ) ]
# Keep peaks that were also lifted to chimp and that are present in the cleaned macaque set.
macaque_peaks = macaque_peaks[ which(names(macaque_peaks) %in% names(macaque_in_chimps)) ]
macaque_peaks = macaque_peaks[ which(names(macaque_peaks) %in% names(mm_atac)) ]

macaque_uniquely_peaks_coordinates_hs = macaque_mapped_in_humans[ match( names(macaque_peaks), names(macaque_mapped_in_humans) )]
# Remove the second member of every pair of mutually overlapping windows (same safe selection).
macaque_self_overlap_hits = subjectHits(findOverlaps(macaque_uniquely_peaks_coordinates_hs,drop.self=TRUE,drop.redundant=TRUE))
macaque_uniquely_peaks_coordinates_hs = macaque_uniquely_peaks_coordinates_hs[
  setdiff( seq_len(length(macaque_uniquely_peaks_coordinates_hs)), macaque_self_overlap_hits ) ]
# Coordinates of the same peaks, by name, on the chimp and macaque assemblies.
macaque_uniquely_peaks_coordinates_pt = macaque_in_chimps[ match( names(macaque_uniquely_peaks_coordinates_hs), names(macaque_in_chimps) )]
macaque_uniquely_peaks_coordinates_mm = mm_atac[ match( names(macaque_uniquely_peaks_coordinates_hs), names(mm_atac) )]

all(names(macaque_uniquely_peaks_coordinates_hs)==names(macaque_uniquely_peaks_coordinates_pt))

## [1] TRUE

all(names(macaque_uniquely_peaks_coordinates_hs)==names(macaque_uniquely_peaks_coordinates_mm))

## [1] TRUE

length(macaque_uniquely_peaks_coordinates_hs)

## [1] 43319

### we pool all together and remove duplicated peaks --> 225,059
# The four peak classes (human, shared chimp/macaque, chimp-only, macaque-only) are
# concatenated in the same order for each assembly, so the three objects stay parallel by position.
all_human_intervals = c( human_peaks_aligned_Pt_Mm_coordinates_hs, nhp_peaks_coordinates_hs, 
                         chimp_uniquely_peaks_coordinates_hs, macaque_uniquely_peaks_coordinates_hs )

all_chimp_intervals = c( human_peaks_aligned_Pt_Mm_coordinates_pt, nhp_peaks_coordinates_pt, 
                         chimp_uniquely_peaks_coordinates_pt, macaque_uniquely_peaks_coordinates_pt )

all_macaque_intervals = c( human_peaks_aligned_Pt_Mm_coordinates_mm, nhp_peaks_coordinates_mm, 
                           chimp_uniquely_peaks_coordinates_mm, macaque_uniquely_peaks_coordinates_mm )

all(names(all_human_intervals) == names(all_chimp_intervals))

## [1] TRUE

all(names(all_human_intervals) == names(all_macaque_intervals))

## [1] TRUE

# Peak names that occur more than once in the pooled human set; every interval carrying
# such a name is removed from all three assemblies (no copy of a duplicated name is kept).
peak_name_counts = table(names(all_human_intervals))
duplicated_peak_names = names(peak_name_counts)[peak_name_counts > 1]

all_human_intervals = all_human_intervals[ !(names(all_human_intervals) %in% duplicated_peak_names) ]
all_chimp_intervals = all_chimp_intervals[ !(names(all_chimp_intervals) %in% duplicated_peak_names) ]
all_macaque_intervals = all_macaque_intervals[ !(names(all_macaque_intervals) %in% duplicated_peak_names) ]

Find peaks with 50% liftover

# Second pass over the same peak files: this time the full 500 bp window (not the 2 bp summit)
# is exported, so that liftOver's -minMatch applies to the whole window.
hs_atac2 = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)

pt_atac2 = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_Pantro_i-Astro_Sandra_Mandy4-6_merged_PanTro6_peaks.narrowPeak'),
                               paste0("chr",c(1,"2A","2B", 3:22,'X')),10)

mm_atac2 = readBed_filterChroms(paste0(outputs_directory,'ATAC_Seq_12-22_MacMul_i-Astro_Becky_merged_RheMac10_peaks.narrowPeak'),
                               paste0("",c(1:22,'X')),10)

# Only human and macaque are converted; the chimp set was requested with "chr"-prefixed names above.
seqlevelsStyle(hs_atac2) = 'ucsc'
seqlevelsStyle(mm_atac2) = 'ucsc'

# Collapse each peak to a single position (start shifted by `score`, end = start),
# then expand to 500 bp centred on that position.
start(hs_atac2) = start(hs_atac2) + hs_atac2$score
end(hs_atac2) = start(hs_atac2) 
hs_atac2 = GenomicRanges::resize(hs_atac2,500,fix="center")

start(pt_atac2) = start(pt_atac2) + pt_atac2$score
end(pt_atac2) = start(pt_atac2) 
pt_atac2 = GenomicRanges::resize(pt_atac2,500,fix="center")

start(mm_atac2) = start(mm_atac2) + mm_atac2$score
end(mm_atac2) = start(mm_atac2) 
mm_atac2 = GenomicRanges::resize(mm_atac2,500,fix="center")

# Write the 500 bp windows; these files are the inputs of the second round of liftOver commands.
export.bed( hs_atac2, con=paste0(outputs_directory,'ATAC.Hs_clean_500_summit.narrowPeak'))
export.bed( pt_atac2, con=paste0(outputs_directory,'ATAC.Pt_clean_500_summit.narrowPeak'))
export.bed( mm_atac2, con=paste0(outputs_directory,'ATAC.Mm_clean_500_summit.narrowPeak'))

# Lift the 500 bp windows between the three assemblies, once per ordered species pair.
# Argument order of every call: <input peaks> <chain file> <lifted output> <unmapped output>.
# -minMatch=0.5 : minimum ratio of bases that must remap
# -bedPlus=6    : input is BED6 plus additional columns
# -tab          : fields are tab separated
# NOTE(review): all paths are absolute and specific to one machine.
cd ~/Documents/Tools/

## human to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Hs_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_PT6_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_PT6_500_summit.unmapped.file

## human to macaque
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Hs_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToRheMac10.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_RM10_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Hs_clean_peaks_on_RM10_500_summit.unmapped.file

## chimp to human
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Pt_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/panTro6ToHg38.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_Hg38_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_Hg38_500_summit.unmapped.file

## macaque to human
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Mm_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/rheMac10ToHg38.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_Hg38_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_Hg38_500_summit.unmapped.file


## chimp to macaque
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Pt_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/panTro6ToRheMac10.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_RheMac10_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Pt_clean_peaks_on_RheMac10_500_summit.unmapped.file

## macaque to chimp
./liftOver -minMatch=0.5 -bedPlus=6 -tab /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/ATAC.Mm_clean_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/rheMac10ToPanTro6.over.chain /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_PanTro6_500_summit.narrowPeak /Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/sequence_analysis/ATAC.Mm_clean_peaks_on_PanTro6_500_summit.unmapped.file

Read in the results to retrieve the peak names that we wish to use.

# For each species, collect the names of its peaks whose 500 bp window was lifted to BOTH
# other assemblies; only the names are used from these files.
# NOTE(review): the last argument is 5 here (4 and 10 elsewhere); its meaning is set by
# readBed_filterChroms() - confirm.

# human windows on chimp and on macaque
peaks_hs_Pt =  readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Hs_clean_peaks_on_PT6_500_summit.narrowPeak"), 
                                    paste0("chr",c(1,"2A","2B", 3:22,'X')),5 )
peaks_hs_Mm =  readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Hs_clean_peaks_on_RM10_500_summit.narrowPeak"), 
                                    paste0("chr",c(1:22,'X')),5 )

peaks_hs = names(peaks_hs_Pt)[names(peaks_hs_Pt) %in% names(peaks_hs_Mm) ]

# chimp windows on human and on macaque
peaks_Pt_Hs = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Pt_clean_peaks_on_Hg38_500_summit.narrowPeak"), 
                                   paste0("chr",c(1:22,'X')),5 ) 
peaks_Pt_Mm = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Pt_clean_peaks_on_RheMac10_500_summit.narrowPeak"), 
                                   paste0("chr",c(1:22,'X')),5 ) 
  
peaks_pt = names(peaks_Pt_Hs)[names(peaks_Pt_Hs) %in% names(peaks_Pt_Mm) ]

# macaque windows on human and on chimp
peaks_Mm_Hs = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Mm_clean_peaks_on_Hg38_500_summit.narrowPeak"), 
                                   paste0("chr",c(1:22,'X')),5 )
peaks_Mm_Pt = readBed_filterChroms(paste0(liftOverPeaks,"ATAC.Mm_clean_peaks_on_PanTro6_500_summit.narrowPeak"), 
                                   paste0("chr",c(1,"2A","2B", 3:22,'X')), 5 ) 
  
peaks_mm = names(peaks_Mm_Hs)[names(peaks_Mm_Hs) %in% names(peaks_Mm_Pt) ]

# Union of the three name sets, without repeats.
peaks = unique( c( peaks_hs, peaks_pt, peaks_mm))

The final ranges

length(all_human_intervals)==length(all_chimp_intervals)

## [1] TRUE

length(all_human_intervals)==length(all_macaque_intervals)

## [1] TRUE

length(all_macaque_intervals) # 225,059

## [1] 225059

# Keep, in all three assemblies, only intervals whose name is in `peaks`
# (the names that passed the 500 bp liftOver in both directions required above).
all_human_intervals = all_human_intervals[which(names(all_human_intervals) %in% peaks)]
all_chimp_intervals = all_chimp_intervals[which(names(all_chimp_intervals) %in% peaks)]
all_macaque_intervals = all_macaque_intervals[which(names(all_macaque_intervals) %in% peaks)]
length(all_human_intervals)==length(all_chimp_intervals)

## [1] TRUE

length(all_human_intervals)==length(all_macaque_intervals)

## [1] TRUE

length(all_macaque_intervals) # 224,411

## [1] 224411

all(names(all_human_intervals)==names(all_chimp_intervals))

## [1] TRUE

all(names(all_human_intervals)==names(all_macaque_intervals))

## [1] TRUE

# Write one annotation file per assembly; these are the files given to featureCounts() as annot.ext.
# NOTE(review): the files are written with export.gff() under a ".gtf" name; the featureCounts() calls
# in this file read them with GTF.featureType = 'sequence_feature' and GTF.attrType = 'ID' - confirm
# this matches what export.gff() emits for these objects.
export.gff( all_human_intervals,
            con=paste0(outputs_directory,"hs_atac_for_Deseq2.gtf" ) )
export.gff( all_chimp_intervals,
            con=paste0(outputs_directory,"pt_atac_for_Deseq2.gtf" ) )
export.gff( all_macaque_intervals,
            con=paste0(outputs_directory,"mm_atac_for_Deseq2.gtf" ) )

# Chromosome alias files for featureCounts(chrAliases = ...): one line per sequence level of the form
# "<name in annotation>,<same name with 'chr' removed>".
# presumably the second column is the naming used in the BAM files - verify against the BAM headers.
writeLines( paste0( seqlevels(all_human_intervals), ",", 
                    gsub("chr",'',seqlevels(all_human_intervals)) ),
            paste0(outputs_directory,'hs_atac_for_Deseq2.txt') )
writeLines( paste0( seqlevels(all_chimp_intervals), ",", 
                    gsub("chr",'',seqlevels(all_chimp_intervals)) ),
            paste0(outputs_directory,'pt_atac_for_Deseq2.txt') )
writeLines( paste0( seqlevels(all_macaque_intervals), ",", 
                    gsub("chr",'',seqlevels(all_macaque_intervals)) ),
            paste0(outputs_directory,'mm_atac_for_Deseq2.txt') )

ATAC read counting

# NOTE(review): the working directory is changed for the rest of the session so that the BAM
# files can be given by bare file name; the path is absolute and specific to one machine.
setwd('/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/bam_files/')

# Count paired-end ATAC fragments of one BAM file over the peak windows of its own assembly.
# All samples share exactly the same featureCounts() options; only the BAM file and the
# species-specific annotation / chromosome-alias files differ, so the call is written once.
#
# Arguments:
#   bam_file        BAM file name, relative to the working directory set above
#   species_prefix  'hs', 'pt' or 'mm'; selects '<prefix>_atac_for_Deseq2.gtf' (annotation)
#                   and '<prefix>_atac_for_Deseq2.txt' (chromosome aliases)
# Value: the list returned by featureCounts().
count_atac_peaks = function( bam_file, species_prefix ){
  annotation_stem = paste0( '/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/',
                            species_prefix, '_atac_for_Deseq2' )
  featureCounts( bam_file,
                 annot.ext = paste0( annotation_stem, '.gtf' ),
                 isGTFAnnotationFile = TRUE,
                 chrAliases = paste0( annotation_stem, '.txt' ),
                 GTF.featureType = 'sequence_feature', GTF.attrType = 'ID', isPairedEnd=TRUE)
}

## -------------------------
## human
ele10 = count_atac_peaks( 'ATAC_Seq_12-21_HomSap_i-Astro_WT_ELE10_merged_hg38.bam', 'hs' )
ele30 = count_atac_peaks( 'ATAC_Seq_05-22_HomSap_i-Astro_WT_ELE30_2_Rep_1_hg38.bam', 'hs' )

## -------------------------
## chimpanzee
sandraa = count_atac_peaks( 'ATAC_Seq_12-22_Pantro_i-Astro_Sandra_merged_PanTro6.bam', 'pt' )
Mandy04 = count_atac_peaks( 'ATAC_Seq_05-22_PanTro_i-Astro_WT_Mandy4_Rep_1_PanTro6.bam', 'pt' )
Mandy06 = count_atac_peaks( 'ATAC_Seq_05-22_PanTro_i-Astro_WT_Mandy6_Rep_1_PanTro6.bam', 'pt' )

## -------------------------
## macaque
becky = count_atac_peaks( 'ATAC_Seq_12-22_MacMul_i-Astro_Becky_merged_RheMac10.bam', 'mm' )

## -------------------------
# Cache the raw featureCounts results of all six samples.
save( ele10, ele30, sandraa, Mandy04, Mandy06, becky,
      file=paste0(outputs_directory,'counts_ATAC_refined.RData' ) )
## -------------------------


# The count columns below are combined purely by row position, so all six featureCounts runs
# must report the peaks in the same order. Stop if any run disagrees with ELE10 instead of
# silently building a misaligned table (previously only ELE30 was compared, and only printed).
atac_count_runs = list( ELE10 = ele10, ELE30 = ele30, SandraA = sandraa,
                        Mandy04 = Mandy04, Mandy06 = Mandy06, Becky = becky )
reference_peak_ids = as.character( ele10$annotation$GeneID )
for( run_name in names(atac_count_runs) ){
  run_peak_ids = as.character( atac_count_runs[[run_name]]$annotation$GeneID )
  if( !identical( run_peak_ids, reference_peak_ids ) ){
    stop( "featureCounts peak order of ", run_name, " differs from ELE10", call. = FALSE )
  }
}

# Printed diagnostics: do the counted features match the exported human intervals (name and start)?
all( names(all_human_intervals)== ele10$annotation$GeneID)
all(start(all_human_intervals)==ele10$annotation$Start)

# One column per sample, taken from the first (only) count column of each run.
ATAC_count = data.frame( ELE10 = ele10$counts[,1], 
                         ELE30 = ele30$counts[,1], 
                         SandraA = sandraa$counts[,1], 
                         Mandy04 = Mandy04$counts[,1],
                         Mandy06 = Mandy06$counts[,1],
                         Becky = becky$counts[,1] )

save( ATAC_count,
      file = paste0(outputs_directory,'ATAC_count.RData' ) )

||              Paired-end : yes                                              ||
||        Count read pairs : yes                                              ||
||              Annotation : hs_atac_for_Deseq2.gtf (GTF)                     ||
||      Dir for temp files : .                                                ||
||   Chromosome alias file : hs_atac_for_Deseq2.txt                           ||
||                 Threads : 1                                                ||
||                   Level : meta-feature level                               ||
||      Multimapping reads : counted                                          ||
|| Multi-overlapping reads : not counted                                      ||
||   Min overlapping bases : 1

DEseq analysis of ATAC seq peak openess

Here for the quantitative analysis we will consider only peaks that have at least 50% liftover between all the species

# Restore the ATAC_count table (peaks x samples) saved above.
load( paste0(outputs_directory,'ATAC_count.RData') )

# Sample sheet; rows follow the column order of ATAC_count (2 human, 3 chimp, 1 macaque).
# Only `species` enters the design below; `human_or_not` and `assay` are carried along as annotation.
metadata = data.frame(species=c('HS','HS','PT','PT','PT','MM'),
                      human_or_not = c("HS","HS","NHP","NHP","NHP","NHP"),
                      assay='ATAC',
                      row.names=colnames(ATAC_count))

# Design without intercept: one coefficient per species.
data <- DESeqDataSetFromMatrix( countData=ATAC_count,
                                colData = metadata,
                                design = ~ 0 + species )
# Make HS the first factor level before the model is fitted.
data$species = relevel(data$species, "HS")

# Fit with a local-regression dispersion trend.
data = DESeq(data,fitType = 'local')

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Two transformed versions of the counts, both computed blind to the design.
# Only log_data is used by the PCA plot that follows; vst_data is not used in the visible part of the file.
vst_data = vst(data, blind=TRUE)
log_data = rlog(data, blind=TRUE)

PCA Plot for all Samples.

# Colour per species used in the plot.
species.colors <- c('HS' = '#000000', 'PT' = '#FF3300', 'MM' = '#0033FF')
# PCA on the normalized counts with peaks ordered by decreasing variance.
# NOTE(review): `ord` and `pca` are not used by the plotPCA() call below, which works on log_data.
ord = order(rowVars(counts(data, normalized = TRUE)), decreasing = TRUE)
pca = prcomp(t(counts(data, normalized = TRUE)[ord,]))

# PCA of the rlog-transformed data, coloured by species and labelled per sample
# (`name` is presumably the sample-name column supplied by plotPCA() - confirm).
plotPCA(log_data, intgroup="species") + 
    geom_label_repel(aes(label = name),fill = alpha(c("white"),0.2),
                   show.legend = FALSE, size = 3.25, label.size=0.5,
                   fontface = 'bold') + 
  scale_color_manual(values = species.colors) + theme_bw() + labs(color = "Species") +
  theme(aspect.ratio = 1, axis.text = element_text(face = 'bold', size = 11),
        axis.title = element_text(face = 'bold', size = 13),
        legend.text = element_text(face = 'bold'), legend.title = element_text(face = 'bold', size = 12)) +
  ggtitle("PCA Plot")

Next, we will identify human specific ATAC peaks in comparison with chimpanzee and macaque.

# Human intervals that have a row in the count table; score is set to the constant 1 before
# the set is written as BED and cached as RData.
hs_atac_for_Deseq2 = all_human_intervals[ which(names(all_human_intervals) %in% rownames(ATAC_count))]
score(hs_atac_for_Deseq2) = 1
export.bed( hs_atac_for_Deseq2,
            con=paste0(outputs_directory,"hs_atac_for_Deseq2.bed" ))
save(hs_atac_for_Deseq2,
     file=paste0(objects_directory,"hs_atac_for_Deseq2.RData") )

# The same peaks (selected by name) in chimp coordinates.
pt_atac_for_Deseq2 = all_chimp_intervals[ which(names(all_chimp_intervals) %in% names(hs_atac_for_Deseq2))]
score(pt_atac_for_Deseq2) = 1
export.bed( pt_atac_for_Deseq2,
            con=paste0(outputs_directory,"pt_atac_for_Deseq2.bed" ))
save(pt_atac_for_Deseq2,
     file=paste0(objects_directory,"pt_atac_for_Deseq2.RData") )

# The same peaks (selected by name) in macaque coordinates.
mm_atac_for_Deseq2 = all_macaque_intervals[ which(names(all_macaque_intervals) %in% names(hs_atac_for_Deseq2))]
score(mm_atac_for_Deseq2) = 1
export.bed( mm_atac_for_Deseq2,
            con=paste0(outputs_directory,"mm_atac_for_Deseq2.bed" ))
save(mm_atac_for_Deseq2,
     file=paste0(objects_directory,"mm_atac_for_Deseq2.RData") )

Individual comparisons and a table of these

# Human-vs-chimp data set: only the count columns and sample-sheet rows of species HS and PT,
# with one coefficient per species (no intercept).
HS_PT = DESeqDataSetFromMatrix( countData = ATAC_count[ ,colnames(ATAC_count) %in% rownames(metadata[metadata$species %in% c("HS","PT"),])],
                                colData = metadata[metadata$species %in% c("HS","PT"),],
                                design = ~ 0 + species )

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Fit the human-vs-chimp model with a local-regression dispersion trend.
HS_PT = DESeq(HS_PT,fitType = 'local')

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# NOTE(review): this relevel is applied after DESeq() has already been run on HS_PT, so it is not
# part of the fit above; the comparison is determined by the explicit contrast on the next line.
HS_PT$species = relevel(HS_PT$species, "HS")
# Results for the species contrast with HS listed first and PT second.
res_HS_PT = results(HS_PT, contrast = c("species","HS","PT"))
# Disabled check that the result rows follow the order of the human intervals:
# all(rownames(res_HS_PT)==names(hs_atac_for_Deseq2))

# Human-vs-macaque data set: columns are selected by position from the sample sheet, which
# relies on metadata rows being in the same order as the columns of ATAC_count.
HS_MM = DESeqDataSetFromMatrix( countData=ATAC_count[,metadata$species %in% c("HS","MM")],
                                colData = metadata[metadata$species %in% c("HS","MM"),],
                                design = ~ 0 + species )

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Fit the human-vs-macaque model with a local-regression dispersion trend.
HS_MM = DESeq(HS_MM,fitType = 'local')

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# NOTE(review): this relevel is applied after DESeq() has already been run on HS_MM, so it is not
# part of the fit above; the comparison is determined by the explicit contrast on the next line.
HS_MM$species = relevel(HS_MM$species, "HS")
# Results for the species contrast with HS listed first and MM second.
res_HS_MM = results(HS_MM, contrast = c("species","HS","MM"))
# Disabled check that both result tables list the peaks in the same order:
# all(rownames(res_HS_PT)==rownames(res_HS_MM))

## One row per peak: log2 fold change and adjusted p-value of both comparisons
## plus the human coordinates of the peak.
## Assumes res_HS_PT, res_HS_MM and hs_atac_for_Deseq2 are in the same row order
## (see the commented-out all(...) checks after each results() call).
res_HS_NHP <- data.frame(
  hs_pt_LFC  = res_HS_PT$log2FoldChange,
  hs_pt_Padj = res_HS_PT$padj,
  hs_mm_LFC  = res_HS_MM$log2FoldChange,
  hs_mm_Padj = res_HS_MM$padj,
  chrom_hs   = chrom(hs_atac_for_Deseq2),
  start      = start(hs_atac_for_Deseq2),
  end        = end(hs_atac_for_Deseq2),
  row.names  = rownames(res_HS_MM)
)

## Attach both adjusted p-values to the human peaks, reset the score to 0 and
## write the annotated ranges with export.gff().
hs_atac_for_Deseq2$score <- 0
hs_atac_for_Deseq2$padj_HSPT <- res_HS_PT$padj
hs_atac_for_Deseq2$padj_HSMM <- res_HS_MM$padj
export.gff(
  hs_atac_for_Deseq2,
  con = paste0(outputs_directory, "hs_atac_for_Deseq2.gtf")
)



## Per peak: number of comparisons (0, 1 or 2) with padj < 0.1; peaks with an
## NA padj in either comparison yield NA and are dropped by table().
with(res_HS_NHP, table((hs_pt_Padj < 0.1) + (hs_mm_Padj < 0.1)))

## 
##      0      1      2 
## 122226  71847  17203

## Per gene: number of comparisons (0, 1 or 2) with an adjusted p-value < 0.1.
## The original mixed the raw `pvalue.x` with the adjusted `padj.y`; both
## comparisons now use adjusted p-values. The recorded output below was
## produced by the original expression and must be regenerated.
table(rowSums(all_Deseqs[, c("padj.x", "padj.y")] < 0.1))

## 
##     0     1     2 
## 13083 11086  5137

## Significant (padj < 0.1) peaks split by sign of the log2 fold change.
## Human vs chimpanzee, negative LFC
with(res_HS_NHP, table(hs_pt_Padj < 0.1 & hs_pt_LFC < 0))

## 
##  FALSE   TRUE 
## 205878  15321

## Human vs chimpanzee, positive LFC
with(res_HS_NHP, table(hs_pt_Padj < 0.1 & hs_pt_LFC > 0))

## 
##  FALSE   TRUE 
## 202928  20257

## Human vs macaque, negative LFC
with(res_HS_NHP, table(hs_mm_Padj < 0.1 & hs_mm_LFC < 0))

## 
##  FALSE   TRUE 
## 184934  35243

## Human vs macaque, positive LFC
with(res_HS_NHP, table(hs_mm_Padj < 0.1 & hs_mm_LFC > 0))

## 
##  FALSE   TRUE 
## 175875  43810

## export locations of the altered peaks
## Keep only peaks that have a (non-NA) adjusted p-value in both comparisons,
## in the ranges object and in the results table alike.
hs_atac_for_Deseq2_Hs_vs_NHP_filt <- hs_atac_for_Deseq2[
  complete.cases(res_HS_NHP[, c("hs_pt_Padj", "hs_mm_Padj")])
]
res_HS_NHP_filt <- res_HS_NHP[
  complete.cases(res_HS_NHP[, c("hs_pt_Padj", "hs_mm_Padj")]),
]
all(names(hs_atac_for_Deseq2_Hs_vs_NHP_filt) == rownames(res_HS_NHP_filt))

## [1] TRUE

## Peaks significant (padj < 0.1) in at least one of the two comparisons.
with(
  res_HS_NHP_filt,
  sum(rowSums(cbind(hs_pt_Padj < 0.1, hs_mm_Padj < 0.1)) > 0)
)

## [1] 89050

## -----------------------------
## Peaks gained (LFC > 0) or lost (LFC < 0) in human relative to BOTH chimpanzee
## and macaque, each comparison significant at padj < pvalthr.
pvalthr <- 0.1

## The row selection is computed once and shared by the ranges and the table,
## so the two objects cannot drift apart. The original table filter read
## `hs_pt_Padjpvalthr` (missing `<`), which selects a non-existent column and
## yields a zero-row `gained_ATAC`.
gained_rows <- which(res_HS_NHP_filt$hs_pt_LFC > 0 & res_HS_NHP_filt$hs_pt_Padj < pvalthr &
                     res_HS_NHP_filt$hs_mm_LFC > 0 & res_HS_NHP_filt$hs_mm_Padj < pvalthr)
gained_ATAC_gr <- hs_atac_for_Deseq2_Hs_vs_NHP_filt[gained_rows]
gained_ATAC <- res_HS_NHP_filt[gained_rows, ]
gained_ATAC_gr$score <- 0
export.bed(gained_ATAC_gr, con = paste0(outputs_directory, "gained_ATAC_gr.bed"))

lost_rows <- which(res_HS_NHP_filt$hs_pt_LFC < 0 & res_HS_NHP_filt$hs_pt_Padj < pvalthr &
                   res_HS_NHP_filt$hs_mm_LFC < 0 & res_HS_NHP_filt$hs_mm_Padj < pvalthr)
lost_ATAC_gr <- hs_atac_for_Deseq2_Hs_vs_NHP_filt[lost_rows]
lost_ATAC <- res_HS_NHP_filt[lost_rows, ]
lost_ATAC_gr$score <- 0
export.bed(lost_ATAC_gr, con = paste0(outputs_directory, "lost_ATAC_gr.bed"))


par(mar = c(5, 5, 5, 5), mfrow = c(1, 1)) # , cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
topT <- as.data.frame(res_HS_NHP)
## Names of peaks significant in BOTH comparisons. which() drops rows whose
## padj is NA; plain logical indexing of the data frame turned every NA into an
## extra "NA" row name and inflated length(topTsig) (the count recorded below
## was produced that way).
topTsig <- rownames(topT)[which(topT$hs_pt_Padj <= pvalthr & topT$hs_mm_Padj <= pvalthr)]
length(topTsig)

## [1] 25677

## Volcano plot of the human vs chimpanzee comparison: all peaks in grey,
## peaks listed in topTsig over-plotted in blue.
## `col` is now an argument of plot(); in the original it sat outside the
## plot() call (an argument of with()) and was silently ignored.
with(topT, plot(hs_pt_LFC, -log10(hs_pt_Padj),
                pch = 20, cex = 1.0,
                col = "gray60",
                xlab = bquote(~Log[2]~fold~change),
                ylab = bquote(~-log[10]~Q~value),
                xlim = c(-10, 10),
                ylim = c(0, 20)))
with(subset(topT, rownames(topT) %in% topTsig),
     points(hs_pt_LFC, -log10(hs_pt_Padj), pch = 20, col = "steelblue", cex = 0.5))
axis(2, lwd = 2)
axis(1, lwd = 2)
box(col = "black", lwd = 2)

Identification and functional annotation of DORs gained in humans

First we will consider only regions that do not overlap promoters nor H3K4me3 peaks. Then, we identify the human specific ATAC-seq peaks and remove the peaks that overlap H3K27ac in NHPs. Create a big annotation table for the DORegions between Hs, Pt and Mm.

## Gained peaks that overlap neither an H3K4me3 peak nor an annotated promoter.
## Exclusions use `!overlapsAny()` instead of `x[-queryHits(findOverlaps(...))]`:
## with zero overlaps the negative-index form returns an EMPTY object rather
## than the full one. Results are unchanged whenever at least one overlap exists.
human_spe_enhancers <- gained_ATAC_gr[!overlapsAny(gained_ATAC_gr, c(hs_me3, promoters_tss_gr))]

## NOTE(review): the two promoter sets below index by queryHits(), so a peak is
## repeated once per overlapping promoter; kept as in the original - confirm
## whether duplicates are intended.
human_spe_active_promoters <- gained_ATAC_gr[queryHits(findOverlaps(gained_ATAC_gr, promoters_tss_gr[queryHits(findOverlaps(promoters_tss_gr, hs_me3))]))]
human_spe_inactive_promoters <- gained_ATAC_gr[queryHits(findOverlaps(gained_ATAC_gr, promoters_tss_gr[!overlapsAny(promoters_tss_gr, hs_me3)]))]

## Names of chimpanzee / macaque intervals without an H3K27ac peak.
pt_atac_for_Deseq2_not_H3K27ac <- names(pt_atac_for_Deseq2)[!overlapsAny(pt_atac_for_Deseq2, pt_k27ac)]
mm_atac_for_Deseq2_not_H3K27ac <- names(mm_atac_for_Deseq2)[!overlapsAny(mm_atac_for_Deseq2, mm_k27ac)]

## HUMAN SPECIFIC ENHANCERS
## Keep candidates whose orthologous interval lacks H3K27ac in both NHP species.
human_spe_enhancers <- human_spe_enhancers[names(human_spe_enhancers) %in% intersect(pt_atac_for_Deseq2_not_H3K27ac, mm_atac_for_Deseq2_not_H3K27ac)]
## NOTE(review): positive queryHits() indexing, duplicates possible (see above).
human_spe_enhancers_with_K27_peak <- human_spe_enhancers[queryHits(findOverlaps(human_spe_enhancers, hs_k27ac))]
human_spe_enhancers_without_K27_peak <- human_spe_enhancers[!overlapsAny(human_spe_enhancers, hs_k27ac)]

length(human_spe_enhancers)

## [1] 9356

## Chimpanzee and macaque intervals that carry the names of the gained peaks.
gained_atac_peaks_hs <- names(gained_ATAC_gr)
gained_ATAC_gr_pt <- pt_atac_for_Deseq2[which(names(pt_atac_for_Deseq2) %in% gained_atac_peaks_hs)]
gained_ATAC_gr_mm <- mm_atac_for_Deseq2[which(names(mm_atac_for_Deseq2) %in% gained_atac_peaks_hs)]

## Order check. `gained_atac_peaks_hs` is a plain character vector, so
## names(gained_atac_peaks_hs) is NULL and the original `all(NULL == ...)` was
## TRUE for any input; compare the peak names themselves instead.
identical(gained_atac_peaks_hs, names(gained_ATAC_gr_pt))

## [1] TRUE

identical(gained_atac_peaks_hs, names(gained_ATAC_gr_mm))

## [1] TRUE

## Names of gained peaks that do not overlap an annotated promoter.
## `!overlapsAny()` replaces `x[-queryHits(...)]`, which would return an empty
## vector (and mark every peak as non-intergenic) if no peak overlapped a promoter.
intergenic_gained <- gained_atac_peaks_hs[!overlapsAny(gained_ATAC_gr, promoters_tss_gr)]


## Per gained peak: number of overlapping ATAC / H3K4me3 / H3K27ac peaks in each
## species plus promoter overlap. Species-specific columns are counted on the
## corresponding orthologous interval set.
## Assumes gained_ATAC_gr, gained_ATAC_gr_pt and gained_ATAC_gr_mm list the same
## peaks in the same order - TODO confirm.
gained_ATAC_functional_annotation <- data.frame(
  atac_Hs = countOverlaps(gained_ATAC_gr, hs_atac),
  atac_Pt = countOverlaps(gained_ATAC_gr_pt, pt_atac),
  atac_Mm = countOverlaps(gained_ATAC_gr_mm, mm_atac),
  me3_Hs = countOverlaps(gained_ATAC_gr, hs_me3),
  me3_Pt = countOverlaps(gained_ATAC_gr_pt, pt_me3),
  me3_Mm = countOverlaps(gained_ATAC_gr_mm, mm_me3),
  k27_Hs = countOverlaps(gained_ATAC_gr, hs_k27ac),
  k27_Pt = countOverlaps(gained_ATAC_gr_pt, pt_k27ac),
  k27_Mm = countOverlaps(gained_ATAC_gr_mm, mm_k27ac),
  promoter = countOverlaps(gained_ATAC_gr, promoters_tss_gr),
  is_intergenic = gained_atac_peaks_hs %in% intergenic_gained,
  row.names = gained_atac_peaks_hs
)

colSums(gained_ATAC_functional_annotation > 0)

##       atac_Hs       atac_Pt       atac_Mm        me3_Hs        me3_Pt 
##         13108          4253          1792           411           329 
##        me3_Mm        k27_Hs        k27_Pt        k27_Mm      promoter 
##           301          7382          2888          1424           908 
## is_intergenic 
##         12268

par(mfrow = c(1, 1))
## Presence/absence matrix of the 11 annotation columns, rows sorted
## lexicographically by column 1, then 2, ... then 11.
x <- gained_ATAC_functional_annotation > 0
x <- x[do.call(order, lapply(seq_len(ncol(x)), function(j) x[, j])), ]
## Recode presence in column j as the value j (absence stays 0), so that every
## column picks its own entry of the colour vector passed to image().
x <- x * as.numeric(col(x))

par(mfrow = c(1, 1), mar = c(7, 1, 1, 1))
image(
  t(x),
  col = c(
    "white",
    rep("gray80", 3),
    rep("forestgreen", 3),
    rep("coral3", 3),
    "black", "blue4"
  ),
  axes = FALSE,
  las = 2
)
box(col = "black", lwd = 2)
axis(
  1,
  at = seq(0, 1, length.out = 11),
  labels = c(
    "atac_human", "atac_chimp", "atac_macaque",
    "me3_human", "me3_chimp", "me3_macaque",
    "k27_human", "k27_chimp", "k27_macaque",
    "promoter", "intergenic"
  ),
  las = 2
)
## Vertical separators after columns 3, 6, 9 and 10.
abline(v = seq(0, 1, length.out = 11)[c(3, 6, 9, 10)] + 0.05, lwd = 2)

## Gained peaks at a promoter that carry ATAC and H3K4me3 signal in human only
## (no ATAC or H3K4me3 peak in chimpanzee or macaque, no macaque H3K27ac).
## The original conjunction listed several of these terms twice; each term
## appears once here.
gained_promoters <- subset(
  gained_ATAC_functional_annotation,
  promoter > 0 & me3_Hs > 0 & atac_Hs > 0 &
    atac_Pt == 0 & atac_Mm == 0 &
    me3_Pt == 0 & me3_Mm == 0 &
    k27_Mm == 0
)

gained_promoters_gr <- gained_ATAC_gr[names(gained_ATAC_gr) %in% rownames(gained_promoters)]

Identification and functional annotation of DORs lost in humans

## Chimpanzee and macaque intervals that carry the names of the lost peaks.
lost_atac_peaks_hs <- names(lost_ATAC_gr)
lost_ATAC_gr_pt <- pt_atac_for_Deseq2[which(names(pt_atac_for_Deseq2) %in% lost_atac_peaks_hs)]
lost_ATAC_gr_mm <- mm_atac_for_Deseq2[which(names(mm_atac_for_Deseq2) %in% lost_atac_peaks_hs)]

## Order check. `lost_atac_peaks_hs` is a plain character vector, so
## names(lost_atac_peaks_hs) is NULL and the original `all(NULL == ...)` was
## TRUE for any input; compare the peak names themselves instead.
identical(lost_atac_peaks_hs, names(lost_ATAC_gr_pt))

## [1] TRUE

identical(lost_atac_peaks_hs, names(lost_ATAC_gr_mm))

## [1] TRUE

## Names of lost peaks that do not overlap an annotated promoter.
## `!overlapsAny()` replaces `x[-queryHits(...)]`, which would return an empty
## vector (and mark every peak as non-intergenic) if no peak overlapped a promoter.
intergenic_lost <- lost_atac_peaks_hs[!overlapsAny(lost_ATAC_gr, promoters_tss_gr)]


## Per lost peak: number of overlapping ATAC / H3K4me3 / H3K27ac peaks in each
## species plus promoter overlap. Species-specific columns are counted on the
## corresponding orthologous interval set.
## Assumes lost_ATAC_gr, lost_ATAC_gr_pt and lost_ATAC_gr_mm list the same
## peaks in the same order - TODO confirm.
lost_ATAC_functional_annotation <- data.frame(
  atac_Hs = countOverlaps(lost_ATAC_gr, hs_atac),
  atac_Pt = countOverlaps(lost_ATAC_gr_pt, pt_atac),
  atac_Mm = countOverlaps(lost_ATAC_gr_mm, mm_atac),
  me3_Hs = countOverlaps(lost_ATAC_gr, hs_me3),
  me3_Pt = countOverlaps(lost_ATAC_gr_pt, pt_me3),
  me3_Mm = countOverlaps(lost_ATAC_gr_mm, mm_me3),
  k27_Hs = countOverlaps(lost_ATAC_gr, hs_k27ac),
  k27_Pt = countOverlaps(lost_ATAC_gr_pt, pt_k27ac),
  k27_Mm = countOverlaps(lost_ATAC_gr_mm, mm_k27ac),
  promoter = countOverlaps(lost_ATAC_gr, promoters_tss_gr),
  is_intergenic = lost_atac_peaks_hs %in% intergenic_lost,
  row.names = lost_atac_peaks_hs
)
colSums(lost_ATAC_functional_annotation > 0)

##       atac_Hs       atac_Pt       atac_Mm        me3_Hs        me3_Pt 
##           479          2441          3172           145           446 
##        me3_Mm        k27_Hs        k27_Pt        k27_Mm      promoter 
##           505           325          1538          1234           540 
## is_intergenic 
##          2780

We observe an over-representation of lost relative to gained promoters in evolution (roughly 2- to 3-fold; see the Fisher's exact test below).

## Strict definitions: promoter peaks with H3K4me3 and H3K27ac in both NHPs
## (lost), or with human H3K4me3 and none of these marks in either NHP (gained).
## NOTE(review): both objects are overwritten by the relaxed definitions just
## below before being used - confirm which definition is intended.
lost_promoters <- subset(
  lost_ATAC_functional_annotation,
  promoter > 0 & me3_Pt > 0 & me3_Mm > 0 & k27_Pt > 0 & k27_Mm > 0
)
gained_promoters <- subset(
  gained_ATAC_functional_annotation,
  me3_Hs > 0 & promoter > 0 & me3_Pt == 0 & me3_Mm == 0 & k27_Pt == 0 & k27_Mm == 0
)

## Relaxed definitions: any lost / gained peak that overlaps a promoter.
lost_promoters <- subset(lost_ATAC_functional_annotation, promoter > 0)
gained_promoters <- subset(gained_ATAC_functional_annotation, promoter > 0)


## Gene ids of the promoters overlapped by those peaks.
lost_promoters_ensid <- unique(
  subsetByOverlaps(
    promoters_tss_gr,
    lost_ATAC_gr[names(lost_ATAC_gr) %in% rownames(lost_promoters)]
  )$gene_id
)
gained_promoters_ensid <- unique(
  subsetByOverlaps(
    promoters_tss_gr,
    gained_ATAC_gr[names(gained_ATAC_gr) %in% rownames(gained_promoters)]
  )$gene_id
)

length(gained_promoters_ensid)

## [1] 951

length(lost_promoters_ensid)

## [1] 613

## Is any gene hit by both a lost and a gained promoter peak?
length(intersect(lost_promoters_ensid, gained_promoters_ensid)) > 0

## [1] TRUE

## 2x2 contingency table: direction of change (lost / gained) by promoter overlap.
## Fisher's exact test needs mutually exclusive cells, so the second column
## holds the NON-promoter peaks (total minus promoter peaks). The original used
## the totals, which count the promoter peaks a second time. The recorded
## output below was produced with the totals and must be regenerated.
m <- matrix(
  c(nrow(lost_promoters),
    nrow(gained_promoters),
    length(lost_ATAC_gr) - nrow(lost_promoters),
    length(gained_ATAC_gr) - nrow(gained_promoters)),
  2, 2,
  dimnames = list(c("lost", "gained"), c("promoter", "non_promoter"))
)

fisher.test(m)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  m
## p-value < 0.00000000000000022
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  2.103521 2.646321
## sample estimates:
## odds ratio 
##   2.360091

Tables for the manuscript

## Store the gained and the lost peak sets, one RData file each.
save(
  gained_ATAC_gr,
  file = paste0(objects_directory, "gained_ATAC_gr.RData")
)
save(
  lost_ATAC_gr,
  file = paste0(objects_directory, "lost_ATAC_gr.RData")
)

Genuine gained and lost enhancers - definitions

## ----------------
## Lost enhancers: lost peaks flagged intergenic, without promoter overlap,
## without H3K4me3 in any species and without H3K27ac in human.
## (The `geniune_` spelling of some object names is kept as-is.)

## "active": additionally H3K27ac in chimpanzee and in macaque
geniune_lost_active_enhancers <- subset(
  lost_ATAC_functional_annotation,
  is_intergenic > 0 & promoter == 0 & (me3_Hs + me3_Pt + me3_Mm) == 0 &
    k27_Hs == 0 & k27_Pt > 0 & k27_Mm > 0
)

## "poised": no H3K27ac in chimpanzee nor in macaque
geniune_lost_poised_enhancers <- subset(
  lost_ATAC_functional_annotation,
  is_intergenic > 0 & promoter == 0 & (me3_Hs + me3_Pt + me3_Mm) == 0 &
    k27_Hs == 0 & k27_Pt == 0 & k27_Mm == 0
)

## all: no constraint on NHP H3K27ac
geniune_lost_enhancers <- subset(
  lost_ATAC_functional_annotation,
  is_intergenic > 0 & promoter == 0 & (me3_Hs + me3_Pt + me3_Mm) == 0 &
    k27_Hs == 0
)

## Matching ranges, selected by peak name.
genuine_lost_enhancers_gr <- lost_ATAC_gr[names(lost_ATAC_gr) %in% rownames(geniune_lost_enhancers)]
geniune_lost_poised_enhancers_gr <- lost_ATAC_gr[names(lost_ATAC_gr) %in% rownames(geniune_lost_poised_enhancers)]
geniune_lost_active_enhancers_gr <- lost_ATAC_gr[names(lost_ATAC_gr) %in% rownames(geniune_lost_active_enhancers)]
save(
  genuine_lost_enhancers_gr,
  file = paste0(objects_directory, "genuine_lost_enhancers_gr.RData")
)


## ----------------
## Gained enhancers: gained peaks flagged intergenic, without promoter overlap,
## without H3K4me3 in any species and without H3K27ac in chimpanzee or macaque.
genuine_gained_enhancers <- subset(
  gained_ATAC_functional_annotation,
  is_intergenic > 0 & promoter == 0 & (me3_Hs + me3_Pt + me3_Mm) == 0 &
    k27_Pt == 0 & k27_Mm == 0
)

## "active": additionally H3K27ac in human
genuine_gained_active_enhancers <- subset(
  gained_ATAC_functional_annotation,
  is_intergenic > 0 & promoter == 0 & (me3_Hs + me3_Pt + me3_Mm) == 0 &
    k27_Pt == 0 & k27_Mm == 0 & k27_Hs > 0
)

## "poised": no H3K27ac in human either
genuine_gained_poised_enhancers <- subset(
  gained_ATAC_functional_annotation,
  is_intergenic > 0 & promoter == 0 & (me3_Hs + me3_Pt + me3_Mm) == 0 &
    k27_Pt == 0 & k27_Mm == 0 & k27_Hs == 0
)


## Matching ranges, selected by peak name.
genuine_gained_enhancers_gr <- gained_ATAC_gr[names(gained_ATAC_gr) %in% rownames(genuine_gained_enhancers)]
genuine_gained_active_enhancers_gr <- gained_ATAC_gr[names(gained_ATAC_gr) %in% rownames(genuine_gained_active_enhancers)]
genuine_gained_poised_enhancers_gr <- gained_ATAC_gr[names(gained_ATAC_gr) %in% rownames(genuine_gained_poised_enhancers)]

## Export with both chromosome naming styles; the object is left in "ucsc" style.
seqlevelsStyle(genuine_gained_enhancers_gr) <- "ncbi"
export.bed(
  genuine_gained_enhancers_gr,
  con = paste0(outputs_directory, "genuine_gained_enhancers.bed")
)
seqlevelsStyle(genuine_gained_enhancers_gr) <- "ucsc"
export.bed(
  genuine_gained_enhancers_gr,
  con = paste0(outputs_directory, "genuine_gained_enhancers_ucsc.bed")
)

Stats for the text

## Promoters of the genes listed in HS_UP_Genes / HS_DN_Genes.
up_set <- HS_UP_Genes$ensembl_id
dn_set <- HS_DN_Genes$ensembl_id

promoters_HITS_UP <- promoters_filtered_gr[promoters_filtered_gr$gene_id %in% up_set]
promoters_HITS_DN <- promoters_filtered_gr[promoters_filtered_gr$gene_id %in% dn_set]

length(genuine_gained_enhancers_gr)

## [1] 9343

length(genuine_lost_enhancers_gr)

## [1] 2351

## Windows of 1,000,000 bp centred on each up-gene promoter, then the number of
## windows containing at least one gained (or gained active) enhancer.
promoters_HITS_UP_500 <- GenomicRanges::resize(
  promoters_HITS_UP,
  width = 1000000,
  fix = "center"
)
sum(countOverlaps(promoters_HITS_UP_500, genuine_gained_enhancers_gr) > 0)

## [1] 586

sum(countOverlaps(promoters_HITS_UP_500, genuine_gained_active_enhancers_gr) > 0)

## [1] 460

Saving objects for sequence comparison

## ----------------
## Promoters of the up / down gene sets (same definition as used for the text
## statistics), stored for the sequence comparison.
promoters_HITS_UP <- promoters_filtered_gr[promoters_filtered_gr$gene_id %in% up_set]
promoters_HITS_DN <- promoters_filtered_gr[promoters_filtered_gr$gene_id %in% dn_set]

save(
  promoters_HITS_UP, promoters_HITS_DN,
  file = paste0(objects_directory, "promoters_up_down.RData")
)

## Promoters of genes that are significant (padj < 0.01) in a single comparison,
## split by sign of the log2 fold change. Columns suffixed .x feed the *_hs_pt
## sets, columns suffixed .y the *_hs_mm sets.
prom_up_hs_pt <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.x > 0 & padj.x < 0.01])
]
prom_dn_hs_pt <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.x < 0 & padj.x < 0.01])
]

prom_up_hs_mm <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.y > 0 & padj.y < 0.01])
]
prom_dn_hs_mm <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.y < 0 & padj.y < 0.01])
]

## ----------------
## Classify enhancers by distance to the nearest promoter of interest.
## distanceToNearest() returns a Hits object that omits queries without any
## neighbour (e.g. a chromosome with no subject range), so the query object is
## indexed through queryHits() rather than by position in the Hits object;
## positional indexing silently picks the wrong ranges once a query is missing.
## Results are unchanged when every query has a neighbour.
## Queries without any neighbour end up in neither group (as before).
any_deg_promoters <- c(prom_up_hs_pt, prom_dn_hs_pt, prom_up_hs_mm, prom_dn_hs_mm,
                       promoters_HITS_UP, promoters_HITS_DN)

near_gained_up <- distanceToNearest(genuine_gained_enhancers_gr, promoters_HITS_UP)
near_lost_dn <- distanceToNearest(genuine_lost_enhancers_gr, promoters_HITS_DN)
near_lost_any <- distanceToNearest(genuine_lost_enhancers_gr, any_deg_promoters)
near_gained_any <- distanceToNearest(genuine_gained_enhancers_gr, any_deg_promoters)

## within 500 kb of an up (gained) / down (lost) promoter
genuine_gained_enhancers_that_do_something <- genuine_gained_enhancers_gr[queryHits(near_gained_up)[mcols(near_gained_up)$distance < 500000]]
genuine_lost_enhancers_that_do_something <- genuine_lost_enhancers_gr[queryHits(near_lost_dn)[mcols(near_lost_dn)$distance < 500000]]
## more than 500 kb away from every differentially expressed promoter
genuine_lost_enhancers_that_do_nothing <- genuine_lost_enhancers_gr[queryHits(near_lost_any)[mcols(near_lost_any)$distance > 500000]]

## verify on elements identified without TAD filtering
enhancers_linked_with_activation <- genuine_gained_enhancers_gr[queryHits(near_gained_up)[mcols(near_gained_up)$distance < 500000]]
enhancers_not_linked_with_activation <- genuine_gained_enhancers_gr[queryHits(near_gained_any)[mcols(near_gained_any)$distance > 500000]]

save( enhancers_linked_with_activation, enhancers_not_linked_with_activation,
      file=paste0(objects_directory,"enhancers_functional_groups.RData"))

export.bed(enhancers_linked_with_activation,con=paste0(outputs_directory,"enhancers_linked_with_activation.bed"))
export.bed(enhancers_not_linked_with_activation,con=paste0(outputs_directory,"enhancers_not_linked_with_activation.bed"))

save( genuine_lost_enhancers_that_do_something, genuine_lost_enhancers_that_do_nothing,
      file=paste0(objects_directory,"lost_enhancers_functional_groups.RData"))

export.bed(genuine_lost_enhancers_that_do_something,con=paste0(outputs_directory,"lost_enhancers_linked_with_activation.bed"))
export.bed(genuine_lost_enhancers_that_do_nothing,con=paste0(outputs_directory,"lost_enhancers_not_linked_with_activation.bed") )

## all enhancers
## Human ATAC peaks outside H3K4me3 peaks and promoters that overlap H3K27ac.
## `!overlapsAny()` replaces `x[-queryHits(...)]`, which returns an empty object
## when nothing overlaps.
enhancers_HS <- hs_atac[!overlapsAny(hs_atac, c(hs_me3, promoters_tss_gr))]
## NOTE(review): positive queryHits() indexing repeats a peak once per
## overlapping H3K27ac peak; kept as in the original - confirm that is intended.
enhancers_HS <- enhancers_HS[queryHits(findOverlaps(enhancers_HS, hs_k27ac))]

## conserved enhancers - accessibility changes by less than 1.5-fold and not
## significantly (padj > 0.1) in both comparisons, and no promoter overlap
## (inherited from enhancers_HS).
## which() drops peaks whose LFC or padj is NA; plain logical indexing of the
## data frame produced spurious "NA" rows instead.
conserved_enhancers <- rownames(res_HS_NHP)[
  which(abs(res_HS_NHP$hs_pt_LFC) < log2(1.5) & abs(res_HS_NHP$hs_mm_LFC) < log2(1.5) &
        res_HS_NHP$hs_pt_Padj > 0.1 & res_HS_NHP$hs_mm_Padj > 0.1)
]
conserved_enhancers <- enhancers_HS[which(names(enhancers_HS) %in% conserved_enhancers)]
save( conserved_enhancers,
      file=paste0(objects_directory,"conserved_enhancers.RData" ))
export.bed(conserved_enhancers,con=paste0(outputs_directory,"conserved_enhancers.bed"))

Bar graph showing how many enhancers do something

## Promoters significant in a single comparison, minus those whose names are
## among the up-gene promoters.
up_dn_sep <- c(prom_up_hs_pt, prom_dn_hs_pt, prom_up_hs_mm, prom_dn_hs_mm)
up_dn_sep <- up_dn_sep[!(names(up_dn_sep) %in% names(promoters_HITS_UP))]

## EAG: gained enhancers closer than 500 kb to an up-gene promoter.
## any_DEG: gained enhancers farther than 500 kb from every up-gene promoter,
## minus the enhancers classified as not linked with activation.
m <- local({
  dist_to_up <- elementMetadata(
    distanceToNearest(genuine_gained_enhancers_gr, promoters_HITS_UP)
  )[, 1]
  c(
    any_DEG = sum(dist_to_up > 500000) - length(enhancers_not_linked_with_activation),
    EAG = sum(dist_to_up < 500000)
  )
})

par(mar = c(4, 4, 4, 4), mfrow = c(1, 1))
barplot(
  as.matrix(m),
  beside = FALSE,
  col = c("#0B6623", "steelblue3"),
  ylim = c(0, 10000),
  ylab = "Enhancers"
)
axis(2, lwd = 2)

## any_DEG     EAG 
##    5219    1443

Linked and not enhancers - TAD based annotation

## Re-read the gained enhancers from the UCSC-style BED export; peak names are
## then taken from the `name` column.
genuine_gained_enhancers_gr <- import.bed(paste0(outputs_directory, "genuine_gained_enhancers_ucsc.bed"))

## Per TAD: number of up-gene promoters, gained enhancers, and promoters of
## genes significant in a single comparison.
enhancers_TAD_annotation <- data.frame(
  up_genes = countOverlaps(ele_domains$TADs, promoters_HITS_UP),
  up_enhancers = countOverlaps(ele_domains$TADs, genuine_gained_enhancers_gr),
  genes_Hs_NHP = countOverlaps(ele_domains$TADs, c(prom_up_hs_pt, prom_dn_hs_pt, prom_up_hs_mm, prom_dn_hs_mm))
)

## Enhancers in TADs that also hold an up-gene promoter ("linked") vs enhancers
## in TADs without any up-gene or single-comparison promoter ("not linked").
enhancers_linked_with_activation_TAD <- genuine_gained_enhancers_gr[unique(queryHits(findOverlaps(genuine_gained_enhancers_gr, ele_domains$TADs[which(enhancers_TAD_annotation[, 1] > 0 & enhancers_TAD_annotation[, 2] > 0)])))]
enhancers_not_linked_with_activation_TAD <- genuine_gained_enhancers_gr[unique(queryHits(findOverlaps(genuine_gained_enhancers_gr, ele_domains$TADs[which(enhancers_TAD_annotation[, 1] == 0 & enhancers_TAD_annotation[, 2] > 0 & enhancers_TAD_annotation[, 3] == 0)])))]

## Make the two groups mutually exclusive: drop enhancers present in both.
enhancers_linked_with_activation_TADs <- enhancers_linked_with_activation_TAD[which(!enhancers_linked_with_activation_TAD$name %in% enhancers_not_linked_with_activation_TAD$name)]
enhancers_not_linked_with_activation_TADs <- enhancers_not_linked_with_activation_TAD[which(!enhancers_not_linked_with_activation_TAD$name %in% enhancers_linked_with_activation_TAD$name)]

## Name the two exported objects. The original named the unfiltered `..._TAD`
## object in the second line, so the exported `..._TADs` object stayed unnamed.
names(enhancers_linked_with_activation_TADs) <- enhancers_linked_with_activation_TADs$name
names(enhancers_not_linked_with_activation_TADs) <- enhancers_not_linked_with_activation_TADs$name
export.bed(enhancers_linked_with_activation_TADs, con = paste0(outputs_directory, "enhancers_linked_with_activation_TADs.bed"))
export.bed(enhancers_not_linked_with_activation_TADs, con = paste0(outputs_directory, "enhancers_not_linked_with_activation_TADs.bed"))

TADs

We read in the annotation of TADs from TopDom. We identify DEGs in single comparisons and assess how frequently we see up- and down-regulated genes per TAD.

## Promoters of genes significant (padj < 0.01) in a single comparison, split by
## sign of the log2 fold change (.x columns -> *_hs_pt, .y columns -> *_hs_mm).
prom_up_hs_pt <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.x > 0 & padj.x < 0.01])
]
prom_dn_hs_pt <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.x < 0 & padj.x < 0.01])
]

prom_up_hs_mm <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.y > 0 & padj.y < 0.01])
]
prom_dn_hs_mm <- promoters_filtered_gr[
  promoters_filtered_gr$gene_id %in%
    with(all_Deseqs, Row.names[log2FoldChange.y < 0 & padj.y < 0.01])
]

## Per-TAD annotation: one overlap count per feature set, plus the TAD width.
ele_domains_anno <- do.call(
  data.frame,
  c(
    lapply(
      list(
        up_prom = promoters_HITS_UP,
        dn_prom = promoters_HITS_DN,
        up_vsPt = prom_up_hs_pt,
        dn_vsPt = prom_dn_hs_pt,
        up_vsMm = prom_up_hs_mm,
        dn_vsMm = prom_dn_hs_mm,
        number_of_enh_hs = enhancers_HS,
        genuine_gained_enhancers = genuine_gained_enhancers_gr,
        genuine_gained_active_enhancers = genuine_gained_active_enhancers_gr,
        genuine_gained_poised_enhancers = genuine_gained_poised_enhancers_gr,
        genuine_lost_enhancers = genuine_lost_enhancers_gr,
        geniune_lost_active_enhancers = geniune_lost_active_enhancers_gr,
        geniune_lost_poised_enhancers = geniune_lost_poised_enhancers_gr,
        gained_enhancers_that_do_sth = genuine_gained_enhancers_that_do_something,
        lost_enhancers_that_do_sth = genuine_lost_enhancers_that_do_something,
        prom_number = promoters_filtered_gr,
        me3_number = hs_me3
      ),
      function(features) countOverlaps(ele_domains$TADs, features)
    ),
    list(size = width(ele_domains$TADs))
  )
)

## Number of up-gene promoters that fall into at least one TAD.
sum(countOverlaps(promoters_HITS_UP, ele_domains$TADs) > 0)

## [1] 555

## Number of down-gene promoters that fall into at least one TAD.
sum(countOverlaps(promoters_HITS_DN, ele_domains$TADs) > 0)

## [1] 447

## TADs holding only up-gene promoters / only down-gene promoters.
tads_with_up_gene <- subset(ele_domains_anno, up_prom > 0 & dn_prom == 0)
tads_with_dn_gene <- subset(ele_domains_anno, dn_prom > 0 & up_prom == 0)
colSums(tads_with_up_gene > 0)

##                         up_prom                         dn_prom 
##                             382                               0 
##                         up_vsPt                         dn_vsPt 
##                             382                              72 
##                         up_vsMm                         dn_vsMm 
##                             382                              83 
##                number_of_enh_hs        genuine_gained_enhancers 
##                             349                             231 
## genuine_gained_active_enhancers genuine_gained_poised_enhancers 
##                             154                             176 
##          genuine_lost_enhancers   geniune_lost_active_enhancers 
##                              92                              19 
##   geniune_lost_poised_enhancers    gained_enhancers_that_do_sth 
##                              55                             228 
##      lost_enhancers_that_do_sth                     prom_number 
##                               8                             382 
##                      me3_number                            size 
##                             366                             382

## Per annotation column: number of down-only TADs with a non-zero value.
colSums(tads_with_dn_gene > 0)

##                         up_prom                         dn_prom 
##                               0                             365 
##                         up_vsPt                         dn_vsPt 
##                              38                             365 
##                         up_vsMm                         dn_vsMm 
##                              83                             365 
##                number_of_enh_hs        genuine_gained_enhancers 
##                             335                             183 
## genuine_gained_active_enhancers genuine_gained_poised_enhancers 
##                              99                             133 
##          genuine_lost_enhancers   geniune_lost_active_enhancers 
##                              98                              20 
##   geniune_lost_poised_enhancers    gained_enhancers_that_do_sth 
##                              50                              12 
##      lost_enhancers_that_do_sth                     prom_number 
##                              93                             365 
##                      me3_number                            size 
##                             353                             365

## TADs that contain both an up-gene and a down-gene promoter.
with(ele_domains_anno, sum(up_prom > 0 & dn_prom > 0))

## [1] 33

## TADs with promoters of one direction only, and more than one of them.
with(
  ele_domains_anno,
  sum(xor(up_prom > 0, dn_prom > 0) & (up_prom + dn_prom) > 1)
)

## [1] 124

## Co-occurrence of two count columns across the rows of `x`.
##   x     data frame (or matrix) of counts
##   col1  index or name of the first column
##   col2  index or name of the second column
## Returns a named vector: rows where only the first column is > 0
## ("FirstOnly"), only the second ("SecondOnly"), or both ("Both").
getStats <- function(x, col1, col2) {
  first <- x[, col1]
  second <- x[, col2]
  c(
    FirstOnly = sum(first > 0 & second == 0),
    SecondOnly = sum(first == 0 & second > 0),
    Both = sum(first > 0 & second > 0)
  )
}


par(mfrow = c(1, 1), mar = c(8, 5, 1, 1))
## One row per gene-set definition: TADs with only up, only down, or both kinds
## of promoters (columns addressed by name).
m <- rbind(
  getStats(ele_domains_anno, "up_prom", "dn_prom"),
  getStats(ele_domains_anno, "up_vsPt", "dn_vsPt"),
  getStats(ele_domains_anno, "up_vsMm", "dn_vsMm")
)

barplot(
  t(m),
  beside = FALSE,
  col = c("green4", "wheat3", "gray60"),
  names.arg = c("Hs vs NHP", "Hs vs. Pt", "Hs vs. Mm"),
  las = 2,
  ylab = "EAG"
)
axis(2, lwd = 2, las = 2)

Figure showing how many genuine gained enhancers are there per domain. First of all there are many domains that only feature gained enhancer and no upregulated EAG. There are few domains where I do not see a gained enhancer despite the presence of an upregulated EAG. We see both the up-regulated EAG and a gained DOR in 253 TADs.

## TADs with only gained enhancers, only up-gene promoters, or both.
getStats(ele_domains_anno, "genuine_gained_enhancers", "up_prom")

##  FirstOnly SecondOnly       Both 
##       3420        162        253

## Distribution of the number of gained enhancers in TADs that hold an up-gene
## promoter.
hist(
  with(ele_domains_anno, genuine_gained_enhancers[up_prom > 0]),
  nclass = 14,
  main = "",
  col = "green4",
  xlab = "Number of gained putative enhancers",
  ylim = c(0, 300)
)
axis(1, lwd = 2)
axis(2, lwd = 2)

The majority of TADs that contain an upregulated EAG also contain a gained enhancer (253 of 415)!

## TADs with at least one gained enhancer.
with(ele_domains_anno, sum(genuine_gained_enhancers > 0))

## [1] 3673

## ... of which also hold an up-gene promoter.
with(ele_domains_anno, sum(genuine_gained_enhancers[up_prom > 0] > 0))

## [1] 253

## TADs with at least one up-gene promoter.
with(ele_domains_anno, sum(up_prom > 0))

## [1] 415

## Peaks gained (LFC > 0, padj < pvalthr) in AT LEAST ONE of the two comparisons.
gained_enhancers_in_any_comp <- hs_atac_for_Deseq2_Hs_vs_NHP_filt[which(
  (res_HS_NHP_filt$hs_pt_LFC > 0 & res_HS_NHP_filt$hs_pt_Padj < pvalthr) |
    (res_HS_NHP_filt$hs_mm_LFC > 0 & res_HS_NHP_filt$hs_mm_Padj < pvalthr)
)]
## Drop peaks at H3K4me3 peaks or promoters. `!overlapsAny()` replaces
## `x[-queryHits(...)]`, which returns an empty object when nothing overlaps.
gained_enhancers_in_any_comp <- gained_enhancers_in_any_comp[!overlapsAny(gained_enhancers_in_any_comp, c(hs_me3, promoters_tss_gr))]
## Keep peaks whose orthologous interval lacks H3K27ac in both NHP species.
gained_enhancers_in_any_comp <- gained_enhancers_in_any_comp[which(names(gained_enhancers_in_any_comp) %in% pt_atac_for_Deseq2_not_H3K27ac[pt_atac_for_Deseq2_not_H3K27ac %in% mm_atac_for_Deseq2_not_H3K27ac])]
## TADs with such an enhancer and an up-gene promoter.
sum( countOverlaps(ele_domains$TADs,gained_enhancers_in_any_comp)>0 & ele_domains_anno$up_prom>0  )

## [1] 358

## TADs with such an enhancer and a gene up in at least one single comparison.
sum( countOverlaps(ele_domains$TADs,gained_enhancers_in_any_comp)>0 & rowSums( ele_domains_anno[,c("up_vsPt", "up_vsMm")]>0)>0  )

## [1] 1607

Overall number of human gained enhancers as compared to chimps and macaques - is it explaining the fact that the log fold change in the human lineage is more pronounced when compared to macaques?

all(names(hs_atac_for_Deseq2)==rownames(res_HS_NHP))

## [1] TRUE

## Up-gene promoters grouped by the number of gained enhancers in their TAD:
## 0, 1, 2-3 and >3. The last group originally required `> 4`, which dropped
## TADs with exactly 4 enhancers from every group although the plot label is
## ">3" and DORs_anno below cuts at (3, 1000].
proms_in_tads_wo_DORs <- promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom > 0 & ele_domains_anno$genuine_gained_enhancers == 0)]))]
proms_in_tads_with_1_DORs <- promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom > 0 & ele_domains_anno$genuine_gained_enhancers == 1)]))]
proms_in_tads_with_many_DORs <- promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom > 0 & ele_domains_anno$genuine_gained_enhancers > 1 & ele_domains_anno$genuine_gained_enhancers < 4)]))]
proms_in_tads_with_very_many_DORs <- promoters_HITS_UP[queryHits(findOverlaps(promoters_HITS_UP, ele_domains$TADs[which(ele_domains_anno$up_prom > 0 & ele_domains_anno$genuine_gained_enhancers > 3)]))]

## Peaks gained in a single comparison (padj < 0.1), excluding promoters,
## H3K4me3 peaks and the peaks gained in both comparisons.
## `!overlapsAny()` replaces `x[-unique(queryHits(...))]`, which returns an empty
## object when nothing overlaps.
DORs_gained_Hs_Pt <- hs_atac_for_Deseq2[which(res_HS_NHP$hs_pt_LFC > 0 & res_HS_NHP$hs_pt_Padj < 0.1)]
DORs_gained_Hs_Mm <- hs_atac_for_Deseq2[which(res_HS_NHP$hs_mm_LFC > 0 & res_HS_NHP$hs_mm_Padj < 0.1)]
DORs_gained_Hs_Pt <- DORs_gained_Hs_Pt[!overlapsAny(DORs_gained_Hs_Pt, c(promoters_tss_gr, hs_me3, gained_ATAC_gr))]
DORs_gained_Hs_Mm <- DORs_gained_Hs_Mm[!overlapsAny(DORs_gained_Hs_Mm, c(promoters_tss_gr, hs_me3, gained_ATAC_gr))]

ele_domains_anno$DORs_gained_Hs_Pt <- countOverlaps(ele_domains$TADs, DORs_gained_Hs_Pt)
ele_domains_anno$DORs_gained_Hs_Mm <- countOverlaps(ele_domains$TADs, DORs_gained_Hs_Mm)

## Same grouping as a factor: (-Inf,0], (0,1], (1,3], (3,1000].
ele_domains_anno$DORs_anno <- cut(ele_domains_anno$genuine_gained_enhancers, c(-Inf, 0, 1, 3, 1000))

# 2 x 2 panel: log2FoldChange.x and log2FoldChange.y (all_Deseqs) of the promoter sets defined
# per enhancer-count class, followed by the per-TAD counts of gained DORs for the same classes.
# Wrapped in local() so the helper objects do not leak into the workspace.
# (A single pooled "proms_in_tads_with_DORs" group used to be plotted here and is left out.)
local({
  class_labels = c("0","1","2-3",">3")
  class_borders = colorRampPalette(c("steelblue","green4"))(4)
  promoter_sets = list( proms_in_tads_wo_DORs,
                        proms_in_tads_with_1_DORs,
                        proms_in_tads_with_many_DORs,
                        proms_in_tads_with_very_many_DORs )

  par(mfrow=c(2,2),mar=c(3,2,1,2))
  for( lfc_column in c("log2FoldChange.x","log2FoldChange.y") ){
    lfc_by_class = lapply( promoter_sets, function(proms){
      all_Deseqs[[lfc_column]][ all_Deseqs$Row.names %in% names(proms) ]
    } )
    boxplot( lfc_by_class,
             notch=TRUE, ylim=c(0,12), col="white", border=class_borders,
             names=class_labels )
    axis(1,lwd=2)
    axis(2,lwd=2)
    box(col="black",lwd=2)
  }

  boxplot( split(ele_domains_anno$DORs_gained_Hs_Pt, ele_domains_anno$DORs_anno),
           ylim=c(0,30), notch=TRUE, col="white", border=class_borders, names=class_labels )

## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, : some
## notches went outside hinges ('box'): maybe set notch=FALSE

  boxplot( split(ele_domains_anno$DORs_gained_Hs_Mm, ele_domains_anno$DORs_anno),
           ylim=c(0,30), notch=TRUE, col="white", border=class_borders, names=class_labels )
})

Down-regulated genes - loss of enhancers?

# Peaks with negative LFC and adjusted p-value < 0.1 in the hs_pt / hs_mm comparison
DORs_lost_Hs_Pt = hs_atac_for_Deseq2[which(res_HS_NHP$hs_pt_LFC<0 & res_HS_NHP$hs_pt_Padj<0.1)]
DORs_lost_Hs_Mm = hs_atac_for_Deseq2[which(res_HS_NHP$hs_mm_LFC<0 & res_HS_NHP$hs_mm_Padj<0.1)]
# Drop peaks overlapping promoters_tss_gr, hs_me3 or lost_ATAC_gr. A logical mask replaces
# x[-unique(queryHits(...))], which would return an empty object if nothing overlapped.
DORs_lost_Hs_Pt = DORs_lost_Hs_Pt[ !overlapsAny(DORs_lost_Hs_Pt, c(promoters_tss_gr,hs_me3,lost_ATAC_gr)) ]
DORs_lost_Hs_Mm = DORs_lost_Hs_Mm[ !overlapsAny(DORs_lost_Hs_Mm, c(promoters_tss_gr,hs_me3,lost_ATAC_gr)) ]

# Per-TAD counts of the remaining lost DORs
ele_domains_anno$DORs_lost_Hs_Pt = countOverlaps(ele_domains$TADs,DORs_lost_Hs_Pt)
ele_domains_anno$DORs_lost_Hs_Mm = countOverlaps(ele_domains$TADs,DORs_lost_Hs_Mm)

# NOTE: this overwrites DORs_anno (until here based on genuine_gained_enhancers) with classes
# based on genuine_lost_enhancers: (-Inf,0], (0,1], (1,3], (3,1000]
ele_domains_anno$DORs_anno = cut(ele_domains_anno$genuine_lost_enhancers,c(-Inf,0,1,3,1000))

### 
# promoters_HITS_DN located in TADs with dn_prom > 0, split by the number of genuine lost
# enhancers in the TAD: 0, 1, 2-3 and >3.
proms_in_tads_wo_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers==0)]))]

proms_in_tads_with_1_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers==1)]))]

# The upper bound must test genuine_lost_enhancers; it previously tested
# genuine_gained_enhancers, so this "2-3" class was not bounded on the lost-enhancer count.
proms_in_tads_with_many_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers>1 & ele_domains_anno$genuine_lost_enhancers<4)]))]

proms_in_tads_with_very_many_lost_DORs = promoters_HITS_DN[queryHits(findOverlaps(promoters_HITS_DN, ele_domains$TADs[which(ele_domains_anno$dn_prom>0 & ele_domains_anno$genuine_lost_enhancers>3)]))]

# 2 x 2 panel for the lost-enhancer classes: log2FoldChange.x / log2FoldChange.y (all_Deseqs)
# of the promoter sets, then per-TAD counts of lost DORs (Hs_Pt, Hs_Mm).
# Wrapped in local() so the helper objects do not leak into the workspace.
local({
  class_labels = c("0","1","2-3",">3")
  class_borders = colorRampPalette(c("black","red"))(4)
  promoter_sets = list( proms_in_tads_wo_lost_DORs,
                        proms_in_tads_with_1_lost_DORs,
                        proms_in_tads_with_many_lost_DORs,
                        proms_in_tads_with_very_many_lost_DORs )

  par(mfrow=c(2,2),mar=c(3,2,1,1))
  for( lfc_column in c("log2FoldChange.x","log2FoldChange.y") ){
    lfc_by_class = lapply( promoter_sets, function(proms){
      all_Deseqs[[lfc_column]][ all_Deseqs$Row.names %in% names(proms) ]
    } )
    boxplot( lfc_by_class,
             notch=FALSE, ylim=c(-12,2), col="white", border=class_borders,
             names=class_labels )
  }

  for( dor_column in c("DORs_lost_Hs_Pt","DORs_lost_Hs_Mm") ){
    boxplot( split(ele_domains_anno[[dor_column]], ele_domains_anno$DORs_anno),
             ylim=c(0,30), notch=TRUE, col="white", border=class_borders,
             names=class_labels )
  }
})

Sequence analysis of the enhancer classes

# phastCons matrix: one row per ATAC peak (row names = peak names), 200 columns that are
# drawn on a -500..500 axis below.
phastCons = readRDS( file=paste0(objects_directory,'phastCons30way_signal_in_5bp_bins_for_all_ATAC_peaks_500Kb_around_summit.Rds') )

# Mean phastCons profile for four enhancer classes; helpers stay inside local().
local({
  summit_offset = seq(-500,500,length.out=200)
  mean_profile = function(regions){
    colMeans( phastCons[ rownames(phastCons) %in% names(regions), ] )
  }

  par(mfrow=c(1,1),mar=c(5,5,1,1))
  plot( summit_offset, mean_profile(conserved_enhancers),
        type="l", col="black", lwd=3,
        xlim=c(-500,500), ylim=c(0.0,0.4),
        xlab="distance from ATAC-seq peak summit",
        ylab="PhastCons" )
  lines( summit_offset, mean_profile(enhancers_linked_with_activation), type="l", col="turquoise4", lwd=3 )
  lines( summit_offset, mean_profile(enhancers_not_linked_with_activation), type="l", col="gray80", lwd=3 )
  lines( summit_offset, mean_profile(genuine_lost_enhancers_gr), type="l", col="red", lwd=3 )
  axis(1,lwd=2)
  axis(2,lwd=2)
  box(col="black",lwd=2)
  # dashed vertical line at the summit position
  abline(v=0,lwd=2,lty=2,col="gray")
})

# Welch t-test of the phastCons signal in column 95 (about -28 on the -500..500 axis of the
# profile plot): conserved enhancers vs. enhancers linked with activation.
# The call is kept verbatim because the printed "data:" line echoes these expressions.
t.test(phastCons[rownames(phastCons) %in% names(conserved_enhancers),95],
       phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation),95])

## 
##  Welch Two Sample t-test
## 
## data:  phastCons[rownames(phastCons) %in% names(conserved_enhancers), 95] and phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation), 95]
## t = 18.66, df = 3400.3, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1738891 0.2147229
## sample estimates:
## mean of x mean of y 
## 0.3625148 0.1682088

# Welch t-test of the phastCons signal in column 95 (about -28 on the -500..500 axis of the
# profile plot): enhancers not linked vs. linked with activation.
# The call is kept verbatim because the printed "data:" line echoes these expressions.
t.test(phastCons[rownames(phastCons) %in% names(enhancers_not_linked_with_activation),95],
       phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation),95])

## 
##  Welch Two Sample t-test
## 
## data:  phastCons[rownames(phastCons) %in% names(enhancers_not_linked_with_activation), 95] and phastCons[rownames(phastCons) %in% names(enhancers_linked_with_activation), 95]
## t = 4.7247, df = 3272.1, p-value = 0.000002401
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.02917625 0.07056904
## sample estimates:
## mean of x mean of y 
## 0.2180814 0.1682088

Stepwise gain in enhancer activity

# Combine both DESeq2 result tables by peak name. After the merge, columns suffixed ".x"
# come from res_HS_PT and columns suffixed ".y" from res_HS_MM.
DOR_Deseq2 = merge(as.data.frame(res_HS_PT),
                   as.data.frame(res_HS_MM),
                   by="row.names")

## all putative enhancers
seqlevelsStyle(hs_atac) = "UCSC"
# Peaks overlapping neither promoters_tss_gr nor hs_me3. A logical mask replaces
# x[-unique(queryHits(...))], which would return an empty object if nothing overlapped.
all_primate_enhancers = hs_atac[ !overlapsAny(hs_atac, c(promoters_tss_gr,hs_me3)) ]

# Restrict to those peaks and require an adjusted p-value in both comparisons
changed_enhancers_ATAC_signal_change = DOR_Deseq2[DOR_Deseq2$Row.names %in% names(all_primate_enhancers),]
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[ !is.na(changed_enhancers_ATAC_signal_change$padj.x),]
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[ !is.na(changed_enhancers_ATAC_signal_change$padj.y),]

# Keep peaks below the cutoff in at least one comparison.
# NOTE(review): the cutoff is sqrt(0.1) (~0.316), not 0.1 as elsewhere - confirm this is intended.
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[ changed_enhancers_ATAC_signal_change$padj.x<sqrt(0.1) | changed_enhancers_ATAC_signal_change$padj.y<sqrt(0.1), ]
# Drop rows whose two log2 fold changes are exactly identical
changed_enhancers_ATAC_signal_change = changed_enhancers_ATAC_signal_change[! changed_enhancers_ATAC_signal_change$log2FoldChange.x==changed_enhancers_ATAC_signal_change$log2FoldChange.y,]

# Absolute ATAC log2 fold changes in the two comparisons (.x and .y columns), outliers hidden
par(mfrow=c(1,1),mar=c(4,4,1,1))
with( changed_enhancers_ATAC_signal_change,
      boxplot( abs(log2FoldChange.x),
               abs(log2FoldChange.y),
               names=c("Hs vs. Pt","Hs vs. Mm"),
               ylab=expression("Human/NHP [log"[2]*")]"),
               outline=FALSE, ylim=c(0,7),
               col="white", border=c("red","blue"), lwd=2 ) )
# redraw both axes and the frame with thicker lines
axis(1,lwd=2, at=c(1,2),labels=c("Hs vs. Pt","Hs vs. Mm"))
axis(2,lwd=2)
box(col="black",lwd=2)

par(mfrow=c(1,1),pty='s')
# Density-coloured scatter of the two ATAC log2 fold changes.
# x = log2FoldChange.x, which comes from res_HS_PT (human vs. chimp);
# y = log2FoldChange.y, which comes from res_HS_MM (human vs. macaque).
# The axis labels were previously swapped relative to the plotted data.
heatscatter( changed_enhancers_ATAC_signal_change$log2FoldChange.x,
             changed_enhancers_ATAC_signal_change$log2FoldChange.y,
             colpal="blues",cex=0.5, 
             xlab=expression("Human/chimp [log"[2]*")]"),
             ylab=expression("Human/macaque [log"[2]*")]"),
             ylim=c(-10,10),
             xlim=c(-10,10))
axis(1,lwd=2)
axis(2,lwd=2)

Regulomes are less correlated than transcriptomes

# Pearson correlation of the hs_pt and hs_mm LFCs over peaks with adjusted p-value < 0.1
# in BOTH comparisons. The call is kept verbatim because the printed "data:" line echoes it.
cor.test(res_HS_NHP$hs_pt_LFC[res_HS_NHP$hs_pt_Padj<0.1 & res_HS_NHP$hs_mm_Padj<0.1],
         res_HS_NHP$hs_mm_LFC[res_HS_NHP$hs_pt_Padj<0.1 & res_HS_NHP$hs_mm_Padj<0.1] )

## 
##  Pearson's product-moment correlation
## 
## data:  res_HS_NHP$hs_pt_LFC[res_HS_NHP$hs_pt_Padj < 0.1 & res_HS_NHP$hs_mm_Padj < 0.1] and res_HS_NHP$hs_mm_LFC[res_HS_NHP$hs_pt_Padj < 0.1 & res_HS_NHP$hs_mm_Padj < 0.1]
## t = 241.55, df = 17201, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8753683 0.8821743
## sample estimates:
##      cor 
## 0.878816

# Pearson correlation of log2FoldChange.x and log2FoldChange.y from all_Deseqs.
# NOTE(review): the filter combines an ADJUSTED p-value (padj.x) with a RAW p-value (pvalue.y);
# padj.y may have been intended - confirm. Changing it would alter the printed result below.
cor.test(all_Deseqs$log2FoldChange.x[all_Deseqs$padj.x<0.01 & all_Deseqs$pvalue.y<0.01],
         all_Deseqs$log2FoldChange.y[all_Deseqs$padj.x<0.01 & all_Deseqs$pvalue.y<0.01] )

## 
##  Pearson's product-moment correlation
## 
## data:  all_Deseqs$log2FoldChange.x[all_Deseqs$padj.x < 0.01 & all_Deseqs$pvalue.y < 0.01] and all_Deseqs$log2FoldChange.y[all_Deseqs$padj.x < 0.01 & all_Deseqs$pvalue.y < 0.01]
## t = 49.93, df = 1680, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7529456 0.7914822
## sample estimates:
##       cor 
## 0.7729257

# Correlation of the two fold changes with 99% confidence intervals:
# gene expression (log_fold_dat) vs. ATAC signal (changed_enhancers_ATAC_signal_change)
expressionRNA = cor.test( log_fold_dat$HSvPT_lfc_shrunk,
                          log_fold_dat$HSvMM_lfc,
                          conf.level = 0.99 )
regulomeATAC = cor.test( changed_enhancers_ATAC_signal_change$log2FoldChange.x,
                         changed_enhancers_ATAC_signal_change$log2FoldChange.y,
                         conf.level = 0.99 )
par(pty="m")
local({
  # barplot() returns the bar midpoints (0.7 and 1.9 for two default-width bars);
  # the confidence intervals are drawn as vertical segments at those positions
  bar_mid = barplot( c(expressionRNA$estimate,
                       regulomeATAC$estimate),
                     names.arg=c("gene expression","ATAC"),
                     col=c('green4','steelblue'), ylim=c(0,1) )
  segments( bar_mid[1], expressionRNA$conf.int[[1]], bar_mid[1], expressionRNA$conf.int[[2]] )
  segments( bar_mid[2], regulomeATAC$conf.int[[1]], bar_mid[2], regulomeATAC$conf.int[[2]] )
})
axis(2,lwd=2)

Both linked and non-linked enhancers have a promoter within 500 kb

# 1 Mb windows centred on each enhancer, i.e. 500 kb to either side
linked_500 = GenomicRanges::resize( enhancers_linked_with_activation,
                                    width=1000000, fix="center" )
not_linked_500 = GenomicRanges::resize( enhancers_not_linked_with_activation,
                                        width=1000000, fix="center" )
# number of linked-enhancer windows overlapping at least one range of promoters_filtered_gr
sum( countOverlaps(linked_500, promoters_filtered_gr) >= 1 )

## [1] 1443

# same for the not-linked enhancers
sum( countOverlaps(not_linked_500, promoters_filtered_gr) >= 1 )

## [1] 2675

Average profile

Enhancers that do something are more open

# Human ATAC-seq and H3K27ac ChIP-seq bigWig tracks, seqlevels switched to UCSC style
atac_hs_bw = import.bw( con=paste0(outputs_directory,"ATAC_Seq_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_RPGC.bw") )
seqlevelsStyle(atac_hs_bw) = "ucsc"
k27_hs_bw = import.bw( con=paste0(outputs_directory,"ChIP_Seq_H3K27ac_12-22_HomSap_i-Astro_ELE10-30_merged_hg38_RPGC.bw") )
seqlevelsStyle(k27_hs_bw) = "ucsc"

# Enhancers linked with activation: ranges named by the BED name column, then signal per bin
linked_GR = import.bed( con=paste0(outputs_directory,"enhancers_linked_with_activation.bed") )
names(linked_GR) = linked_GR$name
linked_GR_AP = GetAPRangesForGenomicRangesObject(linked_GR)
linked_atac_hs = getSignalInBins( linked_GR_AP, atac_hs_bw, 1 )
linked_k27_hs = getSignalInBins( linked_GR_AP, k27_hs_bw, 1 )

# Enhancers not linked with activation: same steps
not_linked_GR = import.bed( con=paste0(outputs_directory,"enhancers_not_linked_with_activation.bed") )
names(not_linked_GR) = not_linked_GR$name
not_linked_GR_AP = GetAPRangesForGenomicRangesObject(not_linked_GR)
not_linked_atac_hs = getSignalInBins( not_linked_GR_AP, atac_hs_bw, 1 )
not_linked_k27_hs = getSignalInBins( not_linked_GR_AP, k27_hs_bw, 1 )

# Side-by-side mean profiles (linked = turquoise4, not linked = gray) for ATAC and H3K27ac.
# The drawing helper lives inside local() so it does not leak into the workspace.
local({
  summit_offset = seq(-1000,1000,length.out=200)
  draw_profile_pair = function(linked_signal, not_linked_signal, ...){
    plot( summit_offset, colMeans(linked_signal),
          col="turquoise4", type='l', lwd=2,
          xlab="Distance from the DOR summit", ... )
    lines( summit_offset, colMeans(not_linked_signal), col="gray", lwd=2 )
    axis(1,lwd=2)
    axis(2,lwd=2)
    box(col="black",lwd=2)
  }

  par(mfrow=c(1,2),mar=c(5,5,1,1),pty="m")
  draw_profile_pair( linked_atac_hs, not_linked_atac_hs,
                     ylab="ATAC-seq signal (RPGC)" )
  draw_profile_pair( linked_k27_hs, not_linked_k27_hs,
                     ylab="H3K27ac ChIP-seq signal (RPGC)", ylim=c(0,6) )
})

Names of all the TFs in the HOCOMOCO database.

# File names of all TFBS BED files, collected from the seven ~/human_beds sub-directories
AllTFs = c(list.files('~/human_beds/A'), 
           unlist( lapply( as.list(paste0("~/human_beds/A-kopia",c('',2,3,4,5,6))), function(x){list.files(x)} ) ) )
# Strip the ".bed" extension. An anchored, escaped pattern is used because strsplit(x, '.bed')
# treats "." as "any character" and may cut inside a name or return a different number of pieces.
TFs = unique( sub("\\.bed$", "", AllTFs) )

TFsEnsemblG = read.delim( file=paste0(outputs_directory,'TFsymbol_fixed.txt'),as.is=TRUE )
# Ensembl gene id looked up via the HGNC symbol in the "Fixed" column
TFsEnsemblG$eig = genemapu$ensembl_gene_id[match(TFsEnsemblG$Fixed,genemapu$hgnc_symbol)]
# NOTE(review): the assignments below assume that the rows of TFsymbol_fixed.txt are in the
# same order (and number) as AllTFs - confirm.
TFsEnsemblG$names = AllTFs
# sub() always returns exactly one value per file, so these columns cannot get out of step
TFsEnsemblG$names2 = sub("\\.bed$", "", AllTFs)
TFsEnsemblG$names3 = paste0( sub("\\.bed$", "", AllTFs), "_HG38.bed" )
save(TFsEnsemblG,file=paste0(objects_directory,"TFsEnsemblG.RData"))

Load objects

# Table read without a header row; strings are kept as character
human_stripe_factors = read.delim( file=paste0(objects_directory = outputs_directory,"human_stripe_factors.txt")[1],
                                   header=FALSE, as.is=TRUE )

Evolutionary changes in TFBS - preparations

Align the chosen enhancer groups to chimp

cd ~/Documents/Tools/

## human to chimp
## Lift every enhancer set from hg38 to panTro6; for each <set>.bed this writes
## <set>_PanTro.bed plus <set>_PanTro.unmapped.file into the same outputs directory.
outputs=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs
chain=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/chain_files/hg38ToPanTro6.over.chain

for set in \
    enhancers_linked_with_activation \
    enhancers_not_linked_with_activation \
    enhancers_linked_with_activation_TADs \
    enhancers_not_linked_with_activation_TADs \
    conserved_enhancers \
    lost_enhancers_linked_with_activation \
    lost_enhancers_not_linked_with_activation
do
  ./liftOver -minMatch=0.5 -bedPlus=6 -tab "${outputs}/${set}.bed" "${chain}" "${outputs}/${set}_PanTro.bed" "${outputs}/${set}_PanTro.unmapped.file"
done

Get sequences for the enhancers

# Chromosome names kept when reading the lifted-over (panTro6) BED files: chr1, chr2A, chr2B, chr3..chr22, chrX
panTro6_main_chroms = paste0('chr',c(1,'2A','2B', 3:22,'X'))

# enhancers linked / not linked with activation
enhancers_linked_with_activation_pt = readBed_filterChroms( paste0(outputs_directory,'enhancers_linked_with_activation_PanTro.bed'),
                                                            chroms=panTro6_main_chroms, 4 )
enhancers_not_linked_with_activation_pt = readBed_filterChroms( paste0(outputs_directory,'enhancers_not_linked_with_activation_PanTro.bed'),
                                                                chroms=panTro6_main_chroms, 4 )

# the TAD-based versions of the same two sets
enhancers_linked_with_activation_TADs_pt = readBed_filterChroms( paste0(outputs_directory,'enhancers_linked_with_activation_TADs_PanTro.bed'),
                                                                 chroms=panTro6_main_chroms, 4 )
enhancers_not_linked_with_activation_TADs_pt = readBed_filterChroms( paste0(outputs_directory,'enhancers_not_linked_with_activation_TADs_PanTro.bed'),
                                                                     chroms=panTro6_main_chroms, 4 )


# conserved enhancers
conserved_enhancers_pt = readBed_filterChroms( paste0(outputs_directory,'conserved_enhancers_PanTro.bed'),
                                               chroms=panTro6_main_chroms, 4 )

# lost enhancers, linked / not linked
lost_linked_pt = readBed_filterChroms( paste0(outputs_directory,'lost_enhancers_linked_with_activation_PanTro.bed'),
                                       chroms=panTro6_main_chroms, 4 )
lost_not_linked_pt = readBed_filterChroms( paste0(outputs_directory,'lost_enhancers_not_linked_with_activation_PanTro.bed'),
                                           chroms=panTro6_main_chroms, 4 )

## checks
# Reorder/subset the human enhancers so that they match, one to one, the chimp ranges read
# from the lifted-over BED file, then confirm that the names agree.
enhancers_linked_with_activation = enhancers_linked_with_activation[match(names(enhancers_linked_with_activation_pt),names(enhancers_linked_with_activation))]
all(names(enhancers_linked_with_activation_pt)==names(enhancers_linked_with_activation))

## [1] TRUE

# The not-linked set is aligned in the same way as every other set before it is compared;
# previously it was compared as-is, which only works if liftOver kept every range in order.
enhancers_not_linked_with_activation = enhancers_not_linked_with_activation[match(names(enhancers_not_linked_with_activation_pt),names(enhancers_not_linked_with_activation))]
all(names(enhancers_not_linked_with_activation_pt)==names(enhancers_not_linked_with_activation))

## [1] TRUE

# Lost enhancers: put the human ranges into the order of their chimp counterparts
# (matched through the BED name column), then verify the pairing.
genuine_lost_enhancers_that_do_something = genuine_lost_enhancers_that_do_something[
  match( names(lost_linked_pt), genuine_lost_enhancers_that_do_something$name ) ]
genuine_lost_enhancers_that_do_nothing = genuine_lost_enhancers_that_do_nothing[
  match( names(lost_not_linked_pt), genuine_lost_enhancers_that_do_nothing$name ) ]
all( names(lost_linked_pt) == genuine_lost_enhancers_that_do_something$name )

## [1] TRUE

all( names(lost_not_linked_pt) == genuine_lost_enhancers_that_do_nothing$name )

## [1] TRUE

# name the human ranges by their BED name column
names(genuine_lost_enhancers_that_do_something) = genuine_lost_enhancers_that_do_something$name
names(genuine_lost_enhancers_that_do_nothing) = genuine_lost_enhancers_that_do_nothing$name

# TAD-based sets: same alignment, matched through the range names
enhancers_linked_with_activation_TADs = enhancers_linked_with_activation_TADs[
  match( names(enhancers_linked_with_activation_TADs_pt), names(enhancers_linked_with_activation_TADs) ) ]
enhancers_not_linked_with_activation_TADs = enhancers_not_linked_with_activation_TADs[
  match( names(enhancers_not_linked_with_activation_TADs_pt), names(enhancers_not_linked_with_activation_TADs) ) ]
all( names(enhancers_linked_with_activation_TADs_pt) == names(enhancers_linked_with_activation_TADs) )

## [1] TRUE

all( names(enhancers_not_linked_with_activation_TADs_pt) == names(enhancers_not_linked_with_activation_TADs) )

## [1] TRUE

# Genomic sequence of every enhancer set: human ranges from hg38, chimp ranges from panTro6
genome_hg38 = BSgenome.Hsapiens.UCSC.hg38
genome_panTro6 = BSgenome.Ptroglodytes.UCSC.panTro6

# enhancers linked / not linked with activation
enhancers_linked_with_activation_seq_Hs = Biostrings::getSeq( genome_hg38, enhancers_linked_with_activation )
enhancers_linked_with_activation_seq_Pt = Biostrings::getSeq( genome_panTro6, enhancers_linked_with_activation_pt )
enhancers_not_linked_with_activation_seq_Hs = Biostrings::getSeq( genome_hg38, enhancers_not_linked_with_activation )
enhancers_not_linked_with_activation_seq_Pt = Biostrings::getSeq( genome_panTro6, enhancers_not_linked_with_activation_pt )

# TAD-based sets
enhancers_linked_with_activation_TADs_seq_Hs = Biostrings::getSeq( genome_hg38, enhancers_linked_with_activation_TADs )
enhancers_linked_with_activation_TADs_seq_Pt = Biostrings::getSeq( genome_panTro6, enhancers_linked_with_activation_TADs_pt )
enhancers_not_linked_with_activation_TADs_seq_Hs = Biostrings::getSeq( genome_hg38, enhancers_not_linked_with_activation_TADs )
enhancers_not_linked_with_activation_TADs_seq_Pt = Biostrings::getSeq( genome_panTro6, enhancers_not_linked_with_activation_TADs_pt )


# conserved and lost enhancers
conserved_enhancers_seq_Hs = Biostrings::getSeq( genome_hg38, conserved_enhancers )
conserved_enhancers_seq_Pt = Biostrings::getSeq( genome_panTro6, conserved_enhancers_pt )
lost_enhancers_linked_seq_Hs = Biostrings::getSeq( genome_hg38, genuine_lost_enhancers_that_do_something )
lost_enhancers_linked_seq_Pt = Biostrings::getSeq( genome_panTro6, lost_linked_pt )
lost_enhancers_not_linked_seq_Hs = Biostrings::getSeq( genome_hg38, genuine_lost_enhancers_that_do_nothing )
lost_enhancers_not_linked_seq_Pt = Biostrings::getSeq( genome_panTro6, lost_not_linked_pt )

Enhancers that do something

Now let’s find the evolutionary mismatches between sequences. We compare human to chimp sequences.

# For every linked enhancer (looked up by name in all four objects), compare the human and
# chimp range/sequence with Figure_out_mismatching_sequences(); runs on 4 cores.
test = mclapply( as.list(names(enhancers_linked_with_activation)),
                 function(enhancer_id){
                   hs_range = enhancers_linked_with_activation[ which(names(enhancers_linked_with_activation)==enhancer_id) ]
                   hs_seq = enhancers_linked_with_activation_seq_Hs[ which(names(enhancers_linked_with_activation_seq_Hs)==enhancer_id) ]
                   pt_range = enhancers_linked_with_activation_pt[ which(names(enhancers_linked_with_activation_pt)==enhancer_id) ]
                   pt_seq = enhancers_linked_with_activation_seq_Pt[ which(names(enhancers_linked_with_activation_seq_Pt)==enhancer_id) ]
                   Figure_out_mismatching_sequences( hs_range, hs_seq, pt_range, pt_seq )
                 },
                 mc.cores = 4L )

# stack the per-enhancer results row-wise and store them
enhancers_linked_with_activation_hs_vs_Pt = do.call(rbind, test)
save( enhancers_linked_with_activation_hs_vs_Pt,
      file=paste0(objects_directory,"enhancers_linked_with_activation_hs_vs_Pt.RData") )

# Mismatch table -> GRanges (metadata column "kind" holds the "type" column),
# named by the "names" column.
enhancers_linked_with_activation_hs_vs_Pt_gr = with( enhancers_linked_with_activation_hs_vs_Pt,
                                                     GRanges( seqnames = Rle(seqnames),
                                                              ranges = IRanges( start, end = end ),
                                                              kind = type ) )
names(enhancers_linked_with_activation_hs_vs_Pt_gr) = enhancers_linked_with_activation_hs_vs_Pt$names
# BED export with the seqlevels as they are at this point (file suffix "_ucsc")
export.bed( enhancers_linked_with_activation_hs_vs_Pt_gr,
            con=paste0(outputs_directory,"enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed") )

# switch seqlevels to NCBI style and export again as GFF and BED
seqlevelsStyle(enhancers_linked_with_activation_hs_vs_Pt_gr) = "ncbi"
export.gff( enhancers_linked_with_activation_hs_vs_Pt_gr,
            con=paste0(outputs_directory,"enhancers_linked_with_activation_hs_vs_Pt.gtf") )
export.bed( enhancers_linked_with_activation_hs_vs_Pt_gr,
            con=paste0(outputs_directory,"enhancers_linked_with_activation_hs_vs_Pt.bed") )

Intersect with bedtools

cd ~/Documents/Tools/bedtools2/

## ------------
## Intersect the human/chimp mismatches with the human TFBS bed files, one run per TFBS
## directory (A, A-kopia, A-kopia2 .. A-kopia6 -> output suffix A, A1, A2 .. A6).
query=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt.bed
tfbs_root=/Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds
out_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation

for pair in A:A A-kopia:A1 A-kopia2:A2 A-kopia3:A3 A-kopia4:A4 A-kopia5:A5 A-kopia6:A6; do
  tfbs_dir=${pair%%:*}
  tag=${pair##*:}
  ./bin/intersectBed -a "${query}" -b "${tfbs_root}/${tfbs_dir}"/*.bed -C -filenames > "${out_dir}/hs_specieis_enhancers_${tag}.bed"
done

Intersect with chimp TFBS lifted over to the Hg38 genome assembly

cd ~/Documents/Tools/bedtools2/

## ------------
## Intersect the human/chimp mismatches (UCSC-style bed) with the chimp TFBS bed files lifted
## over to hg38, one run per directory (A, A-kopia, A-kopia2 .. A-kopia7 -> suffix A, A1 .. A7).
query=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_hs_vs_Pt_ucsc.bed
tfbs_root=/Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds
out_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp

for pair in A:A A-kopia:A1 A-kopia2:A2 A-kopia3:A3 A-kopia4:A4 A-kopia5:A5 A-kopia6:A6 A-kopia7:A7; do
  tfbs_dir=${pair%%:*}
  tag=${pair##*:}
  ./bin/intersectBed -a "${query}" -b "${tfbs_root}/${tfbs_dir}"/*.bed -C -filenames > "${out_dir}/hs_specieis_enhancers_${tag}.bed"
done

Enhancers that do nothing

# Human-vs-chimp mismatch search for enhancers NOT linked with activation.
# For every enhancer name, the matching entries of the four parallel objects
# (human ranges, human sequence, chimp ranges, chimp sequence) are handed to
# Figure_out_mismatching_sequences(); the work is spread over 4 forked workers.
norole = mclapply(
  as.list(names(enhancers_not_linked_with_activation)),
  function(enh) {
    # Keep only the element(s) of a named object whose name equals `enh`.
    by_name = function(x) x[which(names(x) == enh)]
    Figure_out_mismatching_sequences(
      by_name(enhancers_not_linked_with_activation),
      by_name(enhancers_not_linked_with_activation_seq_Hs),
      by_name(enhancers_not_linked_with_activation_pt),
      by_name(enhancers_not_linked_with_activation_seq_Pt)
    )
  },
  mc.cores = 4L
)

# Stack the per-enhancer results into one table and keep a copy on disk.
enhancers_not_linked_with_activation_hs_vs_Pt = do.call("rbind", norole)
save(
  enhancers_not_linked_with_activation_hs_vs_Pt,
  file = paste0(objects_directory, "enhancers_not_linked_with_activation_hs_vs_Pt.RData")
)

# Turn the table into genomic ranges; the `type` column travels along as the
# metadata column `kind`, and the `names` column becomes the range names.
enhancers_not_linked_with_activation_hs_vs_Pt_gr = GRanges(
  seqnames = Rle(enhancers_not_linked_with_activation_hs_vs_Pt$seqnames),
  ranges = IRanges(
    start = enhancers_not_linked_with_activation_hs_vs_Pt$start,
    end = enhancers_not_linked_with_activation_hs_vs_Pt$end
  ),
  kind = enhancers_not_linked_with_activation_hs_vs_Pt$type
)
names(enhancers_not_linked_with_activation_hs_vs_Pt_gr) = enhancers_not_linked_with_activation_hs_vs_Pt$names

# Switch to NCBI-style sequence names before writing the GTF and BED files.
seqlevelsStyle(enhancers_not_linked_with_activation_hs_vs_Pt_gr) = "ncbi"
export.gff(
  enhancers_not_linked_with_activation_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_not_linked_with_activation_hs_vs_Pt.gtf")
)
export.bed(
  enhancers_not_linked_with_activation_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_not_linked_with_activation_hs_vs_Pt.bed")
)

Intersect with bedtools

cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the not-linked-with-activation BED with the human TFBS BED files,
# one intersectBed run per motif directory (A, A-kopia, A-kopia2 ... A-kopia6).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_hs_vs_Pt.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

Enhancers that do something - TADs

Now let’s find the evolutionary mismatches between sequences. We compare human to chimp sequences.

# Human-vs-chimp mismatch search for TAD-based enhancers linked with
# activation. Each enhancer name selects the matching entries of the human
# ranges, human sequence, chimp ranges and chimp sequence objects, which are
# passed to Figure_out_mismatching_sequences() on 4 forked workers.
TADs_enh_linked = mclapply(
  as.list(names(enhancers_linked_with_activation_TADs)),
  function(enh) {
    # Keep only the element(s) of a named object whose name equals `enh`.
    by_name = function(x) x[which(names(x) == enh)]
    Figure_out_mismatching_sequences(
      by_name(enhancers_linked_with_activation_TADs),
      by_name(enhancers_linked_with_activation_TADs_seq_Hs),
      by_name(enhancers_linked_with_activation_TADs_pt),
      by_name(enhancers_linked_with_activation_TADs_seq_Pt)
    )
  },
  mc.cores = 4L
)

# Stack the per-enhancer results into one table and keep a copy on disk.
# NOTE(review): the file name starts with "eenhancers" (double "e"); it looks
# like a typo, but it is kept as-is here in case a later load() relies on it.
enhancers_linked_with_activation_TADs_hs_vs_Pt = do.call("rbind", TADs_enh_linked)
save(
  enhancers_linked_with_activation_TADs_hs_vs_Pt,
  file = paste0(objects_directory, "eenhancers_linked_with_activation_TADs_hs_vs_Pt.RData")
)

# Turn the table into genomic ranges; `type` becomes the metadata column
# `kind`, and the `names` column becomes the range names.
enhancers_linked_with_activation_TADs_hs_vs_Pt_gr = GRanges(
  seqnames = Rle(enhancers_linked_with_activation_TADs_hs_vs_Pt$seqnames),
  ranges = IRanges(
    start = enhancers_linked_with_activation_TADs_hs_vs_Pt$start,
    end = enhancers_linked_with_activation_TADs_hs_vs_Pt$end
  ),
  kind = enhancers_linked_with_activation_TADs_hs_vs_Pt$type
)
names(enhancers_linked_with_activation_TADs_hs_vs_Pt_gr) = enhancers_linked_with_activation_TADs_hs_vs_Pt$names

# First BED export happens BEFORE the seqlevels style is changed, so the
# "_ucsc" file keeps the sequence names exactly as they are in the table.
export.bed(
  enhancers_linked_with_activation_TADs_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_linked_with_activation_TADs_hs_vs_Pt_ucsc.bed")
)

# Switch to NCBI-style sequence names for the remaining GTF and BED exports.
seqlevelsStyle(enhancers_linked_with_activation_TADs_hs_vs_Pt_gr) = "ncbi"
export.gff(
  enhancers_linked_with_activation_TADs_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_linked_with_activation_TADs_hs_vs_Pt.gtf")
)
export.bed(
  enhancers_linked_with_activation_TADs_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_linked_with_activation_TADs_hs_vs_Pt.bed")
)

Intersect with bedtools

cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the TAD-based linked-with-activation BED with the human TFBS BED
# files, one intersectBed run per motif directory (A, A-kopia ... A-kopia6).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_linked_with_activation_TADs_hs_vs_Pt.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_linked_with_activation_TADs

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

Enhancers that do nothing - TAD

# Human-vs-chimp mismatch search for TAD-based enhancers NOT linked with
# activation. Each enhancer name selects the matching entries of the human
# ranges, human sequence, chimp ranges and chimp sequence objects, which are
# passed to Figure_out_mismatching_sequences() on 4 forked workers.
TADs_enh__not_linked = mclapply(
  as.list(names(enhancers_not_linked_with_activation_TADs)),
  function(enh) {
    # Keep only the element(s) of a named object whose name equals `enh`.
    by_name = function(x) x[which(names(x) == enh)]
    Figure_out_mismatching_sequences(
      by_name(enhancers_not_linked_with_activation_TADs),
      by_name(enhancers_not_linked_with_activation_TADs_seq_Hs),
      by_name(enhancers_not_linked_with_activation_TADs_pt),
      by_name(enhancers_not_linked_with_activation_TADs_seq_Pt)
    )
  },
  mc.cores = 4L
)

# Stack the per-enhancer results into one table and keep a copy on disk.
enhancers_not_linked_with_activation_TADs_hs_vs_Pt = do.call("rbind", TADs_enh__not_linked)
save(
  enhancers_not_linked_with_activation_TADs_hs_vs_Pt,
  file = paste0(objects_directory, "enhancers_not_linked_with_activation_TADs_hs_vs_Pt.RData")
)

# Turn the table into genomic ranges; `type` becomes the metadata column
# `kind`, and the `names` column becomes the range names.
enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr = GRanges(
  seqnames = Rle(enhancers_not_linked_with_activation_TADs_hs_vs_Pt$seqnames),
  ranges = IRanges(
    start = enhancers_not_linked_with_activation_TADs_hs_vs_Pt$start,
    end = enhancers_not_linked_with_activation_TADs_hs_vs_Pt$end
  ),
  kind = enhancers_not_linked_with_activation_TADs_hs_vs_Pt$type
)
names(enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr) = enhancers_not_linked_with_activation_TADs_hs_vs_Pt$names

# Switch to NCBI-style sequence names before writing the GTF and BED files.
seqlevelsStyle(enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr) = "ncbi"
export.gff(
  enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_not_linked_with_activation_TADs_hs_vs_Pt.gtf")
)
export.bed(
  enhancers_not_linked_with_activation_TADs_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed")
)

Intersect with bedtools.

cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the TAD-based not-linked-with-activation BED with the human TFBS
# BED files, one intersectBed run per motif directory (A, A-kopia ... A-kopia6).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/enhancers_not_linked_with_activation_TADs_hs_vs_Pt.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/enhancers_not_linked_with_activation_TADs

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

Conserved enhancers

# Human-vs-chimp mismatch search for conserved enhancers. Each enhancer name
# selects the matching entries of the human ranges, human sequence, chimp
# ranges and chimp sequence objects, which are passed to
# Figure_out_mismatching_sequences() on 4 forked workers.
cons = mclapply(
  as.list(names(conserved_enhancers)),
  function(enh) {
    # Keep only the element(s) of a named object whose name equals `enh`.
    by_name = function(x) x[which(names(x) == enh)]
    Figure_out_mismatching_sequences(
      by_name(conserved_enhancers),
      by_name(conserved_enhancers_seq_Hs),
      by_name(conserved_enhancers_pt),
      by_name(conserved_enhancers_seq_Pt)
    )
  },
  mc.cores = 4L
)

# Stack the per-enhancer results into one table and keep a copy on disk.
conserved_enhancers_hs_vs_Pt = do.call("rbind", cons)
save(
  conserved_enhancers_hs_vs_Pt,
  file = paste0(objects_directory, "conserved_enhancers_hs_vs_Pt.RData")
)

# Turn the table into genomic ranges; `type` becomes the metadata column
# `kind`, and the `names` column becomes the range names.
conserved_enhancers_hs_vs_Pt_gr = GRanges(
  seqnames = Rle(conserved_enhancers_hs_vs_Pt$seqnames),
  ranges = IRanges(
    start = conserved_enhancers_hs_vs_Pt$start,
    end = conserved_enhancers_hs_vs_Pt$end
  ),
  kind = conserved_enhancers_hs_vs_Pt$type
)
names(conserved_enhancers_hs_vs_Pt_gr) = conserved_enhancers_hs_vs_Pt$names

# First BED export happens BEFORE the seqlevels style is changed, so the
# "_ucsc" file keeps the sequence names exactly as they are in the table.
export.bed(
  conserved_enhancers_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "conserved_enhancers_hs_vs_Pt_ucsc.bed")
)

# Switch to NCBI-style sequence names for the remaining GTF and BED exports.
seqlevelsStyle(conserved_enhancers_hs_vs_Pt_gr) = "ncbi"
export.gff(
  conserved_enhancers_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "conserved_enhancers_hs_vs_Pt.gtf")
)
export.bed(
  conserved_enhancers_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "conserved_enhancers_hs_vs_Pt.bed")
)

Intersect with bedtools.

cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the conserved-enhancer BED with the human TFBS BED files, one
# intersectBed run per motif directory (A, A-kopia, A-kopia2 ... A-kopia6).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/TFBS/human_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Hs

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

Intersect with chimp TFBS lifted over to the Hg38 genome assembly

# NOTE(review): this chunk originally used `cd ~/Tools/bedtools2/`, while the
# other bedtools chunks in this document use ~/Documents/Tools/bedtools2/.
# Aligned here so ./bin/intersectBed resolves the same way everywhere;
# confirm against the actual install location.
cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the UCSC-style conserved-enhancer BED with the chimp motif BED
# files lifted over to human, one intersectBed run per motif directory
# (A, A-kopia, A-kopia2 ... A-kopia7).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/conserved_enhancers_hs_vs_Pt_ucsc.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/conserved_enhancers_Pt

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6 7; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

Lost enhancers that do something

# Human-vs-chimp mismatch search for lost enhancers that are linked
# ("do something"). Each enhancer name selects the matching entries of the
# human ranges, human sequence, chimp ranges and chimp sequence objects, which
# are passed to Figure_out_mismatching_sequences() on 4 forked workers.
lostE = mclapply(
  as.list(names(genuine_lost_enhancers_that_do_something)),
  function(enh) {
    # Keep only the element(s) of a named object whose name equals `enh`.
    by_name = function(x) x[which(names(x) == enh)]
    Figure_out_mismatching_sequences(
      by_name(genuine_lost_enhancers_that_do_something),
      by_name(lost_enhancers_linked_seq_Hs),
      by_name(lost_linked_pt),
      by_name(lost_enhancers_linked_seq_Pt)
    )
  },
  mc.cores = 4L
)

# Stack the per-enhancer results into one table and keep a copy on disk.
lost_enhancers_linked_hs_vs_Pt = do.call("rbind", lostE)
save(
  lost_enhancers_linked_hs_vs_Pt,
  file = paste0(objects_directory, "lost_enhancers_linked_hs_vs_Pt.RData")
)

# Turn the table into genomic ranges; `type` becomes the metadata column
# `kind`, and the `names` column becomes the range names.
lost_enhancers_linked_hs_vs_Pt_gr = GRanges(
  seqnames = Rle(lost_enhancers_linked_hs_vs_Pt$seqnames),
  ranges = IRanges(
    start = lost_enhancers_linked_hs_vs_Pt$start,
    end = lost_enhancers_linked_hs_vs_Pt$end
  ),
  kind = lost_enhancers_linked_hs_vs_Pt$type
)
names(lost_enhancers_linked_hs_vs_Pt_gr) = lost_enhancers_linked_hs_vs_Pt$names

# First BED export happens BEFORE the seqlevels style is changed. The suffix
# is spelled "_UCSC" (upper case) because the bedtools commands that consume
# this file, and the sibling export for the not-linked lost enhancers, use
# that spelling; "_ucsc" only resolved on a case-insensitive filesystem.
export.bed(
  lost_enhancers_linked_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed")
)

# Switch to NCBI-style sequence names for the remaining GTF and BED exports.
seqlevelsStyle(lost_enhancers_linked_hs_vs_Pt_gr) = "ncbi"
export.gff(
  lost_enhancers_linked_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "lost_enhancers_linked_hs_vs_Pt_gr.gtf")
)
export.bed(
  lost_enhancers_linked_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "lost_enhancers_linked_hs_vs_Pt_gr.bed")
)

Intersect the positions of mismatches with TFBS inferred for Chimp and lifted over to human.

cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the UCSC-style lost-linked-enhancer BED with the chimp motif BED
# files lifted over to human, one intersectBed run per motif directory
# (A, A-kopia, A-kopia2 ... A-kopia7).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_linked_hs_vs_Pt_gr_UCSC.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_linked_pt

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6 7; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

Lost enhancers that do nothing

# Human-vs-chimp mismatch search for lost enhancers that are NOT linked
# ("do nothing"). Each enhancer name selects the matching entries of the human
# ranges, human sequence, chimp ranges and chimp sequence objects, which are
# passed to Figure_out_mismatching_sequences() on 4 forked workers.
lostI = mclapply(
  as.list(names(genuine_lost_enhancers_that_do_nothing)),
  function(enh) {
    # Keep only the element(s) of a named object whose name equals `enh`.
    by_name = function(x) x[which(names(x) == enh)]
    Figure_out_mismatching_sequences(
      by_name(genuine_lost_enhancers_that_do_nothing),
      by_name(lost_enhancers_not_linked_seq_Hs),
      by_name(lost_not_linked_pt),
      by_name(lost_enhancers_not_linked_seq_Pt)
    )
  },
  mc.cores = 4L
)

# Stack the per-enhancer results into one table and keep a copy on disk.
# NOTE(review): this RData goes to outputs_directory, whereas the sibling
# sections save theirs under objects_directory - confirm which is intended.
lost_enhancers_not_linked_hs_vs_Pt = do.call("rbind", lostI)
save(
  lost_enhancers_not_linked_hs_vs_Pt,
  file = paste0(outputs_directory, "lost_enhancers_not_linked_hs_vs_Pt.RData")
)

# Turn the table into genomic ranges; `type` becomes the metadata column
# `kind`, and the `names` column becomes the range names.
lost_enhancers_not_linked_hs_vs_Pt_gr = GRanges(
  seqnames = Rle(lost_enhancers_not_linked_hs_vs_Pt$seqnames),
  ranges = IRanges(
    start = lost_enhancers_not_linked_hs_vs_Pt$start,
    end = lost_enhancers_not_linked_hs_vs_Pt$end
  ),
  kind = lost_enhancers_not_linked_hs_vs_Pt$type
)
names(lost_enhancers_not_linked_hs_vs_Pt_gr) = lost_enhancers_not_linked_hs_vs_Pt$names

# First BED export happens BEFORE the seqlevels style is changed, so the
# "_UCSC" file keeps the sequence names exactly as they are in the table.
export.bed(
  lost_enhancers_not_linked_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed")
)

# Switch to NCBI-style sequence names for the remaining GTF and BED exports.
seqlevelsStyle(lost_enhancers_not_linked_hs_vs_Pt_gr) = "ncbi"
export.gff(
  lost_enhancers_not_linked_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "lost_enhancers_not_linked_hs_vs_Pt_gr.gtf")
)
export.bed(
  lost_enhancers_not_linked_hs_vs_Pt_gr,
  con = paste0(outputs_directory, "lost_enhancers_not_linked_hs_vs_Pt_gr.bed")
)

Again, intersect the positions of mismatches with TFBS inferred for Chimp and lifted over to human.

cd ~/Documents/Tools/bedtools2/

## ------------
# Intersect the UCSC-style lost-not-linked-enhancer BED with the chimp motif
# BED files lifted over to human, one intersectBed run per motif directory
# (A, A-kopia, A-kopia2 ... A-kopia7).
# See the bedtools intersect documentation for the -C / -filenames output format.
query_bed=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/lost_enhancers_not_linked_hs_vs_Pt_gr_UCSC.bed
motif_root=/Volumes/T7/T7_backup_25_07_2023/Chimp_Motif_liftedOver_to_Humans_beds
result_dir=/Volumes/Backup_4TB/Ciuba_et_all_data_package/data/outputs/TFBS_analysis/lost_enhancers_not_linked_pt

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A.bed

./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A1.bed

# The numbered copies map one-to-one onto the numbered output files.
for n in 2 3 4 5 6 7; do
  ./bin/intersectBed -a "$query_bed" -b "$motif_root"/A-kopia"$n"/*.bed -C -filenames > "$result_dir"/hs_specieis_enhancers_A"$n".bed
done

PLOTS - Sequence analysis

# Read the conserved-enhancer mismatch positions back from the "_ucsc" BED
# file; from here on conserved_enhancers_hs_vs_Pt holds the imported ranges.
conserved_enhancers_hs_vs_Pt = import.bed(
  con = paste0(outputs_directory, "conserved_enhancers_hs_vs_Pt_ucsc.bed")
)

# Read the conserved enhancers themselves and name each range after its BED
# `name` field.
conserved_enhancers = import.bed(
  con = paste0(outputs_directory, "conserved_enhancers.bed")
)
names(conserved_enhancers) = conserved_enhancers$name

# Sort by increasing width, then keep the first range seen for every name,
# i.e. the narrowest range per name survives.
conserved_enhancers = conserved_enhancers[order(width(conserved_enhancers))]
conserved_enhancers = conserved_enhancers[!duplicated(names(conserved_enhancers))]

# Load the bedtools intersection results, one result folder per enhancer
# class. Folders produced against the human TFBS beds go through
# readBedtools_res(); folders produced against the chimp (lifted-over) beds go
# through readBedtools_UCSC(). Every call restricts to chr1-22, X and Y.
# NOTE(review): the trailing positional arguments 4 and 7 are passed through
# unchanged; their meaning is defined by the reader functions - confirm there.
linked_with_activation_TFBSchange = readBedtools_res(
  filePath = paste0(outputs_directory, "TFBS_analysis/enhancers_linked_with_activation/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

linked_with_activation_TFBSchange_chimp = readBedtools_UCSC(
  filePath = paste0(outputs_directory, "TFBS_analysis/enhancers_linked_with_activation_TFBS_chimp/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

not_linked_with_activation_TFBSchange = readBedtools_res(
  filePath = paste0(outputs_directory, "TFBS_analysis/enhancers_not_linked_with_activation/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

conserved_TFBSchange_Hs = readBedtools_res(
  filePath = paste0(outputs_directory, "TFBS_analysis/conserved_enhancers_Hs/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

conserved_TFBSchange_Pt = readBedtools_UCSC(
  filePath = paste0(outputs_directory, "TFBS_analysis/conserved_enhancers_Pt/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

lost_linked = readBedtools_UCSC(
  filePath = paste0(outputs_directory, "TFBS_analysis/lost_enhancers_linked_pt/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

lost_not_linked = readBedtools_UCSC(
  filePath = paste0(outputs_directory, "TFBS_analysis/lost_enhancers_not_linked_pt/"),
  chroms = paste0("chr", c(1:22, "X", "Y")),
  4,
  7
)

#### --------------------------------------------------------------
linked_with_activation_TFBSchange = processTFBSresult(linked_with_activation_TFBSchange,
                                                      tfanno=TFsEnsemblG,
                                                      nameColumn="names")
linked_with_activation_TFBSchange_chimp = processTFBSresult(linked_with_activation_TFBSchange_chimp,
                                                            tfanno=TFsEnsemblG,
                                                            nameColumn = "names3")

not_linked_with_activation_TFBSchange = processTFBSresult(not_linked_with_activation_TFBSchange,
                                                          tfanno=TFsEnsemblG,
                                                          nameColumn="names")


conserved_TFBSchange = processTFBSresult(conserved_TFBSchange_Hs,
                                         tfanno=TFsEnsemblG,
                                         nameColumn = "names")

conserved_TFBSchange_chimp = processTFBSresult(conserved_TFBSchange_Pt,
                                               tfanno=TFsEnsemblG,
                                               nameColumn = "names3")

lost_linked_TFBSchange = processTFBSresult(lost_linked,
                                           tfanno=TFsEnsemblG,
                                           nameColumn = "names3")

lost_not_linked_TFBSchange = processTFBSresult(lost_not_linked,
                                               tfanno=TFsEnsemblG,
                                               nameColumn = "names3")

save( linked_with_activation_TFBSchange, not_linked_with_activation_TFBSchange,linked_with_activation_TFBSchange_chimp,
      conserved_TFBSchange,conserved_TFBSchange_chimp,
      lost_linked_TFBSchange,lost_not_linked_TFBSchange,
      file=paste0(objects_directory,"evolutionary_changes_in_TFBS.RData" ) )
save(linked_with_activation_TFBSchange,file=paste0(objects_directory,"linked_with_activation_TFBSchange.RData"))
save(linked_with_activation_TFBSchange_chimp,file=paste0(objects_directory,"linked_with_activation_TFBSchange_chimp.Rdata"))

load(paste0(objects_directory,"evolutionary_changes_in_TFBS.RData" ))
conserved_enhancers_hs_vs_Pt = import.bed(paste0(outputs_directory,"conserved_enhancers_hs_vs_Pt_ucsc.bed"))
conserved_enhancers = import.bed(paste0(outputs_directory,"conserved_enhancers.bed"))
names(conserved_enhancers) = conserved_enhancers$name
conserved_enhancers = conserved_enhancers[order(width(conserved_enhancers))]
conserved_enhancers = conserved_enhancers[which(!duplicated(names(conserved_enhancers))) ]

Overall conservation of TFBS - take conserved enhancers

conserved_TFBSchange_table = table( conserved_TFBSchange$TF )
conserved_TFBSchange_chimp_table = table(conserved_TFBSchange_chimp$TF)
conserved_TFBSchange_human_table = conserved_TFBSchange_table[ match(names(conserved_TFBSchange_chimp_table),names(conserved_TFBSchange_table))]

all(names(conserved_TFBSchange_human_table)==names(conserved_TFBSchange_chimp_table))

## [1] TRUE

# Tabulate the entries per TF for the activation-linked elements (human and
# chimp objects), then reorder the human counts so that they follow the order
# of the chimp table.
linked_TFBSchange_table = table( linked_with_activation_TFBSchange$TF )
linked_with_activation_TFBSchange_chimp_table = table(linked_with_activation_TFBSchange_chimp$TF)
linked_with_activation_TFBSchange_human_table = linked_TFBSchange_table[ match(names(linked_with_activation_TFBSchange_chimp_table),names(linked_TFBSchange_table))]
# Sanity check of the alignment: the human names must be compared with the
# CHIMP names (comparing the human table with itself is always TRUE).
# useNA="ifany" exposes TFs that are present in chimp but missing in human,
# which match() turns into NA names and table() would otherwise drop silently.
table(names(linked_with_activation_TFBSchange_human_table)==names(linked_with_activation_TFBSchange_chimp_table),
      useNA="ifany")

## 
## TRUE 
##  674

# Per-TF log2 ratio of the human to the chimp table counts, for conserved
# elements (green) and activation-linked elements (turquoise).
# `ylab` is passed exactly once: the earlier duplicate `ylab=""` took
# precedence, so the real axis label was discarded with a warning.
boxplot( log2(conserved_TFBSchange_human_table/conserved_TFBSchange_chimp_table),
         log2(linked_with_activation_TFBSchange_human_table/linked_with_activation_TFBSchange_chimp_table),
         col="white",border=c("green4","turquoise4"),
         notch=TRUE, outline=FALSE, ylim=c(-1.5,1.5),
         ylab="Change in TFBS [log2(human/Chimp)]")

## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, :
## Duplicated argument ylab = "Change in TFBS [log2(human/Chimp)]" is disregarded

axis(1,at=c(1,2),lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)

t.test( log2(conserved_TFBSchange_human_table/conserved_TFBSchange_chimp_table),
        log2(linked_with_activation_TFBSchange_human_table/linked_with_activation_TFBSchange_chimp_table) )

## 
##  Welch Two Sample t-test
## 
## data:  log2(conserved_TFBSchange_human_table/conserved_TFBSchange_chimp_table) and log2(linked_with_activation_TFBSchange_human_table/linked_with_activation_TFBSchange_chimp_table)
## t = -8.9693, df = 1006.3, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2721183 -0.1744232
## sample estimates:
##  mean of x  mean of y 
## 0.06966084 0.29293161

Conserved and species specific peaks feature TFBS changes frequently

# Number of distinct start positions per peak.
#   tfbsobj: object supporting start() and carrying a `peak` column
#            (presumably a GRanges of TFBS changes - confirm against the caller).
# Returns a vector with one count per value of `tfbsobj$peak`, named by peak.
howManyChangesPerPeak = function(tfbsobj){
  starts_by_peak = split(start(tfbsobj), tfbsobj$peak)
  distinct_starts = lapply(starts_by_peak, function(positions){
    length(unique(positions))
  })
  unlist(distinct_starts)
}

boxplot( howManyChangesPerPeak( linked_with_activation_TFBSchange ),
         howManyChangesPerPeak( not_linked_with_activation_TFBSchange ),
         howManyChangesPerPeak( conserved_TFBSchange ),
         col="white",outline=FALSE,
         names=c('linked','not linked','conserved'),las=2,
         border=c('turquoise4','gray80','green4'),
         ylab="changes in TFBS per element",lwd=2 )
axis(2,lwd=2,las=2)
axis(1,at=c(1,2,3),lwd=2,c('linked','not linked','conserved'),las=2)
box(col="black",lwd=2)

t.test(  howManyChangesPerPeak( linked_with_activation_TFBSchange ),
         howManyChangesPerPeak( not_linked_with_activation_TFBSchange ) )

## 
##  Welch Two Sample t-test
## 
## data:  howManyChangesPerPeak(linked_with_activation_TFBSchange) and howManyChangesPerPeak(not_linked_with_activation_TFBSchange)
## t = 7.0175, df = 1931.9, p-value = 0.000000000003114
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.7881839 1.3996136
## sample estimates:
## mean of x mean of y 
##  6.746499  5.652600

t.test(  howManyChangesPerPeak( conserved_TFBSchange ),
         howManyChangesPerPeak( not_linked_with_activation_TFBSchange ) )

## 
##  Welch Two Sample t-test
## 
## data:  howManyChangesPerPeak(conserved_TFBSchange) and howManyChangesPerPeak(not_linked_with_activation_TFBSchange)
## t = -6.2941, df = 6123.5, p-value = 0.0000000003306
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6712331 -0.3524108
## sample estimates:
## mean of x mean of y 
##  5.140778  5.652600

A=table(cut(width(conserved_TFBSchange),c(0,1,1000)))/length(conserved_TFBSchange)
B=table(cut(width(linked_with_activation_TFBSchange),c(0,1,1000)))/length(linked_with_activation_TFBSchange)
C=table(cut(width(not_linked_with_activation_TFBSchange),c(0,1,1000)))/length(not_linked_with_activation_TFBSchange)
ABC=rbind(A,B,C)

barplot( 100*ABC, beside=TRUE,col=c("green4","turquoise4","gray80"),ylim=c(0,100),
         names=c("MM","Changes>1bp"),ylab="%" )
axis(2,lwd=2)
legend(x=5,y=90,c("Conserved","Linked","Not linked"),cex=1,
       pch=15,col=c("green4","turquoise4","gray80"),bty="n")

Any particular TFs? Preparations

## ------------------------------------------------------------
# matrix for each peak
TFmat_linked_with_activation = makeMatrixTFBS4peaks( tfmut=linked_with_activation_TFBSchange, 
                                                     theTFs=unique(TFsEnsemblG$Fixed), 
                                                     allPeaks=enhancers_linked_with_activation )
TFmat_not_linked_with_activation = makeMatrixTFBS4peaks( tfmut=not_linked_with_activation_TFBSchange, 
                                                         theTFs=unique(TFsEnsemblG$Fixed), 
                                                         allPeaks=enhancers_not_linked_with_activation )
TFmat_linked_with_activation_chimp = makeMatrixTFBS4peaks( tfmut=linked_with_activation_TFBSchange_chimp, 
                                                           theTFs=unique(TFsEnsemblG$Fixed), 
                                                           allPeaks=enhancers_linked_with_activation )


TFmat_conserved = makeMatrixTFBS4peaks( tfmut=conserved_TFBSchange, 
                                        theTFs=unique(TFsEnsemblG$Fixed), 
                                        allPeaks=conserved_enhancers )
TFmat_conserved_chimp = makeMatrixTFBS4peaks( tfmut=conserved_TFBSchange_chimp, 
                                              theTFs=unique(TFsEnsemblG$Fixed), 
                                              allPeaks=conserved_enhancers )

TFmat_lost_linked = makeMatrixTFBS4peaks( tfmut=lost_linked_TFBSchange, 
                                          theTFs=unique(TFsEnsemblG$Fixed), 
                                          allPeaks=genuine_lost_enhancers_that_do_something )
TFmat_lost_not_linked = makeMatrixTFBS4peaks( tfmut=lost_not_linked_TFBSchange, 
                                              theTFs=unique(TFsEnsemblG$Fixed), 
                                              allPeaks=genuine_lost_enhancers_that_do_nothing )

save(TFmat_linked_with_activation,TFmat_not_linked_with_activation,TFmat_linked_with_activation_chimp,
     TFmat_conserved, TFmat_conserved_chimp,TFmat_lost_linked,TFmat_lost_not_linked,
     file=paste0(objects_directory,"TFmatrices_linked_not_linked.RData"))

Assess the significance of the observed differences in frequency

load(paste0(objects_directory,"TFmatrices_linked_not_linked.RData"))

# Per-TF presence/absence counts for the activation-linked elements: for each
# column of TFmat_linked_with_activation, Motif = number of rows with a value
# > 0 and noMotif = number of rows equal to 0. The per-column one-row data
# frames are stacked with rbind; the loop below looks rows up by column name
# through rownames(RES).
RES = do.call("rbind",
              apply(TFmat_linked_with_activation,2,function(x){data.frame( Motif=sum(x>0),
                                                                 noMotif=sum(x==0) ) } ) )

# Same counts for the elements not linked with activation.
SER = do.call("rbind",
              apply(TFmat_not_linked_with_activation,2,function(x){data.frame( Motif=sum(x>0),
                                                                               noMotif=sum(x==0) ) } ) )

# Accumulator for the per-TF test results (one row per TF column).
TFs_FT = data.frame()

# For every TF column: build the 2x2 table (linked / notLinked x Motif /
# noMotif), run Fisher's exact test and keep a one-row summary with the
# p-value, the odds-ratio estimate, the Motif counts and the Motif fractions.
for( i in colnames(TFmat_linked_with_activation) ){
  m=rbind(linked=RES[rownames(RES)==i,],
          notLinked=SER[rownames(SER)==i,])
  tp = fisher.test(m)
  tp = data.frame(p_val=tp$p.value,
                  odds=tp$estimate,
                  number_in_linked = RES[rownames(RES)==i,1],
                  number_in_not_linked = SER[rownames(SER)==i,1],
                  fraction_in_linked = RES[rownames(RES)==i,1]/rowSums(RES[rownames(RES)==i,]),
                  fraction_in_not_linked = SER[rownames(SER)==i,1]/rowSums(SER[rownames(SER)==i,]),
                  tf = i)
  # The new row is placed IN FRONT of the accumulated rows, so TFs_FT ends up
  # in reverse column order.
  TFs_FT=rbind(tp,TFs_FT) }
# p.adjust() is called without a `method`, so its default correction applies
# (Holm, per the stats::p.adjust documentation).
TFs_FT$p_adjust = p.adjust(TFs_FT$p_val)
# Bin -log10(adjusted p) with the breaks -1, 0, 1, then 252 evenly spaced
# values from 2 to 10, then 45; cut() yields NA for values outside the breaks.
TFs_FT$p_adjust_bin = cut(-log10(TFs_FT$p_adjust), c(-1,0,1, seq(2,10,length.out=252),45) )

par(pty="s")
plot( x=TFs_FT$fraction_in_linked, 
      y=TFs_FT$fraction_in_not_linked,
      pch=19, cex=0.5, 
      xlab="Linked with activation",
      ylab="Not linked with activation",
      xlim=c(0,0.3), ylim=c(0,0.3),
      col=ifelse(TFs_FT$p_adjust<0.01,"blue3","wheat2"))
abline(a=0,b=1,col='black')
axis(1,lwd=2)
axis(2,lwd=2)
box(col='black',lwd=2)
text(x=TFs_FT$fraction_in_linked[TFs_FT$p_adjust<0.01 & TFs_FT$fraction_in_linked>0.1]+0.005,
     y=TFs_FT$fraction_in_not_linked[TFs_FT$p_adjust<0.01 & TFs_FT$fraction_in_linked>0.1]+0.005,
     TFs_FT$tf[TFs_FT$p_adjust<0.01 & TFs_FT$fraction_in_linked>0.1],
     cex=1)

Odds of seeing that many stripe TFs

TFs_FT_filt = TFs_FT[TFs_FT$p_adjust<0.01 ,]
sum( TFs_FT_filt$tf %in% human_stripe_factors$V1 )/nrow(TFs_FT_filt)

## [1] 0.8651685

# 2x2 contingency table for Fisher's exact test:
#   rows    - TFs with adjusted p < 0.01 ("affected") vs. all remaining TFs
#   columns - TF listed in human_stripe_factors$V1 vs. not listed
# The two rows must be disjoint: the second row is restricted to
# p_adjust >= 0.01. Previously it counted ALL TFs, so every affected TF was
# counted in both rows.
# NOTE(review): the numbers printed after this chunk were produced with the
# overlapping table and need to be regenerated.
m = rbind( affected = c(stripe=sum( TFs_FT[TFs_FT$p_adjust<0.01,]$tf %in% human_stripe_factors$V1 ),
                        non_stripe = sum( ! TFs_FT[ TFs_FT$p_adjust<0.01,]$tf %in% human_stripe_factors$V1 )),
           non_affected = c(stripe=sum( TFs_FT[TFs_FT$p_adjust>=0.01,]$tf %in% human_stripe_factors$V1 ),
                            non_stripe = sum( ! TFs_FT[ TFs_FT$p_adjust>=0.01,]$tf %in% human_stripe_factors$V1 )) )
m

##              stripe non_stripe
## affected         77         12
## non_affected    199        476

fisher.test(m)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  m
## p-value < 0.00000000000000022
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   8.046439 31.566055
## sample estimates:
## odds ratio 
##   15.28785

par(pty="m",mar=c(5,3,3,1))
barplot( log2(fisher.test(m)$estimate), col="blue3", ylim=c(0,4))
axis(2,lwd=2)

keyTFs = TFs_FT_filt$tf
keyTFs = keyTFs[keyTFs %in% human_stripe_factors$V1]

Lost enhancers - is inactivation also linked with loss of stripe factors?

l=(1+colSums(TFmat_lost_linked>0))/nrow(TFmat_lost_linked)
L=(1+colSums(TFmat_lost_not_linked>0))/nrow(TFmat_lost_not_linked)

LOS = do.call("rbind",
              apply(TFmat_lost_linked,2,function(x){data.frame( Motif=1+sum(x>0),
                                                                noMotif=1+sum(x==0) ) } ) )

SOL = do.call("rbind",
              apply(TFmat_lost_not_linked,2,function(x){data.frame( Motif=1+sum(x>0),
                                                                    noMotif=1+sum(x==0) ) } ) )



# Per-TF Fisher's exact test for the lost enhancers: LOS (linked) against SOL
# (not linked), both holding Motif / noMotif counts that were built with a +1
# pseudocount. One summary row per TF: p-value, odds-ratio estimate, Motif
# counts and Motif fractions.
lost_TFs_FT_chimp = data.frame()

# Iterate over the columns of the matrix LOS was derived from
# (TFmat_lost_linked) rather than over an unrelated matrix, so the loop can
# never ask for a TF that LOS does not contain.
for( i in colnames(TFmat_lost_linked) ){
  m=rbind(linked=LOS[rownames(LOS)==i,],
          notLinked=SOL[rownames(SOL)==i,])
  tp = fisher.test(m)
  tp = data.frame(p_val=tp$p.value,
                  odds=tp$estimate,
                  number_in_linked = LOS[rownames(LOS)==i,1],
                  number_in_not_linked = SOL[rownames(SOL)==i,1],
                  fraction_in_linked = LOS[rownames(LOS)==i,1]/rowSums(LOS[rownames(LOS)==i,]),
                  fraction_in_not_linked = SOL[rownames(SOL)==i,1]/rowSums(SOL[rownames(SOL)==i,]),
                  tf = i)
  # New rows are prepended, so the result is in reverse column order.
  lost_TFs_FT_chimp=rbind(tp,lost_TFs_FT_chimp) }

par(mfrow=c(1,1),mar=c(10,4,4,4), pty="m")
boxplot( lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% keyTFs],
         lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1],
         lost_TFs_FT_chimp$odds[! lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1],
         col="white",border=c("blue3","steelblue","coral3"),
         ylim=c(0,3),ylab="Odds",
         outline=FALSE,axes=FALSE)
abline(h=1)
axis(1,lwd=2,at=c(1,2,3),c("77 stripe TFs","All stripe TFs","non-stripe TFs"),las=2)
axis(2,lwd=2)

t.test(lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% keyTFs],
       lost_TFs_FT_chimp$odds[! lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1])

## 
##  Welch Two Sample t-test
## 
## data:  lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% keyTFs] and lost_TFs_FT_chimp$odds[!lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1]
## t = 4.2022, df = 332.24, p-value = 0.00003402
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1255168 0.3464608
## sample estimates:
## mean of x mean of y 
##  1.374035  1.138047

t.test(lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1],
       lost_TFs_FT_chimp$odds[! lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1])

## 
##  Welch Two Sample t-test
## 
## data:  lost_TFs_FT_chimp$odds[lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1] and lost_TFs_FT_chimp$odds[!lost_TFs_FT_chimp$tf %in% human_stripe_factors$V1]
## t = 2.4189, df = 346.62, p-value = 0.01608
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03773178 0.36604516
## sample estimates:
## mean of x mean of y 
##  1.339935  1.138047

How many human-gained enhancers have evolutionary changes in these factors?

# Two stacked bar plots: fraction of linked / not-linked elements by the row
# sum over a set of TF columns, binned as 0, 1, 2, 3-4 and >4.
# (A preceding par(mfrow=c(1,1)) was redundant: it was overwritten by this call
# before anything was drawn.)
par(mfrow=c(2,1),mar=c(4,4,1,1),pty="m")

# Upper panel: the key stripe TFs. drop=FALSE keeps the selection
# two-dimensional even when only one TF is selected, so rowSums() still works.
m = rbind( linked=table(cut(rowSums(TFmat_linked_with_activation[,keyTFs,drop=FALSE]), 
                            c(-Inf,0,1,2,4,Inf) ) )/nrow(TFmat_linked_with_activation),
           not_linked=table(cut(rowSums(TFmat_not_linked_with_activation[,keyTFs,drop=FALSE]), 
                                c(-Inf,0,1,2,4,Inf) ) )/nrow(TFmat_not_linked_with_activation))

# The fourth bin is (2,4], hence the label "3-4" (it used to read "4").
barplot(m,beside=TRUE,col=c("turquoise4","gray70"),ylim=c(0,0.5),
        names.arg=c("0","1","2","3-4",">4"),ylab="Fraction of sequences with TFs")
axis(2,lwd=2)

# Lower panel: astrocyte-related TFs. "ATF3" was previously misspelled "AFT3",
# which the %in% filter below silently removed from the set.
astroTFs = c("SOX9","SOX2","NFIA","NFIB","ATF3","RUNX2","NR1F2","DBX2","LHX2","STAT3")
# Keep only the TFs that actually have a column in the matrix.
astroTFs = astroTFs[astroTFs %in% colnames(TFmat_linked_with_activation)]
M = rbind( linked=table(cut(rowSums(TFmat_linked_with_activation[,astroTFs,drop=FALSE]), 
                            c(-Inf,0,1,2,4,Inf) ) )/nrow(TFmat_linked_with_activation),
           not_linked=table(cut(rowSums(TFmat_not_linked_with_activation[,astroTFs,drop=FALSE]), 
                                c(-Inf,0,1,2,4,Inf) ) )/nrow(TFmat_not_linked_with_activation))
barplot(M,beside=TRUE,col=c("turquoise4","gray70"),ylim=c(0,1),
        names.arg=c("0","1","2","3-4",">4"),ylab="Fraction of sequences with TFs")
axis(2,lwd=2)

table(cut(rowSums(TFmat_linked_with_activation[,keyTFs]), 
          c(-Inf,0,1,2,4,Inf) ) )

## 
## (-Inf,0]    (0,1]    (1,2]    (2,4] (4, Inf] 
##      325      185      102      147      684

# Number of linked elements whose row sum over the key TF columns is not zero.
# The total is taken from the matrix itself instead of the hard-coded 1443, so
# the figure stays correct if the element set changes.
nrow(TFmat_linked_with_activation) - sum(rowSums(TFmat_linked_with_activation[,keyTFs])==0)

## [1] 1118

How many changes affect TFs that are not stripe factors?

not_stripeTFs_changes = TFmat_linked_with_activation[,!colnames(TFmat_linked_with_activation) %in% human_stripe_factors$V1]
table(rowSums(not_stripeTFs_changes>0))

## 
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 25 27 51 64 67 68 67 66 77 59 82 73 71 65 47 61 60 42 37 35 33 25 29 23 24 19 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 40 41 42 43 44 45 46 47 48 50 53 54 56 
## 20 19 16  9 11 15 10  7  3  8  2  3  2  2  1  2  1  2  3  1  1  1  1  1  1  1 
## 66 68 70 
##  1  1  1

Export these enhancers

enhancers__stripe_factors = TFmat_linked_with_activation[,keyTFs]
enhancers_with_stripe_factors = TFmat_linked_with_activation[rowSums(enhancers__stripe_factors)>0,]
enhancers_wo_stripe_factors = TFmat_linked_with_activation[rowSums(enhancers__stripe_factors)==0,]

save(enhancers_with_stripe_factors,
     enhancers_wo_stripe_factors,
     file=paste0(objects_directory,"enhancers_stripeTFs_no_stripeTFs.RData"))

Plot showing whether the stripe factors lose binding more frequently than they gain it

# Count, for every peak and every TF, how many entries of `tfmut` carry that
# peak / TF combination.
#   tfmut    : object with `peak` and `TF` columns (one entry per TFBS change)
#   theTFs   : TF names; duplicates are collapsed, one output column per name
#   allPeaks : object whose length() and names() define the output rows
# Returns a data frame (rows = names(allPeaks), columns = unique(theTFs)) of
# counts; combinations that do not occur in `tfmut` stay 0.
numberTFperPeak = function(tfmut,theTFs,allPeaks ){
    tf_names = unique(theTFs)

    # Zero-filled result table, one row per peak and one column per TF.
    counts = as.data.frame( matrix(0L,
                                   nrow=length(allPeaks),
                                   ncol=length(tf_names)) )
    rownames(counts) = names(allPeaks)
    colnames(counts) = tf_names

    # Peak labels of all entries, grouped by TF.
    peaks_by_tf = split( tfmut$peak, tfmut$TF )

    for( tf in tf_names ) {
        tf_peaks = peaks_by_tf[[tf]]
        tf_col = which( colnames(counts)==tf )
        hit_rows = which( rownames(counts) %in% tf_peaks )
        entries_per_peak = table( tf_peaks )

        # (row, column, value) triplets for the cells of this TF to fill in.
        cells = cbind(row=hit_rows,
                      col=rep(tf_col,length(hit_rows)),
                      number = entries_per_peak[match(rownames(counts)[hit_rows],names(entries_per_peak))])

        # Nothing to write when none of the peaks carries this TF.
        if( nrow(cells)==0 ){ next }

        counts[ cbind( cells[,1], cells[,2]) ] = cells[,3]
    }

    counts
}


## -----------------------
# Species-specific entries = entries of one object that overlap NO entry of the
# other. Selecting with countOverlaps(...)==0 is safe when nothing overlaps;
# the former x[-queryHits(findOverlaps(x,y))] indexes with -integer(0) in that
# case and returns an EMPTY object instead of all of x.
linked_with_activation_TFBSchange_Hs_spe = linked_with_activation_TFBSchange[countOverlaps(linked_with_activation_TFBSchange,linked_with_activation_TFBSchange_chimp)==0]

linked_with_activation_TFBSchange_Pt_spe = linked_with_activation_TFBSchange_chimp[countOverlaps(linked_with_activation_TFBSchange_chimp,linked_with_activation_TFBSchange)==0]



TFmat_linked_with_activation_Hs = numberTFperPeak( tfmut=linked_with_activation_TFBSchange_Hs_spe, 
                                                   theTFs=unique(TFsEnsemblG$Fixed), 
                                                   allPeaks=enhancers_linked_with_activation )
TFmat_linked_with_activation_Pt = numberTFperPeak( tfmut=linked_with_activation_TFBSchange_Pt_spe, 
                                                   theTFs=unique(TFsEnsemblG$Fixed), 
                                                   allPeaks=enhancers_linked_with_activation )


# Entries of the conserved-element objects that overlap no entry of the other
# species' object. countOverlaps(...)==0 replaces x[-queryHits(findOverlaps())],
# which returns an empty object (index -integer(0)) when there are no overlaps.
conserved_TFBSchange_Hs_spe = conserved_TFBSchange[countOverlaps(conserved_TFBSchange,conserved_TFBSchange_chimp)==0]
conserved_TFBSchange_Pt_spe = conserved_TFBSchange_chimp[countOverlaps(conserved_TFBSchange_chimp,conserved_TFBSchange)==0]

TFmat_conserved_Hs = numberTFperPeak( tfmut=conserved_TFBSchange_Hs_spe, 
                                      theTFs=unique(TFsEnsemblG$Fixed), 
                                      allPeaks=conserved_enhancers )
TFmat_conserved_Pt = numberTFperPeak( tfmut=conserved_TFBSchange_Pt_spe, 
                                      theTFs=unique(TFsEnsemblG$Fixed), 
                                      allPeaks=conserved_enhancers )

all(rownames(TFmat_conserved_Hs)==rownames(TFmat_conserved_Pt))

## [1] TRUE

net_TFBS_gain_conserved = (TFmat_conserved_Hs>TFmat_conserved_Pt)
net_TFBS_loss_conserved = (TFmat_conserved_Hs<TFmat_conserved_Pt)
net_TFBS_gain_linked = (TFmat_linked_with_activation_Hs>TFmat_linked_with_activation_Pt)
net_TFBS_loss_linked = (TFmat_linked_with_activation_Hs<TFmat_linked_with_activation_Pt)



## -----------------------
par(mfrow=c(1,2),pty="s",mar=c(4,4,3,3))
plot(colSums(net_TFBS_gain_linked),
     colSums(net_TFBS_loss_linked),pch=19, cex=0.5,
     main="Linked",ylab="TFBS loss",xlab="Gain in TFBS",
     ylim=c(0,60),xlim=c(0,60),
     col=ifelse(names(colSums(net_TFBS_gain_linked)) %in% keyTFs,"blue","gray"))
abline(a=0,b=1)
axis(1,lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)
plot(colSums(net_TFBS_gain_conserved),colSums(net_TFBS_loss_conserved), pch=19, cex=0.5,
     main="Conserved",ylab="TFBS loss",xlab="Gain in TFBS" ,ylim=c(0,60),xlim=c(0,60),
     col=ifelse(names(colSums(net_TFBS_gain_conserved)) %in% keyTFs,"blue","gray"))
abline(a=0,b=1)
axis(1,lwd=2)
axis(2,lwd=2)
box(col="black",lwd=2)

Enhancers with changes in stripe factors and gene upregulation in the vicinity

up_set = HS_UP_Genes$ensembl_id
promoters_HITS_UP = promoters_filtered_gr[ which( promoters_filtered_gr$gene_id %in% up_set ) ]
promoters_HITS_UP_500 = resize(promoters_HITS_UP,1000000,fix="center")

promoters_HITS_UP_500_counting = data.frame( with_stripeTF = countOverlaps(promoters_HITS_UP_500,enhancers_linked_with_activation[which(names(enhancers_linked_with_activation) %in% rownames(enhancers_with_stripe_factors))]),
                                             wo_stripeTF = countOverlaps(promoters_HITS_UP_500,enhancers_linked_with_activation[which(names(enhancers_linked_with_activation) %in% rownames(enhancers_wo_stripe_factors))]),
                                             any = countOverlaps(promoters_HITS_UP_500,genuine_gained_enhancers_gr) )

## stripe no stripe
prom_with_with = rownames(promoters_HITS_UP_500_counting[promoters_HITS_UP_500_counting$with_stripeTF>0& promoters_HITS_UP_500_counting$wo_stripeTF>0,])
prom_with_wo = rownames(promoters_HITS_UP_500_counting[promoters_HITS_UP_500_counting$with_stripeTF>0 & promoters_HITS_UP_500_counting$wo_stripeTF==0,])

prom_wo_with = rownames(promoters_HITS_UP_500_counting[promoters_HITS_UP_500_counting$with_stripeTF==0 & promoters_HITS_UP_500_counting$wo_stripeTF>0,])
prom_wo_wo = rownames(promoters_HITS_UP_500_counting[promoters_HITS_UP_500_counting$with_stripeTF==0 & promoters_HITS_UP_500_counting$wo_stripeTF==0,])


promoters_HITS_UP_500_counting_enh = promoters_HITS_UP_500_counting[rowSums(promoters_HITS_UP_500_counting[,1:2])>0,]

m = promoters_HITS_UP_500_counting_enh>0
m = m[order(m[,1],m[,2]),1:2]

par(mar=c(1,1,1,1))
image(t(m),col=c("white","coral2"),axes=FALSE)
box(col="black",lwd=2)
abline(v=0.5,lwd=2)

sum(m[,1]==0 & m[,2]>0)

## [1] 29

sum(m[,1]>0 & m[,2]>0)

## [1] 258

sum(m[,1]>0 & m[,2]==0)

## [1] 299

Footprint

# Collect the scores of all footprint BED files found below `TFdir`.
#   TFdir   : directory whose entries are named "<prefix>_FootPrints"; each one
#             is expected to contain a tab-separated, header-less "<prefix>.bed"
#   SPECIES : label stored in the `species` column of every returned row
# Returns a data frame with the columns `score` (fifth BED column), `TF` (the
# part of <prefix> before its first "_") and `species`.
readFootprintAnalysis_bed = function(TFdir,SPECIES){
  # Entry names with the "_FootPrints" part split off.
  prefixes = unlist(strsplit(list.files(TFdir),"_FootPrints"))

  per_prefix = vector("list", length(prefixes))
  for( k in seq_along(prefixes) ){
    prefix = prefixes[k]
    bed = read.delim(paste0(TFdir,"/",prefix,"_FootPrints/",prefix,".bed"),
                     sep="\t",header=FALSE )
    tf_label = unlist(strsplit(prefix,"_"))[1]
    per_prefix[[k]] = data.frame(score=bed$V5, TF=tf_label, species=SPECIES )
  }

  do.call("rbind",per_prefix)
}

footprintHg = readFootprintAnalysis_bed(paste0(outputs_directory,"footprint_analysis/Stripe_TF_HG38_Footprints_10bp/"), "Human")
footprintPt = readFootprintAnalysis_bed(paste0(outputs_directory,"footprint_analysis/Stripe_TF_PT06_Footprints_10bp/"), "Chimpanzee")

footprintHg_keyTFs = footprintHg[footprintHg$TF %in% TFsEnsemblG[ TFsEnsemblG$Fixed %in% keyTFs,1], ]
footprintPt_keyTFs = footprintPt[footprintPt$TF %in% TFsEnsemblG[ TFsEnsemblG$Fixed %in% keyTFs,1], ]

footprint_scores = rbind(footprintHg,footprintPt)
footprint_scores$species = factor(footprint_scores$species,levels=c("Human","Chimpanzee"))
p1=ggboxplot(footprint_scores, x="TF", y="score",color = "species",
          palette=c("black","red"),outlier.shape = NA,rotate = TRUE) 
ggpar(p1,ylim = c(0,500)) + rotate_x_text(90)

## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

Supplementary analysis - TAD based annotation

enhancers_linked_with_activation_TADs = import.bed(paste0(outputs_directory,"enhancers_linked_with_activation_TADs.bed"))
names(enhancers_linked_with_activation_TADs) = enhancers_linked_with_activation_TADs$name

enhancers_not_linked_with_activation_TADs = import.bed(paste0(outputs_directory,"enhancers_not_linked_with_activation_TADs.bed"))
names(enhancers_not_linked_with_activation_TADs) = enhancers_not_linked_with_activation_TADs$name


linked_with_activation_TADs_TFBSchange = readBedtools_res( filePath=paste0(outputs_directory,"/TFBS_analysis/enhancers_linked_with_activation_TADs/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)

not_linked_with_activation_TADs_TFBSchange = readBedtools_res( filePath=paste0(outputs_directory,"/TFBS_analysis/enhancers_not_linked_with_activation_TADs/"),
                         chroms = paste0("chr",c(1:22,'X','Y')),4,7)


linked_with_activation_TADs_TFBSchange = processTFBSresult(linked_with_activation_TADs_TFBSchange,
                                                      tfanno=TFsEnsemblG,
                                                      nameColumn="names")

not_linked_with_activation_TADs_TFBSchange = processTFBSresult(not_linked_with_activation_TADs_TFBSchange,
                                                      tfanno=TFsEnsemblG,
                                                      nameColumn="names")

save(linked_with_activation_TADs_TFBSchange,
     not_linked_with_activation_TADs_TFBSchange,
     file=paste0(objects_directory,"linked_or_not_with_activation_TADs_TFBSchange_chimp.RData"))



TFmat_linked_with_activation_TADs = makeMatrixTFBS4peaks( tfmut=linked_with_activation_TADs_TFBSchange, 
                                                     theTFs=unique(TFsEnsemblG$Fixed), 
                                                     allPeaks=enhancers_linked_with_activation_TADs )
TFmat_not_linked_with_activation_TADs = makeMatrixTFBS4peaks( tfmut=not_linked_with_activation_TADs_TFBSchange, 
                                                         theTFs=unique(TFsEnsemblG$Fixed), 
                                                         allPeaks=enhancers_not_linked_with_activation_TADs )


save(TFmat_linked_with_activation_TADs,TFmat_not_linked_with_activation_TADs,
     file=paste0(objects_directory,"TFmatrices_linked_not_linked_TADs.RData"))

Compare linked and non linked - TAD annotation

load(paste0(objects_directory,"TFmatrices_linked_not_linked_TADs.RData"))

TAL = do.call("rbind",
              apply(TFmat_linked_with_activation_TADs,2,function(x){data.frame( Motif=sum(x>0),
                                                                 noMotif=sum(x==0) ) } ) )

LAT = do.call("rbind",
              apply(TFmat_not_linked_with_activation_TADs,2,function(x){data.frame( Motif=sum(x>0),
                                                                               noMotif=sum(x==0) ) } ) )

TFs_TAD_FT = data.frame()
for( i in colnames(TFmat_linked_with_activation_TADs) ){
  m=rbind(linked=TAL[rownames(TAL)==i,],
          notLinked=LAT[rownames(LAT)==i,])
  tp = fisher.test(m)
  tp = data.frame(p_val=tp$p.value,
                  odds=tp$estimate,
                  number_in_linked = TAL[rownames(TAL)==i,1],
                  number_in_not_linked = LAT[rownames(LAT)==i,1],
                  fraction_in_linked = TAL[rownames(TAL)==i,1]/rowSums(TAL[rownames(TAL)==i,]),
                  fraction_in_not_linked = LAT[rownames(LAT)==i,1]/rowSums(LAT[rownames(LAT)==i,]),
                  tf = i)
  TFs_TAD_FT=rbind(tp,TFs_TAD_FT) }

TFs_TAD_FT$p_adjust = p.adjust(TFs_TAD_FT$p_val)

par(pty="s",mfrow=c(1,1))
plot( x=TFs_TAD_FT$fraction_in_linked, 
      y=TFs_TAD_FT$fraction_in_not_linked,
      pch=19, cex=0.5, 
      xlab="Linked with activation",
      ylab="Not linked with activation",
      xlim=c(0,0.3), ylim=c(0,0.3),
      col=ifelse(TFs_TAD_FT$p_adjust<0.05 ,"blue3","wheat2"))
abline(a=0,b=1,col='black')
axis(1,lwd=2)
axis(2,lwd=2)
box(col='black',lwd=2)
text(x=TFs_TAD_FT$fraction_in_linked[TFs_TAD_FT$p_adjust<0.05 ]+0.005,
     y=TFs_TAD_FT$fraction_in_not_linked[TFs_TAD_FT$p_adjust<0.05 ]+0.005,
     TFs_TAD_FT$tf[TFs_TAD_FT$p_adjust<0.05 ],
     cex=1)

TFs_TAD_FT[TFs_TAD_FT$p_val<0.01 & TFs_TAD_FT$fraction_in_linked>0.1,]

##                                p_val     odds number_in_linked
## odds ratio673 0.00150988966405420457 1.608891               64
## odds ratio669 0.00061743780893750864 1.683180               63
## odds ratio664 0.00000001553881099418 1.845451              143
## odds ratio645 0.00002249720073059971 1.586955              135
## odds ratio639 0.00547001523869697359 1.401620              100
## odds ratio636 0.00001844604778766271 1.738707               91
## odds ratio635 0.00000177955637888710 1.693353              133
## odds ratio626 0.00051003649052152309 1.607932               79
## odds ratio623 0.00004148693474305783 1.610882              117
## odds ratio616 0.00020763714135284235 1.702754               72
## odds ratio603 0.00000000100972795250 2.345061               86
## odds ratio587 0.00044599744467552140 1.518042              109
## odds ratio584 0.00005385690669902456 1.799449               72
## odds ratio582 0.00000037117379643772 1.768726              130
## odds ratio579 0.00003393160944478522 1.537540              153
## odds ratio566 0.00000000003216698365 2.880740               68
## odds ratio538 0.00000410606699485287 1.786143               98
## odds ratio537 0.00000000049265940835 2.234227              101
## odds ratio514 0.00000028707333831881 1.828129              117
## odds ratio513 0.00000196034299164705 1.876367               89
## odds ratio512 0.00000000000004617375 2.398890              131
## odds ratio511 0.00000000010021448003 2.256986              106
## odds ratio478 0.00003385372076594007 1.704218               93
## odds ratio473 0.00003225347827537949 1.725282               90
## odds ratio458 0.00000033506023894474 1.969773               91
## odds ratio457 0.00000010818835810204 2.112920               80
## odds ratio408 0.00000000256319916115 1.926115              141
## odds ratio308 0.00000138201327477164 1.652499              155
## odds ratio287 0.00000002345307931912 2.008273              105
## odds ratio286 0.00000025777319341452 2.094659               77
## odds ratio284 0.00000000811899658359 2.205526               87
## odds ratio283 0.00000000007428138142 2.293758              103
## odds ratio282 0.00001458947286974133 1.698132              104
## odds ratio96  0.00000578088929590625 1.670073              124
## odds ratio95  0.00000000624475659928 2.205966               88
## odds ratio57  0.00001776370364941432 1.818494               78
##               number_in_not_linked fraction_in_linked fraction_in_not_linked
## odds ratio673                  316             0.1024             0.06620574
## odds ratio669                  298             0.1008             0.06243453
## odds ratio664                  661             0.2288             0.13848732
## odds ratio645                  706             0.2160             0.14791536
## odds ratio639                  571             0.1600             0.11963126
## odds ratio636                  426             0.1456             0.08925204
## odds ratio635                  657             0.2128             0.13764928
## odds ratio626                  394             0.1264             0.08254766
## odds ratio623                  597             0.1872             0.12507857
## odds ratio616                  339             0.1152             0.07102451
## odds ratio603                  304             0.1376             0.06369160
## odds ratio587                  583             0.1744             0.12214540
## odds ratio584                  322             0.1152             0.06746281
## odds ratio582                  617             0.2080             0.12926880
## odds ratio579                  831             0.2448             0.17410434
## odds ratio566                  194             0.1088             0.04064530
## odds ratio538                  450             0.1568             0.09428033
## odds ratio537                  379             0.1616             0.07940499
## odds ratio514                  534             0.1872             0.11187932
## odds ratio513                  388             0.1424             0.08129059
## odds ratio512                  475             0.2096             0.09951812
## odds ratio511                  396             0.1696             0.08296669
## odds ratio478                  444             0.1488             0.09302326
## odds ratio473                  424             0.1440             0.08883302
## odds ratio458                  380             0.1456             0.07961450
## odds ratio457                  310             0.1280             0.06494867
## odds ratio408                  627             0.2256             0.13136392
## odds ratio308                  794             0.2480             0.16635240
## odds ratio287                  436             0.1680             0.09134716
## odds ratio286                  300             0.1232             0.06285355
## odds ratio284                  326             0.1392             0.06830086
## odds ratio283                  378             0.1648             0.07919547
## odds ratio282                  502             0.1664             0.10517494
## odds ratio96                   616             0.1984             0.12905929
## odds ratio95                   330             0.1408             0.06913891
## odds ratio57                   347             0.1248             0.07270061
##                    tf            p_adjust
## odds ratio673 ZSCAN22 0.92254258473711903
## odds ratio669   ZNF76 0.38281144154125535
## odds ratio664  ZNF770 0.00001034884812212
## odds ratio645  ZNF467 0.01444320286904501
## odds ratio639  ZNF394 1.00000000000000000
## odds ratio636  ZNF350 0.01189770082304245
## odds ratio635  ZNF341 0.00116738898454994
## odds ratio626  ZNF281 0.31724269710438735
## odds ratio623  ZNF263 0.02638569049658478
## odds ratio616  ZNF148 0.13060376191093784
## odds ratio603     ZFX 0.00000067651772818
## odds ratio587  ZBTB17 0.27830240547752533
## odds ratio584  ZNF324 0.03419913575388060
## odds ratio582     WT1 0.00024460353185246
## odds ratio579   VEZF1 0.02168229843521776
## odds ratio566   THAP1 0.00000002168054698
## odds ratio538   TBX15 0.00267715568064407
## odds ratio537    TBX1 0.00000033057446300
## odds ratio514     SP4 0.00018975547662873
## odds ratio513     SP3 0.00128402465952881
## odds ratio512     SP2 0.00000000003116728
## odds ratio511     SP1 0.00000006734413058
## odds ratio478    RXRA 0.02166638129020165
## odds ratio473   RREB1 0.02067447957451825
## odds ratio458    RARA 0.00022113975770353
## odds ratio457    PURA 0.00007183706977975
## odds ratio408   PATZ1 0.00000171478023881
## odds ratio308     MAZ 0.00090798272152497
## odds ratio287    KLF6 0.00001559629774721
## odds ratio286    KLF5 0.00017064585404041
## odds ratio284    KLF3 0.00000541537072126
## odds ratio283   KLF16 0.00000004999136969
## odds ratio282   KLF15 0.00943938894672264
## odds ratio96     EGR2 0.00376335893163497
## odds ratio95     EGR1 0.00000417149740832
## odds ratio57    NR2F1 0.01147535255752165

scRNA-seq data

# Upper limit, in bytes, for globals exported by the `future` framework:
# 64000 * 1024^2 bytes (64000 MiB). `ms` stays defined for any later use.
ms <- 64000 * 1024^2
options(future.globals.maxSize = ms)

# Directories holding the raw 10x feature-barcode matrices of the 16 human
# samples (project folder HSapiens_SingleCell_PRJNA899373). The position in
# this vector determines the variable name: element i is stored as `human<i>`.
human_sample_folders <- c(
  "HS_Prenatal_F_667_S13_SingleCell",
  "HS_Prenatal_F_698_S15_SingleCell",
  "HS_Prenatal_M_700_S14_SingleCell",
  "HS_Prenatal_F_702_9C_SingleCell",
  "HS_Prenatal_F_667_60C_SingleCell",
  "HS_Prenatal_F_669_3C_SingleCell",
  "HS_Prenatal_F_671_64C_SingleCell",
  "HS_Prenatal_M_673_62C_SingleCell",
  "HS_Prenatal_M_675_23C_SingleCell",
  "HS_Prenatal_M_677_63C_SingleCell",
  "HS_Prenatal_M_679_30C_SingleCell",
  "HS_Prenatal_M_681_34C_SingleCell",
  "HS_Prenatal_F_686_56C_SingleCell",
  "HS_Prenatal_M_690_26C_SingleCell",
  "HS_Prenatal_F_688_11C_SingleCell",
  "HS_Prenatal_M_692_24C_SingleCell"
)

# The paths are kept as individual variables (human1 ... human16) because the
# loading code below refers to them by name.
for (sample_index in seq_along(human_sample_folders)) {
  assign(
    paste0("human", sample_index),
    paste0(
      outputs_directory,
      "/scRNA_published_data/HSapiens_SingleCell_PRJNA899373/",
      human_sample_folders[sample_index],
      "/raw_feature_bc_matrix/"
    )
  )
}
rm(human_sample_folders, sample_index)


# Directories holding the raw 10x feature-barcode matrices of the 12 macaque
# samples.
#
# All paths are built from one shared prefix that starts with "/", like every
# other path in this section. Previously only macaque1/macaque2 had the
# leading "/"; the remaining ten were glued directly onto `outputs_directory`
# and therefore resolved only when that variable ended in a path separator.
# A doubled separator ("//") is harmless, a missing one is not.
macaque_data_dir <- paste0(outputs_directory, "/scRNA_published_data/RhMacaque/")

macaque1 <- paste0(macaque_data_dir, "scRNA_syn17093056_RMB683_DFC/multi/count/raw_feature_bc_matrix/")
macaque2 <- paste0(macaque_data_dir, "scRNA_syn17093056_RMB691_DFC/multi/count/raw_feature_bc_matrix/")
macaque_Ch_78_1 <- paste0(macaque_data_dir, "scRNA_SRR23687004_macaque/raw_feature_bc_matrix/")
macaque_Ch_110_DFC <- paste0(macaque_data_dir, "E110_SRR23687017_M_DFC_scRNA/raw_feature_bc_matrix/")
macaque_Ch_110_OFC <- paste0(macaque_data_dir, "E110_SRR23686999_M_OFC_scRNA/raw_feature_bc_matrix/")
macaque_Ch_93_DFC <- paste0(macaque_data_dir, "E93_SRR23687065_M_DFC_scRNA/raw_feature_bc_matrix/")
macaque_Ch_110_DFC_S2 <- paste0(macaque_data_dir, "E110_SRR23687057_M_DFC_scRNA/raw_feature_bc_matrix/")
macaque_Ch_110_VFC <- paste0(macaque_data_dir, "E110_SRR23687060_M_VFC_scRNA/raw_feature_bc_matrix/")
macaque_Ch_110_VFC_S2 <- paste0(macaque_data_dir, "E110_SRR23687012_M_VFC_scRNA/raw_feature_bc_matrix/")
macaque_Ch_77_Frontal <- paste0(macaque_data_dir, "E77_F_Frontal/raw_feature_bc_matrix/")
macaque_Ch_64_Frontal <- paste0(macaque_data_dir, "E64_F_Frontal/raw_feature_bc_matrix/")
macaque_Ch_62_Frontal <- paste0(macaque_data_dir, "E62_F_frontal_scRNA/raw_feature_bc_matrix/")

# Per-cell metadata table for the human data set.
human_metadata <- read.csv(
  paste0(outputs_directory, "/scRNA_published_data/GSE217511_CorticalPlate_Seuratmetadata.csv")
)

# Only the path of the sample annotation table is stored here; the file is not
# read in this section.
human_sample_anno <- paste0(outputs_directory, "/scRNA_published_data/MetaTable.txt")

# Keep the part of column X that precedes the first underscore.
# NOTE(review): the column is called UMI, but X presumably holds cell names, so
# this is likely the cell barcode rather than a UMI; confirm.
human_metadata$UMI <- unlist(lapply(strsplit(human_metadata$X, "_"), `[[`, 1))

Let’s consider the data from the foetal-like cells.

# Load the 10x count matrix of every human sample: the directory stored in
# `human<i>` is read into `human<i>_expression` (i = 1..16). The results stay
# in individually named variables because the following steps use those names.
for (sample_index in 1:16) {
  assign(
    paste0("human", sample_index, "_expression"),
    Read10X(get(paste0("human", sample_index)))
  )
}
rm(sample_index)


# Load the 10x count matrix of every macaque sample.
# Names of this vector are the variables to create, values are the variables
# holding the corresponding input directory. The two naming schemes are not
# parallel, so the pairing is spelled out explicitly.
macaque_path_variables <- c(
  macaque1_expression = "macaque1",
  macaque2_expression = "macaque2",
  macaque_CN_78_1_expression = "macaque_Ch_78_1",
  macaque_DFC_110_1_expression = "macaque_Ch_110_DFC",
  macaque_OFC_110_1_expression = "macaque_Ch_110_OFC",
  macaque_93_DFC_expression = "macaque_Ch_93_DFC",
  macaque_110_DFC_S2_expression = "macaque_Ch_110_DFC_S2",
  macaque_110_VFC_expression = "macaque_Ch_110_VFC",
  macaque_110_VFC_S2_expression = "macaque_Ch_110_VFC_S2",
  macaque_77_Frontal_expression = "macaque_Ch_77_Frontal",
  macaque_64_Frontal_expression = "macaque_Ch_64_Frontal",
  macaque_62_Frontal_expression = "macaque_Ch_62_Frontal"
)

for (object_name in names(macaque_path_variables)) {
  assign(object_name, Read10X(get(macaque_path_variables[[object_name]])))
}
rm(macaque_path_variables, object_name)




## ---------------------------
# Turn every count matrix into a Seurat object, replacing the matrix stored
# under the same variable name. Identical settings for all 28 samples:
# features seen in fewer than 3 cells and cells with fewer than 200 detected
# features are dropped at creation time.
sample_object_names <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression",
  "macaque2_expression",
  "macaque_CN_78_1_expression",
  "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression",
  "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression",
  "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression",
  "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression",
  "macaque_62_Frontal_expression"
)

for (object_name in sample_object_names) {
  assign(
    object_name,
    CreateSeuratObject(
      counts = get(object_name),
      project = "AstroEvo_old",
      min.cells = 3,
      min.features = 200
    )
  )
}
rm(sample_object_names, object_name)


## ----------------
# Add a `percent.mt` metadata column to every sample: the percentage of counts
# coming from features whose name starts with "MT-".
# NOTE(review): the human-style "^MT-" prefix is also applied to the macaque
# objects. If the macaque annotation names its mitochondrial genes differently,
# percent.mt will be 0 for those samples; confirm against the reference used.
sample_object_names <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression",
  "macaque2_expression",
  "macaque_CN_78_1_expression",
  "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression",
  "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression",
  "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression",
  "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression",
  "macaque_62_Frontal_expression"
)

for (object_name in sample_object_names) {
  seurat_object <- get(object_name)
  seurat_object[["percent.mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^MT-")
  assign(object_name, seurat_object)
}
rm(sample_object_names, object_name, seurat_object)


## --------------------------------------------
# Overwrite `orig.ident` in every object with a per-sample label, so that
# cells can be traced back to their sample of origin.
# Names of this vector are the Seurat object variables, values are the labels.
sample_labels <- c(
  setNames(paste0("human", 1:16), paste0("human", 1:16, "_expression")),
  macaque1_expression = "Macaque1",
  macaque2_expression = "Macaque2",
  macaque_CN_78_1_expression = "Macaque3_78_1",
  macaque_DFC_110_1_expression = "Macaque3_110_DFC",
  macaque_OFC_110_1_expression = "Macaque3_110_OFC",
  macaque_93_DFC_expression = "Macaque3_93_DFC",
  macaque_110_DFC_S2_expression = "Macaque3_110_DFC_S2",
  macaque_110_VFC_expression = "Macaque3_110_VFC",
  macaque_110_VFC_S2_expression = "Macaque3_110_VFC_S2",
  macaque_77_Frontal_expression = "Macaque3_77_Frontal",
  macaque_64_Frontal_expression = "Macaque3_64_Frontal",
  macaque_62_Frontal_expression = "Macaque3_62_Frontal"
)

for (object_name in names(sample_labels)) {
  seurat_object <- get(object_name)
  seurat_object$orig.ident <- sample_labels[[object_name]]
  assign(object_name, seurat_object)
}
rm(sample_labels, object_name, seurat_object)

## ------------------------------------------
# Score cell-cycle phase for every sample with Seurat's built-in S and G2/M
# marker lists (`cc.genes`); `set.ident = FALSE` leaves the identities untouched.
#
# Fix: the marker lists are now restricted to the genes present in *each*
# object. Previously they were restricted to the genes of human1 only
# (`all_genes`) and that same list was reused for all other human and macaque
# objects. Every object was filtered independently at creation (min.cells = 3),
# so its feature set can differ from human1's: markers missing from an object
# were still passed in, and markers present only in that object were dropped.
#
# NOTE(review): no normalisation step is visible between object creation and
# this scoring; confirm that scoring on un-normalised data is intended.

# Kept in case later code still refers to it; no longer used for the scoring.
all_genes <- rownames(human1_expression)

sample_object_names <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression",
  "macaque2_expression",
  "macaque_CN_78_1_expression",
  "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression",
  "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression",
  "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression",
  "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression",
  "macaque_62_Frontal_expression"
)

for (object_name in sample_object_names) {
  seurat_object <- get(object_name)
  # Genes actually available in this particular sample.
  object_genes <- rownames(seurat_object)
  seurat_object <- CellCycleScoring(
    seurat_object,
    g2m.features = cc.genes$g2m.genes[cc.genes$g2m.genes %in% object_genes],
    s.features = cc.genes$s.genes[cc.genes$s.genes %in% object_genes],
    set.ident = FALSE
  )
  assign(object_name, seurat_object)
}
rm(sample_object_names, object_name, seurat_object, object_genes)


# Recompute the `percent.mt` column (percentage of counts from features whose
# name starts with "MT-") for every sample.
# This repeats, with identical arguments, the percent.mt computation already
# done right after the Seurat objects were created earlier in this section.
sample_object_names <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression",
  "macaque2_expression",
  "macaque_CN_78_1_expression",
  "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression",
  "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression",
  "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression",
  "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression",
  "macaque_62_Frontal_expression"
)

for (object_name in sample_object_names) {
  seurat_object <- get(object_name)
  seurat_object[["percent.mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^MT-")
  assign(object_name, seurat_object)
}
rm(sample_object_names, object_name, seurat_object)


# Apply one common QC filter to every human and macaque sample: keep cells
# with more than 200 and fewer than 10000 detected features and with
# percent.mt below 1. Each object is overwritten by its filtered version.
qc_sample_object_names <- c(
  paste0("human", 1:16, "_expression"),
  "macaque1_expression",
  "macaque2_expression",
  "macaque_CN_78_1_expression",
  "macaque_DFC_110_1_expression",
  "macaque_OFC_110_1_expression",
  "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression",
  "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression",
  "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression",
  "macaque_62_Frontal_expression"
)
for (qc_sample_object_name in qc_sample_object_names) {
  assign(
    qc_sample_object_name,
    subset(
      get(qc_sample_object_name),
      subset = nFeature_RNA > 200 & nFeature_RNA < 10000 & percent.mt < 1
    )
  )
}
# Drop the loop helpers so the workspace holds the same objects as before.
rm(qc_sample_object_names, qc_sample_object_name)





# Merge all QC-filtered human and macaque samples into a single object.
# human1_expression is the base object; the remaining samples are appended in
# the order listed, and cell_id_prefixes supplies one cell-name prefix per
# sample (base object first), in the same order.
samples_to_append <- c(
  human2_expression,
  human3_expression,
  human4_expression,
  human5_expression,
  human6_expression,
  human7_expression,
  human8_expression,
  human9_expression,
  human10_expression,
  human11_expression,
  human12_expression,
  human13_expression,
  human14_expression,
  human15_expression,
  human16_expression,
  macaque1_expression,
  macaque2_expression,
  macaque_CN_78_1_expression,
  macaque_DFC_110_1_expression,
  macaque_OFC_110_1_expression,
  macaque_93_DFC_expression,
  macaque_110_DFC_S2_expression,
  macaque_110_VFC_expression,
  macaque_110_VFC_S2_expression,
  macaque_77_Frontal_expression,
  macaque_64_Frontal_expression,
  macaque_62_Frontal_expression
)
cell_id_prefixes <- c(
  "human1", "human2", "human3", "human4", "human5", "human6", "human7",
  "human8", "human9", "human10", "human11", "human12", "human13", "human14",
  "human15", "human16",
  "macaque1", "macaque2", "macaque_78_1", "Macaque3_110_DFC",
  "Macaque3_110_OFC", "macaque_93_DFC_expression",
  "macaque_110_DFC_S2_expression", "macaque_110_VFC_expression",
  "macaque_110_VFC_S2_expression", "macaque_77_Frontal_expression",
  "macaque_64_Frontal_expression", "macaque_62_Frontal_expression"
)
alldata <- merge(
  x = human1_expression,
  y = samples_to_append,
  add.cell.ids = cell_id_prefixes
)
# Drop the helpers so the workspace holds the same objects as before.
rm(samples_to_append, cell_id_prefixes)


# Persist the merged object for the per-sample clustering steps.
save(alldata, file = paste0(objects_directory, "scRNA_published_foetal_samples.RData"))

Human: Find astrocytes in individual samples

# Reload the merged object and split it back into one object per sample,
# keyed by the "orig.ident" metadata column.
load(paste0(objects_directory, "scRNA_published_foetal_samples.RData"))
split_seurat <- SplitObject(alldata, split.by = "orig.ident")

# Cluster every human sample. Samples are addressed by their position in
# split_seurat (elements 1..16). The quicker samples run first; samples
# 4, 8, 11, 12 and 14 take longer and are processed last.
human_processing_order <- c(1, 2, 3, 5, 6, 7, 9, 10, 13, 15, 16, 4, 8, 11, 12, 14)
for (human_sample_index in human_processing_order) {
  assign(
    paste0("human", human_sample_index),
    perform_clustering_to_find_astrocytes(split_seurat[[human_sample_index]])
  )
}

# Write each clustered sample to its own file: human<i>_scRNA.RData.
for (human_sample_index in 1:16) {
  human_object_name <- paste0("human", human_sample_index)
  save(
    list = human_object_name,
    file = paste0(objects_directory, human_object_name, "_scRNA.RData")
  )
}
# Drop the loop helpers so the workspace holds the same objects as before.
rm(human_processing_order, human_sample_index, human_object_name)

# Cluster every macaque sample and save each result to its own file
# (macaque<i>_scRNA.RData) right after it is computed.
#
# The hard-coded path below is only a fallback for sessions where
# objects_directory has not been configured. It must not overwrite an existing
# value: the human objects above were saved under the configured directory and
# are re-loaded from objects_directory later, so silently redirecting it here
# would make those loads fail.
if (!exists("objects_directory")) {
  objects_directory <- "~/Desktop/Ciuba_et_al_SM/data/objects/"
}

# Names of the split_seurat elements, in the order that defines macaque1..12.
macaque_split_names <- c(
  "Macaque1",
  "Macaque2",
  "Macaque3_78_1",
  "Macaque3_110_DFC",
  "Macaque3_110_OFC",
  "Macaque3_93_DFC",
  "Macaque3_110_DFC_S2",
  "Macaque3_110_VFC",
  "Macaque3_110_VFC_S2",
  "Macaque3_77_Frontal",
  "Macaque3_64_Frontal",
  "Macaque3_62_Frontal"
)
for (macaque_sample_index in seq_along(macaque_split_names)) {
  macaque_object_name <- paste0("macaque", macaque_sample_index)
  assign(
    macaque_object_name,
    perform_clustering_to_find_astrocytes(
      split_seurat[[macaque_split_names[[macaque_sample_index]]]]
    )
  )
  save(
    list = macaque_object_name,
    file = paste0(objects_directory, macaque_object_name, "_scRNA.RData")
  )
}
rm(macaque_split_names, macaque_sample_index, macaque_object_name)

Count tables

# Reload the sixteen clustered human samples (objects human1 .. human16).
for (human_sample_index in 1:16) {
  load(paste0(objects_directory, "human", human_sample_index, "_scRNA.RData"))
}

# Run findClusterCorrespondingToAstrocytes() on every human sample with the
# "RNA_snn_res.2" cluster set and a shared marker panel; results are stored as
# human<i>_astrocyte_counts.
human_astrocyte_marker_genes <- c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")
for (human_sample_index in 1:16) {
  human_sample_object <- get(paste0("human", human_sample_index))
  assign(
    paste0("human", human_sample_index, "_astrocyte_counts"),
    findClusterCorrespondingToAstrocytes(
      human_sample_object,
      chosenClusterSet = "RNA_snn_res.2",
      astrocyticMarkers = human_astrocyte_marker_genes
    )
  )
}
# Drop the loop helpers so the workspace holds the same objects as before.
rm(human_sample_index, human_sample_object, human_astrocyte_marker_genes)

# Store all sixteen result objects together in one file.
save(
  list = paste0("human", 1:16, "_astrocyte_counts"),
  file = paste0(objects_directory, "human_scRNA_pseudobulk_data.RData")
)

# Call getAstrocytes() on every human sample with the "RNA_snn_res.2" cluster
# set and the same marker panel. The trailing number on each call is the
# astrocyte count reported by getAstrocytes() in the recorded console output.
human1_astrocyte = getAstrocytes(human1,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  153
human2_astrocyte = getAstrocytes(human2,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  276
human3_astrocyte = getAstrocytes(human3,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  252
human4_astrocyte = getAstrocytes(human4,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  280
human5_astrocyte = getAstrocytes(human5,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  153
human6_astrocyte = getAstrocytes(human6,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  85
human7_astrocyte = getAstrocytes(human7,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  41
human8_astrocyte = getAstrocytes(human8,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  205
human9_astrocyte = getAstrocytes(human9,chosenClusterSet = "RNA_snn_res.2",
                                 astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  40
human10_astrocyte = getAstrocytes(human10,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  82
human11_astrocyte = getAstrocytes(human11,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  259
human12_astrocyte = getAstrocytes(human12,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  96
human13_astrocyte = getAstrocytes(human13,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  112
human14_astrocyte = getAstrocytes(human14,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  TODO confirm: no count recorded (earlier note said 153, likely copy-pasted)
human15_astrocyte = getAstrocytes(human15,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  TODO confirm: no count recorded (earlier note said 153, likely copy-pasted)
human16_astrocyte = getAstrocytes(human16,chosenClusterSet = "RNA_snn_res.2",
                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")) #  TODO confirm: no count recorded (earlier note said 153, likely copy-pasted)

> human1_astrocyte = getAstrocytes(human1,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 26 found 153 astrocytes"
> human2_astrocyte = getAstrocytes(human2,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 9 found 276 astrocytes"
> human3_astrocyte = getAstrocytes(human3,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 10 found 252 astrocytes"
> human4_astrocyte = getAstrocytes(human4,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 22 found 280 astrocytes"
> human5_astrocyte = getAstrocytes(human5,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 26 found 153 astrocytes"
> human6_astrocyte = getAstrocytes(human6,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 20 found 85 astrocytes"
> human7_astrocyte = getAstrocytes(human7,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 23 found 41 astrocytes"
> human8_astrocyte = getAstrocytes(human8,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 21 found 205 astrocytes"
> human9_astrocyte = getAstrocytes(human9,chosenClusterSet = "RNA_snn_res.2",
+                                  astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 22 found 40 astrocytes"
> human10_astrocyte = getAstrocytes(human10,chosenClusterSet = "RNA_snn_res.2",
+                                   astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 21 found 82 astrocytes"
> human11_astrocyte = getAstrocytes(human11,chosenClusterSet = "RNA_snn_res.2",
+                                   astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 31 found 259 astrocytes"
> human12_astrocyte = getAstrocytes(human12,chosenClusterSet = "RNA_snn_res.2",
+                                   astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 27 found 96 astrocytes"
> human13_astrocyte = getAstrocytes(human13,chosenClusterSet = "RNA_snn_res.2",
+                                   astrocyticMarkers = c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9"))
[1] "astrocytes are in cluster 12 found 112 astrocytes"

# Assemble the per-sample human results into one table with one column per
# sample (human1 .. human16); rows are labelled with the names of the human1
# result.
load(paste0(objects_directory, "human_scRNA_pseudobulk_data.RData"))
human_sample_ids <- paste0("human", 1:16)
human_count_columns <- mget(paste0(human_sample_ids, "_astrocyte_counts"))
names(human_count_columns) <- human_sample_ids
human_astrocyte_counts <- do.call(
  data.frame,
  c(human_count_columns, list(row.names = names(human1_astrocyte_counts)))
)

# One metadata row per sample column; every human sample gets the same labels.
n_human_samples <- ncol(human_astrocyte_counts)
human_astrocyte_counts_metadata <- data.frame(
  Species = rep("Human", n_human_samples),
  Human_NHP = rep("Human", n_human_samples),
  study = rep("Mixed", n_human_samples),
  stage = rep("Foetal", n_human_samples),
  row.names = colnames(human_astrocyte_counts)
)
# Drop the helpers so the workspace holds the same objects as before.
rm(human_sample_ids, human_count_columns, n_human_samples)

save(
  human_astrocyte_counts, human_astrocyte_counts_metadata,
  file = paste0(objects_directory, "human_astrocyte_counts.RData")
)

Macaque: Find astrocytes in individual samples

# Reload the twelve clustered macaque samples (objects macaque1 .. macaque12).
for (macaque_sample_index in 1:12) {
  load(paste0(objects_directory, "macaque", macaque_sample_index, "_scRNA.RData"))
}

# Marker panel shared by every call below.
macaque_astrocyte_marker_genes <- c("SLC1A3","SOX9","GFAP","AQP4","ALDH1A1","ID4","APOE","S100A9")

# The samples are processed in small batches so that the large per-sample
# objects can be released between batches. For every sample both results are
# computed while the object is still loaded:
#   macaque<i>_astrocyte_counts  from findClusterCorrespondingToAstrocytes()
#   macaque<i>_astrocyte         from getAstrocytes()
# getAstrocytes() used to be called only after all sample objects had already
# been removed with rm(), which fails with "object not found"; it now runs
# inside the batch, before the objects are freed.
# Each batch's count results are saved to the (unchanged) file listed here.
macaque_batches <- list(
  list(samples = 1:3,   file = "~/Desktop/macaques123.RData"),
  list(samples = 4:5,   file = "~/Desktop/macaques45.RData"),
  list(samples = 6:7,   file = "~/Desktop/macaques67.RData"),
  list(samples = 8:9,   file = "~/Desktop/macaques89.RData"),
  list(samples = 10:11, file = "~/Desktop/macaques10_11.RData"),
  list(samples = 12,    file = "~/Desktop/macaques_12.RData")
)

for (macaque_batch in macaque_batches) {
  batch_object_names <- paste0("macaque", macaque_batch$samples)

  for (batch_object_name in batch_object_names) {
    batch_sample_object <- get(batch_object_name)
    assign(
      paste0(batch_object_name, "_astrocyte_counts"),
      findClusterCorrespondingToAstrocytes(
        batch_sample_object,
        chosenClusterSet = "RNA_snn_res.2",
        astrocyticMarkers = macaque_astrocyte_marker_genes
      )
    )
    assign(
      paste0(batch_object_name, "_astrocyte"),
      getAstrocytes(
        batch_sample_object,
        chosenClusterSet = "RNA_snn_res.2",
        astrocyticMarkers = macaque_astrocyte_marker_genes
      )
    )
  }

  save(
    list = paste0(batch_object_names, "_astrocyte_counts"),
    file = macaque_batch$file
  )

  # Release the batch's sample objects (including the temporary reference)
  # before moving on, then ask R to reclaim the memory.
  rm(batch_sample_object)
  rm(list = batch_object_names)
  gc()
}
rm(macaque_sample_index, macaque_astrocyte_marker_genes, macaque_batches,
   macaque_batch, batch_object_names, batch_object_name)

# Assemble the per-sample macaque results into one table. Each entry maps the
# output column name to the result object that fills it; the trailing numbers
# are the annotations carried over from the original per-column comments.
macaque_column_sources <- c(
  macaque1_late = "macaque1_astrocyte_counts",        # 807
  macaque2_late = "macaque2_astrocyte_counts",        # 36
  Macaque3_78_1 = "macaque3_astrocyte_counts",        # 687
  Macaque3_110_DFC = "macaque4_astrocyte_counts",     # 783
  Macaque3_110_OFC = "macaque5_astrocyte_counts",     # 114
  Macaque3_93_DFC = "macaque6_astrocyte_counts",      # 613
  Macaque3_110_DFC_S2 = "macaque7_astrocyte_counts",  # 37
  Macaque3_110_VFC = "macaque8_astrocyte_counts",     # 713
  Macaque3_110_VFC_S2 = "macaque9_astrocyte_counts",  # 265
  Macaque3_77_Frontal = "macaque10_astrocyte_counts", # 441
  Macaque3_64_Frontal = "macaque11_astrocyte_counts", # 314
  Macaque3_62_Frontal = "macaque12_astrocyte_counts"  # 246
)
macaque_count_columns <- mget(unname(macaque_column_sources))
names(macaque_count_columns) <- names(macaque_column_sources)
macaque_astrocyte_counts <- do.call(
  data.frame,
  c(macaque_count_columns, list(row.names = names(macaque1_astrocyte_counts)))
)

# One metadata row per sample column; every macaque sample gets the same labels.
n_macaque_samples <- ncol(macaque_astrocyte_counts)
macaque_astrocyte_counts_metadata <- data.frame(
  Species = rep("Macaque", n_macaque_samples),
  Human_NHP = rep("NHP", n_macaque_samples),
  study = rep("Mixed", n_macaque_samples),
  stage = rep("Foetal", n_macaque_samples),
  row.names = colnames(macaque_astrocyte_counts)
)
# Drop the helpers so the workspace holds the same objects as before.
rm(macaque_column_sources, macaque_count_columns, n_macaque_samples)
save(
  macaque_astrocyte_counts, macaque_astrocyte_counts_metadata,
  file = paste0(objects_directory, "macaque_astrocyte_counts.RData")
)

Pseudo bulk approach to identify markers of human astrocyte evolution

Only a few astrocytes were found in samples “human14”, “human7” and “human13”, so these samples are removed from the analysis.

# Load the macaque and human astrocyte count tables.
load(paste0(objects_directory,"macaque_astrocyte_counts.RData"))
load(paste0(objects_directory,"human_astrocyte_counts.RData"))
# Both tables must list the same genes in the same order before they are bound column-wise.
# identical() also fails when the lengths differ, which an element-wise `==` can hide
# through recycling; it prints TRUE/FALSE exactly like the previous all(...) check.
identical( rownames(macaque_astrocyte_counts), rownames(human_astrocyte_counts) )

## [1] TRUE

# Bind the macaque and human count tables column-wise and stack their sample annotations
# in the same (macaque first, human second) order.
stitched_counts = data.frame( macaque_astrocyte_counts, human_astrocyte_counts )
st_metadata = rbind( macaque_astrocyte_counts_metadata,human_astrocyte_counts_metadata )
# Count columns and annotation rows must agree in name, order and number.
# identical() catches a length mismatch that an element-wise `==` could mask by recycling.
identical( colnames(stitched_counts), rownames(st_metadata) )

## [1] TRUE

# Build the DESeq2 data set for the Human vs NHP comparison, leaving out the three
# human samples listed below. The same list filters both the count columns and the
# annotation rows.
low_astro_samples = c("human14","human7","human13")
data = DESeqDataSetFromMatrix(
  countData = stitched_counts[ , !(colnames(stitched_counts) %in% low_astro_samples) ],
  colData   = st_metadata[ !(rownames(st_metadata) %in% low_astro_samples), ],
  design    = ~ Human_NHP )

## converting counts to integer mode

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Size factors computed here are reused by the later DESeq() call
# (its log reports "using pre-existing size factors").
data =  estimateSizeFactors(data)
# NOTE(review): the DESeq() call that follows re-estimates dispersions (its log reports
# "found already estimated dispersions, replacing these"), so this step looks redundant —
# confirm before removing.
data = estimateDispersions(data, fitType = "local")

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

# Fit the negative-binomial model and run the tests, using the local dispersion fit.
data = DESeq( data, fitType = "local" )

## using pre-existing size factors

## estimating dispersions

## found already estimated dispersions, replacing these

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 89 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

# Results for the Human vs NHP contrast (positive log2 fold change = higher in Human).
degs = results( data, contrast = c("Human_NHP", "Human", "NHP") )

# Raw and size-factor-normalised count matrices of the fitted data set.
human_macaque_fetal_unnorm_counts = counts( data, normalized = FALSE )
human_macaque_fetal_norm_counts = counts( data, normalized = TRUE )

# Save the fitted object, the unfiltered results table and both count matrices.
save( data, degs,
      human_macaque_fetal_norm_counts, human_macaque_fetal_unnorm_counts,
      file = paste0(objects_directory, "pseudobulk_published_scRNA_Foetal.RData") )

# Keep only genes for which an adjusted p-value is available.
degs = degs[ which(!is.na(degs$padj)), ]
# Genes significant at padj < 0.1.
degs_01 = degs[ which(degs$padj < 0.1), ]
# Number of significant genes with a positive log2 fold change (Human vs NHP contrast).
sum( degs_01$log2FoldChange > 0 )

## [1] 5212

# Number of significant genes with a negative log2 fold change (Human vs NHP contrast).
sum( degs_01$log2FoldChange<0 )

## [1] 6219

We confirm 37% (87/238) of up-regulated genes and 35% (104/301) of down-regulated genes.

# Ensembl gene IDs of the up- and down-regulated hit lists; the files are read without
# a header and the IDs are taken from the first column (V1).
up.hits.ensids = read.delim( paste0(outputs_directory,"up_engs.txt"), header = FALSE, as.is = TRUE )
dn.hits.ensids = read.delim( paste0(outputs_directory,"dn_engs.txt"), header = FALSE, as.is = TRUE )

# Translate the Ensembl IDs into unique HGNC symbols via the gene map.
up.hits.geneN = unique( with(genemap, hgnc_symbol[ensembl_gene_id %in% up.hits.ensids$V1]) )
dn.hits.geneN = unique( with(genemap, hgnc_symbol[ensembl_gene_id %in% dn.hits.ensids$V1]) )

# Restrict both lists to symbols that are row names of the (NA-filtered) results table.
up.hits.geneN_filt = intersect( up.hits.geneN, rownames(degs) )
dn.hits.geneN_filt = intersect( dn.hits.geneN, rownames(degs) )
length( up.hits.geneN_filt )

## [1] 238

# Number of down-regulated hit symbols present in the results table.
length(dn.hits.geneN_filt)

## [1] 301

# Result rows for the up- and down-regulated hit lists.
degs_us_up = degs[ is.element(rownames(degs), up.hits.geneN_filt), ]
degs_us_dn = degs[ is.element(rownames(degs), dn.hits.geneN_filt), ]

# Up-regulated hits confirmed here: padj < 0.1 and higher in Human.
degs_us_up[ degs_us_up$log2FoldChange > 0 & degs_us_up$padj < 0.1, ]

## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 87 rows and 6 columns
##           baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##          <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## SCNN1D     6.61796       5.252182  0.580160   9.05299 1.39105e-19 9.86892e-19
## CDK11A    35.76199       2.444782  0.184597  13.24392 4.89269e-40 9.54270e-39
## SLC35E2A  58.70908       2.730201  0.189547  14.40384 4.89453e-47 1.27041e-45
## H6PD      20.64759       1.287565  0.170613   7.54670 4.46417e-14 2.23169e-13
## DFFA      17.20583       0.636637  0.168038   3.78865 1.51468e-04 3.34767e-04
## ...            ...            ...       ...       ...         ...         ...
## ADA2       8.39128       2.439363  0.293316   8.31651 9.05931e-17 5.33378e-16
## LZTR1     25.92802       0.882482  0.168153   5.24808 1.53688e-07 4.68215e-07
## C1QTNF6    7.36232       1.673785  0.369422   4.53082 5.87541e-06 1.52153e-05
## MT-ATP8   24.80009       8.290334  0.666553  12.43761 1.63309e-35 2.54872e-34
## MT-ATP6   15.51741       7.188377  0.712925  10.08294 6.57311e-24 5.92171e-23

# Down-regulated hits confirmed here: padj < 0.1 and lower in Human.
degs_us_dn[degs_us_dn$padj<0.1 & degs_us_dn$log2FoldChange<0,]

## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 104 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat               pvalue
##            <numeric>      <numeric> <numeric> <numeric>            <numeric>
## SRSF10      104.4759      -0.932483  0.197947  -4.71077 0.000002467874236896
## RCAN3        15.2877      -2.830897  0.421129  -6.72216 0.000000000017904770
## PDIK1L       14.7172      -1.528401  0.215272  -7.09987 0.000000000001248702
## SRSF4       109.6152      -0.319722  0.169547  -1.88574 0.059329607370328094
## PRPF38A      32.0105      -1.528840  0.200637  -7.61992 0.000000000000025384
## ...              ...            ...       ...       ...                  ...
## ZNF776      24.28439       -2.44103  0.202366 -12.06248          1.66682e-33
## OLIG2       23.58954       -2.73026  0.728066  -3.75002          1.76820e-04
## DONSON      10.98414       -1.23557  0.311389  -3.96792          7.25017e-05
## MT-CO3     138.76363       -4.75420  0.436041 -10.90310          1.11399e-27
## C1GALT1C1L   1.77877       -3.83977  0.630962  -6.08558          1.16069e-09
##                            padj
##                       <numeric>
## SRSF10     0.000006656184798295
## RCAN3      0.000000000075424177
## PDIK1L     0.000000000005675825
## SRSF4      0.089430349211630344
## PRPF38A    0.000000000000128673
## ...                         ...
## ZNF776              2.37697e-32
## OLIG2               3.88128e-04
## DONSON              1.66126e-04
## MT-CO3              1.20738e-26
## C1GALT1C1L          4.24191e-09

# Symbols of hits confirmed in the Human vs NHP comparison:
# padj < 0.1 with the fold change in the expected direction.
conf_up = rownames(degs_us_up[degs_us_up$padj<0.1 & degs_us_up$log2FoldChange>0,])
conf_down = rownames(degs_us_dn[degs_us_dn$padj<0.1 & degs_us_dn$log2FoldChange<0,])

# Fraction of confirmed hits among the hits present in the results table.
# `names.arg` is spelled out in full instead of relying on partial matching of `names`.
barplot( c(length(conf_up)/length(up.hits.geneN_filt),
           length(conf_down)/length(dn.hits.geneN_filt)),
         col=c("green4","wheat3"), ylim=c(0,0.5),ylab="Fraction",
         names.arg=c("Up","Down"),xlab="EAGs")
axis(2,lwd=1)

Boxplots of chosen genes

# Sample annotation without the three excluded human samples.
sa = st_metadata[ which(!(rownames(st_metadata) %in% c("human14","human7","human13"))), ]
# Draw a Human-vs-Macaque boxplot of one gene's counts.
#
# Args:
#   ct:   count matrix with genes in rows (row names identify the genes) and samples
#         in columns, e.g. human_macaque_fetal_norm_counts.
#   gene: row name of the gene to plot; must match exactly one row of `ct`.
#   sa:   sample annotation data.frame with a `Species` column. When every column name
#         of `ct` is a row name of `sa`, annotation rows are matched to samples by name;
#         otherwise `sa` must have one row per column of `ct`, in the same order.
#   cols: border colours of the Human and Macaque boxes, in that order.
# Returns: the value of boxplot() (box statistics, returned invisibly).
plotAGene = function( ct, gene, sa, cols ){
  # ct = human_macaque_fetal_norm_counts; gene="CTCF"
  # sa = st_metadata[! rownames(st_metadata) %in% c("human14","human7","human13"),]
  # cols = c("black","blue")
  hit = which( rownames(ct) == gene )
  if( length(hit) != 1L ){
    # a missing or duplicated gene would otherwise yield an empty or scrambled plot
    stop( "plotAGene: expected exactly one row named '", gene, "' in ct, found ",
          length(hit), call. = FALSE )
  }
  if( !is.null(colnames(ct)) && all(colnames(ct) %in% rownames(sa)) ){
    # align annotation to the count columns by sample name (exact matching)
    sa = sa[ match(colnames(ct), rownames(sa)), , drop = FALSE ]
  } else if( ncol(ct) != nrow(sa) ){
    stop( "plotAGene: ct has ", ncol(ct), " samples but sa has ", nrow(sa), " rows",
          call. = FALSE )
  }
  x = split( ct[hit, ], sa$Species )[c("Human","Macaque")]
  boxplot(x,border=cols,main=gene,col="white")
}

# Result row for CTCF in the Human vs NHP contrast.
degs["CTCF",]

## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 1 row and 6 columns
##       baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##      <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## CTCF   73.2475      -0.458664  0.186933  -2.45363 0.0141422 0.0240199

# NOTE(review): the P quoted in the trailing comment differs from the CTCF row printed by
# degs["CTCF",] (pvalue 0.0141422, padj 0.0240199) — confirm which test/run it comes from.
plotAGene( human_macaque_fetal_norm_counts, "CTCF", sa, c("black","blue")) # P=0.0147813

# Result row for TEAD3 in the Human vs NHP contrast.
degs["TEAD3",]

## log2 fold change (MLE): Human_NHP Human vs NHP 
## Wald test p-value: Human_NHP Human vs NHP 
## DataFrame with 1 row and 6 columns
##        baseMean log2FoldChange     lfcSE      stat          pvalue
##       <numeric>      <numeric> <numeric> <numeric>       <numeric>
## TEAD3   11.8658        1.36589   0.24607   5.55085 0.0000000284291
##                  padj
##             <numeric>
## TEAD3 0.0000000923959

# NOTE(review): the P quoted in the trailing comment differs from the TEAD3 row printed by
# degs["TEAD3",] (pvalue 2.84291e-08, padj 9.23959e-08) — confirm which test/run it comes from.
plotAGene( human_macaque_fetal_norm_counts, "TEAD3", sa, c("black","blue")) # P=3.20079e-08

Table for the supplement

Kanton

# Kanton table of human DE genes: split gene symbols by whether the average
# expression is higher in human (up) or higher in chimp (down).
kanton_hits = read.delim( paste0(outputs_directory,"Supplementary_Table_15_human_DE.txt") )
human_higher = kanton_hits$Average.expression..human. > kanton_hits$Average.expression..chimp.
human_lower  = kanton_hits$Average.expression..human. < kanton_hits$Average.expression..chimp.
kanton_hits_up = kanton_hits$Symbol[ human_higher ]
kanton_hits_dn = kanton_hits$Symbol[ human_lower ]

Jorstad

# Jorstad astrocyte tables, human versus chimp / gorilla / rhesus / marmoset.
# The files are comma-separated, so read.csv() applies the same parsing as
# read.delim(sep=",").
hs_pt = read.csv( paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_chimp_sig_genes.txt') )
hs_pp = read.csv( paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_gorilla_sig_genes.txt') )
hs_rm = read.csv( paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_rhesus_sig_genes.txt') )
hs_cj = read.csv( paste0(outputs_directory,'Jorstad_tables/Astro_human_vs_marmoset_sig_genes.txt') )

# Discard rows without an adjusted p-value.
hs_pt = subset( hs_pt, !is.na(padj) )
hs_pp = subset( hs_pp, !is.na(padj) )
hs_rm = subset( hs_rm, !is.na(padj) )
hs_cj = subset( hs_cj, !is.na(padj) )

# Down- and up-regulated genes per comparison: padj < 0.1 and log2 fold change
# beyond -thr / +thr (1 = chimp, 2 = gorilla, 3 = rhesus, 4 = marmoset).
thr = 0

down1 = with( hs_pt, gene[ log2FoldChange < -thr & padj < 0.1 ] )
down2 = with( hs_pp, gene[ log2FoldChange < -thr & padj < 0.1 ] )
down3 = with( hs_rm, gene[ log2FoldChange < -thr & padj < 0.1 ] )
down4 = with( hs_cj, gene[ log2FoldChange < -thr & padj < 0.1 ] )

up1 = with( hs_pt, gene[ log2FoldChange > thr & padj < 0.1 ] )
up2 = with( hs_pp, gene[ log2FoldChange > thr & padj < 0.1 ] )
up3 = with( hs_rm, gene[ log2FoldChange > thr & padj < 0.1 ] )
up4 = with( hs_cj, gene[ log2FoldChange > thr & padj < 0.1 ] )

# Genes of the first comparison that recur in the other comparisons:
# *12 = also in comparison 2, *123 = also in 2 and 3, *1234 = in all four.
down12   = down1[ down1 %in% down2 ]
up12     = up1[ up1 %in% up2 ]

down123  = down1[ down1 %in% down2 & down1 %in% down3 ]
up123    = up1[ up1 %in% up2 & up1 %in% up3 ]

down1234 = down1[ down1 %in% down2 & down1 %in% down3 & down1 %in% down4 ]
up1234   = up1[ up1 %in% up2 & up1 %in% up3 & up1 %in% up4 ]

Ma et al

These are the tables I obtained directly from Shaojie Ma.

# Load the raw Wilcoxon DEG results; presumably this file provides `deg_species`,
# which is used on the next line — TODO confirm.
# NOTE(review): this path starts with '/', unlike the other paste0(outputs_directory, ...)
# calls in this section — confirm the resulting double slash is harmless.
load(paste0(outputs_directory,'/Wilcox_DEG_results_raw.Rdata'))
# Keep astrocyte-cluster rows with adjusted p-value below 0.01.
deg_species_filt = deg_species[deg_species$cluster=="Astro" & deg_species$p_val_adj<0.01,]

#############################
# Up in human: positive log2(ratio_fc), Human as species1 versus Chimpanzee or Rhesus.
deg_species_filt_astro_hs = deg_species_filt[log2(deg_species_filt$ratio_fc)>0,]
deg_species_filt_astro_hs = deg_species_filt_astro_hs[deg_species_filt_astro_hs$species1=="Human" & deg_species_filt_astro_hs$species2 %in% c("Chimpanzee","Rhesus"),]

# Genes occurring more than once among the retained rows; presumably this means
# they are found in both comparisons (assumes one row per gene and species pair — TODO confirm).
up_genes = table(deg_species_filt_astro_hs$gene)
up_genes = names(up_genes[up_genes>1])

#############################
# Down in human: same selection with negative log2(ratio_fc).
# Note that deg_species_filt_astro_hs is overwritten here.
deg_species_filt_astro_hs = deg_species_filt[log2(deg_species_filt$ratio_fc)<(0),]
deg_species_filt_astro_hs = deg_species_filt_astro_hs[deg_species_filt_astro_hs$species1=="Human" & deg_species_filt_astro_hs$species2 %in% c("Chimpanzee","Rhesus"),]

dn_genes = table(deg_species_filt_astro_hs$gene)
dn_genes = names(dn_genes[dn_genes>1])


# Sanity check: no gene should be listed as both up- and down-regulated.
any( up_genes %in% dn_genes )

## [1] FALSE

# Number of genes in the up_genes list.
length(up_genes)

## [1] 1429

# Number of genes in the dn_genes list.
length(dn_genes)

## [1] 1123

# Membership table for up-regulated genes: one row per gene found in any list,
# one logical column per gene list.
all_up_all = unique( c(kanton_hits_up,up123,up_genes,up.hits.geneN,conf_up))

all_up_all = data.frame( Kanton = all_up_all %in% kanton_hits_up,
                         Jorstad = all_up_all %in% up123,
                          Ma = all_up_all %in% up_genes,
                          Foetal = all_up_all %in% conf_up,
                          Ciuba = all_up_all %in% up.hits.geneN,
                          row.names = all_up_all)
# Keep genes of the up.hits.geneN list (column Ciuba) that are supported by at least one
# other list. The evidence columns are selected by name rather than by position, so the
# filter does not depend on the column order above.
all_up_all = all_up_all[all_up_all$Ciuba & rowSums(all_up_all[,c("Kanton","Jorstad","Ma","Foetal")])>0,]
all_up_all

##               Kanton Jorstad    Ma Foetal Ciuba
## NBPF11          TRUE   FALSE FALSE   TRUE  TRUE
## NBPF14          TRUE   FALSE FALSE   TRUE  TRUE
## PABPC1L         TRUE   FALSE FALSE   TRUE  TRUE
## PALLD           TRUE    TRUE  TRUE   TRUE  TRUE
## PCAT6           TRUE   FALSE FALSE  FALSE  TRUE
## PIGZ            TRUE    TRUE FALSE   TRUE  TRUE
## SCNN1D          TRUE   FALSE FALSE   TRUE  TRUE
## SCRG1           TRUE    TRUE  TRUE  FALSE  TRUE
## THBS4           TRUE   FALSE  TRUE   TRUE  TRUE
## PAGR1          FALSE    TRUE FALSE   TRUE  TRUE
## STK33          FALSE    TRUE  TRUE   TRUE  TRUE
## VKORC1         FALSE    TRUE FALSE  FALSE  TRUE
## AQP1           FALSE    TRUE  TRUE  FALSE  TRUE
## MTCH1          FALSE    TRUE  TRUE  FALSE  TRUE
## PRDX6          FALSE    TRUE  TRUE  FALSE  TRUE
## RMDN1          FALSE    TRUE FALSE  FALSE  TRUE
## RANGRF         FALSE    TRUE FALSE  FALSE  TRUE
## GUK1           FALSE    TRUE  TRUE  FALSE  TRUE
## ATP6V1E2       FALSE    TRUE  TRUE   TRUE  TRUE
## S100A13        FALSE    TRUE FALSE  FALSE  TRUE
## FAM228B        FALSE    TRUE FALSE  FALSE  TRUE
## LIN7A          FALSE    TRUE FALSE   TRUE  TRUE
## ACACA          FALSE   FALSE  TRUE   TRUE  TRUE
## BAIAP3         FALSE   FALSE  TRUE   TRUE  TRUE
## C1orf54        FALSE   FALSE  TRUE   TRUE  TRUE
## C1QTNF6        FALSE   FALSE  TRUE   TRUE  TRUE
## C22orf46       FALSE   FALSE  TRUE  FALSE  TRUE
## CPS1           FALSE   FALSE  TRUE  FALSE  TRUE
## DGCR6L         FALSE   FALSE  TRUE  FALSE  TRUE
## EFHD1          FALSE   FALSE  TRUE   TRUE  TRUE
## GFPT2          FALSE   FALSE  TRUE   TRUE  TRUE
## GTF3C5         FALSE   FALSE  TRUE   TRUE  TRUE
## HSPB1          FALSE   FALSE  TRUE  FALSE  TRUE
## MLH1           FALSE   FALSE  TRUE   TRUE  TRUE
## MMP19          FALSE   FALSE  TRUE  FALSE  TRUE
## MOV10          FALSE   FALSE  TRUE   TRUE  TRUE
## NDUFV1         FALSE   FALSE  TRUE  FALSE  TRUE
## NR1H3          FALSE   FALSE  TRUE   TRUE  TRUE
## PDLIM7         FALSE   FALSE  TRUE  FALSE  TRUE
## RHOBTB3        FALSE   FALSE  TRUE  FALSE  TRUE
## SIRT3          FALSE   FALSE  TRUE   TRUE  TRUE
## STYXL1         FALSE   FALSE  TRUE   TRUE  TRUE
## TCF25          FALSE   FALSE  TRUE   TRUE  TRUE
## TCTN3          FALSE   FALSE  TRUE   TRUE  TRUE
## TMEM9B-AS1     FALSE   FALSE  TRUE  FALSE  TRUE
## TRIP6          FALSE   FALSE  TRUE  FALSE  TRUE
## TSR3           FALSE   FALSE  TRUE  FALSE  TRUE
## VIM            FALSE   FALSE  TRUE  FALSE  TRUE
## ZNF266         FALSE   FALSE  TRUE   TRUE  TRUE
## ZNHIT3         FALSE   FALSE  TRUE  FALSE  TRUE
## CDK11A         FALSE   FALSE FALSE   TRUE  TRUE
## H6PD           FALSE   FALSE FALSE   TRUE  TRUE
## DFFA           FALSE   FALSE FALSE   TRUE  TRUE
## PAQR7          FALSE   FALSE FALSE   TRUE  TRUE
## SRGAP2B        FALSE   FALSE FALSE   TRUE  TRUE
## HHLA3          FALSE   FALSE FALSE   TRUE  TRUE
## SLC35E2A       FALSE   FALSE FALSE   TRUE  TRUE
## NBPF1          FALSE   FALSE FALSE   TRUE  TRUE
## NBPF15         FALSE   FALSE FALSE   TRUE  TRUE
## NBPF9          FALSE   FALSE FALSE   TRUE  TRUE
## NBPF19         FALSE   FALSE FALSE   TRUE  TRUE
## NBPF26         FALSE   FALSE FALSE   TRUE  TRUE
## ACOX3          FALSE   FALSE FALSE   TRUE  TRUE
## CBR4           FALSE   FALSE FALSE   TRUE  TRUE
## TMEM129        FALSE   FALSE FALSE   TRUE  TRUE
## SULT1C4        FALSE   FALSE FALSE   TRUE  TRUE
## TEAD3          FALSE   FALSE FALSE   TRUE  TRUE
## MAN2B2         FALSE   FALSE FALSE   TRUE  TRUE
## RIPK1          FALSE   FALSE FALSE   TRUE  TRUE
## SRD5A1         FALSE   FALSE FALSE   TRUE  TRUE
## WDR27          FALSE   FALSE FALSE   TRUE  TRUE
## INSYN2B        FALSE   FALSE FALSE   TRUE  TRUE
## C1QTNF3-AMACR  FALSE   FALSE FALSE   TRUE  TRUE
## ABCB4          FALSE   FALSE FALSE   TRUE  TRUE
## CCND3          FALSE   FALSE FALSE   TRUE  TRUE
## MAPKAP1        FALSE   FALSE FALSE   TRUE  TRUE
## NUP43          FALSE   FALSE FALSE   TRUE  TRUE
## PDE1C          FALSE   FALSE FALSE   TRUE  TRUE
## POLR2J3        FALSE   FALSE FALSE   TRUE  TRUE
## ADAM9          FALSE   FALSE FALSE   TRUE  TRUE
## EPHB4          FALSE   FALSE FALSE   TRUE  TRUE
## COL27A1        FALSE   FALSE FALSE   TRUE  TRUE
## SPDYE3         FALSE   FALSE FALSE   TRUE  TRUE
## OSBPL5         FALSE   FALSE FALSE   TRUE  TRUE
## CUBN           FALSE   FALSE FALSE   TRUE  TRUE
## ELMOD1         FALSE   FALSE FALSE   TRUE  TRUE
## SHLD2          FALSE   FALSE FALSE   TRUE  TRUE
## EML3           FALSE   FALSE FALSE   TRUE  TRUE
## TIMM23B-AGAP6  FALSE   FALSE FALSE   TRUE  TRUE
## AGAP4          FALSE   FALSE FALSE   TRUE  TRUE
## FAM111B        FALSE   FALSE FALSE   TRUE  TRUE
## TIMM23B        FALSE   FALSE FALSE   TRUE  TRUE
## AGAP9          FALSE   FALSE FALSE   TRUE  TRUE
## DGKA           FALSE   FALSE FALSE   TRUE  TRUE
## DHRS12         FALSE   FALSE FALSE   TRUE  TRUE
## RFLNA          FALSE   FALSE FALSE   TRUE  TRUE
## LTB4R          FALSE   FALSE FALSE   TRUE  TRUE
## LPCAT2         FALSE   FALSE FALSE   TRUE  TRUE
## CNTNAP1        FALSE   FALSE FALSE   TRUE  TRUE
## ADCY9          FALSE   FALSE FALSE   TRUE  TRUE
## SLCO3A1        FALSE   FALSE FALSE   TRUE  TRUE
## NPIPA1         FALSE   FALSE FALSE   TRUE  TRUE
## ADA2           FALSE   FALSE FALSE   TRUE  TRUE
## MAN2B1         FALSE   FALSE FALSE   TRUE  TRUE
## CARD8          FALSE   FALSE FALSE   TRUE  TRUE
## SLC66A2        FALSE   FALSE FALSE   TRUE  TRUE
## ZNF486         FALSE   FALSE FALSE   TRUE  TRUE
## GYG2           FALSE   FALSE FALSE   TRUE  TRUE
## LZTR1          FALSE   FALSE FALSE   TRUE  TRUE
## MT-ATP6        FALSE   FALSE FALSE   TRUE  TRUE
## MT-ATP8        FALSE   FALSE FALSE   TRUE  TRUE

# Membership table for down-regulated genes: one row per gene found in any list,
# one logical column per gene list.
all_dn_all = unique( c(kanton_hits_dn,down123,dn_genes,dn.hits.geneN,conf_down))

# NOTE(review): this column is spelled `foetal`, while the up-regulated table uses
# `Foetal` — confirm whether the two supplement tables should share one spelling.
all_dn_all = data.frame( Kanton = all_dn_all %in% kanton_hits_dn,
                         Jorstad = all_dn_all %in% down123,
                         Ma = all_dn_all %in% dn_genes,
                         foetal = all_dn_all %in% conf_down,
                         Ciuba = all_dn_all %in% dn.hits.geneN,
                         row.names = all_dn_all)

# Keep genes of the dn.hits.geneN list (column Ciuba) that are supported by at least one
# other list. The evidence columns are selected by name rather than by position, so the
# filter does not depend on the column order above.
all_dn_all = all_dn_all[all_dn_all$Ciuba & rowSums(all_dn_all[,c("Kanton","Jorstad","Ma","foetal")])>0,]
all_dn_all

##            Kanton Jorstad    Ma foetal Ciuba
## CELF4        TRUE   FALSE FALSE  FALSE  TRUE
## FGF13        TRUE   FALSE FALSE  FALSE  TRUE
## SYN1         TRUE   FALSE  TRUE   TRUE  TRUE
## PDZRN4      FALSE    TRUE  TRUE   TRUE  TRUE
## PLCL2       FALSE    TRUE  TRUE  FALSE  TRUE
## SRSF4       FALSE    TRUE FALSE   TRUE  TRUE
## PBLD        FALSE    TRUE  TRUE   TRUE  TRUE
## GABPB1      FALSE    TRUE  TRUE  FALSE  TRUE
## UNC5D       FALSE    TRUE  TRUE   TRUE  TRUE
## RAPGEF5     FALSE    TRUE FALSE  FALSE  TRUE
## DCC         FALSE    TRUE  TRUE  FALSE  TRUE
## ATP8A2      FALSE    TRUE  TRUE  FALSE  TRUE
## PANK3       FALSE    TRUE FALSE   TRUE  TRUE
## RCAN3       FALSE    TRUE  TRUE   TRUE  TRUE
## MAP3K2      FALSE    TRUE FALSE  FALSE  TRUE
## NUDT4       FALSE    TRUE FALSE   TRUE  TRUE
## RND3        FALSE    TRUE  TRUE   TRUE  TRUE
## SPAST       FALSE    TRUE  TRUE  FALSE  TRUE
## FBXO11      FALSE    TRUE FALSE  FALSE  TRUE
## ACIN1       FALSE   FALSE  TRUE  FALSE  TRUE
## AHCTF1      FALSE   FALSE  TRUE  FALSE  TRUE
## ATAD2B      FALSE   FALSE  TRUE  FALSE  TRUE
## CECR2       FALSE   FALSE  TRUE  FALSE  TRUE
## CEP104      FALSE   FALSE  TRUE  FALSE  TRUE
## CREBRF      FALSE   FALSE  TRUE  FALSE  TRUE
## CSPP1       FALSE   FALSE  TRUE  FALSE  TRUE
## DONSON      FALSE   FALSE  TRUE   TRUE  TRUE
## DYRK2       FALSE   FALSE  TRUE   TRUE  TRUE
## EED         FALSE   FALSE  TRUE   TRUE  TRUE
## EFL1        FALSE   FALSE  TRUE   TRUE  TRUE
## ERCC6L2     FALSE   FALSE  TRUE  FALSE  TRUE
## FBXW7       FALSE   FALSE  TRUE  FALSE  TRUE
## GRK4        FALSE   FALSE  TRUE  FALSE  TRUE
## INA         FALSE   FALSE  TRUE   TRUE  TRUE
## INSR        FALSE   FALSE  TRUE  FALSE  TRUE
## KAT6A       FALSE   FALSE  TRUE  FALSE  TRUE
## KLHL24      FALSE   FALSE  TRUE  FALSE  TRUE
## MBTD1       FALSE   FALSE  TRUE  FALSE  TRUE
## MIB1        FALSE   FALSE  TRUE  FALSE  TRUE
## MLLT10      FALSE   FALSE  TRUE  FALSE  TRUE
## PGBD2       FALSE   FALSE  TRUE  FALSE  TRUE
## POLR1B      FALSE   FALSE  TRUE   TRUE  TRUE
## PPM1A       FALSE   FALSE  TRUE   TRUE  TRUE
## PPP4R3B     FALSE   FALSE  TRUE   TRUE  TRUE
## PTPN4       FALSE   FALSE  TRUE  FALSE  TRUE
## RAB3A       FALSE   FALSE  TRUE   TRUE  TRUE
## RPRD2       FALSE   FALSE  TRUE  FALSE  TRUE
## STRN3       FALSE   FALSE  TRUE  FALSE  TRUE
## STXBP1      FALSE   FALSE  TRUE  FALSE  TRUE
## SYT16       FALSE   FALSE  TRUE  FALSE  TRUE
## TERF1       FALSE   FALSE  TRUE   TRUE  TRUE
## TFDP2       FALSE   FALSE  TRUE  FALSE  TRUE
## TRIM2       FALSE   FALSE  TRUE   TRUE  TRUE
## TRIM23      FALSE   FALSE  TRUE   TRUE  TRUE
## TTC33       FALSE   FALSE  TRUE   TRUE  TRUE
## TUBB4A      FALSE   FALSE  TRUE   TRUE  TRUE
## UBN2        FALSE   FALSE  TRUE  FALSE  TRUE
## ZNF148      FALSE   FALSE  TRUE  FALSE  TRUE
## ZNF595      FALSE   FALSE  TRUE  FALSE  TRUE
## ZRANB3      FALSE   FALSE  TRUE  FALSE  TRUE
## NUP133      FALSE   FALSE FALSE   TRUE  TRUE
## RSBN1       FALSE   FALSE FALSE   TRUE  TRUE
## CCDC181     FALSE   FALSE FALSE   TRUE  TRUE
## PRPF38A     FALSE   FALSE FALSE   TRUE  TRUE
## ETAA1       FALSE   FALSE FALSE   TRUE  TRUE
## PDIK1L      FALSE   FALSE FALSE   TRUE  TRUE
## SOX11       FALSE   FALSE FALSE   TRUE  TRUE
## AIDA        FALSE   FALSE FALSE   TRUE  TRUE
## SRSF10      FALSE   FALSE FALSE   TRUE  TRUE
## GDAP2       FALSE   FALSE FALSE   TRUE  TRUE
## ARL6        FALSE   FALSE FALSE   TRUE  TRUE
## PHOSPHO2    FALSE   FALSE FALSE   TRUE  TRUE
## SMARCA5     FALSE   FALSE FALSE   TRUE  TRUE
## KCNH7       FALSE   FALSE FALSE   TRUE  TRUE
## C1GALT1C1L  FALSE   FALSE FALSE   TRUE  TRUE
## KIF2A       FALSE   FALSE FALSE   TRUE  TRUE
## CDC5L       FALSE   FALSE FALSE   TRUE  TRUE
## PRPF4B      FALSE   FALSE FALSE   TRUE  TRUE
## CLK4        FALSE   FALSE FALSE   TRUE  TRUE
## OARD1       FALSE   FALSE FALSE   TRUE  TRUE
## KIF3A       FALSE   FALSE FALSE   TRUE  TRUE
## CEP162      FALSE   FALSE FALSE   TRUE  TRUE
## EIF4E       FALSE   FALSE FALSE   TRUE  TRUE
## ZUP1        FALSE   FALSE FALSE   TRUE  TRUE
## ZCCHC10     FALSE   FALSE FALSE   TRUE  TRUE
## PGM2        FALSE   FALSE FALSE   TRUE  TRUE
## HDAC2       FALSE   FALSE FALSE   TRUE  TRUE
## ZKSCAN8     FALSE   FALSE FALSE   TRUE  TRUE
## BRD2        FALSE   FALSE FALSE   TRUE  TRUE
## CFAP69      FALSE   FALSE FALSE   TRUE  TRUE
## CBLL1       FALSE   FALSE FALSE   TRUE  TRUE
## RANBP6      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF92       FALSE   FALSE FALSE   TRUE  TRUE
## C9orf72     FALSE   FALSE FALSE   TRUE  TRUE
## TMEM196     FALSE   FALSE FALSE   TRUE  TRUE
## ZBTB10      FALSE   FALSE FALSE   TRUE  TRUE
## UBXN2B      FALSE   FALSE FALSE   TRUE  TRUE
## RPAP3       FALSE   FALSE FALSE   TRUE  TRUE
## FAM76B      FALSE   FALSE FALSE   TRUE  TRUE
## FOLH1       FALSE   FALSE FALSE   TRUE  TRUE
## IKZF5       FALSE   FALSE FALSE   TRUE  TRUE
## SMC3        FALSE   FALSE FALSE   TRUE  TRUE
## KMT5B       FALSE   FALSE FALSE   TRUE  TRUE
## DPF2        FALSE   FALSE FALSE   TRUE  TRUE
## LIN7C       FALSE   FALSE FALSE   TRUE  TRUE
## DCDC1       FALSE   FALSE FALSE   TRUE  TRUE
## GVQW3       FALSE   FALSE FALSE   TRUE  TRUE
## HSPA14      FALSE   FALSE FALSE   TRUE  TRUE
## C10orf143   FALSE   FALSE FALSE   TRUE  TRUE
## YAF2        FALSE   FALSE FALSE   TRUE  TRUE
## PKP2        FALSE   FALSE FALSE   TRUE  TRUE
## ATP2B1      FALSE   FALSE FALSE   TRUE  TRUE
## VCPKMT      FALSE   FALSE FALSE   TRUE  TRUE
## CAND1       FALSE   FALSE FALSE   TRUE  TRUE
## ZC2HC1C     FALSE   FALSE FALSE   TRUE  TRUE
## RBM26       FALSE   FALSE FALSE   TRUE  TRUE
## THTPA       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF200      FALSE   FALSE FALSE   TRUE  TRUE
## CTCF        FALSE   FALSE FALSE   TRUE  TRUE
## AKTIP       FALSE   FALSE FALSE   TRUE  TRUE
## NRG4        FALSE   FALSE FALSE   TRUE  TRUE
## ADAP2       FALSE   FALSE FALSE   TRUE  TRUE
## DLL3        FALSE   FALSE FALSE   TRUE  TRUE
## ZNF175      FALSE   FALSE FALSE   TRUE  TRUE
## APOE        FALSE   FALSE FALSE   TRUE  TRUE
## OSBPL2      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF304      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF776      FALSE   FALSE FALSE   TRUE  TRUE
## EID2B       FALSE   FALSE FALSE   TRUE  TRUE
## MEX3C       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF17       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF600      FALSE   FALSE FALSE   TRUE  TRUE
## ZNF181      FALSE   FALSE FALSE   TRUE  TRUE
## PEG3        FALSE   FALSE FALSE   TRUE  TRUE
## OLIG2       FALSE   FALSE FALSE   TRUE  TRUE
## ZNF134      FALSE   FALSE FALSE   TRUE  TRUE
## RBMX        FALSE   FALSE FALSE   TRUE  TRUE
## PHF6        FALSE   FALSE FALSE   TRUE  TRUE
## MT-CO3      FALSE   FALSE FALSE   TRUE  TRUE
## PGAM4       FALSE   FALSE FALSE   TRUE  TRUE
## RTL5        FALSE   FALSE FALSE   TRUE  TRUE

Session Info

# Record the R version, platform and package versions used for this run.
sessionInfo()

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] pl_PL.UTF-8/pl_PL.UTF-8/pl_PL.UTF-8/C/pl_PL.UTF-8/pl_PL.UTF-8
## 
## attached base packages:
##  [1] tools     grid      stats4    parallel  stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0                            
##  [2] readr_2.1.4                             
##  [3] SeuratObject_4.1.3                      
##  [4] Seurat_4.3.0                            
##  [5] ggVennDiagram_1.2.2                     
##  [6] rBLAST_0.99.2                           
##  [7] Rsubread_2.6.4                          
##  [8] BSgenome.Ptroglodytes.UCSC.panTro6_1.4.2
##  [9] beeswarm_0.4.0                          
## [10] VennDiagram_1.7.3                       
## [11] futile.logger_1.4.3                     
## [12] scuttle_1.2.1                           
## [13] SingleCellExperiment_1.14.1             
## [14] forcats_1.0.0                           
## [15] RColorBrewer_1.1-3                      
## [16] glmGamPoi_1.4.0                         
## [17] reshape2_1.4.4                          
## [18] kableExtra_1.3.4                        
## [19] plotly_4.10.1                           
## [20] dplyr_1.1.2                             
## [21] ggrepel_0.9.3                           
## [22] data.table_1.14.8                       
## [23] pheatmap_1.0.12                         
## [24] LSD_4.1-0                               
## [25] BSgenome.Hsapiens.UCSC.hg38_1.4.3       
## [26] BSgenome_1.60.0                         
## [27] colorspace_2.1-0                        
## [28] rtracklayer_1.52.1                      
## [29] Rsamtools_2.8.0                         
## [30] Biostrings_2.60.2                       
## [31] XVector_0.32.0                          
## [32] GenomicFeatures_1.44.2                  
## [33] biomaRt_2.48.3                          
## [34] Gviz_1.36.2                             
## [35] st_1.2.7                                
## [36] sda_1.3.8                               
## [37] fdrtool_1.2.17                          
## [38] corpcor_1.6.10                          
## [39] entropy_1.3.1                           
## [40] smoothmest_0.1-3                        
## [41] MASS_7.3-58.3                           
## [42] genefilter_1.74.1                       
## [43] edgeR_3.34.1                            
## [44] limma_3.48.3                            
## [45] DESeq2_1.32.0                           
## [46] SummarizedExperiment_1.22.0             
## [47] MatrixGenerics_1.4.3                    
## [48] matrixStats_0.63.0                      
## [49] GenomicRanges_1.44.0                    
## [50] GenomeInfoDb_1.28.4                     
## [51] geneplotter_1.70.0                      
## [52] annotate_1.70.0                         
## [53] XML_3.99-0.14                           
## [54] AnnotationDbi_1.54.1                    
## [55] IRanges_2.26.0                          
## [56] S4Vectors_0.30.2                        
## [57] lattice_0.21-8                          
## [58] locfit_1.5-9.7                          
## [59] Biobase_2.52.0                          
## [60] BiocGenerics_0.38.0                     
## [61] plyr_1.8.8                              
## [62] ggplot2_3.4.2                           
## [63] Matrix_1.5-4                            
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3            scattermore_0.8          
##   [3] tidyr_1.3.0               bit64_4.0.5              
##   [5] knitr_1.42                irlba_2.3.5.1            
##   [7] DelayedArray_0.18.0       rpart_4.1.19             
##   [9] KEGGREST_1.32.0           RCurl_1.98-1.12          
##  [11] AnnotationFilter_1.16.0   generics_0.1.3           
##  [13] cowplot_1.1.1             lambda.r_1.2.4           
##  [15] RSQLite_2.3.1             RANN_2.6.1               
##  [17] proxy_0.4-27              future_1.32.0            
##  [19] tzdb_0.3.0                bit_4.0.5                
##  [21] spatstat.data_3.0-1       webshot_0.5.4            
##  [23] xml2_1.3.3                httpuv_1.6.9             
##  [25] xfun_0.38                 hms_1.1.3                
##  [27] jquerylib_0.1.4           evaluate_0.20            
##  [29] promises_1.2.0.1          fansi_1.0.4              
##  [31] restfulr_0.0.15           progress_1.2.2           
##  [33] dbplyr_2.3.2              igraph_1.4.2             
##  [35] DBI_1.1.3                 htmlwidgets_1.6.2        
##  [37] spatstat.geom_3.1-0       purrr_1.0.1              
##  [39] ellipsis_0.3.2            backports_1.4.1          
##  [41] deldir_1.0-6              sparseMatrixStats_1.4.2  
##  [43] vctrs_0.6.1               ensembldb_2.16.4         
##  [45] ROCR_1.0-11               abind_1.4-5              
##  [47] cachem_1.0.7              withr_2.5.0              
##  [49] RVenn_1.1.0               progressr_0.13.0         
##  [51] checkmate_2.1.0           sctransform_0.3.5        
##  [53] GenomicAlignments_1.28.0  prettyunits_1.1.1        
##  [55] goftest_1.2-3             svglite_2.1.1            
##  [57] cluster_2.1.4             lazyeval_0.2.2           
##  [59] crayon_1.5.2              spatstat.explore_3.1-0   
##  [61] units_0.8-1               labeling_0.4.2           
##  [63] pkgconfig_2.0.3           nlme_3.1-162             
##  [65] ProtGenerics_1.24.0       nnet_7.3-18              
##  [67] rlang_1.1.0               globals_0.16.2           
##  [69] lifecycle_1.0.3           miniUI_0.1.1.1           
##  [71] filelock_1.0.2            BiocFileCache_2.0.0      
##  [73] dichromat_2.0-0.1         invgamma_1.1             
##  [75] polyclip_1.10-4           lmtest_0.9-40            
##  [77] ashr_2.2-54               carData_3.0-5            
##  [79] zoo_1.8-11                base64enc_0.1-3          
##  [81] ggridges_0.5.4            png_0.1-8                
##  [83] viridisLite_0.4.1         rjson_0.2.21             
##  [85] bitops_1.0-7              KernSmooth_2.23-20       
##  [87] blob_1.2.4                DelayedMatrixStats_1.14.3
##  [89] classInt_0.4-9            mixsqp_0.3-48            
##  [91] SQUAREM_2021.1            stringr_1.5.0            
##  [93] spatstat.random_3.1-4     parallelly_1.35.0        
##  [95] rstatix_0.7.2             jpeg_0.1-10              
##  [97] ggsignif_0.6.4            beachmat_2.8.1           
##  [99] scales_1.2.1              memoise_2.0.1            
## [101] magrittr_2.0.3            ica_1.0-3                
## [103] zlibbioc_1.38.0           compiler_4.1.0           
## [105] BiocIO_1.2.0              fitdistrplus_1.1-8       
## [107] cli_3.6.1                 listenv_0.9.0            
## [109] patchwork_1.1.2           pbapply_1.7-0            
## [111] htmlTable_2.4.1           formatR_1.14             
## [113] Formula_1.2-5             tidyselect_1.2.0         
## [115] stringi_1.7.12            highr_0.10               
## [117] yaml_2.3.7                latticeExtra_0.6-30      
## [119] sass_0.4.5                VariantAnnotation_1.38.0 
## [121] future.apply_1.10.0       rstudioapi_0.14          
## [123] foreign_0.8-84            gridExtra_2.3            
## [125] farver_2.1.1              Rtsne_0.16               
## [127] digest_0.6.31             shiny_1.7.4              
## [129] Rcpp_1.0.10               car_3.1-2                
## [131] broom_1.0.4               later_1.3.0              
## [133] RcppAnnoy_0.0.20          httr_1.4.5               
## [135] biovizBase_1.40.0         sf_1.0-12                
## [137] tensor_1.5                rvest_1.0.3              
## [139] reticulate_1.28           truncnorm_1.0-9          
## [141] splines_4.1.0             uwot_0.1.14              
## [143] spatstat.utils_3.0-2      sp_1.6-0                 
## [145] systemfonts_1.0.4         xtable_1.8-4             
## [147] jsonlite_1.8.4            futile.options_1.0.1     
## [149] R6_2.5.1                  Hmisc_5.0-1              
## [151] pillar_1.9.0              htmltools_0.5.5          
## [153] mime_0.12                 glue_1.6.2               
## [155] fastmap_1.1.1             BiocParallel_1.26.2      
## [157] class_7.3-21              codetools_0.2-19         
## [159] utf8_1.2.3                spatstat.sparse_3.0-1    
## [161] bslib_0.4.2               tibble_3.2.1             
## [163] curl_5.0.0                leiden_0.4.3             
## [165] interp_1.1-4              survival_3.5-5           
## [167] rmarkdown_2.21            munsell_0.5.0            
## [169] e1071_1.7-13              GenomeInfoDbData_1.2.6   
## [171] gtable_0.3.3

Molecular signature of primate astrocytes reveals pathways and regulatory changes contributing to the human brain evolution

Aleksandra Pekowska, Debadeep Chaudhury

Libraries, functions, paths

Transcriptome - from validation to evolution

Zhang data - expressed and not genes

Differential Gene Expression Analysis

Differentiation score

Heatmap for figure 1 and differentiation score

Evolution of astrocyte transcriptomes

Volcano plots

save object for other vignettes

Functional annotation

Pick genes randomly: how many would we expect to be related to diseases?

ID and intelligence

Extracellular exosome

DAVID analysis - downregulated genes

DAVID analysis - upregulated genes

Mandy6 validation

ID

DESeq2 analysis of gene expression in bulk cortex tissue - this paper

Consider Khaitovich lab bulk RNA seq data

DESeq2 analysis of gene expression in bulk cortex tissue - this paper

Consider Khaitovich lab bulk RNA seq data

Intellectual Disability-related genes

More broad analysis of diseases

TEADs - expression

Regulome analysis - chromatin structure

Analysis of domains

Identification of human- and chimpanzee-specific domain boundaries

HiC data normalisation: human data

HiC data normalisation: chimpanzee data

Boundary strength – boxplot of Insulation score

Genome-wide insulation score

Regulome analysis - differential openness

ATAC - preparations

Identification of peaks to work with

Preparation of count table for ATAC regions

Find peaks with 50% liftover

ATAC read counting

DESeq analysis of ATAC-seq peak openness

Identification and functional annotation of DORs gained in humans

Identification and functional annotation of DORs lost in humans

Tables for the manuscript

Genuine gained and lost enhancers - definitions

Stats for the text

Saving objects for sequence comparison

Linked and not enhancers - TAD based annotation

TADs

Down-regulated genes - loss of enhancers?

Sequence analysis of the enhancer classes

Stepwise gain in enhancer activity

Linked and not enhancers have a promoter within 500kb

Average profile

Evolutionary changes in TFBS - preparations

Enhancers that do something

Enhancers that do nothing

Enhancers that do something - TADs

Enhancers that do nothing - TAD

Conserved enhancers

Lost enhancers that do something

Lost enhancers that do nothing

PLOTS - Sequence analysis

Lost enhancers - inactivation is also linked with loss of stripe factors?

How many human-gained enhancers have evolutionary changes in these factors?

Plot showing whether the stripe factors more frequently lose binding or not

Enhancers with changes in stripe factors and gene upregulation in the vicinity

Footprint

Supplementary analysis - TAD based annotation

Compare linked and non linked - TAD annotation

scRNA-seq data

Human: Find astrocytes in individual samples

Count tables

Macaque: Find astrocytes in individual samples

Pseudo bulk approach to identify markers of human astrocyte evolution

Table for the supplement

Kanton

Jorstad

Ma et al

Session Info