s3ATAC Analysis

Processing for s3ATAC portion of s3 paper.

This notebook is a continuation of “s3 Preprocessing” and details the processing of s3ATAC libraries. This notebook starts with a merged, barcode-based removal of duplicate, >Q10 filtered bam file which was subset to barcodes >=10K unique reads.

#Initial directory structure (filtered to relevant files):
/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis

├── complexity_data
│   ├── Barnyard_ATAC_A.complexity.txt
│   ├── Barnyard_ATAC_B.complexity.txt
│   ├── Barnyard_ATAC_C.complexity.txt
│   ├── Barnyard_ATAC_D.complexity.txt
│   ├── Barnyard_ATAC_E.complexity.txt
├── filtered_bam
│   ├── Barnyard_ATAC_A.bbrd.q10.filt.bam
│   ├── Barnyard_ATAC_A.bbrd.q10.filt.cellIDs.list
│   ├── Barnyard_ATAC_B.bbrd.q10.filt.bam
│   ├── Barnyard_ATAC_B.bbrd.q10.filt.cellIDs.list
│   ├── Barnyard_ATAC_C.bbrd.q10.filt.bam
│   ├── Barnyard_ATAC_C.bbrd.q10.filt.cellIDs.list
│   ├── Barnyard_ATAC_D.bbrd.q10.filt.bam
│   ├── Barnyard_ATAC_D.bbrd.q10.filt.cellIDs.list
│   ├── Barnyard_ATAC_E.bbrd.q10.filt.bam
│   ├── Barnyard_ATAC_E.bbrd.q10.filt.cellIDs.list
├── s3atac_data
│    └── s3atac_barnyard.bbrd.q10.filt.bam
├── raw_fq
│   ├── Barnyard_ATAC_A.1.fq.gz
│   ├── Barnyard_ATAC_A.2.fq.gz
│   ├── Barnyard_ATAC_B.1.fq.gz
│   ├── Barnyard_ATAC_B.2.fq.gz
│   ├── Barnyard_ATAC_C.1.fq.gz
│   ├── Barnyard_ATAC_C.2.fq.gz
│   ├── Barnyard_ATAC_D.1.fq.gz
│   ├── Barnyard_ATAC_D.2.fq.gz
│   ├── Barnyard_ATAC_E.1.fq.gz
│   ├── Barnyard_ATAC_E.2.fq.gz

List of starting files

#Barnyard analysis to measure per cell collision rates and split merged bam by species for realignment
scitools barnyard-compare s3atac_barnyard.bbrd.q10.filt.bam &

#New files generated by barnyard-compare
s3atac_barnyard.bbrd.q10.filt.bam                      s3atac_barnyard.bbrd.q10.filt.barnyard_cells.plot.r
s3atac_barnyard.bbrd.q10.filt.barnyard_cells.plot.pdf  s3atac_barnyard.bbrd.q10.filt.barnyard_cells.txt
s3atac_barnyard.bbrd.q10.filt.barnyard_cells.plot.png  s3atac_barnyard.bbrd.q10.filt.barnyard_stats.txt

R processing to generate a list of cellIDs which are >95% human or >95% mouse

library(reshape2)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#read in indexes master file
idx<-read.table("/home/groups/oroaklab/src/scitools/scitools-dev/SCI_stdchem_Indexes.txt",col.names=c("idx_name","idx_cycles","idx_seq"))
idx_i7<-idx[idx$idx_cycles==1,]
colnames(idx_i7)<-c("i7_idx_name","i7_idx_cycle","i7_idx_seq")
idx_i5<-idx[idx$idx_cycles==2,]
colnames(idx_i5)<-c("i5_idx_name","i5_idx_cycle","i5_idx_seq")
idx_tn5<-idx[idx$idx_cycles==3,]
colnames(idx_tn5)<-c("tn5_idx_name","tn5_idx_cycle","tn5_idx_seq")

#read in barnyard output
barnyard<-read.table("./barnyard_analysis/s3atac_barnyard.bbrd.q10.filt.barnyard_cells.txt", header=F)
colnames(barnyard)<-c("cellID","total_reads_q20","hg38_q20","mm10_q20","perc_human","species_call")
barnyard$i7_idx_seq<-substr(barnyard$cellID,0,8)
barnyard$tn5_idx_seq<-substr(barnyard$cellID,17,24)
barnyard$i5_idx_seq<-substr(barnyard$cellID,9,16)
dat<-merge(barnyard,idx_i7,by="i7_idx_seq")
dat<-merge(dat,idx_tn5,by="tn5_idx_seq")
dat<-merge(dat,idx_i5,by="i5_idx_seq")
dat$pcr_plate<-substr(dat$i5_idx_name,5,5)
dat$i5_set<-paste0("CPT_",dat$pcr_plate)

#Per Plate breakdown of data contained as sheets in https://docs.google.com/spreadsheets/d/1mZ34KIwmr2vdjQlnqY7v_u0-Eca8mRc-HgI2r_WICXk/edit#gid=1305239528
#Not all plates are a proper barnyard mix.
i5_annot<-read.table("i5_pcr_annot.txt",sep="\t",header=T)
i7_annot<-read.table("i7_pcr_annot.txt",sep="\t",header=T)

dat<-merge(dat,i7_annot,by="i7_idx_seq")
dat<-merge(dat,i5_annot,by="i5_idx_seq")

#Plate 4 and 5 are only human (as noted in spreadsheet)
dat[dat$i5_sample=="Human",]$i7_sample_designation<-"hum_cortex"

#Also based on FACS sorting data, Plate 1, Column 6 Row E was incorrectly double sorted (experimentor error) and will be excluded
#this is also reflected in high cell count in the well (28 cells) compared to average (~8 cells)
dat<-dat[!(dat$i7_plate_column=="6" & dat$i5_row=="E" & dat$i5_pcr_plate=="Plate_1"),]
#Also removing plate 2 (used to polymerase testing)
dat<-dat[!(dat$i5_pcr_plate=="Plate_2"),]

#True barnyard results (columns 7-11 only of plates 1 2 and 3)
true_by_cellstested<-nrow(dat[dat$i7_sample_designation =="equimolar_tagment_mix",])
true_by_cellspassed<-nrow(dat[dat$i7_sample_designation =="equimolar_tagment_mix" & (dat$species_call != "Mixed"),])
true_by_cellsfailed<-nrow(dat[dat$i7_sample_designation =="equimolar_tagment_mix" & (dat$species_call == "Mixed"),])

(true_by_cellsfailed/true_by_cellstested)*2
#0.05531295
#so 5.5% estimated collision rate (19/687)

#PostTagMixed Barnyard (column 12 only)
posttag_by_cellstested<-nrow(dat[dat$i7_sample_designation =="equimolar_posttag_mix",])
posttag_by_cellspassed<-nrow(dat[dat$i7_sample_designation =="equimolar_posttag_mix" & (dat$species_call == "Mouse"),])

((posttag_by_cellstested-posttag_by_cellspassed)/posttag_by_cellstested)*2
#0
#so 0%

#Plot true barnyard and posttag barnyard cells
library(ggplot2)
plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_tagment_mix",],aes(x=hg38_q20,y=mm10_q20,color=as.factor(species_call)))+geom_point()+xlab("hg38 Reads")+ylab("mm10 Reads")+theme_bw()
ggsave(plt,file="truebarnyard_readcount.pdf")
write.table(dat[dat$i7_sample_designation =="equimolar_tagment_mix",c("hg38_q20","mm10_q20","species_call")],file="SourceData_Fig2b.tsv",sep="\t",quote=F)
system("slack -F SourceData_Fig2b.tsv ryan_todo")

plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_tagment_mix",],aes(x=perc_human))+geom_histogram(bins=100)+xlab("Percent Human")+ylab("Cells")+theme_bw()
ggsave(plt,file="truebarnyard_perchuman.pdf")

#Post tagmentation barnyard
plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_posttag_mix",],aes(x=hg38_q20,y=mm10_q20,color=as.factor(species_call)))+geom_point()+xlab("hg38 Reads")+ylab("mm10 Reads")+theme_bw()
ggsave(plt,file="posttagbarnyard_readcount.pdf")
write.table(dat[dat$i7_sample_designation =="equimolar_posttag_mix",c("hg38_q20","mm10_q20","species_call")],file="SourceData_Fig2c.tsv",sep="\t",quote=F)
system("slack -F SourceData_Fig2c.tsv ryan_todo")

plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_posttag_mix",],aes(x=perc_human))+geom_histogram(bins=100)+xlab("Percent Human")+ylab("Cells")+theme_bw()
ggsave(plt,file="posttagbarnyard_perchuman.pdf")

#Make annot files for apriori_species, called_species, plate_volume, barnyard_condition

dat$species_apriori<-as.character(dat$i7_sample_designation)
dat[dat$i7_sample_designation %in% c("equimolar_tagment_mix","equimolar_posttag_mix"),]$species_apriori<-"barnyard"

#apriori species
write.table(dat[c("cellID","species_apriori")],file="barnyard_apriorispecies.annot",sep="\t",quote=F,col.names=F,row.names=F)

#called species
write.table(dat[c("cellID","species_call")],file="barnyard_calledspecies.annot",sep="\t",quote=F,col.names=F,row.names=F)

#PCR Plate
write.table(dat[c("cellID","pcr_plate")],file="barnyard_pcrplate.annot",sep="\t",quote=F,col.names=F,row.names=F)

#Plate Volume
write.table(dat[c("cellID","i5_plate_condition")],file="barnyard_platevolume.annot",sep="\t",quote=F,col.names=F,row.names=F)

#Barnyard Condition
write.table(dat[c("cellID","i7_sample_designation")],file="barnyard_condition.annot",sep="\t",quote=F,col.names=F,row.names=F)

#Freshvfrozen
write.table(dat[c("cellID","i5_frozen")],file="barnyard_storagecondition.annot",sep="\t",quote=F,col.names=F,row.names=F)

#PCR_pol
write.table(dat[c("cellID","PCR_Pol")],file="barnyard_polymerase.annot",sep="\t",quote=F,col.names=F,row.names=F)

#Full Data Write Out
write.table(dat,file="barnyard_full_cell_summary.tsv",sep="\t",quote=F,col.names=T,row.names=F)

Analysis of barnyard output

Split Human and mouse cells at the fastq level based on the barnyard output. Then processed in parallel.

Further processing after human and mouse cells were split

#Merge original fastqs, and split original fastqs by called species for downstream processing
cat /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/raw_fq/Barnyard_ATAC*.1.fq.gz > /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/Barnyard_ATAC.merged.1.fq.gz &
cat /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/raw_fq/Barnyard_ATAC*.2.fq.gz > /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/Barnyard_ATAC.merged.2.fq.gz &
scitools fastq-split -X -A ../barnyard_calledspecies.annot Barnyard_ATAC.merged.1.fq.gz Barnyard_ATAC.merged.2.fq.gz &

awk '{if($2="Human") print $1}' barnyard_calledspecies.annot > hg38.cellID.list 
awk '{if($2="Mouse") print $1}' barnyard_calledspecies.annot > mm10.cellID.list

#realign to reference genomes with subset cellIDs
scitools fastq-align -t 10 -r 10 hg38 hg38 barnyard_calledspecies.Human.1.fq.gz barnyard_calledspecies.Human.2.fq.gz &
scitools fastq-align -t 10 -r 10 mm10 mm10 barnyard_calledspecies.Mouse.1.fq.gz barnyard_calledspecies.Mouse.2.fq.gz &

#Project out bam files to look for maximum complexity
scitools bam-project hg38.bam &
scitools bam-project mm10.bam &

#Performing bbrd and changing file name to be simplified.
scitools bam-rmdup hg38.bam &
scitools bam-rmdup mm10.bam &
  
#Calling peaks on human and mouse cells
scitools callpeaks hg38.bbrd.q10.bam &
scitools callpeaks mm10.bbrd.q10.bam &

#  304049 hg38.bbrd.q10.500.bed
#  181791 mm10.bbrd.q10.500.bed


#Set up cellID by peak count sparse matrix
scitools atac-count hg38.bbrd.q10.bam hg38.bbrd.q10.500.bed &
scitools atac-count mm10.bbrd.q10.bam mm10.bbrd.q10.500.bed &

#TSS enrichment calculation
module load bedops/2.4.36
scitools bam-tssenrich -X hg38.bbrd.q10.bam hg38 &
scitools bam-tssenrich -X -E hg38.bbrd.q10.bam hg38 &
scitools bam-tssenrich -X -E mm10.bbrd.q10.bam mm10 &
scitools bam-tssenrich -X mm10.bbrd.q10.bam mm10 &

#Complexity plotting (split bam into single cells then project out)
#for hg38, splitting bam into multiples of 1000 unique barcodes for faster processing and limiting IO
split --additional-suffix=.cellID.list hg38.cellID.list hg38. #default is to split to 1000 lines (so 1000 cells)
for i in hg38.??.cellID.list; do awk 'OFS="\t"{split(FILENAME,a,"[.]");print $1,a[2]}' $i ; done > hg38.cellID.split.annot # awk to concatenate split into a scitools annot file
scitools bam-split -A hg38.cellID.split.annot hg38.bam & #split bam file
for i in hg38.??.bam; do scitools bam-addrg $i & done & # add RG
for i in hg38.??.RG.bam; do samtools split -@8 -f './hg38_split/%*_%!.%.' $i ; done & #perform samtools split on RG (cellIDs)
cd hg38_split
for i in hg38*bam; do scitools bam-project -r 1000 -X -e  $i; done & #perform complexity projection on 1000 read intervals

#repeat with mm10, but skip the bulk bam splitting (since cell count is lower)
scitools bam-addrg mm10.bam &
mkdir mm10_split
samtools split -@20 -f './mm10_split/%*_%!.%.' mm10.RG.bam &
cd mm10_split
for i in mm10*bam; do scitools bam-project -r 1000 -X -e  $i; done &

#Grab single cell read projections
for i in ./mm10_split/*.read_projections; 
do cellid=${i:21:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid}' $i/cell_summaries.txt;
done > ./mm10_projected_reads.txt

for i in ./hg38_split/*.read_projections; 
do cellid=${i:24:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid}' $i/cell_summaries.txt;
done > ./hg38_projected_reads.txt #change in cellid string slicing to account for added "split bam identifier"

#isize distribution
scitools isize hg38.bbrd.q10.bam &
scitools isize mm10.bbrd.q10.bam &

 awk 'OFS="\t" {print $1,"hg38"}' hg38.bbrd.q10.isize.values > SourceData_Fig2E.txt
 awk 'OFS="\t" {print $1,"mm10"}' mm10.bbrd.q10.isize.values >> SourceData_Fig2E.txt
 sort -k2,2 -k1,1n -T . --parallel=20 SourceData_Fig2E.txt | uniq -c > SourceData_Fig2E.counts.txt

Going to compress file by making a frequency table instead of just a raw list

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
dat<-read.table("SourceData_Fig2E.txt")

Plotting TSS Enrichment

#Plotting hg38 tss enrichment
library(ggplot2)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
hg38_rangeR <- read.table("hg38.bbrd.q10.bulkTSSdist_1000.txt")
colnames(hg38_rangeR)<-c("distance","count")
hg38_rangeR$species<-"hg38"
mm10_rangeR <- read.table("mm10.bbrd.q10.bulkTSSdist_1000.txt")
colnames(mm10_rangeR)<-c("distance","count")
mm10_rangeR$species<-"mm10"
dat<-rbind(hg38_rangeR,mm10_rangeR)
ggplot(dat,aes(x=distance,y=count))+geom_area()+theme_bw()+facet_grid(species ~. )
ggsave("s3atac_bulk_TSS.pdf")

write.table(dat[,c("distance","count","species")],file="SourceData_Fig2f.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig2f.tsv ryan_todo")

Tabix fragment file generation

Tabix file format is a tab separated multicolumn data structure.

Column Number Name Description
1 chrom Reference genome chromosome of fragment
2 chromStart Adjusted start position of fragment on chromosome.
3 chromEnd Adjusted end position of fragment on chromosome. The end position is exclusive, so represents the position immediately following the fragment interval.
4 barcode The 10x (or sci) cell barcode of this fragment. This corresponds to the CB tag attached to the corresponding BAM file records for this fragment.
5 duplicateCount The number of PCR duplicate read pairs observed for this fragment. Sequencer-created duplicates, such as Exclusion Amp duplicates created by the NovaSeq instrument are excluded from this count.
#human processing
input_bam="hg38.bbrd.q10.bam"
output_name=${input_bam::-13}
tabix="/home/groups/oroaklab/src/cellranger-atac/cellranger-atac-1.1.0/miniconda-atac-cs/4.3.21-miniconda-atac-cs-c10/bin/tabix"
bgzip="/home/groups/oroaklab/src/cellranger-atac/cellranger-atac-1.1.0/miniconda-atac-cs/4.3.21-miniconda-atac-cs-c10/bin/bgzip"
samtools view --threads 10 $input_bam | awk 'OFS="\t" {split($1,a,":"); print $3,$4,$8,a[1],1}' | sort -S 2G -T . --parallel=30 -k1,1 -k2,2n -k3,3n | $bgzip > $output_name.fragments.tsv.gz; wait ;
$tabix -p bed $output_name.fragments.tsv.gz &
#mouse
input_bam="mm10.bbrd.q10.bam"
output_name=${input_bam::-13}
samtools view --threads 10 $input_bam | awk 'OFS="\t" {split($1,a,":"); print $3,$4,$8,a[1],1}' | sort -S 2G -T . --parallel=10 -k1,1 -k2,2n -k3,3n | $bgzip > $output_name.fragments.tsv.gz
$tabix -p bed $output_name.fragments.tsv.gz &

Downsampling of BAM Files

Split out single-cell bams from deduplicated bams


mkdir single_cell_splits

#Add read group for cellID specific splitting
for i in *bbrd.q10.bam ; do scitools bam-addrg $i & done &

#mm10 is small enough to split as is
samtools split -@20 -f './single_cell_splits/%*_%!.full.%.' mm10.bbrd.q10.RG.bam & #perform samtools split on RG (cellIDs)

#for human too many cells for simultaneous splitting (I/O errors), going to make two temp bam files with limited RG headers
samtools view -H hg38.bbrd.q10.RG.bam | grep "^@RG" | awk '{split($2,a,":"); print a[2]}' | split --lines=500 --additional-suffix=".cellID.subset.temp.cellID.list" #split RG into 500 cell chunks
for i in *.cellID.subset.temp.cellID.list; do outname=${i::-26} ; scitools bam-filter -L $i -O hg38.bbrd.q10.RG.${outname}.temp hg38.bbrd.q10.RG.bam & done & #split bam files by RG lists

#samtools split opens as many IO streams as rg in header, so need to fix headers with just rg in new split bams
#reheader bam files
for i in *.cellID.subset.temp.cellID.list; 
  do outname=${i::-26} ; 
  samtools view -H hg38.bbrd.q10.RG.${outname}.temp.filt.bam | grep -v "^@RG" - > temp.header;
  awk 'OFS="\t" {print "@RG","ID:"$1,"SM:"$1,"LB:"$i,"PL:SCI"}' $i >> temp.header ;
  samtools reheader -c "cat temp.header" hg38.bbrd.q10.RG.${outname}.temp.filt.bam > hg38.bbrd.q10.RG.${outname}.temp.rehead.bam; done &

#finally split out single cells
for i in *temp.rehead.bam; do samtools split -@20 -f './single_cell_splits/%*_%!.full.%.' $i; done & #perform samtools split on RG (cellIDs)

Split out single-cell bams from raw bams for projections


#Add read group for cellID specific splitting
for i in mm10.bam hg38.bam ; do scitools bam-addrg $i & done &

#mm10 is small enough to split as is
samtools split -@20 -f './single_cell_splits/%*_%!.prededup.%.' mm10.RG.bam & #perform samtools split on RG (cellIDs)

#for human too many cells for simultaneous splitting (I/O errors), going to make two temp bam files with limited RG headers
samtools view -H hg38.RG.bam | grep "^@RG" | awk '{split($2,a,":"); print a[2]}' | split --lines=500 --additional-suffix=".cellID.subset.temp.cellID.list" #split RG into 500 cell chunks
for i in *.cellID.subset.temp.cellID.list; do outname=${i::-26} ; scitools bam-filter -L $i -O hg38.RG.${outname}.temp hg38.RG.bam & done & #split bam files by RG lists

#samtools split opens as many IO streams as rg in header, so need to fix headers with just rg in new split bams
#reheader bam files
for i in *.cellID.subset.temp.cellID.list; 
  do outname=${i::-26} ; 
  samtools view -H hg38.RG.${outname}.temp.filt.bam | grep -v "^@RG" - > temp.header;
  awk 'OFS="\t" {print "@RG","ID:"$1,"SM:"$1,"LB:"$i,"PL:SCI"}' $i >> temp.header ;
  samtools reheader -c "cat temp.header" hg38.RG.${outname}.temp.filt.bam > hg38.RG.${outname}.temp.rehead.bam; done &

#finally split out single cells
for i in *temp.rehead.bam; do samtools split -@20 -f './single_cell_splits/%*_%!.prededup.%.' $i; done & #perform samtools split on RG (cellIDs)

Process single-cell bams

cd single_cell_splits

mkdir single_dedup_bams
mkdir single_prededup_bams

mv *full.bam single_dedup_bams/
mv *prededup.bam single_prededup_bams/

cd single_prededup_bams

#parallelization of picard tools estimate library complexity, running on j=30 cores
parallel -j 30 "java -jar /home/groups/oroaklab/src/picard/picard-tools-1.70/EstimateLibraryComplexity.jar I={} O={}.picard.metrics" ::: **.bam

Process single-cell pre-deduplicated bams

cd single_dedup_bams

#Subsample by percentage
for j in 005 01 02 05 10 15 20 50 100; #2 = 20%, 4= 40% etc.
  do for i in *.full.bam; 
    do samtools view -@ 10 -h -s 0.${j} $i > ${i::-9}.${j}perc.bam ;done ; done & 

#Alternative approach by read count, ended up not using
#Subsample by read count
#for j in 1000 5000 10000 25000;
#do for i in *full.bam; do ((samtools view -H $i) & (samtools view -@ 10 $i | shuf -n $j)) | samtools view -@ 10 -bS - > ${i::-9}.${j}.bam ; done; done & 

#remove bam files that dont hit the subsample amount
#for j in 1000 5000 10000 25000;
#do for i in *${j}.bam;
#do readcount=`samtools view $i | wc -l`;
#if (("$readcount" < "$j")) ; then rm -f $i; fi; done; done &

#merge bam files together and remove single-cell bams for readability
#splitting tmp_list again to reduce IO then merging all
for i in hg38 mm10; 
  do for j in 005perc 01perc 02perc 05perc 10perc 15perc 20perc 50perc 100perc; 
    do ls ${i}*${j}.bam > tmp_list.txt; #generate temporary list
    split -l 500 --additional-suffix=.${i}.${j}.tmp_list.txt tmp_list.txt; #split list into -l lines (cells)
    for k in *.${i}.${j}.tmp_list.txt;
      do tmp_name=(${k//./ })
      samtools merge -@ 30 -b $k ${i}.${tmp_name[0]}.tmp.${j}.merged.bam; #first stage merging of 500 cells each
    done;
    samtools merge -@ 30 ${i}.${j}.merged.bam `ls ${i}.*.tmp.${j}.merged.bam`; #second stage merging
      done ; done &

#remove single cell files
rm -rf *perc.bam &
rm -rf *0.bam &
rm -rf *full.bam &
rm -rf *tmp_list.txt
rm -rf *tmp*bam


for i in *perc*merged.bam;
do scitools callpeaks $i ; done &

#  40782 hg38.10000.merged.500.bed
#    3173 hg38.1000.merged.500.bed
#   75744 hg38.20perc.merged.500.bed
#   64892 hg38.25000.merged.500.bed
#  137381 hg38.40perc.merged.500.bed
#   94585 hg38.5000.merged.500.bed
#  184420 hg38.60perc.merged.500.bed
#  221442 hg38.80perc.merged.500.bed
#   16732 mm10.10000.merged.500.bed
#     519 mm10.1000.merged.500.bed
#   36837 mm10.20perc.merged.500.bed
#   26208 mm10.25000.merged.500.bed
#   71047 mm10.40perc.merged.500.bed
#   41218 mm10.5000.merged.500.bed
#   99236 mm10.60perc.merged.500.bed
#  154911 mm10.80perc.merged.500.bed

#for i in hg38*bam;
#do scitools atac-count -O ${i::-4}.fullpeakset $i ../../hg38.bbrd.q10.500.bed & done &

#for i in mm10*bam;
#do scitools atac-count -O ${i:-4}.fullpeakset $i ../../mm10.bbrd.q10.500.bed & done &

for i in hg38*bam;
do scitools atac-count -O ${i::-4} $i ${i::-4}.500.bed & done &

for i in mm10*bam;
do scitools atac-count -O ${i:-4} $i ${i::-4}.500.bed & done &

#Generate count of reads per bam
for i in *bam; do
samtools view $i | awk '{split($1,a,":");print a[1]}' | sort --parallel 10 | uniq -c > ${i::-4}.counts.txt; done &

Generate Seurat Objects of downsampled data.

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Matrix)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")

file_in<-list.files(pattern="*.fracOnTarget.values")
file_in<-substr(file_in,1,nchar(file_in)-20)

hg38_annotations<-GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
mm10_annotations<-GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)


make_seurat_object<-function(x,genome="hg38"){
IN<-as.matrix(read.table(paste0(x,".counts.sparseMatrix.values.gz")))
IN<-sparseMatrix(i=IN[,1],j=IN[,2],x=IN[,3])
COLS<-read.table(paste0(x,".counts.sparseMatrix.cols.gz"))
colnames(IN)<-COLS$V1
ROWS<-read.table(paste0(x,".counts.sparseMatrix.rows.gz"))
row.names(IN)<-ROWS$V1
counts_mat<-IN # make counts matrix from sparse matrix

# extract gene annotations from EnsDb
if(startsWith(x,"hg38")){
annotations<-hg38_annotations
} else {
annotations<-mm10_annotations
}

# change to UCSC style
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- genome

#Generate ChromatinAssay Objects
chromatinassay <- CreateChromatinAssay(
  counts = counts_mat,
  genome=genome,
  min.cells = 1,
  annotation=annotations,
  sep=c("_","_"))

#Create Seurat Objects
atac <- CreateSeuratObject(
  counts = chromatinassay,
  assay = "peaks")
#saving unprocessed SeuratObjects
saveRDS(atac,file=paste0(x,"_SeuratObject.Rds"))
}

lapply(file_in,make_seurat_object)

Perform cisTopic on each subset seurat object

This is a custom R script to be run in bash with argument 1 being the seurat objects. This is because cistopic seems to have a memory leak if run in R internally.

library(Signac)
library(Seurat)
library(SeuratWrappers)
library(cisTopic)
set.seed(1234)

#Read in data and modify to monocle CDS file
#read in RDS file.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")

args = commandArgs(trailingOnly=TRUE)


######FUNCTIONS#####
cistopic_generation<-function(x){
#Perform cistopic on subclusters of data 
    atac_sub<-readRDS(x) 
    outname<-unlist(strsplit(x,"_"))[[1]]
    cistopic_counts_frmt<-atac_sub$peaks@counts
    row.names(cistopic_counts_frmt)<-sub("-", ":", row.names(cistopic_counts_frmt))
    sub_cistopic<-cisTopic::createcisTopicObject(cistopic_counts_frmt)
    sub_cistopic_models<-cisTopic::runWarpLDAModels(sub_cistopic,topic=c(10,20:30),nCores=11,addModels=FALSE)
    saveRDS(sub_cistopic_models,file=paste(outname,"CisTopicObject.Rds",sep="."))

    pdf(paste(outname,"model_selection.pdf",sep="_"))
    par(mfrow=c(3,3))
    sub_cistopic_models<- selectModel(sub_cistopic_models, type='derivative')
    dev.off()
    }


####################################
### Processing ###
    #Running cistopic model generation on all subset data
    print(paste("Running model for",args[1]))
    #Running cistopic subclustering on all identified cell types
    #lapply(seurat_objects[8:length(seurat_objects)], function(i) {cistopic_generation(x=i)})
	cistopic_generation(x=args[1])

Using a bash loop for cistopic clustering.

for i in *SeuratObject.Rds; 
do Rscript cistopic.loop.r $i; done &
#cistopic.loop.r is the above R code
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(cisTopic)
library(patchwork)
library(ggplot2)
set.seed(1234)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")

	hg38_atac<-readRDS(file="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/hg38_SeuratObject.Rds")
	mm10_atac<-readRDS(file="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10_SeuratObject.Rds")

    seurat_objects<-list.files(pattern="SeuratObject.Rds$")
    topic_models<-list.files(pattern="CisTopicObject.Rds$")

    #determine model count to use for each cell type
    for (i in seurat_objects){
    	    outname<-unlist(strsplit(i,"_"))[[1]]
    	    system(paste0("slack -F ",outname,"_model_selection.pdf"," ryan_todo"))} #slack -F of ryan_todo just posts files from our cluster to a slack channel I have

    #selecting topics based on derivative, making a named vector 
    topic_count_list<-c(24,24,25,26,26,28,30,28,23,30,30,23,
    	24,30,27,28,30,30,28,28,23,26,26,24)
    names(topic_count_list)<-topic_models


#UMAP Projection and clustering on selected cistopic model
clustering_loop<-function(topicmodel,topiccount,seuratobject,genome_name="hg38",outname.){
	outname<-outname.
	object_input<-readRDS(seuratobject)
	object_input$cellID<-row.names(object_input@meta.data)
    #select_topic
    models_input<-readRDS(topicmodel)
    counts<-read.table(paste0(outname,".counts.txt"),header=F)
    colnames(counts)<-c("subsamp_counts","cellID")
    row.names(counts)<-counts$cellID
    avg_read_count<-mean(counts$subsamp_counts)
    cisTopicObject<-cisTopic::selectModel(models_input,select=topiccount,keepModels=F)
    
    #perform UMAP on topics
    topic_df<-as.data.frame(cisTopicObject@selected.model$document_expects)
    row.names(topic_df)<-paste0("Topic_",row.names(topic_df))
    dims<-as.data.frame(uwot::umap(t(topic_df),n_components=2))
    row.names(dims)<-colnames(topic_df)
    colnames(dims)<-c("x","y")
    dims$cellID<-row.names(dims)

    #get cell embeddings
    cell_embeddings<-as.data.frame(cisTopicObject@selected.model$document_expects)
    colnames(cell_embeddings)<-cisTopicObject@cell.names
    n_topics<-nrow(cell_embeddings)
    row.names(cell_embeddings)<-paste0("topic_",1:n_topics)
    cell_embeddings<-as.data.frame(t(cell_embeddings))
    
    #get feature loadings
    feature_loadings<-as.data.frame(cisTopicObject@selected.model$topics)
    row.names(feature_loadings)<-paste0("topic_",1:n_topics)
    feature_loadings<-as.data.frame(t(feature_loadings))
    
    #combine with seurat object for celltype seuratobject  
    cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(cell_embeddings),loadings=as.matrix(feature_loadings),assay="peaks",key="topic_")
    umap_dims<-as.data.frame(as.matrix(dims[,c("x","y")]))
    colnames(umap_dims)<-c("UMAP_1","UMAP_2")
    row.names(umap_dims)<-dims$cellID
    cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(umap_dims),assay="peaks",key="UMAP_")
    object_input@reductions$cistopic<-cistopic_obj
    object_input@reductions$umap<-cistopic_umap

    if(genome_name=="hg38"){
    	metadata<-as.data.frame(hg38_atac@meta.data)
    	}else {
    	metadata<-as.data.frame(mm10_atac@meta.data)
    	}
    object_input$cellID<-row.names(object_input@meta.data)
    object_input<-subset(object_input,cells=metadata$cellID)
    object_input<-AddMetaData(object_input,metadata)

    saveRDS(object_input,seuratobject)
    plt<-DimPlot(object_input,group.by=c('seurat_clusters'))+ggtitle(paste(outname,avg_read_count))
    ggsave(plt,file=paste(outname,"clustering.pdf",sep="_"))
    return(plt)
}


    #Running clustering loop
    hg38_seuratobjects<-seurat_objects[which(startsWith(seurat_objects,"hg38"))]
    mm10_seuratobjects<-seurat_objects[which(startsWith(seurat_objects,"mm10"))]

    plt_list_hg38<-lapply(hg38_seuratobjects,function(i) {
    	outname_in<-unlist(strsplit(i,"_"))[[1]]
    	obj_in<-paste(outname_in,"SeuratObject.Rds",sep="_")
    	topic_in<-paste(outname_in,"CisTopicObject.Rds",sep=".")
		topic_count=topic_count_list[topic_in]
    	clustering_loop(seuratobject=obj_in,topicmodel=topic_in,topiccount=topic_count,genome_name="hg38",outname.=outname_in)}
    	)

    plt_list_mm10<-lapply(mm10_seuratobjects,function(i) {
    	outname_in<-unlist(strsplit(i,"_"))[[1]]
    	obj_in<-paste(outname_in,"SeuratObject.Rds",sep="_")
    	topic_in<-paste(outname_in,"CisTopicObject.Rds",sep=".")
		topic_count=topic_count_list[topic_in]
    	clustering_loop(seuratobject=obj_in,topicmodel=topic_in,topiccount=topic_count,genome_name="mm10",outname.=outname_in)}
    	)
    
    plt_list_out<-wrap_plots(plt_list_hg38,guides="collect")
  	ggsave(plt_list_out,file="downsample.hg38.clustering.pdf",height=30,width=30,limitsize=F)
  	system("slack -F downsample.hg38.clustering.pdf ryan_todo")

  	plt_list_out<-wrap_plots(plt_list_mm10,guides="collect")
  	ggsave(plt_list_out,file="downsample.mm10.clustering.pdf",height=30,width=30,limitsize=F)
  	system("slack -F downsample.mm10.clustering.pdf ryan_todo")

  	out_data<-lapply(c(hg38_seuratobjects,mm10_seuratobjects), function(i){
  		obj<-readRDS(i)
  		subset_amount<-unlist(lapply(strsplit(i,"[.]"),"[",2))
  		species<-unlist(lapply(strsplit(i,"[.]"),"[",1))
		dat_out<-cbind(as.data.frame(obj@reductions$umap@cell.embeddings[names(obj$celltype),]),obj$celltype,subset_amount,species)
		return(dat_out)
  	})

  	dat_out<-do.call("rbind",out_data)
  	write.table(dat_out,file="SourceData_SupFig1d.tsv",sep="\t",quote=F,row.names=F)
	system("slack -F SourceData_SupFig1d.tsv ryan_todo")


Gene activity score comparison between downsampled seurat objects

library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Signac)
library(Seurat)
set.seed(1234)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")

seurat_objects<-list.files(pattern="SeuratObject.Rds$")

#Read in data and modify to monocle CDS file
#read in RDS file.

cicero_processing<-function(x){
	obj_in<-readRDS(x)
	outname_in<-unlist(strsplit(x,"_"))[[1]]
	#convert to CellDataSet format and make the cicero object
    #Processing hg38 cells
    obj.cds <- as.cell_data_set(x = obj_in,group_by="seurat_clusters")
    obj.cicero <- make_cicero_cds( obj.cds, reduced_coordinates = reducedDims(obj.cds)$UMAP)
    genome <- seqlengths(obj_in) # get the chromosome sizes from the Seurat object
    genome.df <- data.frame("chr" = names(genome), "length" = genome) # convert chromosome sizes to a dataframe
    conns <- run_cicero(obj.cicero, genomic_coords = genome.df) # run cicero
    saveRDS(conns,paste0(outname_in,"_cicero_conns.Rds"))
    ccans <- generate_ccans(conns) # generate ccans
    saveRDS(ccans,paste0(outname_in,"_cicero_ccans.Rds"))
    links <- ConnectionsToLinks(conns = conns, ccans = ccans) #Add connections back to Seurat object as links
    Links(obj_in) <- links
    saveRDS(obj_in,paste0(outname_in,"Seurat.cicero.Rds"))
}

for (i in seurat_objects){
	tryCatch(
		{
			cicero_processing(i)
		},
		error=function(err){print(paste(i,"failed to generate model."))})
}

ccan_list<-list.files(pattern="_ccans.Rds$")
ccan_output<-list()

for (i in ccan_list){
	ccan_in<-readRDS(i)
	ccan_output[[i]]<-c(sample=i,peak_in_ccan=nrow(ccan_in),ccan_count=length(unique(ccan_in$CCAN)),average_peak_per_ccan=mean(table(ccan_in$CCAN)))
}

saveRDS(ccan_output,file="subsampling_ccanoutput.rds")

# generate unnormalized gene activity matrices
# gene annotation sample
    #hg38
    hg38_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
    mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)

generate_ga<-function(x){
	obj_in<-readRDS(x)
	outname_in<-unlist(strsplit(x,"_"))[[1]]
	conns<-readRDS(paste0(outname_in,"_cicero_conns.Rds"))

	if(startsWith(outname_in,"hg38")){
		annot<-hg38_annotations
	} else {
		annot<-mm10_annotations
	}

	pos <-as.data.frame(annot,row.names=NULL)
    pos$chromosome<-paste0("chr",pos$seqnames)
    pos$gene<-pos$gene_id
    pos <- subset(pos, strand == "+")
    pos <- pos[order(pos$start),] 
    pos <- pos[!duplicated(pos$tx_id),] # remove all but the first exons per transcript
    pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS

    neg <-as.data.frame(annot,row.names=NULL)
    neg$chromosome<-paste0("chr",neg$seqnames)
    neg$gene<-neg$gene_id
    neg <- subset(neg, strand == "-")
    neg <- neg[order(neg$start,decreasing=TRUE),] 
    neg <- neg[!duplicated(neg$tx_id),] # remove all but the first exons per transcript
    neg$end <- neg$end + 1 # make a 1 base pair marker of the TSS
    gene_annotation<- rbind(pos, neg)
    gene_annotation <- gene_annotation[,c("chromosome","start","end","gene_name")] # Make a subset of the TSS annotation columns containing just the coordinates and the gene name
    names(gene_annotation)[4] <- "gene" # Rename the gene symbol column to "gene"
    obj.cds<-annotate_cds_by_site(as.cell_data_set(x = obj_in,group_by="seurat_clusters"), gene_annotation)
    unnorm_ga<-build_gene_activity_matrix(obj.cds, conns)
    saveRDS(unnorm_ga,paste0(outname_in,".unnorm_GA.Rds"))
    obj_in[['GeneActivity']]<- CreateAssayObject(counts = as.data.frame(unnorm_ga)) 
    # normalize
    obj_in <- NormalizeData(
      object = obj_in,
      assay = 'GeneActivity',
      normalization.method = 'LogNormalize',
      scale.factor = median(obj_in$nCount_peaks)
    )
    saveRDS(obj_in,x)
}


cicero_objects<-list.files(pattern="Seurat.cicero.Rds")
lapply(cicero_objects,generate_ga)

Comparison of s3ATAC adult mouse brain reads with available data sets

  • dscATAC
    • from https://github.com/buenrostrolab/dscATAC_analysis_code/blob/master/mousebrain/data/mousebrain-master_dataframe.rds They only get single reads. So “uniqueNuclearFrags” is the correct column to use.
  • snATAC
    • from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2668124 “Quality metrics for cells that passed quality filtering (column 1: cell barcodes, column 2: number of reads, column 3: promoter coverage, column 4: fraction of reads in peaks.”
  • 10X scATAC
    • Reports Fragments from https://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_adult_brain_fresh_5k So for 10X I decided to use their filtered bam file to generate my own count of unique reads per cell. ```bash cd /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison wget https://cg.10xgenomics.com/samples/cell-atac/1.2.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_possorted_bam.bam samtools flagstat atac_v1_adult_brain_fresh_5k_possorted_bam.bam #488116262 + 0 in total (QC-passed reads + QC-failed reads) #3570 + 0 secondary #0 + 0 supplementary #270088536 + 0 duplicates #481421958 + 0 mapped (98.63% : N/A) #488112692 + 0 paired in sequencing #244056346 + 0 read1 #244056346 + 0 read2 #473659052 + 0 properly paired (97.04% : N/A) #478189330 + 0 with itself and mate mapped #3229058 + 0 singletons (0.66% : N/A) #1467290 + 0 with mate mapped to a different chr #839491 + 0 with mate mapped to a different chr (mapQ>=5) #so there are some singletons in this as well, but a low amount samtools view -bf 2 atac_v1_adult_brain_fresh_5k_possorted_bam.bam > 10xout.temp.bam samtools flagstat 10xout.temp.bam
samtools view -b -f 2 atac_v1_adult_brain_fresh_5k_possorted_bam.bam awk ‘{print $19}’ sort –parallel=20 -T . uniq -c > 10x_genomics_uniquereads.txt &

```R
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/barnyard_analysis")

dat_10x<-read.csv("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/10x_atac_v1_adult_brain_fresh_5k_singlecell.csv")
dat_10x<-dat_10x[c("barcode","passed_filters")]
barcused_10x<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/10x_atac_v1_adult_brain_fresh_5k_barcodesused.txt")
dat_10x<-dat_10x[dat_10x$barcode %in% barcused_10x$V1,]
#summary(dat_10x$duplicate/dat_10x$total)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#0.05322 0.48545 0.52413 0.52090 0.56153 0.79080
colnames(dat_10x)<-c("cellID","uniq_reads")
dat_10x$assay<-"10x_scATAC"

dat_biorad<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/mousebrain-master_dataframe.rds")
#summary(1-dat_biorad$duplicateProportion)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 # 0.051   0.160   0.190   0.255   0.328   0.796
dat_biorad_projected<-dat_biorad[c("DropBarcode","librarySize")]
colnames(dat_biorad_projected)<-c("cellID","uniq_reads")
dat_biorad_projected$assay<-"dscATAC_projected"

dat_biorad<-dat_biorad[c("DropBarcode","uniqueNuclearFrags")]
colnames(dat_biorad)<-c("cellID","uniq_reads")
dat_biorad$assay<-"dscATAC"

dat_ren<-read.csv("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/GSM2668124_p56.nchrM.merge.sel_cell.qc.csv",header=F)
colnames(dat_ren)<-c("cellID","uniq_reads","perc_uniq","frip")
#> summary(dat_ren$perc_uniq)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.2508  0.2643  0.2660  0.2682  0.2724  0.3334
dat_ren<-dat_ren[c("cellID","uniq_reads")]
dat_ren$assay<-"snATAC"

dat_sciatac<-read.table("/home/groups/oroaklab/adey_lab/projects/spatial/wholebrain/wholebrain.complexity.txt",header=F)
colnames(dat_sciatac)<-c("rowname_carryover","cellID","tot_reads","uniq_reads","perc_uniq")
dat_sciatac_condition<-read.table("/home/groups/oroaklab/adey_lab/projects/spatial/wholebrain/annots_wholebrain/wholebrain_FreshvFrozen.annot")
colnames(dat_sciatac_condition)<-c("cellID","condition")
dat_sciatac<-merge(dat_sciatac,dat_sciatac_condition,by="cellID")
dat_sciatac<-dat_sciatac[dat_sciatac$condition=="Frozen",] #filter by frozen condition
dat_sciatac_dims<-read.table("/home/groups/oroaklab/adey_lab/projects/spatial/wholebrain/wholebrain.bbrd.q10.filt.500.counts.cistopic.UMAP.dims") 
colnames(dat_sciatac_dims)<-c("cellID","x","y") #filter by cells used in analysis (dims file are those which underwent clustering)
dat_sciatac<-dat_sciatac[c("cellID","uniq_reads")]
dat_sciatac<-dat_sciatac[dat_sciatac$cellID %in% dat_sciatac_dims$cellID,]
dat_sciatac$assay<-"sciatac"


dat_s3<-read.table("mm10.complexity.txt",header=F)
colnames(dat_s3)<-c("rowname_carryover","cellID","tot_reads","uniq_reads","perc_uniq")
dat_s3_pcrplate<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/barnyard_pcrplate.annot",header=F,col.names=c("cellID","pcr_plate"))
dat_s3<-merge(dat_s3,dat_s3_pcrplate,by="cellID")
#summarize percent unique per plate
library(dplyr)

dat_s3<-dat_s3[dat_s3$uniq_reads>=10000,]

data.frame(dat_s3 %>% group_by(pcr_plate) %>% summarise(mean_compl=mean(perc_uniq),
                                                        median_compl=median(perc_uniq),
                                                        sd=sd(perc_uniq),
                                                        mean_uniq=mean(uniq_reads),
                                                        median_uniq=median(uniq_reads),
                                                        sd_uniq=sd(uniq_reads),
                                                        cell_count=n()))

#  pcr_plate mean_compl median_compl        sd mean_uniq median_uniq   sd_uniq cell_count
#1         B   71.48840       71.820  3.668856  46299.46     33502.0  39659.36 381
#2         C   36.31242       27.895 16.913973 264133.51    181248.5 269722.56 298
#3         D   66.16967       67.690  7.614098  43358.67     26490.0  48451.74 273 
       
dat_s3_b<-dat_s3[dat_s3$pcr_plate=="B",c("cellID","uniq_reads")]
dat_s3_b$assay<-"s3ATAC_Plate_B"
dat_s3_c<-dat_s3[dat_s3$pcr_plate=="C",c("cellID","uniq_reads")]
dat_s3_c$assay<-"s3ATAC_Plate_C"

dat_10x<-dat_10x[dat_10x$uniq_reads>=1000,]

dat<-rbind(dat_10x,dat_ren,dat_biorad,dat_s3_c) #dat_sciatac
dat$assay  = factor(dat$assay, levels=c("snATAC", "10x_scATAC","dscATAC","s3ATAC_Plate_C")) #"sciatac",

write.table(dat[,c("assay","uniq_reads")],file="SourceData_Fig2d.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig2d.tsv ryan_todo")

library(ggplot2)
ggplot(dat,aes(x=as.factor(assay),y=log10(uniq_reads),color=as.factor(assay)))+geom_boxplot()+theme_bw()+theme(axis.text.x = element_text(angle = 60, hjust=1))
ggsave("adultmusbrain_atacprotocol_comparisons.svg")
ggsave("adultmusbrain_atacprotocol_comparisons.pdf")
ggsave("adultmusbrain_atacprotocol_comparisons.png")
system("slack -F adultmusbrain_atacprotocol_comparisons.pdf ryan_todo")
i="s3ATAC_Plate_C"
for (j in unique(dat$assay)){
  ttemp<-t.test(dat[dat$assay==i,]$uniq_reads,dat[dat$assay==j,]$uniq_reads,alternative="greater")
  print(paste(i,j,ttemp$p.value,ttemp$estimate[1],ttemp$estimate[2],ttemp$estimate[1]/ttemp$estimate[2],ttemp$alternative,ttemp$method))
  }
#[1] "s3ATAC_Plate_C,10x_scATAC,3.13338819899197e-40,264133.506711409,22715.9052708283,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,snATAC,5.7106611661251e-42,264133.506711409,15459.4017798286,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,dscATAC,1.5654951763642e-37,264133.506711409,34045.7355797055,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,sciatac,9.87044162982429e-43,264133.506711409,12263.8299081767,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,s3ATAC_Plate_C,0.5,264133.506711409,264133.506711409,greater,Welch Two Sample t-test"





Comparison between peak sets

#snATAC peaks
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2668nnn/GSM2668124/suppl/GSM2668124_p56.nchrM.merge.sel_cell.ygi.txt.gz
gzip -d GSM2668124_p56.nchrM.merge.sel_cell.ygi.txt.gz
awk 'OFS="\t" {print $0}' GSM2668124_p56.nchrM.merge.sel_cell.ygi.txt > snatac_peaks.bed
wc -l snatac_peaks.bed
##140102 peaks

#10X peaks
wget https://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_peaks.bed
##157797 peaks

#biorad peaks
wget https://raw.githubusercontent.com/buenrostrolab/dscATAC_analysis_code/master/mousebrain/data/mouseBrain_peaks.bed
awk 'OFS="\t" {print $0}' mouseBrain_peaks.bed | awk '(NR>1)' - > biorad_peaks.bed
##454047 peaks
#Getting counts of peak overlaps
wc -l /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed
#174653
bedtools intersect -wb -a /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed -b /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/snatac_peaks.bed | wc -l
#71199 of ours in theirs
bedtools intersect -wa -a /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed -b /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/atac_v1_adult_brain_fresh_5k_peaks.bed | wc -l

bedtools intersect -wa -a /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed -b /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/biorad_peaks.bed | wc -l

s3 ATAC Full Processing

Generating Seurat Objects

Using R v4.0.0 and Signac v1.0

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Matrix)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#function to read in sparse matrix format from atac-count
read_in_sparse<-function(x){ #x is character file prefix followed by .bbrd.q10.500.counts.sparseMatrix.values.gz
IN<-as.matrix(read.table(paste0(x,".bbrd.q10.500.counts.sparseMatrix.values.gz")))
IN<-sparseMatrix(i=IN[,1],j=IN[,2],x=IN[,3])
COLS<-read.table(paste0(x,".bbrd.q10.500.counts.sparseMatrix.cols.gz"))
colnames(IN)<-COLS$V1
ROWS<-read.table(paste0(x,".bbrd.q10.500.counts.sparseMatrix.rows.gz"))
row.names(IN)<-ROWS$V1
return(IN)
}

hg38_counts<-read_in_sparse("hg38") # make hg38 counts matrix from sparse matrix
mm10_counts<-read_in_sparse("mm10") # make mm10 counts matrix from sparse matrix

#Read in fragment path for coverage plots
mm10_fragment.path="./mm10.fragments.tsv.gz"
hg38_fragment.path="./hg38.fragments.tsv.gz"

# extract gene annotations from EnsDb
hg38_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)

# change to UCSC style
seqlevelsStyle(hg38_annotations) <- 'UCSC'
seqlevelsStyle(mm10_annotations) <- 'UCSC'

genome(hg38_annotations) <- "hg38"
genome(mm10_annotations) <- "mm10"

#Generate ChromatinAssay Objects
hg38_chromatinassay <- CreateChromatinAssay(
  counts = hg38_counts,
  genome="hg38",
  min.cells = 1,
  annotation=hg38_annotations,
  sep=c("_","_"),
  fragments=hg38_fragment.path
)

mm10_chromatinassay <- CreateChromatinAssay(
  counts = mm10_counts,
  genome="mm10",
  min.cells = 1,
  annotation=mm10_annotations,
  sep=c("_","_"),
  fragments=mm10_fragment.path
)


#Create Seurat Objects
hg38_atac <- CreateSeuratObject(
  counts = hg38_chromatinassay,
  assay = "peaks"
)
mm10_atac <- CreateSeuratObject(
  counts = mm10_chromatinassay,
  assay = "peaks"
)

#Meta.data to be updated after clustering

#saving unprocessed SeuratObjects
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")

Plotting and updating metadata

#renaming annot for simplified annotation file making
#rename processing_ processing. *annot

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)

annot.files<-list.files(pattern="annot$")
annot.files<-annot.files[startsWith(annot.files,prefix="barnyard")]
annot_append<-read.table(annot.files[1],col.names=c("cellID",strsplit(annot.files[1],"[.]")[[1]][1]))
for (i in 2:length(annot.files)){
  tmp<-read.table(annot.files[i],col.names=c("cellID",strsplit(annot.files[i],"[.]")[[1]][1]))
  annot_append<-merge(annot_append,tmp,by="cellID")
}

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
hg38_atac@meta.data$cellID<-row.names(hg38_atac@meta.data)
hg38_atac@meta.data$i5_idx_seq<-substr(hg38_atac@meta.data$cellID,9,16)
hg38_atac@meta.data$i7_idx_seq<-substr(hg38_atac@meta.data$cellID,0,8)
annot<-as.data.frame(hg38_atac@meta.data)
annot<-merge(annot,annot_append,by="cellID",all.x=T)
compl_hg38<-read.table("hg38.complexity.txt")
colnames(compl_hg38)<-c("row_carryover","cellID","total_reads","uniq_reads","perc_uniq")
annot<-merge(annot,compl_hg38,by="cellID")
hg38_fracontarget<-read.table("hg38.bbrd.q10.500.fracOnTarget.values")
colnames(hg38_fracontarget)<-c("cellID","frac_reads_in_peaks")
annot<-merge(annot,hg38_fracontarget,by="cellID")
row.names(annot)<-annot$cellID
hg38_atac@meta.data<-annot


mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
mm10_atac@meta.data$cellID<-row.names(mm10_atac@meta.data)
mm10_atac@meta.data$i5_idx_seq<-substr(mm10_atac@meta.data$cellID,9,16)
mm10_atac@meta.data$i7_idx_seq<-substr(mm10_atac@meta.data$cellID,0,8)
annot<-as.data.frame(mm10_atac@meta.data)
annot<-merge(annot,annot_append,by="cellID",all.x=T)
compl_mm10<-read.table("mm10.complexity.txt")
colnames(compl_mm10)<-c("row_carryover","cellID","total_reads","uniq_reads","perc_uniq")
annot<-merge(annot,compl_mm10,by="cellID")
mm10_fracontarget<-read.table("mm10.bbrd.q10.500.fracOnTarget.values")
colnames(mm10_fracontarget)<-c("cellID","frac_reads_in_peaks")
annot<-merge(annot,mm10_fracontarget,by="cellID")
row.names(annot)<-annot$cellID
mm10_atac@meta.data<-annot


#Filtering cells to those with fraction of reads in peaks >0.2
mm10_atac <- subset(mm10_atac, subset = frac_reads_in_peaks >= 0.2)
hg38_atac <- subset(hg38_atac, subset = frac_reads_in_peaks >= 0.2)

saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")

Performing cisTopic and UMAP

#continued from above session
library(cisTopic)
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Matrix)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

hg38_cistopic_counts_frmt<-hg38_atac$peaks@counts
mm10_cistopic_counts_frmt<-mm10_atac$peaks@counts

#renaming row names to fit granges expectation of format
row.names(hg38_cistopic_counts_frmt)<-sub("-", ":", row.names(hg38_cistopic_counts_frmt))
row.names(mm10_cistopic_counts_frmt)<-sub("-", ":", row.names(mm10_cistopic_counts_frmt))

#set up CisTopicObjects
hg38_atac_cistopic<-cisTopic::createcisTopicObject(hg38_cistopic_counts_frmt)
mm10_atac_cistopic<-cisTopic::createcisTopicObject(mm10_cistopic_counts_frmt)

#Run warp LDA on objects
hg38_atac_cistopic_models<-cisTopic::runWarpLDAModels(hg38_atac_cistopic,topic=c(5,10,20:30,40,50,55,60:70),nCores=27,addModels=FALSE)
mm10_atac_cistopic_models<-cisTopic::runWarpLDAModels(mm10_atac_cistopic,topic=c(5,10,20:30,40,50,55,60:70),nCores=27,addModels=FALSE)

#Saving all models for posterity
saveRDS(hg38_atac_cistopic_models,file="hg38_CisTopicObject.Rds")
saveRDS(mm10_atac_cistopic_models,file="mm10_CisTopicObject.Rds")

#Setting up topic count selection
pdf("hg38_atac_model_selection.pdf")
par(mfrow=c(3,3))
hg38_atac_cistopic_models <- selectModel(hg38_atac_cistopic_models, type='maximum')
hg38_atac_cistopic_models <- selectModel(hg38_atac_cistopic_models, type='perplexity')
hg38_atac_cistopic_models <- selectModel(hg38_atac_cistopic_models, type='derivative')
dev.off()

pdf("mm10_atac_model_selection.pdf")
par(mfrow=c(3,3))
mm10_atac_cistopic_models <- selectModel(mm10_atac_cistopic_models, type='maximum')
mm10_atac_cistopic_models <- selectModel(mm10_atac_cistopic_models, type='perplexity')
mm10_atac_cistopic_models <- selectModel(mm10_atac_cistopic_models, type='derivative')
dev.off()

hg38_atac_cistopic_models<-readRDS(file="hg38_CisTopicObject.Rds")
mm10_atac_cistopic_models<-readRDS(file="mm10_CisTopicObject.Rds")


#set topics based on derivative
#selected topics subject to change
mm10_selected_topic=27
hg38_selected_topic=24
mm10_cisTopicObject<-cisTopic::selectModel(mm10_atac_cistopic_models,select=mm10_selected_topic,keepModels=F)
hg38_cisTopicObject<-cisTopic::selectModel(hg38_atac_cistopic_models,select=hg38_selected_topic,keepModels=F)

#saving model selected RDS
saveRDS(hg38_cisTopicObject,file="hg38_CisTopicObject.Rds")
saveRDS(mm10_cisTopicObject,file="mm10_CisTopicObject.Rds")

#Read in cisTopic objects
hg38_cisTopicObject<-readRDS("hg38_CisTopicObject.Rds")
mm10_cisTopicObject<-readRDS("mm10_CisTopicObject.Rds")
#read in seurat format object
hg38_atac<-readRDS("hg38_SeuratObject.Rds")
mm10_atac<-readRDS("mm10_SeuratObject.Rds")



#run UMAP on topics
hg38_topic_df<-as.data.frame(hg38_cisTopicObject@selected.model$document_expects)
row.names(hg38_topic_df)<-paste0("Topic_",row.names(hg38_topic_df))
hg38_dims<-as.data.frame(uwot::umap(t(hg38_topic_df),n_components=2))
row.names(hg38_dims)<-colnames(hg38_topic_df)
colnames(hg38_dims)<-c("x","y")
hg38_dims$cellID<-row.names(hg38_dims)
hg38_dims<-merge(hg38_dims,hg38_atac@meta.data,by.x="cellID",by.y="row.names")

mm10_topic_df<-as.data.frame(mm10_cisTopicObject@selected.model$document_expects)
row.names(mm10_topic_df)<-paste0("Topic_",row.names(mm10_topic_df))
mm10_dims<-as.data.frame(uwot::umap(t(mm10_topic_df),n_components=2))
row.names(mm10_dims)<-colnames(mm10_topic_df)
colnames(mm10_dims)<-c("x","y")
mm10_dims$cellID<-row.names(mm10_dims)
mm10_dims<-merge(mm10_dims,mm10_atac@meta.data,by.x="cellID",by.y="row.names")

#plot heatmaps of topics
pdf("mm10_atac_cistopic_heatmap.pdf")
cellTopicHeatmap(mm10_cisTopicObject, method='Probability')
dev.off()

pdf("hg38_atac_cistopic_heatmap.pdf")
cellTopicHeatmap(hg38_cisTopicObject, method='Probability',use_raster=F)
dev.off()

#predictive measurements of topics
#hg38_pred.matrix <- predictiveDistribution(hg38_cisTopicObject)
#mm10_pred.matrix <- predictiveDistribution(mm10_cisTopicObject)

# Obtain signatures
#hg38_path_to_signatures <- '/home/groups/oroaklab/refs/hg38/tfbs/'
#hg38_HomerTF_signatures <- paste(hg38_path_to_signatures, list.files(hg38_path_to_signatures), sep='')
#mm10_path_to_signatures <- '/home/groups/oroaklab/refs/mm10/JASPAR2018_TF_Sites/bed_files/'
#mm10_JASPARTF_signatures <- paste(mm10_path_to_signatures, list.files(mm10_path_to_signatures), sep='')

#get a list of files in directory and strsplit into labels
#hg38_labels  <- unlist(lapply(strsplit(list.files(hg38_path_to_signatures),"[.]"),"[",1))
#mm10_labels  <- unlist(lapply(strsplit(list.files(mm10_path_to_signatures),"[.]"),"[",1))

#get signature region scores for topics
#hg38_cisTopicObject <- getSignaturesRegions(hg38_cisTopicObject, hg38_HomerTF_signatures, labels=hg38_labels, minOverlap = 0.4)
#mm10_cisTopicObject <- getSignaturesRegions(mm10_cisTopicObject, mm10_JASPARTF_signatures, labels=mm10_labels, minOverlap = 0.4)

#get regions from topics
#hg38_cisTopicObject <- getRegionsScores(hg38_cisTopicObject, method='NormTop', scale=TRUE)
# mm10_cisTopicObject <- getRegionsScores(mm10_cisTopicObject, method='NormTop', scale=TRUE)

#plot tfbs predictions to topics
#pdf("hg38_atac_cistopic_SignatureRegions.pdf",height=25,width=25)
#signaturesHeatmap(hg38_cisTopicObject,row_names_gp=gpar(fontsize=5))
#dev.off()

  #pdf("mm10_atac_cistopic_SignatureRegions.pdf",height=25,width=25)
  #signaturesHeatmap(mm10_cisTopicObject,row_names_gp=gpar(fontsize=5))
  #dev.off()

#plot general annotations to topics (this still needs to be run)
#library(org.Hs.eg.db)
#library(org.Mm.eg.db)
#library(TxDb.Hsapiens.UCSC.hg38.knownGene)
#library(TxDb.Mmusculus.UCSC.mm10.knownGene)

#hg38_cisTopicObject <- annotateRegions(hg38_cisTopicObject, txdb=TxDb.Hsapiens.UCSC.hg38.knownGene, annoDb='org.Hs.eg.db')
#pdf("hg38_atac_cistopic_SignatureRegions.pdf")
#signaturesHeatmap(hg38_cisTopicObject, selected.signatures = 'annotation')
#dev.off()

  #mm10_cisTopicObject <- annotateRegions(mm10_cisTopicObject, txdb=TxDb.Mmusculus.UCSC.mm10.knownGene, annoDb='org.Mm.eg.db')
  #pdf("mm10_atac_cistopic_SignatureRegions.pdf")
  #signaturesHeatmap(mm10_cisTopicObject, selected.signatures = 'annotation')
  #dev.off()

  # Compute cell rankings
  #library(AUCell)
  #pdf("orgo_atac_cistopic_AUCellRankings.pdf")
  #aucellRankings <- AUCell_buildRankings(pred.matrix, plot=TRUE, verbose=FALSE)
  #dev.off()

  # Check signature enrichment in cells
  #pdf("orgo_atac_cistopic_sigantureCellEnrichment.pdf")
  #cisTopicObject <- signatureCellEnrichment(cisTopicObject, aucellRankings, selected.signatures='all', aucMaxRank = 0.1*nrow(aucellRankings), plot=TRUE)
  #dev.off()

  #pdf("orgo_atac_cistopic_gammafit_regions.pdf")
  #par(mfrow=c(2,5))
  #cisTopicObject <- binarizecisTopics(cisTopicObject, thrP=0.975, plot=TRUE)
  #dev.off()

# #saving model selected RDS
# saveRDS(hg38_cisTopicObject,file="hg38_CisTopicObject.Rds")
# saveRDS(mm10_cisTopicObject,file="mm10_CisTopicObject.Rds")

#bin_hg38_cisTopicObject<-binarizecisTopics(hg38_cisTopicObject)
# getBedFiles(bin_hg38_cisTopicObject, path='hg38_cisTopic_beds')
#bin_mm10_cisTopicObject<-binarizecisTopics(mm10_cisTopicObject)
# getBedFiles(bin_mm10_cisTopicObject, path='mm10_cisTopic_beds')


  #Read in cisTopic objects
# hg38_cisTopicObject<-readRDS("hg38_CisTopicObject.Rds")
# mm10_cisTopicObject<-readRDS("mm10_CisTopicObject.Rds")

#Add cell embeddings into seurat
hg38_cell_embeddings<-as.data.frame(hg38_cisTopicObject@selected.model$document_expects)
colnames(hg38_cell_embeddings)<-hg38_cisTopicObject@cell.names
hg38_n_topics<-nrow(hg38_cell_embeddings)
row.names(hg38_cell_embeddings)<-paste0("topic_",1:hg38_n_topics)
hg38_cell_embeddings<-as.data.frame(t(hg38_cell_embeddings))

mm10_cell_embeddings<-as.data.frame(mm10_cisTopicObject@selected.model$document_expects)
colnames(mm10_cell_embeddings)<-mm10_cisTopicObject@cell.names
mm10_n_topics<-nrow(mm10_cell_embeddings)
row.names(mm10_cell_embeddings)<-paste0("topic_",1:mm10_n_topics)
mm10_cell_embeddings<-as.data.frame(t(mm10_cell_embeddings))

#Add feature loadings into seurat
hg38_feature_loadings<-as.data.frame(hg38_cisTopicObject@selected.model$topics)
row.names(hg38_feature_loadings)<-paste0("topic_",1:hg38_n_topics)
hg38_feature_loadings<-as.data.frame(t(hg38_feature_loadings))

mm10_feature_loadings<-as.data.frame(mm10_cisTopicObject@selected.model$topics)
row.names(mm10_feature_loadings)<-paste0("topic_",1:mm10_n_topics)
mm10_feature_loadings<-as.data.frame(t(mm10_feature_loadings))

#combined cistopic results (cistopic loadings and umap with seurat object)
hg38_cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(hg38_cell_embeddings),loadings=as.matrix(hg38_feature_loadings),assay="peaks",key="topic_")
hg38_umap_dims<-as.data.frame(as.matrix(hg38_dims[2:3]))
colnames(hg38_umap_dims)<-c("UMAP_1","UMAP_2")
row.names(hg38_umap_dims)<-hg38_dims$cellID
hg38_cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(hg38_umap_dims),assay="peaks",key="UMAP_")
hg38_atac@reductions$cistopic<-hg38_cistopic_obj
hg38_atac@reductions$umap<-hg38_cistopic_umap

mm10_cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(mm10_cell_embeddings),loadings=as.matrix(mm10_feature_loadings),assay="peaks",key="topic_")
mm10_umap_dims<-as.data.frame(as.matrix(mm10_dims[2:3]))
colnames(mm10_umap_dims)<-c("UMAP_1","UMAP_2")
row.names(mm10_umap_dims)<-mm10_dims$cellID
mm10_cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(mm10_umap_dims),assay="peaks",key="UMAP_")
mm10_atac@reductions$cistopic<-mm10_cistopic_obj
mm10_atac@reductions$umap<-mm10_cistopic_umap

hg38_n_topics<-ncol(Embeddings(hg38_atac,reduction="cistopic"))

hg38_atac <- FindNeighbors(
  object = hg38_atac,
  reduction = 'cistopic',
  dims = 1:hg38_n_topics
)
hg38_atac <- FindClusters(
  object = hg38_atac,
  verbose = TRUE,
  resolution=0.3
)

mm10_n_topics<-ncol(Embeddings(mm10_atac,reduction="cistopic"))

mm10_atac <- FindNeighbors(
  object = mm10_atac,
  reduction = 'cistopic',
  dims = 1:mm10_n_topics
)
mm10_atac <- FindClusters(
  object = mm10_atac,
  verbose = TRUE,
  resolution=0.2
)

###save Seurat files
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")


out_data<-lapply(c("hg38_SeuratObject.Rds","mm10_SeuratObject.Rds"), function(i){
	obj<-readRDS(i)
	species<-unlist(lapply(strsplit(i,"_"),"[",1))
	dat_out<-cbind(as.data.frame(obj@reductions$umap@cell.embeddings[names(obj$celltype),]),obj$celltype,obj$seurat_clusters,species)
	return(dat_out)
})

  	dat_out<-do.call("rbind",out_data)
  	write.table(dat_out,file="SourceData_Fig2h.tsv",sep="\t",quote=F,row.names=F)
	system("slack -F SourceData_Fig2h.tsv ryan_todo")

Plotting and writing out cell table summaries

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

dat_hg38<-hg38_atac@meta.data
dat_mm10<-mm10_atac@meta.data

library(ggplot2)

plt<-ggplot(dat_hg38,aes(x=barnyard_pcrplate,y=log10(uniq_reads)))+
geom_jitter()+
geom_boxplot(outlier.shape=NA)+
theme(axis.text.x = element_text(angle = 90))

ggsave(plt,file="hg38_complexity_boxplot.pdf")
ggsave(plt,file="hg38_complexity_boxplot.png")

plt<-ggplot(dat_mm10,aes(x=barnyard_pcrplate,y=log10(uniq_reads)))+
geom_jitter()+
geom_boxplot(outlier.shape=NA)+
theme(axis.text.x = element_text(angle = 90))

ggsave(plt,file="mm10_complexity_boxplot.pdf")
ggsave(plt,file="mm10_complexity_boxplot.png")


plt<-DimPlot(hg38_atac,group.by=c('seurat_clusters',
                                  'barnyard_platevolume',
                                  'barnyard_polymerase',
                                  'barnyard_storagecondition',
                                  'barnyard_pcrplate'
                                 ))

pdf("hg38.umap.pdf",width=20)
plt
dev.off()
ggsave(plt,file="hg38.umap.png",width=20)




plt<-DimPlot(mm10_atac,group.by=c('seurat_clusters',
                                  'barnyard_platevolume',
                                  'barnyard_polymerase',
                                  'barnyard_storagecondition',
                                  'barnyard_pcrplate'))



pdf("mm10.umap.pdf",width=20)
plt
dev.off()
ggsave(plt,file="mm10.umap.png",width=20)

dat_hg38$UMAP_1<-hg38_atac@reductions$umap@cell.embeddings[,1]
dat_hg38$UMAP_2<-hg38_atac@reductions$umap@cell.embeddings[,2]

dat_mm10$UMAP_1<-mm10_atac@reductions$umap@cell.embeddings[,1]
dat_mm10$UMAP_2<-mm10_atac@reductions$umap@cell.embeddings[,2]

write.table(dat_hg38,file="hg38_cell_summary.txt",col.names=T,row.names=T,sep="\t",quote=F)
write.table(dat_mm10,file="mm10_cell_summary.txt",col.names=T,row.names=T,sep="\t",quote=F)

Adding gene activity matrix to ATAC processing

Cicero for co-accessible sites and gene activity score generation

Currently testing k argument in make_cicero_cds to see if lower cell count (using out higher reads) does a better job. Also need to make sure that it is assigning cells correctly while adding to the Seurat object. Genebody analysis looks like GA should be working better.

library(Signac)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")


# convert to CellDataSet format and make the cicero object
    #Processing hg38 cells
    hg38_atac.cds <- as.cell_data_set(x = hg38_atac,group_by="seurat_clusters")
    hg38_atac.cicero <- make_cicero_cds( hg38_atac.cds, reduced_coordinates = reducedDims(hg38_atac.cds)$UMAP,k=10)
    genome <- seqlengths(hg38_atac) # get the chromosome sizes from the Seurat object
    genome.df <- data.frame("chr" = names(genome), "length" = genome) # convert chromosome sizes to a dataframe
    conns <- run_cicero(hg38_atac.cicero, genomic_coords = genome.df) # run cicero
    saveRDS(conns,"hg38_cicero_conns.Rds")
    ccans <- generate_ccans(conns) # generate ccans
    saveRDS(ccans,"hg38_cicero_ccans.Rds")
    links <- ConnectionsToLinks(conns = conns, ccans = ccans) #Add connections back to Seurat object as links
    Links(hg38_atac) <- links
    saveRDS(hg38_atac,file="hg38_SeuratObject.cicero.Rds")

    #Processing mm10 cells
    mm10_atac.cds <- as.cell_data_set(x = mm10_atac,group_by="seurat_clusters")
    mm10_atac.cicero <- make_cicero_cds( mm10_atac.cds, reduced_coordinates = reducedDims(mm10_atac.cds)$UMAP,k=10)
    genome <- seqlengths(mm10_atac) # get the chromosome sizes from the Seurat object
    genome.df <- data.frame("chr" = names(genome), "length" = genome) # convert chromosome sizes to a dataframe
    conns <- run_cicero(mm10_atac.cicero, genomic_coords = genome.df) # run cicero
    saveRDS(conns,"mm10_cicero_conns.Rds")
    ccans <- generate_ccans(conns) # generate ccans
    saveRDS(ccans,"mm10_cicero_ccans.Rds")
    links <- ConnectionsToLinks(conns = conns, ccans = ccans) #Add connections back to Seurat object as links
    Links(mm10_atac) <- links
    saveRDS(mm10_atac,file="mm10_SeuratObject.cicero.Rds")
    
	hg38_atac<-readRDS(file="hg38_SeuratObject.cicero.Rds")
	mm10_atac<-readRDS(file="mm10_SeuratObject.cicero.Rds")
# generate unnormalized gene activity matrices
# gene annotation sample
    #hg38
    hg38_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)

    pos <-as.data.frame(hg38_annotations,row.names=NULL)
    pos$chromosome<-paste0("chr",pos$seqnames)
    pos$gene<-pos$gene_id
    pos <- subset(pos, strand == "+")
    pos <- pos[order(pos$start),] 
    pos <- pos[!duplicated(pos$tx_id),] # remove all but the first exons per transcript
    pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS

    neg <-as.data.frame(hg38_annotations,row.names=NULL)
    neg$chromosome<-paste0("chr",neg$seqnames)
    neg$gene<-neg$gene_id
    neg <- subset(neg, strand == "-")
    neg <- neg[order(neg$start,decreasing=TRUE),] 
    neg <- neg[!duplicated(neg$tx_id),] # remove all but the first exons per transcript
    neg$end <- neg$end + 1 # make a 1 base pair marker of the TSS

    gene_annotation<- rbind(pos, neg)
    gene_annotation <- gene_annotation[,c("chromosome","start","end","gene_name")] # Make a subset of the TSS annotation columns containing just the coordinates and the gene name
    names(gene_annotation)[4] <- "gene" # Rename the gene symbol column to "gene"

    geneactivity_processing<-function(cds_input,conns_input,prefix){
        atac.cds<- annotate_cds_by_site(cds_input, gene_annotation)
        unnorm_ga <- build_gene_activity_matrix(atac.cds, conns_input)
        saveRDS(unnorm_ga,paste(prefix,"unnorm_GA.Rds",sep="."))
    }

    conns<-as.data.frame(readRDS("hg38_cicero_conns.Rds"))
    geneactivity_processing(cds_input=as.cell_data_set(x = hg38_atac,group_by="seurat_clusters"),conns_input=conns,prefix="hg38")

    #mm10
    mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)

    pos <-as.data.frame(mm10_annotations,row.names=NULL)
    pos$chromosome<-paste0("chr",pos$seqnames)
    pos$gene<-pos$gene_id
    pos <- subset(pos, strand == "+")
    pos <- pos[order(pos$start),] 
    pos <- pos[!duplicated(pos$tx_id),] # remove all but the first exons per transcript
    pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS

    neg <-as.data.frame(mm10_annotations,row.names=NULL)
    neg$chromosome<-paste0("chr",neg$seqnames)
    neg$gene<-neg$gene_id
    neg <- subset(neg, strand == "-")
    neg <- neg[order(neg$start,decreasing=TRUE),] 
    neg <- neg[!duplicated(neg$tx_id),] # remove all but the first exons per transcript
    neg$end <- neg$end + 1 # make a 1 base pair marker of the TSS

    gene_annotation<- rbind(pos, neg)
    gene_annotation <- gene_annotation[,c("chromosome","start","end","gene_name")] # Make a subset of the TSS annotation columns containing just the coordinates and the gene name
    names(gene_annotation)[4] <- "gene" # Rename the gene symbol column to "gene"

    geneactivity_processing<-function(cds_input,conns_input,prefix){
        atac.cds<- annotate_cds_by_site(cds_input, gene_annotation)
        unnorm_ga <- build_gene_activity_matrix(atac.cds, conns_input)
        saveRDS(unnorm_ga,paste(prefix,"unnorm_GA.Rds",sep="."))
    }

    conns<-as.data.frame(readRDS("mm10_cicero_conns.Rds"))
    geneactivity_processing(cds_input=as.cell_data_set(x = mm10_atac,group_by="seurat_clusters"),conns_input=conns,prefix="mm10")

#Add to seurat Object as new Assay    
    #Read in unnormalized GA
    #hg38
    cicero_gene_activities<-readRDS("hg38.unnorm_GA.Rds")
    hg38_atac[['GeneActivity']]<- CreateAssayObject(counts = as.data.frame(cicero_gene_activities)) 
    # normalize
    hg38_atac <- NormalizeData(
      object = hg38_atac,
      assay = 'GeneActivity',
      normalization.method = 'LogNormalize',
      scale.factor = median(hg38_atac$nCount_peaks)
    )
    saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
    
    #mm10
    cicero_gene_activities<-readRDS("mm10.unnorm_GA.Rds")
    row.names(cicero_gene_activities)<-gsub("-",".",row.names(cicero_gene_activities))
    mm10_atac[['GeneActivity']]<- CreateAssayObject(counts = cicero_gene_activities[!(rownames(x = cicero_gene_activities) == ""),]) 
    # normalize
    mm10_atac <- NormalizeData(
      object = mm10_atac,
      assay = 'GeneActivity',
      normalization.method = 'LogNormalize',
      scale.factor = median(mm10_atac$nCount_peaks)
    )
    saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")


Adding additional gene activity via read count across gene bodies

This is the signac default method of calculating gene activity

library(Signac)
library(Seurat)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#Read in data and modify to monocle CDS file
#read in RDS file.
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

gene.activities <- GeneActivity(mm10_atac,process_n=10000)
saveRDS(gene.activities,"mm10.signac.geneactivities.RDS")
# add the gene activity matrix to the Seurat object as a new assay and normalize it
mm10_atac[['SignacGA']] <- CreateAssayObject(counts = gene.activities)
mm10_atac <- NormalizeData(
  object = mm10_atac,
  assay = 'SignacGA',
  normalization.method = 'LogNormalize',
  scale.factor = median(mm10_atac$nCount_SignacGA)
)
 
saveRDS(mm10_atac,file="mm10_SeuratObject.signacGA.Rds")

library(Signac)
library(Seurat)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")

gene.activities <- GeneActivity(hg38_atac,process_n=10000)
saveRDS(gene.activities,"hg38.signac.geneactivities.RDS")
# add the gene activity matrix to the Seurat object as a new assay and normalize it
hg38_atac[['SignacGA']] <- CreateAssayObject(counts = gene.activities)
hg38_atac <- NormalizeData(
  object = hg38_atac,
  assay = 'SignacGA',
  normalization.method = 'LogNormalize',
  scale.factor = median(hg38_atac$nCount_SignacGA)
)
 
saveRDS(hg38_atac,file="hg38_SeuratObject.signacGA.Rds")

Plotting Coverage Plots

#Making a set of marker lists for human cortex and mouse whole brain

#Human cortex marker list is from https://cells.ucsc.edu/?ds=allen-celltypes%2Fhuman-cortex&meta=clusterlabel
#And stored in https://docs.google.com/spreadsheets/d/15YwK2lWh018IyxgQsRxnrINxBFlG3XtU5kE8gArYxJ4/edit?usp=sharing
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/marker_sets")

marker_list<-NULL
marker_list[["VLMC"]]<-c('ITIH5', 'ABCA9', 'DCN', 'ATP1A2', 'SAT1', 'NEAT1', 'SLC7A11', 'ABCA8', 'APOD', 'NKD1', 'PTN', 'FBLN1', 'LAMC3', 'PDGFRB', 'CXCL12', 'AKR1C2', 'SVIL', 'ABCA6', 'COLEC12', 'CYP1B1', 'UACA', 'AHNAK', 'TPCN1', 'SLC1A3', 'ADAM33', 'LAMA2', 'COL18A1', 'CYTL1', 'SLC6A1', 'EDN3', 'ARHGAP29', 'COL4A5', 'DAB2', 'BGN', 'LEPR', 'PTGDS', 'OGN', 'WDR86', 'PRELP', 'ISLR', 'PDGFRA', 'MRC2', 'COL15A1', 'SPRY4', 'SELENBP1', 'RHOJ', 'KIAA1755', 'SASH1', 'LHFP', 'RPS12P5', 'KANK2')
marker_list[["Astrocyte"]]<-c('ADGRV1', 'SLC1A3', 'ATP1A2', 'C1orf61', 'FGFR3', 'SLC1A2', 'PTGDS', 'GLUL', 'NDRG2', 'PRODH', 'NTM', 'COL5A3', 'CST3', 'MACF1', 'PTPRZ1', 'ATP1B2', 'SLC25A18', 'ZBTB20', 'MT3', 'MT2A', 'ATP13A4', 'PON2', 'AQP4', 'DTNA', 'GRAMD3', 'NCAN', 'ACSS1', 'MSI2', 'GJA1', 'SLCO1C1', 'SLC7A11', 'PAPLN', 'F3', 'RYR3', 'PSD2', 'ETNPPL', 'MYO10', 'RAPGEF3', 'TTYH1', 'SOX2', 'EMX2', 'HIF3A', 'DST', 'APOE', 'PDGFRB', 'SFXN5', 'AASS', 'FNBP1', 'SLC4A4', 'GRIN2C', 'PLXNB1')
marker_list[["L5ET"]]<-c('COL24A1', 'ADRA1A', 'COL5A2', 'CRYM', 'GRIK2', 'DGKH', 'SULF2', 'ATP6V1C2', 'VAT1L', 'NEFM', 'SPHKAP', 'NRP1', 'SLC26A4', 'COL21A1', 'TOX', 'PDE1C', 'NTNG1', 'IQCA1', 'BCL11B', 'ANXA4', 'ARAP2', 'ADAMTSL3', 'FAM126A', 'LRP2', 'LOC101927745', 'SLC5A8', 'DOCK4', 'NEFH', 'PRUNE2', 'GRB14', 'GRM8', 'MYO16', 'FAM84B', 'ANKRD18A', 'TRDMT1', 'PCSK6', 'ANO4', 'SSTR2', 'ESRRG', 'MYLIP', 'VASH2', 'KLHL32', 'RYR3', 'ZNF189', 'SLC26A4-AS1', 'EPM2A', 'CNR1', 'ST3GAL1', 'LOC105374239', 'ANKRD34B', 'SEPT4')
marker_list[["L6CT"]]<-c('SEMA3E', 'TLE4', 'EGFEM1P', 'HS3ST4', 'SYT6', 'RYR3', 'ENPP7P4', 'DLC1', 'SEMA3A', 'MEIS2', 'COL24A1', 'DIP2A', 'PCDH17', 'RXFP1', 'LRP8', 'ANKRD18A', 'FOXP2', 'NRP1', 'SEMA5A', 'LRP1B', 'LOC105374199', 'ABCC9', 'SLC24A2', 'VWA2', 'SYNJ2', 'FAM95C', 'SLC8A1-AS1', 'ITGA11', 'SERPINE2', 'KLHL5', 'NFIA', 'CDH6', 'SPARCL1', 'CPE', 'CTGF', 'TENM1', 'ANKRD26P3', 'SEZ6L', 'GRIK3', 'MCC', 'SYNPO2', 'ZFHX3', 'SULF1', 'FILIP1', 'DGKG', 'LUZP2', 'ANKRD20A5P', 'ANKRD20A7P', 'DMD', 'PCLO', 'TMEFF2')
marker_list[["L4IT"]]<-c('NTNG1', 'HTR2A', 'VAV3', 'PCP4', 'TRPC3', 'CDR1', 'PCSK1', 'BHLHE22', 'PLEKHH2', 'ITM2A', 'BTBD3', 'SORL1', 'CUX2', 'VSTM2A', 'SLC17A6', 'POU6F2', 'MEF2A', 'SEMA6D', 'RORA', 'LOC105377864', 'LRRK2', 'NEFM', 'GPR26', 'KCNH8', 'FSTL1', 'MET', 'ACVR1C', 'LOC105375817', 'KCNS1', 'ND4', 'EGR1', 'RET', 'CD74', 'TIAM1', 'PTTG1', 'PHACTR2', 'TADA1', 'SLC38A11', 'SYT2', 'TMEM215', 'LOC101927653', 'RHBDL3', 'PRKCA', 'GRAMD3', 'ILDR2', 'BMP8A', 'ITPR2', 'ND2', 'TRAF5', 'RALB', 'AIFM3')
marker_list[["L56NP"]]<-c('HTR2C', 'CRYM', 'TSHZ2', 'PCP4', 'FGFR1', 'ROBO3', 'NPSR1-AS1', 'ENPP7P8', 'SEMA3E', 'TLE4', 'IFNG-AS1', 'ENPP7P7', 'RPS3AP34', 'COL11A1', 'ALCAM', 'PCDH17', 'ENPP7P4', 'NXPH2', 'CDH6', 'MYLIP', 'DIP2A', 'MEIS2', 'CALCRL', 'LRMP', 'KCNT2', 'FABP7', 'PHLDB2', 'SORCS2', 'ZNF385D', 'CD36', 'ITGA8', 'SEL1L3', 'DOCK4', 'ANKRD20A5P', 'SPOCK1', 'TLL1', 'SLC24A2', 'KLHL5', 'MYHAS', 'BCL11B', 'KCNIP1', 'NR4A3', 'LUZP2', 'COL9A1', 'CCDC80', 'TOX', 'TRHDE', 'VWC2L', 'FAM89A', 'ABCC9', 'LINC01279')
marker_list[["L6b"]]<-c('TLE4', 'CTGF', 'RXFP1', 'SEMA3E', 'ZFHX3', 'LOC100996635', 'ENPP7P7', 'EGFEM1P', 'NFIA', 'SEMA3A', 'DLC1', 'KLHL5', 'LOC401134', 'HS3ST4', 'DGKH', 'CDH9', 'NR4A2', 'PCSK5', 'ENPP7P8', 'RYR3', 'SEMA3D', 'ADD3', 'PCDH17', 'OLFML2B', 'HPCAL1', 'MDFIC', 'ITPR2', 'COL19A1', 'TMEFF2', 'SEMA3C', 'SEZ6L', 'PQLC3', 'ENPP7P4', 'CDH11', 'DOCK4', 'MGST1', 'TENM1', 'DPP4', 'TNIK', 'GLCE', 'SLC1A2', 'FILIP1', 'VWA5A', 'ARSJ', 'ROS1', 'LRP8', 'DGKG', 'CPLX3', 'SLC24A2', '43891', 'LRP1B')
marker_list[["PVALB"]]<-c('ERBB4', 'ZNF385D', 'TAC1', 'GAD1', 'SLIT2', 'KCNC2', 'GAD2', 'PVALB', 'KLHL5', 'KCNAB3', 'PTPRM', 'C8orf4', 'SAT1', 'SLC6A1', 'RGS5', 'LHX6', 'BTBD11', 'ANK1', 'RAB3IP', 'ZNF536', 'SOX6', 'SPARCL1', 'TENM1', 'LANCL1', 'LRP8', 'MYO16', 'KCNAB1', 'PAM', 'ZNF804A', 'NEAT1', 'NXPH1', 'OSBPL3', 'SLC9A9', 'SULF1', 'WIF1', 'CRHBP', 'RND3', 'LGI2', 'SEPT4', 'SERPINI1', 'ANO4', 'LOC105369345', 'KLF12', 'NPPC', 'RUNX2', 'DOCK11', 'SPOCK3', 'PLCE1', 'GRIA4', 'KCNS3', 'ARX')
marker_list[["VIP"]]<-c('CXCL14', 'SYNPR', 'RGS12', 'ERBB4', 'SCG2', 'VIP', 'GAD1', 'CALB2', 'ADARB2', 'THSD7A', 'CNR1', 'PROX1', 'ARL4C', 'DLX6-AS1', 'SLC6A1', 'DNER', 'NR2F2', 'FXYD6', 'ATP1B2', 'TIMP2', 'COL21A1', 'DOCK10', 'TAC3', 'CHRNA2', 'IGF1', 'PTHLH', 'SLC24A3', 'AP1S2', 'RGS16', 'DLX1', 'SHISA8', 'ZNF536', 'ADAM33', 'ADRA1A', 'ASIC4', 'SLC10A4', 'ROBO1', 'CRH', 'PCDH8', 'CNTNAP4', 'ENTPD3', 'HLA-A', 'SERINC1', 'GRIK2', 'ROBO2', 'PLD5', 'SEZ6', 'LOC105373643', 'HTRA1', 'KCNT2', 'PTPRE')
marker_list[["IT"]]<-c('ENC1', 'PTPRK', 'VSNL1', 'LINC00507', 'FAM13A', 'LOC101927745', 'CCK', 'TNNT2', 'ARAP2', 'LMF1', 'EPB41L2', 'ART3', 'HTR2A', 'ANKRD18B', 'LOC105374973', 'LOC101929974', 'LOC105373009', 'ARL4A', 'OTOGL', 'CRYM', 'RFX3', 'ADAMTS3', 'SLC38A11', 'NWD2', 'PTPN3', 'COL5A2', 'COL24A1', 'ID2', 'DMD', 'MYO5C', 'RFTN1', 'LOC102724736', 'LRRIQ1', 'ATP2B4', 'IQCA1', 'HSD3BP4''LOC105370610', 'RXFP1', 'LOC102724834', 'SYN3', 'ADCYAP1', 'DPYD', 'DGKA', 'LOC101929293', 'SMAD3', 'MEF2A', 'CNGB1', 'MYH9', 'TRABD2A', 'CA10', 'SYNJ2')
marker_list[["L56ITCAR3"]]<-c('RGS12', 'POSTN', 'NR4A2', 'SYNPR', 'SMYD1', 'ITGB8', 'FRAS1', 'ANXA1', 'OLFML2B', 'ITGA11', 'SPOCK1', 'ARAP2', 'GNB4', 'ABCC3', 'CCK', 'HSD3BP4', 'ENPP7P8', 'NWD2', 'THEMIS', 'MCTP2', 'FAP', 'SEMA6D', 'PDLIM5', 'EGFEM1P', 'GFRA1', 'MAMDC2', 'CD52', 'RXFP1', 'IL7R', 'GPR63', 'GNG7', 'CRABP1', 'ANXA5', 'TPMT', 'DGKH', 'B2M', 'SYNJ2', 'DCSTAMP', 'LOC105371828', 'LOC100421670', 'GALNT14', 'PRLR', 'HLA-L', 'MYOM2', 'SFMBT2', 'MREG', 'ATP10A', 'CD63', 'GSG1L', 'GAS2L3', 'SOCS2')
marker_list[["PAX6"]]<-c('CXCL14', 'RELN', 'CNR1', 'WIF1', 'AP1S2', 'GRIK2', 'CRH', 'CCK', 'ADARB2', 'DDR2', 'SP8', 'RGS12', 'HMBOX1', 'SLC6A1', 'ENTPD3', 'FXYD6', 'GAD1', 'SEZ6L', 'SPOCK3', 'ZNF385D', 'NR2F2', 'NXPH1', 'DOCK10', 'PNOC', 'TIMP2', 'SORCS3', 'MYO16', 'SCG2', 'ROBO2', 'PLS3', 'ZNF536', 'PCDH8', 'NECAB2', 'DNER', 'FSTL5', 'PAX6', 'CDH4', 'NPAS3', 'SERINC1', 'CPLX3', 'RBMS3', 'DLX6-AS1', 'SPOCK1', 'ARL4C', 'RAB3C', 'COL16A1', 'PTPRM', 'WWP1', 'LOC102724124', 'IGF1', 'SYNPR')
marker_list[["LAMP5"]]<-c('SLC6A1', 'GAD2', 'DNER', 'KIT', 'LAMP5', 'SV2C', 'GAD1', 'FSTL5', 'FXYD6', 'PTPRT', 'AP1S2', 'DOCK10', 'ADARB2', 'GRIK1', 'GRIA4', 'ZNF536', 'CNTNAP4', 'HAPLN1', 'NXPH2', 'ERBB4', 'CXCL14', 'RAB3C', 'GRIN3A', 'MYO16', 'ARL4C', 'PTCHD4', 'NXPH1', 'ATP1B2', 'FREM1', 'RAB3IP', 'SPOCK1', 'EYA4', 'KCNC2', 'NECAB2', 'CACNA2D1', 'DLX6-AS1', 'FAT1', 'PMEPA1', 'RELN', 'GRIK2', 'ADRA1A', 'PCP4L1', 'MTSS1', 'CA2', 'SPHKAP', 'BCL11B', 'TPD52L1', 'IGF1', 'SOX2-OT', 'PTPRM', 'ATP11C')
marker_list[["Oligodendrocyte"]]<-c('PLP1', 'MBP', 'PTGDS', 'CERCAM', 'TF', 'MOBP', 'MOG', 'CARNS1', 'LOC101929249', 'SCD', 'SLC44A1', 'CLDND1', 'CLDN11', 'PXK', 'RNASE1', 'CNTN2', 'PLEKHH1', 'ABCA2', 'PHLDB1', 'CRYAB', 'SPP1', 'ENPP2', 'QKI', 'TMEM144', 'APOD', 'MYRF', 'CNDP1', 'APLP1', 'TMEM63A', 'PPP1R14A', 'OPALIN', 'GPRC5B', 'MAG', 'UGT8', 'ST18', 'CAPN3', 'LAMP2', 'QDPR', 'EDIL3', 'HAPLN2', 'ERMN', 'TTYH2', 'NDRG1', 'PMP22', 'LINC00844', 'MAL', 'SOX2-OT', 'ABCA8', 'DBNDD2', 'ANLN', 'S100B')
marker_list[["SST"]]<-c('SST', 'GRIK1', 'GAD1', 'SYNPR', 'SPOCK3', 'LOC105372768', 'PAWR', 'LRP8', 'NMU', 'RAB3B', 'GRIN3A', 'LHX6', 'TRB', 'ROBO2', 'CDH13', 'KCNA3', 'KLHL5', 'NTM', 'GAD2', 'COL25A1', 'MAFB', 'CORT', 'STXBP6', 'SLC6A1', 'PNOC', 'ARX', 'TIMP2', 'GRIK2', 'FXYD6', 'GRIK1-AS2', 'SOX6', 'MAF', 'TRHDE', 'NXPH1', 'PLCH1', 'CDH9', 'GRIK3', 'PAM', 'ELFN1', 'FLT3', 'DLX1', 'PTPRM', 'NPAS3', 'CRHBP', 'RAB3IP', 'RNASET2', 'TMEM47', 'SPARCL1', 'ANK1', 'ADCYAP1R1', 'RBP4')
marker_list[["OPC"]]<-c('PTPRZ1', 'VCAN', 'OLIG1', 'PDGFRA', 'BCAN', 'NTM', 'APOD', 'COL9A1', 'C1orf61', 'PCDH15', 'EPN2', 'HIP1R', 'COL20A1', 'LOC105379054', 'ZBTB20', 'PLEKHH2', 'SCD5', 'CST3', 'OLIG2', 'PTN', 'SOX6', 'HTRA1', 'GPNMB', 'SNX22', 'SMOC1', 'SEMA5A', 'SULF2', 'LUZP2', 'NAV1', 'CCDC50', 'CSPG5', 'SPRY4', 'ADGRG1', 'LHFPL3', 'COL9A2', 'NKAIN4', 'BCAS1', 'KAT2B', 'SDC3', 'FERMT1', 'QKI', 'CSPG4', 'LPPR1', 'DSCAM', 'KANK1', 'COL11A1', 'FGFR1', 'MYT1', 'MEGF11', 'LRP4', 'PHLDA1')
marker_list[["Pericyte"]]<-c('PDGFRB', 'IGFBP7', 'ATP1A2', 'NOTCH3', 'ITIH5', 'SLC6A12', 'HIGD1B', 'RGS5', 'DCN', 'SLC6A1', 'PTN', 'IFITM3', 'CALD1', 'GNG11', 'BGN', 'FN1', 'IFITM1', 'SLC19A1', 'NDUFA4L2', 'PLXDC1', 'B2M', 'DLC1', 'TXNIP', 'COLEC12', 'PRELP', 'ADIRF', 'SPARC', 'SLC12A7', 'SLC6A13', 'CD9', 'EPS8', 'ISYNA1', 'LGALS1', 'IFITM2', 'EPAS1', 'MYO1B', 'STOM', 'HSPB1', 'EMP2', 'HES1', 'CYBA', 'PTGDS', 'MCAM', 'MYL12A', 'PDE7B', 'CD63', 'NR2F2', 'ARHGAP29', 'FRZB', 'TNS2', 'GJC1')
marker_list[["Endothelial"]]<-c('HLA-B', 'ABCG2', 'CLDN5', 'FLT1', 'HLA-E', 'B2M', 'A2M', 'ADGRF5', 'XAF1', 'HLA-C', 'ABCB1', 'SLC2A1', 'PODXL', 'EMP2', 'IFITM3', 'CLEC3B', 'EPAS1', 'IFI27', 'ID3', 'PTPRB', 'ENG', 'IFI44', 'MFSD2A', 'NEAT1', 'GPR85', 'ID1', 'VIM', 'SRGN', 'COBLL1', 'IFI44L', 'ITM2A', 'GPCPD1', 'DUSP1', 'HLA-H', 'LIMS2', 'MT2A', 'TM4SF1', 'GIMAP7', 'IGFBP7', 'VWF', 'SLC7A5', 'STOM', 'ITGA6', 'LMO2', 'PDGFB', 'SLC38A5', 'ESAM', 'KLF2', 'SLC7A1', 'HLA-A', 'HSPB1')
marker_list[["Microglia"]]<-c('LPAR6', 'CD74', 'ADAM28', 'P2RY12', 'CSF1R', 'CX3CR1', 'C3', 'RNASET2', 'SAT1', 'A2M', 'MAF', 'ITGAX', 'BHLHE41', 'BLNK', 'SRGAP2', 'HLA-DRB1', 'APBB1IP', 'ZFP36L2', 'LAPTM5', 'P2RY13', 'IFNGR1', 'PTPRC', 'CSF3R', 'ZFP36L1', 'CYBA', 'USP53', 'SFMBT2', 'C1QC', 'CSF2RA', 'LINC01268', 'SAMSN1', 'HLA-DRA', 'RCSD1', 'ADAP2', 'FYB', 'LTC4S', 'TYROBP', 'CD53', 'ARHGAP24', 'IFI16', 'C1QB', 'SLCO2B1', 'SLC1A3', 'MS4A7', 'GPR34', 'AIF1', 'TAL1', 'ITPR2', 'DOCK8', 'HLA-DPA1', 'MEF2A')
saveRDS(marker_list,"hg38_markerlist.rds")

#Human cortex marker list is from https://cells.ucsc.edu/?ds=aging-brain
#And stored in https://docs.google.com/spreadsheets/d/15YwK2lWh018IyxgQsRxnrINxBFlG3XtU5kE8gArYxJ4/edit?usp=sharing
marker_list<-NULL
marker_list[["SMC"]]<-c('Acta2', 'Tagln', 'Myl9', 'Tpm2', 'Mylk', 'Myh11', 'Cald1', 'Tpm1', 'Crip1', 'Mustn1', 'Flna', 'Pln', 'Sncg', 'Gm13889', 'Csrp1', 'Lmod1', 'Ppp1r14a', 'Des', 'Rgs4', 'Palld', 'Fxyd1', 'Nexn', 'Rasl11a', 'Aspn', 'Filip1l', 'Map1b', 'Pde3a', 'Igfbp7', 'Gm13861', 'Ccnd2', 'Snhg18', 'Lgals1', 'Gja4', 'Gucy1a1', 'Sparcl1', 'Errfi1', 'Gper1', 'Lbh', 'Hspb2', 'Crispld2', 'Mfge8', 'Cryab', 'Bgn', 'Cnn1', 'Higd1b', 'Rrad', 'Rasl12', 'Wtip', 'Notch3', 'Nrip2', 'Perp')
marker_list[["mNeur"]]<-c('Meg3', 'Snhg11', 'Rtn1', 'Bex2', 'Ly6h', 'Celf4', 'Syt1', 'Snap25', 'Pcsk1n', 'Map1b', 'Stmn1', 'Stmn3', 'Ahi1', '6330403K07Rik', 'Reln', 'Stmn2', 'Pcp4', 'Gng3', 'Uchl1', 'Zcchc18', 'Bcl11a', 'Peg3', 'Nap1l5', 'Pcsk2', 'Resp18', 'Atp1b1', 'Rufy3', 'Aplp1', 'Scg2', 'Gria2', 'Scg5', 'Nhlh2', 'Sgip1', 'Gm1673', 'Gpm6a', 'Nrxn3', 'Syn2', 'Lhx1os', 'Mapt', 'Cacna2d2', 'Lhx1', 'Tubb3', 'Elavl3', 'Cck', 'Fxyd6', 'Map7d2', 'Negr1', 'Cit', 'Thy1', 'Fxyd7', 'Cbarp')
marker_list[["Hb_EC"]]<-c('Hba-a2', 'Hba-a1', 'Hbb-bt', 'Hbb-bs', 'Alas2', 'Cldn5', 'Ly6c1', 'Ly6a', 'Itm2a', 'Flt1', 'Cxcl12', 'Spock2', 'Egfl7', 'Pltp', 'Slco1a4', 'Snca', 'Slc9a3r2', 'Bsg', 'Hspb1', 'Pglyrp1', 'Ctla2a', 'Apold1', 'Ifitm3', 'Bpgm', 'Id1', 'Car4', 'Klf2', 'Igfbp7', 'Crip1', 'Abhd2', '8430408G22Rik', 'Rgs5', 'Foxq1', 'Cyr61', 'Cd24a', 'Slc25a37', 'Tpm1', 'Rec114', 'Hspa1a', 'Ptn', 'Ftl1', 'Pam', 'Ube2c', 'Cald1', 'Vtn', 'Higd1b', 'Jund', 'Sparcl1', 'Tprgl', 'Jun', 'Nfkbia')
marker_list[["imNeur"]]<-c('Sox11', 'Stmn1', 'Sox4', 'Ccnd2', 'Tubb3', 'Meis2', 'Stmn2', 'Marcksl1', 'Cd24a', 'Tubb2b', 'Rtn1', 'Pfn2', 'Dcx', 'Dlx6os1', 'Map1b', 'Nrxn3', 'Hmgn2', 'Ncam1', 'Nnat', 'Pbx1', 'Celf4', 'Meg3', 'Elavl3', 'Dlx1', 'Igfbpl1', 'Stmn4', 'Gm1673', 'Bcl11a', 'Gm17750', 'Stmn3', 'Cdk5r1', 'H3f3b', '6330403K07Rik', 'Serf1', 'Basp1', 'Gad2', 'Hist3h2ba', 'Ptprs', 'Dlx2', 'Mpped2', 'Gng3', 'Islr2', 'Ccdc88a', 'Fxyd6', 'Auts2', 'Arx', 'Dlx5', 'Tiam2', 'Sp9', 'Foxg1', 'Etv1')
marker_list[["NRP"]]<-c('Sox11', 'Hmgn2', 'Stmn1', 'Ccnd2', 'Pclaf', 'Marcksl1', 'Sox4', 'Cd24a', 'Marcks', 'Mdk', 'Cenpf', 'Meis2', 'Pantr1', 'Gm1673', 'Pfn2', 'Ube2c', 'Dlx1', 'Rtn1', 'Hist1h2ap', 'Tubb3', 'Elavl3', 'Ncam1', 'Dlx2', 'Bex2', 'Dcx', 'Nnat', 'Bcl11a', 'Gm17750', 'Nrxn3', 'Foxg1', 'Arx', 'Ptprs', 'Pou3f2', 'Igfbpl1', 'Tubb2b', 'Fxyd6', 'Ppp1r14b', 'Serf1', 'Pax6', 'Pbx1', 'Elavl2', 'Map1b', 'Ccnb1', 'Mfap2', 'Dlx6os1', 'Trim2', 'Stmn3', 'Hes5', 'Sox2', 'Fabp5', 'Mpped2')
marker_list[["OLG"]]<-c('Mgp', 'Igfbp6', 'Cldn11', 'Nov', 'Igfbp5', 'Rspo3', 'Nbl1', 'Lmo4', 'Slc47a1', 'Rbp1', '1500015O10Rik', 'Efemp1', 'Ptgds', 'Emp3', 'Prg4', 'Atp1b1', 'Col1a2', 'Emb', 'Apod', 'Ogn', 'Cryab', 'Bgn', 'Pcolce', 'Perp', 'Foxp2', 'Cald1', 'Gpc3', 'S100b', 'Cd74', 'Prrx2', 'Alcam', 'Ank2', 'Aspa', 'Asgr1', 'Nupr1', 'Thbs1', 'Serping1', 'Scara3', 'Timp2', 'Gjb6', 'Gjb2', 'Foxc2', 'Spp1', 'Foxd1', 'Gstm5', 'Penk', 'Zic1', 'Cped1', 'Dapl1', 'Fstl1', 'Col1a1')
marker_list[["EC"]]<-c('Cldn5', 'Flt1', 'Ly6c1', 'Ly6a', 'Itm2a', 'Cxcl12', 'Egfl7', 'Spock2', 'Pltp', 'Slco1a4', 'Pglyrp1', 'Id1', 'Slc9a3r2', 'Ifitm3', 'Bsg', 'Apold1', 'Hspb1', 'Ctla2a', 'Klf2', 'Car4', 'Foxq1', 'Crip1', 'Igfbp7', '8430408G22Rik', 'Hspa1a', 'Edn1', 'Abhd2', 'Cyr61', 'Jun', 'Ptn', 'Nfkbia', 'Jund', 'Junb', 'Tpm1', 'Gkn3', 'Stmn2', 'Fos', 'Rgs5', 'Trim47', 'Rbpms', 'Tprgl', 'Cdkn1c', 'Pam', 'Sparcl1', 'Usp53', 'Slc38a3', 'Stmn1', 'Ctnnbip1', 'Gm26532', 'Sept4', 'B430010I23Rik')
marker_list[["AC"]]<-c('Aldoc', 'Slc1a3', 'Slc1a2', 'Plpp3', 'Ntm', 'Atp1a2', 'Mt2', 'Mt3', 'Gpr37l1', 'Ndrg2', 'Gm3764', 'Gja1', 'Clu', 'Atp1b2', 'Ntsr2', 'Mt1', 'Bcan', 'Cldn10', 'Prdx6', 'Htra1', 'Prnp', 'Dclk1', 'Gstm5', 'Cxcl14', 'Ttyh1', 'Apoe', 'Pla2g7', 'Acsl3', 'Ntrk2', 'Dbi', 'Slc6a11', 'Gpm6b', 'Mmd2', 'Ptprz1', 'Gria2', 'Mfge8', 'Sparcl1', 'Slc4a4', 'F3', 'Cpe', 'Tspan7', 'Gstm1', 'Gpm6a', 'Rorb', 'Ckb', 'Dtna', 'Slc6a1', 'Cspg5', 'Scd2', 'Slc7a10', 'Ptn')
marker_list[["TNC"]]<-c('6330403K07Rik', 'S100a6', 'Prdx6', 'Ntrk2', 'Fbxo2', 'Dbi', 'Tmem47', 'Nnat', 'Mt3', 'Cpe', 'Gpm6b', 'Mt2', 'Mlc1', 'Igfbp5', 'Fxyd1', 'Mt1', 'Scd2', 'Id4', 'Sox9', 'Aldoc', 'Slc1a3', 'Gfap', 'Apoe', 'Thrsp', 'Gstm1', 'Mdk', 'Pcsk1n', 'Meg3', 'Zcchc18', 'Myoc', 'Rax', 'S100a1', 'Ttyh1', 'Mgst1', 'Six3', 'Dtna', 'Riiad1', 'Ndrg2', 'Chchd10', 'Pbx1', 'Map1b', 'Mia', 'Pygb', 'Kctd14', 'Gprc5b', 'Slc1a2', 'Scg5', 'Ank2', 'Atp1b2', 'Apoc1', 'Nrxn2')
marker_list[["EPC"]]<-c('Ccdc153', 'Tmem212', 'Rarres2', 'Tppp3', 'Riiad1', 'Gm19935', 'Nnat', 'Chchd10', 'Mia', 'Cd24a', 'Mt3', 'Dbi', 'Clu', 'S100b', 'Calml4', 'Mt2', 'Hspa2', '1500015O10Rik', 'Gm14964', 'Sox9', 'Tmem47', 'Mlc1', 'Prdx6', 'Ntrk2', 'Fam213a', 'Ramp1', 'Mt1', 'Cpe', 'Scd2', 'Mgst1', 'Gm1673', 'Lmo4', 'Prelp', 'Ppil6', 'Gstm1', 'Id4', 'Gm973', 'Ttll3', 'Plekhb1', 'Dalrd3', 'Cfap100', 'Zcchc18', 'S100a6', 'Trp53bp2', 'Aqp4', 'Sox2', 'Mro', 'Ckb', 'Wdr60', 'Tspan15', 'Spint2')
marker_list[["MG"]]<-c('Ctss', 'C1qc', 'Hexb', 'C1qa', 'C1qb', 'Tyrobp', 'Selplg', 'Trem2', 'Tmem119', 'Cx3cr1', 'Fcer1g', 'Lgmn', 'Csf1r', 'Laptm5', 'Rgs10', 'Ctsd', 'Fcrls', 'P2ry12', 'Fcgr3', 'Olfml3', 'Ctsz', 'Gpr34', 'Lpcat2', 'Pld4', 'Grn', 'Unc93b1', 'Aif1', 'Ly86', 'Vsir', 'Siglech', 'Ccl4', 'Ccl3', 'Trf', 'Irf8', 'Cd53', 'Bin1', 'Hexa', 'Atf3', 'Junb', 'Spi1', 'Marcks', 'Cd83', 'Hk2', 'Ltc4s', 'Cd37', 'Coro1a', 'Ptgs1', 'Tgfbr1', 'Slc15a3', 'Lair1', 'Hpgds')
marker_list[["MNC"]]<-c('Cd52', 'Lyz2', 'Plac8', 'Fcer1g', 'Msrb1', 'Tyrobp', 'Coro1a', 'Alox5ap', 'Cybb', 'Lgals3', 'Lsp1', 'Lst1', 'S100a6', 'Ifitm6', 'Ccl6', 'Ifitm3', 'Rac2', 'Cytip', 'Pim1', 'Lcp1', 'Hp', 'Stk17b', 'Emp3', 'Ifi27l2a', 'Spi1', 'Itgal', 'Ms4a6c', 'S100a4', 'Wfdc17', 'Il1b', 'S100a8', 'S100a9', 'Samsn1', 'Gmfg', 'Slpi', 'Ftl1', 'Cebpb', 'Napsa', 'Ly6c2', 'Ear2', 'Retnlg', 'Wfdc21', 'Cd53', 'Mcemp1', 'G0s2', 'Plbd1', 'Cd74', 'AB124611', 'Rps11', 'Sirpb1c', 'Gpr141')
marker_list[["PC"]]<-c('Vtn', 'Higd1b', 'Ndufa4l2', 'Rgs5', 'Cald1', 'Myl9', 'Cox4i2', 'Sept4', 'Pdgfrb', 'Kcnj8', 'Ifitm1', 'Rgs4', 'Ptn', 'Atp1a2', 'P2ry14', 'Rarres2', 'Atp13a5', 'Abcc9', 'Mgp', 'Sod3', 'Slc19a1', 'Slc6a20a', 'Nbl1', 'Art3', 'Zic1', 'Ppp1r14a', 'Lhfp', 'Gper1', 'Gucy1b1', 'Notch3', 'Bgn', 'Pitpnc1', 'Car4', 'Tbx3os1', 'Gja4', 'Gm13861', 'Perp', 'Phlda1', 'Il34', 'Uchl1', 'Slc38a11', 'Carmn', 'Gpx8', 'Mfge8', 'Pth1r', 'Ecm2', 'Gucy1a1', 'Igfbp7', 'Pde4d', 'Ddit4l', 'G0s2')
marker_list[["MAC"]]<-c('Pf4', 'Lyz2', 'Ms4a7', 'Dab2', 'Mrc1', 'Fcer1g', 'C1qc', 'Ctsc', 'Wfdc17', 'C1qa', 'C1qb', 'Tyrobp', 'F13a1', 'Maf', 'Stab1', 'Trf', 'Apoe', 'Ccl7', 'Ms4a6c', 'Ccl24', 'Ccl2', 'Lst1', 'Hexa', 'Clec4n', 'Cd14', 'Cbr2', 'Ccl12', 'Cd209f', 'Cybb', 'Fcrls', 'Grn', 'Ptpn18', 'Ifi207', 'Cxcl2', 'Cd68', 'Ms4a6b', 'Fcgr2b', 'Marcksl1', 'Lgmn', 'Ftl1', 'Csf1r', 'Ctss', 'Lgals1', 'Folr2', 'Igfbp4', 'Ltc4s', 'C5ar1', 'Fcgr3', 'Tslp', 'Tgfbi', 'Cd163')
saveRDS(marker_list,"mm10_markerlist.rds")

#Gross cell type list (for both organisms)
marker_list<-NULL
marker_list[["Oligodendrocyte"]]<-c('Mobp','Cldn1','Prox1','Olig1')
marker_list[["Polydendrocyte"]]<-c('Cspg4','Pdgfra')
marker_list[["Astrocyte"]]<-c('Gfap','GluI','Slc1a2','Agt')
marker_list[["Ex_Neuron"]]<-c('Satb2','Cux2','Rorb','Pcp4','Foxp2','Col19a1','Slc17a7')
marker_list[["Microglia"]]<-c('C1qa','C1qc','Cxcr1')
marker_list[["Inhib_Neuron"]]<-c('Cnr1','Bcl11b','Gad1','Dlx1','Dlx2')
saveRDS(marker_list,"grosscelltype_markerlist.rds")


Plot Cell Type Markers

#Function to plot cell type markers
#For now just using the gross celltype marker list
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)

region_check<-function(i,object=hg38_atac){
    return(nrow(as.data.frame(LookupGeneCoords(object=object,gene=toupper(gene_list[i])))))
    }

marker_plot<-function(j,k=celltype_name,l=hg38_atac,m="hg38_",n="seurat_clusters"){
    plt<-CoveragePlot(
        object = l,
        region = j,
        group.by=n,
        extend.upstream = 1000,
        extend.downstream = 1000,
        ncol = 1,
        tile=T,
        tile.size=500,
        tile.cells=downsamp
    )
    pdf(paste0("./marker_sets/",m,k,"_",j,"_genebody_accessibility.pdf"))
    print(plt)
    dev.off()
}


feature_plot<-function(j,k=celltype_name,l=hg38_atac,m="hg38_",n="seurat_clusters"){
    plt<-FeaturePlot(
        object = l,
        features = j,
        order=T
    )
    pdf(paste0("./marker_sets/",m,k,"_",j,"_geneactivity.pdf"))
    print(plt)
    dev.off()
}


hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
marker_list<-readRDS("./marker_sets/grosscelltype_markerlist.rds")
summary(hg38_atac@meta.data$seurat_clusters) #setting tile cells to 90 to downsample
downsamp=90

for (x in 1:length(marker_list)){
    gene_list<-marker_list[[x]]
    gene_list<-gene_list[as.numeric(unlist(lapply(1:length(gene_list),FUN=region_check)))==1] #subset for genes with reads across gene body
    gene_list<-toupper(gene_list) #make gene names uppercase
    celltype_name<-names(marker_list)[x]
    mclapply(gene_list,FUN=marker_plot,k=celltype_name,mc.cores=20)
    gene_list<-gene_list[gene_list %in% row.names(hg38_atac@assays$GeneActivity@data)] #subset for genes with GA 
    mclapply(gene_list,FUN=feature_plot,k=celltype_name,mc.cores=20)
}

for (i in list.files(path="./marker_sets",pattern="markerset_hg38")){
    system(paste0("slack -F ./marker_sets/",i," ryan_todo"))
}

mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
marker_list<-readRDS("./marker_sets/grosscelltype_markerlist.rds")
summary(mm10_atac@meta.data$seurat_clusters) #setting tile cells to 99 to downsample
downsamp=30

for (x in 1:length(marker_list)){
    gene_list<-marker_list[[x]]
    gene_list<-gene_list[as.numeric(unlist(lapply(1:length(gene_list),FUN=region_check)))==1]
    celltype_name<-names(marker_list)[x]
    mclapply(gene_list,FUN=marker_plot,k=celltype_name,l=mm10_atac,m="mm10_",mc.cores=20) 
    gene_list<-gene_list[gene_list %in% row.names(mm10_atac@assays$GeneActivity@data)] #subset for genes with GA 
    mclapply(gene_list,FUN=feature_plot,k=celltype_name,l=mm10_atac,m="mm10_",mc.cores=20)

}


#For concatenating cell types into a single scrollable pdf
celltype=`ls mm10*genebody_accessibility.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo mm10_${i}_*genebody_accessibility.pdf` markerset_mm10_${i}.genebody.pdf; done

celltype=`ls mm10*geneactivity.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo mm10_${i}_*geneactivity.pdf` markerset_mm10_${i}.GA.pdf; done

celltype=`ls hg38*genebody_accessibility.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo hg38_${i}_*genebody_accessibility.pdf` markerset_hg38_${i}.genebody.pdf; done

celltype=`ls hg38*geneactivity.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo hg38_${i}_*geneactivity.pdf` markerset_hg8_${i}.GA.pdf; done


Differential Accessibillity on Clusters


setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
library(ggplot2)
library(ggrepel)
library(dplyr)

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")



#Perform One vs. rest DA enrichment

write("Performing one vs. rest DA enrichment per annotation grouping supplied.", stderr())

#set up an empty list for looping through
hg38_da_peaks<-list()
mm10_da_peaks<-list()

#define DA functions for parallelization
#Use LR test for atac data
da_one_v_rest<-function(i,obj,group){
    da_peaks_tmp <- FindMarkers(
        object = obj,
        ident.1 = i,
        group.by = group,
        test.use = 'LR',
        latent.vars = 'nCount_peaks',
        only.pos=T
        )
    da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
    closest_genes <- ClosestFeature(obj,da_peaks_tmp$da_region)
    da_peaks_tmp<-cbind(da_peaks_tmp,closest_genes)
    da_peaks_tmp$enriched_group<-c(i)
    da_peaks_tmp$compared_group<-c("all_other_cells")
    return(da_peaks_tmp)
  }


da_one_v_one<-function(i,obj,group,j_list){
    i<-as.character(i)
    da_tmp_2<-list()
    for (j in j_list){
        if ( i != j){
        da_peaks_tmp <- FindMarkers(
            object = obj,
            ident.1 = i,
            ident.2 = j,
            group.by = group,
            test.use = 'LR',
            latent.vars = 'nCount_peaks',
            only.pos=T
            )
        da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
        closest_genes <- ClosestFeature(obj,da_peaks_tmp$da_region)
        da_peaks_tmp<-cbind(da_peaks_tmp,closest_genes)
        da_peaks_tmp$enriched_group<-c(i)
        da_peaks_tmp$compared_group<-c(j)
        da_tmp_2[[paste(i,j)]]<-da_peaks_tmp
        }
    }
    return(da_tmp_2)
  }

#Perform parallel application of DA test
library(parallel)
n.cores=length(unique(hg38_atac@meta.data$seurat_clusters))
hg38_da_peaks<-mclapply(
    unique(hg38_atac@meta.data$seurat_clusters),
    FUN=da_one_v_rest,
    obj=hg38_atac,
    group="seurat_clusters",
    mc.cores=n.cores)

n.cores=length(unique(mm10_atac@meta.data$seurat_clusters))
mm10_da_peaks<-mclapply(
    unique(mm10_atac@meta.data$seurat_clusters),
    FUN=da_one_v_rest,
    obj=mm10_atac,
    group="seurat_clusters",
    mc.cores=n.cores)

#Merge the final data frame from the list for 1vrest DA
hg38_da_peaks<-do.call("rbind",hg38_da_peaks)
mm10_da_peaks<-do.call("rbind",mm10_da_peaks)

write("Outputting One v Rest DA Table.", stderr())
write.table(hg38_da_peaks,file="hg38.onevrest.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)
write.table(mm10_da_peaks,file="mm10.onevrest.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)


dat<-read.table("hg38.onevrest.da_peaks.txt",header=T,sep="\t")
dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(enriched_group)))+geom_point(aes(alpha=0.1))+geom_label_repel(dat=dat_select,aes(label=gene_name,size=-distance),force=3)+theme_bw()
pdf("hg38_da_peaks.pdf")
plt
dev.off()

dat<-read.table("mm10.onevrest.da_peaks.txt",header=T,sep="\t")
dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(enriched_group)))+geom_point(aes(alpha=0.1))+geom_label_repel(dat=dat_select,aes(label=gene_name,size=-distance),force=3)+theme_bw()
pdf("mm10_da_peaks.pdf")
plt
dev.off()

#Empty list to rerun for 1v1 comparisons
hg38_da_peaks<-list()
mm10_da_peaks<-list()
    
n.cores=length(unique(hg38_atac@meta.data$seurat_clusters))
hg38_da_peaks<-mclapply(
    unique(hg38_atac@meta.data$seurat_clusters),
    FUN=da_one_v_one,
    obj=hg38_atac,
    group="seurat_clusters",
    j_list=do.call("as.character",list(unique(hg38_atac@meta.data$seurat_clusters))),
    mc.cores=n.cores)

n.cores=length(unique(mm10_atac@meta.data$seurat_clusters))
mm10_da_peaks<-mclapply(
    unique(mm10_atac@meta.data$seurat_clusters),
    FUN=da_one_v_one,
    obj=mm10_atac,
    group="seurat_clusters",
    j_list=do.call("as.character",list(unique(mm10_atac@meta.data$seurat_clusters))),
    mc.cores=n.cores)

#Merge the final data frame from the list for 1v1 DA
hg38_da_peaks<-do.call("rbind",do.call("rbind",hg38_da_peaks))
mm10_da_peaks<-do.call("rbind",do.call("rbind",mm10_da_peaks))

write("Outputting One v One DA Table.", stderr())
write.table(hg38_da_peaks,file="hg38.onevone.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)
write.table(mm10_da_peaks,file="mm10.onevone.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)


Labeling Cell Types


setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
library(ggplot2)
library(ggrepel)
library(dplyr)

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

#Cell types deteremined by TF motif and marker genebody accessibility
hg38_atac@meta.data$celltype<-"NA"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("2"),]$celltype<-"excitatory_neuron"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("3","4"),]$celltype<-"inhibitory_neuron"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("0","1"),]$celltype<-"oligodendrocytes"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("7"),]$celltype<-"polydendrocytes"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("5"),]$celltype<-"microglia"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("6"),]$celltype<-"astrocyte"
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")

mm10_atac@meta.data$celltype<-"NA"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("0","2"),]$celltype<-"excitatory_neuron"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("1"),]$celltype<-"inhibitory_neuron"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("4"),]$celltype<-"oligodendrocytes_polydendrocytes"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("5"),]$celltype<-"microglia"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("3"),]$celltype<-"astrocyte"
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")

#Also adding TSS enrichment values to meta data
hg38_tss<-read.table("hg38.bbrd.q10.tss_reads.value",header=F)
colnames(hg38_tss)<-c("cellID","TSS_enrichment")
mm10_tss<-read.table("mm10.bbrd.q10.tss_reads.value",header=F)
colnames(mm10_tss)<-c("cellID","TSS_enrichment")

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

hg38_annot<-hg38_atac@meta.data
mm10_annot<-mm10_atac@meta.data

hg38_annot<-merge(hg38_annot,hg38_tss,by="cellID")
mm10_annot<-merge(mm10_annot,mm10_tss,by="cellID")

row.names(hg38_annot)<-hg38_annot$cellID
row.names(mm10_annot)<-mm10_annot$cellID

hg38_atac@meta.data<-hg38_annot
mm10_atac@meta.data<-mm10_annot

saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")

Output Data

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
library(ggplot2)
library(ggrepel)
library(dplyr)

hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

hg38_dat<-as.data.frame(hg38_atac@meta.data)
mm10_dat<-as.data.frame(mm10_atac@meta.data)

#UMAP projection coords in 
#mm10_atac@reductions$umap@cell.embeddings
#hg38_atac@reductions$umap@cell.embeddings

write.table(hg38_dat,"hg38_cellsummaries.tsv",col.names=T,row.names=F,sep="\t",quote=F)
write.table(mm10_dat,"mm10_cellsummaries.tsv",col.names=T,row.names=F,sep="\t",quote=F)

Plotting projected read counts for human and mouse cells

library(ggplot2)
library(dplyr)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

hg38_dat<-read.table("hg38_projected_reads.txt",sep="\t")
colnames(hg38_dat)<-c("perc_uniq","read_count","uniqread_count","cellID")
hg38_dat$species<-"human"
mm10_dat<-read.table("mm10_projected_reads.txt",sep="\t")
colnames(mm10_dat)<-c("perc_uniq","read_count","uniqread_count","cellID")
mm10_dat$species<-"mouse"
mm10_sum<-read.table("mm10_cellsummaries.tsv",head=T)
hg38_sum<-read.table("hg38_cellsummaries.tsv",head=T)

dat<-rbind(hg38_dat,mm10_dat)
dat<- as.data.frame(dat %>% 
                    group_by(species,perc_uniq) %>% 
                    summarize(mean=mean(log10(uniqread_count)),
                              sd=sd(log10(uniqread_count)),
                              median=median(log10(uniqread_count))))

ggplot(dat,aes(x=as.numeric(perc_uniq),fill = species,color=species))+
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd,color=NULL),alpha=0.1) +
geom_line(aes(y=mean,linetype="solid"))+
geom_line(aes(y=median,linetype="dashed")) +
geom_point(data=hg38_sum,aes(y=log10(uniq_reads),x=perc_uniq/100),color="red",fill="red",size=0.5)+
geom_point(data=mm10_sum,aes(y=log10(uniq_reads),x=perc_uniq/100),color="blue",fill="blue",size=0.5)+
theme_bw() + scale_x_reverse()


ggsave(file="projected_readcount.png")
ggsave(file="projected_readcount.pdf")
system("slack -F projected_readcount.pdf s3")

Subclustering of Cell Types

Starting with human cells

library(Signac)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(cisTopic)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(Matrix)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
set.seed(1234)
library(dplyr)
library(ggrepel)


setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")


######FUNCTIONS#####
cistopic_generation<-function(x,celltype.x,species="hg38"){
#Perform cistopic on subclusters of data 
    atac_sub<-subset(x,subset=celltype==celltype.x) 
    cistopic_counts_frmt<-atac_sub$peaks@counts
    row.names(cistopic_counts_frmt)<-sub("-", ":", row.names(cistopic_counts_frmt))
    sub_cistopic<-cisTopic::createcisTopicObject(cistopic_counts_frmt)
    sub_cistopic_models<-cisTopic::runWarpLDAModels(sub_cistopic,topic=c(5,10,20:30),nCores=12,addModels=FALSE)
    saveRDS(sub_cistopic_models,file=paste(species,celltype.x,".CisTopicObject.Rds",sep="_"))

    pdf(paste(species,celltype.x,"_model_selection.pdf",sep="_"))
    par(mfrow=c(3,3))
    sub_cistopic_models <- selectModel(sub_cistopic_models, type='maximum')
    sub_cistopic_models <- selectModel(sub_cistopic_models, type='perplexity')
    sub_cistopic_models<- selectModel(sub_cistopic_models, type='derivative')
    dev.off()
    }



#UMAP Projection and clustering on selected cistopic model
clustering_loop<-function(topicmodel_list.=topicmodel_list,object_input=hg38_atac,celltype.x,topiccount_list.=topic_count_list){
    #set up subset object again
    atac_sub<-subset(object_input,subset=celltype==celltype.x) 
    #select_topic
    models_input<-readRDS(paste(species,celltype.x,".CisTopicObject.Rds",sep="_"))
    cisTopicObject<-cisTopic::selectModel(models_input,select=topiccount_list.[celltype.x],keepModels=F)
    
    #perform UMAP on topics
    topic_df<-as.data.frame(cisTopicObject@selected.model$document_expects)
    row.names(topic_df)<-paste0("Topic_",row.names(topic_df))
    dims<-as.data.frame(uwot::umap(t(topic_df),n_components=2))
    row.names(dims)<-colnames(topic_df)
    colnames(dims)<-c("x","y")
    dims$cellID<-row.names(dims)
    dims<-merge(dims,object_input@meta.data,by.x="cellID",by.y="row.names")

    #get cell embeddings
    cell_embeddings<-as.data.frame(cisTopicObject@selected.model$document_expects)
    colnames(cell_embeddings)<-cisTopicObject@cell.names
    n_topics<-nrow(cell_embeddings)
    row.names(cell_embeddings)<-paste0("topic_",1:n_topics)
    cell_embeddings<-as.data.frame(t(cell_embeddings))
    
    #get feature loadings
    feature_loadings<-as.data.frame(cisTopicObject@selected.model$topics)
    row.names(feature_loadings)<-paste0("topic_",1:n_topics)
    feature_loadings<-as.data.frame(t(feature_loadings))
    
    #combine with seurat object for celltype seuratobject  
    cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(cell_embeddings),loadings=as.matrix(feature_loadings),assay="peaks",key="topic_")
    umap_dims<-as.data.frame(as.matrix(dims[2:3]))
    colnames(umap_dims)<-c("UMAP_1","UMAP_2")
    row.names(umap_dims)<-dims$cellID
    cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(umap_dims),assay="peaks",key="UMAP_")
    atac_sub@reductions$cistopic<-cistopic_obj
    atac_sub@reductions$umap<-cistopic_umap
    #finally recluster
    n_topics<-ncol(Embeddings(atac_sub,reduction="cistopic"))
    #Clustering with multiple resolutions to account for different celltype complexities
    atac_sub <- FindNeighbors(object = atac_sub, reduction = 'cistopic', dims = 1:n_topics)
    atac_sub <- FindClusters(object = atac_sub,resolution=0.1)
    atac_sub <- FindClusters(object = atac_sub,verbose = TRUE,resolution=0.2)
    atac_sub <- FindClusters(object = atac_sub,verbose = TRUE,resolution=0.5)
    atac_sub <- FindClusters(object = atac_sub,verbose = TRUE,resolution=0.9)
    
    saveRDS(atac_sub,paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
    plt<-DimPlot(atac_sub,group.by=c('peaks_snn_res.0.1','peaks_snn_res.0.2','peaks_snn_res.0.5','peaks_snn_res.0.9'))
    ggsave(plt,file=paste(species,celltype.x,"clustering.pdf",sep="_"))
}


#Set up DA Test
da_one_v_one<-function(i,obj,group,j_list){
    i<-as.character(i)
    da_tmp_2<-list()
    for (j in j_list){
        if ( i != j){
        da_peaks_tmp <- FindMarkers(
            object = obj,
            ident.1 = i,
            ident.2 = j,
            group.by = group,
            test.use = 'LR',
            latent.vars = 'nCount_peaks',
            only.pos=T
            )
        da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
        da_peaks_tmp$enriched_group<-c(i)
        da_peaks_tmp$compared_group<-c(j)
        da_tmp_2[[paste(i,j)]]<-da_peaks_tmp
        }
    }
    return(da_tmp_2)
  }


#Generate chromvar matrix per celltype
chromvar_loop<-function(celltype.x){
    atac_sub<-readRDS(paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
    # Add the Motif object to the assay
    atac_sub<- SetAssayData(object = atac_sub, assay = 'peaks', slot = 'motifs', new.data = motif)
    atac_sub <- RegionStats(object = atac_sub, genome = BSgenome.Hsapiens.UCSC.hg38)
    atac_sub<- RunChromVAR(object = atac_sub,genome = BSgenome.Hsapiens.UCSC.hg38)
    tfList <- getMatrixByID(JASPAR2020, ID=row.names(atac_sub@assays$chromvar@data)) 
    tfList <-unlist(lapply(names(tfList), function(x) name(tfList[[x]])))
    row.names(atac_sub@assays$chromvar@data)<-tfList
    #setting final_clusters based on given resolution
    atac_sub$final_clusters<-atac_sub@meta.data[,which(endsWith(colnames(atac_sub@meta.data),
                                                         as.character(resolution_list[celltype.x])))]
    saveRDS(atac_sub,paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
    
    #Run Differential TF accessibility across clusters
    DefaultAssay(atac_sub) <- 'chromvar'
    #set up an empty list for looping through
    da_tfs<-list()
    da_tfs<-lapply(
        unique(atac_sub$final_clusters),
        FUN=da_one_v_one,
        obj=atac_sub,
        j_list=do.call("as.character",list(unique(atac_sub$final_clusters))),
        group="final_clusters")

    
    #Merge the final data frame from the list for 1v1 DA
    da_tfs2<-do.call("rbind",do.call("rbind",da_tfs))
    write("Outputting One v One DA Table.", stderr())
    write.table(da_tfs2,file=paste(species,celltype.x,"onevone.tffactors.txt",sep="_"),sep="\t",col.names=T,row.names=T,quote=F)
    dat<-read.table(file=paste(species,celltype.x,"onevone.tffactors.txt",sep="_"))
    dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 5 most significant peaks to label
    
    plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(compared_group)))+
    geom_point(aes(alpha=0.1))+
    geom_label_repel(dat=dat_select,aes(label=da_region),force=3)+
    theme_bw()+facet_wrap(~enriched_group)
    ggsave(plt,file=paste(species,celltype.x,"tffactors.pdf",sep="_"))
    system(paste0("slack -F ",paste(species,celltype.x,"tffactors.pdf",sep="_")," ryan_todo"))

}


#Perform DA test for gene activity scores
geneactivity_loop<-function(celltype.x){
    atac_sub<-readRDS(paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
    #Run Differential GA accessibility across clusters
    DefaultAssay(atac_sub) <- 'GeneActivity'
    #set up an empty list for looping through
    da_ga<-list()

    da_ga<-lapply(
        unique(atac_sub$final_clusters),
        FUN=da_one_v_one,
        obj=atac_sub,
        j_list=do.call("as.character",list(unique(atac_sub$final_clusters))),
        group="final_clusters")

    #Merge the final data frame from the list for 1v1 DA
    da_ga2<-do.call("rbind",do.call("rbind",da_ga))
    write("Outputting One v One DA Table.", stderr())
    write.table(da_ga2,file=paste(species,celltype.x,"onevone.geneactivity.txt",sep="_"),sep="\t",col.names=T,row.names=T,quote=F)
    dat<-read.table(file=paste(species,celltype.x,"onevone.geneactivity.txt",sep="_"))
    dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
    plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(compared_group)))+
    geom_point(aes(alpha=0.1))+
    geom_label_repel(dat=dat_select,aes(label=da_region),force=3)+
    theme_bw()+facet_wrap(~enriched_group)
    ggsave(plt,file=paste(species,celltype.x,"geneactivity.pdf",sep="_"))
    system(paste0("slack -F ",paste(species,celltype.x,"geneactivity.pdf",sep="_")," ryan_todo"))

}


#Set up DA Test for peaks, same test but now also finds closest genes
da_one_v_one_peak<-function(i,obj,group,j_list){
    i<-as.character(i)
    da_tmp_2<-list()
    for (j in j_list){
        if ( i != j){
        da_peaks_tmp <- FindMarkers(
            object = obj,
            ident.1 = i,
            ident.2 = j,
            group.by = group,
            test.use = 'LR',
            latent.vars = 'nCount_peaks',
            only.pos=T
            )
        da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
        closest_genes <- ClosestFeature(obj,da_peaks_tmp$da_region)
        da_peaks_tmp<-cbind(da_peaks_tmp,closest_genes)
        da_peaks_tmp$enriched_group<-c(i)
        da_peaks_tmp$compared_group<-c(j)
        da_tmp_2[[paste(i,j)]]<-da_peaks_tmp
        }
    }
    return(da_tmp_2)
  }


#Perform DA test for gene activity scores
da_peaks_loop<-function(celltype.x){
    atac_sub<-readRDS(paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
    #Run Differential GA accessibility across clusters
    DefaultAssay(atac_sub) <- 'peaks'
    #set up an empty list for looping through
    da_peaks<-list()

    da_peaks<-mclapply(
        unique(atac_sub$final_clusters),
        FUN=da_one_v_one_peak,
        obj=atac_sub,
        j_list=do.call("as.character",list(unique(atac_sub$final_clusters))),
        group="final_clusters",mc.cores=5)

    #Merge the final data frame from the list for 1v1 DA
    da_peaks2<-do.call("rbind",do.call("rbind",da_peaks))
    write("Outputting One v One DA Table.", stderr())
    write.table(da_peaks2,file=paste(species,celltype.x,"onevone.peaks.txt",sep="_"),sep="\t",col.names=T,row.names=T,quote=F)
}

####################################
### Processing ###
#hg38
    #Set up variables
    species="hg38"
    celltype_list<-unique(hg38_atac$celltype)

    #Running cistopic subclustering on all identified cell types
    for (i in celltype_list){cistopic_generation(x=hg38_atac,celltype.x=i)}
    topicmodel_list<-paste(species,celltype_list,".CisTopicObject.Rds",sep="_")

    #determine model count to use for each cell type
    for (i in paste(species,celltype_list,"_model_selection.pdf",sep="_")){system(paste0("slack -F ",i," ryan_todo"))}

    #selecting topics based on derivative, making a named vector 
    topic_count_list<-c(25,22,24,27,24,22)
    names(topic_count_list)<-celltype_list

    #Running clustering loop
    for (i in celltype_list){clustering_loop(object_input=hg38_atac,celltype.x=i)}

    #selecting resolution by plots
    for (i in celltype_list){system(paste0("slack -F ",paste(species,i,"clustering.pdf",sep="_")," ryan_todo"))}

    #Set up named vector for clustering resolution, based on umap plot with multiple clustering resolutions
    resolution_list<-c(0.5,0.2,0.9,0.5,0.5,0.9)
    names(resolution_list)<-celltype_list

    #Set up ChromVAR motif object to run
    # Get a list of motif position frequency matrices from the JASPAR database
    pfm <- getMatrixSet(x = JASPAR2020,opts = list(species =9606, all_versions = FALSE))
    # Scan the DNA sequence of each peak for the presence of each motif
    motif.matrix <- CreateMotifMatrix(features = granges(hg38_atac),pwm = pfm,genome = 'hg38',use.counts = FALSE)
     # Create a new Mofif object to store the results
    motif <- CreateMotifObject(data = motif.matrix,pwm = pfm)

    #Run ChromVAR
    for (i in celltype_list){chromvar_loop(celltype.x=i)}

    #Run Gene Activity Score differences
    for (i in celltype_list){geneactivity_loop(celltype.x=i)}
    
    #DA Peaks
    for (i in celltype_list){da_peaks_loop(celltype.x=i)}
    
#mm10
    #Set up variables
    species="mm10"
    celltype_list<-unique(mm10_atac$celltype)

    
    #Running cistopic subclustering on all identified cell types
    for (i in celltype_list){cistopic_generation(x=mm10_atac,celltype.x=i,species="mm10")}
    topicmodel_list<-paste(species,celltype_list,".CisTopicObject.Rds",sep="_")

    #determine model count to use for each cell type
    for (i in paste(species,celltype_list,"_model_selection.pdf",sep="_")){system(paste0("slack -F ",i," ryan_todo"))}

    #selecting topics based on derivative, making a named vector 
    topic_count_list<-c(28,22,23,28,28,22)
    names(topic_count_list)<-celltype_list

    #Running clustering loop
    for (i in celltype_list){clustering_loop(object_input=hg38_atac,celltype.x=i)}

    #selecting resolution by plots
    for (i in celltype_list){system(paste0("slack -F ",paste(species,i,"clustering.pdf",sep="_")," ryan_todo"))}

    #Set up named vector for clustering resolution, based on umap plot with multiple clustering resolutions
    resolution_list<-c(0.5,0.2,0.9,0.5,0.5,0.9)
    names(resolution_list)<-unique(hg38_atac$celltype)

    #Set up ChromVAR motif object to run
    # Get a list of motif position frequency matrices from the JASPAR database
    pfm <- getMatrixSet(x = JASPAR2020,opts = list(species =9606, all_versions = FALSE))
    # Scan the DNA sequence of each peak for the presence of each motif
    motif.matrix <- CreateMotifMatrix(features = granges(hg38_atac),pwm = pfm,genome = 'hg38',use.counts = FALSE)
     # Create a new Mofif object to store the results
    motif <- CreateMotifObject(data = motif.matrix,pwm = pfm)

    #Run ChromVAR
    for (i in celltype_list){chromvar_loop(celltype.x=i,resolution_list.=resolution_list)}

    #Run Gene Activity Score differences
    for (i in celltype_list){geneactivity_loop(celltype.x=i,resolution_list.=resolution_list)}

Get Topic Enrichment for Subcluster

Focusing on human inhibitory neurons

library(Signac)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(cisTopic)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(Matrix)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(dplyr)
library(ggrepel)
library(parallel)

#read in RDS file.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
iN<-readRDS("hg38_inhibitory_neuron_SeuratObject.Rds")

#Plot Markers
markers<-c("LHX6","ADARB2","CHODL","UNC5B")

DefaultAssay(iN)<-"peaks"
plt<-CoveragePlot(iN, region = markers, 
                  tile = TRUE,
                 extend.upstream=2000,
                 extend.downstream=2000,
                 tile.cells=10,ncol=1)

pdf("hg38_inhibitory_neuron.genebody.pdf",height=30)
plt
dev.off()
system("slack -F hg38_inhibitory_neuron.genebody.pdf ryan_todo")


dat_out<-cbind(as.data.frame(iN@reductions$umap@cell.embeddings[names(iN$final_clusters),]),iN$final_clusters)
write.table(dat_out,file="SourceData_Fig3a.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig3a.tsv ryan_todo")

#Read in cistopic object
cistopic_object<-readRDS("hg38_inhibitory_neuron_.CisTopicObject.Rds")
#binarize cistopic to obtain peaks
cistopic_object<-cisTopic::selectModel(cistopic_object,select="25",keepModels=F)
cistopic_object <- getRegionsScores(cistopic_object, method='NormTop', scale=TRUE)
cistopic_object <- binarizecisTopics(cistopic_object, thrP=0.975, plot=TRUE)

#use chromvar to define signature TFs in topics
#Getting binarized motif signatures from Motif class in Signac
motif_signatures<-as.data.frame(iN@assays$peaks@motifs@data)

#get TF list common names
tfList <- getMatrixByID(JASPAR2020, ID=colnames(motif_signatures)) 
tfList <-unlist(lapply(names(tfList), function(x) name(tfList[[x]])))

#make a bed format file out of motifs
#x is an column number
motif_to_bed<-function(motif_signatures.=motif_signatures,x.=x){
    motif_tmp<-row.names(motif_signatures)[which(motif_signatures[,x.]>0)]
    tf<-colnames(motif_signatures)[x.]
    motif_tmp<-strsplit(motif_tmp,"-")
    seqnames<-lapply(motif_tmp,"[",1)
    start<-lapply(motif_tmp,"[",2)
    end<-lapply(motif_tmp,"[",3)
    out_bed<-as.data.frame(cbind(seqnames,start,end,tf),col.names=c("seqnames","start","end","tf"))
    return(out_bed)
}


motif_signatures_bed<-mclapply(1:ncol(motif_signatures),function(x) motif_to_bed(x.=x,motif_signatures.=motif_signatures),mc.cores=20)
#write out motifs contained within peak regions
dir.create("chromvar_bed_files")
for (x in 1:length(motif_signatures_bed)){
    y<-motif_signatures_bed[[x]]
    y$seqnames<-unlist(y$seqnames)
    y$start<-unlist(y$start)
    y$end<-unlist(y$end)
    y$tf<-unlist(y$tf)
    write.table(y,file=paste0("./chromvar_bed_files/","hg38.",tfList[x],".bed"),sep="\t")
    print(paste0("./chromvar_bed_files/","hg38.",tfList[x],".bed"))
}
#generate signatures
signatures <-paste0("./chromvar_bed_files/","hg38.",tfList,".bed")
cistopic_object <- getSignaturesRegions(cistopic_object, signatures, labels=tfList)

library(AUCell)
pred.matrix <- predictiveDistribution(cistopic_object)
aucellRankings <- AUCell_buildRankings(pred.matrix, plot=FALSE, verbose=FALSE)

# Check signature enrichment in cells
cistopic_object <- signatureCellEnrichment(cistopic_object, aucellRankings, selected.signatures='all', aucMaxRank = 0.1*nrow(aucellRankings), plot=FALSE)

#make a heatmap

plt<-signaturesHeatmap(cistopic_object,nCores=1,row_names_gp = gpar(fontsize = 1))
sig_df<- as.data.frame(plt@ht_list$"Normalised AUC score"@matrix)
write.table(sig_df,file="hg38_inhibitoryNeurons.topic_TFsignatures.txt",sep="\t",col.names=T,row.names=T,quote=F)
sum(unlist(lapply(1:nrow(sig_df),function(x) any(abs(sig_df[x,])>3))))
#76
sig_df_filt<-sig_df[unlist(lapply(1:nrow(sig_df),function(x) any(abs(sig_df[x,])>3))),]
pdf("hg38_inhibitory_neurons.cisTopic_ChromvarSignature.pdf")
plt
dev.off()

saveRDS(cistopic_object,"hg38_inhibitory_neuron_.CisTopicObject.processed.Rds")

system("slack -F hg38_inhibitory_neurons.cisTopic_ChromvarSignature.pdf ryan_todo")
system("slack -F hg38_inhibitoryNeurons.topic_TFsignatures.txt ryan_todo")

New Session for Topic marker set enrichment

#Plotting markers https://celltypes.brain-map.org/rnaseq/human_m1_10x
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
library(cisTopic)
library(ggplot2)
library(reshape2)
library(patchwork)
library(Signac)
library(ComplexHeatmap)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")

#inhibitory neuron markers from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6919571/
markers<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/human_cortex_markers.tsv",head=T,sep="\t",comment.char="")
markers<-markers[,c("level1","level2","level3","single_markers_vs_level1")]
markers<-markers[markers$level1=="GABAergic neuron",]

level1_genesets<-lapply(unique(markers$level1),function(x) trimws(unlist(strsplit(unlist(markers[markers$level1==x,]$single_markers_vs_level1),","))))
names(level1_genesets)<-unique(markers$level1)

level2_genesets<-lapply(unique(markers$level2),function(x) trimws(unlist(strsplit(unlist(markers[markers$level2==x,]$single_markers_vs_level1),","))))
names(level2_genesets)<-unique(markers$level2)

level3_genesets<-lapply(unique(markers$level3),function(x) trimws(unlist(strsplit(unlist(markers[markers$level3==x,]$single_markers_vs_level1),","))))
names(level3_genesets)<-unique(markers$level3)

cistopic_object<-readRDS("hg38_inhibitory_neuron_.CisTopicObject.processed.Rds")
cistopic_object<-binarizecisTopics(cistopic_object)
cistopic_object <- annotateRegions(cistopic_object, txdb=TxDb.Hsapiens.UCSC.hg38.knownGene, annoDb='org.Hs.eg.db')
#peaks per topic
summary(unlist(lapply(lapply(cistopic_object@binarized.cisTopics,dim),"[",1)))
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#   1401    2109    2523    2429    2724    3239

topic_list<-names(cistopic_object@binarized.cisTopics)

#Look at enrichment of this topic for peaks in marker genes
#                             Marker_Set             Not_marker_set
#Peaks_in_topic               a                          c
#Peaks_not_in_topic           b                          d

gene_set_enrichment<-function(x,y,gene_set){
    dat_peaks<-cistopic_object@binarized.cisTopics[[x]]
    peaks_considered<-data.frame("peaks"=unlist(lapply(names(cistopic_object@binarized.cisTopics),
                                                       function(x) row.names(cistopic_object@binarized.cisTopics[[x]]))))
    peaks_considered<-merge(peaks_considered,cistopic_object@region.data,by.x="peaks",by.y="row.names",all.x=T)
    dat_peaks<-merge(dat_peaks,cistopic_object@region.data,by="row.names",all.x=T)
    a<-sum(dat_peaks$SYMBOL %in% unlist(gene_set[y]))
    b<-sum(peaks_considered$SYMBOL %in% unlist(gene_set[y]))-a #not using all peaks since all peaks were called on other cell types, using topic representatives
    c<-sum(!(dat_peaks$SYMBOL %in% unlist(gene_set[y])))
    d<-sum(!(peaks_considered$SYMBOL %in% unlist(gene_set[y])))-c #black balls in urn (peaks not assigned to topic)
    test<-matrix(c(a,b,c,d), ncol=2)
    return(c(x,y,
             as.numeric(fisher.test(test,alternative="greater")$p.value),
             as.numeric(fisher.test(test,alternative="greater")$estimate)))
}

#gene set enrichment of topic peaks in marker genes
gsea_2<-list()
for (i in topic_list){
    gsea_2[[i]]<-list()
    for (j in names(level2_genesets)){
        gsea_2[[i]][[j]]<-gene_set_enrichment(x=i,y=j,gene_set=level2_genesets)
    }
}
gsea_2<-do.call("rbind",do.call("rbind",(gsea_2)))
colnames(gsea_2)<-c("Topic","CellType","pval","estimate")
gsea_2<-as.data.frame(gsea_2)
gsea_2$pval<-as.numeric(gsea_2$pval)
gsea_2$estimate<-as.numeric(gsea_2$estimate)

#gene set enrichment of topic peaks in marker genes (level3_genesets)
gsea_3<-list()
for (i in topic_list){
    gsea_3[[i]]<-list()
    for (j in names(level3_genesets)){
        gsea_3[[i]][[j]]<-gene_set_enrichment(x=i,y=j,gene_set=level3_genesets)
    }
}
gsea_3<-do.call("rbind",do.call("rbind",(gsea_3)))
colnames(gsea_3)<-c("Topic","CellType","pval","estimate")
gsea_3<-as.data.frame(gsea_3)
gsea_3$pval<-as.numeric(gsea_3$pval)
gsea_3$estimate<-as.numeric(gsea_3$estimate)

#Order Topics by cell type and plot

#read in RDS file.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
iN<-readRDS("hg38_inhibitory_neuron_SeuratObject.Rds")
cistopic_cell_embeddings<-iN@reductions$cistopic@cell.embeddings
#cistopic_cell_embeddings<-cistopic_cell_embeddings[,colnames(cistopic_cell_embeddings) %in% paste("topic",1:25,sep="_")]
cellid_order<-row.names(cistopic_cell_embeddings)
cluster_annot<-setNames(iN@meta.data[match(cellid_order,iN@meta.data$cellID),]$final_cluster,cellid_order)
library(circlize)
col_fun = colorRamp2(c(0,0.3), c("white","red"))

write.table(as.data.frame(cistopic_cell_embeddings),file="SourceData_Fig3c_top.tsv",row.names=T,sep="\t",quote=F)
system("slack -F SourceData_Fig3c_top.tsv ryan_todo")
   
plt<-Heatmap(as.data.frame(cistopic_cell_embeddings),
    cluster_columns=T,
    show_row_names=F,
    clustering_distance_rows="spearman",
    clustering_distance_columns="spearman",
    left_annotation=rowAnnotation(cluster=cluster_annot),
    show_column_names=T,
    show_row_den=T,row_dend_side="left",use_raster=T, col=col_fun
)

plt1 = draw(plt) #draw so its statics
topic_ord<-column_order(plt1) #get column order

pdf("hg38_inhibitory_neuron_cistopic_cellembbed.pdf",width=30)
plt1
dev.off()
system("slack -F hg38_inhibitory_neuron_cistopic_cellembbed.pdf ryan_todo")

gsea_2$Topic <- factor(gsea_2$Topic, levels = paste0("Topic",1:25)[topic_ord])
gsea_2$CellType <- factor(gsea_2$CellType, levels = unique(gsea_2$CellType))

plt2<-ggplot(gsea_2, aes(x = CellType, y = Topic)) +
  geom_point(aes(size = estimate, color = pval)) +
  theme_bw() + scale_color_gradient(limits=c(0,0.05),low="red",high="white")+coord_flip()+scale_size(limits=c(0,3))
ggsave(plt2,file="hg38_inhibitory_neuron_celltypelevel2.pdf")
system("slack -F hg38_inhibitory_neuron_celltypelevel2.pdf ryan_todo")

gsea_3$Topic <- factor(gsea_3$Topic, levels = paste0("Topic",1:25)[topic_ord])
gsea_3$CellType <- factor(gsea_3$CellType, levels = gsea_3$CellType)

plt3<-ggplot(gsea_3, aes(x = CellType, y = Topic)) +
  geom_point(aes(size = estimate, color = pval)) +
  theme_bw() + scale_color_gradient(limits=c(0,0.05),low="orange",high="white")+coord_flip()+scale_size(limits=c(0,3))
ggsave(plt3,file="hg38_inhibitory_neuron_celltypelevel3.pdf")
system("slack -F hg38_inhibitory_neuron_celltypelevel3.pdf ryan_todo")

plt<-wrap_plots(plt2,plt3,ncol=1)
ggsave(plt,file="hg38_inhibitory_neuron_gsea.pdf")
system("slack -F hg38_inhibitory_neuron_gsea.pdf ryan_todo") 
    
gsea_2$level<-"2"
gsea_3$level<-"3"

write.table(rbind(gsea_2,gsea_3),file="SourceData_Fig3c_bot.tsv",row.names=T,sep="\t",quote=F)
system("slack -F SourceData_Fig3c_bot.tsv ryan_todo")

    
    
    


cistopic_object <- GREAT(cistopic_object, genome='hg38', fold_enrichment=2, geneHits=1, sign=0.05, request_interval=10
)

markers<-c("ADARB2","PAX6","LAMP5","VIP",
           "LHX6","SST","HTR3A","LOC102724124",
           "CDH12","PAX6-AS1","TNFAIP8L3","NMBR",
           "LCP2","DBP","LOC105371641","CA1","CHRNA4",
           "MC4R","LOC105377434","BAGE2","SYT6","TSPAN12",
           "CHRNA6","LOC105371331","PENK","QPCT","HS3ST3A1",
           "LOC101927460","PCDH20","SERPINF1","TYR","LOC100421401",
           "CHRM2","CBLN1","CCDC184","GGH","LOC644919","LBH","CASC6",
           "LIN00923","SPAG17","OPRM1",
           "ADARB2","LHX6","SST","PVALB","NPY","HPGD","B3GAT2","LOC101927132","KLHDC8A",
          "LOC105371658","NPM1P10","LOC105377401","GXYLT2","STK32A","LOC101929028","CALB1",
          "LOC105374696","ADGR6","FRZB","TH","GLP1R","LGR5","MEPE","WFDC2","SULF1","ZFPM2-AS1","MIR548F2","SCUBE3")

nrow(cistopic_object@region.data[cistopic_object@region.data$SYMBOL %in% markers,])

plt<-ontologyDotPlot(cistopic_object, top=5,var.y='name', order.by='Binom_Adjp_BH')
pdf("test_heatmap.genebody.pdf",width=30)
plt
dev.off()
system("slack -F test_heatmap.genebody.pdf ryan_todo")


iN<-readRDS("hg38_inhibitory_neuron_SeuratObject.Rds")

library(ggplot2)
library(Seurat)
library(Signac)
library(ggrepel)
library(dplyr)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
dat_ga<-read.table("hg38_inhibitory_neuron_onevone.geneactivity.txt",head=T)
dat_tf<-read.table("hg38_inhibitory_neuron_onevone.tffactors.txt",head=T)
dat_peaks<-read.table("hg38_inhibitory_neuron_onevone.peaks.txt",head=T)
markers<-c("GAD1","LHX6","ADARB2","LAMP5","VIP","PVALB","SST","CUX2")

dat_select<-dat_peaks %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
#dat_select<-dat_peaks[dat_peaks$p_val_adj<0.1 & dat_peaks$gene_name %in% markers,]

dat_peaks<-dat_peaks[dat_peaks$p_val_adj<1,]

plt<-ggplot(dat_peaks,aes(x=avg_logFC,y=(-log(p_val_adj)),color=as.factor(compared_group)))+
geom_point(aes(alpha=0.1))+
geom_label_repel(dat=dat_select,aes(label=gene_name),size=2,force=5)+
theme_bw()+facet_wrap(~enriched_group)+ylim(c(0,20))+xlim(c(0,50))
ggsave(plt,file=paste(species,celltype.x,"da_peaks.pdf",sep="_"))
system(paste0("slack -F ",paste(species,celltype.x,"da_peaks.pdf",sep="_")," ryan_todo"))
    
#dat_ga[dat_ga$da_region %in% markers,]
#dat_tf[dat_tf$da_region %in% markers,]
#dat_peaks[dat_peaks$gene_name %in% markers,] 


#Generate promoter gene body gene activities (not many CCANs overlap with marker genes)
#Limiting to TFs/Gene Activity/DA Peaks identified and nominally different
dat_select<-dat_peaks[dat_peaks$distance==0,] %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label

#features<-as.data.frame(dat_peaks[dat_peaks$distance==0,] %>% group_by(enriched_group,compared_group) %>% head())
#features<-dat_select$gene_name
#features<-unique(c(dat_ga$da_region,dat_tf$da_region,dat_peaks$gene_name))

#inhibitory neuron markers from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6919571/
markers<-c("ADARB2","PAX6","LAMP5","VIP",
           "LHX6","SST","HTR3A","LOC102724124",
           "CDH12","PAX6-AS1","TNFAIP8L3","NMBR",
           "LCP2","DBP","LOC105371641","CA1","CHRNA4",
           "MC4R","LOC105377434","BAGE2","SYT6","TSPAN12",
           "CHRNA6","LOC105371331","PENK","QPCT","HS3ST3A1",
           "LOC101927460","PCDH20","SERPINF1","TYR","LOC100421401",
           "CHRM2","CBLN1","CCDC184","GGH","LOC644919","LBH","CASC6",
           "LIN00923","SPAG17","OPRM1",
           "ADARB2","LHX6","SST","PVALB","NPY","HPGD","B3GAT2","LOC101927132","KLHDC8A",
          "LOC105371658","NPM1P10","LOC105377401","GXYLT2","STK32A","LOC101929028","CALB1",
          "LOC105374696","ADGR6","FRZB","TH","GLP1R","LGR5","MEPE","WFDC2","SULF1","ZFPM2-AS1","MIR548F2","SCUBE3")

nrow(cistopic_object@region.data[cistopic_object@region.data$SYMBOL %in% markers,])

DefaultAssay(dat)<-"genebody"
gene.activities <- GeneActivity(dat,features=markers)
dat[['genebody']] <- CreateAssayObject(counts = gene.activities)

dat[['genebody']]<-CreateAssayObject(data=as.matrix(sweep(dat@assays$genebody@counts,2,dat$uniq_reads,`/`))) #divide by read count
    
dat <- NormalizeData(
  object = dat,
  assay = 'genebody',
  normalization.method = 'RC',
  scale.factor = 1e6
)

DefaultAssay(dat)<-"genebody"
plt<-FeaturePlot(dat, features = markers,max.cutoff='q95')

pdf("test_heatmap.genebody.pdf",width=30,height=20)
plt
dev.off()
system("slack -F test_heatmap.genebody.pdf ryan_todo")


library(reshape2)
library(ComplexHeatmap)
genebody<-as.data.frame(dat@assays$genebody@data)
genebody<-sweep(genebody,2,dat$uniq_reads,`/`) #divide by read count
genebody<-data.frame(t(genebody))
genebody$cellID<-row.names(genebody)
#append cluster ID to column
genebody$final_clusters<-dat@meta.data[match(dat@meta.data$cellID,genebody$cellID),]$final_clusters
#reshape to long format
genebody<-melt(genebody)
#group by to summarize markers by column
genebody<-as.data.frame(genebody %>% group_by(final_clusters,variable) %>% summarize(mean_ga=median(value)))
#plot as heatmap
genebody<-dcast(genebody,final_clusters~variable)
row.names(genebody)<-genebody$final_clusters
genebody<-genebody[colnames(genebody) %in% markers]
#zscore values
genebody<-scale(genebody)
plt<-Heatmap(t(genebody),
             clustering_distance_columns="pearson",
)

DefaultAssay(dat)<-"GeneActivity"

plt<-FeaturePlot(dat,features=markers)

ggsave(plt,file="inhib_markers.pdf",height=20,width=20)
system("slack -F inhib_markers.pdf ryan_todo")

DefaultAssay(dat)<-"peaks"
plt<-CoveragePlot(dat,region=markers,ncol=1,group.by="final_clusters")

ggsave(plt,file="inhib_markers.pdf",height=10*length(markers),limitsize=F)
system("slack -F inhib_markers.pdf ryan_todo")



Integration across mouse scATAC data sets

Using Harmony for integration across data sets for mouse brain data.

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Mmusculus.v79)
library(ggplot2)
library(dplyr)
library(patchwork)
library(Matrix)
library(harmony,lib.loc="/home/groups/oroaklab/src/R/R-4.0.0/lib_backup_210125")

mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
seqlevelsStyle(mm10_annotations) <- 'UCSC'
genome(mm10_annotations) <- "mm10"

#function to read in sparse matrix format from atac-count
read_in_sparse<-function(x){ #x is character file prefix followed by .bbrd.q10.500.counts.sparseMatrix.values.gz
IN<-as.matrix(read.table(paste0(x,".counts.sparseMatrix.values.gz")))
IN<-sparseMatrix(i=IN[,1],j=IN[,2],x=IN[,3])
COLS<-read.table(paste0(x,".counts.sparseMatrix.cols.gz"))
colnames(IN)<-COLS$V1
ROWS<-read.table(paste0(x,".counts.sparseMatrix.rows.gz"))
row.names(IN)<-ROWS$V1
writeMM(IN,file=paste0(x,".counts.mtx")) #this is to generate counts matrices in scrublet friendly format
return(IN)
}

#tenx
ours_in_tenx<-read_in_sparse("tenx.bed")
ours_in_tenx_assay <- CreateChromatinAssay(counts = ours_in_tenx, genome="mm10", min.cells = 0, annotation=mm10_annotations, sep=c("_","_"))
ours_in_tenx_object <- CreateSeuratObject(counts = ours_in_tenx_assay, assay = "peaks")
ours_in_tenx_object$tech<-"s3"
#snatac
ours_in_snatac<-read_in_sparse("snatac.bed")
ours_in_snatac_assay <- CreateChromatinAssay(counts = ours_in_snatac, genome="mm10", min.cells = 0, annotation=mm10_annotations, sep=c("_","_"))
ours_in_snatac_object <- CreateSeuratObject(counts = ours_in_snatac_assay, assay = "peaks")
ours_in_snatac_object$tech<-"s3"
#scimap
ours_in_scimap<-read_in_sparse("scimap.bed")
ours_in_scimap_assay <- CreateChromatinAssay(counts = ours_in_scimap, genome="mm10", min.cells = 0, annotation=mm10_annotations, sep=c("_","_"))
ours_in_scimap_object <- CreateSeuratObject(counts = ours_in_scimap_assay, assay = "peaks")
ours_in_scimap_object$tech<-"s3"

tenx<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/tenx_genomics_adultbrain_fresh_5k.Rds")
tenx$tech<-"tenx"
snatac<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/snatac_adultbrain.Rds")
snatac$tech<-"snatac"
scimap<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/scimapatac_adultbrain.Rds")
scimap$tech<-"scimap"

x<-ours_in_scimap_object
y<-scimap

integration_peaks<-function(x,y,prefix){
y<-subset(y,features=row.names(x[["peaks"]]@counts))
unintegrated <- merge(x,y)
unintegrated <- RunTFIDF(unintegrated)
unintegrated <- FindTopFeatures(unintegrated, min.cutoff = 50)
unintegrated <- RunSVD(unintegrated)
unintegrated <- RunUMAP(unintegrated, reduction = 'lsi', dims = 2:30)
p1 <- DimPlot(unintegrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Unintegrated")
saveRDS(unintegrated,paste0(prefix,".unintegrated.Rds"))

hm.integrated <- RunHarmony(object = unintegrated, group.by.vars = 'tech', reduction = 'lsi', assay.use = 'peaks', project.dim = FALSE )
# re-compute the UMAP using corrected LSI embeddings
hm.integrated <- RunUMAP(hm.integrated, dims = 2:30, reduction = 'harmony')
p5 <- DimPlot(hm.integrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Harmony integration")
plt<-p1 + p5  #+ p2
ggsave(plt,file=paste0(prefix,"_integration.pdf"))
system(paste0("slack -F ",paste0(prefix,"_integration.pdf")," ryan_todo"))
saveRDS(hm.integrated,paste0(prefix,".hm.integrated.Rds"))
}

integration_peaks(x=ours_in_tenx_object,y=tenx,prefix="ours_in_tenx")
integration_peaks(x=ours_in_snatac_object,y=snatac,prefix="ours_in_snatac")
integration_peaks(x=ours_in_scimap_object,y=scimap,prefix="ours_in_scimap")


integration.files<-list.files(pattern="hm.integrated.Rds")

out_data<-lapply(integration.files,function(i){
	outname<-unlist(lapply(strsplit(i,"[.]"),"[",1))
	obj<-readRDS(i)
	dat_out<-cbind(as.data.frame(obj@reductions$umap@cell.embeddings[names(obj$tech),]),obj$tech,obj$celltype,outname)
	return(dat_out)
	})


  	dat_out<-do.call("rbind",out_data)
  	write.table(dat_out,file="SourceData_Fig2i.tsv",sep="\t",quote=F,row.names=F)
	system("slack -F SourceData_Fig2i.tsv ryan_todo")

Public Data RNA Comparison

Download data from Allen Brain-span

For human: https://portal.brain-map.org/atlases-and-data/rnaseq/human-m1-10x For mouse: https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x

#Human download
cd /home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_humancortex
wget https://brainmapportal-live-4cc80a57cd6e400d854-f7fdcae.divio-media.net/filer_public/70/32/70326830-e306-4743-a02c-a8da5bf9eb56/readme-m1-10.txt
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/metadata.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/matrix.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/trimmed_means.csv
wget https://brainmapportal-live-4cc80a57cd6e400d854-f7fdcae.divio-media.net/filer_public/0c/0c/0c0c882d-1c31-40a9-8039-3bf2706a77cd/sample-exp_component_mapping_human_10x_apr2020.zip
#Mouse download
cd /home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_mouse
wget https://brainmapportal-live-4cc80a57cd6e400d854-f7fdcae.divio-media.net/filer_public/87/14/8714d0a3-27d7-4a81-8c77-eebfd605a280/readme_mouse_10x.txt
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/metadata.csv 
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/matrix.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/trimmed_means.csv
#Downloading Mouse whole brain data 
wget https://www.dropbox.com/s/kqsy9tvsklbu7c4/allen_brain.rds?dl=0

Process Data into Seurat Object

Following https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html

library(Seurat)
library(ggplot2)
#Human
setwd("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_humancortex")
meta_data<-read.csv("metadata.csv",header=T)
row.names(meta_data)<-meta_data$sample_name
counts<-read.csv("matrix.csv",header=T,row.names=1)
brainspan <- CreateSeuratObject(counts = as.data.frame(t(counts)), project = "brainspain", min.cells = 3, min.features = 500, meta.data=meta_data)
saveRDS(brainspan, file = "allen_brainspan_humancortex.rds")
brainspan <- NormalizeData(brainspan, normalization.method = "LogNormalize", scale.factor = 10000)
brainspan <- FindVariableFeatures(brainspan, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(brainspan)
brainspan <- ScaleData(brainspan, features = all.genes)
brainspan <- RunPCA(brainspan, features = VariableFeatures(object = brainspan))
plt<-ElbowPlot(brainspan)
ggsave(plt,file="allen_brainspan_humancortex.elbowplot.pdf")
system("slack -F allen_brainspan_humancortex.elbowplot.pdf ryan_todo")
brainspan <- FindNeighbors(brainspan, dims = 1:14)
brainspan <- FindClusters(brainspan, resolution = 0.5)
brainspan <- RunUMAP(brainspan, dims = 1:14)
plt<-DimPlot(brainspan, reduction = "umap",group.by=c("class_label","subclass_label"))
ggsave(plt,file="allen_brainspan_humancortex.dimplot.pdf",width=30)
system("slack -F allen_brainspan_humancortex.dimplot.pdf ryan_todo")
saveRDS(brainspan, file = "allen_brainspan_humancortex.rds")

#Mouse
 setwd("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_mouse")
 meta_data<-read.csv("metadata.csv",header=T)
 row.names(meta_data)<-meta_data$sample_name
 counts<-read.csv("matrix.csv",header=T,row.names=1,nrows=100000) #this is 1million cells, i think 10% of that will be fine, hopefully they are evenly distributed
 brainspan <- CreateSeuratObject(counts = as.data.frame(t(counts)), project = "brainspain", min.cells = 3, min.features = 500, meta.data=meta_data)
 saveRDS(brainspan, file = "allen_brainspan_mouse.rds")
 brainspan <- NormalizeData(brainspan, normalization.method = "LogNormalize", scale.factor = 10000)
 brainspan <- FindVariableFeatures(brainspan, selection.method = "vst", nfeatures = 2000)
 all.genes <- rownames(brainspan)
 brainspan <- ScaleData(brainspan, features = all.genes)
 brainspan <- RunPCA(brainspan, features = VariableFeatures(object = brainspan))
 plt<-ElbowPlot(brainspan)
 ggsave(plt,file="allen_brainspan_mouse.elbowplot.pdf")
 system("slack -F allen_brainspan_mouse.elbowplot.pdf ryan_todo")
 brainspan <- FindNeighbors(brainspan, dims = 1:15)
 brainspan <- FindClusters(brainspan, resolution = 0.5)
 brainspan <- RunUMAP(brainspan, dims = 1:15)
 plt<-DimPlot(brainspan, reduction = "umap",group.by=c("class_label","subclass_label"))
 ggsave(plt,file="allen_brainspan_mouse.dimplot.pdf",width=30)
 system("slack -F allen_brainspan_mouse.dimplot.pdf ryan_todo")
 saveRDS(brainspan, file = "allen_brainspan_mouse.rds")


Integration of ATAC and RNA for cell type identification

Retry mouse cluster id just straight up following https://satijalab.org/signac/articles/mouse_brain_vignette.html?

library(ComplexHeatmap)
library(Signac)
library(Seurat)
library(ggplot2)

setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")

#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")

predict_celltype<-function(object,brainspan,prefix){
    transfer.anchors <- FindTransferAnchors(reference = brainspan,query = object,query.assay="GeneActivity",reduction = 'cca')
    saveRDS(transfer.anchors,file=paste0(prefix,".transferanchors.rds"))
    #predict labels for class and subclass
    predicted.labels.class <- TransferData(anchorset = transfer.anchors,refdata = brainspan$class_label,weight.reduction = "cca",dims = 1:30)
    predicted.labels.subclass <- TransferData(anchorset = transfer.anchors,refdata = brainspan$subclass_label,weight.reduction = "cca", dims = 1:30)
    object <- AddMetaData(object = object, metadata = predicted.labels.class)
    object <- AddMetaData(object = object, metadata = predicted.labels.subclass)
    saveRDS(object,file=paste0(prefix,"_SeuratObject.Rds"))

    feat<-colnames(object@meta.data)[which(grepl("prediction.score",colnames(object@meta.data)))]
    feat<-feat[feat !="prediction.score.max"]
    plt<-FeaturePlot(object,features=feat,order=T)#plot feature plots
    ggsave(plt,file=paste0(prefix,"_predicted.umap.png"),width=20,height=30)
    system(paste0("slack -F ",prefix,"_predicted.umap.png ryan_todo"))

    #plot merged cluster heatmap
    #Generate Heatmap of TF ChromVar score and TF modules
    pred<-object@meta.data[,feat]
    colnames(pred)<-substring(colnames(pred),18) #remove "prediction.score" for readability
    #Combine over subclusters
    pred<-split(pred,paste(object$seurat_clusters)) #group by rows to seurat clusters
    pred<-lapply(pred,function(x) apply(x,2,mean)) #take average across group
    pred<-do.call("rbind",pred) #condense to smaller data frame
    pred<-pred[!endsWith(row.names(pred),"NA"),] #remove doublets not assigned a subcluster

    plt1<-Heatmap(pred,clustering_distance_rows="maximum",clustering_distance_columns="maximum")
    pdf(paste0(prefix,".complexheatmap.pdf"),height=20)
    plt1
    dev.off()
    system(paste0("slack -F ",prefix,".complexheatmap.pdf ryan_todo"))

}

brainspan. <- readRDS("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_humancortex/allen_brainspan_humancortex.rds")
predict_celltype(object=hg38_atac,brainspan=brainspan.,prefix="hg38")

#Mouse using smaller data set that is whole brain
brainspan. <- readRDS("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_mouse/allen_brainspan_mouse.rds")
brainspan. <- FindVariableFeatures(object = brainspan.,nfeatures = 5000)
predict_celltype(object=mm10_atac,brainspan=brainspan.,prefix="mm10")


R Session Info with all packages loaded

Code
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS:   /home/groups/oroaklab/src/R/R-4.0.0/lib/libRblas.so
LAPACK: /home/groups/oroaklab/src/R/R-4.0.0/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
 [9] LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
 [1] grid      stats4    parallel  stats     graphics  grDevices utils
 [8] datasets  methods   base

other attached packages:
 [1] harmony_1.0
 [2] Rcpp_1.0.5
 [3] ComplexHeatmap_2.5.5
 [4] org.Hs.eg.db_3.12.0
 [5] TxDb.Hsapiens.UCSC.hg38.knownGene_3.10.0
 [6] BSgenome.Hsapiens.UCSC.hg38_1.4.3
 [7] BSgenome_1.57.6
 [8] rtracklayer_1.49.5
 [9] Biostrings_2.57.2
[10] XVector_0.29.3
[11] TFBSTools_1.27.0
[12] JASPAR2020_0.99.10
[13] dplyr_1.0.2
[14] ggrepel_0.8.2
[15] cicero_1.3.4.10
[16] Gviz_1.33.2
[17] monocle3_0.2.3.0
[18] SingleCellExperiment_1.11.6
[19] SummarizedExperiment_1.19.6
[20] DelayedArray_0.15.7
[21] matrixStats_0.57.0
[22] patchwork_1.0.1
[23] cisTopic_0.3.0
[24] SeuratWrappers_0.2.0
[25] Matrix_1.2-18
[26] EnsDb.Mmusculus.v79_2.99.0
[27] EnsDb.Hsapiens.v86_2.99.0
[28] ensembldb_2.13.1
[29] AnnotationFilter_1.13.0
[30] GenomicFeatures_1.41.3
[31] AnnotationDbi_1.51.3
[32] Biobase_2.49.1
[33] GenomicRanges_1.41.6
[34] GenomeInfoDb_1.25.11
[35] IRanges_2.23.10
[36] S4Vectors_0.27.12
[37] BiocGenerics_0.35.4
[38] Seurat_3.2.1
[39] Signac_1.1.0
[40] ggplot2_3.3.2
[41] reshape2_1.4.4

loaded via a namespace (and not attached):
  [1] rappdirs_0.3.1              SnowballC_0.7.0
  [3] GGally_2.0.0                R.methodsS3_1.8.1
  [5] tidyr_1.1.2                 bit64_4.0.5
  [7] knitr_1.30                  irlba_2.3.3
  [9] R.utils_2.10.1              data.table_1.13.0
 [11] rpart_4.1-15                KEGGREST_1.29.0
 [13] RCurl_1.98-1.2              generics_0.0.2
 [15] snow_0.4-3                  RhpcBLASctl_0.20-137
 [17] cowplot_1.1.0               RSQLite_2.2.1
 [19] VGAM_1.1-3                  RANN_2.6.1
 [21] future_1.19.1               bit_4.0.4
 [23] spatstat.data_1.4-3         httpuv_1.5.4
 [25] assertthat_0.2.1            DirichletMultinomial_1.31.0
 [27] viridis_0.5.1               xfun_0.20
 [29] hms_0.5.3                   promises_1.1.1
 [31] progress_1.2.2              caTools_1.18.0
 [33] dbplyr_1.4.4                igraph_1.2.6
 [35] DBI_1.1.0                   htmlwidgets_1.5.2
 [37] reshape_0.8.8               feather_0.3.5
 [39] rsparse_0.4.0               purrr_0.3.4
 [41] ellipsis_0.3.1              backports_1.2.0
 [43] annotate_1.67.1             biomaRt_2.45.2
 [45] deldir_0.1-29               vctrs_0.3.4
 [47] remotes_2.2.0               ROCR_1.0-11
 [49] abind_1.4-5                 withr_2.3.0
 [51] ggforce_0.3.2               checkmate_2.0.0
 [53] sctransform_0.3             GenomicAlignments_1.25.3
 [55] prettyunits_1.1.1           goftest_1.2-2
 [57] cluster_2.1.0               seqLogo_1.55.5
 [59] lazyeval_0.2.2              crayon_1.3.4
 [61] pkgconfig_2.0.3             tweenr_1.0.1
 [63] nlme_3.1-149                ProtGenerics_1.21.0
 [65] nnet_7.3-14                 rlang_0.4.8
 [67] globals_0.13.0              lifecycle_0.2.0
 [69] miniUI_0.1.1.1              BiocFileCache_1.13.1
 [71] doSNOW_1.0.18               rsvd_1.0.3
 [73] dichromat_2.0-0             polyclip_1.10-0
 [75] lmtest_0.9-38               graph_1.67.1
 [77] ggseqlogo_0.1               zoo_1.8-8
 [79] base64enc_0.1-3             GlobalOptions_0.1.2
 [81] ggridges_0.5.2              rjson_0.2.20
 [83] png_0.1-7                   viridisLite_0.3.0
 [85] RcisTarget_1.9.1            float_0.2-4
 [87] bitops_1.0-6                R.oo_1.24.0
 [89] lda_1.4.2                   KernSmooth_2.23-17
 [91] blob_1.2.1                  shape_1.4.5
 [93] stringr_1.4.0               readr_1.4.0
 [95] jpeg_0.1-8.1                CNEr_1.25.0
 [97] scales_1.1.1                memoise_1.1.0
 [99] GSEABase_1.51.1             magrittr_1.5
[101] plyr_1.8.6                  ica_1.0-2
[103] zlibbioc_1.35.0             compiler_4.0.0
[105] RColorBrewer_1.1-2          clue_0.3-57
[107] fitdistrplus_1.1-1          Rsamtools_2.5.3
[109] listenv_0.8.0               pbapply_1.4-3
[111] htmlTable_2.1.0             Formula_1.2-4
[113] text2vec_0.6                MASS_7.3-53
[115] mgcv_1.8-33                 tidyselect_1.1.0
[117] stringi_1.5.3               askpass_1.1
[119] latticeExtra_0.6-29         VariantAnnotation_1.35.3
[121] fastmatch_1.1-0             tools_4.0.0
[123] future.apply_1.6.0          circlize_0.4.10
[125] rstudioapi_0.11             TFMPvalue_0.0.8
[127] foreach_1.5.0               foreign_0.8-80
[129] lsa_0.73.2                  AUCell_1.11.0
[131] gridExtra_2.3               farver_2.0.3
[133] Rtsne_0.15                  digest_0.6.27
[135] BiocManager_1.30.10         pracma_2.2.9
[137] shiny_1.5.0                 later_1.1.0.1
[139] RcppAnnoy_0.0.16            OrganismDbi_1.31.0
[141] httr_1.4.2                  ggbio_1.37.0
[143] biovizBase_1.37.0           colorspace_1.4-1
[145] XML_3.99-0.5                tensor_1.5
[147] reticulate_1.16             splines_4.0.0
[149] lgr_0.3.4                   uwot_0.1.8
[151] RBGL_1.65.0                 RcppRoll_0.3.0
[153] spatstat.utils_1.17-0       plotly_4.9.2.1
[155] xtable_1.8-4                poweRlaw_0.70.6
[157] jsonlite_1.7.1              spatstat_1.64-1
[159] R6_2.5.0                    mlapi_0.1.0
[161] Hmisc_4.4-1                 pillar_1.4.6
[163] htmltools_0.5.0             mime_0.9
[165] glue_1.4.2                  fastmap_1.0.1
[167] BiocParallel_1.23.2         codetools_0.2-16
[169] lattice_0.20-41             tibble_3.0.4
[171] curl_4.3                    leiden_0.3.3
[173] gtools_3.8.2                GO.db_3.11.4
[175] openssl_1.4.3               survival_3.2-7
[177] munsell_0.5.0               GetoptLong_1.0.3
[179] GenomeInfoDbData_1.2.3      iterators_1.0.12
[181] gtable_0.3.0