This notebook is a continuation of “s3 Preprocessing” and details the processing of s3ATAC libraries. This notebook starts with a merged, barcode-based removal of duplicate, >Q10 filtered bam file which was subset to barcodes >=10K unique reads.
#Initial directory structure (filtered to relevant files):
/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis
├── complexity_data
│ ├── Barnyard_ATAC_A.complexity.txt
│ ├── Barnyard_ATAC_B.complexity.txt
│ ├── Barnyard_ATAC_C.complexity.txt
│ ├── Barnyard_ATAC_D.complexity.txt
│ ├── Barnyard_ATAC_E.complexity.txt
├── filtered_bam
│ ├── Barnyard_ATAC_A.bbrd.q10.filt.bam
│ ├── Barnyard_ATAC_A.bbrd.q10.filt.cellIDs.list
│ ├── Barnyard_ATAC_B.bbrd.q10.filt.bam
│ ├── Barnyard_ATAC_B.bbrd.q10.filt.cellIDs.list
│ ├── Barnyard_ATAC_C.bbrd.q10.filt.bam
│ ├── Barnyard_ATAC_C.bbrd.q10.filt.cellIDs.list
│ ├── Barnyard_ATAC_D.bbrd.q10.filt.bam
│ ├── Barnyard_ATAC_D.bbrd.q10.filt.cellIDs.list
│ ├── Barnyard_ATAC_E.bbrd.q10.filt.bam
│ ├── Barnyard_ATAC_E.bbrd.q10.filt.cellIDs.list
├── s3atac_data
│ └── s3atac_barnyard.bbrd.q10.filt.bam
├── raw_fq
│ ├── Barnyard_ATAC_A.1.fq.gz
│ ├── Barnyard_ATAC_A.2.fq.gz
│ ├── Barnyard_ATAC_B.1.fq.gz
│ ├── Barnyard_ATAC_B.2.fq.gz
│ ├── Barnyard_ATAC_C.1.fq.gz
│ ├── Barnyard_ATAC_C.2.fq.gz
│ ├── Barnyard_ATAC_D.1.fq.gz
│ ├── Barnyard_ATAC_D.2.fq.gz
│ ├── Barnyard_ATAC_E.1.fq.gz
│ ├── Barnyard_ATAC_E.2.fq.gz
#Barnyard analysis to measure per cell collision rates and split merged bam by species for realignment
scitools barnyard-compare s3atac_barnyard.bbrd.q10.filt.bam &
#New files generated by barnyard-compare
s3atac_barnyard.bbrd.q10.filt.bam s3atac_barnyard.bbrd.q10.filt.barnyard_cells.plot.r
s3atac_barnyard.bbrd.q10.filt.barnyard_cells.plot.pdf s3atac_barnyard.bbrd.q10.filt.barnyard_cells.txt
s3atac_barnyard.bbrd.q10.filt.barnyard_cells.plot.png s3atac_barnyard.bbrd.q10.filt.barnyard_stats.txt
library(reshape2)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#read in indexes master file
idx<-read.table("/home/groups/oroaklab/src/scitools/scitools-dev/SCI_stdchem_Indexes.txt",col.names=c("idx_name","idx_cycles","idx_seq"))
idx_i7<-idx[idx$idx_cycles==1,]
colnames(idx_i7)<-c("i7_idx_name","i7_idx_cycle","i7_idx_seq")
idx_i5<-idx[idx$idx_cycles==2,]
colnames(idx_i5)<-c("i5_idx_name","i5_idx_cycle","i5_idx_seq")
idx_tn5<-idx[idx$idx_cycles==3,]
colnames(idx_tn5)<-c("tn5_idx_name","tn5_idx_cycle","tn5_idx_seq")
#read in barnyard output
barnyard<-read.table("./barnyard_analysis/s3atac_barnyard.bbrd.q10.filt.barnyard_cells.txt", header=F)
colnames(barnyard)<-c("cellID","total_reads_q20","hg38_q20","mm10_q20","perc_human","species_call")
barnyard$i7_idx_seq<-substr(barnyard$cellID,0,8)
barnyard$tn5_idx_seq<-substr(barnyard$cellID,17,24)
barnyard$i5_idx_seq<-substr(barnyard$cellID,9,16)
dat<-merge(barnyard,idx_i7,by="i7_idx_seq")
dat<-merge(dat,idx_tn5,by="tn5_idx_seq")
dat<-merge(dat,idx_i5,by="i5_idx_seq")
dat$pcr_plate<-substr(dat$i5_idx_name,5,5)
dat$i5_set<-paste0("CPT_",dat$pcr_plate)
#Per Plate breakdown of data contained as sheets in https://docs.google.com/spreadsheets/d/1mZ34KIwmr2vdjQlnqY7v_u0-Eca8mRc-HgI2r_WICXk/edit#gid=1305239528
#Not all plates are a proper barnyard mix.
i5_annot<-read.table("i5_pcr_annot.txt",sep="\t",header=T)
i7_annot<-read.table("i7_pcr_annot.txt",sep="\t",header=T)
dat<-merge(dat,i7_annot,by="i7_idx_seq")
dat<-merge(dat,i5_annot,by="i5_idx_seq")
#Plate 4 and 5 are only human (as noted in spreadsheet)
dat[dat$i5_sample=="Human",]$i7_sample_designation<-"hum_cortex"
#Also based on FACS sorting data, Plate 1, Column 6 Row E was incorrectly double sorted (experimentor error) and will be excluded
#this is also reflected in high cell count in the well (28 cells) compared to average (~8 cells)
dat<-dat[!(dat$i7_plate_column=="6" & dat$i5_row=="E" & dat$i5_pcr_plate=="Plate_1"),]
#Also removing plate 2 (used to polymerase testing)
dat<-dat[!(dat$i5_pcr_plate=="Plate_2"),]
#True barnyard results (columns 7-11 only of plates 1 2 and 3)
true_by_cellstested<-nrow(dat[dat$i7_sample_designation =="equimolar_tagment_mix",])
true_by_cellspassed<-nrow(dat[dat$i7_sample_designation =="equimolar_tagment_mix" & (dat$species_call != "Mixed"),])
true_by_cellsfailed<-nrow(dat[dat$i7_sample_designation =="equimolar_tagment_mix" & (dat$species_call == "Mixed"),])
(true_by_cellsfailed/true_by_cellstested)*2
#0.05531295
#so 5.5% estimated collision rate (19/687)
#PostTagMixed Barnyard (column 12 only)
posttag_by_cellstested<-nrow(dat[dat$i7_sample_designation =="equimolar_posttag_mix",])
posttag_by_cellspassed<-nrow(dat[dat$i7_sample_designation =="equimolar_posttag_mix" & (dat$species_call == "Mouse"),])
((posttag_by_cellstested-posttag_by_cellspassed)/posttag_by_cellstested)*2
#0
#so 0%
#Plot true barnyard and posttag barnyard cells
library(ggplot2)
plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_tagment_mix",],aes(x=hg38_q20,y=mm10_q20,color=as.factor(species_call)))+geom_point()+xlab("hg38 Reads")+ylab("mm10 Reads")+theme_bw()
ggsave(plt,file="truebarnyard_readcount.pdf")
write.table(dat[dat$i7_sample_designation =="equimolar_tagment_mix",c("hg38_q20","mm10_q20","species_call")],file="SourceData_Fig2b.tsv",sep="\t",quote=F)
system("slack -F SourceData_Fig2b.tsv ryan_todo")
plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_tagment_mix",],aes(x=perc_human))+geom_histogram(bins=100)+xlab("Percent Human")+ylab("Cells")+theme_bw()
ggsave(plt,file="truebarnyard_perchuman.pdf")
#Post tagmentation barnyard
plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_posttag_mix",],aes(x=hg38_q20,y=mm10_q20,color=as.factor(species_call)))+geom_point()+xlab("hg38 Reads")+ylab("mm10 Reads")+theme_bw()
ggsave(plt,file="posttagbarnyard_readcount.pdf")
write.table(dat[dat$i7_sample_designation =="equimolar_posttag_mix",c("hg38_q20","mm10_q20","species_call")],file="SourceData_Fig2c.tsv",sep="\t",quote=F)
system("slack -F SourceData_Fig2c.tsv ryan_todo")
plt<-ggplot(dat[dat$i7_sample_designation =="equimolar_posttag_mix",],aes(x=perc_human))+geom_histogram(bins=100)+xlab("Percent Human")+ylab("Cells")+theme_bw()
ggsave(plt,file="posttagbarnyard_perchuman.pdf")
#Make annot files for apriori_species, called_species, plate_volume, barnyard_condition
dat$species_apriori<-as.character(dat$i7_sample_designation)
dat[dat$i7_sample_designation %in% c("equimolar_tagment_mix","equimolar_posttag_mix"),]$species_apriori<-"barnyard"
#apriori species
write.table(dat[c("cellID","species_apriori")],file="barnyard_apriorispecies.annot",sep="\t",quote=F,col.names=F,row.names=F)
#called species
write.table(dat[c("cellID","species_call")],file="barnyard_calledspecies.annot",sep="\t",quote=F,col.names=F,row.names=F)
#PCR Plate
write.table(dat[c("cellID","pcr_plate")],file="barnyard_pcrplate.annot",sep="\t",quote=F,col.names=F,row.names=F)
#Plate Volume
write.table(dat[c("cellID","i5_plate_condition")],file="barnyard_platevolume.annot",sep="\t",quote=F,col.names=F,row.names=F)
#Barnyard Condition
write.table(dat[c("cellID","i7_sample_designation")],file="barnyard_condition.annot",sep="\t",quote=F,col.names=F,row.names=F)
#Freshvfrozen
write.table(dat[c("cellID","i5_frozen")],file="barnyard_storagecondition.annot",sep="\t",quote=F,col.names=F,row.names=F)
#PCR_pol
write.table(dat[c("cellID","PCR_Pol")],file="barnyard_polymerase.annot",sep="\t",quote=F,col.names=F,row.names=F)
#Full Data Write Out
write.table(dat,file="barnyard_full_cell_summary.tsv",sep="\t",quote=F,col.names=T,row.names=F)
Split Human and mouse cells at the fastq level based on the barnyard output. Then processed in parallel.
Further processing after human and mouse cells were split
#Merge original fastqs, and split original fastqs by called species for downstream processing
cat /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/raw_fq/Barnyard_ATAC*.1.fq.gz > /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/Barnyard_ATAC.merged.1.fq.gz &
cat /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/raw_fq/Barnyard_ATAC*.2.fq.gz > /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/Barnyard_ATAC.merged.2.fq.gz &
scitools fastq-split -X -A ../barnyard_calledspecies.annot Barnyard_ATAC.merged.1.fq.gz Barnyard_ATAC.merged.2.fq.gz &
awk '{if($2="Human") print $1}' barnyard_calledspecies.annot > hg38.cellID.list
awk '{if($2="Mouse") print $1}' barnyard_calledspecies.annot > mm10.cellID.list
#realign to reference genomes with subset cellIDs
scitools fastq-align -t 10 -r 10 hg38 hg38 barnyard_calledspecies.Human.1.fq.gz barnyard_calledspecies.Human.2.fq.gz &
scitools fastq-align -t 10 -r 10 mm10 mm10 barnyard_calledspecies.Mouse.1.fq.gz barnyard_calledspecies.Mouse.2.fq.gz &
#Project out bam files to look for maximum complexity
scitools bam-project hg38.bam &
scitools bam-project mm10.bam &
#Performing bbrd and changing file name to be simplified.
scitools bam-rmdup hg38.bam &
scitools bam-rmdup mm10.bam &
#Calling peaks on human and mouse cells
scitools callpeaks hg38.bbrd.q10.bam &
scitools callpeaks mm10.bbrd.q10.bam &
# 304049 hg38.bbrd.q10.500.bed
# 181791 mm10.bbrd.q10.500.bed
#Set up cellID by peak count sparse matrix
scitools atac-count hg38.bbrd.q10.bam hg38.bbrd.q10.500.bed &
scitools atac-count mm10.bbrd.q10.bam mm10.bbrd.q10.500.bed &
#TSS enrichment calculation
module load bedops/2.4.36
scitools bam-tssenrich -X hg38.bbrd.q10.bam hg38 &
scitools bam-tssenrich -X -E hg38.bbrd.q10.bam hg38 &
scitools bam-tssenrich -X -E mm10.bbrd.q10.bam mm10 &
scitools bam-tssenrich -X mm10.bbrd.q10.bam mm10 &
#Complexity plotting (split bam into single cells then project out)
#for hg38, splitting bam into multiples of 1000 unique barcodes for faster processing and limiting IO
split --additional-suffix=.cellID.list hg38.cellID.list hg38. #default is to split to 1000 lines (so 1000 cells)
for i in hg38.??.cellID.list; do awk 'OFS="\t"{split(FILENAME,a,"[.]");print $1,a[2]}' $i ; done > hg38.cellID.split.annot # awk to concatenate split into a scitools annot file
scitools bam-split -A hg38.cellID.split.annot hg38.bam & #split bam file
for i in hg38.??.bam; do scitools bam-addrg $i & done & # add RG
for i in hg38.??.RG.bam; do samtools split -@8 -f './hg38_split/%*_%!.%.' $i ; done & #perform samtools split on RG (cellIDs)
cd hg38_split
for i in hg38*bam; do scitools bam-project -r 1000 -X -e $i; done & #perform complexity projection on 1000 read intervals
#repeat with mm10, but skip the bulk bam splitting (since cell count is lower)
scitools bam-addrg mm10.bam &
mkdir mm10_split
samtools split -@20 -f './mm10_split/%*_%!.%.' mm10.RG.bam &
cd mm10_split
for i in mm10*bam; do scitools bam-project -r 1000 -X -e $i; done &
#Grab single cell read projections
for i in ./mm10_split/*.read_projections;
do cellid=${i:21:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid}' $i/cell_summaries.txt;
done > ./mm10_projected_reads.txt
for i in ./hg38_split/*.read_projections;
do cellid=${i:24:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid}' $i/cell_summaries.txt;
done > ./hg38_projected_reads.txt #change in cellid string slicing to account for added "split bam identifier"
#isize distribution
scitools isize hg38.bbrd.q10.bam &
scitools isize mm10.bbrd.q10.bam &
awk 'OFS="\t" {print $1,"hg38"}' hg38.bbrd.q10.isize.values > SourceData_Fig2E.txt
awk 'OFS="\t" {print $1,"mm10"}' mm10.bbrd.q10.isize.values >> SourceData_Fig2E.txt
sort -k2,2 -k1,1n -T . --parallel=20 SourceData_Fig2E.txt | uniq -c > SourceData_Fig2E.counts.txt
Going to compress file by making a frequency table instead of just a raw list
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
dat<-read.table("SourceData_Fig2E.txt")
#Plotting hg38 tss enrichment
library(ggplot2)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
hg38_rangeR <- read.table("hg38.bbrd.q10.bulkTSSdist_1000.txt")
colnames(hg38_rangeR)<-c("distance","count")
hg38_rangeR$species<-"hg38"
mm10_rangeR <- read.table("mm10.bbrd.q10.bulkTSSdist_1000.txt")
colnames(mm10_rangeR)<-c("distance","count")
mm10_rangeR$species<-"mm10"
dat<-rbind(hg38_rangeR,mm10_rangeR)
ggplot(dat,aes(x=distance,y=count))+geom_area()+theme_bw()+facet_grid(species ~. )
ggsave("s3atac_bulk_TSS.pdf")
write.table(dat[,c("distance","count","species")],file="SourceData_Fig2f.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig2f.tsv ryan_todo")
Tabix file format is a tab separated multicolumn data structure.
Column Number | Name | Description |
---|---|---|
1 | chrom | Reference genome chromosome of fragment |
2 | chromStart | Adjusted start position of fragment on chromosome. |
3 | chromEnd | Adjusted end position of fragment on chromosome. The end position is exclusive, so represents the position immediately following the fragment interval. |
4 | barcode | The 10x (or sci) cell barcode of this fragment. This corresponds to the CB tag attached to the corresponding BAM file records for this fragment. |
5 | duplicateCount | The number of PCR duplicate read pairs observed for this fragment. Sequencer-created duplicates, such as Exclusion Amp duplicates created by the NovaSeq instrument are excluded from this count. |
#human processing
input_bam="hg38.bbrd.q10.bam"
output_name=${input_bam::-13}
tabix="/home/groups/oroaklab/src/cellranger-atac/cellranger-atac-1.1.0/miniconda-atac-cs/4.3.21-miniconda-atac-cs-c10/bin/tabix"
bgzip="/home/groups/oroaklab/src/cellranger-atac/cellranger-atac-1.1.0/miniconda-atac-cs/4.3.21-miniconda-atac-cs-c10/bin/bgzip"
samtools view --threads 10 $input_bam | awk 'OFS="\t" {split($1,a,":"); print $3,$4,$8,a[1],1}' | sort -S 2G -T . --parallel=30 -k1,1 -k2,2n -k3,3n | $bgzip > $output_name.fragments.tsv.gz; wait ;
$tabix -p bed $output_name.fragments.tsv.gz &
#mouse
input_bam="mm10.bbrd.q10.bam"
output_name=${input_bam::-13}
samtools view --threads 10 $input_bam | awk 'OFS="\t" {split($1,a,":"); print $3,$4,$8,a[1],1}' | sort -S 2G -T . --parallel=10 -k1,1 -k2,2n -k3,3n | $bgzip > $output_name.fragments.tsv.gz
$tabix -p bed $output_name.fragments.tsv.gz &
Split out single-cell bams from deduplicated bams
mkdir single_cell_splits
#Add read group for cellID specific splitting
for i in *bbrd.q10.bam ; do scitools bam-addrg $i & done &
#mm10 is small enough to split as is
samtools split -@20 -f './single_cell_splits/%*_%!.full.%.' mm10.bbrd.q10.RG.bam & #perform samtools split on RG (cellIDs)
#for human too many cells for simultaneous splitting (I/O errors), going to make two temp bam files with limited RG headers
samtools view -H hg38.bbrd.q10.RG.bam | grep "^@RG" | awk '{split($2,a,":"); print a[2]}' | split --lines=500 --additional-suffix=".cellID.subset.temp.cellID.list" #split RG into 500 cell chunks
for i in *.cellID.subset.temp.cellID.list; do outname=${i::-26} ; scitools bam-filter -L $i -O hg38.bbrd.q10.RG.${outname}.temp hg38.bbrd.q10.RG.bam & done & #split bam files by RG lists
#samtools split opens as many IO streams as rg in header, so need to fix headers with just rg in new split bams
#reheader bam files
for i in *.cellID.subset.temp.cellID.list;
do outname=${i::-26} ;
samtools view -H hg38.bbrd.q10.RG.${outname}.temp.filt.bam | grep -v "^@RG" - > temp.header;
awk 'OFS="\t" {print "@RG","ID:"$1,"SM:"$1,"LB:"$i,"PL:SCI"}' $i >> temp.header ;
samtools reheader -c "cat temp.header" hg38.bbrd.q10.RG.${outname}.temp.filt.bam > hg38.bbrd.q10.RG.${outname}.temp.rehead.bam; done &
#finally split out single cells
for i in *temp.rehead.bam; do samtools split -@20 -f './single_cell_splits/%*_%!.full.%.' $i; done & #perform samtools split on RG (cellIDs)
Split out single-cell bams from raw bams for projections
#Add read group for cellID specific splitting
for i in mm10.bam hg38.bam ; do scitools bam-addrg $i & done &
#mm10 is small enough to split as is
samtools split -@20 -f './single_cell_splits/%*_%!.prededup.%.' mm10.RG.bam & #perform samtools split on RG (cellIDs)
#for human too many cells for simultaneous splitting (I/O errors), going to make two temp bam files with limited RG headers
samtools view -H hg38.RG.bam | grep "^@RG" | awk '{split($2,a,":"); print a[2]}' | split --lines=500 --additional-suffix=".cellID.subset.temp.cellID.list" #split RG into 500 cell chunks
for i in *.cellID.subset.temp.cellID.list; do outname=${i::-26} ; scitools bam-filter -L $i -O hg38.RG.${outname}.temp hg38.RG.bam & done & #split bam files by RG lists
#samtools split opens as many IO streams as rg in header, so need to fix headers with just rg in new split bams
#reheader bam files
for i in *.cellID.subset.temp.cellID.list;
do outname=${i::-26} ;
samtools view -H hg38.RG.${outname}.temp.filt.bam | grep -v "^@RG" - > temp.header;
awk 'OFS="\t" {print "@RG","ID:"$1,"SM:"$1,"LB:"$i,"PL:SCI"}' $i >> temp.header ;
samtools reheader -c "cat temp.header" hg38.RG.${outname}.temp.filt.bam > hg38.RG.${outname}.temp.rehead.bam; done &
#finally split out single cells
for i in *temp.rehead.bam; do samtools split -@20 -f './single_cell_splits/%*_%!.prededup.%.' $i; done & #perform samtools split on RG (cellIDs)
Process single-cell bams
cd single_cell_splits
mkdir single_dedup_bams
mkdir single_prededup_bams
mv *full.bam single_dedup_bams/
mv *prededup.bam single_prededup_bams/
cd single_prededup_bams
#parallelization of picard tools estimate library complexity, running on j=30 cores
parallel -j 30 "java -jar /home/groups/oroaklab/src/picard/picard-tools-1.70/EstimateLibraryComplexity.jar I={} O={}.picard.metrics" ::: **.bam
Process single-cell pre-deduplicated bams
cd single_dedup_bams
#Subsample by percentage
for j in 005 01 02 05 10 15 20 50 100; #2 = 20%, 4= 40% etc.
do for i in *.full.bam;
do samtools view -@ 10 -h -s 0.${j} $i > ${i::-9}.${j}perc.bam ;done ; done &
#Alternative approach by read count, ended up not using
#Subsample by read count
#for j in 1000 5000 10000 25000;
#do for i in *full.bam; do ((samtools view -H $i) & (samtools view -@ 10 $i | shuf -n $j)) | samtools view -@ 10 -bS - > ${i::-9}.${j}.bam ; done; done &
#remove bam files that dont hit the subsample amount
#for j in 1000 5000 10000 25000;
#do for i in *${j}.bam;
#do readcount=`samtools view $i | wc -l`;
#if (("$readcount" < "$j")) ; then rm -f $i; fi; done; done &
#merge bam files together and remove single-cell bams for readability
#splitting tmp_list again to reduce IO then merging all
for i in hg38 mm10;
do for j in 005perc 01perc 02perc 05perc 10perc 15perc 20perc 50perc 100perc;
do ls ${i}*${j}.bam > tmp_list.txt; #generate temporary list
split -l 500 --additional-suffix=.${i}.${j}.tmp_list.txt tmp_list.txt; #split list into -l lines (cells)
for k in *.${i}.${j}.tmp_list.txt;
do tmp_name=(${k//./ })
samtools merge -@ 30 -b $k ${i}.${tmp_name[0]}.tmp.${j}.merged.bam; #first stage merging of 500 cells each
done;
samtools merge -@ 30 ${i}.${j}.merged.bam `ls ${i}.*.tmp.${j}.merged.bam`; #second stage merging
done ; done &
#remove single cell files
rm -rf *perc.bam &
rm -rf *0.bam &
rm -rf *full.bam &
rm -rf *tmp_list.txt
rm -rf *tmp*bam
for i in *perc*merged.bam;
do scitools callpeaks $i ; done &
# 40782 hg38.10000.merged.500.bed
# 3173 hg38.1000.merged.500.bed
# 75744 hg38.20perc.merged.500.bed
# 64892 hg38.25000.merged.500.bed
# 137381 hg38.40perc.merged.500.bed
# 94585 hg38.5000.merged.500.bed
# 184420 hg38.60perc.merged.500.bed
# 221442 hg38.80perc.merged.500.bed
# 16732 mm10.10000.merged.500.bed
# 519 mm10.1000.merged.500.bed
# 36837 mm10.20perc.merged.500.bed
# 26208 mm10.25000.merged.500.bed
# 71047 mm10.40perc.merged.500.bed
# 41218 mm10.5000.merged.500.bed
# 99236 mm10.60perc.merged.500.bed
# 154911 mm10.80perc.merged.500.bed
#for i in hg38*bam;
#do scitools atac-count -O ${i::-4}.fullpeakset $i ../../hg38.bbrd.q10.500.bed & done &
#for i in mm10*bam;
#do scitools atac-count -O ${i:-4}.fullpeakset $i ../../mm10.bbrd.q10.500.bed & done &
for i in hg38*bam;
do scitools atac-count -O ${i::-4} $i ${i::-4}.500.bed & done &
for i in mm10*bam;
do scitools atac-count -O ${i:-4} $i ${i::-4}.500.bed & done &
#Generate count of reads per bam
for i in *bam; do
samtools view $i | awk '{split($1,a,":");print a[1]}' | sort --parallel 10 | uniq -c > ${i::-4}.counts.txt; done &
Generate Seurat Objects of downsampled data.
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Matrix)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")
file_in<-list.files(pattern="*.fracOnTarget.values")
file_in<-substr(file_in,1,nchar(file_in)-20)
hg38_annotations<-GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
mm10_annotations<-GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
make_seurat_object<-function(x,genome="hg38"){
IN<-as.matrix(read.table(paste0(x,".counts.sparseMatrix.values.gz")))
IN<-sparseMatrix(i=IN[,1],j=IN[,2],x=IN[,3])
COLS<-read.table(paste0(x,".counts.sparseMatrix.cols.gz"))
colnames(IN)<-COLS$V1
ROWS<-read.table(paste0(x,".counts.sparseMatrix.rows.gz"))
row.names(IN)<-ROWS$V1
counts_mat<-IN # make counts matrix from sparse matrix
# extract gene annotations from EnsDb
if(startsWith(x,"hg38")){
annotations<-hg38_annotations
} else {
annotations<-mm10_annotations
}
# change to UCSC style
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- genome
#Generate ChromatinAssay Objects
chromatinassay <- CreateChromatinAssay(
counts = counts_mat,
genome=genome,
min.cells = 1,
annotation=annotations,
sep=c("_","_"))
#Create Seurat Objects
atac <- CreateSeuratObject(
counts = chromatinassay,
assay = "peaks")
#saving unprocessed SeuratObjects
saveRDS(atac,file=paste0(x,"_SeuratObject.Rds"))
}
lapply(file_in,make_seurat_object)
Perform cisTopic on each subset seurat object
This is a custom R script to be run in bash with argument 1 being the seurat objects. This is because cistopic seems to have a memory leak if run in R internally.
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(cisTopic)
set.seed(1234)
#Read in data and modify to monocle CDS file
#read in RDS file.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")
args = commandArgs(trailingOnly=TRUE)
######FUNCTIONS#####
cistopic_generation<-function(x){
#Perform cistopic on subclusters of data
atac_sub<-readRDS(x)
outname<-unlist(strsplit(x,"_"))[[1]]
cistopic_counts_frmt<-atac_sub$peaks@counts
row.names(cistopic_counts_frmt)<-sub("-", ":", row.names(cistopic_counts_frmt))
sub_cistopic<-cisTopic::createcisTopicObject(cistopic_counts_frmt)
sub_cistopic_models<-cisTopic::runWarpLDAModels(sub_cistopic,topic=c(10,20:30),nCores=11,addModels=FALSE)
saveRDS(sub_cistopic_models,file=paste(outname,"CisTopicObject.Rds",sep="."))
pdf(paste(outname,"model_selection.pdf",sep="_"))
par(mfrow=c(3,3))
sub_cistopic_models<- selectModel(sub_cistopic_models, type='derivative')
dev.off()
}
####################################
### Processing ###
#Running cistopic model generation on all subset data
print(paste("Running model for",args[1]))
#Running cistopic subclustering on all identified cell types
#lapply(seurat_objects[8:length(seurat_objects)], function(i) {cistopic_generation(x=i)})
cistopic_generation(x=args[1])
Using a bash loop for cistopic clustering.
for i in *SeuratObject.Rds;
do Rscript cistopic.loop.r $i; done &
#cistopic.loop.r is the above R code
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(cisTopic)
library(patchwork)
library(ggplot2)
set.seed(1234)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")
hg38_atac<-readRDS(file="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10_SeuratObject.Rds")
seurat_objects<-list.files(pattern="SeuratObject.Rds$")
topic_models<-list.files(pattern="CisTopicObject.Rds$")
#determine model count to use for each cell type
for (i in seurat_objects){
outname<-unlist(strsplit(i,"_"))[[1]]
system(paste0("slack -F ",outname,"_model_selection.pdf"," ryan_todo"))} #slack -F of ryan_todo just posts files from our cluster to a slack channel I have
#selecting topics based on derivative, making a named vector
topic_count_list<-c(24,24,25,26,26,28,30,28,23,30,30,23,
24,30,27,28,30,30,28,28,23,26,26,24)
names(topic_count_list)<-topic_models
#UMAP Projection and clustering on selected cistopic model
clustering_loop<-function(topicmodel,topiccount,seuratobject,genome_name="hg38",outname.){
outname<-outname.
object_input<-readRDS(seuratobject)
object_input$cellID<-row.names(object_input@meta.data)
#select_topic
models_input<-readRDS(topicmodel)
counts<-read.table(paste0(outname,".counts.txt"),header=F)
colnames(counts)<-c("subsamp_counts","cellID")
row.names(counts)<-counts$cellID
avg_read_count<-mean(counts$subsamp_counts)
cisTopicObject<-cisTopic::selectModel(models_input,select=topiccount,keepModels=F)
#perform UMAP on topics
topic_df<-as.data.frame(cisTopicObject@selected.model$document_expects)
row.names(topic_df)<-paste0("Topic_",row.names(topic_df))
dims<-as.data.frame(uwot::umap(t(topic_df),n_components=2))
row.names(dims)<-colnames(topic_df)
colnames(dims)<-c("x","y")
dims$cellID<-row.names(dims)
#get cell embeddings
cell_embeddings<-as.data.frame(cisTopicObject@selected.model$document_expects)
colnames(cell_embeddings)<-cisTopicObject@cell.names
n_topics<-nrow(cell_embeddings)
row.names(cell_embeddings)<-paste0("topic_",1:n_topics)
cell_embeddings<-as.data.frame(t(cell_embeddings))
#get feature loadings
feature_loadings<-as.data.frame(cisTopicObject@selected.model$topics)
row.names(feature_loadings)<-paste0("topic_",1:n_topics)
feature_loadings<-as.data.frame(t(feature_loadings))
#combine with seurat object for celltype seuratobject
cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(cell_embeddings),loadings=as.matrix(feature_loadings),assay="peaks",key="topic_")
umap_dims<-as.data.frame(as.matrix(dims[,c("x","y")]))
colnames(umap_dims)<-c("UMAP_1","UMAP_2")
row.names(umap_dims)<-dims$cellID
cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(umap_dims),assay="peaks",key="UMAP_")
object_input@reductions$cistopic<-cistopic_obj
object_input@reductions$umap<-cistopic_umap
if(genome_name=="hg38"){
metadata<-as.data.frame(hg38_atac@meta.data)
}else {
metadata<-as.data.frame(mm10_atac@meta.data)
}
object_input$cellID<-row.names(object_input@meta.data)
object_input<-subset(object_input,cells=metadata$cellID)
object_input<-AddMetaData(object_input,metadata)
saveRDS(object_input,seuratobject)
plt<-DimPlot(object_input,group.by=c('seurat_clusters'))+ggtitle(paste(outname,avg_read_count))
ggsave(plt,file=paste(outname,"clustering.pdf",sep="_"))
return(plt)
}
#Running clustering loop
hg38_seuratobjects<-seurat_objects[which(startsWith(seurat_objects,"hg38"))]
mm10_seuratobjects<-seurat_objects[which(startsWith(seurat_objects,"mm10"))]
plt_list_hg38<-lapply(hg38_seuratobjects,function(i) {
outname_in<-unlist(strsplit(i,"_"))[[1]]
obj_in<-paste(outname_in,"SeuratObject.Rds",sep="_")
topic_in<-paste(outname_in,"CisTopicObject.Rds",sep=".")
topic_count=topic_count_list[topic_in]
clustering_loop(seuratobject=obj_in,topicmodel=topic_in,topiccount=topic_count,genome_name="hg38",outname.=outname_in)}
)
plt_list_mm10<-lapply(mm10_seuratobjects,function(i) {
outname_in<-unlist(strsplit(i,"_"))[[1]]
obj_in<-paste(outname_in,"SeuratObject.Rds",sep="_")
topic_in<-paste(outname_in,"CisTopicObject.Rds",sep=".")
topic_count=topic_count_list[topic_in]
clustering_loop(seuratobject=obj_in,topicmodel=topic_in,topiccount=topic_count,genome_name="mm10",outname.=outname_in)}
)
plt_list_out<-wrap_plots(plt_list_hg38,guides="collect")
ggsave(plt_list_out,file="downsample.hg38.clustering.pdf",height=30,width=30,limitsize=F)
system("slack -F downsample.hg38.clustering.pdf ryan_todo")
plt_list_out<-wrap_plots(plt_list_mm10,guides="collect")
ggsave(plt_list_out,file="downsample.mm10.clustering.pdf",height=30,width=30,limitsize=F)
system("slack -F downsample.mm10.clustering.pdf ryan_todo")
out_data<-lapply(c(hg38_seuratobjects,mm10_seuratobjects), function(i){
obj<-readRDS(i)
subset_amount<-unlist(lapply(strsplit(i,"[.]"),"[",2))
species<-unlist(lapply(strsplit(i,"[.]"),"[",1))
dat_out<-cbind(as.data.frame(obj@reductions$umap@cell.embeddings[names(obj$celltype),]),obj$celltype,subset_amount,species)
return(dat_out)
})
dat_out<-do.call("rbind",out_data)
write.table(dat_out,file="SourceData_SupFig1d.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_SupFig1d.tsv ryan_todo")
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Signac)
library(Seurat)
set.seed(1234)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/single_cell_splits/single_dedup_bams")
seurat_objects<-list.files(pattern="SeuratObject.Rds$")
#Read in data and modify to monocle CDS file
#read in RDS file.
cicero_processing<-function(x){
obj_in<-readRDS(x)
outname_in<-unlist(strsplit(x,"_"))[[1]]
#convert to CellDataSet format and make the cicero object
#Processing hg38 cells
obj.cds <- as.cell_data_set(x = obj_in,group_by="seurat_clusters")
obj.cicero <- make_cicero_cds( obj.cds, reduced_coordinates = reducedDims(obj.cds)$UMAP)
genome <- seqlengths(obj_in) # get the chromosome sizes from the Seurat object
genome.df <- data.frame("chr" = names(genome), "length" = genome) # convert chromosome sizes to a dataframe
conns <- run_cicero(obj.cicero, genomic_coords = genome.df) # run cicero
saveRDS(conns,paste0(outname_in,"_cicero_conns.Rds"))
ccans <- generate_ccans(conns) # generate ccans
saveRDS(ccans,paste0(outname_in,"_cicero_ccans.Rds"))
links <- ConnectionsToLinks(conns = conns, ccans = ccans) #Add connections back to Seurat object as links
Links(obj_in) <- links
saveRDS(obj_in,paste0(outname_in,"Seurat.cicero.Rds"))
}
for (i in seurat_objects){
tryCatch(
{
cicero_processing(i)
},
error=function(err){print(paste(i,"failed to generate model."))})
}
ccan_list<-list.files(pattern="_ccans.Rds$")
ccan_output<-list()
for (i in ccan_list){
ccan_in<-readRDS(i)
ccan_output[[i]]<-c(sample=i,peak_in_ccan=nrow(ccan_in),ccan_count=length(unique(ccan_in$CCAN)),average_peak_per_ccan=mean(table(ccan_in$CCAN)))
}
saveRDS(ccan_output,file="subsampling_ccanoutput.rds")
# generate unnormalized gene activity matrices
# gene annotation sample
#hg38
hg38_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
generate_ga<-function(x){
obj_in<-readRDS(x)
outname_in<-unlist(strsplit(x,"_"))[[1]]
conns<-readRDS(paste0(outname_in,"_cicero_conns.Rds"))
if(startsWith(outname_in,"hg38")){
annot<-hg38_annotations
} else {
annot<-mm10_annotations
}
pos <-as.data.frame(annot,row.names=NULL)
pos$chromosome<-paste0("chr",pos$seqnames)
pos$gene<-pos$gene_id
pos <- subset(pos, strand == "+")
pos <- pos[order(pos$start),]
pos <- pos[!duplicated(pos$tx_id),] # remove all but the first exons per transcript
pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS
neg <-as.data.frame(annot,row.names=NULL)
neg$chromosome<-paste0("chr",neg$seqnames)
neg$gene<-neg$gene_id
neg <- subset(neg, strand == "-")
neg <- neg[order(neg$start,decreasing=TRUE),]
neg <- neg[!duplicated(neg$tx_id),] # remove all but the first exons per transcript
neg$end <- neg$end + 1 # make a 1 base pair marker of the TSS
gene_annotation<- rbind(pos, neg)
gene_annotation <- gene_annotation[,c("chromosome","start","end","gene_name")] # Make a subset of the TSS annotation columns containing just the coordinates and the gene name
names(gene_annotation)[4] <- "gene" # Rename the gene symbol column to "gene"
obj.cds<-annotate_cds_by_site(as.cell_data_set(x = obj_in,group_by="seurat_clusters"), gene_annotation)
unnorm_ga<-build_gene_activity_matrix(obj.cds, conns)
saveRDS(unnorm_ga,paste0(outname_in,".unnorm_GA.Rds"))
obj_in[['GeneActivity']]<- CreateAssayObject(counts = as.data.frame(unnorm_ga))
# normalize
obj_in <- NormalizeData(
object = obj_in,
assay = 'GeneActivity',
normalization.method = 'LogNormalize',
scale.factor = median(obj_in$nCount_peaks)
)
saveRDS(obj_in,x)
}
cicero_objects<-list.files(pattern="Seurat.cicero.Rds")
lapply(cicero_objects,generate_ga)
samtools view -b -f 2 atac_v1_adult_brain_fresh_5k_possorted_bam.bam | awk ‘{print $19}’ | sort –parallel=20 -T . | uniq -c > 10x_genomics_uniquereads.txt & |
```R
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/barnyard_analysis")
dat_10x<-read.csv("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/10x_atac_v1_adult_brain_fresh_5k_singlecell.csv")
dat_10x<-dat_10x[c("barcode","passed_filters")]
barcused_10x<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/10x_atac_v1_adult_brain_fresh_5k_barcodesused.txt")
dat_10x<-dat_10x[dat_10x$barcode %in% barcused_10x$V1,]
#summary(dat_10x$duplicate/dat_10x$total)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.05322 0.48545 0.52413 0.52090 0.56153 0.79080
colnames(dat_10x)<-c("cellID","uniq_reads")
dat_10x$assay<-"10x_scATAC"
dat_biorad<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/mousebrain-master_dataframe.rds")
#summary(1-dat_biorad$duplicateProportion)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.051 0.160 0.190 0.255 0.328 0.796
dat_biorad_projected<-dat_biorad[c("DropBarcode","librarySize")]
colnames(dat_biorad_projected)<-c("cellID","uniq_reads")
dat_biorad_projected$assay<-"dscATAC_projected"
dat_biorad<-dat_biorad[c("DropBarcode","uniqueNuclearFrags")]
colnames(dat_biorad)<-c("cellID","uniq_reads")
dat_biorad$assay<-"dscATAC"
dat_ren<-read.csv("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/GSM2668124_p56.nchrM.merge.sel_cell.qc.csv",header=F)
colnames(dat_ren)<-c("cellID","uniq_reads","perc_uniq","frip")
#> summary(dat_ren$perc_uniq)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.2508 0.2643 0.2660 0.2682 0.2724 0.3334
dat_ren<-dat_ren[c("cellID","uniq_reads")]
dat_ren$assay<-"snATAC"
dat_sciatac<-read.table("/home/groups/oroaklab/adey_lab/projects/spatial/wholebrain/wholebrain.complexity.txt",header=F)
colnames(dat_sciatac)<-c("rowname_carryover","cellID","tot_reads","uniq_reads","perc_uniq")
dat_sciatac_condition<-read.table("/home/groups/oroaklab/adey_lab/projects/spatial/wholebrain/annots_wholebrain/wholebrain_FreshvFrozen.annot")
colnames(dat_sciatac_condition)<-c("cellID","condition")
dat_sciatac<-merge(dat_sciatac,dat_sciatac_condition,by="cellID")
dat_sciatac<-dat_sciatac[dat_sciatac$condition=="Frozen",] #filter by frozen condition
dat_sciatac_dims<-read.table("/home/groups/oroaklab/adey_lab/projects/spatial/wholebrain/wholebrain.bbrd.q10.filt.500.counts.cistopic.UMAP.dims")
colnames(dat_sciatac_dims)<-c("cellID","x","y") #filter by cells used in analysis (dims file are those which underwent clustering)
dat_sciatac<-dat_sciatac[c("cellID","uniq_reads")]
dat_sciatac<-dat_sciatac[dat_sciatac$cellID %in% dat_sciatac_dims$cellID,]
dat_sciatac$assay<-"sciatac"
dat_s3<-read.table("mm10.complexity.txt",header=F)
colnames(dat_s3)<-c("rowname_carryover","cellID","tot_reads","uniq_reads","perc_uniq")
dat_s3_pcrplate<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/barnyard_pcrplate.annot",header=F,col.names=c("cellID","pcr_plate"))
dat_s3<-merge(dat_s3,dat_s3_pcrplate,by="cellID")
#summarize percent unique per plate
library(dplyr)
dat_s3<-dat_s3[dat_s3$uniq_reads>=10000,]
data.frame(dat_s3 %>% group_by(pcr_plate) %>% summarise(mean_compl=mean(perc_uniq),
median_compl=median(perc_uniq),
sd=sd(perc_uniq),
mean_uniq=mean(uniq_reads),
median_uniq=median(uniq_reads),
sd_uniq=sd(uniq_reads),
cell_count=n()))
# pcr_plate mean_compl median_compl sd mean_uniq median_uniq sd_uniq cell_count
#1 B 71.48840 71.820 3.668856 46299.46 33502.0 39659.36 381
#2 C 36.31242 27.895 16.913973 264133.51 181248.5 269722.56 298
#3 D 66.16967 67.690 7.614098 43358.67 26490.0 48451.74 273
dat_s3_b<-dat_s3[dat_s3$pcr_plate=="B",c("cellID","uniq_reads")]
dat_s3_b$assay<-"s3ATAC_Plate_B"
dat_s3_c<-dat_s3[dat_s3$pcr_plate=="C",c("cellID","uniq_reads")]
dat_s3_c$assay<-"s3ATAC_Plate_C"
dat_10x<-dat_10x[dat_10x$uniq_reads>=1000,]
dat<-rbind(dat_10x,dat_ren,dat_biorad,dat_s3_c) #dat_sciatac
dat$assay = factor(dat$assay, levels=c("snATAC", "10x_scATAC","dscATAC","s3ATAC_Plate_C")) #"sciatac",
write.table(dat[,c("assay","uniq_reads")],file="SourceData_Fig2d.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig2d.tsv ryan_todo")
library(ggplot2)
ggplot(dat,aes(x=as.factor(assay),y=log10(uniq_reads),color=as.factor(assay)))+geom_boxplot()+theme_bw()+theme(axis.text.x = element_text(angle = 60, hjust=1))
ggsave("adultmusbrain_atacprotocol_comparisons.svg")
ggsave("adultmusbrain_atacprotocol_comparisons.pdf")
ggsave("adultmusbrain_atacprotocol_comparisons.png")
system("slack -F adultmusbrain_atacprotocol_comparisons.pdf ryan_todo")
i="s3ATAC_Plate_C"
for (j in unique(dat$assay)){
ttemp<-t.test(dat[dat$assay==i,]$uniq_reads,dat[dat$assay==j,]$uniq_reads,alternative="greater")
print(paste(i,j,ttemp$p.value,ttemp$estimate[1],ttemp$estimate[2],ttemp$estimate[1]/ttemp$estimate[2],ttemp$alternative,ttemp$method))
}
#[1] "s3ATAC_Plate_C,10x_scATAC,3.13338819899197e-40,264133.506711409,22715.9052708283,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,snATAC,5.7106611661251e-42,264133.506711409,15459.4017798286,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,dscATAC,1.5654951763642e-37,264133.506711409,34045.7355797055,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,sciatac,9.87044162982429e-43,264133.506711409,12263.8299081767,greater,Welch Two Sample t-test"
#[1] "s3ATAC_Plate_C,s3ATAC_Plate_C,0.5,264133.506711409,264133.506711409,greater,Welch Two Sample t-test"
#snATAC peaks
wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2668nnn/GSM2668124/suppl/GSM2668124_p56.nchrM.merge.sel_cell.ygi.txt.gz
gzip -d GSM2668124_p56.nchrM.merge.sel_cell.ygi.txt.gz
awk 'OFS="\t" {print $0}' GSM2668124_p56.nchrM.merge.sel_cell.ygi.txt > snatac_peaks.bed
wc -l snatac_peaks.bed
##140102 peaks
#10X peaks
wget https://cf.10xgenomics.com/samples/cell-atac/1.1.0/atac_v1_adult_brain_fresh_5k/atac_v1_adult_brain_fresh_5k_peaks.bed
##157797 peaks
#biorad peaks
wget https://raw.githubusercontent.com/buenrostrolab/dscATAC_analysis_code/master/mousebrain/data/mouseBrain_peaks.bed
awk 'OFS="\t" {print $0}' mouseBrain_peaks.bed | awk '(NR>1)' - > biorad_peaks.bed
##454047 peaks
#Getting counts of peak overlaps
wc -l /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed
#174653
bedtools intersect -wb -a /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed -b /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/snatac_peaks.bed | wc -l
#71199 of ours in theirs
bedtools intersect -wa -a /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed -b /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/atac_v1_adult_brain_fresh_5k_peaks.bed | wc -l
bedtools intersect -wa -a /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/mm10.bbrd.q10.500.bed -b /home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/biorad_peaks.bed | wc -l
Using R v4.0.0 and Signac v1.0
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Matrix)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#function to read in sparse matrix format from atac-count
read_in_sparse<-function(x){ #x is character file prefix followed by .bbrd.q10.500.counts.sparseMatrix.values.gz
IN<-as.matrix(read.table(paste0(x,".bbrd.q10.500.counts.sparseMatrix.values.gz")))
IN<-sparseMatrix(i=IN[,1],j=IN[,2],x=IN[,3])
COLS<-read.table(paste0(x,".bbrd.q10.500.counts.sparseMatrix.cols.gz"))
colnames(IN)<-COLS$V1
ROWS<-read.table(paste0(x,".bbrd.q10.500.counts.sparseMatrix.rows.gz"))
row.names(IN)<-ROWS$V1
return(IN)
}
hg38_counts<-read_in_sparse("hg38") # make hg38 counts matrix from sparse matrix
mm10_counts<-read_in_sparse("mm10") # make mm10 counts matrix from sparse matrix
#Read in fragment path for coverage plots
mm10_fragment.path="./mm10.fragments.tsv.gz"
hg38_fragment.path="./hg38.fragments.tsv.gz"
# extract gene annotations from EnsDb
hg38_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
# change to UCSC style
seqlevelsStyle(hg38_annotations) <- 'UCSC'
seqlevelsStyle(mm10_annotations) <- 'UCSC'
genome(hg38_annotations) <- "hg38"
genome(mm10_annotations) <- "mm10"
#Generate ChromatinAssay Objects
hg38_chromatinassay <- CreateChromatinAssay(
counts = hg38_counts,
genome="hg38",
min.cells = 1,
annotation=hg38_annotations,
sep=c("_","_"),
fragments=hg38_fragment.path
)
mm10_chromatinassay <- CreateChromatinAssay(
counts = mm10_counts,
genome="mm10",
min.cells = 1,
annotation=mm10_annotations,
sep=c("_","_"),
fragments=mm10_fragment.path
)
#Create Seurat Objects
hg38_atac <- CreateSeuratObject(
counts = hg38_chromatinassay,
assay = "peaks"
)
mm10_atac <- CreateSeuratObject(
counts = mm10_chromatinassay,
assay = "peaks"
)
#Meta.data to be updated after clustering
#saving unprocessed SeuratObjects
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
#renaming annot for simplified annotation file making
#rename processing_ processing. *annot
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
annot.files<-list.files(pattern="annot$")
annot.files<-annot.files[startsWith(annot.files,prefix="barnyard")]
annot_append<-read.table(annot.files[1],col.names=c("cellID",strsplit(annot.files[1],"[.]")[[1]][1]))
for (i in 2:length(annot.files)){
tmp<-read.table(annot.files[i],col.names=c("cellID",strsplit(annot.files[i],"[.]")[[1]][1]))
annot_append<-merge(annot_append,tmp,by="cellID")
}
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
hg38_atac@meta.data$cellID<-row.names(hg38_atac@meta.data)
hg38_atac@meta.data$i5_idx_seq<-substr(hg38_atac@meta.data$cellID,9,16)
hg38_atac@meta.data$i7_idx_seq<-substr(hg38_atac@meta.data$cellID,0,8)
annot<-as.data.frame(hg38_atac@meta.data)
annot<-merge(annot,annot_append,by="cellID",all.x=T)
compl_hg38<-read.table("hg38.complexity.txt")
colnames(compl_hg38)<-c("row_carryover","cellID","total_reads","uniq_reads","perc_uniq")
annot<-merge(annot,compl_hg38,by="cellID")
hg38_fracontarget<-read.table("hg38.bbrd.q10.500.fracOnTarget.values")
colnames(hg38_fracontarget)<-c("cellID","frac_reads_in_peaks")
annot<-merge(annot,hg38_fracontarget,by="cellID")
row.names(annot)<-annot$cellID
hg38_atac@meta.data<-annot
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
mm10_atac@meta.data$cellID<-row.names(mm10_atac@meta.data)
mm10_atac@meta.data$i5_idx_seq<-substr(mm10_atac@meta.data$cellID,9,16)
mm10_atac@meta.data$i7_idx_seq<-substr(mm10_atac@meta.data$cellID,0,8)
annot<-as.data.frame(mm10_atac@meta.data)
annot<-merge(annot,annot_append,by="cellID",all.x=T)
compl_mm10<-read.table("mm10.complexity.txt")
colnames(compl_mm10)<-c("row_carryover","cellID","total_reads","uniq_reads","perc_uniq")
annot<-merge(annot,compl_mm10,by="cellID")
mm10_fracontarget<-read.table("mm10.bbrd.q10.500.fracOnTarget.values")
colnames(mm10_fracontarget)<-c("cellID","frac_reads_in_peaks")
annot<-merge(annot,mm10_fracontarget,by="cellID")
row.names(annot)<-annot$cellID
mm10_atac@meta.data<-annot
#Filtering cells to those with fraction of reads in peaks >0.2
mm10_atac <- subset(mm10_atac, subset = frac_reads_in_peaks >= 0.2)
hg38_atac <- subset(hg38_atac, subset = frac_reads_in_peaks >= 0.2)
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
#continued from above session
library(cisTopic)
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(Matrix)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
hg38_cistopic_counts_frmt<-hg38_atac$peaks@counts
mm10_cistopic_counts_frmt<-mm10_atac$peaks@counts
#renaming row names to fit granges expectation of format
row.names(hg38_cistopic_counts_frmt)<-sub("-", ":", row.names(hg38_cistopic_counts_frmt))
row.names(mm10_cistopic_counts_frmt)<-sub("-", ":", row.names(mm10_cistopic_counts_frmt))
#set up CisTopicObjects
hg38_atac_cistopic<-cisTopic::createcisTopicObject(hg38_cistopic_counts_frmt)
mm10_atac_cistopic<-cisTopic::createcisTopicObject(mm10_cistopic_counts_frmt)
#Run warp LDA on objects
hg38_atac_cistopic_models<-cisTopic::runWarpLDAModels(hg38_atac_cistopic,topic=c(5,10,20:30,40,50,55,60:70),nCores=27,addModels=FALSE)
mm10_atac_cistopic_models<-cisTopic::runWarpLDAModels(mm10_atac_cistopic,topic=c(5,10,20:30,40,50,55,60:70),nCores=27,addModels=FALSE)
#Saving all models for posterity
saveRDS(hg38_atac_cistopic_models,file="hg38_CisTopicObject.Rds")
saveRDS(mm10_atac_cistopic_models,file="mm10_CisTopicObject.Rds")
#Setting up topic count selection
pdf("hg38_atac_model_selection.pdf")
par(mfrow=c(3,3))
hg38_atac_cistopic_models <- selectModel(hg38_atac_cistopic_models, type='maximum')
hg38_atac_cistopic_models <- selectModel(hg38_atac_cistopic_models, type='perplexity')
hg38_atac_cistopic_models <- selectModel(hg38_atac_cistopic_models, type='derivative')
dev.off()
pdf("mm10_atac_model_selection.pdf")
par(mfrow=c(3,3))
mm10_atac_cistopic_models <- selectModel(mm10_atac_cistopic_models, type='maximum')
mm10_atac_cistopic_models <- selectModel(mm10_atac_cistopic_models, type='perplexity')
mm10_atac_cistopic_models <- selectModel(mm10_atac_cistopic_models, type='derivative')
dev.off()
hg38_atac_cistopic_models<-readRDS(file="hg38_CisTopicObject.Rds")
mm10_atac_cistopic_models<-readRDS(file="mm10_CisTopicObject.Rds")
#set topics based on derivative
#selected topics subject to change
mm10_selected_topic=27
hg38_selected_topic=24
mm10_cisTopicObject<-cisTopic::selectModel(mm10_atac_cistopic_models,select=mm10_selected_topic,keepModels=F)
hg38_cisTopicObject<-cisTopic::selectModel(hg38_atac_cistopic_models,select=hg38_selected_topic,keepModels=F)
#saving model selected RDS
saveRDS(hg38_cisTopicObject,file="hg38_CisTopicObject.Rds")
saveRDS(mm10_cisTopicObject,file="mm10_CisTopicObject.Rds")
#Read in cisTopic objects
hg38_cisTopicObject<-readRDS("hg38_CisTopicObject.Rds")
mm10_cisTopicObject<-readRDS("mm10_CisTopicObject.Rds")
#read in seurat format object
hg38_atac<-readRDS("hg38_SeuratObject.Rds")
mm10_atac<-readRDS("mm10_SeuratObject.Rds")
#run UMAP on topics
hg38_topic_df<-as.data.frame(hg38_cisTopicObject@selected.model$document_expects)
row.names(hg38_topic_df)<-paste0("Topic_",row.names(hg38_topic_df))
hg38_dims<-as.data.frame(uwot::umap(t(hg38_topic_df),n_components=2))
row.names(hg38_dims)<-colnames(hg38_topic_df)
colnames(hg38_dims)<-c("x","y")
hg38_dims$cellID<-row.names(hg38_dims)
hg38_dims<-merge(hg38_dims,hg38_atac@meta.data,by.x="cellID",by.y="row.names")
mm10_topic_df<-as.data.frame(mm10_cisTopicObject@selected.model$document_expects)
row.names(mm10_topic_df)<-paste0("Topic_",row.names(mm10_topic_df))
mm10_dims<-as.data.frame(uwot::umap(t(mm10_topic_df),n_components=2))
row.names(mm10_dims)<-colnames(mm10_topic_df)
colnames(mm10_dims)<-c("x","y")
mm10_dims$cellID<-row.names(mm10_dims)
mm10_dims<-merge(mm10_dims,mm10_atac@meta.data,by.x="cellID",by.y="row.names")
#plot heatmaps of topics
pdf("mm10_atac_cistopic_heatmap.pdf")
cellTopicHeatmap(mm10_cisTopicObject, method='Probability')
dev.off()
pdf("hg38_atac_cistopic_heatmap.pdf")
cellTopicHeatmap(hg38_cisTopicObject, method='Probability',use_raster=F)
dev.off()
#predictive measurements of topics
#hg38_pred.matrix <- predictiveDistribution(hg38_cisTopicObject)
#mm10_pred.matrix <- predictiveDistribution(mm10_cisTopicObject)
# Obtain signatures
#hg38_path_to_signatures <- '/home/groups/oroaklab/refs/hg38/tfbs/'
#hg38_HomerTF_signatures <- paste(hg38_path_to_signatures, list.files(hg38_path_to_signatures), sep='')
#mm10_path_to_signatures <- '/home/groups/oroaklab/refs/mm10/JASPAR2018_TF_Sites/bed_files/'
#mm10_JASPARTF_signatures <- paste(mm10_path_to_signatures, list.files(mm10_path_to_signatures), sep='')
#get a list of files in directory and strsplit into labels
#hg38_labels <- unlist(lapply(strsplit(list.files(hg38_path_to_signatures),"[.]"),"[",1))
#mm10_labels <- unlist(lapply(strsplit(list.files(mm10_path_to_signatures),"[.]"),"[",1))
#get signature region scores for topics
#hg38_cisTopicObject <- getSignaturesRegions(hg38_cisTopicObject, hg38_HomerTF_signatures, labels=hg38_labels, minOverlap = 0.4)
#mm10_cisTopicObject <- getSignaturesRegions(mm10_cisTopicObject, mm10_JASPARTF_signatures, labels=mm10_labels, minOverlap = 0.4)
#get regions from topics
#hg38_cisTopicObject <- getRegionsScores(hg38_cisTopicObject, method='NormTop', scale=TRUE)
# mm10_cisTopicObject <- getRegionsScores(mm10_cisTopicObject, method='NormTop', scale=TRUE)
#plot tfbs predictions to topics
#pdf("hg38_atac_cistopic_SignatureRegions.pdf",height=25,width=25)
#signaturesHeatmap(hg38_cisTopicObject,row_names_gp=gpar(fontsize=5))
#dev.off()
#pdf("mm10_atac_cistopic_SignatureRegions.pdf",height=25,width=25)
#signaturesHeatmap(mm10_cisTopicObject,row_names_gp=gpar(fontsize=5))
#dev.off()
#plot general annotations to topics (this still needs to be run)
#library(org.Hs.eg.db)
#library(org.Mm.eg.db)
#library(TxDb.Hsapiens.UCSC.hg38.knownGene)
#library(TxDb.Mmusculus.UCSC.mm10.knownGene)
#hg38_cisTopicObject <- annotateRegions(hg38_cisTopicObject, txdb=TxDb.Hsapiens.UCSC.hg38.knownGene, annoDb='org.Hs.eg.db')
#pdf("hg38_atac_cistopic_SignatureRegions.pdf")
#signaturesHeatmap(hg38_cisTopicObject, selected.signatures = 'annotation')
#dev.off()
#mm10_cisTopicObject <- annotateRegions(mm10_cisTopicObject, txdb=TxDb.Mmusculus.UCSC.mm10.knownGene, annoDb='org.Mm.eg.db')
#pdf("mm10_atac_cistopic_SignatureRegions.pdf")
#signaturesHeatmap(mm10_cisTopicObject, selected.signatures = 'annotation')
#dev.off()
# Compute cell rankings
#library(AUCell)
#pdf("orgo_atac_cistopic_AUCellRankings.pdf")
#aucellRankings <- AUCell_buildRankings(pred.matrix, plot=TRUE, verbose=FALSE)
#dev.off()
# Check signature enrichment in cells
#pdf("orgo_atac_cistopic_sigantureCellEnrichment.pdf")
#cisTopicObject <- signatureCellEnrichment(cisTopicObject, aucellRankings, selected.signatures='all', aucMaxRank = 0.1*nrow(aucellRankings), plot=TRUE)
#dev.off()
#pdf("orgo_atac_cistopic_gammafit_regions.pdf")
#par(mfrow=c(2,5))
#cisTopicObject <- binarizecisTopics(cisTopicObject, thrP=0.975, plot=TRUE)
#dev.off()
# #saving model selected RDS
# saveRDS(hg38_cisTopicObject,file="hg38_CisTopicObject.Rds")
# saveRDS(mm10_cisTopicObject,file="mm10_CisTopicObject.Rds")
#bin_hg38_cisTopicObject<-binarizecisTopics(hg38_cisTopicObject)
# getBedFiles(bin_hg38_cisTopicObject, path='hg38_cisTopic_beds')
#bin_mm10_cisTopicObject<-binarizecisTopics(mm10_cisTopicObject)
# getBedFiles(bin_mm10_cisTopicObject, path='mm10_cisTopic_beds')
#Read in cisTopic objects
# hg38_cisTopicObject<-readRDS("hg38_CisTopicObject.Rds")
# mm10_cisTopicObject<-readRDS("mm10_CisTopicObject.Rds")
#Add cell embeddings into seurat
hg38_cell_embeddings<-as.data.frame(hg38_cisTopicObject@selected.model$document_expects)
colnames(hg38_cell_embeddings)<-hg38_cisTopicObject@cell.names
hg38_n_topics<-nrow(hg38_cell_embeddings)
row.names(hg38_cell_embeddings)<-paste0("topic_",1:hg38_n_topics)
hg38_cell_embeddings<-as.data.frame(t(hg38_cell_embeddings))
mm10_cell_embeddings<-as.data.frame(mm10_cisTopicObject@selected.model$document_expects)
colnames(mm10_cell_embeddings)<-mm10_cisTopicObject@cell.names
mm10_n_topics<-nrow(mm10_cell_embeddings)
row.names(mm10_cell_embeddings)<-paste0("topic_",1:mm10_n_topics)
mm10_cell_embeddings<-as.data.frame(t(mm10_cell_embeddings))
#Add feature loadings into seurat
hg38_feature_loadings<-as.data.frame(hg38_cisTopicObject@selected.model$topics)
row.names(hg38_feature_loadings)<-paste0("topic_",1:hg38_n_topics)
hg38_feature_loadings<-as.data.frame(t(hg38_feature_loadings))
mm10_feature_loadings<-as.data.frame(mm10_cisTopicObject@selected.model$topics)
row.names(mm10_feature_loadings)<-paste0("topic_",1:mm10_n_topics)
mm10_feature_loadings<-as.data.frame(t(mm10_feature_loadings))
#combined cistopic results (cistopic loadings and umap with seurat object)
hg38_cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(hg38_cell_embeddings),loadings=as.matrix(hg38_feature_loadings),assay="peaks",key="topic_")
hg38_umap_dims<-as.data.frame(as.matrix(hg38_dims[2:3]))
colnames(hg38_umap_dims)<-c("UMAP_1","UMAP_2")
row.names(hg38_umap_dims)<-hg38_dims$cellID
hg38_cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(hg38_umap_dims),assay="peaks",key="UMAP_")
hg38_atac@reductions$cistopic<-hg38_cistopic_obj
hg38_atac@reductions$umap<-hg38_cistopic_umap
mm10_cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(mm10_cell_embeddings),loadings=as.matrix(mm10_feature_loadings),assay="peaks",key="topic_")
mm10_umap_dims<-as.data.frame(as.matrix(mm10_dims[2:3]))
colnames(mm10_umap_dims)<-c("UMAP_1","UMAP_2")
row.names(mm10_umap_dims)<-mm10_dims$cellID
mm10_cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(mm10_umap_dims),assay="peaks",key="UMAP_")
mm10_atac@reductions$cistopic<-mm10_cistopic_obj
mm10_atac@reductions$umap<-mm10_cistopic_umap
hg38_n_topics<-ncol(Embeddings(hg38_atac,reduction="cistopic"))
hg38_atac <- FindNeighbors(
object = hg38_atac,
reduction = 'cistopic',
dims = 1:hg38_n_topics
)
hg38_atac <- FindClusters(
object = hg38_atac,
verbose = TRUE,
resolution=0.3
)
mm10_n_topics<-ncol(Embeddings(mm10_atac,reduction="cistopic"))
mm10_atac <- FindNeighbors(
object = mm10_atac,
reduction = 'cistopic',
dims = 1:mm10_n_topics
)
mm10_atac <- FindClusters(
object = mm10_atac,
verbose = TRUE,
resolution=0.2
)
###save Seurat files
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
out_data<-lapply(c("hg38_SeuratObject.Rds","mm10_SeuratObject.Rds"), function(i){
obj<-readRDS(i)
species<-unlist(lapply(strsplit(i,"_"),"[",1))
dat_out<-cbind(as.data.frame(obj@reductions$umap@cell.embeddings[names(obj$celltype),]),obj$celltype,obj$seurat_clusters,species)
return(dat_out)
})
dat_out<-do.call("rbind",out_data)
write.table(dat_out,file="SourceData_Fig2h.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig2h.tsv ryan_todo")
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
dat_hg38<-hg38_atac@meta.data
dat_mm10<-mm10_atac@meta.data
library(ggplot2)
plt<-ggplot(dat_hg38,aes(x=barnyard_pcrplate,y=log10(uniq_reads)))+
geom_jitter()+
geom_boxplot(outlier.shape=NA)+
theme(axis.text.x = element_text(angle = 90))
ggsave(plt,file="hg38_complexity_boxplot.pdf")
ggsave(plt,file="hg38_complexity_boxplot.png")
plt<-ggplot(dat_mm10,aes(x=barnyard_pcrplate,y=log10(uniq_reads)))+
geom_jitter()+
geom_boxplot(outlier.shape=NA)+
theme(axis.text.x = element_text(angle = 90))
ggsave(plt,file="mm10_complexity_boxplot.pdf")
ggsave(plt,file="mm10_complexity_boxplot.png")
plt<-DimPlot(hg38_atac,group.by=c('seurat_clusters',
'barnyard_platevolume',
'barnyard_polymerase',
'barnyard_storagecondition',
'barnyard_pcrplate'
))
pdf("hg38.umap.pdf",width=20)
plt
dev.off()
ggsave(plt,file="hg38.umap.png",width=20)
plt<-DimPlot(mm10_atac,group.by=c('seurat_clusters',
'barnyard_platevolume',
'barnyard_polymerase',
'barnyard_storagecondition',
'barnyard_pcrplate'))
pdf("mm10.umap.pdf",width=20)
plt
dev.off()
ggsave(plt,file="mm10.umap.png",width=20)
dat_hg38$UMAP_1<-hg38_atac@reductions$umap@cell.embeddings[,1]
dat_hg38$UMAP_2<-hg38_atac@reductions$umap@cell.embeddings[,2]
dat_mm10$UMAP_1<-mm10_atac@reductions$umap@cell.embeddings[,1]
dat_mm10$UMAP_2<-mm10_atac@reductions$umap@cell.embeddings[,2]
write.table(dat_hg38,file="hg38_cell_summary.txt",col.names=T,row.names=T,sep="\t",quote=F)
write.table(dat_mm10,file="mm10_cell_summary.txt",col.names=T,row.names=T,sep="\t",quote=F)
Currently testing k argument in make_cicero_cds to see if lower cell count (using out higher reads) does a better job. Also need to make sure that it is assigning cells correctly while adding to the Seurat object. Genebody analysis looks like GA should be working better.
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
# convert to CellDataSet format and make the cicero object
#Processing hg38 cells
hg38_atac.cds <- as.cell_data_set(x = hg38_atac,group_by="seurat_clusters")
hg38_atac.cicero <- make_cicero_cds( hg38_atac.cds, reduced_coordinates = reducedDims(hg38_atac.cds)$UMAP,k=10)
genome <- seqlengths(hg38_atac) # get the chromosome sizes from the Seurat object
genome.df <- data.frame("chr" = names(genome), "length" = genome) # convert chromosome sizes to a dataframe
conns <- run_cicero(hg38_atac.cicero, genomic_coords = genome.df) # run cicero
saveRDS(conns,"hg38_cicero_conns.Rds")
ccans <- generate_ccans(conns) # generate ccans
saveRDS(ccans,"hg38_cicero_ccans.Rds")
links <- ConnectionsToLinks(conns = conns, ccans = ccans) #Add connections back to Seurat object as links
Links(hg38_atac) <- links
saveRDS(hg38_atac,file="hg38_SeuratObject.cicero.Rds")
#Processing mm10 cells
mm10_atac.cds <- as.cell_data_set(x = mm10_atac,group_by="seurat_clusters")
mm10_atac.cicero <- make_cicero_cds( mm10_atac.cds, reduced_coordinates = reducedDims(mm10_atac.cds)$UMAP,k=10)
genome <- seqlengths(mm10_atac) # get the chromosome sizes from the Seurat object
genome.df <- data.frame("chr" = names(genome), "length" = genome) # convert chromosome sizes to a dataframe
conns <- run_cicero(mm10_atac.cicero, genomic_coords = genome.df) # run cicero
saveRDS(conns,"mm10_cicero_conns.Rds")
ccans <- generate_ccans(conns) # generate ccans
saveRDS(ccans,"mm10_cicero_ccans.Rds")
links <- ConnectionsToLinks(conns = conns, ccans = ccans) #Add connections back to Seurat object as links
Links(mm10_atac) <- links
saveRDS(mm10_atac,file="mm10_SeuratObject.cicero.Rds")
hg38_atac<-readRDS(file="hg38_SeuratObject.cicero.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.cicero.Rds")
# generate unnormalized gene activity matrices
# gene annotation sample
#hg38
hg38_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
pos <-as.data.frame(hg38_annotations,row.names=NULL)
pos$chromosome<-paste0("chr",pos$seqnames)
pos$gene<-pos$gene_id
pos <- subset(pos, strand == "+")
pos <- pos[order(pos$start),]
pos <- pos[!duplicated(pos$tx_id),] # remove all but the first exons per transcript
pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS
neg <-as.data.frame(hg38_annotations,row.names=NULL)
neg$chromosome<-paste0("chr",neg$seqnames)
neg$gene<-neg$gene_id
neg <- subset(neg, strand == "-")
neg <- neg[order(neg$start,decreasing=TRUE),]
neg <- neg[!duplicated(neg$tx_id),] # remove all but the first exons per transcript
neg$end <- neg$end + 1 # make a 1 base pair marker of the TSS
gene_annotation<- rbind(pos, neg)
gene_annotation <- gene_annotation[,c("chromosome","start","end","gene_name")] # Make a subset of the TSS annotation columns containing just the coordinates and the gene name
names(gene_annotation)[4] <- "gene" # Rename the gene symbol column to "gene"
geneactivity_processing<-function(cds_input,conns_input,prefix){
atac.cds<- annotate_cds_by_site(cds_input, gene_annotation)
unnorm_ga <- build_gene_activity_matrix(atac.cds, conns_input)
saveRDS(unnorm_ga,paste(prefix,"unnorm_GA.Rds",sep="."))
}
conns<-as.data.frame(readRDS("hg38_cicero_conns.Rds"))
geneactivity_processing(cds_input=as.cell_data_set(x = hg38_atac,group_by="seurat_clusters"),conns_input=conns,prefix="hg38")
#mm10
mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
pos <-as.data.frame(mm10_annotations,row.names=NULL)
pos$chromosome<-paste0("chr",pos$seqnames)
pos$gene<-pos$gene_id
pos <- subset(pos, strand == "+")
pos <- pos[order(pos$start),]
pos <- pos[!duplicated(pos$tx_id),] # remove all but the first exons per transcript
pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS
neg <-as.data.frame(mm10_annotations,row.names=NULL)
neg$chromosome<-paste0("chr",neg$seqnames)
neg$gene<-neg$gene_id
neg <- subset(neg, strand == "-")
neg <- neg[order(neg$start,decreasing=TRUE),]
neg <- neg[!duplicated(neg$tx_id),] # remove all but the first exons per transcript
neg$end <- neg$end + 1 # make a 1 base pair marker of the TSS
gene_annotation<- rbind(pos, neg)
gene_annotation <- gene_annotation[,c("chromosome","start","end","gene_name")] # Make a subset of the TSS annotation columns containing just the coordinates and the gene name
names(gene_annotation)[4] <- "gene" # Rename the gene symbol column to "gene"
geneactivity_processing<-function(cds_input,conns_input,prefix){
atac.cds<- annotate_cds_by_site(cds_input, gene_annotation)
unnorm_ga <- build_gene_activity_matrix(atac.cds, conns_input)
saveRDS(unnorm_ga,paste(prefix,"unnorm_GA.Rds",sep="."))
}
conns<-as.data.frame(readRDS("mm10_cicero_conns.Rds"))
geneactivity_processing(cds_input=as.cell_data_set(x = mm10_atac,group_by="seurat_clusters"),conns_input=conns,prefix="mm10")
#Add to seurat Object as new Assay
#Read in unnormalized GA
#hg38
cicero_gene_activities<-readRDS("hg38.unnorm_GA.Rds")
hg38_atac[['GeneActivity']]<- CreateAssayObject(counts = as.data.frame(cicero_gene_activities))
# normalize
hg38_atac <- NormalizeData(
object = hg38_atac,
assay = 'GeneActivity',
normalization.method = 'LogNormalize',
scale.factor = median(hg38_atac$nCount_peaks)
)
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
#mm10
cicero_gene_activities<-readRDS("mm10.unnorm_GA.Rds")
row.names(cicero_gene_activities)<-gsub("-",".",row.names(cicero_gene_activities))
mm10_atac[['GeneActivity']]<- CreateAssayObject(counts = cicero_gene_activities[!(rownames(x = cicero_gene_activities) == ""),])
# normalize
mm10_atac <- NormalizeData(
object = mm10_atac,
assay = 'GeneActivity',
normalization.method = 'LogNormalize',
scale.factor = median(mm10_atac$nCount_peaks)
)
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
This is the signac default method of calculating gene activity
library(Signac)
library(Seurat)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#Read in data and modify to monocle CDS file
#read in RDS file.
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
gene.activities <- GeneActivity(mm10_atac,process_n=10000)
saveRDS(gene.activities,"mm10.signac.geneactivities.RDS")
# add the gene activity matrix to the Seurat object as a new assay and normalize it
mm10_atac[['SignacGA']] <- CreateAssayObject(counts = gene.activities)
mm10_atac <- NormalizeData(
object = mm10_atac,
assay = 'SignacGA',
normalization.method = 'LogNormalize',
scale.factor = median(mm10_atac$nCount_SignacGA)
)
saveRDS(mm10_atac,file="mm10_SeuratObject.signacGA.Rds")
library(Signac)
library(Seurat)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
gene.activities <- GeneActivity(hg38_atac,process_n=10000)
saveRDS(gene.activities,"hg38.signac.geneactivities.RDS")
# add the gene activity matrix to the Seurat object as a new assay and normalize it
hg38_atac[['SignacGA']] <- CreateAssayObject(counts = gene.activities)
hg38_atac <- NormalizeData(
object = hg38_atac,
assay = 'SignacGA',
normalization.method = 'LogNormalize',
scale.factor = median(hg38_atac$nCount_SignacGA)
)
saveRDS(hg38_atac,file="hg38_SeuratObject.signacGA.Rds")
#Making a set of marker lists for human cortex and mouse whole brain
#Human cortex marker list is from https://cells.ucsc.edu/?ds=allen-celltypes%2Fhuman-cortex&meta=clusterlabel
#And stored in https://docs.google.com/spreadsheets/d/15YwK2lWh018IyxgQsRxnrINxBFlG3XtU5kE8gArYxJ4/edit?usp=sharing
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/marker_sets")
marker_list<-NULL
marker_list[["VLMC"]]<-c('ITIH5', 'ABCA9', 'DCN', 'ATP1A2', 'SAT1', 'NEAT1', 'SLC7A11', 'ABCA8', 'APOD', 'NKD1', 'PTN', 'FBLN1', 'LAMC3', 'PDGFRB', 'CXCL12', 'AKR1C2', 'SVIL', 'ABCA6', 'COLEC12', 'CYP1B1', 'UACA', 'AHNAK', 'TPCN1', 'SLC1A3', 'ADAM33', 'LAMA2', 'COL18A1', 'CYTL1', 'SLC6A1', 'EDN3', 'ARHGAP29', 'COL4A5', 'DAB2', 'BGN', 'LEPR', 'PTGDS', 'OGN', 'WDR86', 'PRELP', 'ISLR', 'PDGFRA', 'MRC2', 'COL15A1', 'SPRY4', 'SELENBP1', 'RHOJ', 'KIAA1755', 'SASH1', 'LHFP', 'RPS12P5', 'KANK2')
marker_list[["Astrocyte"]]<-c('ADGRV1', 'SLC1A3', 'ATP1A2', 'C1orf61', 'FGFR3', 'SLC1A2', 'PTGDS', 'GLUL', 'NDRG2', 'PRODH', 'NTM', 'COL5A3', 'CST3', 'MACF1', 'PTPRZ1', 'ATP1B2', 'SLC25A18', 'ZBTB20', 'MT3', 'MT2A', 'ATP13A4', 'PON2', 'AQP4', 'DTNA', 'GRAMD3', 'NCAN', 'ACSS1', 'MSI2', 'GJA1', 'SLCO1C1', 'SLC7A11', 'PAPLN', 'F3', 'RYR3', 'PSD2', 'ETNPPL', 'MYO10', 'RAPGEF3', 'TTYH1', 'SOX2', 'EMX2', 'HIF3A', 'DST', 'APOE', 'PDGFRB', 'SFXN5', 'AASS', 'FNBP1', 'SLC4A4', 'GRIN2C', 'PLXNB1')
marker_list[["L5ET"]]<-c('COL24A1', 'ADRA1A', 'COL5A2', 'CRYM', 'GRIK2', 'DGKH', 'SULF2', 'ATP6V1C2', 'VAT1L', 'NEFM', 'SPHKAP', 'NRP1', 'SLC26A4', 'COL21A1', 'TOX', 'PDE1C', 'NTNG1', 'IQCA1', 'BCL11B', 'ANXA4', 'ARAP2', 'ADAMTSL3', 'FAM126A', 'LRP2', 'LOC101927745', 'SLC5A8', 'DOCK4', 'NEFH', 'PRUNE2', 'GRB14', 'GRM8', 'MYO16', 'FAM84B', 'ANKRD18A', 'TRDMT1', 'PCSK6', 'ANO4', 'SSTR2', 'ESRRG', 'MYLIP', 'VASH2', 'KLHL32', 'RYR3', 'ZNF189', 'SLC26A4-AS1', 'EPM2A', 'CNR1', 'ST3GAL1', 'LOC105374239', 'ANKRD34B', 'SEPT4')
marker_list[["L6CT"]]<-c('SEMA3E', 'TLE4', 'EGFEM1P', 'HS3ST4', 'SYT6', 'RYR3', 'ENPP7P4', 'DLC1', 'SEMA3A', 'MEIS2', 'COL24A1', 'DIP2A', 'PCDH17', 'RXFP1', 'LRP8', 'ANKRD18A', 'FOXP2', 'NRP1', 'SEMA5A', 'LRP1B', 'LOC105374199', 'ABCC9', 'SLC24A2', 'VWA2', 'SYNJ2', 'FAM95C', 'SLC8A1-AS1', 'ITGA11', 'SERPINE2', 'KLHL5', 'NFIA', 'CDH6', 'SPARCL1', 'CPE', 'CTGF', 'TENM1', 'ANKRD26P3', 'SEZ6L', 'GRIK3', 'MCC', 'SYNPO2', 'ZFHX3', 'SULF1', 'FILIP1', 'DGKG', 'LUZP2', 'ANKRD20A5P', 'ANKRD20A7P', 'DMD', 'PCLO', 'TMEFF2')
marker_list[["L4IT"]]<-c('NTNG1', 'HTR2A', 'VAV3', 'PCP4', 'TRPC3', 'CDR1', 'PCSK1', 'BHLHE22', 'PLEKHH2', 'ITM2A', 'BTBD3', 'SORL1', 'CUX2', 'VSTM2A', 'SLC17A6', 'POU6F2', 'MEF2A', 'SEMA6D', 'RORA', 'LOC105377864', 'LRRK2', 'NEFM', 'GPR26', 'KCNH8', 'FSTL1', 'MET', 'ACVR1C', 'LOC105375817', 'KCNS1', 'ND4', 'EGR1', 'RET', 'CD74', 'TIAM1', 'PTTG1', 'PHACTR2', 'TADA1', 'SLC38A11', 'SYT2', 'TMEM215', 'LOC101927653', 'RHBDL3', 'PRKCA', 'GRAMD3', 'ILDR2', 'BMP8A', 'ITPR2', 'ND2', 'TRAF5', 'RALB', 'AIFM3')
marker_list[["L56NP"]]<-c('HTR2C', 'CRYM', 'TSHZ2', 'PCP4', 'FGFR1', 'ROBO3', 'NPSR1-AS1', 'ENPP7P8', 'SEMA3E', 'TLE4', 'IFNG-AS1', 'ENPP7P7', 'RPS3AP34', 'COL11A1', 'ALCAM', 'PCDH17', 'ENPP7P4', 'NXPH2', 'CDH6', 'MYLIP', 'DIP2A', 'MEIS2', 'CALCRL', 'LRMP', 'KCNT2', 'FABP7', 'PHLDB2', 'SORCS2', 'ZNF385D', 'CD36', 'ITGA8', 'SEL1L3', 'DOCK4', 'ANKRD20A5P', 'SPOCK1', 'TLL1', 'SLC24A2', 'KLHL5', 'MYHAS', 'BCL11B', 'KCNIP1', 'NR4A3', 'LUZP2', 'COL9A1', 'CCDC80', 'TOX', 'TRHDE', 'VWC2L', 'FAM89A', 'ABCC9', 'LINC01279')
marker_list[["L6b"]]<-c('TLE4', 'CTGF', 'RXFP1', 'SEMA3E', 'ZFHX3', 'LOC100996635', 'ENPP7P7', 'EGFEM1P', 'NFIA', 'SEMA3A', 'DLC1', 'KLHL5', 'LOC401134', 'HS3ST4', 'DGKH', 'CDH9', 'NR4A2', 'PCSK5', 'ENPP7P8', 'RYR3', 'SEMA3D', 'ADD3', 'PCDH17', 'OLFML2B', 'HPCAL1', 'MDFIC', 'ITPR2', 'COL19A1', 'TMEFF2', 'SEMA3C', 'SEZ6L', 'PQLC3', 'ENPP7P4', 'CDH11', 'DOCK4', 'MGST1', 'TENM1', 'DPP4', 'TNIK', 'GLCE', 'SLC1A2', 'FILIP1', 'VWA5A', 'ARSJ', 'ROS1', 'LRP8', 'DGKG', 'CPLX3', 'SLC24A2', '43891', 'LRP1B')
marker_list[["PVALB"]]<-c('ERBB4', 'ZNF385D', 'TAC1', 'GAD1', 'SLIT2', 'KCNC2', 'GAD2', 'PVALB', 'KLHL5', 'KCNAB3', 'PTPRM', 'C8orf4', 'SAT1', 'SLC6A1', 'RGS5', 'LHX6', 'BTBD11', 'ANK1', 'RAB3IP', 'ZNF536', 'SOX6', 'SPARCL1', 'TENM1', 'LANCL1', 'LRP8', 'MYO16', 'KCNAB1', 'PAM', 'ZNF804A', 'NEAT1', 'NXPH1', 'OSBPL3', 'SLC9A9', 'SULF1', 'WIF1', 'CRHBP', 'RND3', 'LGI2', 'SEPT4', 'SERPINI1', 'ANO4', 'LOC105369345', 'KLF12', 'NPPC', 'RUNX2', 'DOCK11', 'SPOCK3', 'PLCE1', 'GRIA4', 'KCNS3', 'ARX')
marker_list[["VIP"]]<-c('CXCL14', 'SYNPR', 'RGS12', 'ERBB4', 'SCG2', 'VIP', 'GAD1', 'CALB2', 'ADARB2', 'THSD7A', 'CNR1', 'PROX1', 'ARL4C', 'DLX6-AS1', 'SLC6A1', 'DNER', 'NR2F2', 'FXYD6', 'ATP1B2', 'TIMP2', 'COL21A1', 'DOCK10', 'TAC3', 'CHRNA2', 'IGF1', 'PTHLH', 'SLC24A3', 'AP1S2', 'RGS16', 'DLX1', 'SHISA8', 'ZNF536', 'ADAM33', 'ADRA1A', 'ASIC4', 'SLC10A4', 'ROBO1', 'CRH', 'PCDH8', 'CNTNAP4', 'ENTPD3', 'HLA-A', 'SERINC1', 'GRIK2', 'ROBO2', 'PLD5', 'SEZ6', 'LOC105373643', 'HTRA1', 'KCNT2', 'PTPRE')
marker_list[["IT"]]<-c('ENC1', 'PTPRK', 'VSNL1', 'LINC00507', 'FAM13A', 'LOC101927745', 'CCK', 'TNNT2', 'ARAP2', 'LMF1', 'EPB41L2', 'ART3', 'HTR2A', 'ANKRD18B', 'LOC105374973', 'LOC101929974', 'LOC105373009', 'ARL4A', 'OTOGL', 'CRYM', 'RFX3', 'ADAMTS3', 'SLC38A11', 'NWD2', 'PTPN3', 'COL5A2', 'COL24A1', 'ID2', 'DMD', 'MYO5C', 'RFTN1', 'LOC102724736', 'LRRIQ1', 'ATP2B4', 'IQCA1', 'HSD3BP4''LOC105370610', 'RXFP1', 'LOC102724834', 'SYN3', 'ADCYAP1', 'DPYD', 'DGKA', 'LOC101929293', 'SMAD3', 'MEF2A', 'CNGB1', 'MYH9', 'TRABD2A', 'CA10', 'SYNJ2')
marker_list[["L56ITCAR3"]]<-c('RGS12', 'POSTN', 'NR4A2', 'SYNPR', 'SMYD1', 'ITGB8', 'FRAS1', 'ANXA1', 'OLFML2B', 'ITGA11', 'SPOCK1', 'ARAP2', 'GNB4', 'ABCC3', 'CCK', 'HSD3BP4', 'ENPP7P8', 'NWD2', 'THEMIS', 'MCTP2', 'FAP', 'SEMA6D', 'PDLIM5', 'EGFEM1P', 'GFRA1', 'MAMDC2', 'CD52', 'RXFP1', 'IL7R', 'GPR63', 'GNG7', 'CRABP1', 'ANXA5', 'TPMT', 'DGKH', 'B2M', 'SYNJ2', 'DCSTAMP', 'LOC105371828', 'LOC100421670', 'GALNT14', 'PRLR', 'HLA-L', 'MYOM2', 'SFMBT2', 'MREG', 'ATP10A', 'CD63', 'GSG1L', 'GAS2L3', 'SOCS2')
marker_list[["PAX6"]]<-c('CXCL14', 'RELN', 'CNR1', 'WIF1', 'AP1S2', 'GRIK2', 'CRH', 'CCK', 'ADARB2', 'DDR2', 'SP8', 'RGS12', 'HMBOX1', 'SLC6A1', 'ENTPD3', 'FXYD6', 'GAD1', 'SEZ6L', 'SPOCK3', 'ZNF385D', 'NR2F2', 'NXPH1', 'DOCK10', 'PNOC', 'TIMP2', 'SORCS3', 'MYO16', 'SCG2', 'ROBO2', 'PLS3', 'ZNF536', 'PCDH8', 'NECAB2', 'DNER', 'FSTL5', 'PAX6', 'CDH4', 'NPAS3', 'SERINC1', 'CPLX3', 'RBMS3', 'DLX6-AS1', 'SPOCK1', 'ARL4C', 'RAB3C', 'COL16A1', 'PTPRM', 'WWP1', 'LOC102724124', 'IGF1', 'SYNPR')
marker_list[["LAMP5"]]<-c('SLC6A1', 'GAD2', 'DNER', 'KIT', 'LAMP5', 'SV2C', 'GAD1', 'FSTL5', 'FXYD6', 'PTPRT', 'AP1S2', 'DOCK10', 'ADARB2', 'GRIK1', 'GRIA4', 'ZNF536', 'CNTNAP4', 'HAPLN1', 'NXPH2', 'ERBB4', 'CXCL14', 'RAB3C', 'GRIN3A', 'MYO16', 'ARL4C', 'PTCHD4', 'NXPH1', 'ATP1B2', 'FREM1', 'RAB3IP', 'SPOCK1', 'EYA4', 'KCNC2', 'NECAB2', 'CACNA2D1', 'DLX6-AS1', 'FAT1', 'PMEPA1', 'RELN', 'GRIK2', 'ADRA1A', 'PCP4L1', 'MTSS1', 'CA2', 'SPHKAP', 'BCL11B', 'TPD52L1', 'IGF1', 'SOX2-OT', 'PTPRM', 'ATP11C')
marker_list[["Oligodendrocyte"]]<-c('PLP1', 'MBP', 'PTGDS', 'CERCAM', 'TF', 'MOBP', 'MOG', 'CARNS1', 'LOC101929249', 'SCD', 'SLC44A1', 'CLDND1', 'CLDN11', 'PXK', 'RNASE1', 'CNTN2', 'PLEKHH1', 'ABCA2', 'PHLDB1', 'CRYAB', 'SPP1', 'ENPP2', 'QKI', 'TMEM144', 'APOD', 'MYRF', 'CNDP1', 'APLP1', 'TMEM63A', 'PPP1R14A', 'OPALIN', 'GPRC5B', 'MAG', 'UGT8', 'ST18', 'CAPN3', 'LAMP2', 'QDPR', 'EDIL3', 'HAPLN2', 'ERMN', 'TTYH2', 'NDRG1', 'PMP22', 'LINC00844', 'MAL', 'SOX2-OT', 'ABCA8', 'DBNDD2', 'ANLN', 'S100B')
marker_list[["SST"]]<-c('SST', 'GRIK1', 'GAD1', 'SYNPR', 'SPOCK3', 'LOC105372768', 'PAWR', 'LRP8', 'NMU', 'RAB3B', 'GRIN3A', 'LHX6', 'TRB', 'ROBO2', 'CDH13', 'KCNA3', 'KLHL5', 'NTM', 'GAD2', 'COL25A1', 'MAFB', 'CORT', 'STXBP6', 'SLC6A1', 'PNOC', 'ARX', 'TIMP2', 'GRIK2', 'FXYD6', 'GRIK1-AS2', 'SOX6', 'MAF', 'TRHDE', 'NXPH1', 'PLCH1', 'CDH9', 'GRIK3', 'PAM', 'ELFN1', 'FLT3', 'DLX1', 'PTPRM', 'NPAS3', 'CRHBP', 'RAB3IP', 'RNASET2', 'TMEM47', 'SPARCL1', 'ANK1', 'ADCYAP1R1', 'RBP4')
marker_list[["OPC"]]<-c('PTPRZ1', 'VCAN', 'OLIG1', 'PDGFRA', 'BCAN', 'NTM', 'APOD', 'COL9A1', 'C1orf61', 'PCDH15', 'EPN2', 'HIP1R', 'COL20A1', 'LOC105379054', 'ZBTB20', 'PLEKHH2', 'SCD5', 'CST3', 'OLIG2', 'PTN', 'SOX6', 'HTRA1', 'GPNMB', 'SNX22', 'SMOC1', 'SEMA5A', 'SULF2', 'LUZP2', 'NAV1', 'CCDC50', 'CSPG5', 'SPRY4', 'ADGRG1', 'LHFPL3', 'COL9A2', 'NKAIN4', 'BCAS1', 'KAT2B', 'SDC3', 'FERMT1', 'QKI', 'CSPG4', 'LPPR1', 'DSCAM', 'KANK1', 'COL11A1', 'FGFR1', 'MYT1', 'MEGF11', 'LRP4', 'PHLDA1')
marker_list[["Pericyte"]]<-c('PDGFRB', 'IGFBP7', 'ATP1A2', 'NOTCH3', 'ITIH5', 'SLC6A12', 'HIGD1B', 'RGS5', 'DCN', 'SLC6A1', 'PTN', 'IFITM3', 'CALD1', 'GNG11', 'BGN', 'FN1', 'IFITM1', 'SLC19A1', 'NDUFA4L2', 'PLXDC1', 'B2M', 'DLC1', 'TXNIP', 'COLEC12', 'PRELP', 'ADIRF', 'SPARC', 'SLC12A7', 'SLC6A13', 'CD9', 'EPS8', 'ISYNA1', 'LGALS1', 'IFITM2', 'EPAS1', 'MYO1B', 'STOM', 'HSPB1', 'EMP2', 'HES1', 'CYBA', 'PTGDS', 'MCAM', 'MYL12A', 'PDE7B', 'CD63', 'NR2F2', 'ARHGAP29', 'FRZB', 'TNS2', 'GJC1')
marker_list[["Endothelial"]]<-c('HLA-B', 'ABCG2', 'CLDN5', 'FLT1', 'HLA-E', 'B2M', 'A2M', 'ADGRF5', 'XAF1', 'HLA-C', 'ABCB1', 'SLC2A1', 'PODXL', 'EMP2', 'IFITM3', 'CLEC3B', 'EPAS1', 'IFI27', 'ID3', 'PTPRB', 'ENG', 'IFI44', 'MFSD2A', 'NEAT1', 'GPR85', 'ID1', 'VIM', 'SRGN', 'COBLL1', 'IFI44L', 'ITM2A', 'GPCPD1', 'DUSP1', 'HLA-H', 'LIMS2', 'MT2A', 'TM4SF1', 'GIMAP7', 'IGFBP7', 'VWF', 'SLC7A5', 'STOM', 'ITGA6', 'LMO2', 'PDGFB', 'SLC38A5', 'ESAM', 'KLF2', 'SLC7A1', 'HLA-A', 'HSPB1')
marker_list[["Microglia"]]<-c('LPAR6', 'CD74', 'ADAM28', 'P2RY12', 'CSF1R', 'CX3CR1', 'C3', 'RNASET2', 'SAT1', 'A2M', 'MAF', 'ITGAX', 'BHLHE41', 'BLNK', 'SRGAP2', 'HLA-DRB1', 'APBB1IP', 'ZFP36L2', 'LAPTM5', 'P2RY13', 'IFNGR1', 'PTPRC', 'CSF3R', 'ZFP36L1', 'CYBA', 'USP53', 'SFMBT2', 'C1QC', 'CSF2RA', 'LINC01268', 'SAMSN1', 'HLA-DRA', 'RCSD1', 'ADAP2', 'FYB', 'LTC4S', 'TYROBP', 'CD53', 'ARHGAP24', 'IFI16', 'C1QB', 'SLCO2B1', 'SLC1A3', 'MS4A7', 'GPR34', 'AIF1', 'TAL1', 'ITPR2', 'DOCK8', 'HLA-DPA1', 'MEF2A')
saveRDS(marker_list,"hg38_markerlist.rds")
#Human cortex marker list is from https://cells.ucsc.edu/?ds=aging-brain
#And stored in https://docs.google.com/spreadsheets/d/15YwK2lWh018IyxgQsRxnrINxBFlG3XtU5kE8gArYxJ4/edit?usp=sharing
marker_list<-NULL
marker_list[["SMC"]]<-c('Acta2', 'Tagln', 'Myl9', 'Tpm2', 'Mylk', 'Myh11', 'Cald1', 'Tpm1', 'Crip1', 'Mustn1', 'Flna', 'Pln', 'Sncg', 'Gm13889', 'Csrp1', 'Lmod1', 'Ppp1r14a', 'Des', 'Rgs4', 'Palld', 'Fxyd1', 'Nexn', 'Rasl11a', 'Aspn', 'Filip1l', 'Map1b', 'Pde3a', 'Igfbp7', 'Gm13861', 'Ccnd2', 'Snhg18', 'Lgals1', 'Gja4', 'Gucy1a1', 'Sparcl1', 'Errfi1', 'Gper1', 'Lbh', 'Hspb2', 'Crispld2', 'Mfge8', 'Cryab', 'Bgn', 'Cnn1', 'Higd1b', 'Rrad', 'Rasl12', 'Wtip', 'Notch3', 'Nrip2', 'Perp')
marker_list[["mNeur"]]<-c('Meg3', 'Snhg11', 'Rtn1', 'Bex2', 'Ly6h', 'Celf4', 'Syt1', 'Snap25', 'Pcsk1n', 'Map1b', 'Stmn1', 'Stmn3', 'Ahi1', '6330403K07Rik', 'Reln', 'Stmn2', 'Pcp4', 'Gng3', 'Uchl1', 'Zcchc18', 'Bcl11a', 'Peg3', 'Nap1l5', 'Pcsk2', 'Resp18', 'Atp1b1', 'Rufy3', 'Aplp1', 'Scg2', 'Gria2', 'Scg5', 'Nhlh2', 'Sgip1', 'Gm1673', 'Gpm6a', 'Nrxn3', 'Syn2', 'Lhx1os', 'Mapt', 'Cacna2d2', 'Lhx1', 'Tubb3', 'Elavl3', 'Cck', 'Fxyd6', 'Map7d2', 'Negr1', 'Cit', 'Thy1', 'Fxyd7', 'Cbarp')
marker_list[["Hb_EC"]]<-c('Hba-a2', 'Hba-a1', 'Hbb-bt', 'Hbb-bs', 'Alas2', 'Cldn5', 'Ly6c1', 'Ly6a', 'Itm2a', 'Flt1', 'Cxcl12', 'Spock2', 'Egfl7', 'Pltp', 'Slco1a4', 'Snca', 'Slc9a3r2', 'Bsg', 'Hspb1', 'Pglyrp1', 'Ctla2a', 'Apold1', 'Ifitm3', 'Bpgm', 'Id1', 'Car4', 'Klf2', 'Igfbp7', 'Crip1', 'Abhd2', '8430408G22Rik', 'Rgs5', 'Foxq1', 'Cyr61', 'Cd24a', 'Slc25a37', 'Tpm1', 'Rec114', 'Hspa1a', 'Ptn', 'Ftl1', 'Pam', 'Ube2c', 'Cald1', 'Vtn', 'Higd1b', 'Jund', 'Sparcl1', 'Tprgl', 'Jun', 'Nfkbia')
marker_list[["imNeur"]]<-c('Sox11', 'Stmn1', 'Sox4', 'Ccnd2', 'Tubb3', 'Meis2', 'Stmn2', 'Marcksl1', 'Cd24a', 'Tubb2b', 'Rtn1', 'Pfn2', 'Dcx', 'Dlx6os1', 'Map1b', 'Nrxn3', 'Hmgn2', 'Ncam1', 'Nnat', 'Pbx1', 'Celf4', 'Meg3', 'Elavl3', 'Dlx1', 'Igfbpl1', 'Stmn4', 'Gm1673', 'Bcl11a', 'Gm17750', 'Stmn3', 'Cdk5r1', 'H3f3b', '6330403K07Rik', 'Serf1', 'Basp1', 'Gad2', 'Hist3h2ba', 'Ptprs', 'Dlx2', 'Mpped2', 'Gng3', 'Islr2', 'Ccdc88a', 'Fxyd6', 'Auts2', 'Arx', 'Dlx5', 'Tiam2', 'Sp9', 'Foxg1', 'Etv1')
marker_list[["NRP"]]<-c('Sox11', 'Hmgn2', 'Stmn1', 'Ccnd2', 'Pclaf', 'Marcksl1', 'Sox4', 'Cd24a', 'Marcks', 'Mdk', 'Cenpf', 'Meis2', 'Pantr1', 'Gm1673', 'Pfn2', 'Ube2c', 'Dlx1', 'Rtn1', 'Hist1h2ap', 'Tubb3', 'Elavl3', 'Ncam1', 'Dlx2', 'Bex2', 'Dcx', 'Nnat', 'Bcl11a', 'Gm17750', 'Nrxn3', 'Foxg1', 'Arx', 'Ptprs', 'Pou3f2', 'Igfbpl1', 'Tubb2b', 'Fxyd6', 'Ppp1r14b', 'Serf1', 'Pax6', 'Pbx1', 'Elavl2', 'Map1b', 'Ccnb1', 'Mfap2', 'Dlx6os1', 'Trim2', 'Stmn3', 'Hes5', 'Sox2', 'Fabp5', 'Mpped2')
marker_list[["OLG"]]<-c('Mgp', 'Igfbp6', 'Cldn11', 'Nov', 'Igfbp5', 'Rspo3', 'Nbl1', 'Lmo4', 'Slc47a1', 'Rbp1', '1500015O10Rik', 'Efemp1', 'Ptgds', 'Emp3', 'Prg4', 'Atp1b1', 'Col1a2', 'Emb', 'Apod', 'Ogn', 'Cryab', 'Bgn', 'Pcolce', 'Perp', 'Foxp2', 'Cald1', 'Gpc3', 'S100b', 'Cd74', 'Prrx2', 'Alcam', 'Ank2', 'Aspa', 'Asgr1', 'Nupr1', 'Thbs1', 'Serping1', 'Scara3', 'Timp2', 'Gjb6', 'Gjb2', 'Foxc2', 'Spp1', 'Foxd1', 'Gstm5', 'Penk', 'Zic1', 'Cped1', 'Dapl1', 'Fstl1', 'Col1a1')
marker_list[["EC"]]<-c('Cldn5', 'Flt1', 'Ly6c1', 'Ly6a', 'Itm2a', 'Cxcl12', 'Egfl7', 'Spock2', 'Pltp', 'Slco1a4', 'Pglyrp1', 'Id1', 'Slc9a3r2', 'Ifitm3', 'Bsg', 'Apold1', 'Hspb1', 'Ctla2a', 'Klf2', 'Car4', 'Foxq1', 'Crip1', 'Igfbp7', '8430408G22Rik', 'Hspa1a', 'Edn1', 'Abhd2', 'Cyr61', 'Jun', 'Ptn', 'Nfkbia', 'Jund', 'Junb', 'Tpm1', 'Gkn3', 'Stmn2', 'Fos', 'Rgs5', 'Trim47', 'Rbpms', 'Tprgl', 'Cdkn1c', 'Pam', 'Sparcl1', 'Usp53', 'Slc38a3', 'Stmn1', 'Ctnnbip1', 'Gm26532', 'Sept4', 'B430010I23Rik')
marker_list[["AC"]]<-c('Aldoc', 'Slc1a3', 'Slc1a2', 'Plpp3', 'Ntm', 'Atp1a2', 'Mt2', 'Mt3', 'Gpr37l1', 'Ndrg2', 'Gm3764', 'Gja1', 'Clu', 'Atp1b2', 'Ntsr2', 'Mt1', 'Bcan', 'Cldn10', 'Prdx6', 'Htra1', 'Prnp', 'Dclk1', 'Gstm5', 'Cxcl14', 'Ttyh1', 'Apoe', 'Pla2g7', 'Acsl3', 'Ntrk2', 'Dbi', 'Slc6a11', 'Gpm6b', 'Mmd2', 'Ptprz1', 'Gria2', 'Mfge8', 'Sparcl1', 'Slc4a4', 'F3', 'Cpe', 'Tspan7', 'Gstm1', 'Gpm6a', 'Rorb', 'Ckb', 'Dtna', 'Slc6a1', 'Cspg5', 'Scd2', 'Slc7a10', 'Ptn')
marker_list[["TNC"]]<-c('6330403K07Rik', 'S100a6', 'Prdx6', 'Ntrk2', 'Fbxo2', 'Dbi', 'Tmem47', 'Nnat', 'Mt3', 'Cpe', 'Gpm6b', 'Mt2', 'Mlc1', 'Igfbp5', 'Fxyd1', 'Mt1', 'Scd2', 'Id4', 'Sox9', 'Aldoc', 'Slc1a3', 'Gfap', 'Apoe', 'Thrsp', 'Gstm1', 'Mdk', 'Pcsk1n', 'Meg3', 'Zcchc18', 'Myoc', 'Rax', 'S100a1', 'Ttyh1', 'Mgst1', 'Six3', 'Dtna', 'Riiad1', 'Ndrg2', 'Chchd10', 'Pbx1', 'Map1b', 'Mia', 'Pygb', 'Kctd14', 'Gprc5b', 'Slc1a2', 'Scg5', 'Ank2', 'Atp1b2', 'Apoc1', 'Nrxn2')
marker_list[["EPC"]]<-c('Ccdc153', 'Tmem212', 'Rarres2', 'Tppp3', 'Riiad1', 'Gm19935', 'Nnat', 'Chchd10', 'Mia', 'Cd24a', 'Mt3', 'Dbi', 'Clu', 'S100b', 'Calml4', 'Mt2', 'Hspa2', '1500015O10Rik', 'Gm14964', 'Sox9', 'Tmem47', 'Mlc1', 'Prdx6', 'Ntrk2', 'Fam213a', 'Ramp1', 'Mt1', 'Cpe', 'Scd2', 'Mgst1', 'Gm1673', 'Lmo4', 'Prelp', 'Ppil6', 'Gstm1', 'Id4', 'Gm973', 'Ttll3', 'Plekhb1', 'Dalrd3', 'Cfap100', 'Zcchc18', 'S100a6', 'Trp53bp2', 'Aqp4', 'Sox2', 'Mro', 'Ckb', 'Wdr60', 'Tspan15', 'Spint2')
marker_list[["MG"]]<-c('Ctss', 'C1qc', 'Hexb', 'C1qa', 'C1qb', 'Tyrobp', 'Selplg', 'Trem2', 'Tmem119', 'Cx3cr1', 'Fcer1g', 'Lgmn', 'Csf1r', 'Laptm5', 'Rgs10', 'Ctsd', 'Fcrls', 'P2ry12', 'Fcgr3', 'Olfml3', 'Ctsz', 'Gpr34', 'Lpcat2', 'Pld4', 'Grn', 'Unc93b1', 'Aif1', 'Ly86', 'Vsir', 'Siglech', 'Ccl4', 'Ccl3', 'Trf', 'Irf8', 'Cd53', 'Bin1', 'Hexa', 'Atf3', 'Junb', 'Spi1', 'Marcks', 'Cd83', 'Hk2', 'Ltc4s', 'Cd37', 'Coro1a', 'Ptgs1', 'Tgfbr1', 'Slc15a3', 'Lair1', 'Hpgds')
marker_list[["MNC"]]<-c('Cd52', 'Lyz2', 'Plac8', 'Fcer1g', 'Msrb1', 'Tyrobp', 'Coro1a', 'Alox5ap', 'Cybb', 'Lgals3', 'Lsp1', 'Lst1', 'S100a6', 'Ifitm6', 'Ccl6', 'Ifitm3', 'Rac2', 'Cytip', 'Pim1', 'Lcp1', 'Hp', 'Stk17b', 'Emp3', 'Ifi27l2a', 'Spi1', 'Itgal', 'Ms4a6c', 'S100a4', 'Wfdc17', 'Il1b', 'S100a8', 'S100a9', 'Samsn1', 'Gmfg', 'Slpi', 'Ftl1', 'Cebpb', 'Napsa', 'Ly6c2', 'Ear2', 'Retnlg', 'Wfdc21', 'Cd53', 'Mcemp1', 'G0s2', 'Plbd1', 'Cd74', 'AB124611', 'Rps11', 'Sirpb1c', 'Gpr141')
marker_list[["PC"]]<-c('Vtn', 'Higd1b', 'Ndufa4l2', 'Rgs5', 'Cald1', 'Myl9', 'Cox4i2', 'Sept4', 'Pdgfrb', 'Kcnj8', 'Ifitm1', 'Rgs4', 'Ptn', 'Atp1a2', 'P2ry14', 'Rarres2', 'Atp13a5', 'Abcc9', 'Mgp', 'Sod3', 'Slc19a1', 'Slc6a20a', 'Nbl1', 'Art3', 'Zic1', 'Ppp1r14a', 'Lhfp', 'Gper1', 'Gucy1b1', 'Notch3', 'Bgn', 'Pitpnc1', 'Car4', 'Tbx3os1', 'Gja4', 'Gm13861', 'Perp', 'Phlda1', 'Il34', 'Uchl1', 'Slc38a11', 'Carmn', 'Gpx8', 'Mfge8', 'Pth1r', 'Ecm2', 'Gucy1a1', 'Igfbp7', 'Pde4d', 'Ddit4l', 'G0s2')
marker_list[["MAC"]]<-c('Pf4', 'Lyz2', 'Ms4a7', 'Dab2', 'Mrc1', 'Fcer1g', 'C1qc', 'Ctsc', 'Wfdc17', 'C1qa', 'C1qb', 'Tyrobp', 'F13a1', 'Maf', 'Stab1', 'Trf', 'Apoe', 'Ccl7', 'Ms4a6c', 'Ccl24', 'Ccl2', 'Lst1', 'Hexa', 'Clec4n', 'Cd14', 'Cbr2', 'Ccl12', 'Cd209f', 'Cybb', 'Fcrls', 'Grn', 'Ptpn18', 'Ifi207', 'Cxcl2', 'Cd68', 'Ms4a6b', 'Fcgr2b', 'Marcksl1', 'Lgmn', 'Ftl1', 'Csf1r', 'Ctss', 'Lgals1', 'Folr2', 'Igfbp4', 'Ltc4s', 'C5ar1', 'Fcgr3', 'Tslp', 'Tgfbi', 'Cd163')
saveRDS(marker_list,"mm10_markerlist.rds")
#Gross cell type list (for both organisms)
marker_list<-NULL
marker_list[["Oligodendrocyte"]]<-c('Mobp','Cldn1','Prox1','Olig1')
marker_list[["Polydendrocyte"]]<-c('Cspg4','Pdgfra')
marker_list[["Astrocyte"]]<-c('Gfap','GluI','Slc1a2','Agt')
marker_list[["Ex_Neuron"]]<-c('Satb2','Cux2','Rorb','Pcp4','Foxp2','Col19a1','Slc17a7')
marker_list[["Microglia"]]<-c('C1qa','C1qc','Cxcr1')
marker_list[["Inhib_Neuron"]]<-c('Cnr1','Bcl11b','Gad1','Dlx1','Dlx2')
saveRDS(marker_list,"grosscelltype_markerlist.rds")
#Function to plot cell type markers
#For now just using the gross celltype marker list
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
region_check<-function(i,object=hg38_atac){
return(nrow(as.data.frame(LookupGeneCoords(object=object,gene=toupper(gene_list[i])))))
}
marker_plot<-function(j,k=celltype_name,l=hg38_atac,m="hg38_",n="seurat_clusters"){
plt<-CoveragePlot(
object = l,
region = j,
group.by=n,
extend.upstream = 1000,
extend.downstream = 1000,
ncol = 1,
tile=T,
tile.size=500,
tile.cells=downsamp
)
pdf(paste0("./marker_sets/",m,k,"_",j,"_genebody_accessibility.pdf"))
print(plt)
dev.off()
}
feature_plot<-function(j,k=celltype_name,l=hg38_atac,m="hg38_",n="seurat_clusters"){
plt<-FeaturePlot(
object = l,
features = j,
order=T
)
pdf(paste0("./marker_sets/",m,k,"_",j,"_geneactivity.pdf"))
print(plt)
dev.off()
}
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
marker_list<-readRDS("./marker_sets/grosscelltype_markerlist.rds")
summary(hg38_atac@meta.data$seurat_clusters) #setting tile cells to 90 to downsample
downsamp=90
for (x in 1:length(marker_list)){
gene_list<-marker_list[[x]]
gene_list<-gene_list[as.numeric(unlist(lapply(1:length(gene_list),FUN=region_check)))==1] #subset for genes with reads across gene body
gene_list<-toupper(gene_list) #make gene names uppercase
celltype_name<-names(marker_list)[x]
mclapply(gene_list,FUN=marker_plot,k=celltype_name,mc.cores=20)
gene_list<-gene_list[gene_list %in% row.names(hg38_atac@assays$GeneActivity@data)] #subset for genes with GA
mclapply(gene_list,FUN=feature_plot,k=celltype_name,mc.cores=20)
}
for (i in list.files(path="./marker_sets",pattern="markerset_hg38")){
system(paste0("slack -F ./marker_sets/",i," ryan_todo"))
}
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
marker_list<-readRDS("./marker_sets/grosscelltype_markerlist.rds")
summary(mm10_atac@meta.data$seurat_clusters) #setting tile cells to 99 to downsample
downsamp=30
for (x in 1:length(marker_list)){
gene_list<-marker_list[[x]]
gene_list<-gene_list[as.numeric(unlist(lapply(1:length(gene_list),FUN=region_check)))==1]
celltype_name<-names(marker_list)[x]
mclapply(gene_list,FUN=marker_plot,k=celltype_name,l=mm10_atac,m="mm10_",mc.cores=20)
gene_list<-gene_list[gene_list %in% row.names(mm10_atac@assays$GeneActivity@data)] #subset for genes with GA
mclapply(gene_list,FUN=feature_plot,k=celltype_name,l=mm10_atac,m="mm10_",mc.cores=20)
}
#For concatenating cell types into a single scrollable pdf
celltype=`ls mm10*genebody_accessibility.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo mm10_${i}_*genebody_accessibility.pdf` markerset_mm10_${i}.genebody.pdf; done
celltype=`ls mm10*geneactivity.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo mm10_${i}_*geneactivity.pdf` markerset_mm10_${i}.GA.pdf; done
celltype=`ls hg38*genebody_accessibility.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo hg38_${i}_*genebody_accessibility.pdf` markerset_hg38_${i}.genebody.pdf; done
celltype=`ls hg38*geneactivity.pdf | awk '{split($1,a,"_");print a[2]}' - | uniq`
for i in $celltype ; do convert `echo hg38_${i}_*geneactivity.pdf` markerset_hg8_${i}.GA.pdf; done
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
library(ggplot2)
library(ggrepel)
library(dplyr)
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
#Perform One vs. rest DA enrichment
write("Performing one vs. rest DA enrichment per annotation grouping supplied.", stderr())
#set up an empty list for looping through
hg38_da_peaks<-list()
mm10_da_peaks<-list()
#define DA functions for parallelization
#Use LR test for atac data
da_one_v_rest<-function(i,obj,group){
da_peaks_tmp <- FindMarkers(
object = obj,
ident.1 = i,
group.by = group,
test.use = 'LR',
latent.vars = 'nCount_peaks',
only.pos=T
)
da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
closest_genes <- ClosestFeature(obj,da_peaks_tmp$da_region)
da_peaks_tmp<-cbind(da_peaks_tmp,closest_genes)
da_peaks_tmp$enriched_group<-c(i)
da_peaks_tmp$compared_group<-c("all_other_cells")
return(da_peaks_tmp)
}
da_one_v_one<-function(i,obj,group,j_list){
i<-as.character(i)
da_tmp_2<-list()
for (j in j_list){
if ( i != j){
da_peaks_tmp <- FindMarkers(
object = obj,
ident.1 = i,
ident.2 = j,
group.by = group,
test.use = 'LR',
latent.vars = 'nCount_peaks',
only.pos=T
)
da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
closest_genes <- ClosestFeature(obj,da_peaks_tmp$da_region)
da_peaks_tmp<-cbind(da_peaks_tmp,closest_genes)
da_peaks_tmp$enriched_group<-c(i)
da_peaks_tmp$compared_group<-c(j)
da_tmp_2[[paste(i,j)]]<-da_peaks_tmp
}
}
return(da_tmp_2)
}
#Perform parallel application of DA test
library(parallel)
n.cores=length(unique(hg38_atac@meta.data$seurat_clusters))
hg38_da_peaks<-mclapply(
unique(hg38_atac@meta.data$seurat_clusters),
FUN=da_one_v_rest,
obj=hg38_atac,
group="seurat_clusters",
mc.cores=n.cores)
n.cores=length(unique(mm10_atac@meta.data$seurat_clusters))
mm10_da_peaks<-mclapply(
unique(mm10_atac@meta.data$seurat_clusters),
FUN=da_one_v_rest,
obj=mm10_atac,
group="seurat_clusters",
mc.cores=n.cores)
#Merge the final data frame from the list for 1vrest DA
hg38_da_peaks<-do.call("rbind",hg38_da_peaks)
mm10_da_peaks<-do.call("rbind",mm10_da_peaks)
write("Outputting One v Rest DA Table.", stderr())
write.table(hg38_da_peaks,file="hg38.onevrest.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)
write.table(mm10_da_peaks,file="mm10.onevrest.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)
dat<-read.table("hg38.onevrest.da_peaks.txt",header=T,sep="\t")
dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(enriched_group)))+geom_point(aes(alpha=0.1))+geom_label_repel(dat=dat_select,aes(label=gene_name,size=-distance),force=3)+theme_bw()
pdf("hg38_da_peaks.pdf")
plt
dev.off()
dat<-read.table("mm10.onevrest.da_peaks.txt",header=T,sep="\t")
dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(enriched_group)))+geom_point(aes(alpha=0.1))+geom_label_repel(dat=dat_select,aes(label=gene_name,size=-distance),force=3)+theme_bw()
pdf("mm10_da_peaks.pdf")
plt
dev.off()
#Empty list to rerun for 1v1 comparisons
hg38_da_peaks<-list()
mm10_da_peaks<-list()
n.cores=length(unique(hg38_atac@meta.data$seurat_clusters))
hg38_da_peaks<-mclapply(
unique(hg38_atac@meta.data$seurat_clusters),
FUN=da_one_v_one,
obj=hg38_atac,
group="seurat_clusters",
j_list=do.call("as.character",list(unique(hg38_atac@meta.data$seurat_clusters))),
mc.cores=n.cores)
n.cores=length(unique(mm10_atac@meta.data$seurat_clusters))
mm10_da_peaks<-mclapply(
unique(mm10_atac@meta.data$seurat_clusters),
FUN=da_one_v_one,
obj=mm10_atac,
group="seurat_clusters",
j_list=do.call("as.character",list(unique(mm10_atac@meta.data$seurat_clusters))),
mc.cores=n.cores)
#Merge the final data frame from the list for 1v1 DA
hg38_da_peaks<-do.call("rbind",do.call("rbind",hg38_da_peaks))
mm10_da_peaks<-do.call("rbind",do.call("rbind",mm10_da_peaks))
write("Outputting One v One DA Table.", stderr())
write.table(hg38_da_peaks,file="hg38.onevone.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)
write.table(mm10_da_peaks,file="mm10.onevone.da_peaks.txt",sep="\t",col.names=T,row.names=T,quote=F)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
library(ggplot2)
library(ggrepel)
library(dplyr)
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
#Cell types deteremined by TF motif and marker genebody accessibility
hg38_atac@meta.data$celltype<-"NA"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("2"),]$celltype<-"excitatory_neuron"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("3","4"),]$celltype<-"inhibitory_neuron"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("0","1"),]$celltype<-"oligodendrocytes"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("7"),]$celltype<-"polydendrocytes"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("5"),]$celltype<-"microglia"
hg38_atac@meta.data[hg38_atac@meta.data$seurat_clusters %in% c("6"),]$celltype<-"astrocyte"
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
mm10_atac@meta.data$celltype<-"NA"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("0","2"),]$celltype<-"excitatory_neuron"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("1"),]$celltype<-"inhibitory_neuron"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("4"),]$celltype<-"oligodendrocytes_polydendrocytes"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("5"),]$celltype<-"microglia"
mm10_atac@meta.data[mm10_atac@meta.data$seurat_clusters %in% c("3"),]$celltype<-"astrocyte"
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
#Also adding TSS enrichment values to meta data
hg38_tss<-read.table("hg38.bbrd.q10.tss_reads.value",header=F)
colnames(hg38_tss)<-c("cellID","TSS_enrichment")
mm10_tss<-read.table("mm10.bbrd.q10.tss_reads.value",header=F)
colnames(mm10_tss)<-c("cellID","TSS_enrichment")
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
hg38_annot<-hg38_atac@meta.data
mm10_annot<-mm10_atac@meta.data
hg38_annot<-merge(hg38_annot,hg38_tss,by="cellID")
mm10_annot<-merge(mm10_annot,mm10_tss,by="cellID")
row.names(hg38_annot)<-hg38_annot$cellID
row.names(mm10_annot)<-mm10_annot$cellID
hg38_atac@meta.data<-hg38_annot
mm10_atac@meta.data<-mm10_annot
saveRDS(mm10_atac,file="mm10_SeuratObject.Rds")
saveRDS(hg38_atac,file="hg38_SeuratObject.Rds")
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(parallel)
library(ggplot2)
library(ggrepel)
library(dplyr)
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
hg38_dat<-as.data.frame(hg38_atac@meta.data)
mm10_dat<-as.data.frame(mm10_atac@meta.data)
#UMAP projection coords in
#mm10_atac@reductions$umap@cell.embeddings
#hg38_atac@reductions$umap@cell.embeddings
write.table(hg38_dat,"hg38_cellsummaries.tsv",col.names=T,row.names=F,sep="\t",quote=F)
write.table(mm10_dat,"mm10_cellsummaries.tsv",col.names=T,row.names=F,sep="\t",quote=F)
library(ggplot2)
library(dplyr)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
hg38_dat<-read.table("hg38_projected_reads.txt",sep="\t")
colnames(hg38_dat)<-c("perc_uniq","read_count","uniqread_count","cellID")
hg38_dat$species<-"human"
mm10_dat<-read.table("mm10_projected_reads.txt",sep="\t")
colnames(mm10_dat)<-c("perc_uniq","read_count","uniqread_count","cellID")
mm10_dat$species<-"mouse"
mm10_sum<-read.table("mm10_cellsummaries.tsv",head=T)
hg38_sum<-read.table("hg38_cellsummaries.tsv",head=T)
dat<-rbind(hg38_dat,mm10_dat)
dat<- as.data.frame(dat %>%
group_by(species,perc_uniq) %>%
summarize(mean=mean(log10(uniqread_count)),
sd=sd(log10(uniqread_count)),
median=median(log10(uniqread_count))))
ggplot(dat,aes(x=as.numeric(perc_uniq),fill = species,color=species))+
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd,color=NULL),alpha=0.1) +
geom_line(aes(y=mean,linetype="solid"))+
geom_line(aes(y=median,linetype="dashed")) +
geom_point(data=hg38_sum,aes(y=log10(uniq_reads),x=perc_uniq/100),color="red",fill="red",size=0.5)+
geom_point(data=mm10_sum,aes(y=log10(uniq_reads),x=perc_uniq/100),color="blue",fill="blue",size=0.5)+
theme_bw() + scale_x_reverse()
ggsave(file="projected_readcount.png")
ggsave(file="projected_readcount.pdf")
system("slack -F projected_readcount.pdf s3")
Starting with human cells
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(cisTopic)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(Matrix)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
set.seed(1234)
library(dplyr)
library(ggrepel)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
######FUNCTIONS#####
cistopic_generation<-function(x,celltype.x,species="hg38"){
#Perform cistopic on subclusters of data
atac_sub<-subset(x,subset=celltype==celltype.x)
cistopic_counts_frmt<-atac_sub$peaks@counts
row.names(cistopic_counts_frmt)<-sub("-", ":", row.names(cistopic_counts_frmt))
sub_cistopic<-cisTopic::createcisTopicObject(cistopic_counts_frmt)
sub_cistopic_models<-cisTopic::runWarpLDAModels(sub_cistopic,topic=c(5,10,20:30),nCores=12,addModels=FALSE)
saveRDS(sub_cistopic_models,file=paste(species,celltype.x,".CisTopicObject.Rds",sep="_"))
pdf(paste(species,celltype.x,"_model_selection.pdf",sep="_"))
par(mfrow=c(3,3))
sub_cistopic_models <- selectModel(sub_cistopic_models, type='maximum')
sub_cistopic_models <- selectModel(sub_cistopic_models, type='perplexity')
sub_cistopic_models<- selectModel(sub_cistopic_models, type='derivative')
dev.off()
}
#UMAP Projection and clustering on selected cistopic model
clustering_loop<-function(topicmodel_list.=topicmodel_list,object_input=hg38_atac,celltype.x,topiccount_list.=topic_count_list){
#set up subset object again
atac_sub<-subset(object_input,subset=celltype==celltype.x)
#select_topic
models_input<-readRDS(paste(species,celltype.x,".CisTopicObject.Rds",sep="_"))
cisTopicObject<-cisTopic::selectModel(models_input,select=topiccount_list.[celltype.x],keepModels=F)
#perform UMAP on topics
topic_df<-as.data.frame(cisTopicObject@selected.model$document_expects)
row.names(topic_df)<-paste0("Topic_",row.names(topic_df))
dims<-as.data.frame(uwot::umap(t(topic_df),n_components=2))
row.names(dims)<-colnames(topic_df)
colnames(dims)<-c("x","y")
dims$cellID<-row.names(dims)
dims<-merge(dims,object_input@meta.data,by.x="cellID",by.y="row.names")
#get cell embeddings
cell_embeddings<-as.data.frame(cisTopicObject@selected.model$document_expects)
colnames(cell_embeddings)<-cisTopicObject@cell.names
n_topics<-nrow(cell_embeddings)
row.names(cell_embeddings)<-paste0("topic_",1:n_topics)
cell_embeddings<-as.data.frame(t(cell_embeddings))
#get feature loadings
feature_loadings<-as.data.frame(cisTopicObject@selected.model$topics)
row.names(feature_loadings)<-paste0("topic_",1:n_topics)
feature_loadings<-as.data.frame(t(feature_loadings))
#combine with seurat object for celltype seuratobject
cistopic_obj<-CreateDimReducObject(embeddings=as.matrix(cell_embeddings),loadings=as.matrix(feature_loadings),assay="peaks",key="topic_")
umap_dims<-as.data.frame(as.matrix(dims[2:3]))
colnames(umap_dims)<-c("UMAP_1","UMAP_2")
row.names(umap_dims)<-dims$cellID
cistopic_umap<-CreateDimReducObject(embeddings=as.matrix(umap_dims),assay="peaks",key="UMAP_")
atac_sub@reductions$cistopic<-cistopic_obj
atac_sub@reductions$umap<-cistopic_umap
#finally recluster
n_topics<-ncol(Embeddings(atac_sub,reduction="cistopic"))
#Clustering with multiple resolutions to account for different celltype complexities
atac_sub <- FindNeighbors(object = atac_sub, reduction = 'cistopic', dims = 1:n_topics)
atac_sub <- FindClusters(object = atac_sub,resolution=0.1)
atac_sub <- FindClusters(object = atac_sub,verbose = TRUE,resolution=0.2)
atac_sub <- FindClusters(object = atac_sub,verbose = TRUE,resolution=0.5)
atac_sub <- FindClusters(object = atac_sub,verbose = TRUE,resolution=0.9)
saveRDS(atac_sub,paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
plt<-DimPlot(atac_sub,group.by=c('peaks_snn_res.0.1','peaks_snn_res.0.2','peaks_snn_res.0.5','peaks_snn_res.0.9'))
ggsave(plt,file=paste(species,celltype.x,"clustering.pdf",sep="_"))
}
#Set up DA Test
da_one_v_one<-function(i,obj,group,j_list){
i<-as.character(i)
da_tmp_2<-list()
for (j in j_list){
if ( i != j){
da_peaks_tmp <- FindMarkers(
object = obj,
ident.1 = i,
ident.2 = j,
group.by = group,
test.use = 'LR',
latent.vars = 'nCount_peaks',
only.pos=T
)
da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
da_peaks_tmp$enriched_group<-c(i)
da_peaks_tmp$compared_group<-c(j)
da_tmp_2[[paste(i,j)]]<-da_peaks_tmp
}
}
return(da_tmp_2)
}
#Generate chromvar matrix per celltype
chromvar_loop<-function(celltype.x){
atac_sub<-readRDS(paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
# Add the Motif object to the assay
atac_sub<- SetAssayData(object = atac_sub, assay = 'peaks', slot = 'motifs', new.data = motif)
atac_sub <- RegionStats(object = atac_sub, genome = BSgenome.Hsapiens.UCSC.hg38)
atac_sub<- RunChromVAR(object = atac_sub,genome = BSgenome.Hsapiens.UCSC.hg38)
tfList <- getMatrixByID(JASPAR2020, ID=row.names(atac_sub@assays$chromvar@data))
tfList <-unlist(lapply(names(tfList), function(x) name(tfList[[x]])))
row.names(atac_sub@assays$chromvar@data)<-tfList
#setting final_clusters based on given resolution
atac_sub$final_clusters<-atac_sub@meta.data[,which(endsWith(colnames(atac_sub@meta.data),
as.character(resolution_list[celltype.x])))]
saveRDS(atac_sub,paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
#Run Differential TF accessibility across clusters
DefaultAssay(atac_sub) <- 'chromvar'
#set up an empty list for looping through
da_tfs<-list()
da_tfs<-lapply(
unique(atac_sub$final_clusters),
FUN=da_one_v_one,
obj=atac_sub,
j_list=do.call("as.character",list(unique(atac_sub$final_clusters))),
group="final_clusters")
#Merge the final data frame from the list for 1v1 DA
da_tfs2<-do.call("rbind",do.call("rbind",da_tfs))
write("Outputting One v One DA Table.", stderr())
write.table(da_tfs2,file=paste(species,celltype.x,"onevone.tffactors.txt",sep="_"),sep="\t",col.names=T,row.names=T,quote=F)
dat<-read.table(file=paste(species,celltype.x,"onevone.tffactors.txt",sep="_"))
dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 5 most significant peaks to label
plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(compared_group)))+
geom_point(aes(alpha=0.1))+
geom_label_repel(dat=dat_select,aes(label=da_region),force=3)+
theme_bw()+facet_wrap(~enriched_group)
ggsave(plt,file=paste(species,celltype.x,"tffactors.pdf",sep="_"))
system(paste0("slack -F ",paste(species,celltype.x,"tffactors.pdf",sep="_")," ryan_todo"))
}
#Perform DA test for gene activity scores
geneactivity_loop<-function(celltype.x){
atac_sub<-readRDS(paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
#Run Differential GA accessibility across clusters
DefaultAssay(atac_sub) <- 'GeneActivity'
#set up an empty list for looping through
da_ga<-list()
da_ga<-lapply(
unique(atac_sub$final_clusters),
FUN=da_one_v_one,
obj=atac_sub,
j_list=do.call("as.character",list(unique(atac_sub$final_clusters))),
group="final_clusters")
#Merge the final data frame from the list for 1v1 DA
da_ga2<-do.call("rbind",do.call("rbind",da_ga))
write("Outputting One v One DA Table.", stderr())
write.table(da_ga2,file=paste(species,celltype.x,"onevone.geneactivity.txt",sep="_"),sep="\t",col.names=T,row.names=T,quote=F)
dat<-read.table(file=paste(species,celltype.x,"onevone.geneactivity.txt",sep="_"))
dat_select<-dat %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
plt<-ggplot(dat,aes(x=avg_logFC,y=(-log(p_val)),color=as.factor(compared_group)))+
geom_point(aes(alpha=0.1))+
geom_label_repel(dat=dat_select,aes(label=da_region),force=3)+
theme_bw()+facet_wrap(~enriched_group)
ggsave(plt,file=paste(species,celltype.x,"geneactivity.pdf",sep="_"))
system(paste0("slack -F ",paste(species,celltype.x,"geneactivity.pdf",sep="_")," ryan_todo"))
}
#Set up DA Test for peaks, same test but now also finds closest genes
da_one_v_one_peak<-function(i,obj,group,j_list){
i<-as.character(i)
da_tmp_2<-list()
for (j in j_list){
if ( i != j){
da_peaks_tmp <- FindMarkers(
object = obj,
ident.1 = i,
ident.2 = j,
group.by = group,
test.use = 'LR',
latent.vars = 'nCount_peaks',
only.pos=T
)
da_peaks_tmp$da_region<-row.names(da_peaks_tmp)
closest_genes <- ClosestFeature(obj,da_peaks_tmp$da_region)
da_peaks_tmp<-cbind(da_peaks_tmp,closest_genes)
da_peaks_tmp$enriched_group<-c(i)
da_peaks_tmp$compared_group<-c(j)
da_tmp_2[[paste(i,j)]]<-da_peaks_tmp
}
}
return(da_tmp_2)
}
#Perform DA test for gene activity scores
da_peaks_loop<-function(celltype.x){
atac_sub<-readRDS(paste(species,celltype.x,"SeuratObject.Rds",sep="_"))
#Run Differential GA accessibility across clusters
DefaultAssay(atac_sub) <- 'peaks'
#set up an empty list for looping through
da_peaks<-list()
da_peaks<-mclapply(
unique(atac_sub$final_clusters),
FUN=da_one_v_one_peak,
obj=atac_sub,
j_list=do.call("as.character",list(unique(atac_sub$final_clusters))),
group="final_clusters",mc.cores=5)
#Merge the final data frame from the list for 1v1 DA
da_peaks2<-do.call("rbind",do.call("rbind",da_peaks))
write("Outputting One v One DA Table.", stderr())
write.table(da_peaks2,file=paste(species,celltype.x,"onevone.peaks.txt",sep="_"),sep="\t",col.names=T,row.names=T,quote=F)
}
####################################
### Processing ###
#hg38
#Set up variables
species="hg38"
celltype_list<-unique(hg38_atac$celltype)
#Running cistopic subclustering on all identified cell types
for (i in celltype_list){cistopic_generation(x=hg38_atac,celltype.x=i)}
topicmodel_list<-paste(species,celltype_list,".CisTopicObject.Rds",sep="_")
#determine model count to use for each cell type
for (i in paste(species,celltype_list,"_model_selection.pdf",sep="_")){system(paste0("slack -F ",i," ryan_todo"))}
#selecting topics based on derivative, making a named vector
topic_count_list<-c(25,22,24,27,24,22)
names(topic_count_list)<-celltype_list
#Running clustering loop
for (i in celltype_list){clustering_loop(object_input=hg38_atac,celltype.x=i)}
#selecting resolution by plots
for (i in celltype_list){system(paste0("slack -F ",paste(species,i,"clustering.pdf",sep="_")," ryan_todo"))}
#Set up named vector for clustering resolution, based on umap plot with multiple clustering resolutions
resolution_list<-c(0.5,0.2,0.9,0.5,0.5,0.9)
names(resolution_list)<-celltype_list
#Set up ChromVAR motif object to run
# Get a list of motif position frequency matrices from the JASPAR database
pfm <- getMatrixSet(x = JASPAR2020,opts = list(species =9606, all_versions = FALSE))
# Scan the DNA sequence of each peak for the presence of each motif
motif.matrix <- CreateMotifMatrix(features = granges(hg38_atac),pwm = pfm,genome = 'hg38',use.counts = FALSE)
# Create a new Mofif object to store the results
motif <- CreateMotifObject(data = motif.matrix,pwm = pfm)
#Run ChromVAR
for (i in celltype_list){chromvar_loop(celltype.x=i)}
#Run Gene Activity Score differences
for (i in celltype_list){geneactivity_loop(celltype.x=i)}
#DA Peaks
for (i in celltype_list){da_peaks_loop(celltype.x=i)}
#mm10
#Set up variables
species="mm10"
celltype_list<-unique(mm10_atac$celltype)
#Running cistopic subclustering on all identified cell types
for (i in celltype_list){cistopic_generation(x=mm10_atac,celltype.x=i,species="mm10")}
topicmodel_list<-paste(species,celltype_list,".CisTopicObject.Rds",sep="_")
#determine model count to use for each cell type
for (i in paste(species,celltype_list,"_model_selection.pdf",sep="_")){system(paste0("slack -F ",i," ryan_todo"))}
#selecting topics based on derivative, making a named vector
topic_count_list<-c(28,22,23,28,28,22)
names(topic_count_list)<-celltype_list
#Running clustering loop
for (i in celltype_list){clustering_loop(object_input=hg38_atac,celltype.x=i)}
#selecting resolution by plots
for (i in celltype_list){system(paste0("slack -F ",paste(species,i,"clustering.pdf",sep="_")," ryan_todo"))}
#Set up named vector for clustering resolution, based on umap plot with multiple clustering resolutions
resolution_list<-c(0.5,0.2,0.9,0.5,0.5,0.9)
names(resolution_list)<-unique(hg38_atac$celltype)
#Set up ChromVAR motif object to run
# Get a list of motif position frequency matrices from the JASPAR database
pfm <- getMatrixSet(x = JASPAR2020,opts = list(species =9606, all_versions = FALSE))
# Scan the DNA sequence of each peak for the presence of each motif
motif.matrix <- CreateMotifMatrix(features = granges(hg38_atac),pwm = pfm,genome = 'hg38',use.counts = FALSE)
# Create a new Mofif object to store the results
motif <- CreateMotifObject(data = motif.matrix,pwm = pfm)
#Run ChromVAR
for (i in celltype_list){chromvar_loop(celltype.x=i,resolution_list.=resolution_list)}
#Run Gene Activity Score differences
for (i in celltype_list){geneactivity_loop(celltype.x=i,resolution_list.=resolution_list)}
Focusing on human inhibitory neurons
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
library(cicero)
library(cisTopic)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Hsapiens.v86)
library(Matrix)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(dplyr)
library(ggrepel)
library(parallel)
#read in RDS file.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
iN<-readRDS("hg38_inhibitory_neuron_SeuratObject.Rds")
#Plot Markers
markers<-c("LHX6","ADARB2","CHODL","UNC5B")
DefaultAssay(iN)<-"peaks"
plt<-CoveragePlot(iN, region = markers,
tile = TRUE,
extend.upstream=2000,
extend.downstream=2000,
tile.cells=10,ncol=1)
pdf("hg38_inhibitory_neuron.genebody.pdf",height=30)
plt
dev.off()
system("slack -F hg38_inhibitory_neuron.genebody.pdf ryan_todo")
dat_out<-cbind(as.data.frame(iN@reductions$umap@cell.embeddings[names(iN$final_clusters),]),iN$final_clusters)
write.table(dat_out,file="SourceData_Fig3a.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig3a.tsv ryan_todo")
#Read in cistopic object
cistopic_object<-readRDS("hg38_inhibitory_neuron_.CisTopicObject.Rds")
#binarize cistopic to obtain peaks
cistopic_object<-cisTopic::selectModel(cistopic_object,select="25",keepModels=F)
cistopic_object <- getRegionsScores(cistopic_object, method='NormTop', scale=TRUE)
cistopic_object <- binarizecisTopics(cistopic_object, thrP=0.975, plot=TRUE)
#use chromvar to define signature TFs in topics
#Getting binarized motif signatures from Motif class in Signac
motif_signatures<-as.data.frame(iN@assays$peaks@motifs@data)
#get TF list common names
tfList <- getMatrixByID(JASPAR2020, ID=colnames(motif_signatures))
tfList <-unlist(lapply(names(tfList), function(x) name(tfList[[x]])))
#make a bed format file out of motifs
#x is an column number
motif_to_bed<-function(motif_signatures.=motif_signatures,x.=x){
motif_tmp<-row.names(motif_signatures)[which(motif_signatures[,x.]>0)]
tf<-colnames(motif_signatures)[x.]
motif_tmp<-strsplit(motif_tmp,"-")
seqnames<-lapply(motif_tmp,"[",1)
start<-lapply(motif_tmp,"[",2)
end<-lapply(motif_tmp,"[",3)
out_bed<-as.data.frame(cbind(seqnames,start,end,tf),col.names=c("seqnames","start","end","tf"))
return(out_bed)
}
motif_signatures_bed<-mclapply(1:ncol(motif_signatures),function(x) motif_to_bed(x.=x,motif_signatures.=motif_signatures),mc.cores=20)
#write out motifs contained within peak regions
dir.create("chromvar_bed_files")
for (x in 1:length(motif_signatures_bed)){
y<-motif_signatures_bed[[x]]
y$seqnames<-unlist(y$seqnames)
y$start<-unlist(y$start)
y$end<-unlist(y$end)
y$tf<-unlist(y$tf)
write.table(y,file=paste0("./chromvar_bed_files/","hg38.",tfList[x],".bed"),sep="\t")
print(paste0("./chromvar_bed_files/","hg38.",tfList[x],".bed"))
}
#generate signatures
signatures <-paste0("./chromvar_bed_files/","hg38.",tfList,".bed")
cistopic_object <- getSignaturesRegions(cistopic_object, signatures, labels=tfList)
library(AUCell)
pred.matrix <- predictiveDistribution(cistopic_object)
aucellRankings <- AUCell_buildRankings(pred.matrix, plot=FALSE, verbose=FALSE)
# Check signature enrichment in cells
cistopic_object <- signatureCellEnrichment(cistopic_object, aucellRankings, selected.signatures='all', aucMaxRank = 0.1*nrow(aucellRankings), plot=FALSE)
#make a heatmap
plt<-signaturesHeatmap(cistopic_object,nCores=1,row_names_gp = gpar(fontsize = 1))
sig_df<- as.data.frame(plt@ht_list$"Normalised AUC score"@matrix)
write.table(sig_df,file="hg38_inhibitoryNeurons.topic_TFsignatures.txt",sep="\t",col.names=T,row.names=T,quote=F)
sum(unlist(lapply(1:nrow(sig_df),function(x) any(abs(sig_df[x,])>3))))
#76
sig_df_filt<-sig_df[unlist(lapply(1:nrow(sig_df),function(x) any(abs(sig_df[x,])>3))),]
pdf("hg38_inhibitory_neurons.cisTopic_ChromvarSignature.pdf")
plt
dev.off()
saveRDS(cistopic_object,"hg38_inhibitory_neuron_.CisTopicObject.processed.Rds")
system("slack -F hg38_inhibitory_neurons.cisTopic_ChromvarSignature.pdf ryan_todo")
system("slack -F hg38_inhibitoryNeurons.topic_TFsignatures.txt ryan_todo")
New Session for Topic marker set enrichment
#Plotting markers https://celltypes.brain-map.org/rnaseq/human_m1_10x
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
library(cisTopic)
library(ggplot2)
library(reshape2)
library(patchwork)
library(Signac)
library(ComplexHeatmap)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
#inhibitory neuron markers from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6919571/
markers<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/human_cortex_markers.tsv",head=T,sep="\t",comment.char="")
markers<-markers[,c("level1","level2","level3","single_markers_vs_level1")]
markers<-markers[markers$level1=="GABAergic neuron",]
level1_genesets<-lapply(unique(markers$level1),function(x) trimws(unlist(strsplit(unlist(markers[markers$level1==x,]$single_markers_vs_level1),","))))
names(level1_genesets)<-unique(markers$level1)
level2_genesets<-lapply(unique(markers$level2),function(x) trimws(unlist(strsplit(unlist(markers[markers$level2==x,]$single_markers_vs_level1),","))))
names(level2_genesets)<-unique(markers$level2)
level3_genesets<-lapply(unique(markers$level3),function(x) trimws(unlist(strsplit(unlist(markers[markers$level3==x,]$single_markers_vs_level1),","))))
names(level3_genesets)<-unique(markers$level3)
cistopic_object<-readRDS("hg38_inhibitory_neuron_.CisTopicObject.processed.Rds")
cistopic_object<-binarizecisTopics(cistopic_object)
cistopic_object <- annotateRegions(cistopic_object, txdb=TxDb.Hsapiens.UCSC.hg38.knownGene, annoDb='org.Hs.eg.db')
#peaks per topic
summary(unlist(lapply(lapply(cistopic_object@binarized.cisTopics,dim),"[",1)))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1401 2109 2523 2429 2724 3239
topic_list<-names(cistopic_object@binarized.cisTopics)
#Look at enrichment of this topic for peaks in marker genes
# Marker_Set Not_marker_set
#Peaks_in_topic a c
#Peaks_not_in_topic b d
gene_set_enrichment<-function(x,y,gene_set){
dat_peaks<-cistopic_object@binarized.cisTopics[[x]]
peaks_considered<-data.frame("peaks"=unlist(lapply(names(cistopic_object@binarized.cisTopics),
function(x) row.names(cistopic_object@binarized.cisTopics[[x]]))))
peaks_considered<-merge(peaks_considered,cistopic_object@region.data,by.x="peaks",by.y="row.names",all.x=T)
dat_peaks<-merge(dat_peaks,cistopic_object@region.data,by="row.names",all.x=T)
a<-sum(dat_peaks$SYMBOL %in% unlist(gene_set[y]))
b<-sum(peaks_considered$SYMBOL %in% unlist(gene_set[y]))-a #not using all peaks since all peaks were called on other cell types, using topic representatives
c<-sum(!(dat_peaks$SYMBOL %in% unlist(gene_set[y])))
d<-sum(!(peaks_considered$SYMBOL %in% unlist(gene_set[y])))-c #black balls in urn (peaks not assigned to topic)
test<-matrix(c(a,b,c,d), ncol=2)
return(c(x,y,
as.numeric(fisher.test(test,alternative="greater")$p.value),
as.numeric(fisher.test(test,alternative="greater")$estimate)))
}
#gene set enrichment of topic peaks in marker genes
gsea_2<-list()
for (i in topic_list){
gsea_2[[i]]<-list()
for (j in names(level2_genesets)){
gsea_2[[i]][[j]]<-gene_set_enrichment(x=i,y=j,gene_set=level2_genesets)
}
}
gsea_2<-do.call("rbind",do.call("rbind",(gsea_2)))
colnames(gsea_2)<-c("Topic","CellType","pval","estimate")
gsea_2<-as.data.frame(gsea_2)
gsea_2$pval<-as.numeric(gsea_2$pval)
gsea_2$estimate<-as.numeric(gsea_2$estimate)
#gene set enrichment of topic peaks in marker genes (level3_genesets)
gsea_3<-list()
for (i in topic_list){
gsea_3[[i]]<-list()
for (j in names(level3_genesets)){
gsea_3[[i]][[j]]<-gene_set_enrichment(x=i,y=j,gene_set=level3_genesets)
}
}
gsea_3<-do.call("rbind",do.call("rbind",(gsea_3)))
colnames(gsea_3)<-c("Topic","CellType","pval","estimate")
gsea_3<-as.data.frame(gsea_3)
gsea_3$pval<-as.numeric(gsea_3$pval)
gsea_3$estimate<-as.numeric(gsea_3$estimate)
#Order Topics by cell type and plot
#read in RDS file.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
iN<-readRDS("hg38_inhibitory_neuron_SeuratObject.Rds")
cistopic_cell_embeddings<-iN@reductions$cistopic@cell.embeddings
#cistopic_cell_embeddings<-cistopic_cell_embeddings[,colnames(cistopic_cell_embeddings) %in% paste("topic",1:25,sep="_")]
cellid_order<-row.names(cistopic_cell_embeddings)
cluster_annot<-setNames(iN@meta.data[match(cellid_order,iN@meta.data$cellID),]$final_cluster,cellid_order)
library(circlize)
col_fun = colorRamp2(c(0,0.3), c("white","red"))
write.table(as.data.frame(cistopic_cell_embeddings),file="SourceData_Fig3c_top.tsv",row.names=T,sep="\t",quote=F)
system("slack -F SourceData_Fig3c_top.tsv ryan_todo")
plt<-Heatmap(as.data.frame(cistopic_cell_embeddings),
cluster_columns=T,
show_row_names=F,
clustering_distance_rows="spearman",
clustering_distance_columns="spearman",
left_annotation=rowAnnotation(cluster=cluster_annot),
show_column_names=T,
show_row_den=T,row_dend_side="left",use_raster=T, col=col_fun
)
plt1 = draw(plt) #draw so its statics
topic_ord<-column_order(plt1) #get column order
pdf("hg38_inhibitory_neuron_cistopic_cellembbed.pdf",width=30)
plt1
dev.off()
system("slack -F hg38_inhibitory_neuron_cistopic_cellembbed.pdf ryan_todo")
gsea_2$Topic <- factor(gsea_2$Topic, levels = paste0("Topic",1:25)[topic_ord])
gsea_2$CellType <- factor(gsea_2$CellType, levels = unique(gsea_2$CellType))
plt2<-ggplot(gsea_2, aes(x = CellType, y = Topic)) +
geom_point(aes(size = estimate, color = pval)) +
theme_bw() + scale_color_gradient(limits=c(0,0.05),low="red",high="white")+coord_flip()+scale_size(limits=c(0,3))
ggsave(plt2,file="hg38_inhibitory_neuron_celltypelevel2.pdf")
system("slack -F hg38_inhibitory_neuron_celltypelevel2.pdf ryan_todo")
gsea_3$Topic <- factor(gsea_3$Topic, levels = paste0("Topic",1:25)[topic_ord])
gsea_3$CellType <- factor(gsea_3$CellType, levels = gsea_3$CellType)
plt3<-ggplot(gsea_3, aes(x = CellType, y = Topic)) +
geom_point(aes(size = estimate, color = pval)) +
theme_bw() + scale_color_gradient(limits=c(0,0.05),low="orange",high="white")+coord_flip()+scale_size(limits=c(0,3))
ggsave(plt3,file="hg38_inhibitory_neuron_celltypelevel3.pdf")
system("slack -F hg38_inhibitory_neuron_celltypelevel3.pdf ryan_todo")
plt<-wrap_plots(plt2,plt3,ncol=1)
ggsave(plt,file="hg38_inhibitory_neuron_gsea.pdf")
system("slack -F hg38_inhibitory_neuron_gsea.pdf ryan_todo")
gsea_2$level<-"2"
gsea_3$level<-"3"
write.table(rbind(gsea_2,gsea_3),file="SourceData_Fig3c_bot.tsv",row.names=T,sep="\t",quote=F)
system("slack -F SourceData_Fig3c_bot.tsv ryan_todo")
cistopic_object <- GREAT(cistopic_object, genome='hg38', fold_enrichment=2, geneHits=1, sign=0.05, request_interval=10
)
markers<-c("ADARB2","PAX6","LAMP5","VIP",
"LHX6","SST","HTR3A","LOC102724124",
"CDH12","PAX6-AS1","TNFAIP8L3","NMBR",
"LCP2","DBP","LOC105371641","CA1","CHRNA4",
"MC4R","LOC105377434","BAGE2","SYT6","TSPAN12",
"CHRNA6","LOC105371331","PENK","QPCT","HS3ST3A1",
"LOC101927460","PCDH20","SERPINF1","TYR","LOC100421401",
"CHRM2","CBLN1","CCDC184","GGH","LOC644919","LBH","CASC6",
"LIN00923","SPAG17","OPRM1",
"ADARB2","LHX6","SST","PVALB","NPY","HPGD","B3GAT2","LOC101927132","KLHDC8A",
"LOC105371658","NPM1P10","LOC105377401","GXYLT2","STK32A","LOC101929028","CALB1",
"LOC105374696","ADGR6","FRZB","TH","GLP1R","LGR5","MEPE","WFDC2","SULF1","ZFPM2-AS1","MIR548F2","SCUBE3")
nrow(cistopic_object@region.data[cistopic_object@region.data$SYMBOL %in% markers,])
plt<-ontologyDotPlot(cistopic_object, top=5,var.y='name', order.by='Binom_Adjp_BH')
pdf("test_heatmap.genebody.pdf",width=30)
plt
dev.off()
system("slack -F test_heatmap.genebody.pdf ryan_todo")
iN<-readRDS("hg38_inhibitory_neuron_SeuratObject.Rds")
library(ggplot2)
library(Seurat)
library(Signac)
library(ggrepel)
library(dplyr)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data/subclustering")
dat_ga<-read.table("hg38_inhibitory_neuron_onevone.geneactivity.txt",head=T)
dat_tf<-read.table("hg38_inhibitory_neuron_onevone.tffactors.txt",head=T)
dat_peaks<-read.table("hg38_inhibitory_neuron_onevone.peaks.txt",head=T)
markers<-c("GAD1","LHX6","ADARB2","LAMP5","VIP","PVALB","SST","CUX2")
dat_select<-dat_peaks %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
#dat_select<-dat_peaks[dat_peaks$p_val_adj<0.1 & dat_peaks$gene_name %in% markers,]
dat_peaks<-dat_peaks[dat_peaks$p_val_adj<1,]
plt<-ggplot(dat_peaks,aes(x=avg_logFC,y=(-log(p_val_adj)),color=as.factor(compared_group)))+
geom_point(aes(alpha=0.1))+
geom_label_repel(dat=dat_select,aes(label=gene_name),size=2,force=5)+
theme_bw()+facet_wrap(~enriched_group)+ylim(c(0,20))+xlim(c(0,50))
ggsave(plt,file=paste(species,celltype.x,"da_peaks.pdf",sep="_"))
system(paste0("slack -F ",paste(species,celltype.x,"da_peaks.pdf",sep="_")," ryan_todo"))
#dat_ga[dat_ga$da_region %in% markers,]
#dat_tf[dat_tf$da_region %in% markers,]
#dat_peaks[dat_peaks$gene_name %in% markers,]
#Generate promoter gene body gene activities (not many CCANs overlap with marker genes)
#Limiting to TFs/Gene Activity/DA Peaks identified and nominally different
dat_select<-dat_peaks[dat_peaks$distance==0,] %>% arrange(rev(desc(p_val_adj))) %>% group_by(enriched_group,compared_group) %>% slice(1:2) #grabbing top 2 most significant peaks to label
#features<-as.data.frame(dat_peaks[dat_peaks$distance==0,] %>% group_by(enriched_group,compared_group) %>% head())
#features<-dat_select$gene_name
#features<-unique(c(dat_ga$da_region,dat_tf$da_region,dat_peaks$gene_name))
#inhibitory neuron markers from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6919571/
markers<-c("ADARB2","PAX6","LAMP5","VIP",
"LHX6","SST","HTR3A","LOC102724124",
"CDH12","PAX6-AS1","TNFAIP8L3","NMBR",
"LCP2","DBP","LOC105371641","CA1","CHRNA4",
"MC4R","LOC105377434","BAGE2","SYT6","TSPAN12",
"CHRNA6","LOC105371331","PENK","QPCT","HS3ST3A1",
"LOC101927460","PCDH20","SERPINF1","TYR","LOC100421401",
"CHRM2","CBLN1","CCDC184","GGH","LOC644919","LBH","CASC6",
"LIN00923","SPAG17","OPRM1",
"ADARB2","LHX6","SST","PVALB","NPY","HPGD","B3GAT2","LOC101927132","KLHDC8A",
"LOC105371658","NPM1P10","LOC105377401","GXYLT2","STK32A","LOC101929028","CALB1",
"LOC105374696","ADGR6","FRZB","TH","GLP1R","LGR5","MEPE","WFDC2","SULF1","ZFPM2-AS1","MIR548F2","SCUBE3")
nrow(cistopic_object@region.data[cistopic_object@region.data$SYMBOL %in% markers,])
DefaultAssay(dat)<-"genebody"
gene.activities <- GeneActivity(dat,features=markers)
dat[['genebody']] <- CreateAssayObject(counts = gene.activities)
dat[['genebody']]<-CreateAssayObject(data=as.matrix(sweep(dat@assays$genebody@counts,2,dat$uniq_reads,`/`))) #divide by read count
dat <- NormalizeData(
object = dat,
assay = 'genebody',
normalization.method = 'RC',
scale.factor = 1e6
)
DefaultAssay(dat)<-"genebody"
plt<-FeaturePlot(dat, features = markers,max.cutoff='q95')
pdf("test_heatmap.genebody.pdf",width=30,height=20)
plt
dev.off()
system("slack -F test_heatmap.genebody.pdf ryan_todo")
library(reshape2)
library(ComplexHeatmap)
genebody<-as.data.frame(dat@assays$genebody@data)
genebody<-sweep(genebody,2,dat$uniq_reads,`/`) #divide by read count
genebody<-data.frame(t(genebody))
genebody$cellID<-row.names(genebody)
#append cluster ID to column
genebody$final_clusters<-dat@meta.data[match(dat@meta.data$cellID,genebody$cellID),]$final_clusters
#reshape to long format
genebody<-melt(genebody)
#group by to summarize markers by column
genebody<-as.data.frame(genebody %>% group_by(final_clusters,variable) %>% summarize(mean_ga=median(value)))
#plot as heatmap
genebody<-dcast(genebody,final_clusters~variable)
row.names(genebody)<-genebody$final_clusters
genebody<-genebody[colnames(genebody) %in% markers]
#zscore values
genebody<-scale(genebody)
plt<-Heatmap(t(genebody),
clustering_distance_columns="pearson",
)
DefaultAssay(dat)<-"GeneActivity"
plt<-FeaturePlot(dat,features=markers)
ggsave(plt,file="inhib_markers.pdf",height=20,width=20)
system("slack -F inhib_markers.pdf ryan_todo")
DefaultAssay(dat)<-"peaks"
plt<-CoveragePlot(dat,region=markers,ncol=1,group.by="final_clusters")
ggsave(plt,file="inhib_markers.pdf",height=10*length(markers),limitsize=F)
system("slack -F inhib_markers.pdf ryan_todo")
Using Harmony for integration across data sets for mouse brain data.
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(ggplot2)
set.seed(1234)
library(EnsDb.Mmusculus.v79)
library(ggplot2)
library(dplyr)
library(patchwork)
library(Matrix)
library(harmony,lib.loc="/home/groups/oroaklab/src/R/R-4.0.0/lib_backup_210125")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
mm10_annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
seqlevelsStyle(mm10_annotations) <- 'UCSC'
genome(mm10_annotations) <- "mm10"
#function to read in sparse matrix format from atac-count
read_in_sparse<-function(x){ #x is character file prefix followed by .bbrd.q10.500.counts.sparseMatrix.values.gz
IN<-as.matrix(read.table(paste0(x,".counts.sparseMatrix.values.gz")))
IN<-sparseMatrix(i=IN[,1],j=IN[,2],x=IN[,3])
COLS<-read.table(paste0(x,".counts.sparseMatrix.cols.gz"))
colnames(IN)<-COLS$V1
ROWS<-read.table(paste0(x,".counts.sparseMatrix.rows.gz"))
row.names(IN)<-ROWS$V1
writeMM(IN,file=paste0(x,".counts.mtx")) #this is to generate counts matrices in scrublet friendly format
return(IN)
}
#tenx
ours_in_tenx<-read_in_sparse("tenx.bed")
ours_in_tenx_assay <- CreateChromatinAssay(counts = ours_in_tenx, genome="mm10", min.cells = 0, annotation=mm10_annotations, sep=c("_","_"))
ours_in_tenx_object <- CreateSeuratObject(counts = ours_in_tenx_assay, assay = "peaks")
ours_in_tenx_object$tech<-"s3"
#snatac
ours_in_snatac<-read_in_sparse("snatac.bed")
ours_in_snatac_assay <- CreateChromatinAssay(counts = ours_in_snatac, genome="mm10", min.cells = 0, annotation=mm10_annotations, sep=c("_","_"))
ours_in_snatac_object <- CreateSeuratObject(counts = ours_in_snatac_assay, assay = "peaks")
ours_in_snatac_object$tech<-"s3"
#scimap
ours_in_scimap<-read_in_sparse("scimap.bed")
ours_in_scimap_assay <- CreateChromatinAssay(counts = ours_in_scimap, genome="mm10", min.cells = 0, annotation=mm10_annotations, sep=c("_","_"))
ours_in_scimap_object <- CreateSeuratObject(counts = ours_in_scimap_assay, assay = "peaks")
ours_in_scimap_object$tech<-"s3"
tenx<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/tenx_genomics_adultbrain_fresh_5k.Rds")
tenx$tech<-"tenx"
snatac<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/snatac_adultbrain.Rds")
snatac$tech<-"snatac"
scimap<-readRDS("/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/s3ATAC_AdultMusBrain_Comparison/scimapatac_adultbrain.Rds")
scimap$tech<-"scimap"
x<-ours_in_scimap_object
y<-scimap
integration_peaks<-function(x,y,prefix){
y<-subset(y,features=row.names(x[["peaks"]]@counts))
unintegrated <- merge(x,y)
unintegrated <- RunTFIDF(unintegrated)
unintegrated <- FindTopFeatures(unintegrated, min.cutoff = 50)
unintegrated <- RunSVD(unintegrated)
unintegrated <- RunUMAP(unintegrated, reduction = 'lsi', dims = 2:30)
p1 <- DimPlot(unintegrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Unintegrated")
saveRDS(unintegrated,paste0(prefix,".unintegrated.Rds"))
hm.integrated <- RunHarmony(object = unintegrated, group.by.vars = 'tech', reduction = 'lsi', assay.use = 'peaks', project.dim = FALSE )
# re-compute the UMAP using corrected LSI embeddings
hm.integrated <- RunUMAP(hm.integrated, dims = 2:30, reduction = 'harmony')
p5 <- DimPlot(hm.integrated, group.by = 'tech', pt.size = 0.1) + ggplot2::ggtitle("Harmony integration")
plt<-p1 + p5 #+ p2
ggsave(plt,file=paste0(prefix,"_integration.pdf"))
system(paste0("slack -F ",paste0(prefix,"_integration.pdf")," ryan_todo"))
saveRDS(hm.integrated,paste0(prefix,".hm.integrated.Rds"))
}
integration_peaks(x=ours_in_tenx_object,y=tenx,prefix="ours_in_tenx")
integration_peaks(x=ours_in_snatac_object,y=snatac,prefix="ours_in_snatac")
integration_peaks(x=ours_in_scimap_object,y=scimap,prefix="ours_in_scimap")
integration.files<-list.files(pattern="hm.integrated.Rds")
out_data<-lapply(integration.files,function(i){
outname<-unlist(lapply(strsplit(i,"[.]"),"[",1))
obj<-readRDS(i)
dat_out<-cbind(as.data.frame(obj@reductions$umap@cell.embeddings[names(obj$tech),]),obj$tech,obj$celltype,outname)
return(dat_out)
})
dat_out<-do.call("rbind",out_data)
write.table(dat_out,file="SourceData_Fig2i.tsv",sep="\t",quote=F,row.names=F)
system("slack -F SourceData_Fig2i.tsv ryan_todo")
For human: https://portal.brain-map.org/atlases-and-data/rnaseq/human-m1-10x For mouse: https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x
#Human download
cd /home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_humancortex
wget https://brainmapportal-live-4cc80a57cd6e400d854-f7fdcae.divio-media.net/filer_public/70/32/70326830-e306-4743-a02c-a8da5bf9eb56/readme-m1-10.txt
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/metadata.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/matrix.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/trimmed_means.csv
wget https://brainmapportal-live-4cc80a57cd6e400d854-f7fdcae.divio-media.net/filer_public/0c/0c/0c0c882d-1c31-40a9-8039-3bf2706a77cd/sample-exp_component_mapping_human_10x_apr2020.zip
#Mouse download
cd /home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_mouse
wget https://brainmapportal-live-4cc80a57cd6e400d854-f7fdcae.divio-media.net/filer_public/87/14/8714d0a3-27d7-4a81-8c77-eebfd605a280/readme_mouse_10x.txt
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/metadata.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/matrix.csv
wget https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/trimmed_means.csv
#Downloading Mouse whole brain data
wget https://www.dropbox.com/s/kqsy9tvsklbu7c4/allen_brain.rds?dl=0
Following https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html
library(Seurat)
library(ggplot2)
#Human
setwd("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_humancortex")
meta_data<-read.csv("metadata.csv",header=T)
row.names(meta_data)<-meta_data$sample_name
counts<-read.csv("matrix.csv",header=T,row.names=1)
brainspan <- CreateSeuratObject(counts = as.data.frame(t(counts)), project = "brainspain", min.cells = 3, min.features = 500, meta.data=meta_data)
saveRDS(brainspan, file = "allen_brainspan_humancortex.rds")
brainspan <- NormalizeData(brainspan, normalization.method = "LogNormalize", scale.factor = 10000)
brainspan <- FindVariableFeatures(brainspan, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(brainspan)
brainspan <- ScaleData(brainspan, features = all.genes)
brainspan <- RunPCA(brainspan, features = VariableFeatures(object = brainspan))
plt<-ElbowPlot(brainspan)
ggsave(plt,file="allen_brainspan_humancortex.elbowplot.pdf")
system("slack -F allen_brainspan_humancortex.elbowplot.pdf ryan_todo")
brainspan <- FindNeighbors(brainspan, dims = 1:14)
brainspan <- FindClusters(brainspan, resolution = 0.5)
brainspan <- RunUMAP(brainspan, dims = 1:14)
plt<-DimPlot(brainspan, reduction = "umap",group.by=c("class_label","subclass_label"))
ggsave(plt,file="allen_brainspan_humancortex.dimplot.pdf",width=30)
system("slack -F allen_brainspan_humancortex.dimplot.pdf ryan_todo")
saveRDS(brainspan, file = "allen_brainspan_humancortex.rds")
#Mouse
setwd("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_mouse")
meta_data<-read.csv("metadata.csv",header=T)
row.names(meta_data)<-meta_data$sample_name
counts<-read.csv("matrix.csv",header=T,row.names=1,nrows=100000) #this is 1million cells, i think 10% of that will be fine, hopefully they are evenly distributed
brainspan <- CreateSeuratObject(counts = as.data.frame(t(counts)), project = "brainspain", min.cells = 3, min.features = 500, meta.data=meta_data)
saveRDS(brainspan, file = "allen_brainspan_mouse.rds")
brainspan <- NormalizeData(brainspan, normalization.method = "LogNormalize", scale.factor = 10000)
brainspan <- FindVariableFeatures(brainspan, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(brainspan)
brainspan <- ScaleData(brainspan, features = all.genes)
brainspan <- RunPCA(brainspan, features = VariableFeatures(object = brainspan))
plt<-ElbowPlot(brainspan)
ggsave(plt,file="allen_brainspan_mouse.elbowplot.pdf")
system("slack -F allen_brainspan_mouse.elbowplot.pdf ryan_todo")
brainspan <- FindNeighbors(brainspan, dims = 1:15)
brainspan <- FindClusters(brainspan, resolution = 0.5)
brainspan <- RunUMAP(brainspan, dims = 1:15)
plt<-DimPlot(brainspan, reduction = "umap",group.by=c("class_label","subclass_label"))
ggsave(plt,file="allen_brainspan_mouse.dimplot.pdf",width=30)
system("slack -F allen_brainspan_mouse.dimplot.pdf ryan_todo")
saveRDS(brainspan, file = "allen_brainspan_mouse.rds")
Retry mouse cluster id just straight up following https://satijalab.org/signac/articles/mouse_brain_vignette.html?
library(ComplexHeatmap)
library(Signac)
library(Seurat)
library(ggplot2)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3atac_data")
#Read in data and modify to monocle CDS file
#read in RDS file.
hg38_atac<-readRDS(file="hg38_SeuratObject.Rds")
mm10_atac<-readRDS(file="mm10_SeuratObject.Rds")
predict_celltype<-function(object,brainspan,prefix){
transfer.anchors <- FindTransferAnchors(reference = brainspan,query = object,query.assay="GeneActivity",reduction = 'cca')
saveRDS(transfer.anchors,file=paste0(prefix,".transferanchors.rds"))
#predict labels for class and subclass
predicted.labels.class <- TransferData(anchorset = transfer.anchors,refdata = brainspan$class_label,weight.reduction = "cca",dims = 1:30)
predicted.labels.subclass <- TransferData(anchorset = transfer.anchors,refdata = brainspan$subclass_label,weight.reduction = "cca", dims = 1:30)
object <- AddMetaData(object = object, metadata = predicted.labels.class)
object <- AddMetaData(object = object, metadata = predicted.labels.subclass)
saveRDS(object,file=paste0(prefix,"_SeuratObject.Rds"))
feat<-colnames(object@meta.data)[which(grepl("prediction.score",colnames(object@meta.data)))]
feat<-feat[feat !="prediction.score.max"]
plt<-FeaturePlot(object,features=feat,order=T)#plot feature plots
ggsave(plt,file=paste0(prefix,"_predicted.umap.png"),width=20,height=30)
system(paste0("slack -F ",prefix,"_predicted.umap.png ryan_todo"))
#plot merged cluster heatmap
#Generate Heatmap of TF ChromVar score and TF modules
pred<-object@meta.data[,feat]
colnames(pred)<-substring(colnames(pred),18) #remove "prediction.score" for readability
#Combine over subclusters
pred<-split(pred,paste(object$seurat_clusters)) #group by rows to seurat clusters
pred<-lapply(pred,function(x) apply(x,2,mean)) #take average across group
pred<-do.call("rbind",pred) #condense to smaller data frame
pred<-pred[!endsWith(row.names(pred),"NA"),] #remove doublets not assigned a subcluster
plt1<-Heatmap(pred,clustering_distance_rows="maximum",clustering_distance_columns="maximum")
pdf(paste0(prefix,".complexheatmap.pdf"),height=20)
plt1
dev.off()
system(paste0("slack -F ",prefix,".complexheatmap.pdf ryan_todo"))
}
brainspan. <- readRDS("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_humancortex/allen_brainspan_humancortex.rds")
predict_celltype(object=hg38_atac,brainspan=brainspan.,prefix="hg38")
#Mouse using smaller data set that is whole brain
brainspan. <- readRDS("/home/groups/oroaklab/adey_lab/projects/sciDROP/public_data/allen_brainspan_mouse/allen_brainspan_mouse.rds")
brainspan. <- FindVariableFeatures(object = brainspan.,nfeatures = 5000)
predict_celltype(object=mm10_atac,brainspan=brainspan.,prefix="mm10")
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS: /home/groups/oroaklab/src/R/R-4.0.0/lib/libRblas.so
LAPACK: /home/groups/oroaklab/src/R/R-4.0.0/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats4 parallel stats graphics grDevices utils
[8] datasets methods base
other attached packages:
[1] harmony_1.0
[2] Rcpp_1.0.5
[3] ComplexHeatmap_2.5.5
[4] org.Hs.eg.db_3.12.0
[5] TxDb.Hsapiens.UCSC.hg38.knownGene_3.10.0
[6] BSgenome.Hsapiens.UCSC.hg38_1.4.3
[7] BSgenome_1.57.6
[8] rtracklayer_1.49.5
[9] Biostrings_2.57.2
[10] XVector_0.29.3
[11] TFBSTools_1.27.0
[12] JASPAR2020_0.99.10
[13] dplyr_1.0.2
[14] ggrepel_0.8.2
[15] cicero_1.3.4.10
[16] Gviz_1.33.2
[17] monocle3_0.2.3.0
[18] SingleCellExperiment_1.11.6
[19] SummarizedExperiment_1.19.6
[20] DelayedArray_0.15.7
[21] matrixStats_0.57.0
[22] patchwork_1.0.1
[23] cisTopic_0.3.0
[24] SeuratWrappers_0.2.0
[25] Matrix_1.2-18
[26] EnsDb.Mmusculus.v79_2.99.0
[27] EnsDb.Hsapiens.v86_2.99.0
[28] ensembldb_2.13.1
[29] AnnotationFilter_1.13.0
[30] GenomicFeatures_1.41.3
[31] AnnotationDbi_1.51.3
[32] Biobase_2.49.1
[33] GenomicRanges_1.41.6
[34] GenomeInfoDb_1.25.11
[35] IRanges_2.23.10
[36] S4Vectors_0.27.12
[37] BiocGenerics_0.35.4
[38] Seurat_3.2.1
[39] Signac_1.1.0
[40] ggplot2_3.3.2
[41] reshape2_1.4.4
loaded via a namespace (and not attached):
[1] rappdirs_0.3.1 SnowballC_0.7.0
[3] GGally_2.0.0 R.methodsS3_1.8.1
[5] tidyr_1.1.2 bit64_4.0.5
[7] knitr_1.30 irlba_2.3.3
[9] R.utils_2.10.1 data.table_1.13.0
[11] rpart_4.1-15 KEGGREST_1.29.0
[13] RCurl_1.98-1.2 generics_0.0.2
[15] snow_0.4-3 RhpcBLASctl_0.20-137
[17] cowplot_1.1.0 RSQLite_2.2.1
[19] VGAM_1.1-3 RANN_2.6.1
[21] future_1.19.1 bit_4.0.4
[23] spatstat.data_1.4-3 httpuv_1.5.4
[25] assertthat_0.2.1 DirichletMultinomial_1.31.0
[27] viridis_0.5.1 xfun_0.20
[29] hms_0.5.3 promises_1.1.1
[31] progress_1.2.2 caTools_1.18.0
[33] dbplyr_1.4.4 igraph_1.2.6
[35] DBI_1.1.0 htmlwidgets_1.5.2
[37] reshape_0.8.8 feather_0.3.5
[39] rsparse_0.4.0 purrr_0.3.4
[41] ellipsis_0.3.1 backports_1.2.0
[43] annotate_1.67.1 biomaRt_2.45.2
[45] deldir_0.1-29 vctrs_0.3.4
[47] remotes_2.2.0 ROCR_1.0-11
[49] abind_1.4-5 withr_2.3.0
[51] ggforce_0.3.2 checkmate_2.0.0
[53] sctransform_0.3 GenomicAlignments_1.25.3
[55] prettyunits_1.1.1 goftest_1.2-2
[57] cluster_2.1.0 seqLogo_1.55.5
[59] lazyeval_0.2.2 crayon_1.3.4
[61] pkgconfig_2.0.3 tweenr_1.0.1
[63] nlme_3.1-149 ProtGenerics_1.21.0
[65] nnet_7.3-14 rlang_0.4.8
[67] globals_0.13.0 lifecycle_0.2.0
[69] miniUI_0.1.1.1 BiocFileCache_1.13.1
[71] doSNOW_1.0.18 rsvd_1.0.3
[73] dichromat_2.0-0 polyclip_1.10-0
[75] lmtest_0.9-38 graph_1.67.1
[77] ggseqlogo_0.1 zoo_1.8-8
[79] base64enc_0.1-3 GlobalOptions_0.1.2
[81] ggridges_0.5.2 rjson_0.2.20
[83] png_0.1-7 viridisLite_0.3.0
[85] RcisTarget_1.9.1 float_0.2-4
[87] bitops_1.0-6 R.oo_1.24.0
[89] lda_1.4.2 KernSmooth_2.23-17
[91] blob_1.2.1 shape_1.4.5
[93] stringr_1.4.0 readr_1.4.0
[95] jpeg_0.1-8.1 CNEr_1.25.0
[97] scales_1.1.1 memoise_1.1.0
[99] GSEABase_1.51.1 magrittr_1.5
[101] plyr_1.8.6 ica_1.0-2
[103] zlibbioc_1.35.0 compiler_4.0.0
[105] RColorBrewer_1.1-2 clue_0.3-57
[107] fitdistrplus_1.1-1 Rsamtools_2.5.3
[109] listenv_0.8.0 pbapply_1.4-3
[111] htmlTable_2.1.0 Formula_1.2-4
[113] text2vec_0.6 MASS_7.3-53
[115] mgcv_1.8-33 tidyselect_1.1.0
[117] stringi_1.5.3 askpass_1.1
[119] latticeExtra_0.6-29 VariantAnnotation_1.35.3
[121] fastmatch_1.1-0 tools_4.0.0
[123] future.apply_1.6.0 circlize_0.4.10
[125] rstudioapi_0.11 TFMPvalue_0.0.8
[127] foreach_1.5.0 foreign_0.8-80
[129] lsa_0.73.2 AUCell_1.11.0
[131] gridExtra_2.3 farver_2.0.3
[133] Rtsne_0.15 digest_0.6.27
[135] BiocManager_1.30.10 pracma_2.2.9
[137] shiny_1.5.0 later_1.1.0.1
[139] RcppAnnoy_0.0.16 OrganismDbi_1.31.0
[141] httr_1.4.2 ggbio_1.37.0
[143] biovizBase_1.37.0 colorspace_1.4-1
[145] XML_3.99-0.5 tensor_1.5
[147] reticulate_1.16 splines_4.0.0
[149] lgr_0.3.4 uwot_0.1.8
[151] RBGL_1.65.0 RcppRoll_0.3.0
[153] spatstat.utils_1.17-0 plotly_4.9.2.1
[155] xtable_1.8-4 poweRlaw_0.70.6
[157] jsonlite_1.7.1 spatstat_1.64-1
[159] R6_2.5.0 mlapi_0.1.0
[161] Hmisc_4.4-1 pillar_1.4.6
[163] htmltools_0.5.0 mime_0.9
[165] glue_1.4.2 fastmap_1.0.1
[167] BiocParallel_1.23.2 codetools_0.2-16
[169] lattice_0.20-41 tibble_3.0.4
[171] curl_4.3 leiden_0.3.3
[173] gtools_3.8.2 GO.db_3.11.4
[175] openssl_1.4.3 survival_3.2-7
[177] munsell_0.5.0 GetoptLong_1.0.3
[179] GenomeInfoDbData_1.2.3 iterators_1.0.12
[181] gtable_0.3.0